diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS new file mode 100644 index 0000000..53f2797 --- /dev/null +++ b/.github/CODEOWNERS @@ -0,0 +1,6 @@ +.github/actions/ @codethinki +.github/workflows/ @codethinki +ci/docker/ @codethinki + +LICENSE @codethinki +.clang-format @codethinki diff --git a/.github/actions/build-container/action.yml b/.github/actions/build-container/action.yml new file mode 100644 index 0000000..72a9cfb --- /dev/null +++ b/.github/actions/build-container/action.yml @@ -0,0 +1,32 @@ +name: 'Build Container' +description: 'Builds and pushes the Linux Docker image' +inputs: + dockerfile: + description: 'Path to the Dockerfile' + required: true + tag: + description: 'Tag suffix for the image' + required: true + +runs: + using: "composite" + steps: + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Log in to the Container registry + uses: docker/login-action@v3 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ github.token }} + + - name: Build Docker image + uses: docker/build-push-action@v5 + with: + context: . + file: ${{ inputs.dockerfile }} + push: true + tags: ghcr.io/${{ github.repository }}:${{ inputs.tag }} + cache-from: type=registry,ref=ghcr.io/${{ github.repository }}:${{ inputs.tag }}-cache + cache-to: type=registry,ref=ghcr.io/${{ github.repository }}:${{ inputs.tag }}-cache,mode=max \ No newline at end of file diff --git a/.github/actions/build-flashmini/action.yml b/.github/actions/build-flashmini/action.yml new file mode 100644 index 0000000..d461d1c --- /dev/null +++ b/.github/actions/build-flashmini/action.yml @@ -0,0 +1,62 @@ +name: 'Build flashmini' +description: 'Handles Caching, Configuration, and Compilation' +inputs: + cache_prefix: + description: 'Cache Prefix e.g. OS' + required: true + compiler: + description: 'Compiler' + required: true + backend: + description: 'Backend' + required: true + +runs: + using: "composite" + steps: + # --- 1. Restore Vcpkg --- + - name: Restore vcpkg cache + id: restore-vcpkg + uses: actions/cache/restore@v4 + with: + path: vcpkg_installed + key: vcpkg-${{ inputs.cache_prefix }}-${{ inputs.compiler }}-${{ inputs.backend }}-${{ hashFiles('vcpkg.json') }} + restore-keys: vcpkg-${{ inputs.cache_prefix }}-${{ inputs.compiler }}-${{ inputs.backend }}- + + # --- 2. Configure --- + - name: Configure CMake + shell: bash + run: | + [ -f ci/CMakeUserPresets.json ] && cp ci/CMakeUserPresets.json CMakeUserPresets.json + # Preset: ci_gcc_af_cpu + cmake --preset ci_${{ inputs.compiler }}_af_${{ inputs.backend }} + + # --- 3. Save Vcpkg --- + - name: Save vcpkg cache + if: steps.restore-vcpkg.outputs.cache-hit != 'true' + uses: actions/cache/save@v4 + with: + path: vcpkg_installed + key: ${{ steps.restore-vcpkg.outputs.cache-primary-key }} + + # --- 4. Restore BuildCache --- + - name: Restore BuildCache + id: restore-buildcache + uses: actions/cache/restore@v4 + with: + path: .buildcache + key: buildcache-${{ inputs.cache_prefix }}-${{ inputs.compiler }}-${{ inputs.backend }}-${{ github.run_id }} + restore-keys: buildcache-${{ inputs.cache_prefix }}-${{ inputs.compiler }}-${{ inputs.backend }}- + + # --- 5. Build --- + - name: Build + shell: bash + run: cmake --build --preset ci_${{ inputs.compiler }}_af_${{ inputs.backend }} + + # --- 6. 
Save BuildCache --- + - name: Save BuildCache + if: always() && steps.restore-buildcache.outputs.cache-primary-key != '' + uses: actions/cache/save@v4 + with: + path: .buildcache + key: ${{ steps.restore-buildcache.outputs.cache-primary-key }} \ No newline at end of file diff --git a/.github/actions/setup-windows/action.yml b/.github/actions/setup-windows/action.yml new file mode 100644 index 0000000..09c1b85 --- /dev/null +++ b/.github/actions/setup-windows/action.yml @@ -0,0 +1,85 @@ +name: 'Setup Windows Environment' +description: 'Restores Vcpkg, BuildCache etc.' +inputs: + compiler: + description: 'compiler to CMakeUserPresets.json' + required: true + backend: + description: "backend to compile for" + required: true + +runs: + using: "composite" + steps: + - uses: ilammy/msvc-dev-cmd@v1 + + - name: Setup Vcpkg Environment + shell: pwsh + run: | + $vcpkgPath = $env:VCPKG_INSTALLATION_ROOT + if (-not (Test-Path "$vcpkgPath")) { + Write-Error "Vcpkg not found at VCPKG_INSTALLATION_ROOT ($vcpkgPath)" + exit 1 + } + "VCPKG_ROOT=$vcpkgPath" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append + + - name: Install BuildCache + shell: pwsh + run: | + Invoke-WebRequest -Uri "https://gitlab.com/bits-n-bites/buildcache/-/releases/v0.31.7/downloads/buildcache-windows.zip" -OutFile "buildcache.zip" + Expand-Archive buildcache.zip -DestinationPath c:\buildcache + echo "c:\buildcache\buildcache\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append + + - name: Install CUDA (micromamba) + if: inputs.backend == 'cuda' + uses: mamba-org/setup-micromamba@v1 + with: + environment-name: cuda-env + condarc: | + channels: + - nvidia + - conda-forge + create-args: >- + cuda-toolkit=12.3.0 + cache-environment: true + init-shell: powershell + + - name: Set CUDA Environment Variables + if: inputs.backend == 'cuda' + shell: powershell + run: | + "CUDAToolkit_ROOT=$env:CONDA_PREFIX" >> $env:GITHUB_ENV + "CUDA_PATH=$env:CONDA_PREFIX" >> $env:GITHUB_ENV + "$env:CONDA_PREFIX\Library\bin" >> $env:GITHUB_PATH + "$env:CONDA_PREFIX\bin" >> $env:GITHUB_PATH + + - name: Cache ArrayFire + id: cache-arrayfire-windows + uses: actions/cache@v4 + with: + path: C:\tools\ArrayFire + key: arrayfire-windows-3.10.0 + + - name: "Install ArrayFire" + if: steps.cache-arrayfire-windows.outputs.cache-hit != 'true' + run: | + choco install --no-progress wget -y + cd $HOME + wget -nv https://arrayfire.gateway.scarf.sh/windows/3.10.0/ArrayFire.exe -O ArrayFire.exe + 7z.exe x ArrayFire.exe -o"C:\tools\ArrayFire" -y + rm ArrayFire.exe + shell: bash -el {0} + + - name: Set ArrayFire Env + run: | + echo "ArrayFire_DIR=C:\tools\ArrayFire" >> $GITHUB_ENV + echo "C:\tools\ArrayFire\lib" >> $GITHUB_PATH + shell: bash -el {0} + + - uses: actions/setup-python@v5 + with: + python-version: "3.x" + + - name: Install CMake & Ninja + run: pip install --upgrade cmake ninja + shell: powershell \ No newline at end of file diff --git a/.github/actions/test-flashmini/action.yml b/.github/actions/test-flashmini/action.yml new file mode 100644 index 0000000..58fc3ff --- /dev/null +++ b/.github/actions/test-flashmini/action.yml @@ -0,0 +1,13 @@ +name: 'Test flashmini' +description: 'Runs CTest in the specified directory' +inputs: + test_dir: + description: 'Directory containing the CTestTestfile.cmake (usually build dir)' + required: true + +runs: + using: "composite" + steps: + - name: Test + shell: bash + run: ctest --test-dir ${{ inputs.test_dir }} -C Release --output-on-failure \ No newline at end of file diff --git 
a/.github/workflows/build-test.yml b/.github/workflows/build-test.yml new file mode 100644 index 0000000..7f4c5c0 --- /dev/null +++ b/.github/workflows/build-test.yml @@ -0,0 +1,40 @@ +name: Build & Test + +on: + push: { branches: ["master", "_master/add_ci"] } + pull_request: { branches: ["master"] } + workflow_dispatch: + +permissions: { contents: read, packages: write } + +jobs: + linux: + name: Linux (${{ matrix.compiler }}, ${{ matrix.backend }}) + strategy: + fail-fast: false + matrix: + compiler: [gcc] + backend: [cpu, cuda] + + uses: ./.github/workflows/linux-pipeline.yml + with: + compiler: ${{ matrix.compiler }} + backend: ${{ matrix.backend }} + # run_tests is true only if backend is cpu + run_tests: ${{ matrix.backend == 'cpu' }} + secrets: inherit + + windows: + name: Windows (${{ matrix.compiler }}, ${{ matrix.backend }}) + strategy: + fail-fast: false + matrix: + compiler: [msvc] + backend: [cpu, cuda] + + uses: ./.github/workflows/windows-pipeline.yml + with: + compiler: ${{ matrix.compiler }} + backend: ${{ matrix.backend }} + run_tests: ${{ matrix.backend == 'cpu' }} + secrets: inherit \ No newline at end of file diff --git a/.github/workflows/docker_image_build.yml b/.github/workflows/docker_image_build.yml deleted file mode 100644 index 594b0f5..0000000 --- a/.github/workflows/docker_image_build.yml +++ /dev/null @@ -1,48 +0,0 @@ -name: Publish Docker images -on: - push: - branches: - - master -jobs: - cuda_image_build: - if: github.repository_owner == 'flashlight' - name: CUDA image build - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@master - - name: Build the CUDA Docker image - run: docker build . --file .docker/Dockerfile-CUDA --tag flml/flashlight:cuda-latest - - name: Docker login - env: - USER: ${{ secrets.DOCKER_USERNAME }} - PASSWORD: ${{ secrets.DOCKER_TOKEN }} - run: docker login -u=$USER -p=$PASSWORD - - name: Push image with the latest tag - run: docker push flml/flashlight:cuda-latest - - name: Tag revision - run: docker tag flml/flashlight:cuda-latest flml/flashlight:cuda-`git rev-parse --short HEAD` - - name: Push image with the revision tag - run: docker push flml/flashlight:cuda-`git rev-parse --short HEAD` - - name: Docker logout - run: docker logout - cpu_image_build: - if: github.repository_owner == 'flashlight' - name: CPU image build - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@master - - name: Build the CPU Docker image - run: docker build . 
--file .docker/Dockerfile-CPU --tag flml/flashlight:cpu-latest - - name: Docker login - env: - USER: ${{ secrets.DOCKER_USERNAME }} - PASSWORD: ${{ secrets.DOCKER_TOKEN }} - run: docker login -u=$USER -p=$PASSWORD - - name: Push image with the latest tag - run: docker push flml/flashlight:cpu-latest - - name: Tag revision - run: docker tag flml/flashlight:cpu-latest flml/flashlight:cpu-`git rev-parse --short HEAD` - - name: Push image with the revision tag - run: docker push flml/flashlight:cpu-`git rev-parse --short HEAD` - - name: Docker logout - run: docker logout diff --git a/.github/workflows/linux-pipeline.yml b/.github/workflows/linux-pipeline.yml new file mode 100644 index 0000000..f3047f3 --- /dev/null +++ b/.github/workflows/linux-pipeline.yml @@ -0,0 +1,66 @@ +name: Linux Pipeline + +on: + workflow_call: + inputs: + compiler: + required: true + type: string + backend: + required: true + type: string + run_tests: + required: true + type: boolean + +permissions: + contents: read + packages: write + +jobs: + prepare: + name: Build Container (${{ inputs.backend }}) + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Build & Push + uses: ./.github/actions/build-container + with: + dockerfile: ci/docker/linux/Dockerfile.${{ inputs.backend }} + tag: linux-${{ inputs.backend }} + + build: + name: Linux (${{ inputs.compiler }}-${{ inputs.backend }}) + needs: prepare + runs-on: ubuntu-latest + container: + image: ghcr.io/${{ github.repository }}:linux-${{ inputs.backend }} + credentials: + username: ${{ github.actor }} + password: ${{ github.token }} + options: >- + -v /usr/local/share/vcpkg:/vcpkg + -e VCPKG_ROOT=/vcpkg + env: + VCPKG_ROOT: /vcpkg + steps: + - uses: actions/checkout@v4 + with: + submodules: recursive + + - name: Build + uses: ./.github/actions/build-flashmini + with: + cache_prefix: linux + compiler: ${{ inputs.compiler }} + backend: ${{ inputs.backend }} + + - name: Test + if: inputs.run_tests + uses: ./.github/actions/test-flashmini + with: + test_dir: out/build/ci_${{ inputs.compiler }}_af_${{ inputs.backend }} + + - name: Fix permissions + if: always() + run: sudo chown -R $(id -u):$(id -g) out/build \ No newline at end of file diff --git a/.github/workflows/windows-pipeline.yml b/.github/workflows/windows-pipeline.yml new file mode 100644 index 0000000..40c75fc --- /dev/null +++ b/.github/workflows/windows-pipeline.yml @@ -0,0 +1,45 @@ +name: Windows Pipeline + +on: + workflow_call: + inputs: + compiler: + required: true + type: string + backend: + required: true + type: string + run_tests: + required: true + type: boolean + +jobs: + build: + name: Windows (${{ inputs.compiler }}-${{ inputs.backend }}) + runs-on: windows-latest + defaults: + run: + shell: bash # Forces Git Bash for consistency with Linux scripts + + steps: + - uses: actions/checkout@v4 + with: + submodules: recursive + + - uses: ./.github/actions/setup-windows + with: + compiler: ${{ inputs.compiler }} + backend: ${{ inputs.backend }} + + - name: Build + uses: ./.github/actions/build-flashmini + with: + cache_prefix: windows + compiler: ${{ inputs.compiler }} + backend: ${{ inputs.backend }} + + - name: Test + if: inputs.run_tests + uses: ./.github/actions/test-flashmini + with: + test_dir: out/build/ci_${{ inputs.compiler }}_af_${{ inputs.backend }} \ No newline at end of file diff --git a/.gitignore b/.gitignore index 1b55e23..9b1beca 100644 --- a/.gitignore +++ b/.gitignore @@ -4,6 +4,7 @@ release/ debug/ out/ vcpkg_installed/ +Testing/ *.so # FB diff --git a/CMakeLists.txt 
b/CMakeLists.txt index a360cc2..5bfeed9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -23,6 +23,9 @@ set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_CUDA_STANDARD 14) set(CMAKE_CUDA_STANDARD_REQUIRED ON) +# no modules in this library +set(CMAKE_CXX_SCAN_FOR_MODULES OFF) + # Default directories for installation set(FL_INSTALL_INC_DIR "include" CACHE PATH "Install path for headers") set(FL_INSTALL_INC_DIR_HEADER_LOC ${FL_INSTALL_INC_DIR}/flashlight) diff --git a/CMakePresets.json b/CMakePresets.json index bf19f1d..aec9e32 100644 --- a/CMakePresets.json +++ b/CMakePresets.json @@ -8,6 +8,11 @@ "CMAKE_TOOLCHAIN_FILE": "cmake/utils/toolchain.cmake" } }, + { + "name": "ninja", + "hidden": true, + "generator": "Ninja Multi-Config" + }, { "name": "wsl-settings", "hidden": true, @@ -26,11 +31,7 @@ } } }, - { - "name": "ninja", - "hidden": true, - "generator": "Ninja Multi-Config" - }, + { "name": "base", "hidden": true, @@ -39,7 +40,10 @@ "ninja", "vcpkg" ], - "binaryDir": "${sourceDir}/out/build/${presetName}" + "binaryDir": "${sourceDir}/out/build/${presetName}", + "cacheVariables": { + "CMAKE_EXPORT_COMPILE_COMMANDS": true + } }, { "name": "msvc", @@ -79,10 +83,8 @@ "cacheVariables": { "CMAKE_CUDA_COMPILER": "nvcc", "FL_USE_CUDNN": false, - "CMAKE_CUDA_ARCHITECTURES": "native", "CMAKE_CUDA_FLAGS": "-allow-unsupported-compiler", - "VCPKG_MANIFEST_FEATURES": "cuda" } }, @@ -104,7 +106,6 @@ "name": "af-backend-base", "hidden": true, "cacheVariables": { - "FL_BUILD_ARRAYFIRE": true, "FL_USE_ARRAYFIRE": true } }, @@ -116,7 +117,8 @@ "af-backend-base" ], "cacheVariables": { - "FL_ARRAYFIRE_USE_CPU": true + "FL_ARRAYFIRE_USE_CPU": true, + "FL_USE_ONEDNN": true } }, { @@ -162,5 +164,57 @@ "af-cpu-backend" ] } + ], + "buildPresets": [ + { + "name": "release-base", + "hidden": true, + "configuration": "Release" + }, + { + "name": "debug-base", + "hidden": true, + "configuration": "Debug" + }, + { + "name": "msvc_af_cuda_release", + "configurePreset": "msvc_af_cuda", + "inherits": "release-base" + }, + { + "name": "gcc_af_cuda_release", + "configurePreset": "gcc_af_cuda", + "inherits": "release-base" + }, + { + "name": "msvc_af_cpu_release", + "configurePreset": "msvc_af_cpu", + "inherits": "release-base" + }, + { + "name": "gcc_af_cpu_release", + "configurePreset": "gcc_af_cpu", + "inherits": "release-base" + }, + { + "name": "msvc_af_cuda_debug", + "configurePreset": "msvc_af_cuda", + "inherits": "debug-base" + }, + { + "name": "gcc_af_cuda_debug", + "configurePreset": "gcc_af_cuda", + "inherits": "debug-base" + }, + { + "name": "msvc_af_cpu_debug", + "configurePreset": "msvc_af_cpu", + "inherits": "debug-base" + }, + { + "name": "gcc_af_cpu_debug", + "configurePreset": "gcc_af_cpu", + "inherits": "debug-base" + } ] } \ No newline at end of file diff --git a/ci/CMakeUserPresets.json b/ci/CMakeUserPresets.json new file mode 100644 index 0000000..a82111d --- /dev/null +++ b/ci/CMakeUserPresets.json @@ -0,0 +1,90 @@ +{ + "version": 3, + "configurePresets": [ + { + "name": "ci-vcpkg", + "hidden": true, + "cacheVariables": { + "VCPKG_INSTALLED_DIR": "${sourceDir}/vcpkg_installed" + } + }, + { + "name": "ci-buildcache", + "hidden": true, + "environment": { + "BUILDCACHE_DIR": "${sourceDir}/.buildcache", + "BUILDCACHE_ACCURACY": "SLOPPY", + "BUILDCACHE_MAX_CACHE_SIZE": "524288000" + } + }, + + + { + "name": "ci-config-base", + "hidden": true, + "inherits": [ + "ci-vcpkg", + "ci-buildcache" + ], + "cacheVariables": { + "FL_BUILD_TESTS": "ON", + "FL_BUILD_STANDALONE": "ON" + } + }, + { + "name": 
"ci_msvc_af_cpu", + "inherits": [ + "ci-config-base", + "msvc_af_cpu" + ] + }, + { + "name": "ci_msvc_af_cuda", + "inherits": [ + "ci-config-base", + "msvc_af_cuda" + ] + }, + { + "name": "ci_gcc_af_cpu", + "inherits": [ + "ci-config-base", + "gcc_af_cpu" + ] + }, + { + "name": "ci_gcc_af_cuda", + "inherits": [ + "ci-config-base", + "gcc_af_cuda" + ] + } + ], + "buildPresets": [ + { + "name": "ci-build-base", + "hidden": true, + "configuration": "Release" + }, + { + "name": "ci_msvc_af_cpu", + "configurePreset": "ci_msvc_af_cpu", + "inherits": "ci-build-base" + }, + { + "name": "ci_msvc_af_cuda", + "configurePreset": "ci_msvc_af_cuda", + "inherits": "ci-build-base" + }, + { + "name": "ci_gcc_af_cpu", + "configurePreset": "ci_gcc_af_cpu", + "inherits": "ci-build-base" + }, + { + "name": "ci_gcc_af_cuda", + "configurePreset": "ci_gcc_af_cuda", + "inherits": "ci-build-base" + } + ] +} \ No newline at end of file diff --git a/ci/docker/linux/Dockerfile.cpu b/ci/docker/linux/Dockerfile.cpu new file mode 100644 index 0000000..1bbcd0e --- /dev/null +++ b/ci/docker/linux/Dockerfile.cpu @@ -0,0 +1,25 @@ +FROM cachyos/cachyos:latest + +RUN pacman -Syu --noconfirm && \ + pacman -S --noconfirm \ + base-devel \ + cmake \ + ninja \ + git \ + openmpi \ + python \ + python-pip \ + vcpkg \ + wget \ + cloc \ + buildcache-git + +ENV VCPKG_ROOT=/opt/vcpkg + +# Install ArrayFire from script +RUN wget -nv https://arrayfire.s3.amazonaws.com/3.10.0/ArrayFire-v3.10.0_Linux_x86_64.sh -O af_installer.sh && \ + chmod +x af_installer.sh && \ + ./af_installer.sh --include-subdir --prefix=/opt --skip-license --yes && \ + rm af_installer.sh +ENV ArrayFire_DIR=/opt/arrayfire +ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/arrayfire/lib64 diff --git a/ci/docker/linux/Dockerfile.cuda b/ci/docker/linux/Dockerfile.cuda new file mode 100644 index 0000000..1d76e18 --- /dev/null +++ b/ci/docker/linux/Dockerfile.cuda @@ -0,0 +1,35 @@ +FROM cachyos/cachyos:latest + +RUN pacman -Syu --noconfirm && \ + pacman -S --noconfirm \ + base-devel \ + cmake \ + ninja \ + git \ + openmpi \ + cuda \ + cudnn \ + python \ + python-pip \ + vcpkg \ + wget \ + cloc \ + buildcache-git + +ENV VCPKG_ROOT=/opt/vcpkg + +#symlink for cuda stubs +RUN ln -sf /opt/cuda/targets/x86_64-linux/lib/stubs/libcuda.so /usr/lib/libcuda.so.1 && \ + ln -sf /opt/cuda/targets/x86_64-linux/lib/stubs/libcuda.so /usr/lib/libcuda.so + + +# Install ArrayFire from script +RUN wget -nv https://arrayfire.s3.amazonaws.com/3.10.0/ArrayFire-v3.10.0_Linux_x86_64.sh -O af_installer.sh && \ + chmod +x af_installer.sh && \ + ./af_installer.sh --include-subdir --prefix=/opt --skip-license --yes && \ + rm af_installer.sh + +ENV ArrayFire_DIR=/opt/arrayfire +ENV CUDA_HOME=/opt/cuda +ENV PATH=/opt/cuda/bin:$PATH +ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/cuda/lib64:/opt/arrayfire/lib64 diff --git a/.circleci/config.yml b/ci/unused/.circleci/config.yml similarity index 100% rename from .circleci/config.yml rename to ci/unused/.circleci/config.yml diff --git a/.docker/Dockerfile-CPU b/ci/unused/.docker/Dockerfile-CPU similarity index 100% rename from .docker/Dockerfile-CPU rename to ci/unused/.docker/Dockerfile-CPU diff --git a/.docker/Dockerfile-CPU-Base b/ci/unused/.docker/Dockerfile-CPU-Base similarity index 100% rename from .docker/Dockerfile-CPU-Base rename to ci/unused/.docker/Dockerfile-CPU-Base diff --git a/.docker/Dockerfile-CUDA b/ci/unused/.docker/Dockerfile-CUDA similarity index 100% rename from .docker/Dockerfile-CUDA rename to ci/unused/.docker/Dockerfile-CUDA diff --git 
a/.docker/Dockerfile-CUDA-Base b/ci/unused/.docker/Dockerfile-CUDA-Base similarity index 100% rename from .docker/Dockerfile-CUDA-Base rename to ci/unused/.docker/Dockerfile-CUDA-Base diff --git a/.docker/README.md b/ci/unused/.docker/README.md similarity index 100% rename from .docker/README.md rename to ci/unused/.docker/README.md diff --git a/.github/actions/install_core_deps/action.yml b/ci/unused/.github/actions/install-core-deps/action.yml similarity index 71% rename from .github/actions/install_core_deps/action.yml rename to ci/unused/.github/actions/install-core-deps/action.yml index 98a2e21..6627ddb 100644 --- a/.github/actions/install_core_deps/action.yml +++ b/ci/unused/.github/actions/install-core-deps/action.yml @@ -16,6 +16,31 @@ runs: steps: # ]----- Backend dependencies # ArrayFire + - name: Cache ArrayFire (Windows) + id: cache-arrayfire-windows + uses: actions/cache@v4 + if: runner.os == 'Windows' && inputs.backend == 'ArrayFire' + with: + path: C:\tools\ArrayFire + key: arrayfire-windows-3.10.0 + + - name: "Install ArrayFire (Windows)" + if: runner.os == 'Windows' && inputs.backend == 'ArrayFire' && steps.cache-arrayfire-windows.outputs.cache-hit != 'true' + run: | + choco install --no-progress wget -y + cd $HOME + wget -nv https://arrayfire.gateway.scarf.sh/windows/3.10.0/ArrayFire.exe -O ArrayFire.exe + 7z.exe x ArrayFire.exe -o"C:\tools\ArrayFire" -y + rm ArrayFire.exe + shell: bash -el {0} + + - name: Set ArrayFire Env (Windows) + if: runner.os == 'Windows' && inputs.backend == 'ArrayFire' + run: | + echo "ArrayFire_DIR=C:\tools\ArrayFire" >> $GITHUB_ENV + echo "C:\tools\ArrayFire\lib" >> $GITHUB_PATH + shell: bash -el {0} + - name: "Install ArrayFire (Linux)" run: | sudo apt update @@ -25,20 +50,11 @@ runs: sudo apt install arrayfire-cmake=3.8.1-2 arrayfire-headers=3.8.1-2 arrayfire-cpu3-mkl=3.8.1-2 arrayfire-cpu3-dev=3.8.1-2 if: runner.os == 'Linux' && inputs.backend == 'ArrayFire' shell: bash -el {0} + - name: "Install ArrayFire (macOS)" run: brew install arrayfire if: runner.os == 'macOS' && inputs.backend == 'ArrayFire' shell: bash -el {0} - - name: "Install ArrayFire (Windows)" - run: | - choco install --no-progress wget -y - cd $HOME - INSTALLER_NAME="ArrayFire-v3.8.1-CUDA-11.4.exe" - wget --quiet https://arrayfire.s3.amazonaws.com/3.8.1/$INSTALLER_NAME - 7z.exe x $INSTALLER_NAME -o"C:\Program Files\ArrayFire" -y - rm $INSTALLER_NAME - if: runner.os == 'Windows' && inputs.backend == 'ArrayFire' - shell: bash -el {0} # oneDNN - name: Install oneDNN with micromamba uses: mamba-org/setup-micromamba@v1 diff --git a/.github/actions/install_pkg_deps/action.yml b/ci/unused/.github/actions/install_pkg_deps/action.yml similarity index 100% rename from .github/actions/install_pkg_deps/action.yml rename to ci/unused/.github/actions/install_pkg_deps/action.yml diff --git a/.github/workflows/build.yml b/ci/unused/.github/workflows/build.yml similarity index 100% rename from .github/workflows/build.yml rename to ci/unused/.github/workflows/build.yml diff --git a/.github/workflows/build_docs.yml b/ci/unused/.github/workflows/build_docs.yml similarity index 100% rename from .github/workflows/build_docs.yml rename to ci/unused/.github/workflows/build_docs.yml diff --git a/cmake/TestUtils.cmake b/cmake/TestUtils.cmake index 13c5483..8f9a228 100644 --- a/cmake/TestUtils.cmake +++ b/cmake/TestUtils.cmake @@ -1,4 +1,5 @@ -cmake_minimum_required(VERSION 3.5.1) +cmake_minimum_required(VERSION 3.21) +include(fm_target_utilities) set(GTEST_IMPORTED_TARGETS "") @@ -61,6 +62,7 @@ 
function(build_test) PUBLIC ${build_test_PREPROC} ) + if (CMAKE_SYSTEM_NAME STREQUAL "Windows") target_compile_definitions( ${target} @@ -70,4 +72,8 @@ function(build_test) ) endif() gtest_add_tests(TARGET ${target}) + + if(WIN32) + fm_target_copy_dependencies(${target}) + endif() endfunction(build_test) diff --git a/cmake/utils/fm_target_utilities.cmake b/cmake/utils/fm_target_utilities.cmake index 51dbf09..ce2e437 100644 --- a/cmake/utils/fm_target_utilities.cmake +++ b/cmake/utils/fm_target_utilities.cmake @@ -34,11 +34,24 @@ function(fm_glob OUT_VAR) set(SUB_PATHS ${ARG_UNPARSED_ARGUMENTS}) set(GLOB_PATTERNS "") - foreach(SUB_PATH IN LISTS SUB_PATHS) - foreach(PATTERN IN LISTS ARG_PATTERNS) - list(APPEND GLOB_PATTERNS "${SUB_PATH}/${PATTERN}") + if(SUB_PATHS AND ARG_PATTERNS) + # Case 1: Both paths and patterns provided - generate cross-product + foreach(SUB_PATH IN LISTS SUB_PATHS) + foreach(PATTERN IN LISTS ARG_PATTERNS) + if(SUB_PATH) + list(APPEND GLOB_PATTERNS "${SUB_PATH}/${PATTERN}") + else() + list(APPEND GLOB_PATTERNS "${PATTERN}") + endif() + endforeach() endforeach() - endforeach() + elseif(SUB_PATHS) + # Case 2: Only sub_paths provided - treat them as full patterns + set(GLOB_PATTERNS ${SUB_PATHS}) + elseif(ARG_PATTERNS) + # Case 3: Only patterns provided - treat them as full patterns + set(GLOB_PATTERNS ${ARG_PATTERNS}) + endif() if(GLOB_PATTERNS) file(GLOB_RECURSE FOUND_FILES @@ -407,4 +420,231 @@ function(fm_add_clang_format_target TARGET_NAME) COMMENT "Formatting all source files with clang-format..." VERBATIM ) +endfunction() + +function(_fm_ensure_stub_lib) + set(stub_target "fm_link_attachment_stub") + if(TARGET ${stub_target}) + return() + endif() + + # Define a predictable, flat output directory + set(stub_dir "${CMAKE_BINARY_DIR}/_fm_internal") + file(MAKE_DIRECTORY "${stub_dir}") + + # Generate dummy source + set(stub_src "${stub_dir}/stub.c") + if(NOT EXISTS "${stub_src}") + file(WRITE "${stub_src}" "void fm_link_attachment_stub_symbol(void) {}\n") + endif() + + # Create the real static library + add_library(${stub_target} STATIC "${stub_src}") + + # FORCE the output location to be flat. + set_target_properties(${stub_target} PROPERTIES + ARCHIVE_OUTPUT_DIRECTORY "${stub_dir}" + OUTPUT_NAME "fm_link_attachment_stub" + ) + + if(CMAKE_CONFIGURATION_TYPES) + foreach(config ${CMAKE_CONFIGURATION_TYPES}) + string(TOUPPER "${config}" config_upper) + set_target_properties(${stub_target} PROPERTIES + ARCHIVE_OUTPUT_DIRECTORY_${config_upper} "${stub_dir}" + ) + endforeach() + endif() +endfunction() + +#[[.rst: +.. command:: fm_target_attach_dependency + + .. code-block:: cmake + + fm_target_attach_dependency( ) + + Attaches external file dependencies (like DLLs) to a target by creating imported targets. + This allows CMake to track these dependencies. + + :param target: The target to attach dependencies to. + :type target: string + :param mode: Attachment mode. Must be ``LINK`` or ``NOLINK``. + - ``LINK``: The file behaves like a linked library (implicit link). + - ``NOLINK``: The file is attached but not linked (e.g., runtime-only DLL). + :type mode: string + :param files: List of file paths to attach. + :type files: list of strings + + :pre: ``target`` must exist. + :pre: ``mode`` must be ``LINK`` or ``NOLINK``. + + .. note:: + Creates internal targets named ``__attachment_`` for each file. + In ``NOLINK`` mode, a dummy stub library is linked to satisfy CMake's requirement + for SHARED libraries on Windows, preventing LNK1107 errors. 
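+ + For illustration only, a hedged usage sketch (the target name and DLL paths here are hypothetical, not taken from this repository): + + .. code-block:: cmake + + # runtime-only DLL: tracked and copied, but never passed to the linker + fm_target_attach_dependency(my_app NOLINK "C:/deps/bin/runtime_only.dll") + # DLL with an import library next to it: behaves like a linked library + fm_target_attach_dependency(my_app LINK "C:/deps/bin/some_library.dll")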
+#]] +function(fm_target_attach_dependency target mode) + fm_assert_target(${target}) + fm_assert_true( + "${mode}" MATCHES "^(LINK|NOLINK)$" + ) + + _fm_ensure_stub_lib() + + set(stub_dir "${CMAKE_BINARY_DIR}/_fm_internal") + set( + stub_lib_path + "${stub_dir}/${CMAKE_STATIC_LIBRARY_PREFIX}fm_link_attachment_stub${CMAKE_STATIC_LIBRARY_SUFFIX}" + ) + + foreach(file_path ${ARGN}) + # Resolve full path immediately + get_filename_component(abs_path "${file_path}" ABSOLUTE) + get_filename_component(file_name "${file_path}" NAME) + + # Generate unique target name + string(MD5 path_hash "${abs_path}") + set(leaf_target_name "_${target}_${mode}_${path_hash}") + + if(NOT TARGET ${leaf_target_name}) + + # --- PLATFORM LOGIC --- + if(WIN32) + # Windows: Always SHARED IMPORTED. + add_library(${leaf_target_name} SHARED IMPORTED GLOBAL) + set_target_properties(${leaf_target_name} PROPERTIES + IMPORTED_LOCATION "${abs_path}" + ) + + if(mode STREQUAL "NOLINK") + # NOLINK: Point IMPLIB to the dummy stub. + set_target_properties(${leaf_target_name} PROPERTIES + IMPORTED_IMPLIB "${stub_lib_path}" + ) + # Ensure stub is built before linking + add_dependencies(${leaf_target_name} fm_link_attachment_stub) + else() + # LINK: Calculate the real import library path. + get_filename_component(dir_name "${abs_path}" DIRECTORY) + get_filename_component(name_we "${abs_path}" NAME_WE) + + # Construct path: dir / [prefix]filename[suffix] + set(implib_path "${dir_name}/${CMAKE_IMPORT_LIBRARY_PREFIX}${name_we}${CMAKE_IMPORT_LIBRARY_SUFFIX}") + + if(EXISTS "${implib_path}") + set_target_properties(${leaf_target_name} PROPERTIES + IMPORTED_IMPLIB "${implib_path}" + ) + else() + message(WARNING "fm_target_attach_dependency: LINK mode for ${file_name}, but import lib not found at: ${implib_path}") + endif() + endif() + + else() + # Unix/macOS: + # NOLINK -> MODULE (Loadable, not linked) + # LINK -> SHARED (Linked) + if(mode STREQUAL "NOLINK") + add_library(${leaf_target_name} MODULE IMPORTED GLOBAL) + else() + add_library(${leaf_target_name} SHARED IMPORTED GLOBAL) + endif() + + set_target_properties(${leaf_target_name} PROPERTIES + IMPORTED_LOCATION "${abs_path}" + ) + endif() + + endif() + + # Link the imported target to the main target + target_link_libraries(${target} PRIVATE ${leaf_target_name}) + endforeach() +endfunction() + +#[[.rst: +.. command:: fm_target_copy_dependencies + + .. code-block:: cmake + + fm_target_copy_dependencies(<target>) + + Adds a post-build step to copy runtime dependencies (DLLs) to the target's output directory. + Uses the ``$<TARGET_RUNTIME_DLLS:target>`` generator expression. + + :param target: The target to copy dependencies for. + :type target: string + + :pre: ``target`` must exist. + :pre: ``target`` must be an ``EXECUTABLE`` or ``SHARED_LIBRARY``. + :post: Runtime dependencies are copied to ``$<TARGET_FILE_DIR:target>`` after build. +#]] +function(fm_target_copy_dependencies target) + fm_assert_target("${target}") + + get_target_property(TGT_TYPE ${target} TYPE) + fm_assert_true("${TGT_TYPE}" MATCHES "^(EXECUTABLE|SHARED_LIBRARY)$" + REASON "fm_target_copy_dependencies: Target '${target}' is of type '${TGT_TYPE}'. This function only supports EXECUTABLES or SHARED_LIBRARIES." + ) + + get_target_property(_registered ${target} _FM_COPY_DEPS_REGISTERED) + if(_registered) + message(WARNING "fm_target_copy_dependencies(${target}) called multiple times!") + return() + endif() + set_property(TARGET ${target} PROPERTY _FM_COPY_DEPS_REGISTERED TRUE) + + set(RETRY_SCRIPT "${CMAKE_CURRENT_BINARY_DIR}/${target}_copy_retry_$<CONFIG>.cmake") + + # Generate the script. 
We use file(GENERATE) so generator expressions resolve correctly. + file(GENERATE OUTPUT "${RETRY_SCRIPT}" CONTENT " + cmake_minimum_required(VERSION 3.21) + + set(DLLS \"$<TARGET_RUNTIME_DLLS:${target}>\") + set(DEST \"$<TARGET_FILE_DIR:${target}>\") + + # Exit early if no DLLs to copy + if(NOT DLLS) + return() + endif() + + # Retry Loop: Try up to 5 times + foreach(i RANGE 1 5) + # 1. Try to copy ALL files in one go (Fast). + # 'copy_if_different' is idempotent; if 49/50 files succeed, + # the next attempt only copies the 1 failed file. + execute_process( + COMMAND \${CMAKE_COMMAND} -E copy_if_different \${DLLS} \${DEST} + RESULT_VARIABLE CMD_RESULT + ERROR_VARIABLE CMD_ERR + OUTPUT_VARIABLE CMD_OUT + ) + + # 2. Check success + if(CMD_RESULT EQUAL 0) + return() + endif() + + # 3. Handle failure + if(\${i} LESS 5) + # Print a warning but don't fail yet + message(STATUS \"[${target}] Copy failed (Attempt \${i}/5). Retrying in 1s...\") + + # sleep + execute_process(COMMAND \${CMAKE_COMMAND} -E sleep 1) + else() + # Final attempt failed, print error and exit with failure code + message(STATUS \"\${CMD_OUT}\") + message(STATUS \"\${CMD_ERR}\") + message(FATAL_ERROR \"[${target}] Failed to copy dependencies after 5 attempts.\") + endif() + endforeach() + ") + + # Add the post-build step to run the generated script + add_custom_command(TARGET ${target} POST_BUILD + COMMAND ${CMAKE_COMMAND} -P "${RETRY_SCRIPT}" + COMMENT "Propagating runtime dependencies for ${target} ..." + ) endfunction() \ No newline at end of file diff --git a/cmake/utils/fm_tool_utilities.cmake b/cmake/utils/fm_tool_utilities.cmake index ad297a4..48c144f 100644 --- a/cmake/utils/fm_tool_utilities.cmake +++ b/cmake/utils/fm_tool_utilities.cmake @@ -8,25 +8,38 @@ include(fm_assertions) .. code-block:: cmake - fm_find_program(<OUT_VAR> <prog> [args...]) + fm_find_program(<OUT_VAR> <prog> [OPTIONAL] [args...]) Locates an external program and exports its path to the parent scope. + If OPTIONAL is specified, does not error if program is not found. :param OUT_VAR: variable to export program path to :param prog: Name of the program to find - :type prog: string - :param args: Additional arguments to pass to find_program (e.g., PATHS, HINTS) - :type args: optional arguments + :param OPTIONAL: If specified, do not raise FATAL_ERROR if program is not found + :param args: Additional arguments to pass to find_program + + :post: variable is set in PARENT_SCOPE with the full path to the program, or configuration terminates with FATAL_ERROR if not found and not OPTIONAL - :post: variable is set in PARENT_SCOPE with the full path to the program, or configuration terminates with FATAL_ERROR if not found #]] function(fm_find_program OUT_VAR prog) + cmake_parse_arguments(PARSE_ARGV 2 ARG "OPTIONAL" "" "") + + find_program(${OUT_VAR} "${prog}" ${ARG_UNPARSED_ARGUMENTS}) - find_program(${OUT_VAR} "${prog}" ${ARGN}) - + if(${OUT_VAR}) + message(STATUS "${prog} found: ${${OUT_VAR}}") + message(VERBOSE "${prog} location: ${${OUT_VAR}}") + set(${OUT_VAR} "${${OUT_VAR}}" PARENT_SCOPE) + return() + endif() + + if(ARG_OPTIONAL) + message(STATUS "${prog} not found") + set(${OUT_VAR} "" PARENT_SCOPE) + return() + endif() + fm_assert_true(${OUT_VAR} REASON "Program '${prog}' not found") - - set(${OUT_VAR} "${${OUT_VAR}}" PARENT_SCOPE) endfunction() #[[.rst: @@ -34,10 +47,11 @@ endfunction() .. code-block:: cmake - fm_enable_build_cache() + fm_enable_build_cache([OPTIONAL]) Enables BuildCache globally for all targets by setting compiler launcher variables. 
+ :param OPTIONAL: If specified, do not raise FATAL_ERROR if buildcache is not found :pre: buildcache program is found in PATH :post: CMAKE_C_COMPILER_LAUNCHER and CMAKE_CXX_COMPILER_LAUNCHER are set to buildcache in PARENT_SCOPE @@ -54,9 +68,20 @@ endfunction() #]] function(fm_enable_build_cache) - fm_find_program(BUILDCACHE_EXECUTABLE buildcache) + cmake_parse_arguments(PARSE_ARGV 0 ARG "OPTIONAL" "" "") - message(STATUS "Enabling buildcache globally: ${BUILDCACHE_EXECUTABLE}") + if (ARG_OPTIONAL) + fm_find_program(BUILDCACHE_EXECUTABLE buildcache OPTIONAL) + else() + fm_find_program(BUILDCACHE_EXECUTABLE buildcache) + endif() + + if(BUILDCACHE_EXECUTABLE) + message(STATUS "BuildCache globally enabled") + else() + message(STATUS "Couldn't enable BuildCache globally") + return() + endif() set(CMAKE_C_COMPILER_LAUNCHER "${BUILDCACHE_EXECUTABLE}" PARENT_SCOPE) set(CMAKE_CXX_COMPILER_LAUNCHER "${BUILDCACHE_EXECUTABLE}" PARENT_SCOPE) @@ -67,27 +92,25 @@ endfunction() .. code-block:: cmake - fm_find_clang_format() + fm_find_clang_format([OPTIONAL]) - Locates the clang-format executable and exports its path to the parent scope. + Locates a required clang-format executable and exports its path to the parent scope. + If OPTIONAL is specified, does not error if clang-format is not found. - :post: CLANG_FORMAT_EXECUTABLE is set in PARENT_SCOPE with the full path to clang-format, or configuration terminates with FATAL_ERROR if not found - - .. note:: - The clang-format executable must be available in PATH. - - .. warning:: - This function will fail with FATAL_ERROR if clang-format is not found. - Ensure clang-format is installed and available in your system PATH. + :post: CLANG_FORMAT_EXECUTABLE is set in PARENT_SCOPE with the full path to clang-format, or configuration terminates with FATAL_ERROR if not found and not OPTIONAL .. seealso:: Use ``fm_add_clang_format_target()`` from fm_target_utilities to create a format target. 
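+ + A minimal usage sketch (illustrative only; the ``format`` target name is hypothetical and the caller is assumed to guard on the result): + + .. code-block:: cmake + + # don't fail configuration on machines without clang-format + fm_find_clang_format(OPTIONAL) + if(CLANG_FORMAT_EXECUTABLE) + fm_add_clang_format_target(format) + endif()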
#]] function(fm_find_clang_format) - fm_find_program(CLANG_FORMAT_EXECUTABLE clang-format) - - message(STATUS "Found external clang-format: ${CLANG_FORMAT_EXECUTABLE}") + cmake_parse_arguments(PARSE_ARGV 0 ARG "OPTIONAL" "" "") + if(ARG_OPTIONAL) + fm_find_program(CLANG_FORMAT_EXECUTABLE clang-format OPTIONAL) + else() + fm_find_program(CLANG_FORMAT_EXECUTABLE clang-format) + endif() + set(CLANG_FORMAT_EXECUTABLE ${CLANG_FORMAT_EXECUTABLE} PARENT_SCOPE) -endfunction() \ No newline at end of file +endfunction() diff --git a/cmake/utils/toolchain.cmake b/cmake/utils/toolchain.cmake index bca31f5..57071c2 100644 --- a/cmake/utils/toolchain.cmake +++ b/cmake/utils/toolchain.cmake @@ -19,6 +19,9 @@ message(STATUS "appended (${FM_CMAKE_LIBRARY_DIR}/../) cmake/ to cmake module pa include(fm_assertions) include(fm_tool_utilities) +#enable BuildCache +fm_enable_build_cache(OPTIONAL) + #delegate to vcpkg fm_assert_program(vcpkg REASON "fm needs vcpkg" HINTS "$ENV{VCPKG_ROOT}") diff --git a/flashlight/fl/autograd/tensor/AutogradExtensionBackends.h b/flashlight/fl/autograd/tensor/AutogradExtensionBackends.h index 519971d..27e69e9 100644 --- a/flashlight/fl/autograd/tensor/AutogradExtensionBackends.h +++ b/flashlight/fl/autograd/tensor/AutogradExtensionBackends.h @@ -16,6 +16,9 @@ #if FL_USE_CUDNN #include "flashlight/fl/autograd/tensor/backend/cudnn/CudnnAutogradExtension.h" #endif // FL_USE_CUDNN +#if FL_USE_ONEDNN + #include "flashlight/fl/autograd/tensor/backend/onednn/OneDnnAutogradExtension.h" +#endif // FL_USE_ONEDNN namespace fl { @@ -28,4 +31,10 @@ FL_REGISTER_TENSOR_EXTENSION(CudnnAutogradExtension, ArrayFire); #endif // FL_USE_ARRAYFIRE && FL_ARRAYFIRE_USE_CUDA #endif // FL_USE_CUDNN +#if FL_USE_ONEDNN + #if FL_USE_ARRAYFIRE && (FL_ARRAYFIRE_USE_CPU || FL_ARRAYFIRE_USE_OPENCL) +FL_REGISTER_TENSOR_EXTENSION(OneDnnAutogradExtension, ArrayFire); + #endif +#endif // FL_USE_ONEDNN + } // namespace fl diff --git a/flashlight/fl/autograd/tensor/CMakeLists.txt b/flashlight/fl/autograd/tensor/CMakeLists.txt index 944fdc4..8e5ef4d 100644 --- a/flashlight/fl/autograd/tensor/CMakeLists.txt +++ b/flashlight/fl/autograd/tensor/CMakeLists.txt @@ -1,6 +1,7 @@ cmake_minimum_required(VERSION 3.16) option(FL_USE_CUDNN "Build with cuDNN support" OFF) +option(FL_USE_ONEDNN "Build with OneDNN support" OFF) if(FL_USE_CUDNN) find_package(CUDNN) @@ -21,11 +22,18 @@ if (FL_USE_CUDNN) include(${CMAKE_CURRENT_LIST_DIR}/backend/cudnn/CMakeLists.txt) endif() +if (FL_USE_ONEDNN) + find_package(dnnl CONFIG REQUIRED) + include(${CMAKE_CURRENT_LIST_DIR}/backend/onednn/CMakeLists.txt) + target_link_libraries(flashlight PRIVATE DNNL::dnnl) +endif() + target_compile_definitions( flashlight PUBLIC FL_USE_CUDNN=$ + FL_USE_ONEDNN=$ ) target_sources( diff --git a/flashlight/fl/autograd/tensor/backend/onednn/BatchNorm.cpp b/flashlight/fl/autograd/tensor/backend/onednn/BatchNorm.cpp new file mode 100644 index 0000000..f495e24 --- /dev/null +++ b/flashlight/fl/autograd/tensor/backend/onednn/BatchNorm.cpp @@ -0,0 +1,281 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include "flashlight/fl/autograd/tensor/backend/onednn/OneDnnAutogradExtension.h" + +#include +#include +#include + +#include + +#include "flashlight/fl/autograd/tensor/backend/onednn/DnnlUtils.h" +#include "flashlight/fl/tensor/Index.h" + +namespace fl { + +namespace { + +// Flashlight accept HWCN order according to docs +constexpr size_t kHIdx = 0; +constexpr size_t kWIdx = 1; +constexpr size_t kChannelSizeIdx = 2; +constexpr size_t kBatchSizeIdx = 3; + +constexpr auto formatNCHW = dnnl::memory::format_tag::nchw; +constexpr auto formatX = dnnl::memory::format_tag::x; + +int getNfeatures(const Shape& inputShape, const std::vector& axes) { + int nfeatures = 1; + for (auto ax : axes) { + nfeatures *= inputShape.dim(ax); + } + return nfeatures; +} + +dnnl::memory::dims getInputOutputDims( + const int minAxis, + const int maxAxis, + const Tensor& input, + const int nfeatures) { + Shape inDescDims; + if (minAxis == 0) { + inDescDims = Shape( + {1, + 1, + nfeatures, + static_cast(input.elements() / nfeatures)}); + } else { + int batchsz = 1; + for (int i = maxAxis + 1; i < input.ndim(); ++i) { + batchsz *= input.dim(i); + } + inDescDims = Shape( + {1, + static_cast(input.elements() / (nfeatures * batchsz)), + nfeatures, + batchsz}); + } + + dnnl::memory::dims inputOutputDims = { + inDescDims[kBatchSizeIdx], + inDescDims[kChannelSizeIdx], + inDescDims[kHIdx], + inDescDims[kWIdx]}; + + return inputOutputDims; +} + +struct OneDnnBatchNormPayload : detail::AutogradPayloadData { + dnnl::batch_normalization_forward::primitive_desc fwdPrimDesc; + Tensor weights; // combined weight and bias + Tensor bias; + dnnl::memory::dims weightsDims; + dnnl::memory::dims biasDims; + dnnl::memory::desc outputMemoryDescriptor; + dnnl::memory meanMemory; + dnnl::memory varMemory; + dnnl::memory weightsMemory; + dnnl::memory biasMemory; +}; + +} // namespace + +Tensor OneDnnAutogradExtension::batchnorm( + Tensor& saveMean, + Tensor& saveVar, + const Tensor& input, + const Tensor& weight, + const Tensor& bias, + Tensor& runningMean, + Tensor& runningVar, + const std::vector& axes, + const bool train, + const double momentum, + const double epsilon, + std::shared_ptr autogradPayload) { + if (momentum != 0.) { + throw std::runtime_error("OneDNN batchnorm op doesn't support momentum."); + } + if (input.type() == fl::dtype::f16) { + throw std::runtime_error("OneDNN batchnorm op - f16 inputs not supported."); + } + + auto payload = std::make_shared(); + if (train && autogradPayload) { + autogradPayload->data = payload; + } + + auto output = Tensor(input.shape(), input.type()); + int nfeatures = getNfeatures(input.shape(), axes); + + if (runningVar.isEmpty()) { + runningVar = fl::full({nfeatures}, 1., input.type()); + } + + if (runningMean.isEmpty()) { + runningMean = fl::full({nfeatures}, 0., input.type()); + } + + // Check if axes are valid + auto maxAxis = *std::max_element(axes.begin(), axes.end()); + auto minAxis = *std::min_element(axes.begin(), axes.end()); + bool axesContinuous = (axes.size() == (maxAxis - minAxis + 1)); + if (!axesContinuous) { + throw std::invalid_argument("axis array should be continuous"); + } + + auto& dnnlEngine = detail::DnnlEngine::getInstance().getEngine(); + + // Prepare combined weights + // If empty, user specifies affine to false. Both not trainable. + auto weightNonempty = + weight.isEmpty() ? fl::full({nfeatures}, 1., fl::dtype::f32) : weight; + auto biasNonempty = + bias.isEmpty() ? 
fl::full({nfeatures}, 0., fl::dtype::f32) : bias; + + // DNNL only accepts weight and bias as a combined input. + // https://git.io/JLn9X + payload->weights = weightNonempty; + payload->bias = biasNonempty; + payload->weightsDims = detail::convertToDnnlDims({nfeatures}); + payload->biasDims = detail::convertToDnnlDims({nfeatures}); + auto inputOutputDims = getInputOutputDims(minAxis, maxAxis, input, nfeatures); + + // Memory for forward + const detail::DnnlMemoryWrapper inputMemory( + input, inputOutputDims, formatNCHW); + const detail::DnnlMemoryWrapper outputMemory( + output, inputOutputDims, formatNCHW); + const detail::DnnlMemoryWrapper meanMemory( + runningMean, {runningMean.dim(0)}, formatX); + const detail::DnnlMemoryWrapper varMemory( + runningVar, {runningVar.dim(0)}, formatX); + // combined scale and shift (weight and bias) + const detail::DnnlMemoryWrapper weightsMemory( + payload->weights, payload->weightsDims, formatX); + const detail::DnnlMemoryWrapper biasMemory( + payload->bias, payload->biasDims, formatX); + payload->meanMemory = meanMemory.getMemory(); + payload->varMemory = varMemory.getMemory(); + payload->weightsMemory = weightsMemory.getMemory(); + payload->biasMemory = biasMemory.getMemory(); + // Primitives and descriptors + auto kind = train ? dnnl::prop_kind::forward_training + : dnnl::prop_kind::forward_inference; + // https://fburl.com/6latj733 + dnnl::normalization_flags flag = train + ? dnnl::normalization_flags::none + : dnnl::normalization_flags::use_global_stats; + flag = flag | dnnl::normalization_flags::use_scale | + dnnl::normalization_flags::use_shift; + payload->fwdPrimDesc = dnnl::batch_normalization_forward::primitive_desc( + dnnlEngine, + kind, + inputMemory.getDescriptor(), + outputMemory.getDescriptor(), + epsilon, + flag); + payload->outputMemoryDescriptor = outputMemory.getDescriptor(); + auto bn = dnnl::batch_normalization_forward(payload->fwdPrimDesc); + std::unordered_map<int, dnnl::memory> bnFwdArgs = { + {DNNL_ARG_SRC, inputMemory.getMemory()}, + {DNNL_ARG_MEAN, meanMemory.getMemory()}, + {DNNL_ARG_VARIANCE, varMemory.getMemory()}, + {DNNL_ARG_DST, outputMemory.getMemory()}, + {DNNL_ARG_SCALE, weightsMemory.getMemory()}, + {DNNL_ARG_SHIFT, biasMemory.getMemory()}}; + + // Execute + std::vector<dnnl::primitive> network; + std::vector<std::unordered_map<int, dnnl::memory>> fwdArgs = {bnFwdArgs}; + network.push_back(bn); + detail::executeNetwork(network, fwdArgs); + + return output; +} + +std::tuple<Tensor, Tensor, Tensor> OneDnnAutogradExtension::batchnormBackward( + const Tensor& gradOutput, + const Tensor& saveMean, + const Tensor& saveVar, + const Tensor& input, + const Tensor& weight, + const std::vector<int>& axes, + const bool train, + const float epsilon, + std::shared_ptr<detail::AutogradPayload> autogradPayload) { + if (!autogradPayload) { + throw std::invalid_argument( + "OneDnnAutogradExtension::batchnormBackward given null detail::AutogradPayload"); + } + auto payload = + std::static_pointer_cast<OneDnnBatchNormPayload>(autogradPayload->data); + + auto& dnnlEngine = detail::DnnlEngine::getInstance().getEngine(); + + auto maxAxis = *std::max_element(axes.begin(), axes.end()); + auto minAxis = *std::min_element(axes.begin(), axes.end()); + const bool axesContinuous = (axes.size() == (maxAxis - minAxis + 1)); + if (!axesContinuous) { + throw std::invalid_argument("axis array should be continuous"); + } + + const int nfeatures = getNfeatures(input.shape(), axes); + auto inputOutputDims = getInputOutputDims(minAxis, maxAxis, input, nfeatures); + + auto gradInput = Tensor(input.shape(), input.type()); + auto gradWeights = Tensor(payload->weights.shape(), payload->weights.type()); + auto
gradBias = Tensor(payload->bias.shape(), payload->bias.type()); + + const detail::DnnlMemoryWrapper inputMemory( + input, inputOutputDims, formatNCHW); + + // Memory for gradient computation + const detail::DnnlMemoryWrapper gradOutputMem( + gradOutput, inputOutputDims, formatNCHW); + const detail::DnnlMemoryWrapper gradInputMem( + gradInput, inputOutputDims, formatNCHW); + const detail::DnnlMemoryWrapper gradWeightsMem( + gradWeights, payload->weightsDims, formatX); + const detail::DnnlMemoryWrapper gradBiasMem( + gradBias, payload->biasDims, formatX); + + // Primitives and descriptors + auto bwdPrimitiveDesc = dnnl::batch_normalization_backward::primitive_desc( + dnnlEngine, + dnnl::prop_kind::backward, + gradOutputMem.getDescriptor(), + payload->outputMemoryDescriptor, + gradOutputMem.getDescriptor(), + epsilon, + dnnl::normalization_flags::use_scale | + dnnl::normalization_flags::use_shift, + payload->fwdPrimDesc // hint + ); + auto bwdPrim = + std::make_shared(bwdPrimitiveDesc); + // Execute + std::vector networkBackwards; + std::vector> bwdArgs = { + {{DNNL_ARG_SRC, inputMemory.getMemory()}, + {DNNL_ARG_MEAN, payload->meanMemory}, + {DNNL_ARG_VARIANCE, payload->varMemory}, + {DNNL_ARG_SCALE, payload->weightsMemory}, + //TODO dnnl_arg_shift was here, check if something can be optimized bc it's not needed + {DNNL_ARG_DIFF_SRC, gradInputMem.getMemory()}, + {DNNL_ARG_DIFF_DST, gradOutputMem.getMemory()}, + {DNNL_ARG_DIFF_SCALE, gradWeightsMem.getMemory()}, + {DNNL_ARG_DIFF_SHIFT, gradBiasMem.getMemory()}}}; + + networkBackwards.push_back(*bwdPrim); + detail::executeNetwork(networkBackwards, bwdArgs); + + return {gradInput, gradWeights, gradBias}; +}; + +} // namespace fl diff --git a/flashlight/fl/autograd/tensor/backend/onednn/CMakeLists.txt b/flashlight/fl/autograd/tensor/backend/onednn/CMakeLists.txt new file mode 100644 index 0000000..f4dc0fb --- /dev/null +++ b/flashlight/fl/autograd/tensor/backend/onednn/CMakeLists.txt @@ -0,0 +1,12 @@ +cmake_minimum_required(VERSION 3.16) + +target_sources( + flashlight + PRIVATE + ${CMAKE_CURRENT_LIST_DIR}/OneDnnAutogradExtension.cpp + ${CMAKE_CURRENT_LIST_DIR}/Conv2D.cpp + ${CMAKE_CURRENT_LIST_DIR}/Pool2D.cpp + ${CMAKE_CURRENT_LIST_DIR}/RNN.cpp + ${CMAKE_CURRENT_LIST_DIR}/BatchNorm.cpp + ${CMAKE_CURRENT_LIST_DIR}/DnnlUtils.cpp +) diff --git a/flashlight/fl/autograd/tensor/backend/onednn/Conv2D.cpp b/flashlight/fl/autograd/tensor/backend/onednn/Conv2D.cpp new file mode 100644 index 0000000..d6e558f --- /dev/null +++ b/flashlight/fl/autograd/tensor/backend/onednn/Conv2D.cpp @@ -0,0 +1,509 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include "flashlight/fl/autograd/tensor/backend/onednn/OneDnnAutogradExtension.h" + +#include +#include +#include +#include + +#include + +#include "flashlight/fl/autograd/tensor/backend/onednn/DnnlUtils.h" + +using namespace dnnl; + +namespace fl { + +namespace { + +// Input, output: WHCN; weights: WHIO +constexpr size_t kWIdx = 0; +constexpr size_t kHIdx = 1; +constexpr size_t kIOChannelSizeIdx = 2; +constexpr size_t kIOBatchSizeIdx = 3; +constexpr size_t kWeightOutputChannelSizeIdx = 3; + +// Use memory::format_tag::any for memory formatting even if convolution +// inputs are shaped in a particular way. 
+constexpr auto formatAny = memory::format_tag::any; +constexpr auto formatNCHW = memory::format_tag::nchw; +constexpr auto formatBias = memory::format_tag::x; + +struct OneDnnConv2DData { + memory::dims inputDims; + memory::dims weightDims; + memory::dims outputDims; + memory::dims biasDims; + memory::dims strideDims; + memory::dims dilationDims; + memory::dims paddingDims; + // Memory descriptors + memory::desc inputMemDesc; + memory::desc outputMemDesc; + memory::desc weightMemDesc; + memory::desc biasMemDesc; + // used for creating a backward desc + convolution_forward::primitive_desc fwdPrimDesc; +}; + +OneDnnConv2DData createOneDnnConv2DData( + fl::dtype inputType, + const Shape& inputShape, + const Shape& weightsShape, + const Shape& biasShape, + const Shape& outputShape, + const int sx, + const int sy, + const int px, + const int py, + const int dx, + const int dy, + const int groups) { + const dnnl::memory::data_type dataType = detail::dnnlMapToType(inputType); + const auto formatWeight = + (groups == 1) ? memory::format_tag::oihw : memory::format_tag::goihw; + const bool hasBias = biasShape.elements() > 0; + + OneDnnConv2DData out; + // Create memory dims + out.inputDims = detail::convertToDnnlDims( + {inputShape.dim(kIOBatchSizeIdx), + inputShape.dim(kIOChannelSizeIdx), + inputShape.dim(kHIdx), + inputShape.dim(kWIdx)}); + if (groups == 1) { + out.weightDims = detail::convertToDnnlDims( + {weightsShape.dim(kWeightOutputChannelSizeIdx), + inputShape.dim(kIOChannelSizeIdx), + weightsShape.dim(kHIdx), + weightsShape.dim(kWIdx)}); + } else { + out.weightDims = detail::convertToDnnlDims( + {groups, + weightsShape.dim(kWeightOutputChannelSizeIdx) / groups, + inputShape.dim(kIOChannelSizeIdx) / groups, + weightsShape.dim(kHIdx), + weightsShape.dim(kWIdx)}); + } + out.outputDims = detail::convertToDnnlDims( + {inputShape.dim(kIOBatchSizeIdx), + weightsShape.dim(kWeightOutputChannelSizeIdx), + outputShape.dim(kHIdx), + outputShape.dim(kWIdx)}); + out.biasDims = detail::convertToDnnlDims( + {weightsShape.dim(kWeightOutputChannelSizeIdx)}); + out.strideDims = {sy, sx}; + out.paddingDims = {py, px}; + // NB: DNNL treats a dilation of 0 as a standard convolution and indexes + // larger dilations accordingly. See https://git.io/fhAT2 for more. + out.dilationDims = {dy - 1, dx - 1}; + + // Create memory descriptors. using format::any gives the best performance + out.inputMemDesc = memory::desc({out.inputDims}, dataType, formatAny); + out.outputMemDesc = memory::desc({out.outputDims}, dataType, formatAny); + out.weightMemDesc = memory::desc({out.weightDims}, dataType, formatWeight); + out.biasMemDesc = memory::desc({out.biasDims}, dataType, formatAny); + + // + const auto forwardMode = prop_kind::forward_training; + // TODO: determine train mode/assess perf impact of always choosing training + // (primitive cache storage overhead?) + // const auto forwardMode = + // train ? 
prop_kind::forward_training : prop_kind::forward_inference; + + auto& dnnlEngine = detail::DnnlEngine::getInstance().getEngine(); + convolution_forward::primitive_desc fwdPrimitiveDescriptor; + if (hasBias) { + fwdPrimitiveDescriptor = convolution_forward::primitive_desc( + dnnlEngine, + forwardMode, + algorithm::convolution_direct, + out.inputMemDesc, + out.weightMemDesc, + out.biasMemDesc, + out.outputMemDesc, + out.strideDims, + out.dilationDims, + out.paddingDims, + out.paddingDims); + } else { + fwdPrimitiveDescriptor = convolution_forward::primitive_desc( + dnnlEngine, + forwardMode, + algorithm::convolution_direct, + out.inputMemDesc, + out.weightMemDesc, + out.outputMemDesc, + out.strideDims, + out.dilationDims, + out.paddingDims, + out.paddingDims); + } + out.fwdPrimDesc = std::move(fwdPrimitiveDescriptor); + + return out; +} + +} // namespace + +Tensor OneDnnAutogradExtension::conv2d( + const Tensor& input, + const Tensor& weights, + const Tensor& bias, + const int sx, + const int sy, + const int px, + const int py, + const int dx, + const int dy, + const int groups, + std::shared_ptr) { + if (input.type() == fl::dtype::f16) { + throw std::runtime_error("Half precision is not supported in CPU."); + } + + // flashlight input, weight, and output shapes in column-major: + // - Input is WHCN + // - Weights are WHIO + // - Output is WHCN + // Since ArrayFire is column major, getting a raw pointer (1D + // representation) of these shapes and viewing as if the representation is + // row major transposes along all axis into NCHW for the input and output + // and OIHW for the weights + auto output = Tensor( + {1 + + (input.dim(kWIdx) + (2 * px) - (1 + (weights.dim(kWIdx) - 1) * dx)) / + sx, + 1 + + (input.dim(kHIdx) + (2 * py) - (1 + (weights.dim(kHIdx) - 1) * dy)) / + sy, + weights.dim(kWeightOutputChannelSizeIdx), + input.dim(kIOBatchSizeIdx)}, + input.type()); + auto hasBias = bias.elements() > 0; + + auto formatWeight = + (groups == 1) ? 
memory::format_tag::oihw : memory::format_tag::goihw; + auto& dnnlEngine = detail::DnnlEngine::getInstance().getEngine(); + + /********************************* Forward *******************************/ + OneDnnConv2DData conv2DData = createOneDnnConv2DData( + input.type(), + input.shape(), + weights.shape(), + bias.shape(), + output.shape(), + sx, + sy, + px, + py, + dx, + dy, + groups); + + // Create memory + const detail::DnnlMemoryWrapper inputMemInit( + input, {conv2DData.inputDims}, formatNCHW); + const detail::DnnlMemoryWrapper outputMemInit( + output, {conv2DData.outputDims}, formatNCHW); + const detail::DnnlMemoryWrapper weightsMem( + weights, {conv2DData.weightDims}, formatWeight); + + // Network for execution + std::vector network; + std::vector> fwdArgs; + + // DNNL suggests checking if the layout requested for the convolution + // is different from NCHW/OIHW (even if specified), and reordering if + // necessary, since the convolution itself may request a different + // ordering + auto inputDesc = conv2DData.fwdPrimDesc.src_desc(); + auto weightsDesc = conv2DData.fwdPrimDesc.weights_desc(); + auto outputDesc = conv2DData.fwdPrimDesc.dst_desc(); + // Input + auto inputMemory = detail::dnnlAlignOrdering( + network, fwdArgs, inputMemInit.getMemory(), inputDesc); + auto weightsMemory = detail::dnnlAlignOrdering( + network, fwdArgs, weightsMem.getMemory(), weightsDesc); + // Output - adds a reorder after the conv if needed + auto outputMemory = outputMemInit.getMemory(); + if (outputMemInit.getMemory().get_desc() != outputDesc) { + outputMemory = memory(outputDesc, dnnlEngine); + } + + // Create convolution + std::shared_ptr conv; + const detail::DnnlMemoryWrapper biasMemory( + bias, conv2DData.biasDims, formatBias); + conv = std::make_shared(conv2DData.fwdPrimDesc); + + network.push_back(*conv); + + // Conv fwd args + std::unordered_map convFwdArgs = { + {DNNL_ARG_SRC, inputMemory}, + {DNNL_ARG_WEIGHTS, weightsMemory}, + {DNNL_ARG_DST, outputMemory}}; + if (hasBias) { + convFwdArgs[DNNL_ARG_BIAS] = biasMemory.getMemory(); + } + fwdArgs.push_back(convFwdArgs); + + // Add output reordering if needed + if (outputMemory != outputMemInit.getMemory()) { + network.push_back(dnnl::reorder(outputMemory, outputMemInit.getMemory())); + fwdArgs.push_back( + {{DNNL_ARG_FROM, outputMemory}, + {DNNL_ARG_TO, outputMemInit.getMemory()}}); + } + + // Run + detail::executeNetwork(network, fwdArgs); + + return output; +} + +Tensor OneDnnAutogradExtension::conv2dBackwardData( + const Tensor& gradOutput, + const Tensor& input, + const Tensor& weights, + const int sx, + const int sy, + const int px, + const int py, + const int dx, + const int dy, + const int groups, + std::shared_ptr, + std::shared_ptr) { + auto gradInput = Tensor(input.shape(), input.type()); // Result + + auto formatWeight = + (groups == 1) ? 
memory::format_tag::oihw : memory::format_tag::goihw; + auto& dnnlEngineBwd = detail::DnnlEngine::getInstance().getEngine(); + + Tensor bias; // dummy + OneDnnConv2DData conv2DData = createOneDnnConv2DData( + input.type(), + input.shape(), + weights.shape(), + bias.shape(), + gradOutput.shape(), // has the same shape as the Conv output + sx, + sy, + px, + py, + dx, + dy, + groups); + + // Backward descriptor + convolution_backward_data::primitive_desc bwdDataPrimitiveDesc( + dnnlEngineBwd, + algorithm::convolution_direct, + conv2DData.inputMemDesc, + conv2DData.weightMemDesc, + conv2DData.outputMemDesc, + conv2DData.strideDims, + conv2DData.dilationDims, + conv2DData.paddingDims, + conv2DData.paddingDims, + conv2DData.fwdPrimDesc); + // Primitive descriptor + auto bwdData = + std::make_shared(bwdDataPrimitiveDesc); + + // Create memory + const detail::DnnlMemoryWrapper gradOutputMemInit( + gradOutput, conv2DData.outputDims, formatNCHW); + const detail::DnnlMemoryWrapper gradInputMemInit( + gradInput, conv2DData.inputDims, formatNCHW); + const detail::DnnlMemoryWrapper weightsMemInitBwd( + weights, conv2DData.weightDims, formatWeight); + + std::vector networkBackwards; + std::vector> bwdDataArgs; + + // Check for reorderings + auto gradOutputDesc = bwdDataPrimitiveDesc.diff_dst_desc(); + auto weightsDesc = bwdDataPrimitiveDesc.weights_desc(); + auto gradInputDesc = bwdDataPrimitiveDesc.diff_src_desc(); + auto gradOutputMemory = detail::dnnlAlignOrdering( + networkBackwards, + bwdDataArgs, + gradOutputMemInit.getMemory(), + gradOutputDesc); + auto weightsMemoryBackwards = detail::dnnlAlignOrdering( + networkBackwards, + bwdDataArgs, + weightsMemInitBwd.getMemory(), + weightsDesc); + auto gradInputMemory = gradInputMemInit.getMemory(); + // Don't reorder the gradient until after the conv + if (gradInputMemInit.getMemory().get_desc() != gradInputDesc) { + gradInputMemory = memory(gradInputDesc, dnnlEngineBwd); + } + + // Convolution backwards + auto convBwdData = + std::make_shared(bwdDataPrimitiveDesc); + + bwdDataArgs.push_back( + {{DNNL_ARG_DIFF_SRC, gradInputMemory}, + {DNNL_ARG_WEIGHTS, weightsMemoryBackwards}, + {DNNL_ARG_DIFF_DST, gradOutputMemory}}); + networkBackwards.push_back(*convBwdData); + + // Reorder the output (which is gradInput here) if necessary + if (gradInputMemory != gradInputMemInit.getMemory()) { + networkBackwards.push_back( + dnnl::reorder(gradInputMemory, gradInputMemInit.getMemory())); + bwdDataArgs.push_back( + {{DNNL_ARG_FROM, gradInputMemory}, + {DNNL_ARG_TO, gradInputMemInit.getMemory()}}); + } + + detail::executeNetwork(networkBackwards, bwdDataArgs); + + return gradInput; +} + +std::pair OneDnnAutogradExtension::conv2dBackwardFilterBias( + const Tensor& gradOutput, + const Tensor& input, + const Tensor& weights, + const Tensor& bias, + const int sx, + const int sy, + const int px, + const int py, + const int dx, + const int dy, + const int groups, + std::shared_ptr, + std::shared_ptr, + std::shared_ptr) { + auto gradWeights = Tensor(weights.shape(), weights.type()); // Result + + auto formatWeight = + (groups == 1) ? 
memory::format_tag::oihw : memory::format_tag::goihw; + auto& dnnlEngineBwd = detail::DnnlEngine::getInstance().getEngine(); + OneDnnConv2DData conv2DData = createOneDnnConv2DData( + input.type(), + input.shape(), + weights.shape(), + bias.shape(), + gradOutput.shape(), // has the same shape as the Conv output + sx, + sy, + px, + py, + dx, + dy, + groups); + + Tensor gradBias; + bool computeBiasGrad = !bias.isEmpty() && !conv2DData.biasMemDesc.is_zero(); + if (computeBiasGrad) { + gradBias = Tensor(bias.shape(), bias.type()); + } + + // Weight backward descriptor + convolution_backward_weights::primitive_desc bwdWeightPrimitiveDesc; + if (computeBiasGrad) { + bwdWeightPrimitiveDesc = convolution_backward_weights::primitive_desc( + dnnlEngineBwd, + algorithm::convolution_direct, + conv2DData.inputMemDesc, + conv2DData.weightMemDesc, + conv2DData.biasMemDesc, + conv2DData.outputMemDesc, + conv2DData.strideDims, + conv2DData.dilationDims, + conv2DData.paddingDims, + conv2DData.paddingDims, + conv2DData.fwdPrimDesc); + } else { + bwdWeightPrimitiveDesc = convolution_backward_weights::primitive_desc( + dnnlEngineBwd, + algorithm::convolution_direct, + conv2DData.inputMemDesc, + conv2DData.weightMemDesc, + conv2DData.outputMemDesc, + conv2DData.strideDims, + conv2DData.dilationDims, + conv2DData.paddingDims, + conv2DData.paddingDims, + conv2DData.fwdPrimDesc); + } + // Weight backward primitive descriptor + auto bwdWeights = + std::make_shared(bwdWeightPrimitiveDesc); + + // Create memory + const detail::DnnlMemoryWrapper inputRawMemInitBwd( + input, conv2DData.inputDims, formatNCHW); + const detail::DnnlMemoryWrapper gradOutputMemInit( + gradOutput, conv2DData.outputDims, formatNCHW); + const detail::DnnlMemoryWrapper gradWeightsMemInit( + gradWeights, conv2DData.weightDims, formatWeight); + + std::vector networkBackwards; + std::vector> bwdWeightsArgs; + + // Check for reorderings, reorder if needed + auto inputDesc = bwdWeightPrimitiveDesc.src_desc(); + auto gradOutputDesc = bwdWeightPrimitiveDesc.diff_dst_desc(); + auto gradWeightsDesc = bwdWeightPrimitiveDesc.diff_weights_desc(); + auto inputMemoryBackwards = detail::dnnlAlignOrdering( + networkBackwards, + bwdWeightsArgs, + inputRawMemInitBwd.getMemory(), + inputDesc); + auto gradOutputMemory = detail::dnnlAlignOrdering( + networkBackwards, + bwdWeightsArgs, + gradOutputMemInit.getMemory(), + gradOutputDesc); + // Don't reorder the grads until after the conv bwd + auto gradWeightsMemory = gradWeightsMemInit.getMemory(); + if (gradWeightsMemInit.getMemory().get_desc() != gradWeightsDesc) { + gradWeightsMemory = memory(gradWeightsDesc, dnnlEngineBwd); + } + + // Create the convolution backward weight + std::unordered_map bwdConvWeightsArgs = { + {DNNL_ARG_SRC, inputMemoryBackwards}, + {DNNL_ARG_DIFF_WEIGHTS, gradWeightsMemory}, + {DNNL_ARG_DIFF_DST, gradOutputMemory}}; + + if (computeBiasGrad) { + const detail::DnnlMemoryWrapper gradBiasMem( + gradBias, conv2DData.biasDims, formatBias); + bwdConvWeightsArgs[DNNL_ARG_DIFF_BIAS] = gradBiasMem.getMemory(); + } else { + } + networkBackwards.push_back(*bwdWeights); + bwdWeightsArgs.push_back(bwdConvWeightsArgs); + + // Reorder weight gradients if necessary + if (gradWeightsMemory != gradWeightsMemInit.getMemory()) { + networkBackwards.push_back( + dnnl::reorder(gradWeightsMemory, gradWeightsMemInit.getMemory())); + bwdWeightsArgs.push_back( + {{DNNL_ARG_FROM, gradWeightsMemory}, + {DNNL_ARG_TO, gradWeightsMemInit.getMemory()}}); + } + + detail::executeNetwork(networkBackwards, bwdWeightsArgs); + 
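+  // For reference, with purely illustrative numbers: a WHCN input of
+  // {10, 9, 8, 7} convolved with WHIO weights {4, 3, 8, 6} using px = 2,
+  // py = 1 and unit strides/dilations yields a forward output of
+  // {1 + (10 + 4 - 4) / 1, 1 + (9 + 2 - 3) / 1, 6, 7} = {11, 9, 6, 7}.
+  // The gradients returned below always match the weights/bias shapes, and
+  // gradBias stays empty when computeBiasGrad is false.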
+ return {gradWeights, gradBias}; +} + +} // namespace fl diff --git a/flashlight/fl/autograd/tensor/backend/onednn/DnnlUtils.cpp b/flashlight/fl/autograd/tensor/backend/onednn/DnnlUtils.cpp new file mode 100644 index 0000000..5fa5530 --- /dev/null +++ b/flashlight/fl/autograd/tensor/backend/onednn/DnnlUtils.cpp @@ -0,0 +1,158 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include "flashlight/fl/autograd/tensor/backend/onednn/DnnlUtils.h" + +#include +#include + +#if FL_BACKEND_OPENCL + #include +#endif + +#include "flashlight/fl/common/Defines.h" +#include "flashlight/fl/tensor/Compute.h" +#include "flashlight/fl/tensor/TensorBase.h" + +#if FL_BACKEND_OPENCL + #include "flashlight/fl/common/OpenClUtils.h" +#endif + +namespace fl::detail { + +DnnlStream::DnnlStream(dnnl::engine engine) { +#if FL_BACKEND_OPENCL + stream_ = dnnl::ocl_interop::make_stream(engine, fl::ocl::getQueue()); +#else + stream_ = dnnl::stream(engine); +#endif +} + +dnnl::stream& DnnlStream::getStream() { + return stream_; +} + +DnnlStream& DnnlStream::getInstance() { + static DnnlStream instance(DnnlEngine::getInstance().getEngine()); + return instance; +} + +DnnlEngine::DnnlEngine() { +#if FL_BACKEND_OPENCL + engine_ = dnnl::ocl_interop::make_engine( + fl::ocl::getDeviceId(), fl::ocl::getContext()); +#else + engine_ = dnnl::engine(dnnl::engine::kind::cpu, 0); +#endif +} + +dnnl::engine& DnnlEngine::getEngine() { + return engine_; +} + +DnnlEngine& DnnlEngine::getInstance() { + static DnnlEngine instance; + return instance; +} + +dnnl::memory::dims convertToDnnlDims(const std::vector& shape) { + return dnnl::memory::dims(shape.begin(), shape.end()); +} + +dnnl::memory::dims convertShapeToDnnlDims(const Shape& shape) { + return convertToDnnlDims(shape.get()); +} + +DnnlMemoryWrapper::DnnlMemoryWrapper( + const Tensor& tensor, + dnnl::memory::dims dims, + dnnl::memory::format_tag format) { +#if FL_BACKEND_OPENCL + fl::ocl::DevicePtrOpenCl _devicePtr(tensor); + cl_mem* buffer = _devicePtr.getAsClMem(); + devicePtr_ = std::move(_devicePtr); +#else + devicePtr_ = fl::DevicePtr(tensor); + void* buffer = devicePtr_.get(); +#endif + descriptor_ = + dnnl::memory::desc({dims}, detail::dnnlMapToType(tensor.type()), format); + memory_ = dnnl::memory( + descriptor_, detail::DnnlEngine::getInstance().getEngine(), buffer); +} + +DnnlMemoryWrapper& DnnlMemoryWrapper::operator=(DnnlMemoryWrapper&& other) { + devicePtr_ = std::move(other.devicePtr_); + memory_ = std::move(other.memory_); + descriptor_ = std::move(other.descriptor_); + return *this; +} + +dnnl::memory DnnlMemoryWrapper::getMemory() const { + return memory_; +} + +dnnl::memory::desc DnnlMemoryWrapper::getDescriptor() const { + return descriptor_; +} + +dnnl::memory dnnlAlignOrdering( + std::vector& net, + std::vector>& netArgs, + const dnnl::memory& memory, + const dnnl::memory::desc& desc) { + auto memoryOut = memory; + if (memory.get_desc() != desc) { + // use the ordering requested by the descriptor + memoryOut = + dnnl::memory(desc, detail::DnnlEngine::getInstance().getEngine()); + net.push_back(dnnl::reorder(memory, memoryOut)); + netArgs.push_back({{DNNL_ARG_FROM, memory}, {DNNL_ARG_TO, memoryOut}}); + } + return memoryOut; +} + +void executeNetwork( + std::vector& net, + std::vector>& netArgs) { + if (net.size() != netArgs.size()) { + throw std::invalid_argument( + "executeNetwork - given different size nets and 
netArgs"); + } + // TODO{fl::Tensor}{macros} -- improve this to work with other backend interop + // If on the CPU backend, there isn't an AF computation stream that facilitates + // enforcing that inputs to computation are ready; we're required to wait + // until all AF operations are done + if (FL_BACKEND_CPU) { + fl::sync(); + } + + for (size_t i = 0; i < net.size(); ++i) { + net.at(i).execute(DnnlStream::getInstance().getStream(), netArgs.at(i)); + } + + // TODO{fl::Tensor}{macros} -- improve this to work with other backend interop + if (FL_BACKEND_CPU) { + // Block the executing thread until the work is complete + DnnlStream::getInstance().getStream().wait(); + } +} + +dnnl::algorithm dnnlMapToPoolingMode(const PoolingMode mode) { + switch (mode) { + case PoolingMode::MAX: + return dnnl::algorithm::pooling_max; + case PoolingMode::AVG_INCLUDE_PADDING: + return dnnl::algorithm::pooling_avg_include_padding; + case PoolingMode::AVG_EXCLUDE_PADDING: + return dnnl::algorithm::pooling_avg_exclude_padding; + default: + throw std::invalid_argument("unsupported pooling mode for oneDNN"); + } +} + +} // namespace fl diff --git a/flashlight/fl/autograd/tensor/backend/onednn/DnnlUtils.h b/flashlight/fl/autograd/tensor/backend/onednn/DnnlUtils.h new file mode 100644 index 0000000..07be6db --- /dev/null +++ b/flashlight/fl/autograd/tensor/backend/onednn/DnnlUtils.h @@ -0,0 +1,147 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include <vector> + +#include <dnnl.hpp> + +#include "flashlight/fl/common/Defines.h" +#include "flashlight/fl/common/DevicePtr.h" +#include "flashlight/fl/tensor/Shape.h" +#include "flashlight/fl/tensor/Types.h" + +namespace fl { + +class Tensor; + +namespace detail { + +/** + * A singleton class that contains a static instance of a dnnl::stream. + */ +class DnnlStream { + public: + DnnlStream(dnnl::engine engine); + ~DnnlStream() = default; + + /// Prohibit assignment + DnnlStream& operator=(DnnlStream const& s) = delete; + + dnnl::stream& getStream(); + + static DnnlStream& getInstance(); + + private: + dnnl::stream stream_; +}; + +/** + * A singleton class that contains a static instance of a dnnl::engine. + */ +class DnnlEngine { + public: + DnnlEngine(); + ~DnnlEngine() = default; + + /// Prohibit assignment + DnnlEngine& operator=(DnnlEngine const& e) = delete; + + dnnl::engine& getEngine(); + + static DnnlEngine& getInstance(); + + private: + dnnl::engine engine_; +}; + +/** + * Helper for converting a Flashlight Shape into a DNNL-compatible input + * for dnnl::memory::dims. + */ +dnnl::memory::dims convertToDnnlDims(const std::vector& dims); +dnnl::memory::dims convertShapeToDnnlDims(const Shape& shape); + +/** + * A light wrapper around dnnl::memory that manages underlying memory lifetime + * in accordance with fl::DevicePtr.
+ */ +class DnnlMemoryWrapper { + public: + DnnlMemoryWrapper( + const Tensor& tensor, + dnnl::memory::dims dims, + dnnl::memory::format_tag format); + DnnlMemoryWrapper() = default; + + DnnlMemoryWrapper& operator=(DnnlMemoryWrapper&& other); + + dnnl::memory getMemory() const; + + dnnl::memory::desc getDescriptor() const; + + private: + dnnl::memory::desc descriptor_; + dnnl::memory memory_; + fl::DevicePtr devicePtr_; +}; + +/** + * Given some an dnnl network (a ``std::vector``), a + * ``dnnl::memory`` with some ordering, and a + * ``dnnl::memory::primitive_desc``, determines whether or not the memory + * needs to be ordered based on the primitive descriptor's required ordering. + * + * If so, adds a ``dnnl::reorder`` layer to the network, and returns a new + * memory descriptor that will be properly reordered. + */ +dnnl::memory dnnlAlignOrdering( + std::vector& net, + std::vector>& netArgs, + const dnnl::memory& memory, + const dnnl::memory::desc& desc); + +/** + * Executes a sequence of DNNL primitives in the default execution stream with + * the default execution engine. + * + * For each primitive, passes the corresponding arguments map for that index + * to the execution stream. The number of primitives and the number of + * arguments must be equal, else throws. + * + * Blocks calling thread until the enqueued work has been completed. + */ +void executeNetwork( + std::vector& net, + std::vector>& args); + +/** + * Given a flashlight pooling mode, returns the corresponding dnnl pooling + * mode. + */ +dnnl::algorithm dnnlMapToPoolingMode(const PoolingMode mode); + +/** + * Maps an ArrayFire array datatype into the corresponding DNNL datatype. + * + * Needs to be explicitly inlined due to a bug with DNNL. + */ +inline dnnl::memory::data_type dnnlMapToType(const fl::dtype t) { + if (t == fl::dtype::f16) { + return dnnl::memory::data_type::f16; + } else if (t == fl::dtype::f32) { + return dnnl::memory::data_type::f32; + } else if (t == fl::dtype::f64) { + throw std::invalid_argument("float64 is not supported by DNNL"); + } else { + throw std::invalid_argument("data type not supported with DNNL"); + } +} + +} // namespace detail +} // namespace fl diff --git a/flashlight/fl/autograd/tensor/backend/onednn/OneDnnAutogradExtension.cpp b/flashlight/fl/autograd/tensor/backend/onednn/OneDnnAutogradExtension.cpp new file mode 100644 index 0000000..d180fec --- /dev/null +++ b/flashlight/fl/autograd/tensor/backend/onednn/OneDnnAutogradExtension.cpp @@ -0,0 +1,18 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include "flashlight/fl/autograd/tensor/backend/onednn/OneDnnAutogradExtension.h" + +namespace fl { + +bool OneDnnAutogradExtension::isDataTypeSupported( + const fl::dtype& dtype) const { + // fp16 computation is not supported with onednn + return dtype != fl::dtype::f16; +} + +} // namespace fl diff --git a/flashlight/fl/autograd/tensor/backend/onednn/OneDnnAutogradExtension.h b/flashlight/fl/autograd/tensor/backend/onednn/OneDnnAutogradExtension.h new file mode 100644 index 0000000..310ecb9 --- /dev/null +++ b/flashlight/fl/autograd/tensor/backend/onednn/OneDnnAutogradExtension.h @@ -0,0 +1,145 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#pragma once + +#include "flashlight/fl/autograd/tensor/AutogradExtension.h" + +namespace fl { + +class OneDnnAutogradExtension : public AutogradExtension { + // TODO(jacobkahn): implement getEngine + + public: + bool isDataTypeSupported(const fl::dtype& dtype) const override; + + /**************************** Forward ****************************/ + Tensor conv2d( + const Tensor& input, + const Tensor& weights, + const Tensor& bias, + const int sx, + const int sy, + const int px, + const int py, + const int dx, + const int dy, + const int groups, + std::shared_ptr payload) override; + + Tensor pool2d( + const Tensor& input, + const int wx, + const int wy, + const int sx, + const int sy, + const int px, + const int py, + const PoolingMode mode, + std::shared_ptr payload) override; + + Tensor batchnorm( + Tensor& saveMean, + Tensor& saveVar, + const Tensor& input, + const Tensor& weight, + const Tensor& bias, + Tensor& runningMean, + Tensor& runningVar, + const std::vector& axes, + const bool train, + const double momentum, + const double epsilon, + std::shared_ptr payload) override; + + std::tuple rnn( + const Tensor& input, + const Tensor& hiddenState, + const Tensor& cellState, + const Tensor& weights, + const int hiddenSize, + const int numLayers, + const RnnMode mode, + const bool bidirectional, + const float dropout, + std::shared_ptr payload) override; + + /**************************** Backward ****************************/ + // ]----- Convolution + Tensor conv2dBackwardData( + const Tensor& gradOutput, + const Tensor& input, + const Tensor& weight, + const int sx, + const int sy, + const int px, + const int py, + const int dx, + const int dy, + const int groups, + std::shared_ptr dataGradBenchmark, + std::shared_ptr payload) override; + + std::pair conv2dBackwardFilterBias( + const Tensor& gradOutput, + const Tensor& input, + const Tensor& weights, + const Tensor& bias, + const int sx, + const int sy, + const int px, + const int py, + const int dx, + const int dy, + const int groups, + std::shared_ptr filterBench, + std::shared_ptr biasBench, + std::shared_ptr autogradPayload) override; + + // ]----- pool2D + Tensor pool2dBackward( + const Tensor& gradOutput, + const Tensor& input, + const Tensor& poolOutput, + const int wx, + const int wy, + const int sx, + const int sy, + const int px, + const int py, + const PoolingMode mode, + std::shared_ptr payload) override; + + // ]----- batchnorm + std::tuple batchnormBackward( + const Tensor& gradOutput, + const Tensor& saveMean, + const Tensor& saveVar, + const Tensor& input, + const Tensor& weight, + const std::vector& axes, + const bool train, + const float epsilon, + std::shared_ptr payload) override; + + // ]----- rnn + std::tuple rnnBackward( + const Tensor& input, + const Tensor& hiddenState, + const Tensor& cellState, + const Tensor& weights, + const std::shared_ptr gradData, + const Tensor& output, + const int numLayers, + const int hiddenSize, + const RnnMode mode, + const bool bidirectional, + const float dropProb, + std::shared_ptr payload) override; +}; + +} // namespace fl diff --git a/flashlight/fl/autograd/tensor/backend/onednn/Pool2D.cpp b/flashlight/fl/autograd/tensor/backend/onednn/Pool2D.cpp new file mode 100644 index 0000000..bf094b6 --- /dev/null +++ b/flashlight/fl/autograd/tensor/backend/onednn/Pool2D.cpp @@ -0,0 +1,249 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. 
+ * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include "flashlight/fl/autograd/tensor/backend/onednn/OneDnnAutogradExtension.h" + +#include +#include + +#include + +#include "flashlight/fl/autograd/tensor/backend/onednn/DnnlUtils.h" +#include "flashlight/fl/tensor/Shape.h" +#include "flashlight/fl/tensor/TensorBase.h" + +using namespace dnnl; + +namespace fl { + +namespace { + +constexpr size_t kWIdx = 0; +constexpr size_t kHIdx = 1; +constexpr size_t kChannelSizeIdx = 2; +constexpr size_t kBatchSizeIdx = 3; + +// Use memory::format_tag::any for memory formatting even if pool +// inputs are shaped in a particular way. +constexpr auto formatAny = memory::format_tag::any; +constexpr auto formatNCHW = memory::format_tag::nchw; + +struct DimsData { + memory::dims inputDims; + memory::dims outputDims; + memory::dims windowDims; + memory::dims strideDims; + std::vector paddingDims; +}; + +DimsData getDimsData( + const Shape& input, + const Shape& output, + const int wx, + const int wy, + const int sx, + const int sy, + const int px, + const int py) { + DimsData d; + d.inputDims = detail::convertToDnnlDims( + {input.dim(kBatchSizeIdx), + input.dim(kChannelSizeIdx), + input.dim(kHIdx), + input.dim(kWIdx)}); + d.outputDims = detail::convertToDnnlDims( + {input.dim(kBatchSizeIdx), + input.dim(kChannelSizeIdx), + output.dim(kHIdx), + output.dim(kWIdx)}); + d.windowDims = {wy, wx}; + d.strideDims = {sy, sx}; + d.paddingDims = {py, px}; + return d; +} + +} // namespace + +struct OneDnnPool2DPayload : detail::AutogradPayloadData { + memory workspace; + memory outputMemory; + DimsData dimsData; + pooling_forward::primitive_desc poolingFwdPrimDesc; +}; + +Tensor OneDnnAutogradExtension::pool2d( + const Tensor& input, + const int wx, + const int wy, + const int sx, + const int sy, + const int px, + const int py, + const PoolingMode mode, + std::shared_ptr autogradPayload) { + const bool train = (autogradPayload != nullptr); + auto payload = std::make_shared(); + if (train) { + autogradPayload->data = payload; + } + + // inputX x inputY x channels x batch + auto ix = input.dim(kWIdx); + auto iy = input.ndim() > kHIdx ? input.dim(kHIdx) : 1; + auto c = input.ndim() > kChannelSizeIdx ? input.dim(kChannelSizeIdx) : 1; + auto b = input.ndim() > kBatchSizeIdx ? input.dim(kBatchSizeIdx) : 1; + + auto output = Tensor( + {1 + (ix + 2 * px - wx) / sx, 1 + (iy + 2 * py - wy) / sy, c, b}, + input.type()); + + payload->dimsData = + getDimsData({ix, iy, c, b}, output.shape(), wx, wy, sx, sy, px, py); + auto& d = payload->dimsData; + auto dataType = detail::dnnlMapToType(input.type()); + + // Memory desc + auto inputMD = memory::desc({d.inputDims}, dataType, formatNCHW); + auto outputMD = memory::desc({d.outputDims}, dataType, formatAny); + + // Memory + auto& dnnlEngine = detail::DnnlEngine::getInstance().getEngine(); + const detail::DnnlMemoryWrapper inputMemInit( + input, {d.inputDims}, formatNCHW); + const detail::DnnlMemoryWrapper outputMemInit( + output, {d.outputDims}, formatNCHW); + + // Choose a mode based on whether gradients are needed + auto forwardMode = train ? 
prop_kind::forward : prop_kind::forward_inference; + + // Descriptors + auto poolingMode = detail::dnnlMapToPoolingMode(mode); + payload->poolingFwdPrimDesc = pooling_forward::primitive_desc( + dnnlEngine, + forwardMode, + poolingMode, + inputMD, + outputMD, + d.strideDims, + d.windowDims, + memory::dims{0, 0}, // dilation -- TODO: add to API + d.paddingDims, + d.paddingDims); + auto& primDesc = payload->poolingFwdPrimDesc; + + // Network + std::vector network; + std::vector> fwdArgs; + // Reorder if needed + auto inputDesc = primDesc.src_desc(); + auto outputDesc = primDesc.dst_desc(); + auto inputMemory = detail::dnnlAlignOrdering( + network, fwdArgs, inputMemInit.getMemory(), inputDesc); + payload->outputMemory = outputMemInit.getMemory(); + if (outputMemInit.getMemory().get_desc() != outputDesc) { + payload->outputMemory = memory(outputDesc, dnnlEngine); + } + // Workspace and layer (only training mode requires a workspace) + std::shared_ptr pooling; + std::unordered_map fwdPoolingArgs; + fwdPoolingArgs[DNNL_ARG_SRC] = inputMemory; + fwdPoolingArgs[DNNL_ARG_DST] = payload->outputMemory; + if (train) { + payload->workspace = memory(primDesc.workspace_desc(), dnnlEngine); + pooling = std::make_shared(primDesc); + fwdPoolingArgs[DNNL_ARG_WORKSPACE] = payload->workspace; + } else { + pooling = std::make_shared(primDesc); + } + network.push_back(*pooling); + fwdArgs.push_back(fwdPoolingArgs); + + // Add output reordering if needed + if (payload->outputMemory != outputMemInit.getMemory()) { + network.push_back( + dnnl::reorder(payload->outputMemory, outputMemInit.getMemory())); + fwdArgs.push_back( + {{DNNL_ARG_FROM, payload->outputMemory}, + {DNNL_ARG_TO, outputMemInit.getMemory()}}); + } + + detail::executeNetwork(network, fwdArgs); + return output; +} + +Tensor OneDnnAutogradExtension::pool2dBackward( + const Tensor& gradOutput, + const Tensor& input, + const Tensor& poolOutput, + const int wx, + const int wy, + const int sx, + const int sy, + const int px, + const int py, + const PoolingMode mode, + std::shared_ptr autogradPayload) { + if (!autogradPayload) { + throw std::invalid_argument( + "OneDnnAutogradExtension::pool2dBackward given null detail::AutogradPayload"); + } + auto payload = + std::static_pointer_cast(autogradPayload->data); + + auto gradInput = Tensor(input.shape(), fl::dtype::f32); + auto& dnnlEngineBwd = detail::DnnlEngine::getInstance().getEngine(); + + DimsData& d = payload->dimsData; + auto poolingMode = detail::dnnlMapToPoolingMode(mode); + + // Memory + const detail::DnnlMemoryWrapper gradInputMemInit( + gradInput, {d.inputDims}, formatNCHW); + const detail::DnnlMemoryWrapper gradOutputMemInit( + gradOutput, {d.outputDims}, formatNCHW); + + // Descriptors + // Memory descriptors from initialized memory must be used since + // pooling_backward descriptors require an ordering + auto gradInputMD = gradInputMemInit.getMemory().get_desc(); + auto gradOutputMD = gradOutputMemInit.getMemory().get_desc(); + auto bwdPrimitiveDesc = pooling_backward::primitive_desc( + dnnlEngineBwd, + poolingMode, + gradInputMD, + gradOutputMD, + d.strideDims, + d.windowDims, + memory::dims{0, 0}, // dilation - TODO: add to API + d.paddingDims, + d.paddingDims, + payload->poolingFwdPrimDesc // hint + ); + + std::vector networkBackward; + std::vector> bwdArgs; + // Reorder output memory if required + auto gradOutputMemory = detail::dnnlAlignOrdering( + networkBackward, + bwdArgs, + gradOutputMemInit.getMemory(), + payload->outputMemory.get_desc()); + + auto poolBwd = 
pooling_backward(bwdPrimitiveDesc); + std::unordered_map<int, memory> bwdPoolingArgs = { + {DNNL_ARG_DIFF_SRC, gradInputMemInit.getMemory()}, + {DNNL_ARG_DIFF_DST, gradOutputMemory}, + {DNNL_ARG_WORKSPACE, payload->workspace}}; + bwdArgs.push_back(bwdPoolingArgs); + networkBackward.push_back(poolBwd); + + detail::executeNetwork(networkBackward, bwdArgs); + + return gradInput; +} + +} // namespace fl diff --git a/flashlight/fl/autograd/tensor/backend/onednn/RNN.cpp b/flashlight/fl/autograd/tensor/backend/onednn/RNN.cpp new file mode 100644 index 0000000..dd1d8a0 --- /dev/null +++ b/flashlight/fl/autograd/tensor/backend/onednn/RNN.cpp @@ -0,0 +1,575 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include "flashlight/fl/autograd/tensor/backend/onednn/OneDnnAutogradExtension.h" + +#include +#include +#include +#include + +#include + +#include "flashlight/fl/autograd/tensor/backend/onednn/DnnlUtils.h" +#include "flashlight/fl/tensor/Index.h" + +namespace fl { +namespace { + +struct ParsedWeightsAndBias { + // First layer - will be empty if inSize == hiddenSize + Tensor weightsInput1L; + Tensor weightsHidden1L; + Tensor bias1L; + // All other layers + Tensor weightsInput; + Tensor weightsHidden; + Tensor bias; +}; + +// Each gate's weights have dimensions d1 x d2 +Tensor reorderLbrGruWeights(int d1, int d2, const Tensor& weights) { + // LBR GRU requires switching the r, u, o gate order given by cuDNN to the + // u, r, o order required by oneDNN (determined empirically) + int weightsSize = d1 * d2; + if (weights.elements() != weightsSize * 3) { + throw std::invalid_argument( + "RNN reorderLbrGruWeights given invalid weights tensor or dims - " + "weights of size " + + std::to_string(weights.elements()) + " which should be exactly " + + std::to_string(weightsSize * 3)); + } + return fl::concatenate( + 0, + weights.flat(fl::range(weightsSize, 2 * weightsSize)), + weights.flat(fl::range(0, weightsSize)), + weights.flat(fl::range(2 * weightsSize, fl::end))); +} + +/** + * Converts flat cuDNN weights into the corresponding oneDNN RNN weights. + */ +ParsedWeightsAndBias parseWeights( + const Tensor& weights, + RnnMode mode, + int numLayers, + int directionMult, + int inSize, + int numGates, + int hiddenSize) { + ParsedWeightsAndBias out; + + // Per-layer sizes for weightsInput and weightsHidden. + // If inSize == hiddenSize, then weightsInputSize == weightsHiddenSize for all + // layers; otherwise this holds for all but the first layer + int weightsInputSize1L = directionMult * inSize * numGates * hiddenSize; + int weightsHiddenSize = directionMult * hiddenSize * numGates * hiddenSize; + int weightsInputSize = weightsHiddenSize; + int lbrGruBias = mode == RnnMode::GRU ? 1 : 0; + int biasSize = + numLayers * directionMult * (numGates + lbrGruBias) * hiddenSize; + + bool firstLayerDifferent = inSize != hiddenSize; + // Adjusted if skipping first layer parsing + int numWeightsLayers = firstLayerDifferent ? numLayers - 1 : numLayers; + int weightsOffset = + firstLayerDifferent ? weightsInputSize1L + weightsHiddenSize : 0; + // If skipping the first layer, parse then skip over the first layer + // weights and parse the remaining layers.
Parsing all bias layers is still + // fine since biases for each layer have the same size + if (firstLayerDifferent) { + out.weightsInput1L = weights.flat(fl::range(weightsInputSize1L)); + out.weightsHidden1L = weights.flat( + fl::range(weightsInputSize1L, weightsInputSize1L + weightsHiddenSize)); + + if (mode == RnnMode::GRU) { + out.weightsInput1L = + reorderLbrGruWeights(inSize, hiddenSize, out.weightsInput1L); + out.weightsHidden1L = + reorderLbrGruWeights(hiddenSize, hiddenSize, out.weightsHidden1L); + } + } + + auto weightsFlat = weights.flatten().astype(weights.type()); + // cuDNN RNN weights, for each layer, are arranged with a chunk of + // input-hidden weights for each layer followed by a chunk of hidden-hidden + // weights for each layer: + // {[layers x [hiddenSize, inputSize]], [layers x [hiddenSize, hiddenSize]] } + // Rearrange this to what oneDNN expects (or will reorder if not optimal), + // which is numLayers chunks of two chunks containing input-hidden and + // hidden-hidden: + // {[layers x [[hiddenSize x inSize], [hiddenSize x hiddenSize]]]} + // Note that the loop is over the total number of layers in case we'r doing a + // single-layer operation where input size and hidden size are different but + // we'll call another primitive with the output of that first layer as the + // input to the next layers + auto weightsInput = Tensor({0}, weights.type()); + auto weightsHidden = Tensor({0}, weights.type()); + Tensor weightsFlatOffset = + weightsFlat.flat(fl::range(weightsOffset, fl::end)); + // Specifically ignore the first layer's weights, so inSize == hiddenSize + for (int i = 0; i < numWeightsLayers; ++i) { + // number of input/hidden weights + // TODO: Will change for bidirectional + int chunkSize = hiddenSize * hiddenSize * numGates; + // weights per layer + int layerChunkSize = chunkSize + chunkSize; + + // Grab input-hidden weights and chunk them together + auto inputWeightsChunk = weightsFlatOffset.flat( + fl::range(layerChunkSize * i, layerChunkSize * i + chunkSize)); + // Grab hidden-hidden weights and chunk them together + auto inputHiddenChunk = weightsFlatOffset.flat(fl::range( + layerChunkSize * i + chunkSize, + layerChunkSize * i + chunkSize + chunkSize)); + + if (mode == RnnMode::GRU) { + inputWeightsChunk = + reorderLbrGruWeights(hiddenSize, hiddenSize, inputWeightsChunk); + inputHiddenChunk = + reorderLbrGruWeights(hiddenSize, hiddenSize, inputHiddenChunk); + } + + weightsInput = fl::concatenate(2, weightsInput, inputWeightsChunk); + weightsHidden = fl::concatenate(2, weightsHidden, inputHiddenChunk); + } + out.weightsInput = weightsInput; + out.weightsHidden = weightsHidden; + + // Reduce the weights to form biases. cuDNN uses two separate bias terms: + // https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnRNNMode_t - + // oneDNN expects only one bias term. Sum together the coefficients for both + // bias terms to get a single bias term for oneDNN. 
The gradients for + // each term can be computed as one since the gradients with respect to + // the bias subarrays will simply be half of the computed gradient with + // oneDNN + Tensor bias(weights.type()); + int biasStartOffset = numLayers * weightsHiddenSize + + (numLayers - 1) * weightsInputSize + weightsInputSize1L; + // In vanilla RNN modes, the biases can be simply added: + // two biases for each bias in fl cuDNN with CUDNN_RNN_DOUBLE_BIAS (default) + int numBiases = 2; + // First, grab a subarray which contains only both bias terms; then add them + Tensor biasFlat = weightsFlat.flat(fl::range(biasStartOffset, fl::end)); + // Layout is: {numLayers x [numBiases x [bias shape]]} + for (int i = 0; i < numLayers; ++i) { + if (mode == RnnMode::GRU) { + int lbrGruChunkSize = hiddenSize * 6; + // In the case of the LBR GRU, there's an extra bias term which shouldn't + // be combined with the first two pairs of biases. Six chunks total. + // cuDNN --> oneDNN transformation for ordering: + // r1, u1, o, r2, u2, u' --> u1 + u2, r1 + r2, o, u' + int base = i * lbrGruChunkSize; + // The sum of the following tensors yields the correct bias + // u1, r1, o, u' + auto biases1 = fl::concatenate( + 0, + // u1 -- [1, 2] + biasFlat.flat( + fl::range(base + hiddenSize * 1, base + hiddenSize * 2)), + // r1 -- [0, 1] + biasFlat.flat( + fl::range(base + hiddenSize * 0, base + hiddenSize * 1)), + // o -- [2, 3] + biasFlat.flat( + fl::range(base + hiddenSize * 2, base + hiddenSize * 3)), + // 'u -- [5, 6] + biasFlat.flat( + fl::range(base + hiddenSize * 5, base + hiddenSize * 6))); + // u2, r2, 0, 0 + auto biases2 = fl::concatenate( + 0, + // u2 -- [4, 5] + biasFlat.flat( + fl::range(base + hiddenSize * 4, base + hiddenSize * 5)), + // r2 -- [3, 4] + biasFlat.flat( + fl::range(base + hiddenSize * 3, base + hiddenSize * 4)), + // zeroes to add to o and u' + fl::full({hiddenSize * 2}, 0., biasFlat.type())); + auto layerBiasCombined = biases1 + biases2; + bias = fl::concatenate(0, bias, layerBiasCombined); + } else { + // The number of bias terms in the tensor per-layer + int layerStride = biasSize / numLayers * numBiases; + auto biases1 = biasFlat(fl::range( + layerStride * i, layerStride * i + layerStride / numBiases)); + auto biases2 = biasFlat(fl::range( + layerStride * i + layerStride / numBiases, layerStride * (i + 1))); + auto layerBiasCombined = biases1 + biases2; + bias = fl::concatenate(0, bias, layerBiasCombined); + } + } + + if (firstLayerDifferent) { + out.bias1L = bias.flat(fl::range(biasSize / numLayers)); + if (numLayers > 1) { + // bias for the second --> last layer + bias = bias.flat(fl::range(biasSize / numLayers, fl::end)); + } + } + out.bias = bias; + + // Case for a single layer of different in/hidden size + if (firstLayerDifferent && numLayers == 1) { + out.weightsInput = out.weightsInput1L; + out.weightsHidden = out.weightsHidden1L; + out.bias = out.bias1L; + } + + return out; +} + +struct RnnResult { + dnnl::memory workspace; + Tensor y; // output + Tensor hy; // hidden output + Tensor cy; // cell output +}; + +/* + * Does forward for a single onednn RNN primitive + */ +RnnResult rnnImpl( + const Tensor& input, + const Tensor& hiddenState, + const Tensor& cellState, + const Tensor& weightsInput, + const Tensor& weightsHidden, + const Tensor& bias, + int hiddenSize, + int numLayers, + RnnMode mode, + dnnl::algorithm activation, + int numGates, + dnnl::rnn_direction direction, + int directionMult, + dnnl::prop_kind kind, + float dropout) { + RnnResult result; + auto dnnlEngine = 
detail::DnnlEngine::getInstance().getEngine(); + + // Dimensions + int inSize = input.dim(0); + int batchSize = input.ndim() < 2 ? 1 : input.dim(1); + int seqLength = input.ndim() < 3 ? 1 : input.dim(2); + dnnl::memory::dims inputDims = {seqLength, batchSize, inSize}; + dnnl::memory::dims outputDims = { + seqLength, batchSize, hiddenSize * directionMult}; + auto dType = detail::dnnlMapToType(input.type()); + int totalLayers = numLayers; + int outSize = hiddenSize; + dnnl::memory::dims hDims = { + totalLayers, directionMult, batchSize, hiddenSize}; + dnnl::memory::dims cDims = { + totalLayers, directionMult, batchSize, hiddenSize}; + int extraBias = mode == RnnMode::GRU ? 1 : 0; // for LBR GRU + dnnl::memory::dims biasDims = { + numLayers, directionMult, numGates + extraBias, hiddenSize}; + // ldigo + dnnl::memory::dims weightsInputDims = { + numLayers, directionMult, inSize, numGates, hiddenSize}; + dnnl::memory::dims weightsHiddenDims = { + numLayers, directionMult, hiddenSize, numGates, hiddenSize}; + + // Out tensors: output (y), hidden state output (hy), cell state output (cy) + auto y = Tensor({outSize, batchSize, seqLength}, input.type()); + auto hy = Tensor({hiddenSize, batchSize, totalLayers}, input.type()); + Tensor cy; + if (mode == RnnMode::LSTM) { + cy = Tensor(hy.shape(), input.type()); + } + + // Memory for forward + auto tnc = dnnl::memory::format_tag::tnc; + auto ldnc = dnnl::memory::format_tag::ldnc; + auto ldgoi = dnnl::memory::format_tag::ldgoi; + auto ldgo = dnnl::memory::format_tag::ldgo; + const detail::DnnlMemoryWrapper inputMemInit( + input.asContiguousTensor(), {inputDims}, tnc); + const detail::DnnlMemoryWrapper outputMemInit(y, {outputDims}, tnc); + detail::DnnlMemoryWrapper hiddenInMemInit; + if (!hiddenState.isEmpty()) { + hiddenInMemInit = detail::DnnlMemoryWrapper( + hiddenState.asContiguousTensor(), {hDims}, ldnc); + } + const detail::DnnlMemoryWrapper hiddenOutMemInit(hy, {hDims}, ldnc); + const detail::DnnlMemoryWrapper weightsInputMemRawInit( + weightsInput.asContiguousTensor(), {weightsInputDims}, ldgoi); + const detail::DnnlMemoryWrapper weightsHiddenMemRawInit( + weightsHidden.asContiguousTensor(), {weightsHiddenDims}, ldgoi); + const detail::DnnlMemoryWrapper biasMemInit( + bias.asContiguousTensor(), {biasDims}, ldgo); + + // TODO(jacobkahn): don't force a format tag - use any and do a reorder based + // on the format of the primitive - what it says - like you're supposed to + // Primitive for reordering input weights: ldgoi --> ldigo + auto weightsInputMemDesc = dnnl::memory::desc( + weightsInputDims, dType, dnnl::memory::format_tag::ldigo); + auto weightsInputMemInit = dnnl::memory(weightsInputMemDesc, dnnlEngine); + // Primitive for reordering iter/hidden weights: ldgoi --> ldigo + auto weightsHiddenMemDesc = dnnl::memory::desc( + weightsHiddenDims, dType, dnnl::memory::format_tag::ldigo); + auto weightsHiddenMemInit = dnnl::memory(weightsHiddenMemDesc, dnnlEngine); + + // Add arguments + std::unordered_map rnnFwdArgs = { + {DNNL_ARG_SRC_LAYER, inputMemInit.getMemory()}, + {DNNL_ARG_SRC_ITER, hiddenInMemInit.getMemory()}, + {DNNL_ARG_WEIGHTS_LAYER, weightsInputMemInit}, + {DNNL_ARG_WEIGHTS_ITER, weightsHiddenMemInit}, + {DNNL_ARG_BIAS, biasMemInit.getMemory()}, + {DNNL_ARG_DST_LAYER, outputMemInit.getMemory()}, + {DNNL_ARG_DST_ITER, hiddenOutMemInit.getMemory()}}; + + // Workspace memory, if needed + dnnl::memory workspace; + std::vector network; + std::vector> fwdArgs; + + // reorder input weights + network.push_back( + 
dnnl::reorder(weightsInputMemRawInit.getMemory(), weightsInputMemInit)); + fwdArgs.push_back( + {{DNNL_ARG_FROM, weightsInputMemRawInit.getMemory()}, + {DNNL_ARG_TO, weightsInputMemInit}}); + // reorder iter weights + network.push_back( + dnnl::reorder(weightsHiddenMemRawInit.getMemory(), weightsHiddenMemInit)); + fwdArgs.push_back( + {{DNNL_ARG_FROM, weightsHiddenMemRawInit.getMemory()}, + {DNNL_ARG_TO, weightsHiddenMemInit}}); + + // Initialize descriptors + if (mode == RnnMode::RELU || mode == RnnMode::TANH) { + auto vanillaPd = dnnl::vanilla_rnn_forward::primitive_desc( + dnnlEngine, + kind, + activation, + direction, + inputMemInit.getDescriptor(), + hiddenInMemInit.getDescriptor(), + weightsInputMemDesc, // weights "layer" + weightsHiddenMemDesc, // weights "iter" + biasMemInit.getDescriptor(), + outputMemInit.getDescriptor(), + hiddenOutMemInit.getDescriptor()); + network.push_back(dnnl::vanilla_rnn_forward(vanillaPd)); + workspace = dnnl::memory(vanillaPd.workspace_desc(), dnnlEngine); + + } else if (mode == RnnMode::LSTM) { + // LSTM-only + // input cell state + // TODO(jacobkahn): function that takes the array and + // returns the desciptor and memory -- takes an argument for + // which determines whether or not it's ok to return empty + // descriptors if the array is empty + detail::DnnlMemoryWrapper cellInMemInit; + if (!cellState.isEmpty()) { + cellInMemInit = detail::DnnlMemoryWrapper( + cellState.asContiguousTensor(), {cDims}, ldnc); + } + // output cell state + detail::DnnlMemoryWrapper cellOutMemInit(cy, cDims, ldnc); + + auto lstmPd = dnnl::lstm_forward::primitive_desc( + dnnlEngine, + kind, + direction, + inputMemInit.getDescriptor(), + hiddenInMemInit.getDescriptor(), + cellInMemInit.getDescriptor(), + weightsInputMemDesc, // weights "layer" + weightsHiddenMemDesc, // weights "iter" + biasMemInit.getDescriptor(), + outputMemInit.getDescriptor(), + hiddenOutMemInit.getDescriptor(), + cellOutMemInit.getDescriptor()); + network.push_back(dnnl::lstm_forward(lstmPd)); + workspace = dnnl::memory(lstmPd.workspace_desc(), dnnlEngine); + rnnFwdArgs.insert({DNNL_ARG_SRC_ITER_C, cellInMemInit.getMemory()}); + rnnFwdArgs.insert({DNNL_ARG_DST_ITER_C, cellOutMemInit.getMemory()}); + + } else if (mode == RnnMode::GRU) { + // Use a linear-before-reset GRU so we can have parity with cuDNN + auto gruPd = dnnl::lbr_gru_forward::primitive_desc( + dnnlEngine, + kind, + direction, + inputMemInit.getDescriptor(), + hiddenInMemInit.getDescriptor(), + weightsInputMemDesc, + weightsHiddenMemDesc, + biasMemInit.getDescriptor(), + outputMemInit.getDescriptor(), + hiddenOutMemInit.getDescriptor()); + network.push_back(dnnl::lbr_gru_forward(gruPd)); + workspace = dnnl::memory(gruPd.workspace_desc(), dnnlEngine); + } + rnnFwdArgs.insert({DNNL_ARG_WORKSPACE, workspace}); + fwdArgs.push_back(rnnFwdArgs); + + detail::executeNetwork(network, fwdArgs); + + result.y = y; + result.hy = hy; + result.cy = cy; + result.workspace = workspace; + return result; +} + +} // namespace + +std::tuple OneDnnAutogradExtension::rnn( + const Tensor& input, + const Tensor& hiddenState, + const Tensor& cellState, + const Tensor& weights, + const int hiddenSize, + const int numLayers, + const RnnMode mode, + const bool bidirectional, + const float dropout, + std::shared_ptr autogradPayload) { + if (dropout > 0.0) { + throw std::invalid_argument("onednn RNN: dropout > 0.0 unsupported"); + } + if (bidirectional) { + throw std::invalid_argument("onednn RNN: bidirectional not yet supported"); + } + + const bool train = 
(autogradPayload != nullptr); + + // Constants + auto direction = bidirectional + ? dnnl::rnn_direction::bidirectional_concat + : dnnl::rnn_direction::unidirectional_left2right; + int directionMult = bidirectional ? 2 : 1; + auto kind = train ? dnnl::prop_kind::forward_training + : dnnl::prop_kind::forward_inference; + int numGates = 1; + auto activation = dnnl::algorithm::undef; + switch (mode) { + case RnnMode::LSTM: + numGates = 4; + break; + case RnnMode::GRU: + numGates = 3; + break; + case RnnMode::RELU: + activation = dnnl::algorithm::eltwise_relu; + break; + case RnnMode::TANH: + activation = dnnl::algorithm::eltwise_tanh; + break; + default: + break; + } + + int inSize = input.dim(0); + + // In Flashlight, all RNN weights are stored as one contiguous tensor, so we + // have to parse out the input weights, input biases, hidden weights, and + // hidden biases from one tensor. Order doesn't matter since the arrangement + // is a black box + auto parsedWeights = parseWeights( + weights, mode, numLayers, directionMult, inSize, numGates, hiddenSize); + + RnnResult result; + // The oneDNN RNN primitive has an API limitation where input size and + // hidden size can only differ if the primitive has exactly one layer. + // Therefore, for computations for more than one layer, first do the + // operation for one layer, which gives an output vector of size [hidden + // size, batch size, sequence length * number of directions], then use + // that output as the input for layers [2, L]. Since the input size dim 0 + // is now the hidden size, the primitive can fuse computation for + // arbitrarily-many layers. + if (input.dim(0) == hiddenSize || numLayers == 1) { + // Input and hidden size are the same, or we only have one layer, which + // means we can call the impl as is and parse weights "normally" + result = rnnImpl( + input, + hiddenState, + cellState, + parsedWeights.weightsInput, + parsedWeights.weightsHidden, + parsedWeights.bias, + hiddenSize, + numLayers, + mode, + activation, + numGates, + direction, + directionMult, + kind, + dropout); + } else { + // We require more than one layer with different input and hidden states - + // see the above. 
Seek to the first layer's hidden/cell state, weights, and + // bias + RnnResult resultL1 = rnnImpl( + input, + hiddenState(fl::span, fl::span, 0), + cellState(fl::span, fl::span, 0), + parsedWeights.weightsInput1L, + parsedWeights.weightsHidden1L, + parsedWeights.bias1L, + hiddenSize, + 1, + mode, + activation, + numGates, + direction, + directionMult, + kind, + dropout); + + /* Layers [2..N] */ + // Seek past the first layer's hidden/cell state, weights, and bias + RnnResult resultL2N = rnnImpl( + resultL1.y, // fixme + hiddenState(fl::span, fl::span, fl::range(1, fl::end)), + cellState(fl::span, fl::span, fl::range(1, fl::end)), + parsedWeights.weightsInput, + parsedWeights.weightsHidden, + parsedWeights.bias, + hiddenSize, + numLayers - 1, // layers [2..N] + mode, + activation, + numGates, + direction, + directionMult, + kind, + dropout); + + result.y = resultL2N.y; + result.hy = fl::concatenate(2, resultL1.hy, resultL2N.hy); + result.cy = fl::concatenate(2, resultL1.cy, resultL2N.cy); + } + + return {result.y, result.hy, result.cy}; +} + +std::tuple OneDnnAutogradExtension::rnnBackward( + const Tensor& input, + const Tensor& hiddenState, + const Tensor& cellState, + const Tensor& weights, + const std::shared_ptr gradData, + const Tensor& output, + const int numLayers, + const int hiddenSize, + const RnnMode mode, + const bool bidirectional, + const float dropProb, + const std::shared_ptr payload) { + throw std::runtime_error( + "onednn RNN: Gradient computation not yet supported"); +} + +} // namespace fl diff --git a/flashlight/fl/common/Histogram.h b/flashlight/fl/common/Histogram.h index f57b26a..df0b300 100644 --- a/flashlight/fl/common/Histogram.h +++ b/flashlight/fl/common/Histogram.h @@ -49,7 +49,7 @@ template struct HistogramBucket { T startInclusive = 0; //! left boundary of the bucket. T endExclusive = 0; //! right boundary of the bucket. - size_t count = 0; //! Number of elements in this bucket.88 + size_t count = 0; //! Number of elements in this bucket. std::string prettyString( double countPerTick, // ratio of count/bar_length @@ -146,8 +146,8 @@ HistogramStats FixedBucketSizeHistogram( stats.mean = simpleMovingAverage; // Calculate bucket size - long range = stats.max - stats.min; - double bucketWidth = range / nBuckets; + double range = stats.max - stats.min; + auto bucketWidth = range / nBuckets; if (range == 0 || bucketWidth == 0) { stats.buckets[0].count = stats.numValues; stats.maxNumValuesPerBucket = stats.numValues; @@ -157,11 +157,11 @@ HistogramStats FixedBucketSizeHistogram( // Calculate count per bucket stats.maxNumValuesPerBucket = 0; for (auto itr = begin; itr != end; ++itr) { - if (*itr < clipMinValueInclusive || *itr > clipMaxValueExclusive) { + if (*itr < clipMinValueInclusive || *itr >= clipMaxValueExclusive) { continue; } double index = - std::round(static_cast(*itr - stats.min) / bucketWidth); + std::floor(static_cast(*itr - stats.min) / bucketWidth); size_t intIndex = std::min(static_cast(index), nBuckets - 1); HistogramBucket& bucket = stats.buckets[intIndex]; diff --git a/flashlight/fl/common/Logging.cpp b/flashlight/fl/common/Logging.cpp index 181a8d7..d666739 100644 --- a/flashlight/fl/common/Logging.cpp +++ b/flashlight/fl/common/Logging.cpp @@ -87,13 +87,18 @@ void addContext( std::stringstream* outputStream) { // report only the last threadIdNumDigits of the thread ID for succinctness // and compatibility with glog. 
- constexpr size_t threadIdNumDigits = 5; + constexpr size_t maxThreadIdNumDigits = 5; std::stringstream ss; ss << std::this_thread::get_id(); - const std::string threadId = ss.str(); - + + std::string threadId = ss.str(); + if(threadId.size() > maxThreadIdNumDigits){ + threadId = threadId.substr(threadId.size() - maxThreadIdNumDigits); + } + + (*outputStream) << dateTimeWithMicroSeconds() << ' ' - << threadId.substr(threadId.size() - threadIdNumDigits) << ' ' + << threadId << ' ' << getFileName(fullPath) << ':' << lineNumber << ' '; } diff --git a/flashlight/fl/dataset/BlobDataset.cpp b/flashlight/fl/dataset/BlobDataset.cpp index 563fba5..1b605f5 100644 --- a/flashlight/fl/dataset/BlobDataset.cpp +++ b/flashlight/fl/dataset/BlobDataset.cpp @@ -159,7 +159,7 @@ std::vector BlobDataset::readRawArray( buffer.resize(fl::getTypeSize(e.type) * e.dims.elements()); readData( e.offset, - (char*)buffer.data(), + reinterpret_cast(buffer.data()), fl::getTypeSize(e.type) * e.dims.elements()); } return buffer; diff --git a/flashlight/fl/dataset/FileBlobDataset.cpp b/flashlight/fl/dataset/FileBlobDataset.cpp index 2f8d027..f432935 100644 --- a/flashlight/fl/dataset/FileBlobDataset.cpp +++ b/flashlight/fl/dataset/FileBlobDataset.cpp @@ -16,7 +16,8 @@ FileBlobDataset::FileBlobDataset( bool rw, bool truncate) : name_(name) { - mode_ = (rw ? std::ios_base::in | std::ios_base::out : std::ios_base::in); + mode_ = (rw ? std::ios_base::in | std::ios_base::out : std::ios_base::in) | + std::ios_base::binary; { std::ofstream fs(name_, (truncate ? mode_ | std::ios_base::trunc : mode_)); if (!fs.is_open()) { diff --git a/flashlight/fl/tensor/backend/af/CMakeLists.txt b/flashlight/fl/tensor/backend/af/CMakeLists.txt index 95b9420..da4fbdb 100644 --- a/flashlight/fl/tensor/backend/af/CMakeLists.txt +++ b/flashlight/fl/tensor/backend/af/CMakeLists.txt @@ -74,6 +74,33 @@ if (${FL_ARRAYFIRE_USE_CUDA}) fl_set_backend_state(ENABLE CUDA) elseif(${FL_ARRAYFIRE_USE_CPU}) target_link_libraries(flashlight PUBLIC ArrayFire::afcpu) + + # af forgets dependencies, on windows we have to copy them + if(WIN32) + include(fm_target_utilities) + + set(AF_LIB_DIR "${ArrayFire_DIR}/../lib") + + fm_glob( + MKL_DLLS + "${AF_LIB_DIR}/" + PATTERNS + "mkl_*.dll" + "libiomp5md.dll" + ) + + fm_target_attach_dependency( + flashlight + NOLINK + ${MKL_DLLS} + ) + endif() + + if(LINUX) + find_package(OpenMP REQUIRED COMPONENTS CXX) + target_link_libraries(flashlight PUBLIC OpenMP::OpenMP_CXX) + endif() + fl_set_backend_state(ENABLE CPU) elseif(${FL_ARRAYFIRE_USE_OPENCL}) target_link_libraries(flashlight PUBLIC ArrayFire::afopencl) diff --git a/flashlight/fl/test/autograd/AutogradBinaryOpsTest.cpp b/flashlight/fl/test/autograd/AutogradBinaryOpsTest.cpp index 420f547..d0ee0d4 100644 --- a/flashlight/fl/test/autograd/AutogradBinaryOpsTest.cpp +++ b/flashlight/fl/test/autograd/AutogradBinaryOpsTest.cpp @@ -126,11 +126,11 @@ TEST(AutogradBinaryOpsTest, Linear) { auto wt = Variable(fl::rand({6, 3}, fl::dtype::f64) * 2 - 1, true); auto bs = Variable(fl::rand({6}, fl::dtype::f64) * 2 - 1, true); auto funcLinIn = [&](Variable& input) { return linear(input, wt, bs); }; - ASSERT_TRUE(fl::detail::jacobianTestImpl(funcLinIn, in, 1E-8)); + ASSERT_TRUE(fl::detail::jacobianTestImpl(funcLinIn, in, 1E-8, 1E-4, {&wt, &bs})); auto funcLinWt = [&](Variable& weight) { return linear(in, weight, bs); }; - ASSERT_TRUE(fl::detail::jacobianTestImpl(funcLinWt, wt, 1E-8)); + ASSERT_TRUE(fl::detail::jacobianTestImpl(funcLinWt, wt, 1E-8, 1E-4, {&in, &bs})); auto funcLinBs = 
[&](Variable& bias) { return linear(in, wt, bias); }; - ASSERT_TRUE(fl::detail::jacobianTestImpl(funcLinBs, bs, 1E-8)); + ASSERT_TRUE(fl::detail::jacobianTestImpl(funcLinBs, bs, 1E-8, 1E-4, {&in, &wt})); } } @@ -146,11 +146,11 @@ TEST_F(AutogradTestF16, LinearF16) { auto wt = Variable(fl::rand({2, 2}, fl::dtype::f16) * scale, true); auto bs = Variable(fl::rand({2}, fl::dtype::f16) * scale, true); auto funcLinIn = [&](Variable& input) { return linear(input, wt, bs); }; - ASSERT_TRUE(fl::detail::jacobianTestImpl(funcLinIn, in, 5E-2, 5E-1)); + ASSERT_TRUE(fl::detail::jacobianTestImpl(funcLinIn, in, 5E-2, 5E-1, {&wt, &bs})); auto funcLinWt = [&](Variable& weight) { return linear(in, weight, bs); }; - ASSERT_TRUE(fl::detail::jacobianTestImpl(funcLinWt, wt, 5E-2, 5E-1)); + ASSERT_TRUE(fl::detail::jacobianTestImpl(funcLinWt, wt, 5E-2, 5E-1, {&in, &bs})); auto funcLinBs = [&](Variable& bias) { return linear(in, wt, bias); }; - ASSERT_TRUE(fl::detail::jacobianTestImpl(funcLinBs, bs, 5E-2, 5E-1)); + ASSERT_TRUE(fl::detail::jacobianTestImpl(funcLinBs, bs, 5E-2, 5E-1, {&in, &wt})); } } diff --git a/flashlight/fl/test/autograd/AutogradConv2DTest.cpp b/flashlight/fl/test/autograd/AutogradConv2DTest.cpp index 88cd170..c13f3c6 100644 --- a/flashlight/fl/test/autograd/AutogradConv2DTest.cpp +++ b/flashlight/fl/test/autograd/AutogradConv2DTest.cpp @@ -22,6 +22,7 @@ TEST(AutogradConv2DTest, Convolve) { auto in = Variable(fl::rand({10, 9, 8, 7}, fl::dtype::f32), true); auto wt = Variable(fl::rand({4, 3, 8, 6}, fl::dtype::f32), true); auto bs = Variable(fl::rand({1, 1, 6, 1}, fl::dtype::f32), true); + int px = 2, py = 1; int sx = 1, sy = 1; int dx = 1, dy = 1; @@ -40,7 +41,8 @@ TEST(AutogradConv2DTest, Convolve) { /* groups */ 1, benchmarks); }; - ASSERT_TRUE(fl::detail::jacobianTestImpl(funcConvIn, in, 0.06)); + ASSERT_TRUE(fl::detail::jacobianTestImpl(funcConvIn, in, 0.06, 1E-4, {&wt})); + auto funcConvWt = [&](Variable& weight) { return conv2d( in, @@ -55,7 +57,8 @@ TEST(AutogradConv2DTest, Convolve) { /* groups */ 1, benchmarks); }; - ASSERT_TRUE(fl::detail::jacobianTestImpl(funcConvWt, wt, 0.06)); + ASSERT_TRUE(fl::detail::jacobianTestImpl(funcConvWt, wt, 0.06, 1E-4, {&in})); + auto funcConvBs = [&](Variable& bias) { return conv2d( in, @@ -70,7 +73,7 @@ TEST(AutogradConv2DTest, Convolve) { /* groups */ 1, benchmarks); }; - ASSERT_TRUE(fl::detail::jacobianTestImpl(funcConvBs, bs, 0.03)); + ASSERT_TRUE(fl::detail::jacobianTestImpl(funcConvBs, bs, 0.03, 1E-4, {&in, &wt})); } TEST_F(AutogradTestF16, ConvolveF16) { @@ -83,6 +86,7 @@ TEST_F(AutogradTestF16, ConvolveF16) { Variable(fl::rand({3, 1, 2, 1}, fl::dtype::f16) * scaleFactor, true); auto wt = Variable(fl::rand({2, 1, 2, 1}, fl::dtype::f16), true); auto bs = Variable(fl::rand({1, 1, 1, 1}, fl::dtype::f16), true); + int px = 1, py = 1; int sx = 1, sy = 1; int dx = 1, dy = 1; @@ -101,7 +105,8 @@ TEST_F(AutogradTestF16, ConvolveF16) { /* groups */ 1, benchmarks); }; - ASSERT_TRUE(fl::detail::jacobianTestImpl(funcConvIn, in, 5e-1, 0.1)); + ASSERT_TRUE(fl::detail::jacobianTestImpl(funcConvIn, in, 5e-1, 0.1, {&wt, &bs})); + auto funcConvWt = [&](Variable& weight) { return conv2d( in, @@ -116,7 +121,8 @@ TEST_F(AutogradTestF16, ConvolveF16) { /* groups */ 1, benchmarks); }; - ASSERT_TRUE(fl::detail::jacobianTestImpl(funcConvWt, wt, 5e-2, 0.1)); + ASSERT_TRUE(fl::detail::jacobianTestImpl(funcConvWt, wt, 5e-2, 0.1, {&in, &bs})); + auto funcConvBs = [&](Variable& bias) { return conv2d( in, @@ -131,7 +137,7 @@ TEST_F(AutogradTestF16, ConvolveF16) { /* groups */ 1, 
benchmarks); }; - ASSERT_TRUE(fl::detail::jacobianTestImpl(funcConvBs, bs, 3e-2, 0.1)); + ASSERT_TRUE(fl::detail::jacobianTestImpl(funcConvBs, bs, 3e-2, 0.1, {&in, &wt})); } TEST(AutogradConv2DTest, ConvolveFilterGroups) { @@ -150,15 +156,15 @@ TEST(AutogradConv2DTest, ConvolveFilterGroups) { auto funcConvIn = [&](Variable& input) { return conv2d(input, wt, bs, sx, sy, px, py, dx, dy, groups); }; - ASSERT_TRUE(fl::detail::jacobianTestImpl(funcConvIn, in, 0.06)); + ASSERT_TRUE(fl::detail::jacobianTestImpl(funcConvIn, in, 0.06, 1E-4, {&wt, &bs})); auto funcConvWt = [&](Variable& weight) { return conv2d(in, weight, bs, sx, sy, px, py, dx, dy, groups); }; - ASSERT_TRUE(fl::detail::jacobianTestImpl(funcConvWt, wt, 0.05)); + ASSERT_TRUE(fl::detail::jacobianTestImpl(funcConvWt, wt, 0.05, 1E-4, {&in, &bs})); auto foncConvBs = [&](Variable& bias) { return conv2d(in, wt, bias, sx, sy, px, py, dx, dy, groups); }; - ASSERT_TRUE(fl::detail::jacobianTestImpl(foncConvBs, bs, 0.02)); + ASSERT_TRUE(fl::detail::jacobianTestImpl(foncConvBs, bs, 0.02, 1E-4, {&in, &wt})); } TEST(AutogradConv2DTest, ConvolveDilation) { @@ -181,7 +187,7 @@ TEST(AutogradConv2DTest, ConvolveDilation) { dy, /* groups */ 1); }; - ASSERT_TRUE(fl::detail::jacobianTestImpl(funcConvIn, in, 0.06)); + ASSERT_TRUE(fl::detail::jacobianTestImpl(funcConvIn, in, 0.06, 1E-4, {&wt, &bs})); auto funcConvWt = [&](Variable& weight) { return conv2d( in, @@ -195,7 +201,7 @@ TEST(AutogradConv2DTest, ConvolveDilation) { dy, /* groups */ 1); }; - ASSERT_TRUE(fl::detail::jacobianTestImpl(funcConvWt, wt, 0.05)); + ASSERT_TRUE(fl::detail::jacobianTestImpl(funcConvWt, wt, 0.05, 1E-4, {&in, &bs})); auto funcConvBs = [&](Variable& bias) { return conv2d( in, @@ -209,7 +215,7 @@ TEST(AutogradConv2DTest, ConvolveDilation) { dy, /* groups */ 1); }; - ASSERT_TRUE(fl::detail::jacobianTestImpl(funcConvBs, bs, 0.02)); + ASSERT_TRUE(fl::detail::jacobianTestImpl(funcConvBs, bs, 0.02, 1E-4, {&in, &wt})); } TEST(AutogradConv2DTest, WeightNormConv) { @@ -233,7 +239,7 @@ TEST(AutogradConv2DTest, WeightNormConv) { /* dy */ 1, /* groups */ 1); }; - ASSERT_TRUE(fl::detail::jacobianTestImpl(funcWeightNormIn, in, 3E-1)); + ASSERT_TRUE(fl::detail::jacobianTestImpl(funcWeightNormIn, in, 3E-1, 1E-4, {&v, &g})); auto funcWeightNormV = [&](Variable& input) { auto w = input * @@ -250,7 +256,7 @@ TEST(AutogradConv2DTest, WeightNormConv) { /* dy */ 1, /* groups */ 1); }; - ASSERT_TRUE(fl::detail::jacobianTestImpl(funcWeightNormV, v, 2E-1)); + ASSERT_TRUE(fl::detail::jacobianTestImpl(funcWeightNormV, v, 2E-1, 1E-4, {&g, &in})); auto funcWeightNormG = [&](Variable& input) { auto w = v * @@ -267,7 +273,7 @@ TEST(AutogradConv2DTest, WeightNormConv) { /* dy */ 1, /* groups */ 1); }; - ASSERT_TRUE(fl::detail::jacobianTestImpl(funcWeightNormG, g, 2E-1)); + ASSERT_TRUE(fl::detail::jacobianTestImpl(funcWeightNormG, g, 2E-1, 1E-4, {&v, &in})); } int main(int argc, char** argv) { diff --git a/flashlight/fl/test/autograd/AutogradNormalizationTest.cpp b/flashlight/fl/test/autograd/AutogradNormalizationTest.cpp index 81479e1..7663b9b 100644 --- a/flashlight/fl/test/autograd/AutogradNormalizationTest.cpp +++ b/flashlight/fl/test/autograd/AutogradNormalizationTest.cpp @@ -240,19 +240,22 @@ TEST(AutogradNormalizationTest, BatchNormJacobian) { return (batchnorm( in, weight, bias, runningMean, runningVar, featAxes, true, 0.0, 1E-5)); }; - ASSERT_TRUE(fl::detail::jacobianTestImpl(funcBnIn, input, 1e-2, 1e-4)); + + + ASSERT_TRUE(fl::detail::jacobianTestImpl(funcBnIn, input, 1e-2, 1e-4, {&weight, &bias})); 
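+  // The trailing initializer list names the other Variables involved in the
+  // op under test (weight and bias here, while the input is perturbed) and is
+  // passed through to the extended jacobianTestImpl overload.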
   auto funcBnWt = [&](Variable& wt) {
     return (batchnorm(
         input, wt, bias, runningMean, runningVar, featAxes, true, 0.0, 1E-5));
   };
-  ASSERT_TRUE(fl::detail::jacobianTestImpl(funcBnWt, weight, 1e-2, 1e-4));
+  ASSERT_TRUE(fl::detail::jacobianTestImpl(funcBnWt, weight, 1e-2, 1e-4, {&input, &bias}));
+
 
   auto funcBnBs = [&](Variable& bs) {
     return (batchnorm(
         input, weight, bs, runningMean, runningVar, featAxes, true, 0.0, 1E-5));
   };
-  ASSERT_TRUE(fl::detail::jacobianTestImpl(funcBnBs, bias, 1e-2, 1e-4));
+  ASSERT_TRUE(fl::detail::jacobianTestImpl(funcBnBs, bias, 1e-2, 1e-4, {&input, &weight}));
 }
 
 TEST_F(AutogradTestF16, BatchNormJacobianF16) {
@@ -276,25 +279,25 @@ TEST_F(AutogradTestF16, BatchNormJacobianF16) {
     return (batchnorm(
         in, weight, bias, runningMean, runningVar, featAxes, true, 0.0, 1E-5));
   };
-  ASSERT_TRUE(fl::detail::jacobianTestImpl(funcBnIn, input, 5e-2, 1e-1));
+  ASSERT_TRUE(fl::detail::jacobianTestImpl(funcBnIn, input, 5e-2, 1e-1, {&weight, &bias}));
 
   auto funcBnWt = [&](Variable& wt) {
     return (batchnorm(
         input, wt, bias, runningMean, runningVar, featAxes, true, 0.0, 1E-5));
   };
-  ASSERT_TRUE(fl::detail::jacobianTestImpl(funcBnWt, weight, 5e-2, 1e-1));
+  ASSERT_TRUE(fl::detail::jacobianTestImpl(funcBnWt, weight, 5e-2, 1e-1, {&input, &bias}));
 
   auto funcBnBs = [&](Variable& bs) {
     return (batchnorm(
         input, weight, bs, runningMean, runningVar, featAxes, true, 0.0, 1E-5));
   };
-  ASSERT_TRUE(fl::detail::jacobianTestImpl(funcBnBs, bias, 5e-2, 1e-1));
+  ASSERT_TRUE(fl::detail::jacobianTestImpl(funcBnBs, bias, 5e-2, 1e-1, {&input, &weight}));
 }
 
 TEST(AutogradNormalizationTest, BatchNormJacobianMultipleAxes) {
   // Jacobian Test with trainMode = true;
   std::vector<int> featAxes = {0, 1, 2};
-  auto input = Variable(fl::rand({8, 8, 3, 16}, fl::dtype::f32), true);
+  auto input = Variable(fl::rand({4, 4, 3, 4}, fl::dtype::f32), true);
   auto nfeatures = 1;
   for (auto ax : featAxes) {
     nfeatures *= input.dim(ax);
@@ -308,19 +311,19 @@ TEST(AutogradNormalizationTest, BatchNormJacobianMultipleAxes) {
     return (batchnorm(
         in, weight, bias, runningMean, runningVar, featAxes, true, 0.0, 1E-5));
   };
-  ASSERT_TRUE(fl::detail::jacobianTestImpl(funcBnIn, input, 1e-2, 1e-3));
+  ASSERT_TRUE(fl::detail::jacobianTestImpl(funcBnIn, input, 1e-2, 1e-3, {&weight, &bias}));
 
   auto funcBnWt = [&](Variable& wt) {
     return (batchnorm(
         input, wt, bias, runningMean, runningVar, featAxes, true, 0.0, 1E-5));
   };
-  ASSERT_TRUE(fl::detail::jacobianTestImpl(funcBnWt, weight, 1e-2, 1e-3));
+  ASSERT_TRUE(fl::detail::jacobianTestImpl(funcBnWt, weight, 1e-2, 1e-3, {&input, &bias}));
 
   auto funcBnBs = [&](Variable& bs) {
     return (batchnorm(
         input, weight, bs, runningMean, runningVar, featAxes, true, 0.0, 1E-5));
   };
-  ASSERT_TRUE(fl::detail::jacobianTestImpl(funcBnBs, bias, 1e-2, 1e-3));
+  ASSERT_TRUE(fl::detail::jacobianTestImpl(funcBnBs, bias, 1e-2, 1e-3, {&input, &weight}));
 }
 
 TEST_F(AutogradTestF16, BatchNormJacobianMultipleAxesF16) {
@@ -347,19 +350,19 @@ TEST_F(AutogradTestF16, BatchNormJacobianMultipleAxesF16) {
         in, weight, bias, runningMean, runningVar, featAxes, true, 0.0, 1E-5));
   };
   ASSERT_TRUE(fl::detail::jacobianTestImpl(
-      funcBnIn, input, 5e-2, 1e-1)); // TODO: investigate
+      funcBnIn, input, 5e-2, 1e-1, {&weight, &bias})); // TODO: investigate
 
   auto funcBnWt = [&](Variable& wt) {
     return (batchnorm(
         input, wt, bias, runningMean, runningVar, featAxes, true, 0.0, 1E-5));
   };
-  ASSERT_TRUE(fl::detail::jacobianTestImpl(funcBnWt, weight, 5e-2, 1e-1));
+  ASSERT_TRUE(fl::detail::jacobianTestImpl(funcBnWt, weight, 5e-2, 1e-1, {&input, &bias}));
 
   auto funcBnBs = [&](Variable& bs) {
     return (batchnorm(
         input, weight, bs, runningMean, runningVar, featAxes, true, 0.0, 1E-5));
   };
-  ASSERT_TRUE(fl::detail::jacobianTestImpl(funcBnBs, bias, 5e-2, 1e-1));
+  ASSERT_TRUE(fl::detail::jacobianTestImpl(funcBnBs, bias, 5e-2, 1e-1, {&input, &weight}));
 }
 
 TEST(AutogradNormalizationTest, LayerNormJacobian) {
@@ -379,7 +382,7 @@ TEST(AutogradNormalizationTest, LayerNormJacobian) {
         in, weight, bias, runningMean, runningVar, featAxes, true, 0.0, 1E-5);
   };
 
-  ASSERT_TRUE(fl::detail::jacobianTestImpl(funcLnIn, input, 1e-2, 1e-4));
+  ASSERT_TRUE(fl::detail::jacobianTestImpl(funcLnIn, input, 1e-2, 1e-4, {&weight, &bias}));
 }
 
 TEST_F(AutogradTestF16, LayerNormJacobianF16) {
@@ -405,7 +408,7 @@ TEST_F(AutogradTestF16, LayerNormJacobianF16) {
         in, weight, bias, runningMean, runningVar, featAxes, true, 0.0, 1E-5);
   };
 
-  ASSERT_TRUE(fl::detail::jacobianTestImpl(funcLnIn, input, 1e-4, 1e-2));
+  ASSERT_TRUE(fl::detail::jacobianTestImpl(funcLnIn, input, 1e-4, 1e-2, {&weight, &bias}));
 }
 
 int main(int argc, char** argv) {
diff --git a/flashlight/fl/test/autograd/AutogradRnnTest.cpp b/flashlight/fl/test/autograd/AutogradRnnTest.cpp
index 2201fdb..a0eb335 100644
--- a/flashlight/fl/test/autograd/AutogradRnnTest.cpp
+++ b/flashlight/fl/test/autograd/AutogradRnnTest.cpp
@@ -64,7 +64,7 @@ void testRnnImpl(RnnMode mode, fl::dtype precision = fl::dtype::f64) {
         bidirectional,
         0.0));
   };
-  ASSERT_TRUE(fl::detail::jacobianTestImpl(funcRnnIn, in, expectedPrecision, perturbation));
+  ASSERT_TRUE(fl::detail::jacobianTestImpl(funcRnnIn, in, expectedPrecision, perturbation, {&w}));
 
   auto funcRnnW = [&](Variable& weights) -> Variable {
     return std::get<0>(
@@ -78,7 +78,7 @@ void testRnnImpl(RnnMode mode, fl::dtype precision = fl::dtype::f64) {
         bidirectional,
         0.0));
   };
-  ASSERT_TRUE(fl::detail::jacobianTestImpl(funcRnnW, w, expectedPrecision, perturbation));
+  ASSERT_TRUE(fl::detail::jacobianTestImpl(funcRnnW, w, expectedPrecision, perturbation, {&in}));
 
   // We get the correct gradient for hx
   auto hx = Variable(
@@ -98,7 +98,7 @@ void testRnnImpl(RnnMode mode, fl::dtype precision = fl::dtype::f64) {
         bidirectional,
         0.0));
   };
-  ASSERT_TRUE(fl::detail::jacobianTestImpl(funcRnnHx, hx, expectedPrecision, perturbation));
+  ASSERT_TRUE(fl::detail::jacobianTestImpl(funcRnnHx, hx, expectedPrecision, perturbation, {&in, &w}));
 
   // We can compute the gradient w.r.t. hy
   auto funcRnnInDhy = [&](Variable& input) -> Variable {
@@ -114,7 +114,7 @@ void testRnnImpl(RnnMode mode, fl::dtype precision = fl::dtype::f64) {
         0.0));
   };
   ASSERT_TRUE(
-      fl::detail::jacobianTestImpl(funcRnnInDhy, in, expectedPrecision, perturbation));
+      fl::detail::jacobianTestImpl(funcRnnInDhy, in, expectedPrecision, perturbation, {&w}));
 
   if (mode == RnnMode::LSTM) {
     // We get the correct gradient for cx
@@ -136,7 +136,7 @@ void testRnnImpl(RnnMode mode, fl::dtype precision = fl::dtype::f64) {
         0.0));
   };
   ASSERT_TRUE(
-      fl::detail::jacobianTestImpl(funcRnnCx, cx, expectedPrecision, perturbation));
+      fl::detail::jacobianTestImpl(funcRnnCx, cx, expectedPrecision, perturbation, {&in, &w}));
 
     // We can compute the gradient w.r.t. cy
     auto funcRnnInDcy = [&](Variable& input) -> Variable {
@@ -152,7 +152,7 @@ void testRnnImpl(RnnMode mode, fl::dtype precision = fl::dtype::f64) {
         0.0));
   };
   ASSERT_TRUE(
-      fl::detail::jacobianTestImpl(funcRnnInDcy, in, expectedPrecision, perturbation));
+      fl::detail::jacobianTestImpl(funcRnnInDcy, in, expectedPrecision, perturbation, {&w}));
   }
 }
 
diff --git a/flashlight/fl/test/autograd/AutogradTest.cpp b/flashlight/fl/test/autograd/AutogradTest.cpp
index 4253545..f506e32 100644
--- a/flashlight/fl/test/autograd/AutogradTest.cpp
+++ b/flashlight/fl/test/autograd/AutogradTest.cpp
@@ -198,12 +198,12 @@ TEST(AutogradTest, Concatenate) {
   auto funcConcatenateT1 = [x2, x3, x4](Variable& in) {
     return concatenate({in, x2, x3, x4}, 2);
   };
-  ASSERT_TRUE(fl::detail::jacobianTestImpl(funcConcatenateT1, x1));
+  ASSERT_TRUE(fl::detail::jacobianTestImpl(funcConcatenateT1, x1, 1E-5, 1E-4, {&x2, &x3, &x4}));
 
   auto funcConcatenateT2 = [x1, x2, x4](Variable& in) {
     return concatenate({x1, x2, in, x4}, 2);
   };
-  ASSERT_TRUE(fl::detail::jacobianTestImpl(funcConcatenateT2, x3));
+  ASSERT_TRUE(fl::detail::jacobianTestImpl(funcConcatenateT2, x3, 1E-5, 1E-4, {&x1, &x2, &x4}));
 }
 
 TEST(AutogradTest, Split) {
diff --git a/flashlight/fl/test/autograd/AutogradTestUtils.h b/flashlight/fl/test/autograd/AutogradTestUtils.h
index 39bd605..9dafbdb 100644
--- a/flashlight/fl/test/autograd/AutogradTestUtils.h
+++ b/flashlight/fl/test/autograd/AutogradTestUtils.h
@@ -34,7 +34,8 @@ inline bool jacobianTestImpl(
     const JacobianFunc& func,
     Variable& input,
     float precision = 1E-5,
-    float perturbation = 1E-4) {
+    float perturbation = 1E-4,
+    const std::vector<Variable*>& zeroGradientVariables = {}) {
   auto fwdJacobian =
       Tensor({func(input).elements(), input.elements()}, fl::dtype::f32);
@@ -60,6 +61,9 @@ inline bool jacobianTestImpl(
   for (int i = 0; i < dout.elements(); ++i) {
     dout.tensor().flat(i) = 1; // element in 1D view
     input.zeroGrad();
+    for (auto* var : zeroGradientVariables) {
+      var->zeroGrad();
+    }
     auto out = func(input);
     out.backward(dout);
diff --git a/flashlight/fl/test/common/DynamicBenchmarkTest.cpp b/flashlight/fl/test/common/DynamicBenchmarkTest.cpp
index 79fb76d..8d7fce7 100644
--- a/flashlight/fl/test/common/DynamicBenchmarkTest.cpp
+++ b/flashlight/fl/test/common/DynamicBenchmarkTest.cpp
@@ -77,7 +77,7 @@ TEST_F(DynamicBenchmark, OptionsStateTimed) {
 
 TEST_F(DynamicBenchmark, DynamicBenchmarkSimple) {
   size_t maxCount = 5;
-  std::vector sleepTimes = {4, 2, 6};
+  std::vector sleepTimes = {30, 16, 40}; //min 16ms (win)
   auto options = std::make_shared>(sleepTimes, maxCount);
 
@@ -90,12 +90,12 @@ TEST_F(DynamicBenchmark, DynamicBenchmarkSimple) {
   }
   ASSERT_TRUE(options->timingsComplete());
   // sleeping for fewer miliseconds is faster
-  ASSERT_EQ(options->currentOption(), 2);
+  ASSERT_EQ(options->currentOption(), sleepTimes[1]);
 }
 
 TEST_F(DynamicBenchmark, DynamicBenchmarkDisjointLambdas) {
   size_t maxCount = 5;
-  std::vector sleepTimes = {4, 2, 6};
+  std::vector sleepTimes = {30, 16, 40};
   auto options = std::make_shared>(sleepTimes, maxCount);
 
@@ -120,7 +120,7 @@ TEST_F(DynamicBenchmark, DynamicBenchmarkDisjointLambdas) {
   }
   ASSERT_TRUE(options->timingsComplete());
   // option 2 is still fastest disregarding intermediate time
-  ASSERT_EQ(options->currentOption(), 2);
+  ASSERT_EQ(options->currentOption(), sleepTimes[1]);
 }
 
 TEST_F(DynamicBenchmark, DynamicBenchmarkMatmul) {
diff --git a/flashlight/fl/test/common/LoggingTest.cpp b/flashlight/fl/test/common/LoggingTest.cpp
index 0807336..def67cf 100644
--- a/flashlight/fl/test/common/LoggingTest.cpp
+++ b/flashlight/fl/test/common/LoggingTest.cpp
@@ -135,7 +135,7 @@ TEST(LoggingDeathTest, FatalOnOff) {
   std::cerr.rdbuf(origStderrBuffer);
 
   Logging::setMaxLoggingLevel(fl::LogLevel::FATAL);
-  EXPECT_DEATH({ FL_LOG(fl::LogLevel::FATAL) << "log-fatal"; }, "");
+  EXPECT_DEATH_IF_SUPPORTED({ FL_LOG(fl::LogLevel::FATAL) << "log-fatal"; }, "");
 }
 
 } // namespace
diff --git a/flashlight/fl/test/runtime/DeviceManagerTest.cpp b/flashlight/fl/test/runtime/DeviceManagerTest.cpp
index f2d228e..bbe6426 100644
--- a/flashlight/fl/test/runtime/DeviceManagerTest.cpp
+++ b/flashlight/fl/test/runtime/DeviceManagerTest.cpp
@@ -38,7 +38,7 @@ TEST(DeviceManagerTest, getDeviceCount) {
     ASSERT_NO_THROW(manager.getDeviceCount(DeviceType::CUDA));
   } else {
     ASSERT_THROW(manager.getDeviceCount(DeviceType::CUDA),
-      std::invalid_argument);
+      std::runtime_error);
   }
 }
 
@@ -54,7 +54,7 @@ TEST(DeviceManagerTest, getDevicesOfType) {
     }
   } else {
     ASSERT_THROW(manager.getDeviceCount(DeviceType::CUDA),
-      std::invalid_argument);
+      std::runtime_error);
    }
   }
 }
@@ -72,7 +72,7 @@ TEST(DeviceManagerTest, getActiveDevice) {
     if (manager.isDeviceTypeAvailable(type)) {
       ASSERT_EQ(manager.getActiveDevice(type).type(), type);
     } else {
-      ASSERT_THROW(manager.getActiveDevice(type), std::invalid_argument);
+      ASSERT_THROW(manager.getActiveDevice(type), std::runtime_error);
     }
   }
 }
diff --git a/flashlight/fl/test/tensor/TensorBaseTest.cpp b/flashlight/fl/test/tensor/TensorBaseTest.cpp
index fd0553b..283c55e 100644
--- a/flashlight/fl/test/tensor/TensorBaseTest.cpp
+++ b/flashlight/fl/test/tensor/TensorBaseTest.cpp
@@ -475,8 +475,11 @@ void assertScalarBehavior(fl::dtype type) {
         << ", ScalarArgType: " << dtype_traits<ScalarArgType>::getName();
   }
 
-  auto a = fl::rand({5, 6}, type);
-  ASSERT_TRUE(allClose(fl::full({1}, a.scalar<ScalarArgType>(), type), a(0, 0)))
+
+  ScalarArgType val = static_cast<ScalarArgType>(rand());
+  auto a = fl::full({5, 6}, val, type);
+
+  ASSERT_TRUE(allClose(fl::full({1}, a.template scalar<ScalarArgType>(), type), a(0, 0)))
       << "dtype: " << type
       << ", ScalarArgType: " << dtype_traits<ScalarArgType>::getName();
 }
diff --git a/flashlight/fl/test/tensor/TensorReductionTest.cpp b/flashlight/fl/test/tensor/TensorReductionTest.cpp
index dac0757..04ce54b 100644
--- a/flashlight/fl/test/tensor/TensorReductionTest.cpp
+++ b/flashlight/fl/test/tensor/TensorReductionTest.cpp
@@ -23,8 +23,9 @@ TEST(TensorReductionTest, countNonzero) {
     a(idx / 10, idx % 10) = 0;
   }
 
-  ASSERT_TRUE(
-      allClose(fl::fromScalar(a.elements() - idxs.size()), fl::countNonzero(a)));
+  ASSERT_TRUE(allClose(
+      fl::fromScalar(a.elements() - idxs.size(), a.type()),
+      fl::countNonzero(a)));
 
   std::vector sizes(a.shape().dim(0));
   for (unsigned i = 0; i < a.shape().dim(0); ++i) {
@@ -44,7 +45,7 @@ TEST(TensorReductionTest, countNonzero) {
       fl::Tensor::fromVector({2}, {4, 1}),
       fl::countNonzero(b, {0, 1})));
   ASSERT_TRUE(
-      allClose(fl::fromScalar(b.elements() - 3), fl::countNonzero(b, {0, 1, 2})));
+      allClose(fl::fromScalar(b.elements() - 3, b.type()), fl::countNonzero(b, {0, 1, 2})));
 }
 
 TEST(TensorReductionTest, amin) {
diff --git a/flashlight/pkg/speech/audio/feature/CMakeLists.txt b/flashlight/pkg/speech/audio/feature/CMakeLists.txt
index f972509..4028388 100644
--- a/flashlight/pkg/speech/audio/feature/CMakeLists.txt
+++ b/flashlight/pkg/speech/audio/feature/CMakeLists.txt
@@ -2,14 +2,19 @@ cmake_minimum_required(VERSION 3.16)
 
 # ----------------------------- Dependencies -----------------------------
 # BLAS
-find_package(MKL)
+find_package(MKL CONFIG REQUIRED)
 if (MKL_FOUND)
   set(FL_USE_MKL ON)
-  setup_install_find_module(${PROJECT_SOURCE_DIR}/cmake/FindMKL.cmake)
-  set(CBLAS_LIBRARIES ${MKL_LIBRARIES})
-  set(CBLAS_INCLUDE_DIR ${MKL_INCLUDE_DIR})
+  #setup_install_find_module(${PROJECT_SOURCE_DIR}/cmake/FindMKL.cmake)
+  #set(CBLAS_LIBRARIES ${MKL_LIBRARIES})
+  #set(CBLAS_INCLUDE_DIR ${MKL_INCLUDE_DIR})
   # TODO: remove me when we consolidate build options
+
+  # TODO linking to MKL doesn't work with gcc because of openmpi issues
   target_compile_definitions(fl_pkg_speech PUBLIC FL_USE_MKL=$)
+  target_link_libraries(fl_pkg_speech PUBLIC MKL::MKL)
+
+
 else()
   find_package(CBLAS REQUIRED)
   setup_install_find_module(${PROJECT_SOURCE_DIR}/cmake/FindCBLAS.cmake)
@@ -29,11 +34,11 @@ endif()
 target_link_libraries(fl_pkg_speech PUBLIC $)
 
 # OpenMP
-if (NOT MKL_FOUND)
+#if (NOT MKL_FOUND)
   # NB: MKL provides iomp if enabled
-  find_package(OpenMP REQUIRED)
-  target_link_libraries(fl_pkg_speech PRIVATE OpenMP::OpenMP_CXX)
-endif()
+# find_package(OpenMP REQUIRED)
+# target_link_libraries(fl_pkg_speech PRIVATE OpenMP::OpenMP_CXX)
+#endif()
 
 # Threads
 find_package(Threads REQUIRED)
diff --git a/flashlight/pkg/speech/test/criterion/attention/AttentionTest.cpp b/flashlight/pkg/speech/test/criterion/attention/AttentionTest.cpp
index c1bc4d9..1bddd6b 100644
--- a/flashlight/pkg/speech/test/criterion/attention/AttentionTest.cpp
+++ b/flashlight/pkg/speech/test/criterion/attention/AttentionTest.cpp
@@ -22,7 +22,8 @@ bool jacobianTestImpl(
     const JacobianFunc& func,
     Variable& input,
     float precision = 1E-5,
-    float perturbation = 1E-4) {
+    float perturbation = 1E-4,
+    const std::vector<Variable*>& zeroGradientVariables = {}) {
   auto fwdJacobian =
       Tensor({func(input).elements(), input.elements()}, fl::dtype::f32);
@@ -48,6 +49,9 @@ bool jacobianTestImpl(
   for (int i = 0; i < dout.elements(); ++i) {
     dout.tensor().flat(i) = 1; // element in 1D view
     input.zeroGrad();
+    for (auto* var : zeroGradientVariables) {
+      var->zeroGrad();
+    }
     auto out = func(input);
     out.backward(dout);
diff --git a/vcpkg.json b/vcpkg.json
index c91b75c..7dcf289 100644
--- a/vcpkg.json
+++ b/vcpkg.json
@@ -12,7 +12,9 @@
     },
     "cpu": {
       "description": "Dependencies for cpu backend",
-      "dependencies": []
+      "dependencies": [
+        "onednn"
+      ]
     }
   }
 }