diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS new file mode 100644 index 0000000..53f2797 --- /dev/null +++ b/.github/CODEOWNERS @@ -0,0 +1,6 @@ +.github/actions/ @codethinki +.github/workflows/ @codethinki +ci/docker/ @codethinki + +LICENSE @codethinki +.clang-format @codethinki diff --git a/.github/actions/build-container/action.yml b/.github/actions/build-container/action.yml new file mode 100644 index 0000000..72a9cfb --- /dev/null +++ b/.github/actions/build-container/action.yml @@ -0,0 +1,32 @@ +name: 'Build Container' +description: 'Builds and pushes the Linux Docker image' +inputs: + dockerfile: + description: 'Path to the Dockerfile' + required: true + tag: + description: 'Tag suffix for the image' + required: true + +runs: + using: "composite" + steps: + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Log in to the Container registry + uses: docker/login-action@v3 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ github.token }} + + - name: Build Docker image + uses: docker/build-push-action@v5 + with: + context: . + file: ${{ inputs.dockerfile }} + push: true + tags: ghcr.io/${{ github.repository }}:${{ inputs.tag }} + cache-from: type=registry,ref=ghcr.io/${{ github.repository }}:${{ inputs.tag }}-cache + cache-to: type=registry,ref=ghcr.io/${{ github.repository }}:${{ inputs.tag }}-cache,mode=max \ No newline at end of file diff --git a/.github/actions/build-flashmini/action.yml b/.github/actions/build-flashmini/action.yml new file mode 100644 index 0000000..d461d1c --- /dev/null +++ b/.github/actions/build-flashmini/action.yml @@ -0,0 +1,62 @@ +name: 'Build flashmini' +description: 'Handles Caching, Configuration, and Compilation' +inputs: + cache_prefix: + description: 'Cache Prefix e.g. OS' + required: true + compiler: + description: 'Compiler' + required: true + backend: + description: 'Backend' + required: true + +runs: + using: "composite" + steps: + # --- 1. Restore Vcpkg --- + - name: Restore vcpkg cache + id: restore-vcpkg + uses: actions/cache/restore@v4 + with: + path: vcpkg_installed + key: vcpkg-${{ inputs.cache_prefix }}-${{ inputs.compiler }}-${{ inputs.backend }}-${{ hashFiles('vcpkg.json') }} + restore-keys: vcpkg-${{ inputs.cache_prefix }}-${{ inputs.compiler }}-${{ inputs.backend }}- + + # --- 2. Configure --- + - name: Configure CMake + shell: bash + run: | + [ -f ci/CMakeUserPresets.json ] && cp ci/CMakeUserPresets.json CMakeUserPresets.json + # Preset: ci_gcc_af_cpu + cmake --preset ci_${{ inputs.compiler }}_af_${{ inputs.backend }} + + # --- 3. Save Vcpkg --- + - name: Save vcpkg cache + if: steps.restore-vcpkg.outputs.cache-hit != 'true' + uses: actions/cache/save@v4 + with: + path: vcpkg_installed + key: ${{ steps.restore-vcpkg.outputs.cache-primary-key }} + + # --- 4. Restore BuildCache --- + - name: Restore BuildCache + id: restore-buildcache + uses: actions/cache/restore@v4 + with: + path: .buildcache + key: buildcache-${{ inputs.cache_prefix }}-${{ inputs.compiler }}-${{ inputs.backend }}-${{ github.run_id }} + restore-keys: buildcache-${{ inputs.cache_prefix }}-${{ inputs.compiler }}-${{ inputs.backend }}- + + # --- 5. Build --- + - name: Build + shell: bash + run: cmake --build --preset ci_${{ inputs.compiler }}_af_${{ inputs.backend }} + + # --- 6. 
Save BuildCache --- + - name: Save BuildCache + if: always() && steps.restore-buildcache.outputs.cache-primary-key != '' + uses: actions/cache/save@v4 + with: + path: .buildcache + key: ${{ steps.restore-buildcache.outputs.cache-primary-key }} \ No newline at end of file diff --git a/.github/actions/setup-windows/action.yml b/.github/actions/setup-windows/action.yml new file mode 100644 index 0000000..09c1b85 --- /dev/null +++ b/.github/actions/setup-windows/action.yml @@ -0,0 +1,85 @@ +name: 'Setup Windows Environment' +description: 'Restores Vcpkg, BuildCache etc.' +inputs: + compiler: + description: 'compiler to CMakeUserPresets.json' + required: true + backend: + description: "backend to compile for" + required: true + +runs: + using: "composite" + steps: + - uses: ilammy/msvc-dev-cmd@v1 + + - name: Setup Vcpkg Environment + shell: pwsh + run: | + $vcpkgPath = $env:VCPKG_INSTALLATION_ROOT + if (-not (Test-Path "$vcpkgPath")) { + Write-Error "Vcpkg not found at VCPKG_INSTALLATION_ROOT ($vcpkgPath)" + exit 1 + } + "VCPKG_ROOT=$vcpkgPath" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append + + - name: Install BuildCache + shell: pwsh + run: | + Invoke-WebRequest -Uri "https://gitlab.com/bits-n-bites/buildcache/-/releases/v0.31.7/downloads/buildcache-windows.zip" -OutFile "buildcache.zip" + Expand-Archive buildcache.zip -DestinationPath c:\buildcache + echo "c:\buildcache\buildcache\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append + + - name: Install CUDA (micromamba) + if: inputs.backend == 'cuda' + uses: mamba-org/setup-micromamba@v1 + with: + environment-name: cuda-env + condarc: | + channels: + - nvidia + - conda-forge + create-args: >- + cuda-toolkit=12.3.0 + cache-environment: true + init-shell: powershell + + - name: Set CUDA Environment Variables + if: inputs.backend == 'cuda' + shell: powershell + run: | + "CUDAToolkit_ROOT=$env:CONDA_PREFIX" >> $env:GITHUB_ENV + "CUDA_PATH=$env:CONDA_PREFIX" >> $env:GITHUB_ENV + "$env:CONDA_PREFIX\Library\bin" >> $env:GITHUB_PATH + "$env:CONDA_PREFIX\bin" >> $env:GITHUB_PATH + + - name: Cache ArrayFire + id: cache-arrayfire-windows + uses: actions/cache@v4 + with: + path: C:\tools\ArrayFire + key: arrayfire-windows-3.10.0 + + - name: "Install ArrayFire" + if: steps.cache-arrayfire-windows.outputs.cache-hit != 'true' + run: | + choco install --no-progress wget -y + cd $HOME + wget -nv https://arrayfire.gateway.scarf.sh/windows/3.10.0/ArrayFire.exe -O ArrayFire.exe + 7z.exe x ArrayFire.exe -o"C:\tools\ArrayFire" -y + rm ArrayFire.exe + shell: bash -el {0} + + - name: Set ArrayFire Env + run: | + echo "ArrayFire_DIR=C:\tools\ArrayFire" >> $GITHUB_ENV + echo "C:\tools\ArrayFire\lib" >> $GITHUB_PATH + shell: bash -el {0} + + - uses: actions/setup-python@v5 + with: + python-version: "3.x" + + - name: Install CMake & Ninja + run: pip install --upgrade cmake ninja + shell: powershell \ No newline at end of file diff --git a/.github/actions/test-flashmini/action.yml b/.github/actions/test-flashmini/action.yml new file mode 100644 index 0000000..58fc3ff --- /dev/null +++ b/.github/actions/test-flashmini/action.yml @@ -0,0 +1,13 @@ +name: 'Test flashmini' +description: 'Runs CTest in the specified directory' +inputs: + test_dir: + description: 'Directory containing the CTestTestfile.cmake (usually build dir)' + required: true + +runs: + using: "composite" + steps: + - name: Test + shell: bash + run: ctest --test-dir ${{ inputs.test_dir }} -C Release --output-on-failure \ No newline at end of file diff --git 
a/.github/workflows/build-test.yml b/.github/workflows/build-test.yml new file mode 100644 index 0000000..7f4c5c0 --- /dev/null +++ b/.github/workflows/build-test.yml @@ -0,0 +1,40 @@ +name: Build & Test + +on: + push: { branches: ["master", "_master/add_ci"] } + pull_request: { branches: ["master"] } + workflow_dispatch: + +permissions: { contents: read, packages: write } + +jobs: + linux: + name: Linux (${{ matrix.compiler }}, ${{ matrix.backend }}) + strategy: + fail-fast: false + matrix: + compiler: [gcc] + backend: [cpu, cuda] + + uses: ./.github/workflows/linux-pipeline.yml + with: + compiler: ${{ matrix.compiler }} + backend: ${{ matrix.backend }} + # run_tests is true only if backend is cpu + run_tests: ${{ matrix.backend == 'cpu' }} + secrets: inherit + + windows: + name: Windows (${{ matrix.compiler }}, ${{ matrix.backend }}) + strategy: + fail-fast: false + matrix: + compiler: [msvc] + backend: [cpu, cuda] + + uses: ./.github/workflows/windows-pipeline.yml + with: + compiler: ${{ matrix.compiler }} + backend: ${{ matrix.backend }} + run_tests: ${{ matrix.backend == 'cpu' }} + secrets: inherit \ No newline at end of file diff --git a/.github/workflows/docker_image_build.yml b/.github/workflows/docker_image_build.yml deleted file mode 100644 index 594b0f5..0000000 --- a/.github/workflows/docker_image_build.yml +++ /dev/null @@ -1,48 +0,0 @@ -name: Publish Docker images -on: - push: - branches: - - master -jobs: - cuda_image_build: - if: github.repository_owner == 'flashlight' - name: CUDA image build - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@master - - name: Build the CUDA Docker image - run: docker build . --file .docker/Dockerfile-CUDA --tag flml/flashlight:cuda-latest - - name: Docker login - env: - USER: ${{ secrets.DOCKER_USERNAME }} - PASSWORD: ${{ secrets.DOCKER_TOKEN }} - run: docker login -u=$USER -p=$PASSWORD - - name: Push image with the latest tag - run: docker push flml/flashlight:cuda-latest - - name: Tag revision - run: docker tag flml/flashlight:cuda-latest flml/flashlight:cuda-`git rev-parse --short HEAD` - - name: Push image with the revision tag - run: docker push flml/flashlight:cuda-`git rev-parse --short HEAD` - - name: Docker logout - run: docker logout - cpu_image_build: - if: github.repository_owner == 'flashlight' - name: CPU image build - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@master - - name: Build the CPU Docker image - run: docker build . 
--file .docker/Dockerfile-CPU --tag flml/flashlight:cpu-latest - - name: Docker login - env: - USER: ${{ secrets.DOCKER_USERNAME }} - PASSWORD: ${{ secrets.DOCKER_TOKEN }} - run: docker login -u=$USER -p=$PASSWORD - - name: Push image with the latest tag - run: docker push flml/flashlight:cpu-latest - - name: Tag revision - run: docker tag flml/flashlight:cpu-latest flml/flashlight:cpu-`git rev-parse --short HEAD` - - name: Push image with the revision tag - run: docker push flml/flashlight:cpu-`git rev-parse --short HEAD` - - name: Docker logout - run: docker logout diff --git a/.github/workflows/linux-pipeline.yml b/.github/workflows/linux-pipeline.yml new file mode 100644 index 0000000..f3047f3 --- /dev/null +++ b/.github/workflows/linux-pipeline.yml @@ -0,0 +1,66 @@ +name: Linux Pipeline + +on: + workflow_call: + inputs: + compiler: + required: true + type: string + backend: + required: true + type: string + run_tests: + required: true + type: boolean + +permissions: + contents: read + packages: write + +jobs: + prepare: + name: Build Container (${{ inputs.backend }}) + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Build & Push + uses: ./.github/actions/build-container + with: + dockerfile: ci/docker/linux/Dockerfile.${{ inputs.backend }} + tag: linux-${{ inputs.backend }} + + build: + name: Linux (${{ inputs.compiler }}-${{ inputs.backend }}) + needs: prepare + runs-on: ubuntu-latest + container: + image: ghcr.io/${{ github.repository }}:linux-${{ inputs.backend }} + credentials: + username: ${{ github.actor }} + password: ${{ github.token }} + options: >- + -v /usr/local/share/vcpkg:/vcpkg + -e VCPKG_ROOT=/vcpkg + env: + VCPKG_ROOT: /vcpkg + steps: + - uses: actions/checkout@v4 + with: + submodules: recursive + + - name: Build + uses: ./.github/actions/build-flashmini + with: + cache_prefix: linux + compiler: ${{ inputs.compiler }} + backend: ${{ inputs.backend }} + + - name: Test + if: inputs.run_tests + uses: ./.github/actions/test-flashmini + with: + test_dir: out/build/ci_${{ inputs.compiler }}_af_${{ inputs.backend }} + + - name: Fix permissions + if: always() + run: sudo chown -R $(id -u):$(id -g) out/build \ No newline at end of file diff --git a/.github/workflows/windows-pipeline.yml b/.github/workflows/windows-pipeline.yml new file mode 100644 index 0000000..40c75fc --- /dev/null +++ b/.github/workflows/windows-pipeline.yml @@ -0,0 +1,45 @@ +name: Windows Pipeline + +on: + workflow_call: + inputs: + compiler: + required: true + type: string + backend: + required: true + type: string + run_tests: + required: true + type: boolean + +jobs: + build: + name: Windows (${{ inputs.compiler }}-${{ inputs.backend }}) + runs-on: windows-latest + defaults: + run: + shell: bash # Forces Git Bash for consistency with Linux scripts + + steps: + - uses: actions/checkout@v4 + with: + submodules: recursive + + - uses: ./.github/actions/setup-windows + with: + compiler: ${{ inputs.compiler }} + backend: ${{ inputs.backend }} + + - name: Build + uses: ./.github/actions/build-flashmini + with: + cache_prefix: windows + compiler: ${{ inputs.compiler }} + backend: ${{ inputs.backend }} + + - name: Test + if: inputs.run_tests + uses: ./.github/actions/test-flashmini + with: + test_dir: out/build/ci_${{ inputs.compiler }}_af_${{ inputs.backend }} \ No newline at end of file diff --git a/.gitignore b/.gitignore index 1b55e23..9b1beca 100644 --- a/.gitignore +++ b/.gitignore @@ -4,6 +4,7 @@ release/ debug/ out/ vcpkg_installed/ +Testing/ *.so # FB diff --git a/CMakeLists.txt 
b/CMakeLists.txt index a360cc2..5bfeed9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -23,6 +23,9 @@ set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_CUDA_STANDARD 14) set(CMAKE_CUDA_STANDARD_REQUIRED ON) +# no modules in this library +set(CMAKE_CXX_SCAN_FOR_MODULES OFF) + # Default directories for installation set(FL_INSTALL_INC_DIR "include" CACHE PATH "Install path for headers") set(FL_INSTALL_INC_DIR_HEADER_LOC ${FL_INSTALL_INC_DIR}/flashlight) diff --git a/CMakePresets.json b/CMakePresets.json index bf19f1d..aec9e32 100644 --- a/CMakePresets.json +++ b/CMakePresets.json @@ -8,6 +8,11 @@ "CMAKE_TOOLCHAIN_FILE": "cmake/utils/toolchain.cmake" } }, + { + "name": "ninja", + "hidden": true, + "generator": "Ninja Multi-Config" + }, { "name": "wsl-settings", "hidden": true, @@ -26,11 +31,7 @@ } } }, - { - "name": "ninja", - "hidden": true, - "generator": "Ninja Multi-Config" - }, + { "name": "base", "hidden": true, @@ -39,7 +40,10 @@ "ninja", "vcpkg" ], - "binaryDir": "${sourceDir}/out/build/${presetName}" + "binaryDir": "${sourceDir}/out/build/${presetName}", + "cacheVariables": { + "CMAKE_EXPORT_COMPILE_COMMANDS": true + } }, { "name": "msvc", @@ -79,10 +83,8 @@ "cacheVariables": { "CMAKE_CUDA_COMPILER": "nvcc", "FL_USE_CUDNN": false, - "CMAKE_CUDA_ARCHITECTURES": "native", "CMAKE_CUDA_FLAGS": "-allow-unsupported-compiler", - "VCPKG_MANIFEST_FEATURES": "cuda" } }, @@ -104,7 +106,6 @@ "name": "af-backend-base", "hidden": true, "cacheVariables": { - "FL_BUILD_ARRAYFIRE": true, "FL_USE_ARRAYFIRE": true } }, @@ -116,7 +117,8 @@ "af-backend-base" ], "cacheVariables": { - "FL_ARRAYFIRE_USE_CPU": true + "FL_ARRAYFIRE_USE_CPU": true, + "FL_USE_ONEDNN": true } }, { @@ -162,5 +164,57 @@ "af-cpu-backend" ] } + ], + "buildPresets": [ + { + "name": "release-base", + "hidden": true, + "configuration": "Release" + }, + { + "name": "debug-base", + "hidden": true, + "configuration": "Debug" + }, + { + "name": "msvc_af_cuda_release", + "configurePreset": "msvc_af_cuda", + "inherits": "release-base" + }, + { + "name": "gcc_af_cuda_release", + "configurePreset": "gcc_af_cuda", + "inherits": "release-base" + }, + { + "name": "msvc_af_cpu_release", + "configurePreset": "msvc_af_cpu", + "inherits": "release-base" + }, + { + "name": "gcc_af_cpu_release", + "configurePreset": "gcc_af_cpu", + "inherits": "release-base" + }, + { + "name": "msvc_af_cuda_debug", + "configurePreset": "msvc_af_cuda", + "inherits": "debug-base" + }, + { + "name": "gcc_af_cuda_debug", + "configurePreset": "gcc_af_cuda", + "inherits": "debug-base" + }, + { + "name": "msvc_af_cpu_debug", + "configurePreset": "msvc_af_cpu", + "inherits": "debug-base" + }, + { + "name": "gcc_af_cpu_debug", + "configurePreset": "gcc_af_cpu", + "inherits": "debug-base" + } ] } \ No newline at end of file diff --git a/ci/CMakeUserPresets.json b/ci/CMakeUserPresets.json new file mode 100644 index 0000000..a82111d --- /dev/null +++ b/ci/CMakeUserPresets.json @@ -0,0 +1,90 @@ +{ + "version": 3, + "configurePresets": [ + { + "name": "ci-vcpkg", + "hidden": true, + "cacheVariables": { + "VCPKG_INSTALLED_DIR": "${sourceDir}/vcpkg_installed" + } + }, + { + "name": "ci-buildcache", + "hidden": true, + "environment": { + "BUILDCACHE_DIR": "${sourceDir}/.buildcache", + "BUILDCACHE_ACCURACY": "SLOPPY", + "BUILDCACHE_MAX_CACHE_SIZE": "524288000" + } + }, + + + { + "name": "ci-config-base", + "hidden": true, + "inherits": [ + "ci-vcpkg", + "ci-buildcache" + ], + "cacheVariables": { + "FL_BUILD_TESTS": "ON", + "FL_BUILD_STANDALONE": "ON" + } + }, + { + "name": 
"ci_msvc_af_cpu", + "inherits": [ + "ci-config-base", + "msvc_af_cpu" + ] + }, + { + "name": "ci_msvc_af_cuda", + "inherits": [ + "ci-config-base", + "msvc_af_cuda" + ] + }, + { + "name": "ci_gcc_af_cpu", + "inherits": [ + "ci-config-base", + "gcc_af_cpu" + ] + }, + { + "name": "ci_gcc_af_cuda", + "inherits": [ + "ci-config-base", + "gcc_af_cuda" + ] + } + ], + "buildPresets": [ + { + "name": "ci-build-base", + "hidden": true, + "configuration": "Release" + }, + { + "name": "ci_msvc_af_cpu", + "configurePreset": "ci_msvc_af_cpu", + "inherits": "ci-build-base" + }, + { + "name": "ci_msvc_af_cuda", + "configurePreset": "ci_msvc_af_cuda", + "inherits": "ci-build-base" + }, + { + "name": "ci_gcc_af_cpu", + "configurePreset": "ci_gcc_af_cpu", + "inherits": "ci-build-base" + }, + { + "name": "ci_gcc_af_cuda", + "configurePreset": "ci_gcc_af_cuda", + "inherits": "ci-build-base" + } + ] +} \ No newline at end of file diff --git a/ci/docker/linux/Dockerfile.cpu b/ci/docker/linux/Dockerfile.cpu new file mode 100644 index 0000000..1bbcd0e --- /dev/null +++ b/ci/docker/linux/Dockerfile.cpu @@ -0,0 +1,25 @@ +FROM cachyos/cachyos:latest + +RUN pacman -Syu --noconfirm && \ + pacman -S --noconfirm \ + base-devel \ + cmake \ + ninja \ + git \ + openmpi \ + python \ + python-pip \ + vcpkg \ + wget \ + cloc \ + buildcache-git + +ENV VCPKG_ROOT=/opt/vcpkg + +# Install ArrayFire from script +RUN wget -nv https://arrayfire.s3.amazonaws.com/3.10.0/ArrayFire-v3.10.0_Linux_x86_64.sh -O af_installer.sh && \ + chmod +x af_installer.sh && \ + ./af_installer.sh --include-subdir --prefix=/opt --skip-license --yes && \ + rm af_installer.sh +ENV ArrayFire_DIR=/opt/arrayfire +ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/arrayfire/lib64 diff --git a/ci/docker/linux/Dockerfile.cuda b/ci/docker/linux/Dockerfile.cuda new file mode 100644 index 0000000..1d76e18 --- /dev/null +++ b/ci/docker/linux/Dockerfile.cuda @@ -0,0 +1,35 @@ +FROM cachyos/cachyos:latest + +RUN pacman -Syu --noconfirm && \ + pacman -S --noconfirm \ + base-devel \ + cmake \ + ninja \ + git \ + openmpi \ + cuda \ + cudnn \ + python \ + python-pip \ + vcpkg \ + wget \ + cloc \ + buildcache-git + +ENV VCPKG_ROOT=/opt/vcpkg + +#symlink for cuda stubs +RUN ln -sf /opt/cuda/targets/x86_64-linux/lib/stubs/libcuda.so /usr/lib/libcuda.so.1 && \ + ln -sf /opt/cuda/targets/x86_64-linux/lib/stubs/libcuda.so /usr/lib/libcuda.so + + +# Install ArrayFire from script +RUN wget -nv https://arrayfire.s3.amazonaws.com/3.10.0/ArrayFire-v3.10.0_Linux_x86_64.sh -O af_installer.sh && \ + chmod +x af_installer.sh && \ + ./af_installer.sh --include-subdir --prefix=/opt --skip-license --yes && \ + rm af_installer.sh + +ENV ArrayFire_DIR=/opt/arrayfire +ENV CUDA_HOME=/opt/cuda +ENV PATH=/opt/cuda/bin:$PATH +ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/cuda/lib64:/opt/arrayfire/lib64 diff --git a/.circleci/config.yml b/ci/unused/.circleci/config.yml similarity index 100% rename from .circleci/config.yml rename to ci/unused/.circleci/config.yml diff --git a/.docker/Dockerfile-CPU b/ci/unused/.docker/Dockerfile-CPU similarity index 100% rename from .docker/Dockerfile-CPU rename to ci/unused/.docker/Dockerfile-CPU diff --git a/.docker/Dockerfile-CPU-Base b/ci/unused/.docker/Dockerfile-CPU-Base similarity index 100% rename from .docker/Dockerfile-CPU-Base rename to ci/unused/.docker/Dockerfile-CPU-Base diff --git a/.docker/Dockerfile-CUDA b/ci/unused/.docker/Dockerfile-CUDA similarity index 100% rename from .docker/Dockerfile-CUDA rename to ci/unused/.docker/Dockerfile-CUDA diff --git 
a/.docker/Dockerfile-CUDA-Base b/ci/unused/.docker/Dockerfile-CUDA-Base similarity index 100% rename from .docker/Dockerfile-CUDA-Base rename to ci/unused/.docker/Dockerfile-CUDA-Base diff --git a/.docker/README.md b/ci/unused/.docker/README.md similarity index 100% rename from .docker/README.md rename to ci/unused/.docker/README.md diff --git a/.github/actions/install_core_deps/action.yml b/ci/unused/.github/actions/install-core-deps/action.yml similarity index 71% rename from .github/actions/install_core_deps/action.yml rename to ci/unused/.github/actions/install-core-deps/action.yml index 98a2e21..6627ddb 100644 --- a/.github/actions/install_core_deps/action.yml +++ b/ci/unused/.github/actions/install-core-deps/action.yml @@ -16,6 +16,31 @@ runs: steps: # ]----- Backend dependencies # ArrayFire + - name: Cache ArrayFire (Windows) + id: cache-arrayfire-windows + uses: actions/cache@v4 + if: runner.os == 'Windows' && inputs.backend == 'ArrayFire' + with: + path: C:\tools\ArrayFire + key: arrayfire-windows-3.10.0 + + - name: "Install ArrayFire (Windows)" + if: runner.os == 'Windows' && inputs.backend == 'ArrayFire' && steps.cache-arrayfire-windows.outputs.cache-hit != 'true' + run: | + choco install --no-progress wget -y + cd $HOME + wget -nv https://arrayfire.gateway.scarf.sh/windows/3.10.0/ArrayFire.exe -O ArrayFire.exe + 7z.exe x ArrayFire.exe -o"C:\tools\ArrayFire" -y + rm ArrayFire.exe + shell: bash -el {0} + + - name: Set ArrayFire Env (Windows) + if: runner.os == 'Windows' && inputs.backend == 'ArrayFire' + run: | + echo "ArrayFire_DIR=C:\tools\ArrayFire" >> $GITHUB_ENV + echo "C:\tools\ArrayFire\lib" >> $GITHUB_PATH + shell: bash -el {0} + - name: "Install ArrayFire (Linux)" run: | sudo apt update @@ -25,20 +50,11 @@ runs: sudo apt install arrayfire-cmake=3.8.1-2 arrayfire-headers=3.8.1-2 arrayfire-cpu3-mkl=3.8.1-2 arrayfire-cpu3-dev=3.8.1-2 if: runner.os == 'Linux' && inputs.backend == 'ArrayFire' shell: bash -el {0} + - name: "Install ArrayFire (macOS)" run: brew install arrayfire if: runner.os == 'macOS' && inputs.backend == 'ArrayFire' shell: bash -el {0} - - name: "Install ArrayFire (Windows)" - run: | - choco install --no-progress wget -y - cd $HOME - INSTALLER_NAME="ArrayFire-v3.8.1-CUDA-11.4.exe" - wget --quiet https://arrayfire.s3.amazonaws.com/3.8.1/$INSTALLER_NAME - 7z.exe x $INSTALLER_NAME -o"C:\Program Files\ArrayFire" -y - rm $INSTALLER_NAME - if: runner.os == 'Windows' && inputs.backend == 'ArrayFire' - shell: bash -el {0} # oneDNN - name: Install oneDNN with micromamba uses: mamba-org/setup-micromamba@v1 diff --git a/.github/actions/install_pkg_deps/action.yml b/ci/unused/.github/actions/install_pkg_deps/action.yml similarity index 100% rename from .github/actions/install_pkg_deps/action.yml rename to ci/unused/.github/actions/install_pkg_deps/action.yml diff --git a/.github/workflows/build.yml b/ci/unused/.github/workflows/build.yml similarity index 100% rename from .github/workflows/build.yml rename to ci/unused/.github/workflows/build.yml diff --git a/.github/workflows/build_docs.yml b/ci/unused/.github/workflows/build_docs.yml similarity index 100% rename from .github/workflows/build_docs.yml rename to ci/unused/.github/workflows/build_docs.yml diff --git a/cmake/TestUtils.cmake b/cmake/TestUtils.cmake index 13c5483..8f9a228 100644 --- a/cmake/TestUtils.cmake +++ b/cmake/TestUtils.cmake @@ -1,4 +1,5 @@ -cmake_minimum_required(VERSION 3.5.1) +cmake_minimum_required(VERSION 3.21) +include(fm_target_utilities) set(GTEST_IMPORTED_TARGETS "") @@ -61,6 +62,7 @@ 
function(build_test) PUBLIC ${build_test_PREPROC} ) + if (CMAKE_SYSTEM_NAME STREQUAL "Windows") target_compile_definitions( ${target} @@ -70,4 +72,8 @@ function(build_test) ) endif() gtest_add_tests(TARGET ${target}) + + if(WIN32) + fm_target_copy_dependencies(${target}) + endif() endfunction(build_test) diff --git a/cmake/utils/fm_target_utilities.cmake b/cmake/utils/fm_target_utilities.cmake index 51dbf09..ce2e437 100644 --- a/cmake/utils/fm_target_utilities.cmake +++ b/cmake/utils/fm_target_utilities.cmake @@ -34,11 +34,24 @@ function(fm_glob OUT_VAR) set(SUB_PATHS ${ARG_UNPARSED_ARGUMENTS}) set(GLOB_PATTERNS "") - foreach(SUB_PATH IN LISTS SUB_PATHS) - foreach(PATTERN IN LISTS ARG_PATTERNS) - list(APPEND GLOB_PATTERNS "${SUB_PATH}/${PATTERN}") + if(SUB_PATHS AND ARG_PATTERNS) + # Case 1: Both paths and patterns provided - generate cross-product + foreach(SUB_PATH IN LISTS SUB_PATHS) + foreach(PATTERN IN LISTS ARG_PATTERNS) + if(SUB_PATH) + list(APPEND GLOB_PATTERNS "${SUB_PATH}/${PATTERN}") + else() + list(APPEND GLOB_PATTERNS "${PATTERN}") + endif() + endforeach() endforeach() - endforeach() + elseif(SUB_PATHS) + # Case 2: Only sub_paths provided - treat them as full patterns + set(GLOB_PATTERNS ${SUB_PATHS}) + elseif(ARG_PATTERNS) + # Case 3: Only patterns provided - treat them as full patterns + set(GLOB_PATTERNS ${ARG_PATTERNS}) + endif() if(GLOB_PATTERNS) file(GLOB_RECURSE FOUND_FILES @@ -407,4 +420,231 @@ function(fm_add_clang_format_target TARGET_NAME) COMMENT "Formatting all source files with clang-format..." VERBATIM ) +endfunction() + +function(_fm_ensure_stub_lib) + set(stub_target "fm_link_attachment_stub") + if(TARGET ${stub_target}) + return() + endif() + + # Define a predictable, flat output directory + set(stub_dir "${CMAKE_BINARY_DIR}/_fm_internal") + file(MAKE_DIRECTORY "${stub_dir}") + + # Generate dummy source + set(stub_src "${stub_dir}/stub.c") + if(NOT EXISTS "${stub_src}") + file(WRITE "${stub_src}" "void fm_link_attachment_stub_symbol(void) {}\n") + endif() + + # Create the real static library + add_library(${stub_target} STATIC "${stub_src}") + + # FORCE the output location to be flat. + set_target_properties(${stub_target} PROPERTIES + ARCHIVE_OUTPUT_DIRECTORY "${stub_dir}" + OUTPUT_NAME "fm_link_attachment_stub" + ) + + if(CMAKE_CONFIGURATION_TYPES) + foreach(config ${CMAKE_CONFIGURATION_TYPES}) + string(TOUPPER "${config}" config_upper) + set_target_properties(${stub_target} PROPERTIES + ARCHIVE_OUTPUT_DIRECTORY_${config_upper} "${stub_dir}" + ) + endforeach() + endif() +endfunction() + +#[[.rst: +.. command:: fm_target_attach_dependency + + .. code-block:: cmake + + fm_target_attach_dependency( ) + + Attaches external file dependencies (like DLLs) to a target by creating imported targets. + This allows CMake to track these dependencies. + + :param target: The target to attach dependencies to. + :type target: string + :param mode: Attachment mode. Must be ``LINK`` or ``NOLINK``. + - ``LINK``: The file behaves like a linked library (implicit link). + - ``NOLINK``: The file is attached but not linked (e.g., runtime-only DLL). + :type mode: string + :param files: List of file paths to attach. + :type files: list of strings + + :pre: ``target`` must exist. + :pre: ``mode`` must be ``LINK`` or ``NOLINK``. + + .. note:: + Creates internal targets named ``__attachment_`` for each file. + In ``NOLINK`` mode, a dummy stub library is linked to satisfy CMake's requirement + for SHARED libraries on Windows, preventing LNK1107 errors. 
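+ + For illustration only, a hedged usage sketch (the target name and DLL paths here are hypothetical, not taken from this repository): + + .. code-block:: cmake + + # runtime-only DLL: tracked and copied, but never passed to the linker + fm_target_attach_dependency(my_app NOLINK "C:/deps/bin/runtime_only.dll") + # DLL with an import library next to it: behaves like a linked library + fm_target_attach_dependency(my_app LINK "C:/deps/bin/some_library.dll")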
+#]] +function(fm_target_attach_dependency target mode) + fm_assert_target(${target}) + fm_assert_true( + "${mode}" MATCHES "^(LINK|NOLINK)$" + ) + + _fm_ensure_stub_lib() + + set(stub_dir "${CMAKE_BINARY_DIR}/_fm_internal") + set( + stub_lib_path + "${stub_dir}/${CMAKE_STATIC_LIBRARY_PREFIX}fm_link_attachment_stub${CMAKE_STATIC_LIBRARY_SUFFIX}" + ) + + foreach(file_path ${ARGN}) + # Resolve full path immediately + get_filename_component(abs_path "${file_path}" ABSOLUTE) + get_filename_component(file_name "${file_path}" NAME) + + # Generate unique target name + string(MD5 path_hash "${abs_path}") + set(leaf_target_name "_${target}_${mode}_${path_hash}") + + if(NOT TARGET ${leaf_target_name}) + + # --- PLATFORM LOGIC --- + if(WIN32) + # Windows: Always SHARED IMPORTED. + add_library(${leaf_target_name} SHARED IMPORTED GLOBAL) + set_target_properties(${leaf_target_name} PROPERTIES + IMPORTED_LOCATION "${abs_path}" + ) + + if(mode STREQUAL "NOLINK") + # NOLINK: Point IMPLIB to the dummy stub. + set_target_properties(${leaf_target_name} PROPERTIES + IMPORTED_IMPLIB "${stub_lib_path}" + ) + # Ensure stub is built before linking + add_dependencies(${leaf_target_name} fm_link_attachment_stub) + else() + # LINK: Calculate the real import library path. + get_filename_component(dir_name "${abs_path}" DIRECTORY) + get_filename_component(name_we "${abs_path}" NAME_WE) + + # Construct path: dir / [prefix]filename[suffix] + set(implib_path "${dir_name}/${CMAKE_IMPORT_LIBRARY_PREFIX}${name_we}${CMAKE_IMPORT_LIBRARY_SUFFIX}") + + if(EXISTS "${implib_path}") + set_target_properties(${leaf_target_name} PROPERTIES + IMPORTED_IMPLIB "${implib_path}" + ) + else() + message(WARNING "fm_target_attach_dependency: LINK mode for ${file_name}, but import lib not found at: ${implib_path}") + endif() + endif() + + else() + # Unix/macOS: + # NOLINK -> MODULE (Loadable, not linked) + # LINK -> SHARED (Linked) + if(mode STREQUAL "NOLINK") + add_library(${leaf_target_name} MODULE IMPORTED GLOBAL) + else() + add_library(${leaf_target_name} SHARED IMPORTED GLOBAL) + endif() + + set_target_properties(${leaf_target_name} PROPERTIES + IMPORTED_LOCATION "${abs_path}" + ) + endif() + + endif() + + # Link the imported target to the main target + target_link_libraries(${target} PRIVATE ${leaf_target_name}) + endforeach() +endfunction() + +#[[.rst: +.. command:: fm_target_copy_dependencies + + .. code-block:: cmake + + fm_target_copy_dependencies(<target>) + + Adds a post-build step to copy runtime dependencies (DLLs) to the target's output directory. + Uses the ``$<TARGET_RUNTIME_DLLS:target>`` generator expression. + + :param target: The target to copy dependencies for. + :type target: string + + :pre: ``target`` must exist. + :pre: ``target`` must be an ``EXECUTABLE`` or ``SHARED_LIBRARY``. + :post: Runtime dependencies are copied to ``$<TARGET_FILE_DIR:target>`` after build. +#]] +function(fm_target_copy_dependencies target) + fm_assert_target("${target}") + + get_target_property(TGT_TYPE ${target} TYPE) + fm_assert_true("${TGT_TYPE}" MATCHES "^(EXECUTABLE|SHARED_LIBRARY)$" + REASON "fm_target_copy_dependencies: Target '${target}' is of type '${TGT_TYPE}'. This function only supports EXECUTABLES or SHARED_LIBRARIES." + ) + + get_target_property(_registered ${target} _FM_COPY_DEPS_REGISTERED) + if(_registered) + message(WARNING "fm_target_copy_dependencies(${target}) called multiple times!") + return() + endif() + set_property(TARGET ${target} PROPERTY _FM_COPY_DEPS_REGISTERED TRUE) + + set(RETRY_SCRIPT "${CMAKE_CURRENT_BINARY_DIR}/${target}_copy_retry_$<CONFIG>.cmake") + + # Generate the script. 
We use file(GENERATE) so generator expressions resolve correctly. + file(GENERATE OUTPUT "${RETRY_SCRIPT}" CONTENT " + cmake_minimum_required(VERSION 3.21) + + set(DLLS \"$<TARGET_RUNTIME_DLLS:${target}>\") + set(DEST \"$<TARGET_FILE_DIR:${target}>\") + + # Exit early if no DLLs to copy + if(NOT DLLS) + return() + endif() + + # Retry Loop: Try up to 5 times + foreach(i RANGE 1 5) + # 1. Try to copy ALL files in one go (Fast). + # 'copy_if_different' is idempotent; if 49/50 files succeed, + # the next attempt only copies the 1 failed file. + execute_process( + COMMAND \${CMAKE_COMMAND} -E copy_if_different \${DLLS} \${DEST} + RESULT_VARIABLE CMD_RESULT + ERROR_VARIABLE CMD_ERR + OUTPUT_VARIABLE CMD_OUT + ) + + # 2. Check success + if(CMD_RESULT EQUAL 0) + return() + endif() + + # 3. Handle failure + if(\${i} LESS 5) + # Print a warning but don't fail yet + message(STATUS \"[${target}] Copy failed (Attempt \${i}/5). Retrying in 1s...\") + + # sleep + execute_process(COMMAND \${CMAKE_COMMAND} -E sleep 1) + else() + # Final attempt failed, print error and exit with failure code + message(STATUS \"\${CMD_OUT}\") + message(STATUS \"\${CMD_ERR}\") + message(FATAL_ERROR \"[${target}] Failed to copy dependencies after 5 attempts.\") + endif() + endforeach() + ") + + # Add the post-build step to run the generated script + add_custom_command(TARGET ${target} POST_BUILD + COMMAND ${CMAKE_COMMAND} -P "${RETRY_SCRIPT}" + COMMENT "Propagating runtime dependencies for ${target} ..." + ) endfunction() \ No newline at end of file diff --git a/cmake/utils/fm_tool_utilities.cmake b/cmake/utils/fm_tool_utilities.cmake index ad297a4..48c144f 100644 --- a/cmake/utils/fm_tool_utilities.cmake +++ b/cmake/utils/fm_tool_utilities.cmake @@ -8,25 +8,38 @@ include(fm_assertions) .. code-block:: cmake - fm_find_program(<OUT_VAR> <prog> [args...]) + fm_find_program(<OUT_VAR> <prog> [OPTIONAL] [args...]) Locates an external program and exports its path to the parent scope. + If OPTIONAL is specified, does not error if program is not found. :param OUT_VAR: variable to export program path to :param prog: Name of the program to find - :type prog: string - :param args: Additional arguments to pass to find_program (e.g., PATHS, HINTS) - :type args: optional arguments + :param OPTIONAL: If specified, do not raise FATAL_ERROR if program is not found + :param args: Additional arguments to pass to find_program + + :post: variable is set in PARENT_SCOPE with the full path to the program, or configuration terminates with FATAL_ERROR if not found and not OPTIONAL - :post: variable is set in PARENT_SCOPE with the full path to the program, or configuration terminates with FATAL_ERROR if not found #]] function(fm_find_program OUT_VAR prog) + cmake_parse_arguments(PARSE_ARGV 2 ARG "OPTIONAL" "" "") + + find_program(${OUT_VAR} "${prog}" ${ARG_UNPARSED_ARGUMENTS}) - find_program(${OUT_VAR} "${prog}" ${ARGN}) - + if(${OUT_VAR}) + message(STATUS "${prog} found: ${${OUT_VAR}}") + message(VERBOSE "${prog} location: ${${OUT_VAR}}") + set(${OUT_VAR} "${${OUT_VAR}}" PARENT_SCOPE) + return() + endif() + + if(ARG_OPTIONAL) + message(STATUS "${prog} not found") + set(${OUT_VAR} "" PARENT_SCOPE) + return() + endif() + fm_assert_true(${OUT_VAR} REASON "Program '${prog}' not found") - - set(${OUT_VAR} "${${OUT_VAR}}" PARENT_SCOPE) endfunction() #[[.rst: @@ -34,10 +47,11 @@ endfunction() .. code-block:: cmake - fm_enable_build_cache() + fm_enable_build_cache([OPTIONAL]) Enables BuildCache globally for all targets by setting compiler launcher variables. 
+ :param OPTIONAL: If specified, do not raise FATAL_ERROR if buildcache is not found :pre: buildcache program is found in PATH :post: CMAKE_C_COMPILER_LAUNCHER and CMAKE_CXX_COMPILER_LAUNCHER are set to buildcache in PARENT_SCOPE @@ -54,9 +68,20 @@ endfunction() #]] function(fm_enable_build_cache) - fm_find_program(BUILDCACHE_EXECUTABLE buildcache) + cmake_parse_arguments(PARSE_ARGV 0 ARG "OPTIONAL" "" "") - message(STATUS "Enabling buildcache globally: ${BUILDCACHE_EXECUTABLE}") + if (ARG_OPTIONAL) + fm_find_program(BUILDCACHE_EXECUTABLE buildcache OPTIONAL) + else() + fm_find_program(BUILDCACHE_EXECUTABLE buildcache) + endif() + + if(BUILDCACHE_EXECUTABLE) + message(STATUS "BuildCache globally enabled") + else() + message(STATUS "Couldn't enable BuildCache globally") + return() + endif() set(CMAKE_C_COMPILER_LAUNCHER "${BUILDCACHE_EXECUTABLE}" PARENT_SCOPE) set(CMAKE_CXX_COMPILER_LAUNCHER "${BUILDCACHE_EXECUTABLE}" PARENT_SCOPE) @@ -67,27 +92,25 @@ endfunction() .. code-block:: cmake - fm_find_clang_format() + fm_find_clang_format([OPTIONAL]) - Locates the clang-format executable and exports its path to the parent scope. + Locates a required clang-format executable and exports its path to the parent scope. + If OPTIONAL is specified, does not error if clang-format is not found. - :post: CLANG_FORMAT_EXECUTABLE is set in PARENT_SCOPE with the full path to clang-format, or configuration terminates with FATAL_ERROR if not found - - .. note:: - The clang-format executable must be available in PATH. - - .. warning:: - This function will fail with FATAL_ERROR if clang-format is not found. - Ensure clang-format is installed and available in your system PATH. + :post: CLANG_FORMAT_EXECUTABLE is set in PARENT_SCOPE with the full path to clang-format, or configuration terminates with FATAL_ERROR if not found and not OPTIONAL .. seealso:: Use ``fm_add_clang_format_target()`` from fm_target_utilities to create a format target. 
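+ + A minimal usage sketch (illustrative only; the ``format`` target name is hypothetical and the caller is assumed to guard on the result): + + .. code-block:: cmake + + # don't fail configuration on machines without clang-format + fm_find_clang_format(OPTIONAL) + if(CLANG_FORMAT_EXECUTABLE) + fm_add_clang_format_target(format) + endif()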
#]] function(fm_find_clang_format) - fm_find_program(CLANG_FORMAT_EXECUTABLE clang-format) - - message(STATUS "Found external clang-format: ${CLANG_FORMAT_EXECUTABLE}") + cmake_parse_arguments(PARSE_ARGV 0 ARG "OPTIONAL" "" "") + if(ARG_OPTIONAL) + fm_find_program(CLANG_FORMAT_EXECUTABLE clang-format OPTIONAL) + else() + fm_find_program(CLANG_FORMAT_EXECUTABLE clang-format) + endif() + set(CLANG_FORMAT_EXECUTABLE ${CLANG_FORMAT_EXECUTABLE} PARENT_SCOPE) -endfunction() \ No newline at end of file +endfunction() diff --git a/cmake/utils/toolchain.cmake b/cmake/utils/toolchain.cmake index bca31f5..57071c2 100644 --- a/cmake/utils/toolchain.cmake +++ b/cmake/utils/toolchain.cmake @@ -19,6 +19,9 @@ message(STATUS "appended (${FM_CMAKE_LIBRARY_DIR}/../) cmake/ to cmake module pa include(fm_assertions) include(fm_tool_utilities) +#enable BuildCache +fm_enable_build_cache(OPTIONAL) + #delegate to vcpkg fm_assert_program(vcpkg REASON "fm needs vcpkg" HINTS "$ENV{VCPKG_ROOT}") diff --git a/flashlight/fl/autograd/tensor/AutogradExtensionBackends.h b/flashlight/fl/autograd/tensor/AutogradExtensionBackends.h index 519971d..27e69e9 100644 --- a/flashlight/fl/autograd/tensor/AutogradExtensionBackends.h +++ b/flashlight/fl/autograd/tensor/AutogradExtensionBackends.h @@ -16,6 +16,9 @@ #if FL_USE_CUDNN #include "flashlight/fl/autograd/tensor/backend/cudnn/CudnnAutogradExtension.h" #endif // FL_USE_CUDNN +#if FL_USE_ONEDNN + #include "flashlight/fl/autograd/tensor/backend/onednn/OneDnnAutogradExtension.h" +#endif // FL_USE_ONEDNN namespace fl { @@ -28,4 +31,10 @@ FL_REGISTER_TENSOR_EXTENSION(CudnnAutogradExtension, ArrayFire); #endif // FL_USE_ARRAYFIRE && FL_ARRAYFIRE_USE_CUDA #endif // FL_USE_CUDNN +#if FL_USE_ONEDNN + #if FL_USE_ARRAYFIRE && (FL_ARRAYFIRE_USE_CPU || FL_ARRAYFIRE_USE_OPENCL) +FL_REGISTER_TENSOR_EXTENSION(OneDnnAutogradExtension, ArrayFire); + #endif +#endif // FL_USE_ONEDNN + } // namespace fl diff --git a/flashlight/fl/autograd/tensor/CMakeLists.txt b/flashlight/fl/autograd/tensor/CMakeLists.txt index 944fdc4..8e5ef4d 100644 --- a/flashlight/fl/autograd/tensor/CMakeLists.txt +++ b/flashlight/fl/autograd/tensor/CMakeLists.txt @@ -1,6 +1,7 @@ cmake_minimum_required(VERSION 3.16) option(FL_USE_CUDNN "Build with cuDNN support" OFF) +option(FL_USE_ONEDNN "Build with OneDNN support" OFF) if(FL_USE_CUDNN) find_package(CUDNN) @@ -21,11 +22,18 @@ if (FL_USE_CUDNN) include(${CMAKE_CURRENT_LIST_DIR}/backend/cudnn/CMakeLists.txt) endif() +if (FL_USE_ONEDNN) + find_package(dnnl CONFIG REQUIRED) + include(${CMAKE_CURRENT_LIST_DIR}/backend/onednn/CMakeLists.txt) + target_link_libraries(flashlight PRIVATE DNNL::dnnl) +endif() + target_compile_definitions( flashlight PUBLIC FL_USE_CUDNN=$ + FL_USE_ONEDNN=$ ) target_sources( diff --git a/flashlight/fl/autograd/tensor/backend/onednn/BatchNorm.cpp b/flashlight/fl/autograd/tensor/backend/onednn/BatchNorm.cpp new file mode 100644 index 0000000..f495e24 --- /dev/null +++ b/flashlight/fl/autograd/tensor/backend/onednn/BatchNorm.cpp @@ -0,0 +1,281 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include "flashlight/fl/autograd/tensor/backend/onednn/OneDnnAutogradExtension.h" + +#include +#include +#include + +#include + +#include "flashlight/fl/autograd/tensor/backend/onednn/DnnlUtils.h" +#include "flashlight/fl/tensor/Index.h" + +namespace fl { + +namespace { + +// Flashlight accept HWCN order according to docs +constexpr size_t kHIdx = 0; +constexpr size_t kWIdx = 1; +constexpr size_t kChannelSizeIdx = 2; +constexpr size_t kBatchSizeIdx = 3; + +constexpr auto formatNCHW = dnnl::memory::format_tag::nchw; +constexpr auto formatX = dnnl::memory::format_tag::x; + +int getNfeatures(const Shape& inputShape, const std::vector& axes) { + int nfeatures = 1; + for (auto ax : axes) { + nfeatures *= inputShape.dim(ax); + } + return nfeatures; +} + +dnnl::memory::dims getInputOutputDims( + const int minAxis, + const int maxAxis, + const Tensor& input, + const int nfeatures) { + Shape inDescDims; + if (minAxis == 0) { + inDescDims = Shape( + {1, + 1, + nfeatures, + static_cast(input.elements() / nfeatures)}); + } else { + int batchsz = 1; + for (int i = maxAxis + 1; i < input.ndim(); ++i) { + batchsz *= input.dim(i); + } + inDescDims = Shape( + {1, + static_cast(input.elements() / (nfeatures * batchsz)), + nfeatures, + batchsz}); + } + + dnnl::memory::dims inputOutputDims = { + inDescDims[kBatchSizeIdx], + inDescDims[kChannelSizeIdx], + inDescDims[kHIdx], + inDescDims[kWIdx]}; + + return inputOutputDims; +} + +struct OneDnnBatchNormPayload : detail::AutogradPayloadData { + dnnl::batch_normalization_forward::primitive_desc fwdPrimDesc; + Tensor weights; // combined weight and bias + Tensor bias; + dnnl::memory::dims weightsDims; + dnnl::memory::dims biasDims; + dnnl::memory::desc outputMemoryDescriptor; + dnnl::memory meanMemory; + dnnl::memory varMemory; + dnnl::memory weightsMemory; + dnnl::memory biasMemory; +}; + +} // namespace + +Tensor OneDnnAutogradExtension::batchnorm( + Tensor& saveMean, + Tensor& saveVar, + const Tensor& input, + const Tensor& weight, + const Tensor& bias, + Tensor& runningMean, + Tensor& runningVar, + const std::vector& axes, + const bool train, + const double momentum, + const double epsilon, + std::shared_ptr autogradPayload) { + if (momentum != 0.) { + throw std::runtime_error("OneDNN batchnorm op doesn't support momentum."); + } + if (input.type() == fl::dtype::f16) { + throw std::runtime_error("OneDNN batchnorm op - f16 inputs not supported."); + } + + auto payload = std::make_shared(); + if (train && autogradPayload) { + autogradPayload->data = payload; + } + + auto output = Tensor(input.shape(), input.type()); + int nfeatures = getNfeatures(input.shape(), axes); + + if (runningVar.isEmpty()) { + runningVar = fl::full({nfeatures}, 1., input.type()); + } + + if (runningMean.isEmpty()) { + runningMean = fl::full({nfeatures}, 0., input.type()); + } + + // Check if axes are valid + auto maxAxis = *std::max_element(axes.begin(), axes.end()); + auto minAxis = *std::min_element(axes.begin(), axes.end()); + bool axesContinuous = (axes.size() == (maxAxis - minAxis + 1)); + if (!axesContinuous) { + throw std::invalid_argument("axis array should be continuous"); + } + + auto& dnnlEngine = detail::DnnlEngine::getInstance().getEngine(); + + // Prepare combined weights + // If empty, user specifies affine to false. Both not trainable. + auto weightNonempty = + weight.isEmpty() ? fl::full({nfeatures}, 1., fl::dtype::f32) : weight; + auto biasNonempty = + bias.isEmpty() ? 
fl::full({nfeatures}, 0., fl::dtype::f32) : bias; + + // DNNL only accepts weight and bias as a combined input. + // https://git.io/JLn9X + payload->weights = weightNonempty; + payload->bias = biasNonempty; + payload->weightsDims = detail::convertToDnnlDims({nfeatures}); + payload->biasDims = detail::convertToDnnlDims({nfeatures}); + auto inputOutputDims = getInputOutputDims(minAxis, maxAxis, input, nfeatures); + + // Memory for forward + const detail::DnnlMemoryWrapper inputMemory( + input, inputOutputDims, formatNCHW); + const detail::DnnlMemoryWrapper outputMemory( + output, inputOutputDims, formatNCHW); + const detail::DnnlMemoryWrapper meanMemory( + runningMean, {runningMean.dim(0)}, formatX); + const detail::DnnlMemoryWrapper varMemory( + runningVar, {runningVar.dim(0)}, formatX); + // combined scale and shift (weight and bias) + const detail::DnnlMemoryWrapper weightsMemory( + payload->weights, payload->weightsDims, formatX); + const detail::DnnlMemoryWrapper biasMemory( + payload->bias, payload->biasDims, formatX); + payload->meanMemory = meanMemory.getMemory(); + payload->varMemory = varMemory.getMemory(); + payload->weightsMemory = weightsMemory.getMemory(); + payload->biasMemory = biasMemory.getMemory(); + // Primitives and descriptors + auto kind = train ? dnnl::prop_kind::forward_training + : dnnl::prop_kind::forward_inference; + // https://fburl.com/6latj733 + dnnl::normalization_flags flag = train + ? dnnl::normalization_flags::none + : dnnl::normalization_flags::use_global_stats; + flag = flag | dnnl::normalization_flags::use_scale | + dnnl::normalization_flags::use_shift; + payload->fwdPrimDesc = dnnl::batch_normalization_forward::primitive_desc( + dnnlEngine, + kind, + inputMemory.getDescriptor(), + outputMemory.getDescriptor(), + epsilon, + flag); + payload->outputMemoryDescriptor = outputMemory.getDescriptor(); + auto bn = dnnl::batch_normalization_forward(payload->fwdPrimDesc); + std::unordered_map<int, dnnl::memory> bnFwdArgs = { + {DNNL_ARG_SRC, inputMemory.getMemory()}, + {DNNL_ARG_MEAN, meanMemory.getMemory()}, + {DNNL_ARG_VARIANCE, varMemory.getMemory()}, + {DNNL_ARG_DST, outputMemory.getMemory()}, + {DNNL_ARG_SCALE, weightsMemory.getMemory()}, + {DNNL_ARG_SHIFT, biasMemory.getMemory()}}; + + // Execute + std::vector<dnnl::primitive> network; + std::vector<std::unordered_map<int, dnnl::memory>> fwdArgs = {bnFwdArgs}; + network.push_back(bn); + detail::executeNetwork(network, fwdArgs); + + return output; +} + +std::tuple<Tensor, Tensor, Tensor> OneDnnAutogradExtension::batchnormBackward( + const Tensor& gradOutput, + const Tensor& saveMean, + const Tensor& saveVar, + const Tensor& input, + const Tensor& weight, + const std::vector<int>& axes, + const bool train, + const float epsilon, + std::shared_ptr<detail::AutogradPayload> autogradPayload) { + if (!autogradPayload) { + throw std::invalid_argument( + "OneDnnAutogradExtension::batchnormBackward given null detail::AutogradPayload"); + } + auto payload = + std::static_pointer_cast<OneDnnBatchNormPayload>(autogradPayload->data); + + auto& dnnlEngine = detail::DnnlEngine::getInstance().getEngine(); + + auto maxAxis = *std::max_element(axes.begin(), axes.end()); + auto minAxis = *std::min_element(axes.begin(), axes.end()); + const bool axesContinuous = (axes.size() == (maxAxis - minAxis + 1)); + if (!axesContinuous) { + throw std::invalid_argument("axis array should be continuous"); + } + + const int nfeatures = getNfeatures(input.shape(), axes); + auto inputOutputDims = getInputOutputDims(minAxis, maxAxis, input, nfeatures); + + auto gradInput = Tensor(input.shape(), input.type()); + auto gradWeights = Tensor(payload->weights.shape(), payload->weights.type()); + auto
gradBias = Tensor(payload->bias.shape(), payload->bias.type()); + + const detail::DnnlMemoryWrapper inputMemory( + input, inputOutputDims, formatNCHW); + + // Memory for gradient computation + const detail::DnnlMemoryWrapper gradOutputMem( + gradOutput, inputOutputDims, formatNCHW); + const detail::DnnlMemoryWrapper gradInputMem( + gradInput, inputOutputDims, formatNCHW); + const detail::DnnlMemoryWrapper gradWeightsMem( + gradWeights, payload->weightsDims, formatX); + const detail::DnnlMemoryWrapper gradBiasMem( + gradBias, payload->biasDims, formatX); + + // Primitives and descriptors + auto bwdPrimitiveDesc = dnnl::batch_normalization_backward::primitive_desc( + dnnlEngine, + dnnl::prop_kind::backward, + gradOutputMem.getDescriptor(), + payload->outputMemoryDescriptor, + gradOutputMem.getDescriptor(), + epsilon, + dnnl::normalization_flags::use_scale | + dnnl::normalization_flags::use_shift, + payload->fwdPrimDesc // hint + ); + auto bwdPrim = + std::make_shared(bwdPrimitiveDesc); + // Execute + std::vector networkBackwards; + std::vector> bwdArgs = { + {{DNNL_ARG_SRC, inputMemory.getMemory()}, + {DNNL_ARG_MEAN, payload->meanMemory}, + {DNNL_ARG_VARIANCE, payload->varMemory}, + {DNNL_ARG_SCALE, payload->weightsMemory}, + //TODO dnnl_arg_shift was here, check if something can be optimized bc it's not needed + {DNNL_ARG_DIFF_SRC, gradInputMem.getMemory()}, + {DNNL_ARG_DIFF_DST, gradOutputMem.getMemory()}, + {DNNL_ARG_DIFF_SCALE, gradWeightsMem.getMemory()}, + {DNNL_ARG_DIFF_SHIFT, gradBiasMem.getMemory()}}}; + + networkBackwards.push_back(*bwdPrim); + detail::executeNetwork(networkBackwards, bwdArgs); + + return {gradInput, gradWeights, gradBias}; +}; + +} // namespace fl diff --git a/flashlight/fl/autograd/tensor/backend/onednn/CMakeLists.txt b/flashlight/fl/autograd/tensor/backend/onednn/CMakeLists.txt new file mode 100644 index 0000000..f4dc0fb --- /dev/null +++ b/flashlight/fl/autograd/tensor/backend/onednn/CMakeLists.txt @@ -0,0 +1,12 @@ +cmake_minimum_required(VERSION 3.16) + +target_sources( + flashlight + PRIVATE + ${CMAKE_CURRENT_LIST_DIR}/OneDnnAutogradExtension.cpp + ${CMAKE_CURRENT_LIST_DIR}/Conv2D.cpp + ${CMAKE_CURRENT_LIST_DIR}/Pool2D.cpp + ${CMAKE_CURRENT_LIST_DIR}/RNN.cpp + ${CMAKE_CURRENT_LIST_DIR}/BatchNorm.cpp + ${CMAKE_CURRENT_LIST_DIR}/DnnlUtils.cpp +) diff --git a/flashlight/fl/autograd/tensor/backend/onednn/Conv2D.cpp b/flashlight/fl/autograd/tensor/backend/onednn/Conv2D.cpp new file mode 100644 index 0000000..d6e558f --- /dev/null +++ b/flashlight/fl/autograd/tensor/backend/onednn/Conv2D.cpp @@ -0,0 +1,509 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include "flashlight/fl/autograd/tensor/backend/onednn/OneDnnAutogradExtension.h" + +#include +#include +#include +#include + +#include + +#include "flashlight/fl/autograd/tensor/backend/onednn/DnnlUtils.h" + +using namespace dnnl; + +namespace fl { + +namespace { + +// Input, output: WHCN; weights: WHIO +constexpr size_t kWIdx = 0; +constexpr size_t kHIdx = 1; +constexpr size_t kIOChannelSizeIdx = 2; +constexpr size_t kIOBatchSizeIdx = 3; +constexpr size_t kWeightOutputChannelSizeIdx = 3; + +// Use memory::format_tag::any for memory formatting even if convolution +// inputs are shaped in a particular way. 
+constexpr auto formatAny = memory::format_tag::any; +constexpr auto formatNCHW = memory::format_tag::nchw; +constexpr auto formatBias = memory::format_tag::x; + +struct OneDnnConv2DData { + memory::dims inputDims; + memory::dims weightDims; + memory::dims outputDims; + memory::dims biasDims; + memory::dims strideDims; + memory::dims dilationDims; + memory::dims paddingDims; + // Memory descriptors + memory::desc inputMemDesc; + memory::desc outputMemDesc; + memory::desc weightMemDesc; + memory::desc biasMemDesc; + // used for creating a backward desc + convolution_forward::primitive_desc fwdPrimDesc; +}; + +OneDnnConv2DData createOneDnnConv2DData( + fl::dtype inputType, + const Shape& inputShape, + const Shape& weightsShape, + const Shape& biasShape, + const Shape& outputShape, + const int sx, + const int sy, + const int px, + const int py, + const int dx, + const int dy, + const int groups) { + const dnnl::memory::data_type dataType = detail::dnnlMapToType(inputType); + const auto formatWeight = + (groups == 1) ? memory::format_tag::oihw : memory::format_tag::goihw; + const bool hasBias = biasShape.elements() > 0; + + OneDnnConv2DData out; + // Create memory dims + out.inputDims = detail::convertToDnnlDims( + {inputShape.dim(kIOBatchSizeIdx), + inputShape.dim(kIOChannelSizeIdx), + inputShape.dim(kHIdx), + inputShape.dim(kWIdx)}); + if (groups == 1) { + out.weightDims = detail::convertToDnnlDims( + {weightsShape.dim(kWeightOutputChannelSizeIdx), + inputShape.dim(kIOChannelSizeIdx), + weightsShape.dim(kHIdx), + weightsShape.dim(kWIdx)}); + } else { + out.weightDims = detail::convertToDnnlDims( + {groups, + weightsShape.dim(kWeightOutputChannelSizeIdx) / groups, + inputShape.dim(kIOChannelSizeIdx) / groups, + weightsShape.dim(kHIdx), + weightsShape.dim(kWIdx)}); + } + out.outputDims = detail::convertToDnnlDims( + {inputShape.dim(kIOBatchSizeIdx), + weightsShape.dim(kWeightOutputChannelSizeIdx), + outputShape.dim(kHIdx), + outputShape.dim(kWIdx)}); + out.biasDims = detail::convertToDnnlDims( + {weightsShape.dim(kWeightOutputChannelSizeIdx)}); + out.strideDims = {sy, sx}; + out.paddingDims = {py, px}; + // NB: DNNL treats a dilation of 0 as a standard convolution and indexes + // larger dilations accordingly. See https://git.io/fhAT2 for more. + out.dilationDims = {dy - 1, dx - 1}; + + // Create memory descriptors. using format::any gives the best performance + out.inputMemDesc = memory::desc({out.inputDims}, dataType, formatAny); + out.outputMemDesc = memory::desc({out.outputDims}, dataType, formatAny); + out.weightMemDesc = memory::desc({out.weightDims}, dataType, formatWeight); + out.biasMemDesc = memory::desc({out.biasDims}, dataType, formatAny); + + // + const auto forwardMode = prop_kind::forward_training; + // TODO: determine train mode/assess perf impact of always choosing training + // (primitive cache storage overhead?) + // const auto forwardMode = + // train ? 
prop_kind::forward_training : prop_kind::forward_inference; + + auto& dnnlEngine = detail::DnnlEngine::getInstance().getEngine(); + convolution_forward::primitive_desc fwdPrimitiveDescriptor; + if (hasBias) { + fwdPrimitiveDescriptor = convolution_forward::primitive_desc( + dnnlEngine, + forwardMode, + algorithm::convolution_direct, + out.inputMemDesc, + out.weightMemDesc, + out.biasMemDesc, + out.outputMemDesc, + out.strideDims, + out.dilationDims, + out.paddingDims, + out.paddingDims); + } else { + fwdPrimitiveDescriptor = convolution_forward::primitive_desc( + dnnlEngine, + forwardMode, + algorithm::convolution_direct, + out.inputMemDesc, + out.weightMemDesc, + out.outputMemDesc, + out.strideDims, + out.dilationDims, + out.paddingDims, + out.paddingDims); + } + out.fwdPrimDesc = std::move(fwdPrimitiveDescriptor); + + return out; +} + +} // namespace + +Tensor OneDnnAutogradExtension::conv2d( + const Tensor& input, + const Tensor& weights, + const Tensor& bias, + const int sx, + const int sy, + const int px, + const int py, + const int dx, + const int dy, + const int groups, + std::shared_ptr) { + if (input.type() == fl::dtype::f16) { + throw std::runtime_error("Half precision is not supported in CPU."); + } + + // flashlight input, weight, and output shapes in column-major: + // - Input is WHCN + // - Weights are WHIO + // - Output is WHCN + // Since ArrayFire is column major, getting a raw pointer (1D + // representation) of these shapes and viewing as if the representation is + // row major transposes along all axis into NCHW for the input and output + // and OIHW for the weights + auto output = Tensor( + {1 + + (input.dim(kWIdx) + (2 * px) - (1 + (weights.dim(kWIdx) - 1) * dx)) / + sx, + 1 + + (input.dim(kHIdx) + (2 * py) - (1 + (weights.dim(kHIdx) - 1) * dy)) / + sy, + weights.dim(kWeightOutputChannelSizeIdx), + input.dim(kIOBatchSizeIdx)}, + input.type()); + auto hasBias = bias.elements() > 0; + + auto formatWeight = + (groups == 1) ? 
memory::format_tag::oihw : memory::format_tag::goihw; + auto& dnnlEngine = detail::DnnlEngine::getInstance().getEngine(); + + /********************************* Forward *******************************/ + OneDnnConv2DData conv2DData = createOneDnnConv2DData( + input.type(), + input.shape(), + weights.shape(), + bias.shape(), + output.shape(), + sx, + sy, + px, + py, + dx, + dy, + groups); + + // Create memory + const detail::DnnlMemoryWrapper inputMemInit( + input, {conv2DData.inputDims}, formatNCHW); + const detail::DnnlMemoryWrapper outputMemInit( + output, {conv2DData.outputDims}, formatNCHW); + const detail::DnnlMemoryWrapper weightsMem( + weights, {conv2DData.weightDims}, formatWeight); + + // Network for execution + std::vector network; + std::vector> fwdArgs; + + // DNNL suggests checking if the layout requested for the convolution + // is different from NCHW/OIHW (even if specified), and reordering if + // necessary, since the convolution itself may request a different + // ordering + auto inputDesc = conv2DData.fwdPrimDesc.src_desc(); + auto weightsDesc = conv2DData.fwdPrimDesc.weights_desc(); + auto outputDesc = conv2DData.fwdPrimDesc.dst_desc(); + // Input + auto inputMemory = detail::dnnlAlignOrdering( + network, fwdArgs, inputMemInit.getMemory(), inputDesc); + auto weightsMemory = detail::dnnlAlignOrdering( + network, fwdArgs, weightsMem.getMemory(), weightsDesc); + // Output - adds a reorder after the conv if needed + auto outputMemory = outputMemInit.getMemory(); + if (outputMemInit.getMemory().get_desc() != outputDesc) { + outputMemory = memory(outputDesc, dnnlEngine); + } + + // Create convolution + std::shared_ptr conv; + const detail::DnnlMemoryWrapper biasMemory( + bias, conv2DData.biasDims, formatBias); + conv = std::make_shared(conv2DData.fwdPrimDesc); + + network.push_back(*conv); + + // Conv fwd args + std::unordered_map convFwdArgs = { + {DNNL_ARG_SRC, inputMemory}, + {DNNL_ARG_WEIGHTS, weightsMemory}, + {DNNL_ARG_DST, outputMemory}}; + if (hasBias) { + convFwdArgs[DNNL_ARG_BIAS] = biasMemory.getMemory(); + } + fwdArgs.push_back(convFwdArgs); + + // Add output reordering if needed + if (outputMemory != outputMemInit.getMemory()) { + network.push_back(dnnl::reorder(outputMemory, outputMemInit.getMemory())); + fwdArgs.push_back( + {{DNNL_ARG_FROM, outputMemory}, + {DNNL_ARG_TO, outputMemInit.getMemory()}}); + } + + // Run + detail::executeNetwork(network, fwdArgs); + + return output; +} + +Tensor OneDnnAutogradExtension::conv2dBackwardData( + const Tensor& gradOutput, + const Tensor& input, + const Tensor& weights, + const int sx, + const int sy, + const int px, + const int py, + const int dx, + const int dy, + const int groups, + std::shared_ptr, + std::shared_ptr) { + auto gradInput = Tensor(input.shape(), input.type()); // Result + + auto formatWeight = + (groups == 1) ? 
memory::format_tag::oihw : memory::format_tag::goihw; + auto& dnnlEngineBwd = detail::DnnlEngine::getInstance().getEngine(); + + Tensor bias; // dummy + OneDnnConv2DData conv2DData = createOneDnnConv2DData( + input.type(), + input.shape(), + weights.shape(), + bias.shape(), + gradOutput.shape(), // has the same shape as the Conv output + sx, + sy, + px, + py, + dx, + dy, + groups); + + // Backward descriptor + convolution_backward_data::primitive_desc bwdDataPrimitiveDesc( + dnnlEngineBwd, + algorithm::convolution_direct, + conv2DData.inputMemDesc, + conv2DData.weightMemDesc, + conv2DData.outputMemDesc, + conv2DData.strideDims, + conv2DData.dilationDims, + conv2DData.paddingDims, + conv2DData.paddingDims, + conv2DData.fwdPrimDesc); + // Primitive descriptor + auto bwdData = + std::make_shared(bwdDataPrimitiveDesc); + + // Create memory + const detail::DnnlMemoryWrapper gradOutputMemInit( + gradOutput, conv2DData.outputDims, formatNCHW); + const detail::DnnlMemoryWrapper gradInputMemInit( + gradInput, conv2DData.inputDims, formatNCHW); + const detail::DnnlMemoryWrapper weightsMemInitBwd( + weights, conv2DData.weightDims, formatWeight); + + std::vector networkBackwards; + std::vector> bwdDataArgs; + + // Check for reorderings + auto gradOutputDesc = bwdDataPrimitiveDesc.diff_dst_desc(); + auto weightsDesc = bwdDataPrimitiveDesc.weights_desc(); + auto gradInputDesc = bwdDataPrimitiveDesc.diff_src_desc(); + auto gradOutputMemory = detail::dnnlAlignOrdering( + networkBackwards, + bwdDataArgs, + gradOutputMemInit.getMemory(), + gradOutputDesc); + auto weightsMemoryBackwards = detail::dnnlAlignOrdering( + networkBackwards, + bwdDataArgs, + weightsMemInitBwd.getMemory(), + weightsDesc); + auto gradInputMemory = gradInputMemInit.getMemory(); + // Don't reorder the gradient until after the conv + if (gradInputMemInit.getMemory().get_desc() != gradInputDesc) { + gradInputMemory = memory(gradInputDesc, dnnlEngineBwd); + } + + // Convolution backwards + auto convBwdData = + std::make_shared(bwdDataPrimitiveDesc); + + bwdDataArgs.push_back( + {{DNNL_ARG_DIFF_SRC, gradInputMemory}, + {DNNL_ARG_WEIGHTS, weightsMemoryBackwards}, + {DNNL_ARG_DIFF_DST, gradOutputMemory}}); + networkBackwards.push_back(*convBwdData); + + // Reorder the output (which is gradInput here) if necessary + if (gradInputMemory != gradInputMemInit.getMemory()) { + networkBackwards.push_back( + dnnl::reorder(gradInputMemory, gradInputMemInit.getMemory())); + bwdDataArgs.push_back( + {{DNNL_ARG_FROM, gradInputMemory}, + {DNNL_ARG_TO, gradInputMemInit.getMemory()}}); + } + + detail::executeNetwork(networkBackwards, bwdDataArgs); + + return gradInput; +} + +std::pair OneDnnAutogradExtension::conv2dBackwardFilterBias( + const Tensor& gradOutput, + const Tensor& input, + const Tensor& weights, + const Tensor& bias, + const int sx, + const int sy, + const int px, + const int py, + const int dx, + const int dy, + const int groups, + std::shared_ptr, + std::shared_ptr, + std::shared_ptr) { + auto gradWeights = Tensor(weights.shape(), weights.type()); // Result + + auto formatWeight = + (groups == 1) ? 
memory::format_tag::oihw : memory::format_tag::goihw; + auto& dnnlEngineBwd = detail::DnnlEngine::getInstance().getEngine(); + OneDnnConv2DData conv2DData = createOneDnnConv2DData( + input.type(), + input.shape(), + weights.shape(), + bias.shape(), + gradOutput.shape(), // has the same shape as the Conv output + sx, + sy, + px, + py, + dx, + dy, + groups); + + Tensor gradBias; + bool computeBiasGrad = !bias.isEmpty() && !conv2DData.biasMemDesc.is_zero(); + if (computeBiasGrad) { + gradBias = Tensor(bias.shape(), bias.type()); + } + + // Weight backward descriptor + convolution_backward_weights::primitive_desc bwdWeightPrimitiveDesc; + if (computeBiasGrad) { + bwdWeightPrimitiveDesc = convolution_backward_weights::primitive_desc( + dnnlEngineBwd, + algorithm::convolution_direct, + conv2DData.inputMemDesc, + conv2DData.weightMemDesc, + conv2DData.biasMemDesc, + conv2DData.outputMemDesc, + conv2DData.strideDims, + conv2DData.dilationDims, + conv2DData.paddingDims, + conv2DData.paddingDims, + conv2DData.fwdPrimDesc); + } else { + bwdWeightPrimitiveDesc = convolution_backward_weights::primitive_desc( + dnnlEngineBwd, + algorithm::convolution_direct, + conv2DData.inputMemDesc, + conv2DData.weightMemDesc, + conv2DData.outputMemDesc, + conv2DData.strideDims, + conv2DData.dilationDims, + conv2DData.paddingDims, + conv2DData.paddingDims, + conv2DData.fwdPrimDesc); + } + // Weight backward primitive descriptor + auto bwdWeights = + std::make_shared(bwdWeightPrimitiveDesc); + + // Create memory + const detail::DnnlMemoryWrapper inputRawMemInitBwd( + input, conv2DData.inputDims, formatNCHW); + const detail::DnnlMemoryWrapper gradOutputMemInit( + gradOutput, conv2DData.outputDims, formatNCHW); + const detail::DnnlMemoryWrapper gradWeightsMemInit( + gradWeights, conv2DData.weightDims, formatWeight); + + std::vector networkBackwards; + std::vector> bwdWeightsArgs; + + // Check for reorderings, reorder if needed + auto inputDesc = bwdWeightPrimitiveDesc.src_desc(); + auto gradOutputDesc = bwdWeightPrimitiveDesc.diff_dst_desc(); + auto gradWeightsDesc = bwdWeightPrimitiveDesc.diff_weights_desc(); + auto inputMemoryBackwards = detail::dnnlAlignOrdering( + networkBackwards, + bwdWeightsArgs, + inputRawMemInitBwd.getMemory(), + inputDesc); + auto gradOutputMemory = detail::dnnlAlignOrdering( + networkBackwards, + bwdWeightsArgs, + gradOutputMemInit.getMemory(), + gradOutputDesc); + // Don't reorder the grads until after the conv bwd + auto gradWeightsMemory = gradWeightsMemInit.getMemory(); + if (gradWeightsMemInit.getMemory().get_desc() != gradWeightsDesc) { + gradWeightsMemory = memory(gradWeightsDesc, dnnlEngineBwd); + } + + // Create the convolution backward weight + std::unordered_map bwdConvWeightsArgs = { + {DNNL_ARG_SRC, inputMemoryBackwards}, + {DNNL_ARG_DIFF_WEIGHTS, gradWeightsMemory}, + {DNNL_ARG_DIFF_DST, gradOutputMemory}}; + + if (computeBiasGrad) { + const detail::DnnlMemoryWrapper gradBiasMem( + gradBias, conv2DData.biasDims, formatBias); + bwdConvWeightsArgs[DNNL_ARG_DIFF_BIAS] = gradBiasMem.getMemory(); + } else { + } + networkBackwards.push_back(*bwdWeights); + bwdWeightsArgs.push_back(bwdConvWeightsArgs); + + // Reorder weight gradients if necessary + if (gradWeightsMemory != gradWeightsMemInit.getMemory()) { + networkBackwards.push_back( + dnnl::reorder(gradWeightsMemory, gradWeightsMemInit.getMemory())); + bwdWeightsArgs.push_back( + {{DNNL_ARG_FROM, gradWeightsMemory}, + {DNNL_ARG_TO, gradWeightsMemInit.getMemory()}}); + } + + detail::executeNetwork(networkBackwards, bwdWeightsArgs); + 
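+  // For reference, with purely illustrative numbers: a WHCN input of
+  // {10, 9, 8, 7} convolved with WHIO weights {4, 3, 8, 6} using px = 2,
+  // py = 1 and unit strides/dilations yields a forward output of
+  // {1 + (10 + 4 - 4) / 1, 1 + (9 + 2 - 3) / 1, 6, 7} = {11, 9, 6, 7}.
+  // The gradients returned below always match the weights/bias shapes, and
+  // gradBias stays empty when computeBiasGrad is false.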
+ return {gradWeights, gradBias}; +} + +} // namespace fl diff --git a/flashlight/fl/autograd/tensor/backend/onednn/DnnlUtils.cpp b/flashlight/fl/autograd/tensor/backend/onednn/DnnlUtils.cpp new file mode 100644 index 0000000..5fa5530 --- /dev/null +++ b/flashlight/fl/autograd/tensor/backend/onednn/DnnlUtils.cpp @@ -0,0 +1,158 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include "flashlight/fl/autograd/tensor/backend/onednn/DnnlUtils.h" + +#include +#include + +#if FL_BACKEND_OPENCL + #include +#endif + +#include "flashlight/fl/common/Defines.h" +#include "flashlight/fl/tensor/Compute.h" +#include "flashlight/fl/tensor/TensorBase.h" + +#if FL_BACKEND_OPENCL + #include "flashlight/fl/common/OpenClUtils.h" +#endif + +namespace fl::detail { + +DnnlStream::DnnlStream(dnnl::engine engine) { +#if FL_BACKEND_OPENCL + stream_ = dnnl::ocl_interop::make_stream(engine, fl::ocl::getQueue()); +#else + stream_ = dnnl::stream(engine); +#endif +} + +dnnl::stream& DnnlStream::getStream() { + return stream_; +} + +DnnlStream& DnnlStream::getInstance() { + static DnnlStream instance(DnnlEngine::getInstance().getEngine()); + return instance; +} + +DnnlEngine::DnnlEngine() { +#if FL_BACKEND_OPENCL + engine_ = dnnl::ocl_interop::make_engine( + fl::ocl::getDeviceId(), fl::ocl::getContext()); +#else + engine_ = dnnl::engine(dnnl::engine::kind::cpu, 0); +#endif +} + +dnnl::engine& DnnlEngine::getEngine() { + return engine_; +} + +DnnlEngine& DnnlEngine::getInstance() { + static DnnlEngine instance; + return instance; +} + +dnnl::memory::dims convertToDnnlDims(const std::vector& shape) { + return dnnl::memory::dims(shape.begin(), shape.end()); +} + +dnnl::memory::dims convertShapeToDnnlDims(const Shape& shape) { + return convertToDnnlDims(shape.get()); +} + +DnnlMemoryWrapper::DnnlMemoryWrapper( + const Tensor& tensor, + dnnl::memory::dims dims, + dnnl::memory::format_tag format) { +#if FL_BACKEND_OPENCL + fl::ocl::DevicePtrOpenCl _devicePtr(tensor); + cl_mem* buffer = _devicePtr.getAsClMem(); + devicePtr_ = std::move(_devicePtr); +#else + devicePtr_ = fl::DevicePtr(tensor); + void* buffer = devicePtr_.get(); +#endif + descriptor_ = + dnnl::memory::desc({dims}, detail::dnnlMapToType(tensor.type()), format); + memory_ = dnnl::memory( + descriptor_, detail::DnnlEngine::getInstance().getEngine(), buffer); +} + +DnnlMemoryWrapper& DnnlMemoryWrapper::operator=(DnnlMemoryWrapper&& other) { + devicePtr_ = std::move(other.devicePtr_); + memory_ = std::move(other.memory_); + descriptor_ = std::move(other.descriptor_); + return *this; +} + +dnnl::memory DnnlMemoryWrapper::getMemory() const { + return memory_; +} + +dnnl::memory::desc DnnlMemoryWrapper::getDescriptor() const { + return descriptor_; +} + +dnnl::memory dnnlAlignOrdering( + std::vector& net, + std::vector>& netArgs, + const dnnl::memory& memory, + const dnnl::memory::desc& desc) { + auto memoryOut = memory; + if (memory.get_desc() != desc) { + // use the ordering requested by the descriptor + memoryOut = + dnnl::memory(desc, detail::DnnlEngine::getInstance().getEngine()); + net.push_back(dnnl::reorder(memory, memoryOut)); + netArgs.push_back({{DNNL_ARG_FROM, memory}, {DNNL_ARG_TO, memoryOut}}); + } + return memoryOut; +} + +void executeNetwork( + std::vector& net, + std::vector>& netArgs) { + if (net.size() != netArgs.size()) { + throw std::invalid_argument( + "executeNetwork - given different size nets and 
netArgs"); + } + // TODO{fl::Tensor}{macros} -- improve this to work with other backend interop + // If on the CPU backend, there isn't an AF computation stream that facilitates + // enforcing that inputs to computation are ready; we're required to wait + // until all AF operations are done + if (FL_BACKEND_CPU) { + fl::sync(); + } + + for (size_t i = 0; i < net.size(); ++i) { + net.at(i).execute(DnnlStream::getInstance().getStream(), netArgs.at(i)); + } + + // TODO{fl::Tensor}{macros} -- improve this to work with other backend interop + if (FL_BACKEND_CPU) { + // Block the executing thread until the work is complete + DnnlStream::getInstance().getStream().wait(); + } +} + +dnnl::algorithm dnnlMapToPoolingMode(const PoolingMode mode) { + switch (mode) { + case PoolingMode::MAX: + return dnnl::algorithm::pooling_max; + case PoolingMode::AVG_INCLUDE_PADDING: + return dnnl::algorithm::pooling_avg_include_padding; + case PoolingMode::AVG_EXCLUDE_PADDING: + return dnnl::algorithm::pooling_avg_exclude_padding; + default: + throw std::invalid_argument("unsupported pooling mode for oneDNN"); + } +} + +} // namespace fl diff --git a/flashlight/fl/autograd/tensor/backend/onednn/DnnlUtils.h b/flashlight/fl/autograd/tensor/backend/onednn/DnnlUtils.h new file mode 100644 index 0000000..07be6db --- /dev/null +++ b/flashlight/fl/autograd/tensor/backend/onednn/DnnlUtils.h @@ -0,0 +1,147 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include <vector> + +#include <dnnl.hpp> + +#include "flashlight/fl/common/Defines.h" +#include "flashlight/fl/common/DevicePtr.h" +#include "flashlight/fl/tensor/Shape.h" +#include "flashlight/fl/tensor/Types.h" + +namespace fl { + +class Tensor; + +namespace detail { + +/** + * A singleton class that contains a static instance of a dnnl::stream. + */ +class DnnlStream { + public: + DnnlStream(dnnl::engine engine); + ~DnnlStream() = default; + + /// Prohibit assignment + DnnlStream& operator=(DnnlStream const& s) = delete; + + dnnl::stream& getStream(); + + static DnnlStream& getInstance(); + + private: + dnnl::stream stream_; +}; + +/** + * A singleton class that contains a static instance of a dnnl::engine. + */ +class DnnlEngine { + public: + DnnlEngine(); + ~DnnlEngine() = default; + + /// Prohibit assignment + DnnlEngine& operator=(DnnlEngine const& e) = delete; + + dnnl::engine& getEngine(); + + static DnnlEngine& getInstance(); + + private: + dnnl::engine engine_; +}; + +/** + * Helper for converting a Flashlight Shape into a DNNL-compatible input + * for dnnl::memory::dims. + */ +dnnl::memory::dims convertToDnnlDims(const std::vector& dims); +dnnl::memory::dims convertShapeToDnnlDims(const Shape& shape); + +/** + * A light wrapper around dnnl::memory that manages underlying memory lifetime + * in accordance with fl::DevicePtr.
+ */ +class DnnlMemoryWrapper { + public: + DnnlMemoryWrapper( + const Tensor& tensor, + dnnl::memory::dims dims, + dnnl::memory::format_tag format); + DnnlMemoryWrapper() = default; + + DnnlMemoryWrapper& operator=(DnnlMemoryWrapper&& other); + + dnnl::memory getMemory() const; + + dnnl::memory::desc getDescriptor() const; + + private: + dnnl::memory::desc descriptor_; + dnnl::memory memory_; + fl::DevicePtr devicePtr_; +}; + +/** + * Given some an dnnl network (a ``std::vector``), a + * ``dnnl::memory`` with some ordering, and a + * ``dnnl::memory::primitive_desc``, determines whether or not the memory + * needs to be ordered based on the primitive descriptor's required ordering. + * + * If so, adds a ``dnnl::reorder`` layer to the network, and returns a new + * memory descriptor that will be properly reordered. + */ +dnnl::memory dnnlAlignOrdering( + std::vector& net, + std::vector>& netArgs, + const dnnl::memory& memory, + const dnnl::memory::desc& desc); + +/** + * Executes a sequence of DNNL primitives in the default execution stream with + * the default execution engine. + * + * For each primitive, passes the corresponding arguments map for that index + * to the execution stream. The number of primitives and the number of + * arguments must be equal, else throws. + * + * Blocks calling thread until the enqueued work has been completed. + */ +void executeNetwork( + std::vector& net, + std::vector>& args); + +/** + * Given a flashlight pooling mode, returns the corresponding dnnl pooling + * mode. + */ +dnnl::algorithm dnnlMapToPoolingMode(const PoolingMode mode); + +/** + * Maps an ArrayFire array datatype into the corresponding DNNL datatype. + * + * Needs to be explicitly inlined due to a bug with DNNL. + */ +inline dnnl::memory::data_type dnnlMapToType(const fl::dtype t) { + if (t == fl::dtype::f16) { + return dnnl::memory::data_type::f16; + } else if (t == fl::dtype::f32) { + return dnnl::memory::data_type::f32; + } else if (t == fl::dtype::f64) { + throw std::invalid_argument("float64 is not supported by DNNL"); + } else { + throw std::invalid_argument("data type not supported with DNNL"); + } +} + +} // namespace detail +} // namespace fl diff --git a/flashlight/fl/autograd/tensor/backend/onednn/OneDnnAutogradExtension.cpp b/flashlight/fl/autograd/tensor/backend/onednn/OneDnnAutogradExtension.cpp new file mode 100644 index 0000000..d180fec --- /dev/null +++ b/flashlight/fl/autograd/tensor/backend/onednn/OneDnnAutogradExtension.cpp @@ -0,0 +1,18 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include "flashlight/fl/autograd/tensor/backend/onednn/OneDnnAutogradExtension.h" + +namespace fl { + +bool OneDnnAutogradExtension::isDataTypeSupported( + const fl::dtype& dtype) const { + // fp16 computation is not supported with onednn + return dtype != fl::dtype::f16; +} + +} // namespace fl diff --git a/flashlight/fl/autograd/tensor/backend/onednn/OneDnnAutogradExtension.h b/flashlight/fl/autograd/tensor/backend/onednn/OneDnnAutogradExtension.h new file mode 100644 index 0000000..310ecb9 --- /dev/null +++ b/flashlight/fl/autograd/tensor/backend/onednn/OneDnnAutogradExtension.h @@ -0,0 +1,145 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#pragma once + +#include "flashlight/fl/autograd/tensor/AutogradExtension.h" + +namespace fl { + +class OneDnnAutogradExtension : public AutogradExtension { + // TODO(jacobkahn): implement getEngine + + public: + bool isDataTypeSupported(const fl::dtype& dtype) const override; + + /**************************** Forward ****************************/ + Tensor conv2d( + const Tensor& input, + const Tensor& weights, + const Tensor& bias, + const int sx, + const int sy, + const int px, + const int py, + const int dx, + const int dy, + const int groups, + std::shared_ptr payload) override; + + Tensor pool2d( + const Tensor& input, + const int wx, + const int wy, + const int sx, + const int sy, + const int px, + const int py, + const PoolingMode mode, + std::shared_ptr payload) override; + + Tensor batchnorm( + Tensor& saveMean, + Tensor& saveVar, + const Tensor& input, + const Tensor& weight, + const Tensor& bias, + Tensor& runningMean, + Tensor& runningVar, + const std::vector& axes, + const bool train, + const double momentum, + const double epsilon, + std::shared_ptr payload) override; + + std::tuple rnn( + const Tensor& input, + const Tensor& hiddenState, + const Tensor& cellState, + const Tensor& weights, + const int hiddenSize, + const int numLayers, + const RnnMode mode, + const bool bidirectional, + const float dropout, + std::shared_ptr payload) override; + + /**************************** Backward ****************************/ + // ]----- Convolution + Tensor conv2dBackwardData( + const Tensor& gradOutput, + const Tensor& input, + const Tensor& weight, + const int sx, + const int sy, + const int px, + const int py, + const int dx, + const int dy, + const int groups, + std::shared_ptr dataGradBenchmark, + std::shared_ptr payload) override; + + std::pair conv2dBackwardFilterBias( + const Tensor& gradOutput, + const Tensor& input, + const Tensor& weights, + const Tensor& bias, + const int sx, + const int sy, + const int px, + const int py, + const int dx, + const int dy, + const int groups, + std::shared_ptr filterBench, + std::shared_ptr biasBench, + std::shared_ptr autogradPayload) override; + + // ]----- pool2D + Tensor pool2dBackward( + const Tensor& gradOutput, + const Tensor& input, + const Tensor& poolOutput, + const int wx, + const int wy, + const int sx, + const int sy, + const int px, + const int py, + const PoolingMode mode, + std::shared_ptr payload) override; + + // ]----- batchnorm + std::tuple batchnormBackward( + const Tensor& gradOutput, + const Tensor& saveMean, + const Tensor& saveVar, + const Tensor& input, + const Tensor& weight, + const std::vector& axes, + const bool train, + const float epsilon, + std::shared_ptr payload) override; + + // ]----- rnn + std::tuple rnnBackward( + const Tensor& input, + const Tensor& hiddenState, + const Tensor& cellState, + const Tensor& weights, + const std::shared_ptr gradData, + const Tensor& output, + const int numLayers, + const int hiddenSize, + const RnnMode mode, + const bool bidirectional, + const float dropProb, + std::shared_ptr payload) override; +}; + +} // namespace fl diff --git a/flashlight/fl/autograd/tensor/backend/onednn/Pool2D.cpp b/flashlight/fl/autograd/tensor/backend/onednn/Pool2D.cpp new file mode 100644 index 0000000..bf094b6 --- /dev/null +++ b/flashlight/fl/autograd/tensor/backend/onednn/Pool2D.cpp @@ -0,0 +1,249 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. 
+ * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include "flashlight/fl/autograd/tensor/backend/onednn/OneDnnAutogradExtension.h" + +#include +#include + +#include + +#include "flashlight/fl/autograd/tensor/backend/onednn/DnnlUtils.h" +#include "flashlight/fl/tensor/Shape.h" +#include "flashlight/fl/tensor/TensorBase.h" + +using namespace dnnl; + +namespace fl { + +namespace { + +constexpr size_t kWIdx = 0; +constexpr size_t kHIdx = 1; +constexpr size_t kChannelSizeIdx = 2; +constexpr size_t kBatchSizeIdx = 3; + +// Use memory::format_tag::any for memory formatting even if pool +// inputs are shaped in a particular way. +constexpr auto formatAny = memory::format_tag::any; +constexpr auto formatNCHW = memory::format_tag::nchw; + +struct DimsData { + memory::dims inputDims; + memory::dims outputDims; + memory::dims windowDims; + memory::dims strideDims; + std::vector paddingDims; +}; + +DimsData getDimsData( + const Shape& input, + const Shape& output, + const int wx, + const int wy, + const int sx, + const int sy, + const int px, + const int py) { + DimsData d; + d.inputDims = detail::convertToDnnlDims( + {input.dim(kBatchSizeIdx), + input.dim(kChannelSizeIdx), + input.dim(kHIdx), + input.dim(kWIdx)}); + d.outputDims = detail::convertToDnnlDims( + {input.dim(kBatchSizeIdx), + input.dim(kChannelSizeIdx), + output.dim(kHIdx), + output.dim(kWIdx)}); + d.windowDims = {wy, wx}; + d.strideDims = {sy, sx}; + d.paddingDims = {py, px}; + return d; +} + +} // namespace + +struct OneDnnPool2DPayload : detail::AutogradPayloadData { + memory workspace; + memory outputMemory; + DimsData dimsData; + pooling_forward::primitive_desc poolingFwdPrimDesc; +}; + +Tensor OneDnnAutogradExtension::pool2d( + const Tensor& input, + const int wx, + const int wy, + const int sx, + const int sy, + const int px, + const int py, + const PoolingMode mode, + std::shared_ptr autogradPayload) { + const bool train = (autogradPayload != nullptr); + auto payload = std::make_shared(); + if (train) { + autogradPayload->data = payload; + } + + // inputX x inputY x channels x batch + auto ix = input.dim(kWIdx); + auto iy = input.ndim() > kHIdx ? input.dim(kHIdx) : 1; + auto c = input.ndim() > kChannelSizeIdx ? input.dim(kChannelSizeIdx) : 1; + auto b = input.ndim() > kBatchSizeIdx ? input.dim(kBatchSizeIdx) : 1; + + auto output = Tensor( + {1 + (ix + 2 * px - wx) / sx, 1 + (iy + 2 * py - wy) / sy, c, b}, + input.type()); + + payload->dimsData = + getDimsData({ix, iy, c, b}, output.shape(), wx, wy, sx, sy, px, py); + auto& d = payload->dimsData; + auto dataType = detail::dnnlMapToType(input.type()); + + // Memory desc + auto inputMD = memory::desc({d.inputDims}, dataType, formatNCHW); + auto outputMD = memory::desc({d.outputDims}, dataType, formatAny); + + // Memory + auto& dnnlEngine = detail::DnnlEngine::getInstance().getEngine(); + const detail::DnnlMemoryWrapper inputMemInit( + input, {d.inputDims}, formatNCHW); + const detail::DnnlMemoryWrapper outputMemInit( + output, {d.outputDims}, formatNCHW); + + // Choose a mode based on whether gradients are needed + auto forwardMode = train ? 
prop_kind::forward : prop_kind::forward_inference; + + // Descriptors + auto poolingMode = detail::dnnlMapToPoolingMode(mode); + payload->poolingFwdPrimDesc = pooling_forward::primitive_desc( + dnnlEngine, + forwardMode, + poolingMode, + inputMD, + outputMD, + d.strideDims, + d.windowDims, + memory::dims{0, 0}, // dilation -- TODO: add to API + d.paddingDims, + d.paddingDims); + auto& primDesc = payload->poolingFwdPrimDesc; + + // Network + std::vector network; + std::vector> fwdArgs; + // Reorder if needed + auto inputDesc = primDesc.src_desc(); + auto outputDesc = primDesc.dst_desc(); + auto inputMemory = detail::dnnlAlignOrdering( + network, fwdArgs, inputMemInit.getMemory(), inputDesc); + payload->outputMemory = outputMemInit.getMemory(); + if (outputMemInit.getMemory().get_desc() != outputDesc) { + payload->outputMemory = memory(outputDesc, dnnlEngine); + } + // Workspace and layer (only training mode requires a workspace) + std::shared_ptr pooling; + std::unordered_map fwdPoolingArgs; + fwdPoolingArgs[DNNL_ARG_SRC] = inputMemory; + fwdPoolingArgs[DNNL_ARG_DST] = payload->outputMemory; + if (train) { + payload->workspace = memory(primDesc.workspace_desc(), dnnlEngine); + pooling = std::make_shared(primDesc); + fwdPoolingArgs[DNNL_ARG_WORKSPACE] = payload->workspace; + } else { + pooling = std::make_shared(primDesc); + } + network.push_back(*pooling); + fwdArgs.push_back(fwdPoolingArgs); + + // Add output reordering if needed + if (payload->outputMemory != outputMemInit.getMemory()) { + network.push_back( + dnnl::reorder(payload->outputMemory, outputMemInit.getMemory())); + fwdArgs.push_back( + {{DNNL_ARG_FROM, payload->outputMemory}, + {DNNL_ARG_TO, outputMemInit.getMemory()}}); + } + + detail::executeNetwork(network, fwdArgs); + return output; +} + +Tensor OneDnnAutogradExtension::pool2dBackward( + const Tensor& gradOutput, + const Tensor& input, + const Tensor& poolOutput, + const int wx, + const int wy, + const int sx, + const int sy, + const int px, + const int py, + const PoolingMode mode, + std::shared_ptr autogradPayload) { + if (!autogradPayload) { + throw std::invalid_argument( + "OneDnnAutogradExtension::pool2dBackward given null detail::AutogradPayload"); + } + auto payload = + std::static_pointer_cast(autogradPayload->data); + + auto gradInput = Tensor(input.shape(), fl::dtype::f32); + auto& dnnlEngineBwd = detail::DnnlEngine::getInstance().getEngine(); + + DimsData& d = payload->dimsData; + auto poolingMode = detail::dnnlMapToPoolingMode(mode); + + // Memory + const detail::DnnlMemoryWrapper gradInputMemInit( + gradInput, {d.inputDims}, formatNCHW); + const detail::DnnlMemoryWrapper gradOutputMemInit( + gradOutput, {d.outputDims}, formatNCHW); + + // Descriptors + // Memory descriptors from initialized memory must be used since + // pooling_backward descriptors require an ordering + auto gradInputMD = gradInputMemInit.getMemory().get_desc(); + auto gradOutputMD = gradOutputMemInit.getMemory().get_desc(); + auto bwdPrimitiveDesc = pooling_backward::primitive_desc( + dnnlEngineBwd, + poolingMode, + gradInputMD, + gradOutputMD, + d.strideDims, + d.windowDims, + memory::dims{0, 0}, // dilation - TODO: add to API + d.paddingDims, + d.paddingDims, + payload->poolingFwdPrimDesc // hint + ); + + std::vector networkBackward; + std::vector> bwdArgs; + // Reorder output memory if required + auto gradOutputMemory = detail::dnnlAlignOrdering( + networkBackward, + bwdArgs, + gradOutputMemInit.getMemory(), + payload->outputMemory.get_desc()); + + auto poolBwd = 
pooling_backward(bwdPrimitiveDesc); + std::unordered_map<int, memory> bwdPoolingArgs = { + {DNNL_ARG_DIFF_SRC, gradInputMemInit.getMemory()}, + {DNNL_ARG_DIFF_DST, gradOutputMemory}, + {DNNL_ARG_WORKSPACE, payload->workspace}}; + bwdArgs.push_back(bwdPoolingArgs); + networkBackward.push_back(poolBwd); + + detail::executeNetwork(networkBackward, bwdArgs); + + return gradInput; +} + +} // namespace fl diff --git a/flashlight/fl/autograd/tensor/backend/onednn/RNN.cpp b/flashlight/fl/autograd/tensor/backend/onednn/RNN.cpp new file mode 100644 index 0000000..dd1d8a0 --- /dev/null +++ b/flashlight/fl/autograd/tensor/backend/onednn/RNN.cpp @@ -0,0 +1,575 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include "flashlight/fl/autograd/tensor/backend/onednn/OneDnnAutogradExtension.h" + +#include +#include +#include +#include + +#include + +#include "flashlight/fl/autograd/tensor/backend/onednn/DnnlUtils.h" +#include "flashlight/fl/tensor/Index.h" + +namespace fl { +namespace { + +struct ParsedWeightsAndBias { + // First layer - will be empty if inSize == hiddenSize + Tensor weightsInput1L; + Tensor weightsHidden1L; + Tensor bias1L; + // All other layers + Tensor weightsInput; + Tensor weightsHidden; + Tensor bias; +}; + +// Each gate's weights have dimensions d1 x d2 +Tensor reorderLbrGruWeights(int d1, int d2, const Tensor& weights) { + // LBR GRU requires switching the r, u, o gate order given by cuDNN to the + // u, r, o order required by oneDNN (determined empirically) + int weightsSize = d1 * d2; + if (weights.elements() != weightsSize * 3) { + throw std::invalid_argument( + "RNN reorderLbrGruWeights given invalid weights tensor or dims - " + "weights of size " + + std::to_string(weights.elements()) + " which should be exactly " + + std::to_string(weightsSize * 3)); + } + return fl::concatenate( + 0, + weights.flat(fl::range(weightsSize, 2 * weightsSize)), + weights.flat(fl::range(0, weightsSize)), + weights.flat(fl::range(2 * weightsSize, fl::end))); +} + +/** + * Converts flat cuDNN weights into the corresponding oneDNN RNN weights. + */ +ParsedWeightsAndBias parseWeights( + const Tensor& weights, + RnnMode mode, + int numLayers, + int directionMult, + int inSize, + int numGates, + int hiddenSize) { + ParsedWeightsAndBias out; + + // Per-layer sizes for weightsInput and weightsHidden. + // If inSize == hiddenSize, then weightsInputSize == weightsHiddenSize for all + // layers; otherwise this holds for all but the first layer + int weightsInputSize1L = directionMult * inSize * numGates * hiddenSize; + int weightsHiddenSize = directionMult * hiddenSize * numGates * hiddenSize; + int weightsInputSize = weightsHiddenSize; + int lbrGruBias = mode == RnnMode::GRU ? 1 : 0; + int biasSize = + numLayers * directionMult * (numGates + lbrGruBias) * hiddenSize; + + bool firstLayerDifferent = inSize != hiddenSize; + // Adjusted if skipping first layer parsing + int numWeightsLayers = firstLayerDifferent ? numLayers - 1 : numLayers; + int weightsOffset = + firstLayerDifferent ? weightsInputSize1L + weightsHiddenSize : 0; + // If skipping the first layer, parse then skip over the first layer + // weights and parse the remaining layers.
Parsing all bias layers is still + // fine since biases for each layer have the same size + if (firstLayerDifferent) { + out.weightsInput1L = weights.flat(fl::range(weightsInputSize1L)); + out.weightsHidden1L = weights.flat( + fl::range(weightsInputSize1L, weightsInputSize1L + weightsHiddenSize)); + + if (mode == RnnMode::GRU) { + out.weightsInput1L = + reorderLbrGruWeights(inSize, hiddenSize, out.weightsInput1L); + out.weightsHidden1L = + reorderLbrGruWeights(hiddenSize, hiddenSize, out.weightsHidden1L); + } + } + + auto weightsFlat = weights.flatten().astype(weights.type()); + // cuDNN RNN weights, for each layer, are arranged with a chunk of + // input-hidden weights for each layer followed by a chunk of hidden-hidden + // weights for each layer: + // {[layers x [hiddenSize, inputSize]], [layers x [hiddenSize, hiddenSize]] } + // Rearrange this to what oneDNN expects (or will reorder if not optimal), + // which is numLayers chunks of two chunks containing input-hidden and + // hidden-hidden: + // {[layers x [[hiddenSize x inSize], [hiddenSize x hiddenSize]]]} + // Note that the loop is over the total number of layers in case we'r doing a + // single-layer operation where input size and hidden size are different but + // we'll call another primitive with the output of that first layer as the + // input to the next layers + auto weightsInput = Tensor({0}, weights.type()); + auto weightsHidden = Tensor({0}, weights.type()); + Tensor weightsFlatOffset = + weightsFlat.flat(fl::range(weightsOffset, fl::end)); + // Specifically ignore the first layer's weights, so inSize == hiddenSize + for (int i = 0; i < numWeightsLayers; ++i) { + // number of input/hidden weights + // TODO: Will change for bidirectional + int chunkSize = hiddenSize * hiddenSize * numGates; + // weights per layer + int layerChunkSize = chunkSize + chunkSize; + + // Grab input-hidden weights and chunk them together + auto inputWeightsChunk = weightsFlatOffset.flat( + fl::range(layerChunkSize * i, layerChunkSize * i + chunkSize)); + // Grab hidden-hidden weights and chunk them together + auto inputHiddenChunk = weightsFlatOffset.flat(fl::range( + layerChunkSize * i + chunkSize, + layerChunkSize * i + chunkSize + chunkSize)); + + if (mode == RnnMode::GRU) { + inputWeightsChunk = + reorderLbrGruWeights(hiddenSize, hiddenSize, inputWeightsChunk); + inputHiddenChunk = + reorderLbrGruWeights(hiddenSize, hiddenSize, inputHiddenChunk); + } + + weightsInput = fl::concatenate(2, weightsInput, inputWeightsChunk); + weightsHidden = fl::concatenate(2, weightsHidden, inputHiddenChunk); + } + out.weightsInput = weightsInput; + out.weightsHidden = weightsHidden; + + // Reduce the weights to form biases. cuDNN uses two separate bias terms: + // https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnRNNMode_t - + // oneDNN expects only one bias term. Sum together the coefficients for both + // bias terms to get a single bias term for oneDNN. 
The gradients for + // each term can be computed as one since the gradients with respect to + // the bias subarrays will simply be half of the computed gradient with + // oneDNN + Tensor bias(weights.type()); + int biasStartOffset = numLayers * weightsHiddenSize + + (numLayers - 1) * weightsInputSize + weightsInputSize1L; + // In vanilla RNN modes, the biases can be simply added: + // two biases for each bias in fl cuDNN with CUDNN_RNN_DOUBLE_BIAS (default) + int numBiases = 2; + // First, grab a subarray which contains only both bias terms; then add them + Tensor biasFlat = weightsFlat.flat(fl::range(biasStartOffset, fl::end)); + // Layout is: {numLayers x [numBiases x [bias shape]]} + for (int i = 0; i < numLayers; ++i) { + if (mode == RnnMode::GRU) { + int lbrGruChunkSize = hiddenSize * 6; + // In the case of the LBR GRU, there's an extra bias term which shouldn't + // be combined with the first two pairs of biases. Six chunks total. + // cuDNN --> oneDNN transformation for ordering: + // r1, u1, o, r2, u2, u' --> u1 + u2, r1 + r2, o, u' + int base = i * lbrGruChunkSize; + // The sum of the following tensors yields the correct bias + // u1, r1, o, u' + auto biases1 = fl::concatenate( + 0, + // u1 -- [1, 2] + biasFlat.flat( + fl::range(base + hiddenSize * 1, base + hiddenSize * 2)), + // r1 -- [0, 1] + biasFlat.flat( + fl::range(base + hiddenSize * 0, base + hiddenSize * 1)), + // o -- [2, 3] + biasFlat.flat( + fl::range(base + hiddenSize * 2, base + hiddenSize * 3)), + // 'u -- [5, 6] + biasFlat.flat( + fl::range(base + hiddenSize * 5, base + hiddenSize * 6))); + // u2, r2, 0, 0 + auto biases2 = fl::concatenate( + 0, + // u2 -- [4, 5] + biasFlat.flat( + fl::range(base + hiddenSize * 4, base + hiddenSize * 5)), + // r2 -- [3, 4] + biasFlat.flat( + fl::range(base + hiddenSize * 3, base + hiddenSize * 4)), + // zeroes to add to o and u' + fl::full({hiddenSize * 2}, 0., biasFlat.type())); + auto layerBiasCombined = biases1 + biases2; + bias = fl::concatenate(0, bias, layerBiasCombined); + } else { + // The number of bias terms in the tensor per-layer + int layerStride = biasSize / numLayers * numBiases; + auto biases1 = biasFlat(fl::range( + layerStride * i, layerStride * i + layerStride / numBiases)); + auto biases2 = biasFlat(fl::range( + layerStride * i + layerStride / numBiases, layerStride * (i + 1))); + auto layerBiasCombined = biases1 + biases2; + bias = fl::concatenate(0, bias, layerBiasCombined); + } + } + + if (firstLayerDifferent) { + out.bias1L = bias.flat(fl::range(biasSize / numLayers)); + if (numLayers > 1) { + // bias for the second --> last layer + bias = bias.flat(fl::range(biasSize / numLayers, fl::end)); + } + } + out.bias = bias; + + // Case for a single layer of different in/hidden size + if (firstLayerDifferent && numLayers == 1) { + out.weightsInput = out.weightsInput1L; + out.weightsHidden = out.weightsHidden1L; + out.bias = out.bias1L; + } + + return out; +} + +struct RnnResult { + dnnl::memory workspace; + Tensor y; // output + Tensor hy; // hidden output + Tensor cy; // cell output +}; + +/* + * Does forward for a single onednn RNN primitive + */ +RnnResult rnnImpl( + const Tensor& input, + const Tensor& hiddenState, + const Tensor& cellState, + const Tensor& weightsInput, + const Tensor& weightsHidden, + const Tensor& bias, + int hiddenSize, + int numLayers, + RnnMode mode, + dnnl::algorithm activation, + int numGates, + dnnl::rnn_direction direction, + int directionMult, + dnnl::prop_kind kind, + float dropout) { + RnnResult result; + auto dnnlEngine = 
detail::DnnlEngine::getInstance().getEngine(); + + // Dimensions + int inSize = input.dim(0); + int batchSize = input.ndim() < 2 ? 1 : input.dim(1); + int seqLength = input.ndim() < 3 ? 1 : input.dim(2); + dnnl::memory::dims inputDims = {seqLength, batchSize, inSize}; + dnnl::memory::dims outputDims = { + seqLength, batchSize, hiddenSize * directionMult}; + auto dType = detail::dnnlMapToType(input.type()); + int totalLayers = numLayers; + int outSize = hiddenSize; + dnnl::memory::dims hDims = { + totalLayers, directionMult, batchSize, hiddenSize}; + dnnl::memory::dims cDims = { + totalLayers, directionMult, batchSize, hiddenSize}; + int extraBias = mode == RnnMode::GRU ? 1 : 0; // for LBR GRU + dnnl::memory::dims biasDims = { + numLayers, directionMult, numGates + extraBias, hiddenSize}; + // ldigo + dnnl::memory::dims weightsInputDims = { + numLayers, directionMult, inSize, numGates, hiddenSize}; + dnnl::memory::dims weightsHiddenDims = { + numLayers, directionMult, hiddenSize, numGates, hiddenSize}; + + // Out tensors: output (y), hidden state output (hy), cell state output (cy) + auto y = Tensor({outSize, batchSize, seqLength}, input.type()); + auto hy = Tensor({hiddenSize, batchSize, totalLayers}, input.type()); + Tensor cy; + if (mode == RnnMode::LSTM) { + cy = Tensor(hy.shape(), input.type()); + } + + // Memory for forward + auto tnc = dnnl::memory::format_tag::tnc; + auto ldnc = dnnl::memory::format_tag::ldnc; + auto ldgoi = dnnl::memory::format_tag::ldgoi; + auto ldgo = dnnl::memory::format_tag::ldgo; + const detail::DnnlMemoryWrapper inputMemInit( + input.asContiguousTensor(), {inputDims}, tnc); + const detail::DnnlMemoryWrapper outputMemInit(y, {outputDims}, tnc); + detail::DnnlMemoryWrapper hiddenInMemInit; + if (!hiddenState.isEmpty()) { + hiddenInMemInit = detail::DnnlMemoryWrapper( + hiddenState.asContiguousTensor(), {hDims}, ldnc); + } + const detail::DnnlMemoryWrapper hiddenOutMemInit(hy, {hDims}, ldnc); + const detail::DnnlMemoryWrapper weightsInputMemRawInit( + weightsInput.asContiguousTensor(), {weightsInputDims}, ldgoi); + const detail::DnnlMemoryWrapper weightsHiddenMemRawInit( + weightsHidden.asContiguousTensor(), {weightsHiddenDims}, ldgoi); + const detail::DnnlMemoryWrapper biasMemInit( + bias.asContiguousTensor(), {biasDims}, ldgo); + + // TODO(jacobkahn): don't force a format tag - use any and do a reorder based + // on the format of the primitive - what it says - like you're supposed to + // Primitive for reordering input weights: ldgoi --> ldigo + auto weightsInputMemDesc = dnnl::memory::desc( + weightsInputDims, dType, dnnl::memory::format_tag::ldigo); + auto weightsInputMemInit = dnnl::memory(weightsInputMemDesc, dnnlEngine); + // Primitive for reordering iter/hidden weights: ldgoi --> ldigo + auto weightsHiddenMemDesc = dnnl::memory::desc( + weightsHiddenDims, dType, dnnl::memory::format_tag::ldigo); + auto weightsHiddenMemInit = dnnl::memory(weightsHiddenMemDesc, dnnlEngine); + + // Add arguments + std::unordered_map rnnFwdArgs = { + {DNNL_ARG_SRC_LAYER, inputMemInit.getMemory()}, + {DNNL_ARG_SRC_ITER, hiddenInMemInit.getMemory()}, + {DNNL_ARG_WEIGHTS_LAYER, weightsInputMemInit}, + {DNNL_ARG_WEIGHTS_ITER, weightsHiddenMemInit}, + {DNNL_ARG_BIAS, biasMemInit.getMemory()}, + {DNNL_ARG_DST_LAYER, outputMemInit.getMemory()}, + {DNNL_ARG_DST_ITER, hiddenOutMemInit.getMemory()}}; + + // Workspace memory, if needed + dnnl::memory workspace; + std::vector network; + std::vector> fwdArgs; + + // reorder input weights + network.push_back( + 
dnnl::reorder(weightsInputMemRawInit.getMemory(), weightsInputMemInit)); + fwdArgs.push_back( + {{DNNL_ARG_FROM, weightsInputMemRawInit.getMemory()}, + {DNNL_ARG_TO, weightsInputMemInit}}); + // reorder iter weights + network.push_back( + dnnl::reorder(weightsHiddenMemRawInit.getMemory(), weightsHiddenMemInit)); + fwdArgs.push_back( + {{DNNL_ARG_FROM, weightsHiddenMemRawInit.getMemory()}, + {DNNL_ARG_TO, weightsHiddenMemInit}}); + + // Initialize descriptors + if (mode == RnnMode::RELU || mode == RnnMode::TANH) { + auto vanillaPd = dnnl::vanilla_rnn_forward::primitive_desc( + dnnlEngine, + kind, + activation, + direction, + inputMemInit.getDescriptor(), + hiddenInMemInit.getDescriptor(), + weightsInputMemDesc, // weights "layer" + weightsHiddenMemDesc, // weights "iter" + biasMemInit.getDescriptor(), + outputMemInit.getDescriptor(), + hiddenOutMemInit.getDescriptor()); + network.push_back(dnnl::vanilla_rnn_forward(vanillaPd)); + workspace = dnnl::memory(vanillaPd.workspace_desc(), dnnlEngine); + + } else if (mode == RnnMode::LSTM) { + // LSTM-only + // input cell state + // TODO(jacobkahn): function that takes the array and + // returns the desciptor and memory -- takes an argument for + // which determines whether or not it's ok to return empty + // descriptors if the array is empty + detail::DnnlMemoryWrapper cellInMemInit; + if (!cellState.isEmpty()) { + cellInMemInit = detail::DnnlMemoryWrapper( + cellState.asContiguousTensor(), {cDims}, ldnc); + } + // output cell state + detail::DnnlMemoryWrapper cellOutMemInit(cy, cDims, ldnc); + + auto lstmPd = dnnl::lstm_forward::primitive_desc( + dnnlEngine, + kind, + direction, + inputMemInit.getDescriptor(), + hiddenInMemInit.getDescriptor(), + cellInMemInit.getDescriptor(), + weightsInputMemDesc, // weights "layer" + weightsHiddenMemDesc, // weights "iter" + biasMemInit.getDescriptor(), + outputMemInit.getDescriptor(), + hiddenOutMemInit.getDescriptor(), + cellOutMemInit.getDescriptor()); + network.push_back(dnnl::lstm_forward(lstmPd)); + workspace = dnnl::memory(lstmPd.workspace_desc(), dnnlEngine); + rnnFwdArgs.insert({DNNL_ARG_SRC_ITER_C, cellInMemInit.getMemory()}); + rnnFwdArgs.insert({DNNL_ARG_DST_ITER_C, cellOutMemInit.getMemory()}); + + } else if (mode == RnnMode::GRU) { + // Use a linear-before-reset GRU so we can have parity with cuDNN + auto gruPd = dnnl::lbr_gru_forward::primitive_desc( + dnnlEngine, + kind, + direction, + inputMemInit.getDescriptor(), + hiddenInMemInit.getDescriptor(), + weightsInputMemDesc, + weightsHiddenMemDesc, + biasMemInit.getDescriptor(), + outputMemInit.getDescriptor(), + hiddenOutMemInit.getDescriptor()); + network.push_back(dnnl::lbr_gru_forward(gruPd)); + workspace = dnnl::memory(gruPd.workspace_desc(), dnnlEngine); + } + rnnFwdArgs.insert({DNNL_ARG_WORKSPACE, workspace}); + fwdArgs.push_back(rnnFwdArgs); + + detail::executeNetwork(network, fwdArgs); + + result.y = y; + result.hy = hy; + result.cy = cy; + result.workspace = workspace; + return result; +} + +} // namespace + +std::tuple OneDnnAutogradExtension::rnn( + const Tensor& input, + const Tensor& hiddenState, + const Tensor& cellState, + const Tensor& weights, + const int hiddenSize, + const int numLayers, + const RnnMode mode, + const bool bidirectional, + const float dropout, + std::shared_ptr autogradPayload) { + if (dropout > 0.0) { + throw std::invalid_argument("onednn RNN: dropout > 0.0 unsupported"); + } + if (bidirectional) { + throw std::invalid_argument("onednn RNN: bidirectional not yet supported"); + } + + const bool train = 
(autogradPayload != nullptr); + + // Constants + auto direction = bidirectional + ? dnnl::rnn_direction::bidirectional_concat + : dnnl::rnn_direction::unidirectional_left2right; + int directionMult = bidirectional ? 2 : 1; + auto kind = train ? dnnl::prop_kind::forward_training + : dnnl::prop_kind::forward_inference; + int numGates = 1; + auto activation = dnnl::algorithm::undef; + switch (mode) { + case RnnMode::LSTM: + numGates = 4; + break; + case RnnMode::GRU: + numGates = 3; + break; + case RnnMode::RELU: + activation = dnnl::algorithm::eltwise_relu; + break; + case RnnMode::TANH: + activation = dnnl::algorithm::eltwise_tanh; + break; + default: + break; + } + + int inSize = input.dim(0); + + // In Flashlight, all RNN weights are stored as one contiguous tensor, so we + // have to parse out the input weights, input biases, hidden weights, and + // hidden biases from one tensor. Order doesn't matter since the arrangement + // is a black box + auto parsedWeights = parseWeights( + weights, mode, numLayers, directionMult, inSize, numGates, hiddenSize); + + RnnResult result; + // The oneDNN RNN primitive has an API limitation where input size and + // hidden size can only differ if the primitive has exactly one layer. + // Therefore, for computations for more than one layer, first do the + // operation for one layer, which gives an output vector of size [hidden + // size, batch size, sequence length * number of directions], then use + // that output as the input for layers [2, L]. Since the input size dim 0 + // is now the hidden size, the primitive can fuse computation for + // arbitrarily-many layers. + if (input.dim(0) == hiddenSize || numLayers == 1) { + // Input and hidden size are the same, or we only have one layer, which + // means we can call the impl as is and parse weights "normally" + result = rnnImpl( + input, + hiddenState, + cellState, + parsedWeights.weightsInput, + parsedWeights.weightsHidden, + parsedWeights.bias, + hiddenSize, + numLayers, + mode, + activation, + numGates, + direction, + directionMult, + kind, + dropout); + } else { + // We require more than one layer with different input and hidden states - + // see the above. 
Seek to the first layer's hidden/cell state, weights, and + // bias + RnnResult resultL1 = rnnImpl( + input, + hiddenState(fl::span, fl::span, 0), + cellState(fl::span, fl::span, 0), + parsedWeights.weightsInput1L, + parsedWeights.weightsHidden1L, + parsedWeights.bias1L, + hiddenSize, + 1, + mode, + activation, + numGates, + direction, + directionMult, + kind, + dropout); + + /* Layers [2..N] */ + // Seek past the first layer's hidden/cell state, weights, and bias + RnnResult resultL2N = rnnImpl( + resultL1.y, // fixme + hiddenState(fl::span, fl::span, fl::range(1, fl::end)), + cellState(fl::span, fl::span, fl::range(1, fl::end)), + parsedWeights.weightsInput, + parsedWeights.weightsHidden, + parsedWeights.bias, + hiddenSize, + numLayers - 1, // layers [2..N] + mode, + activation, + numGates, + direction, + directionMult, + kind, + dropout); + + result.y = resultL2N.y; + result.hy = fl::concatenate(2, resultL1.hy, resultL2N.hy); + result.cy = fl::concatenate(2, resultL1.cy, resultL2N.cy); + } + + return {result.y, result.hy, result.cy}; +} + +std::tuple OneDnnAutogradExtension::rnnBackward( + const Tensor& input, + const Tensor& hiddenState, + const Tensor& cellState, + const Tensor& weights, + const std::shared_ptr gradData, + const Tensor& output, + const int numLayers, + const int hiddenSize, + const RnnMode mode, + const bool bidirectional, + const float dropProb, + const std::shared_ptr payload) { + throw std::runtime_error( + "onednn RNN: Gradient computation not yet supported"); +} + +} // namespace fl diff --git a/flashlight/fl/common/Histogram.h b/flashlight/fl/common/Histogram.h index f57b26a..df0b300 100644 --- a/flashlight/fl/common/Histogram.h +++ b/flashlight/fl/common/Histogram.h @@ -49,7 +49,7 @@ template struct HistogramBucket { T startInclusive = 0; //! left boundary of the bucket. T endExclusive = 0; //! right boundary of the bucket. - size_t count = 0; //! Number of elements in this bucket.88 + size_t count = 0; //! Number of elements in this bucket. std::string prettyString( double countPerTick, // ratio of count/bar_length @@ -146,8 +146,8 @@ HistogramStats FixedBucketSizeHistogram( stats.mean = simpleMovingAverage; // Calculate bucket size - long range = stats.max - stats.min; - double bucketWidth = range / nBuckets; + double range = stats.max - stats.min; + auto bucketWidth = range / nBuckets; if (range == 0 || bucketWidth == 0) { stats.buckets[0].count = stats.numValues; stats.maxNumValuesPerBucket = stats.numValues; @@ -157,11 +157,11 @@ HistogramStats FixedBucketSizeHistogram( // Calculate count per bucket stats.maxNumValuesPerBucket = 0; for (auto itr = begin; itr != end; ++itr) { - if (*itr < clipMinValueInclusive || *itr > clipMaxValueExclusive) { + if (*itr < clipMinValueInclusive || *itr >= clipMaxValueExclusive) { continue; } double index = - std::round(static_cast(*itr - stats.min) / bucketWidth); + std::floor(static_cast(*itr - stats.min) / bucketWidth); size_t intIndex = std::min(static_cast(index), nBuckets - 1); HistogramBucket& bucket = stats.buckets[intIndex]; diff --git a/flashlight/fl/common/Logging.cpp b/flashlight/fl/common/Logging.cpp index 181a8d7..d666739 100644 --- a/flashlight/fl/common/Logging.cpp +++ b/flashlight/fl/common/Logging.cpp @@ -87,13 +87,18 @@ void addContext( std::stringstream* outputStream) { // report only the last threadIdNumDigits of the thread ID for succinctness // and compatibility with glog. 
- constexpr size_t threadIdNumDigits = 5; + constexpr size_t maxThreadIdNumDigits = 5; std::stringstream ss; ss << std::this_thread::get_id(); - const std::string threadId = ss.str(); - + + std::string threadId = ss.str(); + if(threadId.size() > maxThreadIdNumDigits){ + threadId = threadId.substr(threadId.size() - maxThreadIdNumDigits); + } + + (*outputStream) << dateTimeWithMicroSeconds() << ' ' - << threadId.substr(threadId.size() - threadIdNumDigits) << ' ' + << threadId << ' ' << getFileName(fullPath) << ':' << lineNumber << ' '; } diff --git a/flashlight/fl/dataset/BlobDataset.cpp b/flashlight/fl/dataset/BlobDataset.cpp index 563fba5..1b605f5 100644 --- a/flashlight/fl/dataset/BlobDataset.cpp +++ b/flashlight/fl/dataset/BlobDataset.cpp @@ -159,7 +159,7 @@ std::vector BlobDataset::readRawArray( buffer.resize(fl::getTypeSize(e.type) * e.dims.elements()); readData( e.offset, - (char*)buffer.data(), + reinterpret_cast(buffer.data()), fl::getTypeSize(e.type) * e.dims.elements()); } return buffer; diff --git a/flashlight/fl/dataset/FileBlobDataset.cpp b/flashlight/fl/dataset/FileBlobDataset.cpp index 2f8d027..f432935 100644 --- a/flashlight/fl/dataset/FileBlobDataset.cpp +++ b/flashlight/fl/dataset/FileBlobDataset.cpp @@ -16,7 +16,8 @@ FileBlobDataset::FileBlobDataset( bool rw, bool truncate) : name_(name) { - mode_ = (rw ? std::ios_base::in | std::ios_base::out : std::ios_base::in); + mode_ = (rw ? std::ios_base::in | std::ios_base::out : std::ios_base::in) | + std::ios_base::binary; { std::ofstream fs(name_, (truncate ? mode_ | std::ios_base::trunc : mode_)); if (!fs.is_open()) { diff --git a/flashlight/fl/tensor/backend/af/CMakeLists.txt b/flashlight/fl/tensor/backend/af/CMakeLists.txt index 95b9420..da4fbdb 100644 --- a/flashlight/fl/tensor/backend/af/CMakeLists.txt +++ b/flashlight/fl/tensor/backend/af/CMakeLists.txt @@ -74,6 +74,33 @@ if (${FL_ARRAYFIRE_USE_CUDA}) fl_set_backend_state(ENABLE CUDA) elseif(${FL_ARRAYFIRE_USE_CPU}) target_link_libraries(flashlight PUBLIC ArrayFire::afcpu) + + # af forgets dependencies, on windows we have to copy them + if(WIN32) + include(fm_target_utilities) + + set(AF_LIB_DIR "${ArrayFire_DIR}/../lib") + + fm_glob( + MKL_DLLS + "${AF_LIB_DIR}/" + PATTERNS + "mkl_*.dll" + "libiomp5md.dll" + ) + + fm_target_attach_dependency( + flashlight + NOLINK + ${MKL_DLLS} + ) + endif() + + if(LINUX) + find_package(OpenMP REQUIRED COMPONENTS CXX) + target_link_libraries(flashlight PUBLIC OpenMP::OpenMP_CXX) + endif() + fl_set_backend_state(ENABLE CPU) elseif(${FL_ARRAYFIRE_USE_OPENCL}) target_link_libraries(flashlight PUBLIC ArrayFire::afopencl) diff --git a/flashlight/fl/test/autograd/AutogradBinaryOpsTest.cpp b/flashlight/fl/test/autograd/AutogradBinaryOpsTest.cpp index 420f547..d0ee0d4 100644 --- a/flashlight/fl/test/autograd/AutogradBinaryOpsTest.cpp +++ b/flashlight/fl/test/autograd/AutogradBinaryOpsTest.cpp @@ -126,11 +126,11 @@ TEST(AutogradBinaryOpsTest, Linear) { auto wt = Variable(fl::rand({6, 3}, fl::dtype::f64) * 2 - 1, true); auto bs = Variable(fl::rand({6}, fl::dtype::f64) * 2 - 1, true); auto funcLinIn = [&](Variable& input) { return linear(input, wt, bs); }; - ASSERT_TRUE(fl::detail::jacobianTestImpl(funcLinIn, in, 1E-8)); + ASSERT_TRUE(fl::detail::jacobianTestImpl(funcLinIn, in, 1E-8, 1E-4, {&wt, &bs})); auto funcLinWt = [&](Variable& weight) { return linear(in, weight, bs); }; - ASSERT_TRUE(fl::detail::jacobianTestImpl(funcLinWt, wt, 1E-8)); + ASSERT_TRUE(fl::detail::jacobianTestImpl(funcLinWt, wt, 1E-8, 1E-4, {&in, &bs})); auto funcLinBs = 
[&](Variable& bias) { return linear(in, wt, bias); }; - ASSERT_TRUE(fl::detail::jacobianTestImpl(funcLinBs, bs, 1E-8)); + ASSERT_TRUE(fl::detail::jacobianTestImpl(funcLinBs, bs, 1E-8, 1E-4, {&in, &wt})); } } @@ -146,11 +146,11 @@ TEST_F(AutogradTestF16, LinearF16) { auto wt = Variable(fl::rand({2, 2}, fl::dtype::f16) * scale, true); auto bs = Variable(fl::rand({2}, fl::dtype::f16) * scale, true); auto funcLinIn = [&](Variable& input) { return linear(input, wt, bs); }; - ASSERT_TRUE(fl::detail::jacobianTestImpl(funcLinIn, in, 5E-2, 5E-1)); + ASSERT_TRUE(fl::detail::jacobianTestImpl(funcLinIn, in, 5E-2, 5E-1, {&wt, &bs})); auto funcLinWt = [&](Variable& weight) { return linear(in, weight, bs); }; - ASSERT_TRUE(fl::detail::jacobianTestImpl(funcLinWt, wt, 5E-2, 5E-1)); + ASSERT_TRUE(fl::detail::jacobianTestImpl(funcLinWt, wt, 5E-2, 5E-1, {&in, &bs})); auto funcLinBs = [&](Variable& bias) { return linear(in, wt, bias); }; - ASSERT_TRUE(fl::detail::jacobianTestImpl(funcLinBs, bs, 5E-2, 5E-1)); + ASSERT_TRUE(fl::detail::jacobianTestImpl(funcLinBs, bs, 5E-2, 5E-1, {&in, &wt})); } } diff --git a/flashlight/fl/test/autograd/AutogradConv2DTest.cpp b/flashlight/fl/test/autograd/AutogradConv2DTest.cpp index 88cd170..c13f3c6 100644 --- a/flashlight/fl/test/autograd/AutogradConv2DTest.cpp +++ b/flashlight/fl/test/autograd/AutogradConv2DTest.cpp @@ -22,6 +22,7 @@ TEST(AutogradConv2DTest, Convolve) { auto in = Variable(fl::rand({10, 9, 8, 7}, fl::dtype::f32), true); auto wt = Variable(fl::rand({4, 3, 8, 6}, fl::dtype::f32), true); auto bs = Variable(fl::rand({1, 1, 6, 1}, fl::dtype::f32), true); + int px = 2, py = 1; int sx = 1, sy = 1; int dx = 1, dy = 1; @@ -40,7 +41,8 @@ TEST(AutogradConv2DTest, Convolve) { /* groups */ 1, benchmarks); }; - ASSERT_TRUE(fl::detail::jacobianTestImpl(funcConvIn, in, 0.06)); + ASSERT_TRUE(fl::detail::jacobianTestImpl(funcConvIn, in, 0.06, 1E-4, {&wt})); + auto funcConvWt = [&](Variable& weight) { return conv2d( in, @@ -55,7 +57,8 @@ TEST(AutogradConv2DTest, Convolve) { /* groups */ 1, benchmarks); }; - ASSERT_TRUE(fl::detail::jacobianTestImpl(funcConvWt, wt, 0.06)); + ASSERT_TRUE(fl::detail::jacobianTestImpl(funcConvWt, wt, 0.06, 1E-4, {&in})); + auto funcConvBs = [&](Variable& bias) { return conv2d( in, @@ -70,7 +73,7 @@ TEST(AutogradConv2DTest, Convolve) { /* groups */ 1, benchmarks); }; - ASSERT_TRUE(fl::detail::jacobianTestImpl(funcConvBs, bs, 0.03)); + ASSERT_TRUE(fl::detail::jacobianTestImpl(funcConvBs, bs, 0.03, 1E-4, {&in, &wt})); } TEST_F(AutogradTestF16, ConvolveF16) { @@ -83,6 +86,7 @@ TEST_F(AutogradTestF16, ConvolveF16) { Variable(fl::rand({3, 1, 2, 1}, fl::dtype::f16) * scaleFactor, true); auto wt = Variable(fl::rand({2, 1, 2, 1}, fl::dtype::f16), true); auto bs = Variable(fl::rand({1, 1, 1, 1}, fl::dtype::f16), true); + int px = 1, py = 1; int sx = 1, sy = 1; int dx = 1, dy = 1; @@ -101,7 +105,8 @@ TEST_F(AutogradTestF16, ConvolveF16) { /* groups */ 1, benchmarks); }; - ASSERT_TRUE(fl::detail::jacobianTestImpl(funcConvIn, in, 5e-1, 0.1)); + ASSERT_TRUE(fl::detail::jacobianTestImpl(funcConvIn, in, 5e-1, 0.1, {&wt, &bs})); + auto funcConvWt = [&](Variable& weight) { return conv2d( in, @@ -116,7 +121,8 @@ TEST_F(AutogradTestF16, ConvolveF16) { /* groups */ 1, benchmarks); }; - ASSERT_TRUE(fl::detail::jacobianTestImpl(funcConvWt, wt, 5e-2, 0.1)); + ASSERT_TRUE(fl::detail::jacobianTestImpl(funcConvWt, wt, 5e-2, 0.1, {&in, &bs})); + auto funcConvBs = [&](Variable& bias) { return conv2d( in, @@ -131,7 +137,7 @@ TEST_F(AutogradTestF16, ConvolveF16) { /* groups */ 1, 
benchmarks); }; - ASSERT_TRUE(fl::detail::jacobianTestImpl(funcConvBs, bs, 3e-2, 0.1)); + ASSERT_TRUE(fl::detail::jacobianTestImpl(funcConvBs, bs, 3e-2, 0.1, {&in, &wt})); } TEST(AutogradConv2DTest, ConvolveFilterGroups) { @@ -150,15 +156,15 @@ TEST(AutogradConv2DTest, ConvolveFilterGroups) { auto funcConvIn = [&](Variable& input) { return conv2d(input, wt, bs, sx, sy, px, py, dx, dy, groups); }; - ASSERT_TRUE(fl::detail::jacobianTestImpl(funcConvIn, in, 0.06)); + ASSERT_TRUE(fl::detail::jacobianTestImpl(funcConvIn, in, 0.06, 1E-4, {&wt, &bs})); auto funcConvWt = [&](Variable& weight) { return conv2d(in, weight, bs, sx, sy, px, py, dx, dy, groups); }; - ASSERT_TRUE(fl::detail::jacobianTestImpl(funcConvWt, wt, 0.05)); + ASSERT_TRUE(fl::detail::jacobianTestImpl(funcConvWt, wt, 0.05, 1E-4, {&in, &bs})); auto foncConvBs = [&](Variable& bias) { return conv2d(in, wt, bias, sx, sy, px, py, dx, dy, groups); }; - ASSERT_TRUE(fl::detail::jacobianTestImpl(foncConvBs, bs, 0.02)); + ASSERT_TRUE(fl::detail::jacobianTestImpl(foncConvBs, bs, 0.02, 1E-4, {&in, &wt})); } TEST(AutogradConv2DTest, ConvolveDilation) { @@ -181,7 +187,7 @@ TEST(AutogradConv2DTest, ConvolveDilation) { dy, /* groups */ 1); }; - ASSERT_TRUE(fl::detail::jacobianTestImpl(funcConvIn, in, 0.06)); + ASSERT_TRUE(fl::detail::jacobianTestImpl(funcConvIn, in, 0.06, 1E-4, {&wt, &bs})); auto funcConvWt = [&](Variable& weight) { return conv2d( in, @@ -195,7 +201,7 @@ TEST(AutogradConv2DTest, ConvolveDilation) { dy, /* groups */ 1); }; - ASSERT_TRUE(fl::detail::jacobianTestImpl(funcConvWt, wt, 0.05)); + ASSERT_TRUE(fl::detail::jacobianTestImpl(funcConvWt, wt, 0.05, 1E-4, {&in, &bs})); auto funcConvBs = [&](Variable& bias) { return conv2d( in, @@ -209,7 +215,7 @@ TEST(AutogradConv2DTest, ConvolveDilation) { dy, /* groups */ 1); }; - ASSERT_TRUE(fl::detail::jacobianTestImpl(funcConvBs, bs, 0.02)); + ASSERT_TRUE(fl::detail::jacobianTestImpl(funcConvBs, bs, 0.02, 1E-4, {&in, &wt})); } TEST(AutogradConv2DTest, WeightNormConv) { @@ -233,7 +239,7 @@ TEST(AutogradConv2DTest, WeightNormConv) { /* dy */ 1, /* groups */ 1); }; - ASSERT_TRUE(fl::detail::jacobianTestImpl(funcWeightNormIn, in, 3E-1)); + ASSERT_TRUE(fl::detail::jacobianTestImpl(funcWeightNormIn, in, 3E-1, 1E-4, {&v, &g})); auto funcWeightNormV = [&](Variable& input) { auto w = input * @@ -250,7 +256,7 @@ TEST(AutogradConv2DTest, WeightNormConv) { /* dy */ 1, /* groups */ 1); }; - ASSERT_TRUE(fl::detail::jacobianTestImpl(funcWeightNormV, v, 2E-1)); + ASSERT_TRUE(fl::detail::jacobianTestImpl(funcWeightNormV, v, 2E-1, 1E-4, {&g, &in})); auto funcWeightNormG = [&](Variable& input) { auto w = v * @@ -267,7 +273,7 @@ TEST(AutogradConv2DTest, WeightNormConv) { /* dy */ 1, /* groups */ 1); }; - ASSERT_TRUE(fl::detail::jacobianTestImpl(funcWeightNormG, g, 2E-1)); + ASSERT_TRUE(fl::detail::jacobianTestImpl(funcWeightNormG, g, 2E-1, 1E-4, {&v, &in})); } int main(int argc, char** argv) { diff --git a/flashlight/fl/test/autograd/AutogradNormalizationTest.cpp b/flashlight/fl/test/autograd/AutogradNormalizationTest.cpp index 81479e1..7663b9b 100644 --- a/flashlight/fl/test/autograd/AutogradNormalizationTest.cpp +++ b/flashlight/fl/test/autograd/AutogradNormalizationTest.cpp @@ -240,19 +240,22 @@ TEST(AutogradNormalizationTest, BatchNormJacobian) { return (batchnorm( in, weight, bias, runningMean, runningVar, featAxes, true, 0.0, 1E-5)); }; - ASSERT_TRUE(fl::detail::jacobianTestImpl(funcBnIn, input, 1e-2, 1e-4)); + + + ASSERT_TRUE(fl::detail::jacobianTestImpl(funcBnIn, input, 1e-2, 1e-4, {&weight, &bias})); 
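+  // The trailing initializer list names the other Variables involved in the
+  // op under test (weight and bias here, while the input is perturbed) and is
+  // passed through to the extended jacobianTestImpl overload.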
   auto funcBnWt = [&](Variable& wt) {
     return (batchnorm(
         input, wt, bias, runningMean, runningVar, featAxes, true, 0.0, 1E-5));
   };
-  ASSERT_TRUE(fl::detail::jacobianTestImpl(funcBnWt, weight, 1e-2, 1e-4));
+  ASSERT_TRUE(fl::detail::jacobianTestImpl(funcBnWt, weight, 1e-2, 1e-4, {&input, &bias}));
+
 
   auto funcBnBs = [&](Variable& bs) {
     return (batchnorm(
         input, weight, bs, runningMean, runningVar, featAxes, true, 0.0, 1E-5));
   };
-  ASSERT_TRUE(fl::detail::jacobianTestImpl(funcBnBs, bias, 1e-2, 1e-4));
+  ASSERT_TRUE(fl::detail::jacobianTestImpl(funcBnBs, bias, 1e-2, 1e-4, {&input, &weight}));
 }
 
 TEST_F(AutogradTestF16, BatchNormJacobianF16) {
@@ -276,25 +279,25 @@ TEST_F(AutogradTestF16, BatchNormJacobianF16) {
     return (batchnorm(
         in, weight, bias, runningMean, runningVar, featAxes, true, 0.0, 1E-5));
   };
-  ASSERT_TRUE(fl::detail::jacobianTestImpl(funcBnIn, input, 5e-2, 1e-1));
+  ASSERT_TRUE(fl::detail::jacobianTestImpl(funcBnIn, input, 5e-2, 1e-1, {&weight, &bias}));
 
   auto funcBnWt = [&](Variable& wt) {
     return (batchnorm(
         input, wt, bias, runningMean, runningVar, featAxes, true, 0.0, 1E-5));
   };
-  ASSERT_TRUE(fl::detail::jacobianTestImpl(funcBnWt, weight, 5e-2, 1e-1));
+  ASSERT_TRUE(fl::detail::jacobianTestImpl(funcBnWt, weight, 5e-2, 1e-1, {&input, &bias}));
 
   auto funcBnBs = [&](Variable& bs) {
     return (batchnorm(
         input, weight, bs, runningMean, runningVar, featAxes, true, 0.0, 1E-5));
   };
-  ASSERT_TRUE(fl::detail::jacobianTestImpl(funcBnBs, bias, 5e-2, 1e-1));
+  ASSERT_TRUE(fl::detail::jacobianTestImpl(funcBnBs, bias, 5e-2, 1e-1, {&input, &weight}));
 }
 
 TEST(AutogradNormalizationTest, BatchNormJacobianMultipleAxes) {
   // Jacobian Test with trainMode = true;
   std::vector<int> featAxes = {0, 1, 2};
-  auto input = Variable(fl::rand({8, 8, 3, 16}, fl::dtype::f32), true);
+  auto input = Variable(fl::rand({4, 4, 3, 4}, fl::dtype::f32), true);
   auto nfeatures = 1;
   for (auto ax : featAxes) {
     nfeatures *= input.dim(ax);
@@ -308,19 +311,19 @@ TEST(AutogradNormalizationTest, BatchNormJacobianMultipleAxes) {
     return (batchnorm(
         in, weight, bias, runningMean, runningVar, featAxes, true, 0.0, 1E-5));
   };
-  ASSERT_TRUE(fl::detail::jacobianTestImpl(funcBnIn, input, 1e-2, 1e-3));
+  ASSERT_TRUE(fl::detail::jacobianTestImpl(funcBnIn, input, 1e-2, 1e-3, {&weight, &bias}));
 
   auto funcBnWt = [&](Variable& wt) {
     return (batchnorm(
         input, wt, bias, runningMean, runningVar, featAxes, true, 0.0, 1E-5));
   };
-  ASSERT_TRUE(fl::detail::jacobianTestImpl(funcBnWt, weight, 1e-2, 1e-3));
+  ASSERT_TRUE(fl::detail::jacobianTestImpl(funcBnWt, weight, 1e-2, 1e-3, {&input, &bias}));
 
   auto funcBnBs = [&](Variable& bs) {
     return (batchnorm(
         input, weight, bs, runningMean, runningVar, featAxes, true, 0.0, 1E-5));
   };
-  ASSERT_TRUE(fl::detail::jacobianTestImpl(funcBnBs, bias, 1e-2, 1e-3));
+  ASSERT_TRUE(fl::detail::jacobianTestImpl(funcBnBs, bias, 1e-2, 1e-3, {&input, &weight}));
 }
 
 TEST_F(AutogradTestF16, BatchNormJacobianMultipleAxesF16) {
@@ -347,19 +350,19 @@ TEST_F(AutogradTestF16, BatchNormJacobianMultipleAxesF16) {
         in, weight, bias, runningMean, runningVar, featAxes, true, 0.0, 1E-5));
   };
   ASSERT_TRUE(fl::detail::jacobianTestImpl(
-      funcBnIn, input, 5e-2, 1e-1)); // TODO: investigate
+      funcBnIn, input, 5e-2, 1e-1, {&weight, &bias})); // TODO: investigate
 
   auto funcBnWt = [&](Variable& wt) {
     return (batchnorm(
         input, wt, bias, runningMean, runningVar, featAxes, true, 0.0, 1E-5));
   };
-  ASSERT_TRUE(fl::detail::jacobianTestImpl(funcBnWt, weight, 5e-2, 1e-1));
+  ASSERT_TRUE(fl::detail::jacobianTestImpl(funcBnWt, weight, 5e-2, 1e-1, {&input, &bias}));
 
   auto funcBnBs = [&](Variable& bs) {
     return (batchnorm(
         input, weight, bs, runningMean, runningVar, featAxes, true, 0.0, 1E-5));
   };
-  ASSERT_TRUE(fl::detail::jacobianTestImpl(funcBnBs, bias, 5e-2, 1e-1));
+  ASSERT_TRUE(fl::detail::jacobianTestImpl(funcBnBs, bias, 5e-2, 1e-1, {&input, &weight}));
 }
 
 TEST(AutogradNormalizationTest, LayerNormJacobian) {
@@ -379,7 +382,7 @@ TEST(AutogradNormalizationTest, LayerNormJacobian) {
         in, weight, bias, runningMean, runningVar, featAxes, true, 0.0, 1E-5);
   };
 
-  ASSERT_TRUE(fl::detail::jacobianTestImpl(funcLnIn, input, 1e-2, 1e-4));
+  ASSERT_TRUE(fl::detail::jacobianTestImpl(funcLnIn, input, 1e-2, 1e-4, {&weight, &bias}));
 }
 
 TEST_F(AutogradTestF16, LayerNormJacobianF16) {
@@ -405,7 +408,7 @@ TEST_F(AutogradTestF16, LayerNormJacobianF16) {
         in, weight, bias, runningMean, runningVar, featAxes, true, 0.0, 1E-5);
   };
 
-  ASSERT_TRUE(fl::detail::jacobianTestImpl(funcLnIn, input, 1e-4, 1e-2));
+  ASSERT_TRUE(fl::detail::jacobianTestImpl(funcLnIn, input, 1e-4, 1e-2, {&weight, &bias}));
 }
 
 int main(int argc, char** argv) {
diff --git a/flashlight/fl/test/autograd/AutogradRnnTest.cpp b/flashlight/fl/test/autograd/AutogradRnnTest.cpp
index 2201fdb..a0eb335 100644
--- a/flashlight/fl/test/autograd/AutogradRnnTest.cpp
+++ b/flashlight/fl/test/autograd/AutogradRnnTest.cpp
@@ -64,7 +64,7 @@ void testRnnImpl(RnnMode mode, fl::dtype precision = fl::dtype::f64) {
         bidirectional,
         0.0));
   };
-  ASSERT_TRUE(fl::detail::jacobianTestImpl(funcRnnIn, in, expectedPrecision, perturbation));
+  ASSERT_TRUE(fl::detail::jacobianTestImpl(funcRnnIn, in, expectedPrecision, perturbation, {&w}));
 
   auto funcRnnW = [&](Variable& weights) -> Variable {
     return std::get<0>(
@@ -78,7 +78,7 @@ void testRnnImpl(RnnMode mode, fl::dtype precision = fl::dtype::f64) {
         bidirectional,
         0.0));
   };
-  ASSERT_TRUE(fl::detail::jacobianTestImpl(funcRnnW, w, expectedPrecision, perturbation));
+  ASSERT_TRUE(fl::detail::jacobianTestImpl(funcRnnW, w, expectedPrecision, perturbation, {&in}));
 
   // We get the correct gradient for hx
   auto hx = Variable(
@@ -98,7 +98,7 @@ void testRnnImpl(RnnMode mode, fl::dtype precision = fl::dtype::f64) {
         bidirectional,
         0.0));
   };
-  ASSERT_TRUE(fl::detail::jacobianTestImpl(funcRnnHx, hx, expectedPrecision, perturbation));
+  ASSERT_TRUE(fl::detail::jacobianTestImpl(funcRnnHx, hx, expectedPrecision, perturbation, {&in, &w}));
 
   // We can compute the gradient w.r.t. hy
   auto funcRnnInDhy = [&](Variable& input) -> Variable {
@@ -114,7 +114,7 @@ void testRnnImpl(RnnMode mode, fl::dtype precision = fl::dtype::f64) {
         0.0));
   };
   ASSERT_TRUE(
-      fl::detail::jacobianTestImpl(funcRnnInDhy, in, expectedPrecision, perturbation));
+      fl::detail::jacobianTestImpl(funcRnnInDhy, in, expectedPrecision, perturbation, {&w}));
 
   if (mode == RnnMode::LSTM) {
     // We get the correct gradient for cx
@@ -136,7 +136,7 @@ void testRnnImpl(RnnMode mode, fl::dtype precision = fl::dtype::f64) {
         0.0));
   };
   ASSERT_TRUE(
-      fl::detail::jacobianTestImpl(funcRnnCx, cx, expectedPrecision, perturbation));
+      fl::detail::jacobianTestImpl(funcRnnCx, cx, expectedPrecision, perturbation, {&in, &w}));
 
     // We can compute the gradient w.r.t. cy
     auto funcRnnInDcy = [&](Variable& input) -> Variable {
@@ -152,7 +152,7 @@ void testRnnImpl(RnnMode mode, fl::dtype precision = fl::dtype::f64) {
         0.0));
   };
   ASSERT_TRUE(
-      fl::detail::jacobianTestImpl(funcRnnInDcy, in, expectedPrecision, perturbation));
+      fl::detail::jacobianTestImpl(funcRnnInDcy, in, expectedPrecision, perturbation, {&w}));
   }
 }
 
diff --git a/flashlight/fl/test/autograd/AutogradTest.cpp b/flashlight/fl/test/autograd/AutogradTest.cpp
index 4253545..f506e32 100644
--- a/flashlight/fl/test/autograd/AutogradTest.cpp
+++ b/flashlight/fl/test/autograd/AutogradTest.cpp
@@ -198,12 +198,12 @@ TEST(AutogradTest, Concatenate) {
   auto funcConcatenateT1 = [x2, x3, x4](Variable& in) {
     return concatenate({in, x2, x3, x4}, 2);
   };
-  ASSERT_TRUE(fl::detail::jacobianTestImpl(funcConcatenateT1, x1));
+  ASSERT_TRUE(fl::detail::jacobianTestImpl(funcConcatenateT1, x1, 1E-5, 1E-4, {&x2, &x3, &x4}));
 
   auto funcConcatenateT2 = [x1, x2, x4](Variable& in) {
     return concatenate({x1, x2, in, x4}, 2);
   };
-  ASSERT_TRUE(fl::detail::jacobianTestImpl(funcConcatenateT2, x3));
+  ASSERT_TRUE(fl::detail::jacobianTestImpl(funcConcatenateT2, x3, 1E-5, 1E-4, {&x1, &x2, &x4}));
 }
 
 TEST(AutogradTest, Split) {
diff --git a/flashlight/fl/test/autograd/AutogradTestUtils.h b/flashlight/fl/test/autograd/AutogradTestUtils.h
index 39bd605..9dafbdb 100644
--- a/flashlight/fl/test/autograd/AutogradTestUtils.h
+++ b/flashlight/fl/test/autograd/AutogradTestUtils.h
@@ -34,7 +34,8 @@ inline bool jacobianTestImpl(
     const JacobianFunc& func,
     Variable& input,
     float precision = 1E-5,
-    float perturbation = 1E-4) {
+    float perturbation = 1E-4,
+    const std::vector<Variable*>& zeroGradientVariables = {}) {
   auto fwdJacobian =
       Tensor({func(input).elements(), input.elements()}, fl::dtype::f32);
@@ -60,6 +61,9 @@ inline bool jacobianTestImpl(
   for (int i = 0; i < dout.elements(); ++i) {
     dout.tensor().flat(i) = 1; // element in 1D view
     input.zeroGrad();
+    for (auto* var : zeroGradientVariables) {
+      var->zeroGrad();
+    }
     auto out = func(input);
     out.backward(dout);
diff --git a/flashlight/fl/test/common/DynamicBenchmarkTest.cpp b/flashlight/fl/test/common/DynamicBenchmarkTest.cpp
index 79fb76d..8d7fce7 100644
--- a/flashlight/fl/test/common/DynamicBenchmarkTest.cpp
+++ b/flashlight/fl/test/common/DynamicBenchmarkTest.cpp
@@ -77,7 +77,7 @@ TEST_F(DynamicBenchmark, OptionsStateTimed) {
 
 TEST_F(DynamicBenchmark, DynamicBenchmarkSimple) {
   size_t maxCount = 5;
-  std::vector sleepTimes = {4, 2, 6};
+  std::vector sleepTimes = {30, 16, 40}; //min 16ms (win)
   auto options = std::make_shared>(sleepTimes, maxCount);
 
@@ -90,12 +90,12 @@ TEST_F(DynamicBenchmark, DynamicBenchmarkSimple) {
   }
   ASSERT_TRUE(options->timingsComplete());
   // sleeping for fewer miliseconds is faster
-  ASSERT_EQ(options->currentOption(), 2);
+  ASSERT_EQ(options->currentOption(), sleepTimes[1]);
 }
 
 TEST_F(DynamicBenchmark, DynamicBenchmarkDisjointLambdas) {
   size_t maxCount = 5;
-  std::vector sleepTimes = {4, 2, 6};
+  std::vector sleepTimes = {30, 16, 40};
   auto options = std::make_shared>(sleepTimes, maxCount);
 
@@ -120,7 +120,7 @@ TEST_F(DynamicBenchmark, DynamicBenchmarkDisjointLambdas) {
   }
   ASSERT_TRUE(options->timingsComplete());
   // option 2 is still fastest disregarding intermediate time
-  ASSERT_EQ(options->currentOption(), 2);
+  ASSERT_EQ(options->currentOption(), sleepTimes[1]);
 }
 
 TEST_F(DynamicBenchmark, DynamicBenchmarkMatmul) {
diff --git a/flashlight/fl/test/common/LoggingTest.cpp b/flashlight/fl/test/common/LoggingTest.cpp
index 0807336..def67cf 100644
--- a/flashlight/fl/test/common/LoggingTest.cpp
+++ b/flashlight/fl/test/common/LoggingTest.cpp
@@ -135,7 +135,7 @@ TEST(LoggingDeathTest, FatalOnOff) {
   std::cerr.rdbuf(origStderrBuffer);
 
   Logging::setMaxLoggingLevel(fl::LogLevel::FATAL);
-  EXPECT_DEATH({ FL_LOG(fl::LogLevel::FATAL) << "log-fatal"; }, "");
+  EXPECT_DEATH_IF_SUPPORTED({ FL_LOG(fl::LogLevel::FATAL) << "log-fatal"; }, "");
 }
 
 } // namespace
diff --git a/flashlight/fl/test/runtime/DeviceManagerTest.cpp b/flashlight/fl/test/runtime/DeviceManagerTest.cpp
index f2d228e..bbe6426 100644
--- a/flashlight/fl/test/runtime/DeviceManagerTest.cpp
+++ b/flashlight/fl/test/runtime/DeviceManagerTest.cpp
@@ -38,7 +38,7 @@ TEST(DeviceManagerTest, getDeviceCount) {
     ASSERT_NO_THROW(manager.getDeviceCount(DeviceType::CUDA));
   } else {
     ASSERT_THROW(manager.getDeviceCount(DeviceType::CUDA),
-      std::invalid_argument);
+      std::runtime_error);
   }
 }
 
@@ -54,7 +54,7 @@ TEST(DeviceManagerTest, getDevicesOfType) {
     }
   } else {
     ASSERT_THROW(manager.getDeviceCount(DeviceType::CUDA),
-      std::invalid_argument);
+      std::runtime_error);
    }
   }
 }
@@ -72,7 +72,7 @@ TEST(DeviceManagerTest, getActiveDevice) {
     if (manager.isDeviceTypeAvailable(type)) {
       ASSERT_EQ(manager.getActiveDevice(type).type(), type);
     } else {
-      ASSERT_THROW(manager.getActiveDevice(type), std::invalid_argument);
+      ASSERT_THROW(manager.getActiveDevice(type), std::runtime_error);
     }
   }
 }
diff --git a/flashlight/fl/test/tensor/TensorBaseTest.cpp b/flashlight/fl/test/tensor/TensorBaseTest.cpp
index fd0553b..283c55e 100644
--- a/flashlight/fl/test/tensor/TensorBaseTest.cpp
+++ b/flashlight/fl/test/tensor/TensorBaseTest.cpp
@@ -475,8 +475,11 @@ void assertScalarBehavior(fl::dtype type) {
         << ", ScalarArgType: " << dtype_traits<ScalarArgType>::getName();
   }
 
-  auto a = fl::rand({5, 6}, type);
-  ASSERT_TRUE(allClose(fl::full({1}, a.scalar<ScalarArgType>(), type), a(0, 0)))
+
+  ScalarArgType val = static_cast<ScalarArgType>(rand());
+  auto a = fl::full({5, 6}, val, type);
+
+  ASSERT_TRUE(allClose(fl::full({1}, a.template scalar<ScalarArgType>(), type), a(0, 0)))
       << "dtype: " << type
       << ", ScalarArgType: " << dtype_traits<ScalarArgType>::getName();
 }
diff --git a/flashlight/fl/test/tensor/TensorReductionTest.cpp b/flashlight/fl/test/tensor/TensorReductionTest.cpp
index dac0757..04ce54b 100644
--- a/flashlight/fl/test/tensor/TensorReductionTest.cpp
+++ b/flashlight/fl/test/tensor/TensorReductionTest.cpp
@@ -23,8 +23,9 @@ TEST(TensorReductionTest, countNonzero) {
     a(idx / 10, idx % 10) = 0;
   }
 
-  ASSERT_TRUE(
-      allClose(fl::fromScalar(a.elements() - idxs.size()), fl::countNonzero(a)));
+  ASSERT_TRUE(allClose(
+      fl::fromScalar(a.elements() - idxs.size(), a.type()),
+      fl::countNonzero(a)));
 
   std::vector sizes(a.shape().dim(0));
   for (unsigned i = 0; i < a.shape().dim(0); ++i) {
@@ -44,7 +45,7 @@ TEST(TensorReductionTest, countNonzero) {
       fl::Tensor::fromVector({2}, {4, 1}),
       fl::countNonzero(b, {0, 1})));
   ASSERT_TRUE(
-      allClose(fl::fromScalar(b.elements() - 3), fl::countNonzero(b, {0, 1, 2})));
+      allClose(fl::fromScalar(b.elements() - 3, b.type()), fl::countNonzero(b, {0, 1, 2})));
 }
 
 TEST(TensorReductionTest, amin) {
diff --git a/flashlight/pkg/speech/audio/feature/CMakeLists.txt b/flashlight/pkg/speech/audio/feature/CMakeLists.txt
index f972509..4028388 100644
--- a/flashlight/pkg/speech/audio/feature/CMakeLists.txt
+++ b/flashlight/pkg/speech/audio/feature/CMakeLists.txt
@@ -2,14 +2,19 @@ cmake_minimum_required(VERSION 3.16)
 
 # ----------------------------- Dependencies -----------------------------
 # BLAS
-find_package(MKL)
+find_package(MKL CONFIG REQUIRED)
 if (MKL_FOUND)
   set(FL_USE_MKL ON)
-  setup_install_find_module(${PROJECT_SOURCE_DIR}/cmake/FindMKL.cmake)
-  set(CBLAS_LIBRARIES ${MKL_LIBRARIES})
-  set(CBLAS_INCLUDE_DIR ${MKL_INCLUDE_DIR})
+  #setup_install_find_module(${PROJECT_SOURCE_DIR}/cmake/FindMKL.cmake)
+  #set(CBLAS_LIBRARIES ${MKL_LIBRARIES})
+  #set(CBLAS_INCLUDE_DIR ${MKL_INCLUDE_DIR})
   # TODO: remove me when we consolidate build options
+
+  # TODO linking to MKL doesn't work with gcc because of openmpi issues
   target_compile_definitions(fl_pkg_speech PUBLIC FL_USE_MKL=$)
+  target_link_libraries(fl_pkg_speech PUBLIC MKL::MKL)
+
+
 else()
   find_package(CBLAS REQUIRED)
   setup_install_find_module(${PROJECT_SOURCE_DIR}/cmake/FindCBLAS.cmake)
@@ -29,11 +34,11 @@ endif()
 target_link_libraries(fl_pkg_speech PUBLIC $)
 
 # OpenMP
-if (NOT MKL_FOUND)
+#if (NOT MKL_FOUND)
   # NB: MKL provides iomp if enabled
-  find_package(OpenMP REQUIRED)
-  target_link_libraries(fl_pkg_speech PRIVATE OpenMP::OpenMP_CXX)
-endif()
+# find_package(OpenMP REQUIRED)
+# target_link_libraries(fl_pkg_speech PRIVATE OpenMP::OpenMP_CXX)
+#endif()
 
 # Threads
 find_package(Threads REQUIRED)
diff --git a/flashlight/pkg/speech/test/criterion/attention/AttentionTest.cpp b/flashlight/pkg/speech/test/criterion/attention/AttentionTest.cpp
index c1bc4d9..1bddd6b 100644
--- a/flashlight/pkg/speech/test/criterion/attention/AttentionTest.cpp
+++ b/flashlight/pkg/speech/test/criterion/attention/AttentionTest.cpp
@@ -22,7 +22,8 @@ bool jacobianTestImpl(
     const JacobianFunc& func,
     Variable& input,
     float precision = 1E-5,
-    float perturbation = 1E-4) {
+    float perturbation = 1E-4,
+    const std::vector<Variable*>& zeroGradientVariables = {}) {
   auto fwdJacobian =
       Tensor({func(input).elements(), input.elements()}, fl::dtype::f32);
@@ -48,6 +49,9 @@ bool jacobianTestImpl(
   for (int i = 0; i < dout.elements(); ++i) {
     dout.tensor().flat(i) = 1; // element in 1D view
     input.zeroGrad();
+    for (auto* var : zeroGradientVariables) {
+      var->zeroGrad();
+    }
     auto out = func(input);
     out.backward(dout);
diff --git a/vcpkg.json b/vcpkg.json
index c91b75c..7dcf289 100644
--- a/vcpkg.json
+++ b/vcpkg.json
@@ -12,7 +12,9 @@
     },
     "cpu": {
       "description": "Dependencies for cpu backend",
-      "dependencies": []
+      "dependencies": [
+        "onednn"
+      ]
     }
   }
 }