pax_global_header00006660000000000000000000000064145651566420014530gustar00rootroot0000000000000052 comment=52541176253e74467dabc42eeee63d9a62c199f6 Halide-17.0.1/000077500000000000000000000000001456515664200130045ustar00rootroot00000000000000Halide-17.0.1/.clang-format000066400000000000000000000026631456515664200153660ustar00rootroot00000000000000--- AccessModifierOffset: -4 AlignEscapedNewlines: Left AlignTrailingComments: true AllowAllParametersOfDeclarationOnNextLine: true AllowShortBlocksOnASingleLine: Empty AllowShortFunctionsOnASingleLine: None AllowShortIfStatementsOnASingleLine: Always AllowShortLoopsOnASingleLine: false AlwaysBreakBeforeMultilineStrings: false AlwaysBreakTemplateDeclarations: Yes BinPackParameters: true BreakBeforeBinaryOperators: None BreakBeforeBraces: Attach BreakBeforeTernaryOperators: false BreakConstructorInitializersBeforeComma: false ColumnLimit: 0 ConstructorInitializerAllOnOneLineOrOnePerLine: false ConstructorInitializerIndentWidth: 4 ContinuationIndentWidth: 4 Cpp11BracedListStyle: true DerivePointerAlignment: false ExperimentalAutoDetectBinPacking: false IndentCaseLabels: false IndentWrappedFunctionNames: false IndentWidth: 4 MaxEmptyLinesToKeep: 1 NamespaceIndentation: None ObjCSpaceBeforeProtocolList: true PenaltyBreakBeforeFirstCallParameter: 19 PenaltyBreakComment: 60 PenaltyBreakFirstLessLess: 120 PenaltyBreakString: 1000 PenaltyExcessCharacter: 1000000 PenaltyReturnTypeOnItsOwnLine: 60 PointerAlignment: Right SpaceBeforeParens: ControlStatements SpaceAfterCStyleCast: false SpaceAfterTemplateKeyword: false SpaceBeforeAssignmentOperators: true SpaceInEmptyParentheses: false SpacesBeforeTrailingComments: 2 SpacesInAngles: false SpacesInCStyleCastParentheses: false SpacesInParentheses: false Standard: c++17 TabWidth: 8 UseTab: Never ... Halide-17.0.1/.clang-format-ignore000066400000000000000000000005771456515664200166510ustar00rootroot00000000000000# .clang-format-ignore ./bin ./build ./cmake_build ./cmake_build_static ./cmake_build_shared ./distrib ./doc ./include ./lib # Our tutorials have special formatting: skip them ./tutorial # hexagon_remote/bin/src is also special ./src/runtime/hexagon_remote/bin/src # mini_webgpu.h is copied from upstream with some local mods ./src/runtime/mini_webgpu.h ./dependencies/spirv *.fbs.h Halide-17.0.1/.clang-tidy000066400000000000000000000156051456515664200150470ustar00rootroot00000000000000# TODO: some of the blocklisted bugprone checks can/should be re-enabled # one at a time (with careful code fixes made as necessary). --- Checks: > -*, # For reasons that aren't clear, clang-tidy-16 will apparently # ignore disable-check requests if they were already enabled # by a glob, so we will individually enumerate all the ones we want (or don't want). 
# bugprone-*, bugprone-argument-comment, bugprone-assert-side-effect, -bugprone-assignment-in-if-condition, bugprone-bad-signal-to-kill-thread, bugprone-bool-pointer-implicit-conversion, -bugprone-branch-clone, bugprone-copy-constructor-init, bugprone-dangling-handle, bugprone-dynamic-static-initializers, -bugprone-easily-swappable-parameters, -bugprone-exception-escape, bugprone-fold-init-type, bugprone-forward-declaration-namespace, bugprone-forwarding-reference-overload, -bugprone-implicit-widening-of-multiplication-result, bugprone-inaccurate-erase, bugprone-incorrect-roundings, bugprone-infinite-loop, -bugprone-integer-division, bugprone-lambda-function-name, bugprone-macro-parentheses, bugprone-macro-repeated-side-effects, bugprone-misplaced-operator-in-strlen-in-alloc, bugprone-misplaced-pointer-arithmetic-in-alloc, bugprone-misplaced-widening-cast, bugprone-move-forwarding-reference, bugprone-multiple-statement-macro, -bugprone-narrowing-conversions,, bugprone-no-escape, bugprone-not-null-terminated-result, bugprone-parent-virtual-call, bugprone-posix-return, bugprone-redundant-branch-condition, -bugprone-reserved-identifier, bugprone-shared-ptr-array-mismatch, bugprone-signal-handler, -bugprone-signed-char-misuse, bugprone-sizeof-container, bugprone-sizeof-expression, bugprone-spuriously-wake-up-functions, bugprone-standalone-empty, bugprone-string-constructor, bugprone-string-integer-assignment, bugprone-string-literal-with-embedded-nul, bugprone-stringview-nullptr, bugprone-suspicious-enum-usage, bugprone-suspicious-include, bugprone-suspicious-memory-comparison, bugprone-suspicious-memset-usage, bugprone-suspicious-missing-comma, bugprone-suspicious-realloc-usage, bugprone-suspicious-semicolon, bugprone-suspicious-string-compare, bugprone-swapped-arguments, bugprone-terminating-continue, bugprone-throw-keyword-missing, bugprone-too-small-loop-variable, bugprone-unchecked-optional-access, bugprone-undefined-memory-manipulation, bugprone-undelegated-constructor, bugprone-unhandled-exception-at-new, bugprone-unhandled-self-assignment, bugprone-unused-raii, bugprone-unused-return-value, bugprone-use-after-move, bugprone-virtual-near-miss, clang-diagnostic-shadow-field, misc-confusable-identifiers, -misc-const-correctness, misc-definitions-in-headers, misc-misleading-bidirectional, misc-misleading-identifier, misc-misplaced-const, misc-new-delete-overloads, -misc-no-recursion, misc-non-copyable-objects, -misc-non-private-member-variables-in-classes, misc-redundant-expression, misc-static-assert, misc-throw-by-value-catch-by-reference, -misc-unconventional-assign-operator, misc-uniqueptr-reset-release, misc-unused-alias-decls, -misc-unused-parameters, misc-unused-using-decls, misc-use-anonymous-namespace, -modernize-avoid-bind, -modernize-avoid-c-arrays, -modernize-concat-nested-namespaces, modernize-deprecated-headers, -modernize-deprecated-ios-base-aliases, -modernize-loop-convert, -modernize-macro-to-enum, modernize-make-shared, modernize-make-unique, -modernize-pass-by-value, -modernize-raw-string-literal, modernize-redundant-void-arg, -modernize-replace-auto-ptr, -modernize-replace-disallow-copy-and-assign-macro, -modernize-replace-random-shuffle, -modernize-return-braced-init-list, -modernize-shrink-to-fit, -modernize-unary-static-assert, -modernize-use-auto, modernize-use-bool-literals, modernize-use-default-member-init, modernize-use-emplace, modernize-use-equals-default, modernize-use-equals-delete, -modernize-use-nodiscard, -modernize-use-noexcept, modernize-use-nullptr, 
modernize-use-override, -modernize-use-trailing-return-type, -modernize-use-transparent-functors, -modernize-use-uncaught-exceptions, -modernize-use-using performance-faster-string-find, performance-for-range-copy, performance-implicit-conversion-in-loop, performance-inefficient-algorithm, -performance-inefficient-string-concatenation, -performance-inefficient-vector-operation, performance-move-const-arg, performance-move-constructor-init, performance-no-automatic-move, -performance-no-int-to-ptr, performance-noexcept-move-constructor, performance-trivially-destructible, performance-type-promotion-in-math-fn, performance-unnecessary-copy-initialization, performance-unnecessary-value-param, readability-avoid-const-params-in-decls, readability-braces-around-statements, readability-const-return-type, -readability-container-contains, -readability-container-data-pointer, readability-container-size-empty, -readability-convert-member-functions-to-static, -readability-delete-null-pointer, -readability-duplicate-include, -readability-else-after-return, -readability-function-cognitive-complexity, -readability-function-size, -readability-identifier-length, -readability-identifier-naming, -readability-implicit-bool-conversion, -readability-inconsistent-declaration-parameter-name, -readability-isolate-declaration, -readability-magic-numbers, -readability-make-member-function-const, -readability-misleading-indentation, readability-misplaced-array-index, -readability-named-parameter, -readability-non-const-parameter, readability-qualified-auto, readability-redundant-access-specifiers, readability-redundant-control-flow, -readability-redundant-declaration, readability-redundant-function-ptr-dereference, -readability-redundant-member-init, readability-redundant-preprocessor, readability-redundant-smartptr-get, readability-redundant-string-cstr, -readability-redundant-string-init, -readability-simplify-boolean-expr, readability-simplify-subscript-expr, readability-static-accessed-through-instance, readability-static-definition-in-anonymous-namespace, -readability-string-compare, -readability-suspicious-call-argument, -readability-uniqueptr-delete-release, -readability-uppercase-literal-suffix, -readability-use-anyofallof, WarningsAsErrors: '*' HeaderFilterRegex: '.*' FormatStyle: 'file' CheckOptions: - key: modernize-use-default-member-init.UseAssignment value: 1 ... Halide-17.0.1/.gitattributes000066400000000000000000000005261456515664200157020ustar00rootroot00000000000000# Set the default behavior, in case people don't have core.autocrlf set. * text=auto # Explicitly declare text files you want to always be normalized and converted # to native line endings on checkout. *.cpp text *.c text *.h text # Denote all files that are truly binary and should not be modified. 
*.png binary *.jpg binary *.tiff binary Halide-17.0.1/.github/000077500000000000000000000000001456515664200143445ustar00rootroot00000000000000Halide-17.0.1/.github/workflows/000077500000000000000000000000001456515664200164015ustar00rootroot00000000000000Halide-17.0.1/.github/workflows/pip.yml000066400000000000000000000170021456515664200177140ustar00rootroot00000000000000# Relevant GHA docs links: # https://docs.github.com/en/actions/using-jobs/running-jobs-in-a-container # https://docs.github.com/en/packages/managing-github-packages-using-github-actions-workflows/publishing-and-installing-a-package-with-github-actions#upgrading-a-workflow-that-accesses-ghcrio name: Build PyPI package on: push: branches: [ main ] release: types: [ created ] concurrency: group: '${{ github.workflow }}-${{ github.event.pull_request.head.label || github.head_ref || github.ref }}' cancel-in-progress: true env: LLVM_VER: 15.0.7 permissions: contents: read # to fetch code (actions/checkout) packages: read # to fetch packages (docker) jobs: # When creating 'dev' (e.g. nightly) PyPI packages, we need to create a unique # label for each upload. For simplicity, we choose the Unix time-since-epoch in # UTC form (aka `date +%s`). pip-labels: name: Create Label for PyPI Packages runs-on: ubuntu-latest outputs: halide_pypi_label: ${{ steps.make_label.outputs.unix_time_utc }} steps: - id: make_label run: echo "unix_time_utc=$(date +%s)" >> "$GITHUB_OUTPUT" pip-linux: name: Package Halide Python bindings runs-on: ubuntu-latest needs: pip-labels strategy: fail-fast: false matrix: arch: [ x86_64, aarch64 ] steps: - uses: actions/checkout@v3 - name: Log in to GitHub Container Registry uses: docker/login-action@v2.1.0 with: registry: ghcr.io username: ${{ github.actor }} password: ${{ secrets.GITHUB_TOKEN }} - name: Set up QEMU uses: docker/setup-qemu-action@v2.1.0 with: platforms: all - name: Build wheels uses: pypa/cibuildwheel@v2.11.2 env: CIBW_ARCHS_LINUX: "${{ matrix.arch }}" CIBW_BUILD: "cp38-manylinux* cp39-manylinux* cp310-manylinux*" CIBW_CONFIG_SETTINGS: "--global-option=egg_info --global-option=-b.dev${{ needs.pip-labels.outputs.halide_pypi_label }} --global-option=--build-number --global-option=${{github.run_id}}" CIBW_MANYLINUX_X86_64_IMAGE: ghcr.io/halide/manylinux2014_x86_64-llvm:${{ env.LLVM_VER }} # CIBW_MANYLINUX_I686_IMAGE: ghcr.io/halide/manylinux2014_i686-llvm:${{ env.LLVM_VER }} CIBW_MANYLINUX_AARCH64_IMAGE: ghcr.io/halide/manylinux2014_aarch64-llvm:${{ env.LLVM_VER }} CIBW_BEFORE_ALL_LINUX: > cmake -G Ninja -S . 
-B build -DCMAKE_BUILD_TYPE=Release -DWITH_DOCS=NO -DWITH_PYTHON_BINDINGS=NO -DWITH_TESTS=NO -DWITH_TUTORIALS=NO -DWITH_UTILS=NO -DWITH_PYTHON_STUBS=NO && cmake --build build --target install - uses: actions/upload-artifact@v3 with: name: wheels path: ./wheelhouse/*.whl pip-other: name: Package Halide Python bindings runs-on: ${{ matrix.runner }} needs: pip-labels strategy: fail-fast: false matrix: include: - runner: windows-latest pytag: win_amd64 arch: x64 - runner: macos-latest pytag: macosx_universal2 arch: x86_64;arm64 steps: - uses: actions/checkout@v3 - name: Cache LLVM build folder id: cache-llvm uses: actions/cache@v3.0.11 with: path: local-llvm key: llvmorg-${{ env.LLVM_VER }}-${{ runner.os }} - uses: ilammy/msvc-dev-cmd@v1 - uses: lukka/get-cmake@latest - uses: actions/checkout@v3 if: steps.cache-llvm.outputs.cache-hit != 'true' with: path: llvm-src repository: llvm/llvm-project ref: llvmorg-${{ env.LLVM_VER }} - name: Configure LLVM if: steps.cache-llvm.outputs.cache-hit != 'true' run: > cmake -G Ninja -S llvm-src/llvm -B llvm-build -DCMAKE_BUILD_TYPE=Release "-DCMAKE_OSX_ARCHITECTURES=arm64;x86_64" "-DLLVM_TARGETS_TO_BUILD=X86;ARM;NVPTX;AArch64;Hexagon;WebAssembly" "-DLLVM_ENABLE_PROJECTS=clang;lld" -DLLVM_ENABLE_ASSERTIONS=ON -DLLVM_ENABLE_RTTI=ON -DLLVM_ENABLE_EH=ON -DLLVM_ENABLE_LIBXML2=OFF -DLLVM_ENABLE_TERMINFO=OFF -DLLVM_ENABLE_ZSTD=OFF -DLLVM_ENABLE_ZLIB=OFF -DLLVM_ENABLE_OCAMLDOC=OFF -DLLVM_ENABLE_BINDINGS=OFF -DLLVM_ENABLE_IDE=OFF - name: Build LLVM if: steps.cache-llvm.outputs.cache-hit != 'true' run: cmake --build llvm-build - name: Install LLVM if: steps.cache-llvm.outputs.cache-hit != 'true' run: cmake --install llvm-build --prefix local-llvm # Remove the LLVM source tree after building it, otherwise we can # run out of local space while building halide - name: Clean LLVM Source if: steps.cache-llvm.outputs.cache-hit != 'true' shell: bash run: rm -rf llvm-src - name: Configure Halide if: runner.os == 'Windows' run: > cmake -G "Visual Studio 17 2022" -T ClangCL -A "${{ matrix.arch }}" -S . -B halide-build -DWITH_DOCS=NO -DWITH_PYTHON_BINDINGS=NO -DWITH_TESTS=NO -DWITH_TUTORIALS=NO -DWITH_UTILS=NO -DWITH_PYTHON_STUBS=NO -DLLVM_DIR=${{ github.workspace }}/local-llvm/lib/cmake/llvm - name: Configure Halide if: runner.os != 'Windows' run: > cmake -G Ninja -S . 
-B halide-build -DCMAKE_BUILD_TYPE=Release "-DCMAKE_OSX_ARCHITECTURES=${{ matrix.arch }}" -DWITH_DOCS=NO -DWITH_PYTHON_BINDINGS=NO -DWITH_TESTS=NO -DWITH_TUTORIALS=NO -DWITH_UTILS=NO -DWITH_PYTHON_STUBS=NO -DLLVM_DIR=${{ github.workspace }}/local-llvm/lib/cmake/llvm - name: Build Halide run: cmake --build halide-build --config Release - name: Install Halide run: cmake --install halide-build --config Release --prefix local-halide - name: Build wheels uses: pypa/cibuildwheel@v2.10.2 env: CMAKE_PREFIX_PATH: ${{ github.workspace }}/local-halide CIBW_BUILD: "cp38-${{ matrix.pytag }} cp39-${{ matrix.pytag }} cp310-${{ matrix.pytag }}" CIBW_CONFIG_SETTINGS: "--global-option=egg_info --global-option=-b.dev${{ needs.pip-labels.outputs.halide_pypi_label }} --global-option=--build-number --global-option=${{github.run_id}}" CIBW_ARCHS_MACOS: "universal2" - uses: actions/upload-artifact@v3 with: name: wheels path: ./wheelhouse/*.whl pip-sdist: name: Make SDist runs-on: ubuntu-latest needs: pip-labels steps: - uses: actions/checkout@v3 - run: pipx run build --sdist -C--global-option=egg_info -C--global-option=-b.dev${{ needs.pip-labels.outputs.halide_pypi_label }} - uses: actions/upload-artifact@v3 with: name: wheels path: dist/*.tar.gz publish: name: Publish on PyPI needs: [ pip-linux, pip-other, pip-sdist ] runs-on: ubuntu-latest steps: - uses: actions/download-artifact@v3 with: name: wheels path: dist - uses: pypa/gh-action-pypi-publish@v1.5.1 with: user: __token__ password: ${{ secrets.TEST_PYPI_TOKEN }} repository_url: https://test.pypi.org/legacy/ - uses: pypa/gh-action-pypi-publish@v1.5.1 if: github.event_name == 'release' && github.event.action == 'published' with: user: __token__ password: ${{ secrets.PYPI_TOKEN }} Halide-17.0.1/.github/workflows/presubmit.yml000066400000000000000000000103661456515664200211440ustar00rootroot00000000000000name: Halide Presubmit Checks on: # We don't want 'edited' (that's basically just the description, title, etc) # We don't want 'review_requested' (that's redundant to the ones below for our purposes) pull_request: types: [opened, synchronize, reopened] paths: - '**.h' - '**.c' - '**.cpp' permissions: contents: read jobs: check_clang_format: name: Check clang-format runs-on: ubuntu-20.04 steps: - uses: actions/checkout@v3 - uses: DoozyX/clang-format-lint-action@v0.16.2 with: source: '.' 
extensions: 'h,c,cpp' clangFormatVersion: 16 # As of Aug 2023, the macOS runners have more RAM (14GB vs 7GB) and CPU (3 cores vs 2) # than the Linux and Windows runners, so let's use those instead, since clang-tidy is # a bit of a sluggard check_clang_tidy: name: Check clang-tidy runs-on: ubuntu-20.04 steps: - uses: actions/checkout@v3 - name: Install clang-tidy run: | # from apt.llvm.org # wget -O - https://apt.llvm.org/llvm-snapshot.gpg.key | apt-key add - sudo apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 15CF4D18AF4F7421 sudo apt-add-repository "deb https://apt.llvm.org/$(lsb_release -sc)/ llvm-toolchain-$(lsb_release -sc)-16 main" sudo apt-get update sudo apt-get install llvm-16 clang-16 liblld-16-dev libclang-16-dev clang-tidy-16 ninja-build - name: Run clang-tidy run: | export CC=clang-16 export CXX=clang++-16 export CLANG_TIDY_LLVM_INSTALL_DIR=/usr/lib/llvm-16 export CMAKE_GENERATOR=Ninja ./run-clang-tidy.sh check_cmake_file_lists: name: Check CMake file lists runs-on: ubuntu-20.04 steps: - uses: actions/checkout@v3 - name: Run test sources check run: | shopt -s nullglob (cd test/autoschedulers/adams2019 && comm -23 <(ls *.{c,cpp} | sort) <(grep -P '^\s*#?\s*[A-Za-z0-9_.]+$' CMakeLists.txt | tr -d '# ' | sort) | tee missing_files && [ ! -s missing_files ]) (cd test/autoschedulers/anderson2021 && comm -23 <(ls *.{c,cpp} | sort) <(grep -P '^\s*#?\s*[A-Za-z0-9_.]+$' CMakeLists.txt | tr -d '# ' | sort) | tee missing_files && [ ! -s missing_files ]) (cd test/autoschedulers/li2018 && comm -23 <(ls *.{c,cpp} | sort) <(grep -P '^\s*#?\s*[A-Za-z0-9_.]+$' CMakeLists.txt | tr -d '# ' | sort) | tee missing_files && [ ! -s missing_files ]) (cd test/autoschedulers/mullapudi2016 && comm -23 <(ls *.{c,cpp} | sort) <(grep -P '^\s*#?\s*[A-Za-z0-9_.]+$' CMakeLists.txt | tr -d '# ' | sort) | tee missing_files && [ ! -s missing_files ]) (cd test/correctness && comm -23 <(ls *.{c,cpp} | sort) <(grep -P '^\s*#?\s*[A-Za-z0-9_.]+$' CMakeLists.txt | tr -d '# ' | sort) | tee missing_files && [ ! -s missing_files ]) (cd test/error && comm -23 <(ls *.{c,cpp} | sort) <(grep -P '^\s*#?\s*[A-Za-z0-9_.]+$' CMakeLists.txt | tr -d '# ' | sort) | tee missing_files && [ ! -s missing_files ]) (cd test/failing_with_issue && comm -23 <(ls *.{c,cpp} | sort) <(grep -P '^\s*#?\s*[A-Za-z0-9_.]+$' CMakeLists.txt | tr -d '# ' | sort) | tee missing_files && [ ! -s missing_files ]) (cd test/fuzz && comm -23 <(ls *.{c,cpp} | sort) <(grep -P '^\s*#?\s*[A-Za-z0-9_.]+$' CMakeLists.txt | tr -d '# ' | sort) | tee missing_files && [ ! -s missing_files ]) (cd test/generator && comm -23 <(ls *.{c,cpp} | sort) <(grep -P '^\s*#?\s*[A-Za-z0-9_.]+$' CMakeLists.txt | tr -d '# ' | sort) | tee missing_files && [ ! -s missing_files ]) (cd test/performance && comm -23 <(ls *.{c,cpp} | sort) <(grep -P '^\s*#?\s*[A-Za-z0-9_.]+$' CMakeLists.txt | tr -d '# ' | sort) | tee missing_files && [ ! -s missing_files ]) (cd test/runtime && comm -23 <(ls *.{c,cpp} | sort) <(grep -P '^\s*#?\s*[A-Za-z0-9_.]+$' CMakeLists.txt | tr -d '# ' | sort) | tee missing_files && [ ! -s missing_files ]) (cd test/warning && comm -23 <(ls *.{c,cpp} | sort) <(grep -P '^\s*#?\s*[A-Za-z0-9_.]+$' CMakeLists.txt | tr -d '# ' | sort) | tee missing_files && [ ! -s missing_files ]) Halide-17.0.1/.gitignore000066400000000000000000000116521456515664200150010ustar00rootroot00000000000000# NOTE: one can debug these rules with the following commands: # # $ git clean -ffdx # $ find . 
-not -path './.git/*' | git check-ignore --stdin --no-index # # The first command will delete all files that are ignored by Git (be warned!). # The second command will print all files that are checked in, but _would be_ # ignored under the rules in this file. Such files should either be explicitly # added to the exclusions at the bottom of this file, or the rule excluding them # should be refined. ################################################################################ ## Exclude files without extensions * !*.* !*/ ################################################################################ ## Halide-specific exclusions # Images only allowed in apps and directories named "images" *.png !apps/**/*.png !**/images/**/*.png # Pre-trained weights only allowed in autoscheduler directories *.weights !src/autoschedulers/**/*.weights ################################################################################ ## Halide-specific build artifacts # Apps apps/*/*.def apps/*/*.ptx apps/*/*.sass apps/*/*out*.png apps/*/filter apps/*/passes.txt apps/HelloAndroidGL/jni/halide_gl_filter.h # Autoschedulers **/src/autoschedulers/adams2019/baseline.cpp **/src/autoschedulers/adams2019/cost_model.h **/src/autoschedulers/adams2019/demo.h **/src/autoschedulers/adams2019/included_schedule_file.h **/src/autoschedulers/adams2019/train_cost_model.h **/src/autoschedulers/li2018/demo_gradient.h # CMake configuration Halide-*-deps.cmake # Distribution headers **/include/Halide*.h **/include/wasm-rt*.h # Generator executables *.generator # Generator outputs *.bc *.featurization *.halide_compiler_log *.halide_generated.cpp *.ll *.py.cpp *.pytorch.h *.registration.cpp *.s *.schedule.h *.stmt *.stmt.html *.stub.h # Linker scripts py_*.ldscript* # Runtime modules _initmod*.cpp # Tests **/python_bindings/correctness/generators/*.h **/test/generator/*.h compile_log.txt stderr.txt stdout.txt # Tutorials **/tutorial/auto_schedule_false.h **/tutorial/auto_schedule_true.h **/tutorial/brighten_either.h **/tutorial/brighten_interleaved.h **/tutorial/brighten_planar.h **/tutorial/brighten_specialized.h **/tutorial/lesson_10_halide.h **/tutorial/my_first_generator_win32.h **/tutorial/my_first_generator.h **/tutorial/my_second_generator_1.h **/tutorial/my_second_generator_2.h **/tutorial/my_second_generator_3.h # Tutorial images that were copied to the install tree **/tutorial/images/ !tutorial/images/ ################################################################################ ## Common build artifacts # Directories bin/ distrib/ lib/ lib64/ share/ # Binaries *.a *.cubin *.dll *.dylib *.exe *.lib *.o *.obj *.so *.so.* a.out # Compiler intermediates / debugging info *.[ip]db *.[pg]ch *.d *.dSYM # Package files *.deb *.tar.gz *.tgz *.zip ################################################################################ ## Temporary and swap files temp/ tmp/ .*.swp .\#* .DS_Store *.log *.tmp *.txt.user* *~ \#*\# ################################################################################ ## Python # Common virtual environment directory names .venv/ venv/ # Python binary caches __pycache__ *.py[cod] # Python package build artifacts *.egg-info/ *.whl MANIFEST.in dist/ ################################################################################ ## CMake # User-specific configuration files CMakeUserPresets.json # Common build directory names build*/ cmake[-_]build*/ # Generated config files *-config-version.cmake *-config.cmake *Config.cmake *ConfigVersion.cmake # Build directory contents _deps/ .cmake/ 
cmake_install.cmake CMakeCache.txt CMakeFiles/ compile_commands.json CPack*.cmake CTest*.cmake CTest*.txt install_manifest.txt # Ninja files *.ninja* ################################################################################ ## IDE directories and metadata # Visual Studio .vs/ out/ CMakeSettings.json # XCode *.xcworkspacedata tools/objc/*.mobileprovision tools/objc/BUILD xcuserdata # CLion .idea/ # VSCode .vscode/ # TextMate .tm_properties # Sublime Text .tags .tags_sorted_by_file *.sublime-* # Vim .clang_complete # NeoVim + clangd .cache # Emacs tags TAGS ################################################################################ ## Halide-specific rule overrides # Allow particular extension-less files !gradlew !Makefile !packaging/ubuntu/changelog !packaging/ubuntu/copyright !packaging/ubuntu/triggers # Allow XCode PCHs in the HelloiOS app !apps/HelloiOS/**/*-Prefix.pch # Allow the runtime to have handwritten LLVM modules !src/runtime/*.ll # Allow precompiled Nvidia bitcode !src/runtime/nvidia_libdevice_bitcode/*.bc # Anything goes in the hexagon_remote binaries !src/runtime/hexagon_remote/**/* # TODO: should this be checked in? !src/autoschedulers/adams2019/included_schedule_file.schedule.h # TODO: these should become .cmake.in !packaging/common/HalideConfig.cmake !packaging/common/HalideHelpersConfig.cmake Halide-17.0.1/.gitmodules000066400000000000000000000000001456515664200151470ustar00rootroot00000000000000Halide-17.0.1/CMakeLists.txt000066400000000000000000000151451456515664200155520ustar00rootroot00000000000000cmake_minimum_required(VERSION 3.22...3.23) project(Halide VERSION 17.0.1 DESCRIPTION "Halide compiler and libraries" HOMEPAGE_URL "https://halide-lang.org") enable_testing() ## # Set up project-wide properties ## # Import useful standard modules include(CMakeDependentOption) include(CheckCXXSymbolExists) # Make our custom helpers available throughout the project via include(). list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_LIST_DIR}/cmake) include(HalideGeneratorHelpers) # Build Halide as a shared lib by default, but still honor command-line settings. option(BUILD_SHARED_LIBS "Build shared libraries" ON) # Warn if the user did not set a build type and is using a single-configuration generator. get_property(IS_MULTI_CONFIG GLOBAL PROPERTY GENERATOR_IS_MULTI_CONFIG) if (NOT IS_MULTI_CONFIG AND NOT DEFINED CMAKE_BUILD_TYPE) message(WARNING "Single-configuration generators require CMAKE_BUILD_TYPE to be set.") endif () # Windows has file name length restrictions and lacks an RPATH mechanism. # We work around this by setting a path max and putting all exes / dlls in # the same output directory. if (CMAKE_SYSTEM_NAME MATCHES "Windows") set(CMAKE_OBJECT_PATH_MAX 260) set(CMAKE_RUNTIME_OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/bin") message(STATUS "Windows: setting CMAKE_OBJECT_PATH_MAX to ${CMAKE_OBJECT_PATH_MAX}") endif () # Export all symbols on Windows to match GCC/Clang behavior on Linux/macOS set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON) # Require standard C++17 set(CMAKE_CXX_STANDARD 17 CACHE STRING "The C++ standard to use. Halide requires 17 or higher.") option(CMAKE_CXX_STANDARD_REQUIRED "When enabled, the value of CMAKE_CXX_STANDARD is a requirement." ON) option(CMAKE_CXX_EXTENSIONS "When enabled, compiler-specific language extensions are enabled (e.g. 
-std=gnu++17)" OFF) if(CMAKE_CXX_STANDARD LESS 17) message(FATAL_ERROR "Halide requires C++17 or newer but CMAKE_CXX_STANDARD=${CMAKE_CXX_STANDARD}") endif() # Build Halide with ccache if the package is present option(Halide_CCACHE_BUILD "Set to ON for a ccache enabled build" OFF) mark_as_advanced(Halide_CCACHE_BUILD) if (Halide_CCACHE_BUILD) find_program(CCACHE_PROGRAM ccache REQUIRED) set(Halide_CCACHE_PARAMS CCACHE_CPP2=yes CCACHE_HASHDIR=yes CCACHE_SLOPPINESS=pch_defines,time_macros,include_file_mtime,include_file_ctime CACHE STRING "Parameters to pass through to ccache") mark_as_advanced(Halide_CCACHE_PARAMS) set(CMAKE_C_COMPILER_LAUNCHER ${CMAKE_COMMAND} -E env ${Halide_CCACHE_PARAMS} ${CCACHE_PROGRAM}) set(CMAKE_CXX_COMPILER_LAUNCHER ${CMAKE_COMMAND} -E env ${Halide_CCACHE_PARAMS} ${CCACHE_PROGRAM}) # Per https://ccache.dev/manual/latest.html#_precompiled_headers, # we must set -fno-pch-timestamp when using Clang + CCache + PCH if(CMAKE_C_COMPILER_ID MATCHES "Clang") string(APPEND CMAKE_C_FLAGS " -Xclang -fno-pch-timestamp") endif() if(CMAKE_CXX_COMPILER_ID MATCHES "Clang") string(APPEND CMAKE_CXX_FLAGS " -Xclang -fno-pch-timestamp") endif() message(STATUS "Enabling ccache usage for building.") endif () # Detect whether or not ASAN is enabled. Don't cache the result to ensure this # check happens every time we reconfigure. unset(Halide_ASAN_ENABLED CACHE) check_cxx_symbol_exists(HALIDE_INTERNAL_USING_ASAN "${Halide_SOURCE_DIR}/src/Util.h" Halide_ASAN_ENABLED) if (Halide_ASAN_ENABLED) set(Halide_ANY_SANITIZERS_ENABLED 1) else () set(Halide_ANY_SANITIZERS_ENABLED 0) endif () # Enable the SPIR-V target if requested (must declare before processing dependencies) option(TARGET_SPIRV "Include SPIR-V target" OFF) option(TARGET_VULKAN "Include Vulkan target" ON) if (TARGET_VULKAN) set(TARGET_SPIRV ON) # required endif() ## # Import dependencies ## ## Threads option(THREADS_PREFER_PTHREAD_FLAG "When enabled, prefer to use the -pthread flag to explicit linking" ON) find_package(Threads REQUIRED) ## Complex dependencies add_subdirectory(dependencies) ## Image formats # This changes how find_xxx() commands work; the default is to find frameworks before # standard libraries or headers, but this can be a problem on systems that have Mono # installed, as it has a framework with the libjpeg and libpng headers present -- so # CMake finds the headers from Mono but the libraries from Homebrew, and hilarity ensues. # Setting this to "last" means we always try the standard libraries before the frameworks. set(CMAKE_FIND_FRAMEWORK LAST) # TODO: these really belong in tools/, but CMake has a weird bug with $ # https://gitlab.kitware.com/cmake/cmake/-/issues/25033 find_package(JPEG) find_package(PNG) ## # Declare options ## # Declare these options after we include dependencies (since it declares Halide_ENABLE_RTTI etc) # but before we add any subdirectories, since any option you test before it is defined is # implicitly false the *first* time that the build file is processed, and there are some # out-of-order dependencies here (e.g, code in src/ eventually checks WITH_UTILS). # This is especially subtle since it means that some options can end up with different # values if you build a target as part of the initial CMake run, so (e.g.) a `make install` # from as totally clean build might neglect to install some pieces. 
option(WITH_TESTS "Build tests" "${PROJECT_IS_TOP_LEVEL}") option(WITH_TUTORIALS "Build tutorials" "${PROJECT_IS_TOP_LEVEL}") option(WITH_DOCS "Build documentation" OFF) option(WITH_UTILS "Build utils" "${PROJECT_IS_TOP_LEVEL}") cmake_dependent_option( WITH_PYTHON_BINDINGS "Build Python bindings" "${PROJECT_IS_TOP_LEVEL}" "Halide_ENABLE_RTTI AND Halide_ENABLE_EXCEPTIONS" OFF ) ## # Add source directories ## add_subdirectory(src) add_subdirectory(tools) ## # Add tests, tutorials, etc. if we're not being imported into another CMake project. ## if (WITH_TESTS) message(STATUS "Building tests enabled") add_subdirectory(test) else () message(STATUS "Building tests disabled") endif () if (WITH_PYTHON_BINDINGS) message(STATUS "Building Python bindings enabled") add_subdirectory(python_bindings) else () message(STATUS "Building Python bindings disabled") endif () if (WITH_TUTORIALS) message(STATUS "Building tutorials enabled") add_subdirectory(tutorial) else () message(STATUS "Building tutorials disabled") endif () if (WITH_DOCS) message(STATUS "Building docs enabled") add_subdirectory(doc) else () message(STATUS "Building docs disabled") endif () if (WITH_UTILS) message(STATUS "Building utils enabled") add_subdirectory(util) else () message(STATUS "Building utils disabled") endif () add_subdirectory(packaging) Halide-17.0.1/CMakePresets.json000066400000000000000000000154151456515664200162330ustar00rootroot00000000000000{ "version": 3, "cmakeMinimumRequired": { "major": 3, "minor": 22, "patch": 0 }, "configurePresets": [ { "name": "base", "hidden": true, "binaryDir": "build/${presetName}", "installDir": "install/${presetName}" }, { "name": "ci", "hidden": true, "inherits": "base", "toolchainFile": "${sourceDir}/cmake/toolchain.${presetName}.cmake", "cacheVariables": { "CMAKE_BUILD_TYPE": "RelWithDebInfo" } }, { "name": "windows-only", "hidden": true, "condition": { "type": "equals", "lhs": "${hostSystemName}", "rhs": "Windows" } }, { "name": "vcpkg", "hidden": true, "toolchainFile": "$env{VCPKG_ROOT}/scripts/buildsystems/vcpkg.cmake" }, { "name": "vs2022", "hidden": true, "inherits": [ "vcpkg", "windows-only" ], "generator": "Visual Studio 17 2022", "toolset": "host=x64" }, { "name": "debug", "inherits": "base", "displayName": "Debug", "description": "Debug build with no special settings", "cacheVariables": { "CMAKE_BUILD_TYPE": "Debug" } }, { "name": "release", "inherits": "base", "displayName": "Release", "description": "Release build with no special settings", "cacheVariables": { "CMAKE_BUILD_TYPE": "Release" } }, { "name": "debian-debug", "inherits": "debug", "displayName": "Debian (Debug)", "description": "Debug build assuming Debian-provided dependencies", "cacheVariables": { "Halide_SHARED_LLVM": "ON" } }, { "name": "debian-release", "inherits": "debian-debug", "displayName": "Debian (Release)", "description": "Release build assuming Debian-provided dependencies", "cacheVariables": { "CMAKE_BUILD_TYPE": "Release" } }, { "name": "win32", "inherits": [ "vs2022", "base" ], "displayName": "Win32 (Visual Studio)", "description": "Visual Studio-based Win32 build with vcpkg dependencies.", "architecture": "Win32" }, { "name": "win64", "inherits": [ "vs2022", "base" ], "displayName": "Win64 (Visual Studio)", "description": "Visual Studio-based x64 build with vcpkg dependencies.", "architecture": "x64" }, { "name": "package", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "Release", "LLVM_DIR": "$env{LLVM_DIR}", "Clang_DIR": "$env{Clang_DIR}", "LLD_DIR": "$env{LLD_DIR}", "WITH_TESTS": "NO", 
"WITH_TUTORIALS": "NO", "WITH_DOCS": "YES", "WITH_UTILS": "YES", "WITH_PYTHON_BINDINGS": "NO", "CMAKE_INSTALL_DATADIR": "share/Halide" } }, { "name": "package-windows", "inherits": [ "package", "vs2022" ], "displayName": "Package ZIP for Windows", "description": "Build for packaging Windows shared libraries.", "binaryDir": "${sourceDir}/build", "cacheVariables": { "BUILD_SHARED_LIBS": "YES", "CMAKE_INSTALL_BINDIR": "bin/$", "CMAKE_INSTALL_LIBDIR": "lib/$", "Halide_INSTALL_CMAKEDIR": "lib/cmake/Halide", "Halide_INSTALL_HELPERSDIR": "lib/cmake/HalideHelpers" } }, { "name": "package-unix-shared", "inherits": "package", "displayName": "Package UNIX shared libs", "description": "Build for packaging UNIX shared libraries.", "binaryDir": "shared-Release", "cacheVariables": { "BUILD_SHARED_LIBS": "YES" } }, { "name": "package-unix-static", "inherits": "package", "displayName": "Package UNIX static libs", "description": "Build for packaging UNIX static libraries.", "binaryDir": "static-Release", "cacheVariables": { "BUILD_SHARED_LIBS": "NO", "Halide_BUNDLE_LLVM": "YES" } }, { "name": "linux-x64-asan", "inherits": "ci", "displayName": "ASAN (Linux x64)", "description": "Build everything with ASAN enabled", "cacheVariables": { "LLVM_ROOT": "$penv{LLVM_ROOT}" } }, { "name": "linux-x64-fuzzer", "inherits": "ci", "displayName": "Fuzzer (Linux x64)", "description": "Build everything with fuzzing enabled", "cacheVariables": { "LLVM_ROOT": "$penv{LLVM_ROOT}", "TARGET_WEBASSEMBLY": "NO", "WITH_TUTORIALS": "NO", "WITH_UTILS": "NO", "WITH_PYTHON_BINDINGS": "NO", "WITH_TESTS": "YES", "WITH_TEST_AUTO_SCHEDULE": "NO", "WITH_TEST_CORRECTNESS": "NO", "WITH_TEST_ERROR": "NO", "WITH_TEST_WARNING": "NO", "WITH_TEST_PERFORMANCE": "NO", "WITH_TEST_RUNTIME": "NO", "WITH_TEST_GENERATOR": "NO", "WITH_TEST_FUZZ": "YES", "BUILD_SHARED_LIBS": "NO" } } ], "buildPresets": [ { "name": "debug", "configurePreset": "debug", "displayName": "Debug", "description": "Debug build with no special settings" }, { "name": "release", "configurePreset": "release", "displayName": "Release", "description": "Release build with no special settings" }, { "name": "linux-x64-asan", "configurePreset": "linux-x64-asan", "displayName": "ASAN (Linux x64)", "description": "Build everything with ASAN enabled" }, { "name": "linux-x64-fuzzer", "configurePreset": "linux-x64-fuzzer", "displayName": "Fuzzing (Linux x64)", "description": "Build everything with fuzzing enabled" } ], "testPresets": [ { "name": "debug", "configurePreset": "debug", "displayName": "Debug", "description": "Test everything with Debug build", "output": { "outputOnFailure": true } }, { "name": "release", "configurePreset": "release", "displayName": "Release", "description": "Test everything with Release build", "output": { "outputOnFailure": true } }, { "name": "linux-x64-asan", "configurePreset": "linux-x64-asan", "displayName": "ASAN (Linux x64)", "description": "Test everything with ASAN enabled", "environment": { "ASAN_OPTIONS": "detect_leaks=0:detect_container_overflow=0" }, "output": { "outputOnFailure": true } }, { "name": "linux-x64-fuzzer", "configurePreset": "linux-x64-fuzzer", "displayName": "Fuzzing (Linux x64)", "description": "Test everything with fuzzing enabled", "output": { "outputOnFailure": true } } ] } Halide-17.0.1/CODE_OF_CONDUCT.md000066400000000000000000000067501456515664200156130ustar00rootroot00000000000000The Halide community has always worked to be a welcoming and respectful community, and we want to ensure that doesn’t change as we grow and evolve. 
To that end, we have a few ground rules that we ask people to adhere to: - **Be friendly and patient.** - **Be welcoming.** We strive to be a community that welcomes and supports people of all backgrounds and identities. This includes, but is not limited to members of any race, ethnicity, culture, national origin, colour, immigration status, social and economic class, educational level, sex, sexual orientation, gender identity and expression, age, size, family status, political belief, religion, and mental and physical ability. - **Be considerate.** Your work will be used by other people, and you in turn will depend on the work of others. Any decision you take will affect users and colleagues, and you should take those consequences into account when making decisions. Remember that we're a world-wide community, so you might not be communicating in someone else's primary language. - **Be respectful.** Not all of us will agree all the time, but disagreement is no excuse for poor behavior and poor manners. We might all experience some frustration now and then, but we cannot allow that frustration to turn into a personal attack. It’s important to remember that a community where people feel uncomfortable or threatened is not a productive one. Members of the Halide community should be respectful when dealing with other members as well as with people outside the Halide community. - **Be careful in the words that you choose.** We are a community of professionals, and we conduct ourselves professionally. Be kind to others. Do not insult or put down other participants. Harassment and other exclusionary behavior aren't acceptable. This includes, but is not limited to: - Violent threats or language directed against another person. - Discriminatory jokes and language. - Posting sexually explicit or violent material. - Posting (or threatening to post) other people's personally identifying information ("doxing"). - Personal insults, especially those using racist or sexist terms. - Unwelcome sexual attention. - Advocating for, or encouraging, any of the above behavior. - Repeated harassment of others. In general, if someone asks you to stop, then stop. - **When we disagree, try to understand why.** Disagreements, both social and technical, happen all the time and Halide is no exception. It is important that we resolve disagreements and differing views constructively. Being unable to understand why someone holds a viewpoint doesn't mean that they’re wrong. Don’t forget that it is human to err and blaming each other doesn't get us anywhere. Instead, focus on helping to resolve issues and learning from mistakes. - **Give credit where it's due.** If you use code or ideas from other people, projects, or publications, say so. Add a comment in the source code at the point where the idea is used. If adapting code, this requirement is above and beyond any requirements placed on you by the license of the original code. We all like recognition for our work. To that end... **Acknowledgements.** This code of conduct is a mix of [LLVM's](https://llvm.org/docs/CodeOfConduct.html) and [Django's](https://www.djangoproject.com/conduct/), which both ultimately derive from the code of conduct from the [Speak Up!](http://web.archive.org/web/20141109123859/http://speakup.io/coc.html) project. Halide-17.0.1/LICENSE.txt000066400000000000000000000345731456515664200146430ustar00rootroot00000000000000Copyright (c) 2012-2020 MIT CSAIL, Google, Facebook, Adobe, NVIDIA CORPORATION, and other contributors. 
Developed by: The Halide team http://halide-lang.org Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ----- apps/bgu is Copyright 2016 Google Inc. and is Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. 
"Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. 
You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. 
In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS ----- apps/support/cmdline.h is Copyright (c) 2009, Hideyuki Tanaka and is licensed under the BSD 3-Clause license. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of the nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY ''AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ---- dependencies/spirv is Copyright (c) 2014-2018 The Khronos Group Inc. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and/or associated documentation files (the "Materials"), to deal in the Materials without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Materials, and to permit persons to whom the Materials are furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Materials. MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS KHRONOS STANDARDS. 
THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT https://www.khronos.org/registry/ THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,OUT OF OR IN CONNECTION WITH THE MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. ---- src/mini_vulkan.h is Copyright (c) 2014-2017 The Khronos Group Inc. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ---- apps/linear_algebra/include/cblas.h is licensed under the BLAS license. The reference BLAS is a freely-available software package. It is available from netlib via anonymous ftp and the World Wide Web. Thus, it can be included in commercial software packages (and has been). We only ask that proper credit be given to the authors. Like all software, it is copyrighted. It is not trademarked, but we do ask the following: If you modify the source for these routines we ask that you change the name of the routine and comment the changes made to the original. We will gladly answer any questions regarding the software. If a modification is done, however, it is the responsibility of the person who modified the routine to provide support. Halide-17.0.1/MANIFEST.in000066400000000000000000000002371456515664200145440ustar00rootroot00000000000000graft python_bindings prune python_bindings/apps prune python_bindings/test prune python_bindings/tutorial prune python_bindings/stub include README_python.md Halide-17.0.1/Makefile000066400000000000000000003241341456515664200144530ustar00rootroot00000000000000# 'make' builds libHalide.a, the internal test suite, and runs the internal test suite # 'make run_tests' builds and runs all the end-to-end tests in the test subdirectory # 'make {error,performance}_foo' builds and runs test/{...}/foo.cpp for any # c_source file in the corresponding subdirectory of the test folder # 'make correctness_foo' builds and runs test/correctness/foo.cpp for any # c_source file in the correctness/ subdirectory of the test folder # 'make test_apps' checks some of the apps build and run (but does not check their output) # 'make time_compilation_tests' records the compile time for each test module into a csv file. # For correctness and performance tests this include halide build time and run time. For # the tests in test/generator/ this times only the halide build time. # Disable built-in makefile rules for all apps to avoid pointless file-system # scanning and general weirdness resulting from implicit rules. MAKEFLAGS += --no-builtin-rules .SUFFIXES: UNAME = $(shell uname) ifeq ($(OS), Windows_NT) $(error Halide no longer supports the MinGW environment. Please use MSVC through CMake instead.) 
else # let's assume "normal" UNIX such as linux COMMON_LD_FLAGS=$(LDFLAGS) -ldl -lpthread -lz FPIC=-fPIC ifeq ($(UNAME), Darwin) SHARED_EXT=dylib else SHARED_EXT=so endif endif # We want to build Halide plugins as .so on all posixy systems, including OSX. # This is called out as a named var to make it clear that the use # is deliberate, not an accident. PLUGIN_EXT=so ifeq ($(UNAME), Darwin) # Anything that we us install_name_tool on needs these linker flags # to ensure there is enough padding for install_name_tool to use INSTALL_NAME_TOOL_LD_FLAGS=-Wl,-headerpad_max_install_names else INSTALL_NAME_TOOL_LD_FLAGS= endif ifeq ($(UNAME), Darwin) define alwayslink -Wl,-force_load,$(1) endef else define alwayslink -Wl,--whole-archive $(1) -Wl,-no-whole-archive endef endif SHELL = bash CXX ?= g++ PREFIX ?= /usr/local LLVM_CONFIG ?= llvm-config LLVM_COMPONENTS= $(shell $(LLVM_CONFIG) --components) LLVM_VERSION = $(shell $(LLVM_CONFIG) --version | sed 's/\([0-9][0-9]*\)\.\([0-9]\).*/\1.\2/') LLVM_FULL_VERSION = $(shell $(LLVM_CONFIG) --version) LLVM_BINDIR = $(shell $(LLVM_CONFIG) --bindir | sed -e 's/\\/\//g' -e 's/\([a-zA-Z]\):/\/\1/g') LLVM_LIBDIR = $(shell $(LLVM_CONFIG) --libdir | sed -e 's/\\/\//g' -e 's/\([a-zA-Z]\):/\/\1/g') # Apparently there is no llvm_config flag to get canonical paths to tools, # so we'll just construct one relative to --src-root and hope that is stable everywhere. LLVM_SYSTEM_LIBS=$(shell ${LLVM_CONFIG} --system-libs --link-static | sed -e 's/[\/&]/\\&/g' | sed 's/-llibxml2.tbd/-lxml2/') LLVM_AS = $(LLVM_BINDIR)/llvm-as LLVM_NM = $(LLVM_BINDIR)/llvm-nm # Note, removing -D_GLIBCXX_ASSERTIONS is a workaround for https://reviews.llvm.org/D142279 LLVM_CXX_FLAGS = -std=c++17 $(filter-out -O% -g -fomit-frame-pointer -pedantic -W% -W, $(shell $(LLVM_CONFIG) --cxxflags | sed -e 's/ -D_GLIBCXX_ASSERTIONS / /g' -e 's/\\/\//g' -e 's/\([a-zA-Z]\):/\/\1/g;s/-D/ -D/g;s/-O/ -O/;s/c++14/c++17/g')) OPTIMIZE ?= -O3 OPTIMIZE_FOR_BUILD_TIME ?= -O0 CLANG ?= $(LLVM_BINDIR)/clang CLANG_VERSION = $(shell $(CLANG) --version) SANITIZER_FLAGS ?= # TODO: this is suboptimal hackery; we should really add the relevant # support libs for the sanitizer(s) as weak symbols in Codegen_LLVM. # (Note also that, in general, most Sanitizers work most reliably with an all-Clang # build system.) ifneq (,$(findstring tsan,$(HL_TARGET)$(HL_JIT_TARGET))) # Note that attempting to use TSAN with the JIT can produce false positives # if libHalide is not also compiled with TSAN enabled; we tack the relevant # flag onto OPTIMIZE here, but that's really only effective if you ensure # to do a clean build before testing. (In general, most of the Sanitizers # only work well when used in a completely clean environment.) OPTIMIZE += -fsanitize=thread SANITIZER_FLAGS += -fsanitize=thread endif ifneq (,$(findstring asan,$(HL_TARGET)$(HL_JIT_TARGET))) OPTIMIZE += -fsanitize=address SANITIZER_FLAGS += -fsanitize=address endif COMMON_LD_FLAGS += $(SANITIZER_FLAGS) LLVM_VERSION_TIMES_10 = $(shell $(LLVM_CONFIG) --version | sed 's/\([0-9][0-9]*\)\.\([0-9]\).*/\1\2/') LLVM_CXX_FLAGS += -DLLVM_VERSION=$(LLVM_VERSION_TIMES_10) # All WITH_* flags are either empty or not-empty. They do not behave # like true/false values in most languages. To turn one off, either # edit this file, add "WITH_FOO=" (no assigned value) to the make # line, or define an environment variable WITH_FOO that has an empty # value. 
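#
# For example (illustrative only; the same pattern applies to any WITH_* flag
# defined below), a build that disables the Hexagon backend and enables
# exceptions could be invoked as:
#   make WITH_HEXAGON= WITH_EXCEPTIONS=not-empty
# or equivalently, using an empty environment variable for the flag being
# turned off:
#   WITH_HEXAGON= WITH_EXCEPTIONS=not-empty make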
WITH_X86 ?= $(findstring x86, $(LLVM_COMPONENTS)) WITH_ARM ?= $(findstring arm, $(LLVM_COMPONENTS)) WITH_HEXAGON ?= $(findstring hexagon, $(LLVM_COMPONENTS)) ifeq ($(shell test $(LLVM_VERSION_TIMES_10) -ge 170; echo $$?),0) WITH_RISCV ?= $(findstring riscv, $(LLVM_COMPONENTS)) else # leave WITH_RISCV undefined endif WITH_AARCH64 ?= $(findstring aarch64, $(LLVM_COMPONENTS)) WITH_POWERPC ?= $(findstring powerpc, $(LLVM_COMPONENTS)) WITH_NVPTX ?= $(findstring nvptx, $(LLVM_COMPONENTS)) WITH_WEBASSEMBLY ?= $(findstring webassembly, $(LLVM_COMPONENTS)) # AMDGPU target is WIP WITH_AMDGPU ?= $(findstring amdgpu, $(LLVM_COMPONENTS)) WITH_OPENCL ?= not-empty WITH_METAL ?= not-empty WITH_OPENGLCOMPUTE ?= not-empty WITH_D3D12 ?= not-empty WITH_VULKAN ?= not-empty WITH_SPIRV ?= not-empty WITH_WEBGPU ?= not-empty WITH_INTROSPECTION ?= not-empty WITH_EXCEPTIONS ?= WITH_LLVM_INSIDE_SHARED_LIBHALIDE ?= not-empty # If HL_TARGET or HL_JIT_TARGET aren't set, use host HL_TARGET ?= host HL_JIT_TARGET ?= host X86_CXX_FLAGS=$(if $(WITH_X86), -DWITH_X86, ) X86_LLVM_CONFIG_LIB=$(if $(WITH_X86), x86, ) ARM_CXX_FLAGS=$(if $(WITH_ARM), -DWITH_ARM, ) ARM_LLVM_CONFIG_LIB=$(if $(WITH_ARM), arm, ) POWERPC_CXX_FLAGS=$(if $(WITH_POWERPC), -DWITH_POWERPC, ) POWERPC_LLVM_CONFIG_LIB=$(if $(WITH_POWERPC), powerpc, ) PTX_CXX_FLAGS=$(if $(WITH_NVPTX), -DWITH_NVPTX, ) PTX_LLVM_CONFIG_LIB=$(if $(WITH_NVPTX), nvptx, ) PTX_DEVICE_INITIAL_MODULES=$(if $(WITH_NVPTX), libdevice.compute_20.10.bc libdevice.compute_30.10.bc libdevice.compute_35.10.bc, ) AMDGPU_CXX_FLAGS=$(if $(WITH_AMDGPU), -DWITH_AMDGPU, ) AMDGPU_LLVM_CONFIG_LIB=$(if $(WITH_AMDGPU), amdgpu, ) # TODO add bitcode files OPENCL_CXX_FLAGS=$(if $(WITH_OPENCL), -DWITH_OPENCL, ) OPENCL_LLVM_CONFIG_LIB=$(if $(WITH_OPENCL), , ) METAL_CXX_FLAGS=$(if $(WITH_METAL), -DWITH_METAL, ) METAL_LLVM_CONFIG_LIB=$(if $(WITH_METAL), , ) OPENGLCOMPUTE_CXX_FLAGS=$(if $(WITH_OPENGLCOMPUTE), -DWITH_OPENGLCOMPUTE, ) D3D12_CXX_FLAGS=$(if $(WITH_D3D12), -DWITH_D3D12, ) D3D12_LLVM_CONFIG_LIB=$(if $(WITH_D3D12), , ) WEBGPU_CXX_FLAGS=$(if $(WITH_WEBGPU), -DWITH_WEBGPU, ) AARCH64_CXX_FLAGS=$(if $(WITH_AARCH64), -DWITH_AARCH64, ) AARCH64_LLVM_CONFIG_LIB=$(if $(WITH_AARCH64), aarch64, ) RISCV_CXX_FLAGS=$(if $(WITH_RISCV), -DWITH_RISCV, ) RISCV_LLVM_CONFIG_LIB=$(if $(WITH_RISCV), riscv, ) INTROSPECTION_CXX_FLAGS=$(if $(WITH_INTROSPECTION), -DWITH_INTROSPECTION, ) EXCEPTIONS_CXX_FLAGS=$(if $(WITH_EXCEPTIONS), -DHALIDE_WITH_EXCEPTIONS -fexceptions, ) HEXAGON_CXX_FLAGS=$(if $(WITH_HEXAGON), -DWITH_HEXAGON, ) HEXAGON_LLVM_CONFIG_LIB=$(if $(WITH_HEXAGON), hexagon, ) SPIRV_CXX_FLAGS=$(if $(WITH_SPIRV), -DWITH_SPIRV -isystem $(ROOT_DIR)/dependencies/spirv/include, ) SPIRV_LLVM_CONFIG_LIB=$(if $(WITH_SPIRV), , ) VULKAN_CXX_FLAGS=$(if $(WITH_VULKAN), -DWITH_VULKAN, ) VULKAN_LLVM_CONFIG_LIB=$(if $(WITH_VULKAN), , ) WEBASSEMBLY_CXX_FLAGS=$(if $(WITH_WEBASSEMBLY), -DWITH_WEBASSEMBLY, ) WEBASSEMBLY_LLVM_CONFIG_LIB=$(if $(WITH_WEBASSEMBLY), webassembly, ) LLVM_HAS_NO_RTTI = $(findstring -fno-rtti, $(LLVM_CXX_FLAGS)) WITH_RTTI ?= $(if $(LLVM_HAS_NO_RTTI),, not-empty) RTTI_CXX_FLAGS=$(if $(WITH_RTTI), , -fno-rtti ) CXX_VERSION = $(shell $(CXX) --version | head -n1) CXX_WARNING_FLAGS = -Wall -Werror -Wno-unused-function -Wcast-qual -Wignored-qualifiers -Wno-comment -Wsign-compare -Wno-unknown-warning-option -Wno-psabi -Wno-mismatched-new-delete ifneq (,$(findstring g++,$(CXX_VERSION))) GCC_MAJOR_VERSION := $(shell $(CXX) -dumpfullversion -dumpversion | cut -f1 -d.) 
GCC_MINOR_VERSION := $(shell $(CXX) -dumpfullversion -dumpversion | cut -f2 -d.) ifeq (1,$(shell expr $(GCC_MAJOR_VERSION) \> 5 \| $(GCC_MAJOR_VERSION) = 5 \& $(GCC_MINOR_VERSION) \>= 1)) CXX_WARNING_FLAGS += -Wsuggest-override endif endif ifneq (,$(findstring clang,$(CXX_VERSION))) LLVM_CXX_FLAGS_LIBCPP := $(findstring -stdlib=libc++, $(LLVM_CXX_FLAGS)) endif CXX_FLAGS = $(CXXFLAGS) $(CXX_WARNING_FLAGS) $(RTTI_CXX_FLAGS) -Woverloaded-virtual $(FPIC) $(OPTIMIZE) -fno-omit-frame-pointer -DCOMPILING_HALIDE CXX_FLAGS += $(LLVM_CXX_FLAGS) CXX_FLAGS += $(PTX_CXX_FLAGS) CXX_FLAGS += $(ARM_CXX_FLAGS) CXX_FLAGS += $(HEXAGON_CXX_FLAGS) CXX_FLAGS += $(AARCH64_CXX_FLAGS) CXX_FLAGS += $(X86_CXX_FLAGS) CXX_FLAGS += $(OPENCL_CXX_FLAGS) CXX_FLAGS += $(METAL_CXX_FLAGS) CXX_FLAGS += $(OPENGLCOMPUTE_CXX_FLAGS) CXX_FLAGS += $(D3D12_CXX_FLAGS) CXX_FLAGS += $(WEBGPU_CXX_FLAGS) CXX_FLAGS += $(POWERPC_CXX_FLAGS) CXX_FLAGS += $(INTROSPECTION_CXX_FLAGS) CXX_FLAGS += $(EXCEPTIONS_CXX_FLAGS) CXX_FLAGS += $(AMDGPU_CXX_FLAGS) CXX_FLAGS += $(RISCV_CXX_FLAGS) CXX_FLAGS += $(SPIRV_CXX_FLAGS) CXX_FLAGS += $(VULKAN_CXX_FLAGS) CXX_FLAGS += $(WEBASSEMBLY_CXX_FLAGS) # Serialization requires flatc and flatbuffers.h # On ubuntu, this requires packages flatbuffers-compiler and libflatbuffers-dev ifneq (,$(shell which flatc)) CXX_FLAGS += -DWITH_SERIALIZATION -I $(BUILD_DIR) -I $(shell which flatc | sed 's/bin.flatc/include/') # Note: if updating here, be sure to update in CMakeLists.txt as well HALIDE_SERIALIZATION_VERSION_MINOR ?= 1 HALIDE_SERIALIZATION_VERSION_PATCH ?= 0 endif # This is required on some hosts like powerpc64le-linux-gnu because we may build # everything with -fno-exceptions. Without -funwind-tables, libHalide.so fails # to propagate exceptions and causes a test failure. CXX_FLAGS += -funwind-tables print-%: @echo '$*=$($*)' LLVM_STATIC_LIBFILES = \ bitwriter \ bitreader \ linker \ ipo \ passes \ orcjit \ $(X86_LLVM_CONFIG_LIB) \ $(ARM_LLVM_CONFIG_LIB) \ $(OPENCL_LLVM_CONFIG_LIB) \ $(METAL_LLVM_CONFIG_LIB) \ $(PTX_LLVM_CONFIG_LIB) \ $(AARCH64_LLVM_CONFIG_LIB) \ $(POWERPC_LLVM_CONFIG_LIB) \ $(HEXAGON_LLVM_CONFIG_LIB) \ $(AMDGPU_LLVM_CONFIG_LIB) \ $(SPIRV_LLVM_CONFIG_LIB) \ $(VULKAN_LLVM_CONFIG_LIB) \ $(WEBASSEMBLY_LLVM_CONFIG_LIB) \ $(RISCV_LLVM_CONFIG_LIB) LLVM_STATIC_LIBS = -L $(LLVM_LIBDIR) $(shell $(LLVM_CONFIG) --link-static --libfiles $(LLVM_STATIC_LIBFILES) | sed -e 's/\\/\//g' -e 's/\([a-zA-Z]\):/\/\1/g') # Add a rpath to the llvm used for linking, in case multiple llvms are # installed. Bakes a path on the build system into the .so, so don't # use this config for distributions. LLVM_SHARED_LIBS = -Wl,-rpath=$(LLVM_LIBDIR) -L $(LLVM_LIBDIR) -lLLVM LLVM_LIBS_FOR_SHARED_LIBHALIDE=$(if $(WITH_LLVM_INSIDE_SHARED_LIBHALIDE),$(LLVM_STATIC_LIBS),$(LLVM_SHARED_LIBS)) TUTORIAL_CXX_FLAGS ?= -std=c++17 -g -fno-omit-frame-pointer $(RTTI_CXX_FLAGS) -I $(ROOT_DIR)/tools $(SANITIZER_FLAGS) $(LLVM_CXX_FLAGS_LIBCPP) # The tutorials contain example code with warnings that we don't want # to be flagged as errors, so the test flags are the tutorial flags # plus our warning flags. # Also allow tests, via conditional compilation, to use the entire # capability of the CPU being compiled on via -march=native. This # presumes tests are run on the same machine they are compiled on. 
TEST_CXX_FLAGS ?= $(TUTORIAL_CXX_FLAGS) $(CXX_WARNING_FLAGS) TEST_LD_FLAGS = -L$(BIN_DIR) -lHalide $(COMMON_LD_FLAGS) # In the tests, some of our expectations change depending on the llvm version TEST_CXX_FLAGS += -DLLVM_VERSION=$(LLVM_VERSION_TIMES_10) # In the tests, default to exporting no symbols that aren't explicitly exported TEST_CXX_FLAGS += -fvisibility=hidden -fvisibility-inlines-hidden # gcc 4.8 fires a bogus warning on old versions of png.h ifneq (,$(findstring g++,$(CXX_VERSION))) ifneq (,$(findstring 4.8,$(CXX_VERSION))) TEST_CXX_FLAGS += -Wno-literal-suffix endif endif ifeq ($(UNAME), Linux) TEST_LD_FLAGS += -rdynamic -Wl,--rpath=$(CURDIR)/$(BIN_DIR) endif ifeq ($(WITH_LLVM_INSIDE_SHARED_LIBHALIDE), ) TEST_LD_FLAGS += -Wl,--rpath=$(LLVM_LIBDIR) endif ifneq ($(WITH_NVPTX), ) ifneq (,$(findstring ptx,$(HL_TARGET))) TEST_CUDA = 1 endif ifneq (,$(findstring cuda,$(HL_TARGET))) TEST_CUDA = 1 endif endif ifneq ($(WITH_OPENCL), ) ifneq (,$(findstring opencl,$(HL_TARGET))) TEST_OPENCL = 1 endif endif ifneq ($(WITH_METAL), ) ifneq (,$(findstring metal,$(HL_TARGET))) TEST_METAL = 1 endif endif ifneq ($(WITH_VULKAN), ) ifneq (,$(findstring vulkan,$(HL_TARGET))) TEST_VULKAN = 1 endif endif ifeq ($(UNAME), Linux) ifneq ($(TEST_CUDA), ) CUDA_LD_FLAGS ?= -L/usr/lib/nvidia-current -lcuda endif ifneq ($(TEST_OPENCL), ) OPENCL_LD_FLAGS ?= -lOpenCL endif ifneq ($(TEST_VULKAN), ) VULKAN_LD_FLAGS ?= -lvulkan endif OPENGL_LD_FLAGS ?= -lGL HOST_OS=linux endif ifeq ($(UNAME), Darwin) # Someone with an osx box with cuda installed please fix the line below ifneq ($(TEST_CUDA), ) CUDA_LD_FLAGS ?= -L/usr/local/cuda/lib -lcuda endif ifneq ($(TEST_OPENCL), ) OPENCL_LD_FLAGS ?= -framework OpenCL endif ifneq ($(TEST_VULKAN), ) # The Vulkan loader is distributed as a dylib on OSX (not a framework) VULKAN_LD_FLAGS ?= -lvulkan endif ifneq ($(TEST_METAL), ) METAL_LD_FLAGS ?= -framework Metal -framework Foundation endif OPENGL_LD_FLAGS ?= -framework OpenGL HOST_OS=os_x endif ifneq ($(TEST_OPENCL), ) TEST_CXX_FLAGS += -DTEST_OPENCL endif ifneq ($(TEST_VULKAN), ) TEST_CXX_FLAGS += -DTEST_VULKAN endif ifneq ($(TEST_METAL), ) # Using Metal APIs requires writing Objective-C++ (or Swift). Add ObjC++ # to allow tests to create and destroy Metal contexts, etc. This requires # tests to be valid Objective-C++, e.g. avoiding using the identifier "id" # in certain ways. In practice this is not enough of a problem to justify # the work to limit which files are compiled this way. TEST_CXX_FLAGS += -DTEST_METAL -ObjC++ -Werror,-Wunused-command-line-argument endif ifneq ($(TEST_CUDA), ) TEST_CXX_FLAGS += -DTEST_CUDA TEST_CXX_FLAGS += -I/usr/local/cuda/include endif # Compiling the tutorials requires libpng LIBPNG_LIBS_DEFAULT = $(shell libpng-config --ldflags) LIBPNG_CXX_FLAGS ?= $(shell libpng-config --cflags) # Workaround for libpng-config pointing to 64-bit versions on linux even when we're building for 32-bit ifneq (,$(findstring -m32,$(CXX))) ifneq (,$(findstring x86_64,$(LIBPNG_LIBS_DEFAULT))) LIBPNG_LIBS ?= -lpng endif endif LIBPNG_LIBS ?= $(LIBPNG_LIBS_DEFAULT) # Workaround brew Cellar path for libpng-config output. LIBJPEG_LINKER_PATH ?= $(shell echo $(LIBPNG_LIBS_DEFAULT) | sed -e'/-L.*[/][Cc]ellar[/]libpng/!d;s=\(.*\)/[Cc]ellar/libpng/.*=\1/lib=') LIBJPEG_LIBS ?= $(LIBJPEG_LINKER_PATH) -ljpeg # There's no libjpeg-config, unfortunately. We should look for # jpeglib.h one directory level up from png.h . Also handle # Mac OS brew installs where libpng-config returns paths # into the PNG cellar. 
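# For example (illustrative, assuming a typical Homebrew layout): an include flag such as
#   -I/usr/local/Cellar/libpng/<version>/include/libpng16
# is rewritten by the sed expression below to -I/usr/local/include, while a non-Cellar flag such as
#   -I/usr/include/libpng16
# simply gets /.. appended, so jpeglib.h is searched for one directory above png.h.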
LIBPNG_INCLUDE_DIRS = $(filter -I%,$(LIBPNG_CXX_FLAGS)) LIBJPEG_CXX_FLAGS ?= $(shell echo $(LIBPNG_INCLUDE_DIRS) | sed -e'/[Cc]ellar[/]libpng/!s=\(.*\)=\1/..=;s=\(.*\)/[Cc]ellar/libpng/.*=\1/include=') IMAGE_IO_LIBS = $(LIBPNG_LIBS) $(LIBJPEG_LIBS) IMAGE_IO_CXX_FLAGS = $(LIBPNG_CXX_FLAGS) $(LIBJPEG_CXX_FLAGS) # We're building into the current directory $(CURDIR). Find the Halide # repo root directory (the location of the makefile) THIS_MAKEFILE = $(realpath $(filter %Makefile, $(MAKEFILE_LIST))) ROOT_DIR = $(strip $(shell dirname $(THIS_MAKEFILE))) SRC_DIR = $(ROOT_DIR)/src TARGET=$(if $(HL_TARGET),$(HL_TARGET),host) # The following directories are all relative to the output directory (i.e. $(CURDIR), not $(SRC_DIR)) LIB_DIR = lib BIN_DIR = bin DISTRIB_DIR = distrib INCLUDE_DIR = include SHARE_DIR = share DOC_DIR = $(SHARE_DIR)/doc/Halide BUILD_DIR = $(BIN_DIR)/build FILTERS_DIR = $(BIN_DIR)/$(TARGET)/build TMP_DIR = $(BUILD_DIR)/tmp HEXAGON_RUNTIME_LIBS_DIR = src/runtime/hexagon_remote/bin HEXAGON_RUNTIME_LIBS = \ $(HEXAGON_RUNTIME_LIBS_DIR)/arm-32-android/libhalide_hexagon_host.so \ $(HEXAGON_RUNTIME_LIBS_DIR)/arm-64-android/libhalide_hexagon_host.so \ $(HEXAGON_RUNTIME_LIBS_DIR)/host/libhalide_hexagon_host.so \ $(HEXAGON_RUNTIME_LIBS_DIR)/v65/hexagon_sim_remote \ $(HEXAGON_RUNTIME_LIBS_DIR)/v65/libhalide_hexagon_remote_skel.so \ $(HEXAGON_RUNTIME_LIBS_DIR)/v65/signed_by_debug/libhalide_hexagon_remote_skel.so # Keep this list sorted in alphabetical order. SOURCE_FILES = \ AbstractGenerator.cpp \ AddAtomicMutex.cpp \ AddImageChecks.cpp \ AddParameterChecks.cpp \ AlignLoads.cpp \ AllocationBoundsInference.cpp \ ApplySplit.cpp \ Argument.cpp \ AssociativeOpsTable.cpp \ Associativity.cpp \ AsyncProducers.cpp \ AutoScheduleUtils.cpp \ BoundaryConditions.cpp \ Bounds.cpp \ BoundsInference.cpp \ BoundConstantExtentLoops.cpp \ BoundSmallAllocations.cpp \ Buffer.cpp \ Callable.cpp \ CanonicalizeGPUVars.cpp \ Closure.cpp \ ClampUnsafeAccesses.cpp \ CodeGen_ARM.cpp \ CodeGen_C.cpp \ CodeGen_D3D12Compute_Dev.cpp \ CodeGen_GPU_Dev.cpp \ CodeGen_Hexagon.cpp \ CodeGen_Internal.cpp \ CodeGen_LLVM.cpp \ CodeGen_Metal_Dev.cpp \ CodeGen_OpenCL_Dev.cpp \ CodeGen_Vulkan_Dev.cpp \ CodeGen_OpenGLCompute_Dev.cpp \ CodeGen_Posix.cpp \ CodeGen_PowerPC.cpp \ CodeGen_PTX_Dev.cpp \ CodeGen_PyTorch.cpp \ CodeGen_RISCV.cpp \ CodeGen_WebAssembly.cpp \ CodeGen_WebGPU_Dev.cpp \ CodeGen_X86.cpp \ CompilerLogger.cpp \ CPlusPlusMangle.cpp \ CSE.cpp \ Debug.cpp \ DebugArguments.cpp \ DebugToFile.cpp \ Definition.cpp \ Deinterleave.cpp \ Derivative.cpp \ DerivativeUtils.cpp \ Deserialization.cpp \ DeviceArgument.cpp \ DeviceInterface.cpp \ Dimension.cpp \ DistributeShifts.cpp \ EarlyFree.cpp \ Elf.cpp \ EliminateBoolVectors.cpp \ EmulateFloat16Math.cpp \ Error.cpp \ Expr.cpp \ ExtractTileOperations.cpp \ FastIntegerDivide.cpp \ FindCalls.cpp \ FindIntrinsics.cpp \ FlattenNestedRamps.cpp \ Float16.cpp \ Func.cpp \ Function.cpp \ FuseGPUThreadLoops.cpp \ FuzzFloatStores.cpp \ Generator.cpp \ HexagonOffload.cpp \ HexagonOptimize.cpp \ ImageParam.cpp \ InferArguments.cpp \ InjectHostDevBufferCopies.cpp \ Inline.cpp \ InlineReductions.cpp \ IntegerDivisionTable.cpp \ Interval.cpp \ Introspection.cpp \ IR.cpp \ IREquality.cpp \ IRMatch.cpp \ IRMutator.cpp \ IROperator.cpp \ IRPrinter.cpp \ IRVisitor.cpp \ JITModule.cpp \ Lambda.cpp \ Lerp.cpp \ LICM.cpp \ LLVM_Output.cpp \ LLVM_Runtime_Linker.cpp \ LoopCarry.cpp \ Lower.cpp \ LowerParallelTasks.cpp \ LowerWarpShuffles.cpp \ Memoization.cpp \ Module.cpp \ ModulusRemainder.cpp \ 
Monotonic.cpp \ ObjectInstanceRegistry.cpp \ OffloadGPULoops.cpp \ OptimizeShuffles.cpp \ OutputImageParam.cpp \ ParallelRVar.cpp \ Parameter.cpp \ PartitionLoops.cpp \ Pipeline.cpp \ Prefetch.cpp \ PrintLoopNest.cpp \ Profiling.cpp \ PurifyIndexMath.cpp \ PythonExtensionGen.cpp \ Qualify.cpp \ Random.cpp \ RDom.cpp \ Realization.cpp \ RealizationOrder.cpp \ RebaseLoopsToZero.cpp \ Reduction.cpp \ RegionCosts.cpp \ RemoveDeadAllocations.cpp \ RemoveExternLoops.cpp \ RemoveUndef.cpp \ Schedule.cpp \ ScheduleFunctions.cpp \ SelectGPUAPI.cpp \ Serialization.cpp \ Simplify.cpp \ Simplify_Add.cpp \ Simplify_And.cpp \ Simplify_Call.cpp \ Simplify_Cast.cpp \ Simplify_Reinterpret.cpp \ Simplify_Div.cpp \ Simplify_EQ.cpp \ Simplify_Exprs.cpp \ Simplify_Let.cpp \ Simplify_LT.cpp \ Simplify_Max.cpp \ Simplify_Min.cpp \ Simplify_Mod.cpp \ Simplify_Mul.cpp \ Simplify_Not.cpp \ Simplify_Or.cpp \ Simplify_Select.cpp \ Simplify_Shuffle.cpp \ Simplify_Stmts.cpp \ Simplify_Sub.cpp \ SimplifyCorrelatedDifferences.cpp \ SimplifySpecializations.cpp \ SkipStages.cpp \ SlidingWindow.cpp \ Solve.cpp \ SpirvIR.cpp \ SplitTuples.cpp \ StageStridedLoads.cpp \ StmtToHTML.cpp \ StorageFlattening.cpp \ StorageFolding.cpp \ StrictifyFloat.cpp \ Substitute.cpp \ Target.cpp \ Tracing.cpp \ TrimNoOps.cpp \ Tuple.cpp \ Type.cpp \ UnifyDuplicateLets.cpp \ UniquifyVariableNames.cpp \ UnpackBuffers.cpp \ UnrollLoops.cpp \ UnsafePromises.cpp \ Util.cpp \ Var.cpp \ VectorizeLoops.cpp \ WasmExecutor.cpp \ WrapCalls.cpp C_TEMPLATE_FILES = \ CodeGen_C_prologue \ CodeGen_C_vectors HTML_TEMPLATE_FILES = \ StmtToHTML_dependencies.html \ StmtToHTML.js \ StmtToHTML.css # The externally-visible header files that go into making Halide.h. # Don't include anything here that includes llvm headers. # Also *don't* include anything that's only used internally (eg SpirvIR.h). # Keep this list sorted in alphabetical order. 
HEADER_FILES = \ AbstractGenerator.h \ AddAtomicMutex.h \ AddImageChecks.h \ AddParameterChecks.h \ AlignLoads.h \ AllocationBoundsInference.h \ ApplySplit.h \ Argument.h \ AssociativeOpsTable.h \ Associativity.h \ AsyncProducers.h \ AutoScheduleUtils.h \ BoundaryConditions.h \ Bounds.h \ BoundsInference.h \ BoundConstantExtentLoops.h \ BoundSmallAllocations.h \ Buffer.h \ Callable.h \ CanonicalizeGPUVars.h \ ClampUnsafeAccesses.h \ Closure.h \ CodeGen_C.h \ CodeGen_D3D12Compute_Dev.h \ CodeGen_GPU_Dev.h \ CodeGen_Internal.h \ CodeGen_LLVM.h \ CodeGen_Metal_Dev.h \ CodeGen_OpenCL_Dev.h \ CodeGen_Vulkan_Dev.h \ CodeGen_OpenGLCompute_Dev.h \ CodeGen_Posix.h \ CodeGen_PTX_Dev.h \ CodeGen_PyTorch.h \ CodeGen_Targets.h \ CodeGen_WebGPU_Dev.h \ CompilerLogger.h \ ConciseCasts.h \ CPlusPlusMangle.h \ CSE.h \ Debug.h \ DebugArguments.h \ DebugToFile.h \ Definition.h \ Deinterleave.h \ Derivative.h \ DerivativeUtils.h \ Deserialization.h \ DeviceAPI.h \ DeviceArgument.h \ DeviceInterface.h \ Dimension.h \ DistributeShifts.h \ EarlyFree.h \ Elf.h \ EliminateBoolVectors.h \ EmulateFloat16Math.h \ Error.h \ Expr.h \ ExprUsesVar.h \ Extern.h \ ExternFuncArgument.h \ ExtractTileOperations.h \ FastIntegerDivide.h \ FindCalls.h \ FindIntrinsics.h \ FlattenNestedRamps.h \ Float16.h \ Func.h \ Function.h \ FunctionPtr.h \ FuseGPUThreadLoops.h \ FuzzFloatStores.h \ Generator.h \ HexagonOffload.h \ HexagonOptimize.h \ ImageParam.h \ InferArguments.h \ InjectHostDevBufferCopies.h \ Inline.h \ InlineReductions.h \ IntegerDivisionTable.h \ Interval.h \ Introspection.h \ IntrusivePtr.h \ IR.h \ IREquality.h \ IRMatch.h \ IRMutator.h \ IROperator.h \ IRPrinter.h \ IRVisitor.h \ WasmExecutor.h \ JITModule.h \ Lambda.h \ Lerp.h \ LICM.h \ LLVM_Output.h \ LLVM_Runtime_Linker.h \ LoopCarry.h \ Lower.h \ LowerParallelTasks.h \ LowerWarpShuffles.h \ MainPage.h \ Memoization.h \ Module.h \ ModulusRemainder.h \ Monotonic.h \ ObjectInstanceRegistry.h \ OffloadGPULoops.h \ OptimizeShuffles.h \ OutputImageParam.h \ ParallelRVar.h \ Param.h \ Parameter.h \ PartitionLoops.h \ Pipeline.h \ Prefetch.h \ Profiling.h \ PurifyIndexMath.h \ PythonExtensionGen.h \ Qualify.h \ Random.h \ Realization.h \ RDom.h \ RealizationOrder.h \ RebaseLoopsToZero.h \ Reduction.h \ RegionCosts.h \ RemoveDeadAllocations.h \ RemoveExternLoops.h \ RemoveUndef.h \ runtime/HalideBuffer.h \ runtime/HalideRuntime.h \ Schedule.h \ ScheduleFunctions.h \ Scope.h \ SelectGPUAPI.h \ Serialization.h \ Simplify.h \ SimplifyCorrelatedDifferences.h \ SimplifySpecializations.h \ SkipStages.h \ SlidingWindow.h \ Solve.h \ SplitTuples.h \ StageStridedLoads.h \ StmtToHTML.h \ StorageFlattening.h \ StorageFolding.h \ StrictifyFloat.h \ Substitute.h \ Target.h \ Tracing.h \ TrimNoOps.h \ Tuple.h \ Type.h \ UnifyDuplicateLets.h \ UniquifyVariableNames.h \ UnpackBuffers.h \ UnrollLoops.h \ UnsafePromises.h \ Util.h \ Var.h \ VectorizeLoops.h \ WrapCalls.h OBJECTS = $(SOURCE_FILES:%.cpp=$(BUILD_DIR)/%.o) HEADERS = $(HEADER_FILES:%.h=$(SRC_DIR)/%.h) RUNTIME_CPP_COMPONENTS = \ aarch64_cpu_features \ alignment_128 \ alignment_32 \ alignment_64 \ allocation_cache \ android_clock \ android_host_cpu_count \ android_io \ arm_cpu_features \ cache \ can_use_target \ cuda \ destructors \ device_interface \ errors \ fake_get_symbol \ fake_thread_pool \ float16_t \ fopen \ fopen_lfs \ force_include_types \ fuchsia_clock \ fuchsia_host_cpu_count \ fuchsia_yield \ gpu_device_selection \ halide_buffer_t \ hexagon_cache_allocator \ hexagon_cpu_features \ hexagon_dma \ hexagon_dma_pool \ 
hexagon_host \ ios_io \ linux_clock \ linux_host_cpu_count \ linux_yield \ metal \ metal_objc_arm \ metal_objc_x86 \ module_aot_ref_count \ module_jit_ref_count \ msan \ msan_stubs \ opencl \ opengl_egl_context \ opengl_glx_context \ openglcompute \ osx_clock \ osx_get_symbol \ osx_host_cpu_count \ osx_opengl_context \ osx_yield \ posix_aligned_alloc \ posix_allocator \ posix_clock \ posix_error_handler \ posix_get_symbol \ posix_io \ posix_print \ posix_threads \ posix_threads_tsan \ posix_timer_profiler \ powerpc_cpu_features \ prefetch \ profiler \ profiler_inlined \ pseudostack \ qurt_allocator \ qurt_hvx \ qurt_hvx_vtcm \ qurt_threads \ qurt_threads_tsan \ qurt_yield \ riscv_cpu_features \ runtime_api \ timer_profiler \ to_string \ trace_helper \ tracing \ wasm_cpu_features \ webgpu_dawn \ webgpu_emscripten \ windows_clock \ windows_cuda \ windows_d3d12compute_arm \ windows_d3d12compute_x86 \ windows_get_symbol \ windows_io \ windows_opencl \ windows_profiler \ windows_threads \ windows_threads_tsan \ windows_vulkan \ windows_yield \ write_debug_image \ vulkan \ x86_cpu_features \ RUNTIME_LL_COMPONENTS = \ aarch64 \ arm \ arm_no_neon \ hvx_128 \ posix_math \ powerpc \ ptx_dev \ wasm_math \ win32_math \ x86 \ x86_amx \ x86_avx \ x86_avx2 \ x86_avx512 \ x86_sse41 RUNTIME_EXPORTED_INCLUDES = $(INCLUDE_DIR)/HalideRuntime.h \ $(INCLUDE_DIR)/HalideRuntimeD3D12Compute.h \ $(INCLUDE_DIR)/HalideRuntimeCuda.h \ $(INCLUDE_DIR)/HalideRuntimeHexagonDma.h \ $(INCLUDE_DIR)/HalideRuntimeHexagonHost.h \ $(INCLUDE_DIR)/HalideRuntimeOpenCL.h \ $(INCLUDE_DIR)/HalideRuntimeOpenGLCompute.h \ $(INCLUDE_DIR)/HalideRuntimeMetal.h \ $(INCLUDE_DIR)/HalideRuntimeQurt.h \ $(INCLUDE_DIR)/HalideRuntimeVulkan.h \ $(INCLUDE_DIR)/HalideRuntimeWebGPU.h \ $(INCLUDE_DIR)/HalideBuffer.h \ $(INCLUDE_DIR)/HalidePyTorchHelpers.h \ $(INCLUDE_DIR)/HalidePyTorchCudaHelpers.h INITIAL_MODULES = $(RUNTIME_CPP_COMPONENTS:%=$(BUILD_DIR)/initmod.%_32.o) \ $(RUNTIME_CPP_COMPONENTS:%=$(BUILD_DIR)/initmod.%_64.o) \ $(RUNTIME_CPP_COMPONENTS:%=$(BUILD_DIR)/initmod.%_32_debug.o) \ $(RUNTIME_CPP_COMPONENTS:%=$(BUILD_DIR)/initmod.%_64_debug.o) \ $(RUNTIME_EXPORTED_INCLUDES:$(INCLUDE_DIR)/%.h=$(BUILD_DIR)/initmod.%_h.o) \ $(C_TEMPLATE_FILES:%=$(BUILD_DIR)/c_template.%.o) \ $(HTML_TEMPLATE_FILES:%=$(BUILD_DIR)/html_template.%.o) \ $(BUILD_DIR)/initmod.inlined_c.o \ $(RUNTIME_LL_COMPONENTS:%=$(BUILD_DIR)/initmod.%_ll.o) \ $(PTX_DEVICE_INITIAL_MODULES:libdevice.%.bc=$(BUILD_DIR)/initmod_ptx.%_ll.o) # Add the Hexagon simulator to the rpath on Linux. (Not supported elsewhere, so no else cases.) ifeq ($(UNAME), Linux) ifneq (,$(WITH_HEXAGON)) ifneq (,$(HL_HEXAGON_TOOLS)) TEST_LD_FLAGS += -Wl,--rpath=$(ROOT_DIR)/src/runtime/hexagon_remote/bin/host TEST_LD_FLAGS += -Wl,--rpath=$(HL_HEXAGON_TOOLS)/lib/iss endif endif endif .PHONY: all all: distrib test_internal # Depending on which linker we're using, # we need a different invocation to get the # linker map file. ifeq ($(UNAME), Darwin) MAP_FLAGS= -Wl,-map -Wl,$(BUILD_DIR)/llvm_objects/list.all else MAP_FLAGS= -Wl,-Map=$(BUILD_DIR)/llvm_objects/list.all endif $(BUILD_DIR)/llvm_objects/list: $(OBJECTS) $(INITIAL_MODULES) # Determine the relevant object files from llvm with a dummy # compilation. Passing -map to the linker gets it to list, as # part of the linker map file, the object files in which archives it uses to # resolve symbols. We only care about the libLLVM ones, which we will filter below. 
@mkdir -p $(@D) $(CXX) -o /dev/null -shared $(MAP_FLAGS) $(OBJECTS) $(INITIAL_MODULES) $(LLVM_STATIC_LIBS) $(LLVM_SYSTEM_LIBS) $(COMMON_LD_FLAGS) > /dev/null # if the list has changed since the previous build, or there # is no list from a previous build, then delete any old object # files and re-extract the required object files cd $(BUILD_DIR)/llvm_objects; \ cat list.all | LANG=C sed -n 's/^[^\/]*\(\/[^ ()]*libLLVM.*[.]a\)[^a-zA-Z]*\([^ ()]*[.]o\).*$$/\1 \2/p' | sort | uniq > list.new; \ rm list.all; \ if cmp -s list.new list; \ then \ echo "No changes in LLVM deps"; \ touch list; \ else \ rm -f llvm_*.o*; \ cat list.new | sed = | sed "N;s/\n /\n/;s/\([0-9]*\)\n\([^ ]*\) \([^ ]*\)/ar x \2 \3; mv \3 llvm_\1_\3/" | bash - ; \ mv list.new list; \ fi $(LIB_DIR)/libHalide.a: $(OBJECTS) $(INITIAL_MODULES) $(BUILD_DIR)/llvm_objects/list # Archive together all the halide and llvm object files @mkdir -p $(@D) @rm -f $(LIB_DIR)/libHalide.a ar q $(LIB_DIR)/libHalide.a $(OBJECTS) $(INITIAL_MODULES) $(BUILD_DIR)/llvm_objects/llvm_*.o* ranlib $(LIB_DIR)/libHalide.a ifeq ($(UNAME), Linux) LIBHALIDE_SONAME_FLAGS=-Wl,-soname,libHalide.so else LIBHALIDE_SONAME_FLAGS= endif ifeq ($(UNAME), Linux) LIBHALIDE_EXPORTS=-Wl,--version-script=$(ROOT_DIR)/src/exported_symbols.ldscript else LIBHALIDE_EXPORTS=-Wl,-exported_symbols_list $(ROOT_DIR)/src/exported_symbols.osx endif $(BIN_DIR)/libHalide.$(SHARED_EXT): $(OBJECTS) $(INITIAL_MODULES) @mkdir -p $(@D) $(CXX) -shared $(LIBHALIDE_EXPORTS) $(OBJECTS) $(INITIAL_MODULES) $(LLVM_LIBS_FOR_SHARED_LIBHALIDE) $(LLVM_SYSTEM_LIBS) $(COMMON_LD_FLAGS) $(INSTALL_NAME_TOOL_LD_FLAGS) $(LIBHALIDE_SONAME_FLAGS) -o $(BIN_DIR)/libHalide.$(SHARED_EXT) ifeq ($(UNAME), Darwin) install_name_tool -id $(CURDIR)/$(BIN_DIR)/libHalide.$(SHARED_EXT) $(BIN_DIR)/libHalide.$(SHARED_EXT) endif $(INCLUDE_DIR)/Halide.h: $(SRC_DIR)/../LICENSE.txt $(HEADERS) $(BIN_DIR)/build_halide_h @mkdir -p $(@D) $(BIN_DIR)/build_halide_h $(SRC_DIR)/../LICENSE.txt $(HEADERS) > $(INCLUDE_DIR)/Halide.h # Also generate a precompiled version in the same folder so that anything compiled with a compatible set of flags can use it @mkdir -p $(INCLUDE_DIR)/Halide.h.gch $(CXX) -std=c++17 $(TEST_CXX_FLAGS) -I$(ROOT_DIR) $(OPTIMIZE) -x c++-header $(INCLUDE_DIR)/Halide.h -o $(INCLUDE_DIR)/Halide.h.gch/Halide.default.gch $(CXX) -std=c++17 $(TEST_CXX_FLAGS) -I$(ROOT_DIR) $(OPTIMIZE_FOR_BUILD_TIME) -x c++-header $(INCLUDE_DIR)/Halide.h -o $(INCLUDE_DIR)/Halide.h.gch/Halide.test.gch $(INCLUDE_DIR)/HalideRuntime%: $(SRC_DIR)/runtime/HalideRuntime% echo Copying $< @mkdir -p $(@D) cp $< $(INCLUDE_DIR)/ $(INCLUDE_DIR)/HalideBuffer.h: $(SRC_DIR)/runtime/HalideBuffer.h echo Copying $< @mkdir -p $(@D) cp $< $(INCLUDE_DIR)/ $(INCLUDE_DIR)/HalidePyTorchHelpers.h: $(SRC_DIR)/runtime/HalidePyTorchHelpers.h echo Copying $< @mkdir -p $(@D) cp $< $(INCLUDE_DIR)/ $(INCLUDE_DIR)/HalidePyTorchCudaHelpers.h: $(SRC_DIR)/runtime/HalidePyTorchCudaHelpers.h echo Copying $< @mkdir -p $(@D) cp $< $(INCLUDE_DIR)/ $(BIN_DIR)/build_halide_h: $(ROOT_DIR)/tools/build_halide_h.cpp @-mkdir -p $(@D) $(CXX) -std=c++17 $< -o $@ -include $(OBJECTS:.o=.d) -include $(INITIAL_MODULES:.o=.d) .SECONDARY: # Compile generic 32- or 64-bit code # (The 'nacl' is a red herring. This is just a generic 32-bit little-endian target.) RUNTIME_TRIPLE_32 = "le32-unknown-nacl-unknown" RUNTIME_TRIPLE_64 = "le64-unknown-unknown-unknown" # Windows requires special handling. The generic windows_* modules must have -fpic elided # and (for 64 bit) must set wchar to be 2 bytes. 
The windows_*_x86 and windows_*_arm # modules need to interact with specific calling conventions related to D3D12. # # TODO(marcos): generic code won't hold for ARM32... If ARM32 support becomes necessary, # all windows-related runtime modules will have to be wrapped in windows_*_arm.cpp files # for now, generic Windows 32bit code just assumes x86 (i386) RUNTIME_TRIPLE_WIN_X86_32 = "i386-unknown-windows-unknown" RUNTIME_TRIPLE_WIN_X86_64 = "x86_64-unknown-windows-unknown" RUNTIME_TRIPLE_WIN_ARM_32 = "arm-unknown-windows-unknown" RUNTIME_TRIPLE_WIN_ARM_64 = "aarch64-unknown-windows-unknown" RUNTIME_TRIPLE_WIN_GENERIC_64 = "le64-unknown-windows-unknown" # `-fno-threadsafe-statics` is very important here (note that it allows us to use a 'modern' C++ # standard but still skip threadsafe guards for static initialization in our runtime code) # # `-fno-rtti` is necessary to allow us to use classes with virtual functions in the runtime code RUNTIME_CXX_FLAGS = \ -O3 \ -std=c++17 \ -ffreestanding \ -fno-blocks \ -fno-exceptions \ -fno-unwind-tables \ -fno-vectorize \ -fno-threadsafe-statics \ -fno-rtti \ -Wall \ -Wcast-qual \ -Werror \ -Wignored-qualifiers \ -Wno-comment \ -Wno-psabi \ -Wno-unknown-warning-option \ -Wno-unused-function \ -Wvla \ -Wsign-compare $(BUILD_DIR)/initmod.windows_%_x86_32.ll: $(SRC_DIR)/runtime/windows_%_x86.cpp $(BUILD_DIR)/clang_ok @mkdir -p $(@D) $(CLANG) $(CXX_WARNING_FLAGS) $(RUNTIME_CXX_FLAGS) -m32 -target $(RUNTIME_TRIPLE_WIN_X86_32) -DCOMPILING_HALIDE_RUNTIME -DBITS_32 -emit-llvm -S $(SRC_DIR)/runtime/windows_$*_x86.cpp -o $@ -MMD -MP -MF $(BUILD_DIR)/initmod.windows_$*_x86_32.d $(BUILD_DIR)/initmod.windows_%_x86_64.ll: $(SRC_DIR)/runtime/windows_%_x86.cpp $(BUILD_DIR)/clang_ok @mkdir -p $(@D) $(CLANG) $(CXX_WARNING_FLAGS) $(RUNTIME_CXX_FLAGS) -m64 -target $(RUNTIME_TRIPLE_WIN_X86_64) -DCOMPILING_HALIDE_RUNTIME -DBITS_64 -emit-llvm -S $(SRC_DIR)/runtime/windows_$*_x86.cpp -o $@ -MMD -MP -MF $(BUILD_DIR)/initmod.windows_$*_x86_64.d $(BUILD_DIR)/initmod.windows_%_arm_32.ll: $(SRC_DIR)/runtime/windows_%_arm.cpp $(BUILD_DIR)/clang_ok @mkdir -p $(@D) $(CLANG) $(CXX_WARNING_FLAGS) $(RUNTIME_CXX_FLAGS) -m32 -target $(RUNTIME_TRIPLE_WIN_ARM_32) -DCOMPILING_HALIDE_RUNTIME -DBITS_32 -emit-llvm -S $(SRC_DIR)/runtime/windows_$*_arm.cpp -o $@ -MMD -MP -MF $(BUILD_DIR)/initmod.windows_$*_arm_32.d $(BUILD_DIR)/initmod.windows_%_arm_64.ll: $(SRC_DIR)/runtime/windows_%_arm.cpp $(BUILD_DIR)/clang_ok @mkdir -p $(@D) $(CLANG) $(CXX_WARNING_FLAGS) $(RUNTIME_CXX_FLAGS) -m64 -target $(RUNTIME_TRIPLE_WIN_ARM_64) -DCOMPILING_HALIDE_RUNTIME -DBITS_64 -emit-llvm -S $(SRC_DIR)/runtime/windows_$*_arm.cpp -o $@ -MMD -MP -MF $(BUILD_DIR)/initmod.windows_$*_arm_64.d $(BUILD_DIR)/initmod.windows_%_32.ll: $(SRC_DIR)/runtime/windows_%.cpp $(BUILD_DIR)/clang_ok @mkdir -p $(@D) $(CLANG) $(CXX_WARNING_FLAGS) $(RUNTIME_CXX_FLAGS) -m32 -target $(RUNTIME_TRIPLE_WIN_X86_32) -DCOMPILING_HALIDE_RUNTIME -DBITS_32 -emit-llvm -S $(SRC_DIR)/runtime/windows_$*.cpp -o $@ -MMD -MP -MF $(BUILD_DIR)/initmod.windows_$*_32.d $(BUILD_DIR)/initmod.windows_%_64.ll: $(SRC_DIR)/runtime/windows_%.cpp $(BUILD_DIR)/clang_ok @mkdir -p $(@D) $(CLANG) $(CXX_WARNING_FLAGS) $(RUNTIME_CXX_FLAGS) -m64 -target $(RUNTIME_TRIPLE_WIN_GENERIC_64) -fshort-wchar -DCOMPILING_HALIDE_RUNTIME -DBITS_64 -emit-llvm -S $(SRC_DIR)/runtime/windows_$*.cpp -o $@ -MMD -MP -MF $(BUILD_DIR)/initmod.windows_$*_64.d $(BUILD_DIR)/initmod.%_64.ll: $(SRC_DIR)/runtime/%.cpp $(BUILD_DIR)/clang_ok @mkdir -p $(@D) $(CLANG) $(CXX_WARNING_FLAGS) 
$(RUNTIME_CXX_FLAGS) -fpic -m64 -target $(RUNTIME_TRIPLE_64) -DCOMPILING_HALIDE_RUNTIME -DBITS_64 -emit-llvm -S $(SRC_DIR)/runtime/$*.cpp -o $@ -MMD -MP -MF $(BUILD_DIR)/initmod.$*_64.d $(BUILD_DIR)/initmod.%_32.ll: $(SRC_DIR)/runtime/%.cpp $(BUILD_DIR)/clang_ok @mkdir -p $(@D) $(CLANG) $(CXX_WARNING_FLAGS) $(RUNTIME_CXX_FLAGS) -fpic -m32 -target $(RUNTIME_TRIPLE_32) -DCOMPILING_HALIDE_RUNTIME -DBITS_32 -emit-llvm -S $(SRC_DIR)/runtime/$*.cpp -o $@ -MMD -MP -MF $(BUILD_DIR)/initmod.$*_32.d $(BUILD_DIR)/initmod.windows_%_x86_32_debug.ll: $(SRC_DIR)/runtime/windows_%_x86.cpp $(BUILD_DIR)/clang_ok @mkdir -p $(@D) $(CLANG) $(CXX_WARNING_FLAGS) -g -DDEBUG_RUNTIME $(RUNTIME_CXX_FLAGS) -m32 -target $(RUNTIME_TRIPLE_WIN_X86_32) -DCOMPILING_HALIDE_RUNTIME -DBITS_32 -emit-llvm -S $(SRC_DIR)/runtime/windows_$*_x86.cpp -o $@ -MMD -MP -MF $(BUILD_DIR)/initmod.windows_$*_x86_32_debug.d $(BUILD_DIR)/initmod.windows_%_x86_64_debug.ll: $(SRC_DIR)/runtime/windows_%_x86.cpp $(BUILD_DIR)/clang_ok @mkdir -p $(@D) $(CLANG) $(CXX_WARNING_FLAGS) -g -DDEBUG_RUNTIME $(RUNTIME_CXX_FLAGS) -m64 -target $(RUNTIME_TRIPLE_WIN_X86_64) -DCOMPILING_HALIDE_RUNTIME -DBITS_64 -emit-llvm -S $(SRC_DIR)/runtime/windows_$*_x86.cpp -o $@ -MMD -MP -MF $(BUILD_DIR)/initmod.windows_$*_x86_64_debug.d $(BUILD_DIR)/initmod.windows_%_arm_32_debug.ll: $(SRC_DIR)/runtime/windows_%_arm.cpp $(BUILD_DIR)/clang_ok @mkdir -p $(@D) $(CLANG) $(CXX_WARNING_FLAGS) -g -DDEBUG_RUNTIME $(RUNTIME_CXX_FLAGS) -m32 -target $(RUNTIME_TRIPLE_WIN_ARM_32) -DCOMPILING_HALIDE_RUNTIME -DBITS_32 -emit-llvm -S $(SRC_DIR)/runtime/windows_$*_arm.cpp -o $@ -MMD -MP -MF $(BUILD_DIR)/initmod.windows_$*_arm_32_debug.d $(BUILD_DIR)/initmod.windows_%_arm_64_debug.ll: $(SRC_DIR)/runtime/windows_%_arm.cpp $(BUILD_DIR)/clang_ok @mkdir -p $(@D) $(CLANG) $(CXX_WARNING_FLAGS) -g -DDEBUG_RUNTIME $(RUNTIME_CXX_FLAGS) -m64 -target $(RUNTIME_TRIPLE_WIN_ARM_64) -DCOMPILING_HALIDE_RUNTIME -DBITS_64 -emit-llvm -S $(SRC_DIR)/runtime/windows_$*_arm.cpp -o $@ -MMD -MP -MF $(BUILD_DIR)/initmod.windows_$*_arm_64_debug.d $(BUILD_DIR)/initmod.windows_%_64_debug.ll: $(SRC_DIR)/runtime/windows_%.cpp $(BUILD_DIR)/clang_ok @mkdir -p $(@D) $(CLANG) $(CXX_WARNING_FLAGS) -g -DDEBUG_RUNTIME $(RUNTIME_CXX_FLAGS) -m64 -target $(RUNTIME_TRIPLE_WIN_GENERIC_64) -DCOMPILING_HALIDE_RUNTIME -DBITS_64 -emit-llvm -S $(SRC_DIR)/runtime/windows_$*.cpp -o $@ -MMD -MP -MF $(BUILD_DIR)/initmod.windows_$*_64_debug.d $(BUILD_DIR)/initmod.%_64_debug.ll: $(SRC_DIR)/runtime/%.cpp $(BUILD_DIR)/clang_ok @mkdir -p $(@D) $(CLANG) $(CXX_WARNING_FLAGS) -g -DDEBUG_RUNTIME $(RUNTIME_CXX_FLAGS) -fpic -m64 -target $(RUNTIME_TRIPLE_64) -DCOMPILING_HALIDE_RUNTIME -DBITS_64 -emit-llvm -S $(SRC_DIR)/runtime/$*.cpp -o $@ -MMD -MP -MF $(BUILD_DIR)/initmod.$*_64_debug.d $(BUILD_DIR)/initmod.windows_%_32_debug.ll: $(SRC_DIR)/runtime/windows_%.cpp $(BUILD_DIR)/clang_ok @mkdir -p $(@D) $(CLANG) $(CXX_WARNING_FLAGS) -g -DDEBUG_RUNTIME $(RUNTIME_CXX_FLAGS) -m32 -target $(RUNTIME_TRIPLE_WIN_X86_32) -DCOMPILING_HALIDE_RUNTIME -DBITS_32 -emit-llvm -S $(SRC_DIR)/runtime/windows_$*.cpp -o $@ -MMD -MP -MF $(BUILD_DIR)/initmod.windows_$*_32_debug.d $(BUILD_DIR)/initmod.%_32_debug.ll: $(SRC_DIR)/runtime/%.cpp $(BUILD_DIR)/clang_ok @mkdir -p $(@D) $(CLANG) $(CXX_WARNING_FLAGS) -g -DDEBUG_RUNTIME -O3 $(RUNTIME_CXX_FLAGS) -fpic -m32 -target $(RUNTIME_TRIPLE_32) -DCOMPILING_HALIDE_RUNTIME -DBITS_32 -emit-llvm -S $(SRC_DIR)/runtime/$*.cpp -o $@ -MMD -MP -MF $(BUILD_DIR)/initmod.$*_32_debug.d $(BUILD_DIR)/initmod.%_ll.ll: $(SRC_DIR)/runtime/%.ll @mkdir -p 
$(@D) cp $(SRC_DIR)/runtime/$*.ll $(BUILD_DIR)/initmod.$*_ll.ll $(BUILD_DIR)/initmod.%.bc: $(BUILD_DIR)/initmod.%.ll $(BUILD_DIR)/llvm_ok $(LLVM_AS) $(BUILD_DIR)/initmod.$*.ll -o $(BUILD_DIR)/initmod.$*.bc $(BUILD_DIR)/initmod.%.cpp: $(BIN_DIR)/binary2cpp $(BUILD_DIR)/initmod.%.bc ./$(BIN_DIR)/binary2cpp halide_internal_initmod_$* < $(BUILD_DIR)/initmod.$*.bc > $@ $(BUILD_DIR)/initmod.%_h.cpp: $(BIN_DIR)/binary2cpp $(SRC_DIR)/runtime/%.h ./$(BIN_DIR)/binary2cpp halide_internal_runtime_header_$*_h < $(SRC_DIR)/runtime/$*.h > $@ # Any c in the runtime that must be inlined needs to be copy-pasted into the output for the C backend. $(BUILD_DIR)/initmod.inlined_c.cpp: $(BIN_DIR)/binary2cpp $(SRC_DIR)/runtime/halide_buffer_t.cpp ./$(BIN_DIR)/binary2cpp halide_internal_initmod_inlined_c < $(SRC_DIR)/runtime/halide_buffer_t.cpp > $@ $(BUILD_DIR)/initmod_ptx.%_ll.cpp: $(BIN_DIR)/binary2cpp $(SRC_DIR)/runtime/nvidia_libdevice_bitcode/libdevice.%.bc ./$(BIN_DIR)/binary2cpp halide_internal_initmod_ptx_$(basename $*)_ll < $(SRC_DIR)/runtime/nvidia_libdevice_bitcode/libdevice.$*.bc > $@ $(BUILD_DIR)/c_template.%.cpp: $(BIN_DIR)/binary2cpp $(SRC_DIR)/%.template.cpp ./$(BIN_DIR)/binary2cpp halide_c_template_$* < $(SRC_DIR)/$*.template.cpp > $@ $(BUILD_DIR)/html_template.%.cpp: $(BIN_DIR)/binary2cpp $(SRC_DIR)/irvisualizer/html_template_% ./$(BIN_DIR)/binary2cpp halide_html_template_$(subst .,_,$*) < $(SRC_DIR)/irvisualizer/html_template_$* > $@ $(BIN_DIR)/binary2cpp: $(ROOT_DIR)/tools/binary2cpp.cpp @mkdir -p $(@D) $(CXX) $< -o $@ $(BIN_DIR)/regexp_replace: $(ROOT_DIR)/tools/regexp_replace.cpp @mkdir -p $(@D) $(CXX) -std=c++11 $< -o $@ $(BUILD_DIR)/initmod_ptx.%_ll.o: $(BUILD_DIR)/initmod_ptx.%_ll.cpp $(CXX) -c $< -o $@ -MMD -MP -MF $(BUILD_DIR)/$*.d -MT $(BUILD_DIR)/$*.o $(BUILD_DIR)/initmod.%.o: $(BUILD_DIR)/initmod.%.cpp $(CXX) -c $< -o $@ -MMD -MP -MF $(BUILD_DIR)/$*.d -MT $(BUILD_DIR)/$*.o $(BUILD_DIR)/c_template.%.o: $(BUILD_DIR)/c_template.%.cpp $(CXX) -c $< -o $@ -MMD -MP -MF $(BUILD_DIR)/$*.d -MT $(BUILD_DIR)/$*.o $(BUILD_DIR)/html_template.%.o: $(BUILD_DIR)/html_template.%.cpp $(CXX) -c $< -o $@ -MMD -MP -MF $(BUILD_DIR)/$*.d -MT $(BUILD_DIR)/$*.o $(BUILD_DIR)/%.o: $(SRC_DIR)/%.cpp $(BUILD_DIR)/llvm_ok @mkdir -p $(@D) $(CXX) $(CXX_FLAGS) -c $< -o $@ -MMD -MP -MF $(BUILD_DIR)/$*.d -MT $(BUILD_DIR)/$*.o $(BUILD_DIR)/Simplify_%.o: $(SRC_DIR)/Simplify_%.cpp $(SRC_DIR)/Simplify_Internal.h $(BUILD_DIR)/llvm_ok @mkdir -p $(@D) $(CXX) $(CXX_FLAGS) -c $< -o $@ -MMD -MP -MF $(BUILD_DIR)/Simplify_$*.d -MT $@ .PHONY: clean clean: rm -rf $(LIB_DIR) rm -rf $(BIN_DIR) rm -rf $(BUILD_DIR) rm -rf $(TMP_DIR) rm -rf $(FILTERS_DIR) rm -rf $(INCLUDE_DIR) rm -rf $(SHARE_DIR) rm -rf $(DISTRIB_DIR) rm -rf $(ROOT_DIR)/apps/*/bin CORRECTNESS_TESTS = $(shell ls $(ROOT_DIR)/test/correctness/*.cpp) $(shell ls $(ROOT_DIR)/test/correctness/*.c) PERFORMANCE_TESTS = $(shell ls $(ROOT_DIR)/test/performance/*.cpp) ERROR_TESTS = $(shell ls $(ROOT_DIR)/test/error/*.cpp) WARNING_TESTS = $(shell ls $(ROOT_DIR)/test/warning/*.cpp) RUNTIME_TESTS = $(shell ls $(ROOT_DIR)/test/runtime/*.cpp) GENERATOR_EXTERNAL_TESTS := $(shell ls $(ROOT_DIR)/test/generator/*test.cpp) GENERATOR_EXTERNAL_TEST_GENERATOR := $(shell ls $(ROOT_DIR)/test/generator/*_generator.cpp) TUTORIALS = $(filter-out %_generate.cpp, $(shell ls $(ROOT_DIR)/tutorial/*.cpp)) MULLAPUDI2016_TESTS = $(shell ls $(ROOT_DIR)/test/autoschedulers/mullapudi2016/*.cpp) LI2018_TESTS = $(shell ls $(ROOT_DIR)/test/autoschedulers/li2018/test.cpp) ADAMS2019_TESTS = $(shell ls 
$(ROOT_DIR)/test/autoschedulers/adams2019/test.cpp) test_correctness: $(CORRECTNESS_TESTS:$(ROOT_DIR)/test/correctness/%.cpp=quiet_correctness_%) $(CORRECTNESS_TESTS:$(ROOT_DIR)/test/correctness/%.c=quiet_correctness_%) test_performance: $(PERFORMANCE_TESTS:$(ROOT_DIR)/test/performance/%.cpp=performance_%) test_error: $(ERROR_TESTS:$(ROOT_DIR)/test/error/%.cpp=error_%) test_warning: $(WARNING_TESTS:$(ROOT_DIR)/test/warning/%.cpp=warning_%) test_runtime: $(RUNTIME_TESTS:$(ROOT_DIR)/test/runtime/%.cpp=runtime_%) test_tutorial: $(TUTORIALS:$(ROOT_DIR)/tutorial/%.cpp=tutorial_%) test_valgrind: $(CORRECTNESS_TESTS:$(ROOT_DIR)/test/correctness/%.cpp=valgrind_%) test_avx512: $(CORRECTNESS_TESTS:$(ROOT_DIR)/test/correctness/%.cpp=avx512_%) test_autoschedulers: test_mullapudi2016 test_li2018 test_adams2019 test_auto_schedule: test_autoschedulers .PHONY: test_correctness_multi_gpu test_correctness_multi_gpu: correctness_gpu_multi_device # There are 3 types of tests for generators: # 1) Externally-written aot-based tests # 2) Externally-written aot-based tests (compiled using C++ backend) # 3) Externally-written JIT-based tests GENERATOR_AOT_TESTS = $(GENERATOR_EXTERNAL_TESTS:$(ROOT_DIR)/test/generator/%_aottest.cpp=generator_aot_%) GENERATOR_AOTCPP_TESTS = $(GENERATOR_EXTERNAL_TESTS:$(ROOT_DIR)/test/generator/%_aottest.cpp=generator_aotcpp_%) GENERATOR_JIT_TESTS = $(GENERATOR_EXTERNAL_TESTS:$(ROOT_DIR)/test/generator/%_jittest.cpp=generator_jit_%) # multitarget test doesn't make any sense for the CPP backend; just skip it. GENERATOR_AOTCPP_TESTS := $(filter-out generator_aotcpp_multitarget,$(GENERATOR_AOTCPP_TESTS)) # Note that many of the AOT-CPP tests are broken right now; # remove AOT-CPP tests that don't (yet) work for C++ backend # (each tagged with the *known* blocking issue(s)) # sanitizercoverage relies on LLVM-specific hooks, so it will never work with the C backend GENERATOR_AOTCPP_TESTS := $(filter-out generator_aotcpp_sanitizercoverage,$(GENERATOR_AOTCPP_TESTS)) # https://github.com/halide/Halide/issues/2084 (only if opencl enabled)) #GENERATOR_AOTCPP_TESTS := $(filter-out generator_aotcpp_cleanup_on_error,$(GENERATOR_AOTCPP_TESTS)) # https://github.com/halide/Halide/issues/7273 GENERATOR_AOTCPP_TESTS := $(filter-out generator_aotcpp_msan,$(GENERATOR_AOTCPP_TESTS)) # https://github.com/halide/Halide/issues/7272 GENERATOR_AOTCPP_TESTS := $(filter-out generator_aotcpp_memory_profiler_mandelbrot,$(GENERATOR_AOTCPP_TESTS)) # https://github.com/halide/Halide/issues/4916 GENERATOR_AOTCPP_TESTS := $(filter-out generator_aotcpp_stubtest,$(GENERATOR_AOTCPP_TESTS)) GENERATOR_AOTCPP_TESTS := $(filter-out generator_aotcpp_stubuser,$(GENERATOR_AOTCPP_TESTS)) # Build requirements are finicky, testing non-C++ backend is good enough here GENERATOR_AOTCPP_TESTS := $(filter-out generator_aotcpp_gpu_multi_context_threaded,$(GENERATOR_AOTCPP_TESTS)) test_aotcpp_generator: $(GENERATOR_AOTCPP_TESTS) # This is just a test to ensure that RunGen builds and links for a critical mass of Generators; # not all will work directly (e.g. due to missing define_externs at link time), so we disable # those known to be broken for plausible reasons.
GENERATOR_BUILD_RUNGEN_TESTS = $(GENERATOR_EXTERNAL_TEST_GENERATOR:$(ROOT_DIR)/test/generator/%_generator.cpp=$(FILTERS_DIR)/%.rungen) GENERATOR_BUILD_RUNGEN_TESTS := $(filter-out $(FILTERS_DIR)/async_parallel.rungen,$(GENERATOR_BUILD_RUNGEN_TESTS)) GENERATOR_BUILD_RUNGEN_TESTS := $(filter-out $(FILTERS_DIR)/cxx_mangling_define_extern.rungen,$(GENERATOR_BUILD_RUNGEN_TESTS)) GENERATOR_BUILD_RUNGEN_TESTS := $(filter-out $(FILTERS_DIR)/define_extern_opencl.rungen,$(GENERATOR_BUILD_RUNGEN_TESTS)) GENERATOR_BUILD_RUNGEN_TESTS := $(filter-out $(FILTERS_DIR)/msan.rungen,$(GENERATOR_BUILD_RUNGEN_TESTS)) GENERATOR_BUILD_RUNGEN_TESTS := $(filter-out $(FILTERS_DIR)/sanitizercoverage.rungen,$(GENERATOR_BUILD_RUNGEN_TESTS)) GENERATOR_BUILD_RUNGEN_TESTS := $(filter-out $(FILTERS_DIR)/multitarget.rungen,$(GENERATOR_BUILD_RUNGEN_TESTS)) GENERATOR_BUILD_RUNGEN_TESTS := $(filter-out $(FILTERS_DIR)/nested_externs.rungen,$(GENERATOR_BUILD_RUNGEN_TESTS)) GENERATOR_BUILD_RUNGEN_TESTS := $(filter-out $(FILTERS_DIR)/tiled_blur.rungen,$(GENERATOR_BUILD_RUNGEN_TESTS)) GENERATOR_BUILD_RUNGEN_TESTS := $(filter-out $(FILTERS_DIR)/extern_output.rungen,$(GENERATOR_BUILD_RUNGEN_TESTS)) GENERATOR_BUILD_RUNGEN_TESTS := $(filter-out $(FILTERS_DIR)/gpu_multi_context_threaded.rungen,$(GENERATOR_BUILD_RUNGEN_TESTS)) GENERATOR_BUILD_RUNGEN_TESTS := $(GENERATOR_BUILD_RUNGEN_TESTS) \ $(FILTERS_DIR)/multi_rungen \ $(FILTERS_DIR)/multi_rungen2 \ $(FILTERS_DIR)/rungen_test \ $(FILTERS_DIR)/registration_test test_rungen: $(GENERATOR_BUILD_RUNGEN_TESTS) $(FILTERS_DIR)/rungen_test $(FILTERS_DIR)/registration_test test_generator: $(GENERATOR_AOT_TESTS) $(GENERATOR_AOTCPP_TESTS) $(GENERATOR_JIT_TESTS) $(GENERATOR_BUILD_RUNGEN_TESTS) $(FILTERS_DIR)/rungen_test $(FILTERS_DIR)/registration_test ALL_TESTS = test_internal test_correctness test_error test_tutorial test_warning test_runtime test_generator # These targets perform timings of each test. For most tests this includes Halide JIT compile times, and run times. # For generator tests they time the compile time only. The times are recorded in CSV files. 
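# For example (illustrative): running `make time_compilation_correctness` writes the header row to
# compile_times_correctness.csv and then appends one "<test>,<user>,<system>,<real>" row per test,
# via the TIME_COMPILATION wrapper defined below.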
time_compilation_correctness: init_time_compilation_correctness $(CORRECTNESS_TESTS:$(ROOT_DIR)/test/correctness/%.cpp=time_compilation_test_%) time_compilation_performance: init_time_compilation_performance $(PERFORMANCE_TESTS:$(ROOT_DIR)/test/performance/%.cpp=time_compilation_performance_%) time_compilation_generator: init_time_compilation_generator $(GENERATOR_TESTS:$(ROOT_DIR)/test/generator/%_aottest.cpp=time_compilation_generator_%) init_time_compilation_%: echo "TEST,User (s),System (s),Real" > $(@:init_time_compilation_%=compile_times_%.csv) TIME_COMPILATION ?= /usr/bin/time -a -f "$@,%U,%S,%E" -o run_tests: $(ALL_TESTS) make -f $(THIS_MAKEFILE) test_performance test_autoschedulers .PHONY: build_tests build_tests: $(CORRECTNESS_TESTS:$(ROOT_DIR)/test/correctness/%.cpp=$(BIN_DIR)/correctness_%) \ $(PERFORMANCE_TESTS:$(ROOT_DIR)/test/performance/%.cpp=$(BIN_DIR)/performance_%) \ $(ERROR_TESTS:$(ROOT_DIR)/test/error/%.cpp=$(BIN_DIR)/error_%) \ $(WARNING_TESTS:$(ROOT_DIR)/test/warning/%.cpp=$(BIN_DIR)/warning_%) \ $(RUNTIME_TESTS:$(ROOT_DIR)/test/runtime/%.cpp=$(BIN_DIR)/runtime_%) \ $(GENERATOR_EXTERNAL_TESTS:$(ROOT_DIR)/test/generator/%_aottest.cpp=$(BIN_DIR)/$(TARGET)/generator_aot_%) \ $(GENERATOR_EXTERNAL_TESTS:$(ROOT_DIR)/test/generator/%_jittest.cpp=$(BIN_DIR)/generator_jit_%) \ $(MULLAPUDI2016_TESTS:$(ROOT_DIR)/test/autoschedulers/mullapudi2016/%.cpp=$(BIN_DIR)/mullapudi2016_%) \ $(LI2018_TESTS:$(ROOT_DIR)/test/autoschedulers/li2018/%.cpp=$(BIN_DIR)/li2018_%) \ $(ADAMS2019_TESTS:$(ROOT_DIR)/test/autoschedulers/adams2019/%.cpp=$(BIN_DIR)/adams2019_%) clean_generator: rm -rf $(BIN_DIR)/*.generator rm -rf $(BIN_DIR)/*/runtime.a rm -rf $(FILTERS_DIR) rm -rf $(BIN_DIR)/*/generator_* rm -rf $(BUILD_DIR)/*_generator.o rm -f $(BUILD_DIR)/GenGen.o rm -f $(BUILD_DIR)/RunGenMain.o time_compilation_tests: time_compilation_correctness time_compilation_performance time_compilation_generator # These are just aliases to the autoscheduler plugins to make Generator rules & deps a little terser BIN_ADAMS2019=$(BIN_DIR)/libautoschedule_adams2019.$(PLUGIN_EXT) BIN_LI2018=$(BIN_DIR)/libautoschedule_li2018.$(PLUGIN_EXT) BIN_MULLAPUDI2016=$(BIN_DIR)/libautoschedule_mullapudi2016.$(PLUGIN_EXT) $(BUILD_DIR)/GenGen.o: $(ROOT_DIR)/tools/GenGen.cpp $(INCLUDE_DIR)/Halide.h @mkdir -p $(@D) $(CXX) -c $< $(TEST_CXX_FLAGS) -I$(INCLUDE_DIR) -o $@ # Make an empty generator for generating runtimes. 
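# For example (illustrative, using host-opencl as a sample target string): `make bin/host-opencl/runtime.a`
# runs this empty generator with `-r runtime target=host-opencl` (see the $(BIN_DIR)/%/runtime.a rule below)
# to emit a standalone runtime for that target.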
$(BIN_DIR)/runtime.generator: $(BUILD_DIR)/GenGen.o $(BIN_DIR)/libHalide.$(SHARED_EXT) @mkdir -p $(@D) $(CXX) $< $(TEST_LD_FLAGS) -o $@ # Generate a standalone runtime for a given target string $(BIN_DIR)/%/runtime.a: $(BIN_DIR)/runtime.generator @mkdir -p $(@D) $(CURDIR)/$< -r runtime -o $(CURDIR)/$(BIN_DIR)/$* target=$* $(BIN_DIR)/test_internal: $(ROOT_DIR)/test/internal.cpp $(BIN_DIR)/libHalide.$(SHARED_EXT) @mkdir -p $(@D) $(CXX) $(TEST_CXX_FLAGS) $< -I$(SRC_DIR) $(TEST_LD_FLAGS) -o $@ ifneq (,$(shell which flatc)) $(BUILD_DIR)/Deserialization.o : $(BUILD_DIR)/halide_ir.fbs.h $(BUILD_DIR)/Serialization.o : $(BUILD_DIR)/halide_ir.fbs.h endif # Generated header for serialization/deserialization $(BUILD_DIR)/halide_ir.fbs.h: $(SRC_DIR)/halide_ir.fbs @mkdir -p $(@D) flatc --cpp --cpp-std C++17 --no-union-value-namespacing --keep-prefix --filename-suffix ".fbs" -o $(BUILD_DIR) $^ # Correctness tests that link against libHalide $(BIN_DIR)/correctness_%: $(ROOT_DIR)/test/correctness/%.cpp $(BIN_DIR)/libHalide.$(SHARED_EXT) $(INCLUDE_DIR)/Halide.h $(RUNTIME_EXPORTED_INCLUDES) @mkdir -p $(@D) $(CXX) $(TEST_CXX_FLAGS) -I$(ROOT_DIR)/src/runtime -I$(ROOT_DIR)/test/common $(OPTIMIZE_FOR_BUILD_TIME) $< -I$(INCLUDE_DIR) $(TEST_LD_FLAGS) -o $@ # Correctness tests that do NOT link against libHalide $(BIN_DIR)/correctness_plain_c_includes: $(ROOT_DIR)/test/correctness/plain_c_includes.c $(RUNTIME_EXPORTED_INCLUDES) $(CXX) -x c -Wall -Werror -I$(ROOT_DIR)/src/runtime $(OPTIMIZE_FOR_BUILD_TIME) $< -I$(ROOT_DIR)/src/runtime -o $@ # Note that this test must *not* link in either libHalide, or a Halide runtime; # this test should be usable without either. $(BIN_DIR)/correctness_halide_buffer: $(ROOT_DIR)/test/correctness/halide_buffer.cpp $(INCLUDE_DIR)/HalideBuffer.h $(RUNTIME_EXPORTED_INCLUDES) $(CXX) $(TEST_CXX_FLAGS) $(OPTIMIZE_FOR_BUILD_TIME) $< -I$(INCLUDE_DIR) -o $@ # The image_io test additionally needs to link to libpng and # libjpeg. $(BIN_DIR)/correctness_image_io: $(ROOT_DIR)/test/correctness/image_io.cpp $(BIN_DIR)/libHalide.$(SHARED_EXT) $(INCLUDE_DIR)/Halide.h $(RUNTIME_EXPORTED_INCLUDES) $(CXX) $(TEST_CXX_FLAGS) $(IMAGE_IO_CXX_FLAGS) -I$(ROOT_DIR)/src/runtime -I$(ROOT_DIR)/test/common $(OPTIMIZE_FOR_BUILD_TIME) $< -I$(INCLUDE_DIR) $(TEST_LD_FLAGS) $(IMAGE_IO_LIBS) -o $@ # OpenCL runtime correctness test requires runtime.a to be linked.
$(BIN_DIR)/$(TARGET)/correctness_opencl_runtime: $(ROOT_DIR)/test/correctness/opencl_runtime.cpp $(RUNTIME_EXPORTED_INCLUDES) $(BIN_DIR)/$(TARGET)/runtime.a @mkdir -p $(@D) $(CXX) $(BIN_DIR)/$(TARGET)/runtime.a $(TEST_CXX_FLAGS) -I$(ROOT_DIR)/src/runtime $(OPTIMIZE_FOR_BUILD_TIME) $< -I$(INCLUDE_DIR) $(TEST_LD_FLAGS) -o $@ $(BIN_DIR)/performance_%: $(ROOT_DIR)/test/performance/%.cpp $(BIN_DIR)/libHalide.$(SHARED_EXT) $(INCLUDE_DIR)/Halide.h $(CXX) $(TEST_CXX_FLAGS) $(OPTIMIZE) $< -I$(INCLUDE_DIR) -I$(ROOT_DIR)/src/runtime -I$(ROOT_DIR)/test/common $(TEST_LD_FLAGS) -o $@ # Error tests that link against libHalide $(BIN_DIR)/error_%: $(ROOT_DIR)/test/error/%.cpp $(BIN_DIR)/libHalide.$(SHARED_EXT) $(INCLUDE_DIR)/Halide.h $(CXX) $(TEST_CXX_FLAGS) -I$(ROOT_DIR)/src/runtime -I$(ROOT_DIR)/test/common $(OPTIMIZE_FOR_BUILD_TIME) $< -I$(INCLUDE_DIR) $(TEST_LD_FLAGS) -o $@ $(BIN_DIR)/warning_%: $(ROOT_DIR)/test/warning/%.cpp $(BIN_DIR)/libHalide.$(SHARED_EXT) $(INCLUDE_DIR)/Halide.h $(CXX) $(TEST_CXX_FLAGS) -I$(ROOT_DIR)/test/common $(OPTIMIZE_FOR_BUILD_TIME) $< -I$(INCLUDE_DIR) $(TEST_LD_FLAGS) -o $@ # Runtime tests that test internals RUNTIME_TESTS_CXXFLAGS = -fno-rtti -fno-exceptions -fno-threadsafe-statics -Wno-builtin-declaration-mismatch -DCOMPILING_HALIDE_RUNTIME -DCOMPILING_HALIDE_RUNTIME_TESTS $(BIN_DIR)/runtime_internal_common.o: $(ROOT_DIR)/test/runtime/common.cpp $(ROOT_DIR)/test/runtime/common.h @mkdir -p $(@D) $(CXX) $(TEST_CXX_FLAGS) $(RUNTIME_TESTS_CXXFLAGS) -I$(ROOT_DIR)/test/runtime -I$(ROOT_DIR)/src/runtime $(OPTIMIZE_FOR_BUILD_TIME) -c $< -o $@ $(BIN_DIR)/runtime_internal_msan_stubs.o: $(ROOT_DIR)/src/runtime/msan_stubs.cpp @mkdir -p $(@D) $(CXX) $(TEST_CXX_FLAGS) $(RUNTIME_TESTS_CXXFLAGS) -I$(ROOT_DIR)/test/runtime -I$(ROOT_DIR)/src/runtime $(OPTIMIZE_FOR_BUILD_TIME) -c $< -o $@ $(BIN_DIR)/runtime_internal_to_string.o: $(ROOT_DIR)/src/runtime/to_string.cpp @mkdir -p $(@D) $(CXX) $(TEST_CXX_FLAGS) $(RUNTIME_TESTS_CXXFLAGS) -I$(ROOT_DIR)/test/runtime -I$(ROOT_DIR)/src/runtime $(OPTIMIZE_FOR_BUILD_TIME) -c $< -o $@ $(BIN_DIR)/runtime_common: @mkdir -p $(@D) touch $@ $(BIN_DIR)/runtime_%: $(ROOT_DIR)/test/runtime/%.cpp $(BIN_DIR)/runtime_internal_common.o $(BIN_DIR)/runtime_internal_msan_stubs.o $(BIN_DIR)/runtime_internal_to_string.o @mkdir -p $(@D) $(CXX) $(TEST_CXX_FLAGS) $(RUNTIME_TESTS_CXXFLAGS) -I$(ROOT_DIR)/test/runtime -I$(ROOT_DIR)/src/runtime $(OPTIMIZE_FOR_BUILD_TIME) $^ $(COMMON_LD_FLAGS) -o $@ # Auto schedule tests that link against libHalide $(BIN_DIR)/mullapudi2016_%: $(ROOT_DIR)/test/autoschedulers/mullapudi2016/%.cpp $(BIN_DIR)/libHalide.$(SHARED_EXT) $(INCLUDE_DIR)/Halide.h $(CXX) $(TEST_CXX_FLAGS) $(OPTIMIZE_FOR_BUILD_TIME) $< -I$(INCLUDE_DIR) $(TEST_LD_FLAGS) -o $@ $(BIN_DIR)/li2018_%: $(ROOT_DIR)/test/autoschedulers/li2018/%.cpp $(BIN_DIR)/libHalide.$(SHARED_EXT) $(INCLUDE_DIR)/Halide.h $(CXX) $(TEST_CXX_FLAGS) $(OPTIMIZE_FOR_BUILD_TIME) $< -I$(INCLUDE_DIR) $(TEST_LD_FLAGS) -o $@ $(BIN_DIR)/adams2019_%: $(ROOT_DIR)/test/autoschedulers/adams2019/%.cpp $(BIN_DIR)/libHalide.$(SHARED_EXT) $(INCLUDE_DIR)/Halide.h $(CXX) $(TEST_CXX_FLAGS) $(OPTIMIZE_FOR_BUILD_TIME) $< -I$(INCLUDE_DIR) $(TEST_LD_FLAGS) -o $@ # TODO(srj): this doesn't auto-delete, why not? .INTERMEDIATE: $(BIN_DIR)/%.generator # By default, %.generator is produced by building %_generator.cpp # Note that the rule includes all _generator.cpp files, so that generator with define_extern # usage can just add deps later. 
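# For example (illustrative, with a hypothetical generator source test/generator/foo_generator.cpp):
# `make bin/foo.generator` compiles bin/build/foo_generator.o and links it with GenGen.o and libHalide
# into a standalone generator executable, per the pattern rules below.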
$(BUILD_DIR)/%_generator.o: $(ROOT_DIR)/test/generator/%_generator.cpp $(INCLUDE_DIR)/Halide.h @mkdir -p $(@D) $(CXX) $(TEST_CXX_FLAGS) -I$(INCLUDE_DIR) -I$(CURDIR)/$(FILTERS_DIR) -c $< -o $@ $(BIN_DIR)/%.generator: $(BUILD_DIR)/GenGen.o $(BIN_DIR)/libHalide.$(SHARED_EXT) $(BUILD_DIR)/%_generator.o @mkdir -p $(@D) $(CXX) $(filter %.cpp %.o %.a,$^) $(TEST_LD_FLAGS) -o $@ NAME_MANGLING_TARGET=$(NON_EMPTY_TARGET)-c_plus_plus_name_mangling GEN_AOT_OUTPUTS=-e static_library,c_header,c_source,registration # By default, %.a/.h are produced by executing %.generator. Runtimes are not included in these. # (We explicitly also generate .cpp output here as well, as additional test surface for the C++ backend.) $(FILTERS_DIR)/%.a: $(BIN_DIR)/%.generator @mkdir -p $(@D) $(CURDIR)/$< -g $* $(GEN_AOT_OUTPUTS) -o $(CURDIR)/$(FILTERS_DIR) target=$(TARGET)-no_runtime $(FILTERS_DIR)/%.h: $(FILTERS_DIR)/%.a @echo $@ produced implicitly by $^ $(FILTERS_DIR)/%.halide_generated.cpp: $(FILTERS_DIR)/%.a @echo $@ produced implicitly by $^ $(FILTERS_DIR)/%.registration.cpp: $(FILTERS_DIR)/%.a @echo $@ produced implicitly by $^ $(FILTERS_DIR)/%.stub.h: $(BIN_DIR)/%.generator @mkdir -p $(@D) $(CURDIR)/$< -g $* -n $* -o $(CURDIR)/$(FILTERS_DIR) -e cpp_stub $(FILTERS_DIR)/cxx_mangling_externs.o: $(ROOT_DIR)/test/generator/cxx_mangling_externs.cpp @mkdir -p $(@D) $(CXX) $(GEN_AOT_CXX_FLAGS) -c $(filter-out %.h,$^) $(GEN_AOT_INCLUDES) -o $@ # If we want to use a Generator with custom GeneratorParams, we need to write # custom rules: to pass the GeneratorParams, and to give a unique function and file name. $(FILTERS_DIR)/cxx_mangling.a: $(BIN_DIR)/cxx_mangling.generator $(FILTERS_DIR)/cxx_mangling_externs.o @mkdir -p $(@D) $(CURDIR)/$< -g cxx_mangling $(GEN_AOT_OUTPUTS),function_info_header -o $(CURDIR)/$(FILTERS_DIR) target=$(TARGET)-no_runtime-c_plus_plus_name_mangling -f "HalideTest::AnotherNamespace::cxx_mangling" $(ROOT_DIR)/tools/makelib.sh $@ $@ $(FILTERS_DIR)/cxx_mangling_externs.o ifneq ($(TEST_CUDA), ) # Also build with a gpu target to ensure that the GPU-Host generation # code handles name mangling properly. (Note that we don't need to # run this code, just check for link errors.) $(FILTERS_DIR)/cxx_mangling_gpu.a: $(BIN_DIR)/cxx_mangling.generator $(FILTERS_DIR)/cxx_mangling_externs.o @mkdir -p $(@D) $(CURDIR)/$< -g cxx_mangling $(GEN_AOT_OUTPUTS) -o $(CURDIR)/$(FILTERS_DIR) target=$(TARGET)-no_runtime-c_plus_plus_name_mangling-cuda-cuda_capability_30 -f "HalideTest::cxx_mangling_gpu" $(ROOT_DIR)/tools/makelib.sh $@ $@ $(FILTERS_DIR)/cxx_mangling_externs.o endif $(FILTERS_DIR)/cxx_mangling_define_extern_externs.o: $(ROOT_DIR)/test/generator/cxx_mangling_define_extern_externs.cpp $(FILTERS_DIR)/cxx_mangling.h @mkdir -p $(@D) $(CXX) $(GEN_AOT_CXX_FLAGS) -c $(filter-out %.h,$^) $(GEN_AOT_INCLUDES) -o $@ $(FILTERS_DIR)/cxx_mangling_define_extern.a: $(BIN_DIR)/cxx_mangling_define_extern.generator $(FILTERS_DIR)/cxx_mangling_define_extern_externs.o @mkdir -p $(@D) $(CURDIR)/$< -g cxx_mangling_define_extern $(GEN_AOT_OUTPUTS) -o $(CURDIR)/$(FILTERS_DIR) target=$(TARGET)-no_runtime-c_plus_plus_name_mangling-user_context -f "HalideTest::cxx_mangling_define_extern" $(ROOT_DIR)/tools/makelib.sh $@ $@ $(FILTERS_DIR)/cxx_mangling_define_extern_externs.o # pyramid needs a custom arg. 
$(FILTERS_DIR)/pyramid.a: $(BIN_DIR)/pyramid.generator @mkdir -p $(@D) $(CURDIR)/$< -g pyramid -f pyramid $(GEN_AOT_OUTPUTS) -o $(CURDIR)/$(FILTERS_DIR) target=$(TARGET)-no_runtime levels=10 $(FILTERS_DIR)/string_param.a: $(BIN_DIR)/string_param.generator @mkdir -p $(@D) $(CURDIR)/$< -g string_param -f string_param $(GEN_AOT_OUTPUTS) -o $(CURDIR)/$(FILTERS_DIR) target=$(TARGET)-no_runtime rpn_expr="5 y * x +" # memory_profiler_mandelbrot need profiler set $(FILTERS_DIR)/memory_profiler_mandelbrot.a: $(BIN_DIR)/memory_profiler_mandelbrot.generator @mkdir -p $(@D) $(CURDIR)/$< -g memory_profiler_mandelbrot -f memory_profiler_mandelbrot $(GEN_AOT_OUTPUTS) -o $(CURDIR)/$(FILTERS_DIR) target=$(TARGET)-no_runtime-profile $(FILTERS_DIR)/alias_with_offset_42.a: $(BIN_DIR)/alias.generator @mkdir -p $(@D) $(CURDIR)/$< -g alias_with_offset_42 -f alias_with_offset_42 $(GEN_AOT_OUTPUTS) -o $(CURDIR)/$(FILTERS_DIR) target=$(TARGET)-no_runtime $(FILTERS_DIR)/alias_Adams2019.a: $(BIN_DIR)/alias.generator autoschedulers @mkdir -p $(@D) $(CURDIR)/$< -g alias_Adams2019 -f alias_Adams2019 $(GEN_AOT_OUTPUTS) -o $(CURDIR)/$(FILTERS_DIR) target=$(TARGET)-no_runtime -p $(BIN_ADAMS2019) $(FILTERS_DIR)/alias_Li2018.a: $(BIN_DIR)/alias.generator autoschedulers @mkdir -p $(@D) $(CURDIR)/$< -g alias_Li2018 -f alias_Li2018 $(GEN_AOT_OUTPUTS) -o $(CURDIR)/$(FILTERS_DIR) target=$(TARGET)-no_runtime -p $(BIN_LI2018) $(FILTERS_DIR)/alias_Mullapudi2016.a: $(BIN_DIR)/alias.generator autoschedulers @mkdir -p $(@D) $(CURDIR)/$< -g alias_Mullapudi2016 -f alias_Mullapudi2016 $(GEN_AOT_OUTPUTS) -o $(CURDIR)/$(FILTERS_DIR) target=$(TARGET)-no_runtime -p $(BIN_MULLAPUDI2016) METADATA_TESTER_GENERATOR_ARGS=\ input.type=uint8 input.dim=3 \ dim_only_input_buffer.type=uint8 \ untyped_input_buffer.type=uint8 untyped_input_buffer.dim=3 \ output.type=float32,float32 output.dim=3 \ input_not_nod.type=uint8 input_not_nod.dim=3 \ input_nod.dim=3 \ input_not.type=uint8 \ array_input.size=2 \ array_i8.size=2 \ array_i16.size=2 \ array_i32.size=2 \ array_h.size=2 \ buffer_array_input2.dim=3 \ buffer_array_input3.type=float32 \ buffer_array_input4.dim=3 \ buffer_array_input4.type=float32 \ buffer_array_input5.size=2 \ buffer_array_input6.size=2 \ buffer_array_input6.dim=3 \ buffer_array_input7.size=2 \ buffer_array_input7.type=float32 \ buffer_array_input8.size=2 \ buffer_array_input8.dim=3 \ buffer_array_input8.type=float32 \ buffer_f16_untyped.type=float16 \ untyped_scalar_input.type=uint8 \ array_outputs.size=2 \ array_outputs7.size=2 \ array_outputs8.size=2 \ array_outputs9.size=2 # metadata_tester is built with and without user-context. # Also note that metadata_tester (but not metadata_tester_ucon) is built as "multitarget" to verify that # the metadata names are correctly emitted. 
$(FILTERS_DIR)/metadata_tester.a: $(BIN_DIR)/metadata_tester.generator @mkdir -p $(@D) $(CURDIR)/$< -g metadata_tester -f metadata_tester -e static_library,c_header,registration,function_info_header -o $(CURDIR)/$(FILTERS_DIR) target=$(TARGET)-no_runtime,$(TARGET)-no_runtime-no_bounds_query $(METADATA_TESTER_GENERATOR_ARGS) # c_source output doesn't work properly with multitarget output $(FILTERS_DIR)/metadata_tester.halide_generated.cpp: $(BIN_DIR)/metadata_tester.generator @mkdir -p $(@D) $(CURDIR)/$< -g metadata_tester -f metadata_tester -e c_source -o $(CURDIR)/$(FILTERS_DIR) target=$(TARGET)-no_runtime $(METADATA_TESTER_GENERATOR_ARGS) $(FILTERS_DIR)/metadata_tester_ucon.a: $(BIN_DIR)/metadata_tester.generator @mkdir -p $(@D) $(CURDIR)/$< -g metadata_tester -f metadata_tester_ucon $(GEN_AOT_OUTPUTS),function_info_header -o $(CURDIR)/$(FILTERS_DIR) target=$(TARGET)-user_context-no_runtime $(METADATA_TESTER_GENERATOR_ARGS) $(BIN_DIR)/$(TARGET)/generator_aot_metadata_tester: $(FILTERS_DIR)/metadata_tester_ucon.a $(BIN_DIR)/$(TARGET)/generator_aotcpp_metadata_tester: $(FILTERS_DIR)/metadata_tester_ucon.halide_generated.cpp $(FILTERS_DIR)/multitarget.a: $(BIN_DIR)/multitarget.generator @mkdir -p $(@D) $(CURDIR)/$< -g multitarget -f "HalideTest::multitarget" $(GEN_AOT_OUTPUTS) -o $(CURDIR)/$(FILTERS_DIR) \ target=$(TARGET)-no_bounds_query-no_runtime-c_plus_plus_name_mangling,$(TARGET)-no_runtime-c_plus_plus_name_mangling \ -e assembly,bitcode,c_source,c_header,stmt_html,static_library,stmt $(FILTERS_DIR)/msan.a: $(BIN_DIR)/msan.generator @mkdir -p $(@D) $(CURDIR)/$< -g msan -f msan $(GEN_AOT_OUTPUTS) -o $(CURDIR)/$(FILTERS_DIR) target=$(TARGET)-msan $(FILTERS_DIR)/sanitizercoverage.a: $(BIN_DIR)/sanitizercoverage.generator @mkdir -p $(@D) $(CURDIR)/$< -g sanitizercoverage -f sanitizercoverage $(GEN_AOT_OUTPUTS) -o $(CURDIR)/$(FILTERS_DIR) target=$(TARGET)-sanitizer_coverage # user_context needs to be generated with user_context as the first argument to its calls $(FILTERS_DIR)/user_context.a: $(BIN_DIR)/user_context.generator @mkdir -p $(@D) $(CURDIR)/$< -g user_context $(GEN_AOT_OUTPUTS) -o $(CURDIR)/$(FILTERS_DIR) target=$(TARGET)-no_runtime-user_context # ditto for user_context_insanity $(FILTERS_DIR)/user_context_insanity.a: $(BIN_DIR)/user_context_insanity.generator @mkdir -p $(@D) $(CURDIR)/$< -g user_context_insanity $(GEN_AOT_OUTPUTS) -o $(CURDIR)/$(FILTERS_DIR) target=$(TARGET)-no_runtime-user_context # ditto for async_parallel $(FILTERS_DIR)/async_parallel.a: $(BIN_DIR)/async_parallel.generator @mkdir -p $(@D) $(CURDIR)/$< -g async_parallel $(GEN_AOT_OUTPUTS) -o $(CURDIR)/$(FILTERS_DIR) target=$(TARGET)-no_runtime-user_context # Some .generators have additional dependencies (usually due to define_extern usage). # These typically require two extra dependencies: # (1) Ensuring the extra _generator.cpp is built into the .generator. # (2) Ensuring the extra .a is linked into the final output. # TODO(srj): we really want to say "anything that depends on tiled_blur.a also depends on blur2x2.a"; # is there a way to specify that in Make? 
$(BIN_DIR)/$(TARGET)/generator_aot_tiled_blur: $(FILTERS_DIR)/blur2x2.a ifneq ($(TEST_CUDA), ) $(BIN_DIR)/$(TARGET)/generator_aot_cxx_mangling: $(FILTERS_DIR)/cxx_mangling_gpu.a endif $(BIN_DIR)/$(TARGET)/generator_aot_cxx_mangling_define_extern: $(FILTERS_DIR)/cxx_mangling.a $(BIN_DIR)/$(TARGET)/generator_aotcpp_tiled_blur: $(FILTERS_DIR)/blur2x2.halide_generated.cpp ifneq ($(TEST_CUDA), ) $(BIN_DIR)/$(TARGET)/generator_aotcpp_cxx_mangling: $(FILTERS_DIR)/cxx_mangling_gpu.halide_generated.cpp endif $(BIN_DIR)/$(TARGET)/generator_aotcpp_cxx_mangling: $(FILTERS_DIR)/cxx_mangling_externs.o $(BIN_DIR)/$(TARGET)/generator_aotcpp_cxx_mangling_define_extern: $(FILTERS_DIR)/cxx_mangling.halide_generated.cpp $(FILTERS_DIR)/cxx_mangling_externs.o $(FILTERS_DIR)/cxx_mangling_define_extern_externs.o $(BUILD_DIR)/stubuser_generator.o: $(FILTERS_DIR)/stubtest.stub.h $(FILTERS_DIR)/configure.stub.h $(BIN_DIR)/stubuser.generator: $(BUILD_DIR)/stubtest_generator.o $(BUILD_DIR)/configure_generator.o # stubtest has input and output funcs with undefined types and array sizes; this is fine for stub # usage (the types can be inferred), but for AOT compilation, we must make the types # concrete via generator args. STUBTEST_GENERATOR_ARGS=\ untyped_buffer_input.type=uint8 untyped_buffer_input.dim=3 \ simple_input.type=float32 \ array_input.type=float32 array_input.size=2 \ int_arg.size=2 \ tuple_output.type=float32,float32 \ vectorize=true $(FILTERS_DIR)/stubtest.a: $(BIN_DIR)/stubtest.generator @mkdir -p $(@D) $(CURDIR)/$< -g stubtest -f stubtest $(GEN_AOT_OUTPUTS) -o $(CURDIR)/$(FILTERS_DIR) target=$(TARGET)-no_runtime $(STUBTEST_GENERATOR_ARGS) $(FILTERS_DIR)/stubuser_auto.a: $(BIN_DIR)/stubuser.generator $(BIN_MULLAPUDI2016) @mkdir -p $(@D) $(CURDIR)/$< -g stubuser $(GEN_AOT_OUTPUTS) -o $(CURDIR)/$(FILTERS_DIR) -f stubuser_auto target=$(TARGET)-no_runtime autoscheduler=Mullapudi2016 -p $(BIN_MULLAPUDI2016) $(FILTERS_DIR)/autograd_grad.a: $(BIN_DIR)/autograd.generator $(BIN_MULLAPUDI2016) @mkdir -p $(@D) $(CURDIR)/$< -g autograd $(GEN_AOT_OUTPUTS) -o $(CURDIR)/$(FILTERS_DIR) -f autograd_grad target=$(TARGET)-no_runtime autoscheduler=Mullapudi2016 -d 1 -p $(BIN_MULLAPUDI2016) # Usually, it's considered best practice to have one Generator per # .cpp file, with the generator-name and filename matching; # nested_externs_generator.cpp is a counterexample, and thus requires # some special casing to get right. First, make a special rule to # build each of the Generators in nested_externs_generator.cpp (which # all have the form nested_externs_*). $(FILTERS_DIR)/nested_externs_%.a: $(BIN_DIR)/nested_externs.generator @mkdir -p $(@D) $(CURDIR)/$< -g nested_externs_$* $(GEN_AOT_OUTPUTS) -o $(CURDIR)/$(FILTERS_DIR) target=$(TARGET)-no_runtime-user_context-c_plus_plus_name_mangling # Similarly, gpu_multi needs two different kernels to test compilation caching. # Also requires user-context.
$(FILTERS_DIR)/gpu_multi_context_threaded_%.a: $(BIN_DIR)/gpu_multi_context_threaded.generator @mkdir -p $(@D) $(CURDIR)/$< -g gpu_multi_context_threaded_$* $(GEN_AOT_OUTPUTS) -o $(CURDIR)/$(FILTERS_DIR) target=$(TARGET)-no_runtime-user_context GEN_AOT_CXX_FLAGS=$(TEST_CXX_FLAGS) -Wno-unknown-pragmas -Wno-unused-variable GEN_AOT_INCLUDES=-I$(INCLUDE_DIR) -I$(FILTERS_DIR) -I$(ROOT_DIR)/src/runtime -I$(ROOT_DIR)/test/common -I $(ROOT_DIR)/apps/support -I $(SRC_DIR)/runtime -I$(ROOT_DIR)/tools GEN_AOT_LD_FLAGS=$(COMMON_LD_FLAGS) ifneq ($(TEST_METAL), ) # Unlike cuda and opencl, which dynamically go find the appropriate symbols, metal requires actual linking. GEN_AOT_LD_FLAGS+=$(METAL_LD_FLAGS) endif # By default, %_aottest.cpp depends on $(FILTERS_DIR)/%.a/.h (but not libHalide). $(BIN_DIR)/$(TARGET)/generator_aot_%: $(ROOT_DIR)/test/generator/%_aottest.cpp $(FILTERS_DIR)/%.a $(FILTERS_DIR)/%.h $(RUNTIME_EXPORTED_INCLUDES) $(BIN_DIR)/$(TARGET)/runtime.a @mkdir -p $(@D) $(CXX) $(GEN_AOT_CXX_FLAGS) $(filter %.cpp %.o %.a,$^) $(GEN_AOT_INCLUDES) $(GEN_AOT_LD_FLAGS) -o $@ # Also make AOT testing targets that depends on the .cpp output (rather than .a). $(BIN_DIR)/$(TARGET)/generator_aotcpp_%: $(ROOT_DIR)/test/generator/%_aottest.cpp $(FILTERS_DIR)/%.halide_generated.cpp $(FILTERS_DIR)/%.h $(RUNTIME_EXPORTED_INCLUDES) $(BIN_DIR)/$(TARGET)/runtime.a @mkdir -p $(@D) $(CXX) $(GEN_AOT_CXX_FLAGS) $(filter %.cpp %.o %.a,$^) $(OPTIMIZE) $(GEN_AOT_INCLUDES) $(GEN_AOT_LD_FLAGS) -o $@ # MSAN test doesn't use the standard runtime $(BIN_DIR)/$(TARGET)/generator_aot_msan: $(ROOT_DIR)/test/generator/msan_aottest.cpp $(FILTERS_DIR)/msan.a $(FILTERS_DIR)/msan.h $(RUNTIME_EXPORTED_INCLUDES) @mkdir -p $(@D) $(CXX) $(GEN_AOT_CXX_FLAGS) $(filter-out %.h,$^) $(GEN_AOT_INCLUDES) $(GEN_AOT_LD_FLAGS) -o $@ # SanitizerCoverage test doesn't use the standard runtime $(BIN_DIR)/$(TARGET)/generator_aot_sanitizercoverage: $(ROOT_DIR)/test/generator/sanitizercoverage_aottest.cpp $(FILTERS_DIR)/sanitizercoverage.a $(FILTERS_DIR)/sanitizercoverage.h $(RUNTIME_EXPORTED_INCLUDES) @mkdir -p $(@D) $(CXX) $(GEN_AOT_CXX_FLAGS) $(filter-out %.h,$^) $(GEN_AOT_INCLUDES) $(GEN_AOT_LD_FLAGS) -o $@ # SanitizerCoverage test will never work with C++ backend $(BIN_DIR)/$(TARGET)/generator_aotcpp_sanitizercoverage: $(ROOT_DIR)/test/generator/sanitizercoverage_aottest.cpp @mkdir -p $(@D) echo "SanitizerCoverage test will never work with C++ backend" exit 1 # alias has additional deps to link in $(BIN_DIR)/$(TARGET)/generator_aot_alias: $(ROOT_DIR)/test/generator/alias_aottest.cpp $(FILTERS_DIR)/alias.a $(FILTERS_DIR)/alias_with_offset_42.a $(FILTERS_DIR)/alias_Adams2019.a $(FILTERS_DIR)/alias_Li2018.a $(FILTERS_DIR)/alias_Mullapudi2016.a $(RUNTIME_EXPORTED_INCLUDES) $(BIN_DIR)/$(TARGET)/runtime.a @mkdir -p $(@D) $(CXX) $(GEN_AOT_CXX_FLAGS) $(filter %.cpp %.o %.a,$^) $(GEN_AOT_INCLUDES) $(GEN_AOT_LD_FLAGS) -o $@ $(BIN_DIR)/$(TARGET)/generator_aotcpp_alias: $(ROOT_DIR)/test/generator/alias_aottest.cpp $(FILTERS_DIR)/alias.halide_generated.cpp $(FILTERS_DIR)/alias_with_offset_42.halide_generated.cpp $(FILTERS_DIR)/alias_Adams2019.halide_generated.cpp $(FILTERS_DIR)/alias_Li2018.halide_generated.cpp $(FILTERS_DIR)/alias_Mullapudi2016.halide_generated.cpp $(RUNTIME_EXPORTED_INCLUDES) $(BIN_DIR)/$(TARGET)/runtime.a @mkdir -p $(@D) $(CXX) $(GEN_AOT_CXX_FLAGS) $(filter %.cpp %.o %.a,$^) $(GEN_AOT_INCLUDES) $(GEN_AOT_LD_FLAGS) -o $@ # autograd has additional deps to link in $(BIN_DIR)/$(TARGET)/generator_aot_autograd: 
$(ROOT_DIR)/test/generator/autograd_aottest.cpp $(FILTERS_DIR)/autograd.a $(FILTERS_DIR)/autograd_grad.a $(RUNTIME_EXPORTED_INCLUDES) $(BIN_DIR)/$(TARGET)/runtime.a @mkdir -p $(@D) $(CXX) $(GEN_AOT_CXX_FLAGS) $(filter %.cpp %.o %.a,$^) $(GEN_AOT_INCLUDES) $(GEN_AOT_LD_FLAGS) -o $@ $(BIN_DIR)/$(TARGET)/generator_aotcpp_autograd: $(ROOT_DIR)/test/generator/autograd_aottest.cpp $(FILTERS_DIR)/autograd.halide_generated.cpp $(FILTERS_DIR)/autograd_grad.halide_generated.cpp $(RUNTIME_EXPORTED_INCLUDES) $(BIN_DIR)/$(TARGET)/runtime.a @mkdir -p $(@D) $(CXX) $(GEN_AOT_CXX_FLAGS) $(filter %.cpp %.o %.a,$^) $(GEN_AOT_INCLUDES) $(GEN_AOT_LD_FLAGS) -o $@ # nested_externs has additional deps to link in $(BIN_DIR)/$(TARGET)/generator_aot_nested_externs: $(ROOT_DIR)/test/generator/nested_externs_aottest.cpp $(FILTERS_DIR)/nested_externs_root.a $(FILTERS_DIR)/nested_externs_inner.a $(FILTERS_DIR)/nested_externs_combine.a $(FILTERS_DIR)/nested_externs_leaf.a $(RUNTIME_EXPORTED_INCLUDES) $(BIN_DIR)/$(TARGET)/runtime.a @mkdir -p $(@D) $(CXX) $(GEN_AOT_CXX_FLAGS) $(filter %.cpp %.o %.a,$^) $(GEN_AOT_INCLUDES) $(GEN_AOT_LD_FLAGS) -o $@ $(BIN_DIR)/$(TARGET)/generator_aotcpp_nested_externs: $(ROOT_DIR)/test/generator/nested_externs_aottest.cpp $(FILTERS_DIR)/nested_externs_root.halide_generated.cpp $(FILTERS_DIR)/nested_externs_inner.halide_generated.cpp $(FILTERS_DIR)/nested_externs_combine.halide_generated.cpp $(FILTERS_DIR)/nested_externs_leaf.halide_generated.cpp $(RUNTIME_EXPORTED_INCLUDES) $(BIN_DIR)/$(TARGET)/runtime.a @mkdir -p $(@D) $(CXX) $(GEN_AOT_CXX_FLAGS) $(filter %.cpp %.o %.a,$^) $(GEN_AOT_INCLUDES) $(GEN_AOT_LD_FLAGS) -o $@ # The gpu object lifetime test needs the debug runtime $(BIN_DIR)/$(TARGET)/generator_aot_gpu_object_lifetime: $(ROOT_DIR)/test/generator/gpu_object_lifetime_aottest.cpp $(FILTERS_DIR)/gpu_object_lifetime.a $(FILTERS_DIR)/gpu_object_lifetime.h $(RUNTIME_EXPORTED_INCLUDES) $(BIN_DIR)/$(TARGET)-debug/runtime.a @mkdir -p $(@D) $(CXX) $(GEN_AOT_CXX_FLAGS) $(filter %.cpp %.o %.a,$^) $(GEN_AOT_INCLUDES) $(GEN_AOT_LD_FLAGS) $(TEST_LD_FLAGS) -o $@ # acquire_release explicitly uses CUDA/OpenCL APIs, so link those here. $(BIN_DIR)/$(TARGET)/generator_aot_acquire_release: $(ROOT_DIR)/test/generator/acquire_release_aottest.cpp $(FILTERS_DIR)/acquire_release.a $(FILTERS_DIR)/acquire_release.h $(RUNTIME_EXPORTED_INCLUDES) $(BIN_DIR)/$(TARGET)/runtime.a @mkdir -p $(@D) $(CXX) $(GEN_AOT_CXX_FLAGS) $(filter %.cpp %.o %.a,$^) $(GEN_AOT_INCLUDES) $(GEN_AOT_LD_FLAGS) $(OPENCL_LD_FLAGS) $(CUDA_LD_FLAGS) -o $@ $(BIN_DIR)/$(TARGET)/generator_aotcpp_acquire_release: $(ROOT_DIR)/test/generator/acquire_release_aottest.cpp $(FILTERS_DIR)/acquire_release.halide_generated.cpp $(FILTERS_DIR)/acquire_release.h $(RUNTIME_EXPORTED_INCLUDES) $(BIN_DIR)/$(TARGET)/runtime.a @mkdir -p $(@D) $(CXX) $(GEN_AOT_CXX_FLAGS) $(filter %.cpp %.o %.a,$^) $(GEN_AOT_INCLUDES) $(GEN_AOT_LD_FLAGS) $(OPENCL_LD_FLAGS) $(CUDA_LD_FLAGS) -o $@ # define_extern_opencl explicitly uses OpenCL APIs, so link those here. 
$(BIN_DIR)/$(TARGET)/generator_aot_define_extern_opencl: $(ROOT_DIR)/test/generator/define_extern_opencl_aottest.cpp $(FILTERS_DIR)/define_extern_opencl.a $(FILTERS_DIR)/define_extern_opencl.h $(RUNTIME_EXPORTED_INCLUDES) $(BIN_DIR)/$(TARGET)/runtime.a @mkdir -p $(@D) $(CXX) $(GEN_AOT_CXX_FLAGS) $(filter %.cpp %.o %.a,$^) $(GEN_AOT_INCLUDES) $(GEN_AOT_LD_FLAGS) $(OPENCL_LD_FLAGS) -o $@ $(BIN_DIR)/$(TARGET)/generator_aotcpp_define_extern_opencl: $(ROOT_DIR)/test/generator/define_extern_opencl_aottest.cpp $(FILTERS_DIR)/define_extern_opencl.halide_generated.cpp $(FILTERS_DIR)/define_extern_opencl.h $(RUNTIME_EXPORTED_INCLUDES) $(BIN_DIR)/$(TARGET)/runtime.a @mkdir -p $(@D) $(CXX) $(GEN_AOT_CXX_FLAGS) $(filter %.cpp %.o %.a,$^) $(GEN_AOT_INCLUDES) $(GEN_AOT_LD_FLAGS) $(OPENCL_LD_FLAGS) -o $@ # By default, %_jittest.cpp depends on libHalide, plus the stubs for the Generator. These are external tests that use the JIT. $(BIN_DIR)/generator_jit_%: $(ROOT_DIR)/test/generator/%_jittest.cpp $(BIN_DIR)/libHalide.$(SHARED_EXT) $(INCLUDE_DIR)/Halide.h $(FILTERS_DIR)/%.stub.h $(BUILD_DIR)/%_generator.o @mkdir -p $(@D) $(CXX) -g $(TEST_CXX_FLAGS) $(filter %.cpp %.o %.a,$^) -I$(INCLUDE_DIR) -I$(FILTERS_DIR) -I $(ROOT_DIR)/apps/support $(TEST_LD_FLAGS) -o $@ # stubuser is run with autoscheduling too $(BIN_DIR)/$(TARGET)/generator_aot_stubuser: $(ROOT_DIR)/test/generator/stubuser_aottest.cpp $(FILTERS_DIR)/stubuser.a $(FILTERS_DIR)/stubuser.h $(FILTERS_DIR)/stubuser_auto.a $(FILTERS_DIR)/stubuser_auto.h $(RUNTIME_EXPORTED_INCLUDES) $(BIN_DIR)/$(TARGET)/runtime.a @mkdir -p $(@D) $(CXX) $(GEN_AOT_CXX_FLAGS) $(filter %.cpp %.o %.a,$^) $(GEN_AOT_INCLUDES) $(GEN_AOT_LD_FLAGS) -o $@ # generator_aot_multitarget is run multiple times, with different env vars. generator_aot_multitarget: $(BIN_DIR)/$(TARGET)/generator_aot_multitarget @mkdir -p $(@D) HL_MULTITARGET_TEST_USE_NOBOUNDSQUERY_FEATURE=0 $(CURDIR)/$< HL_MULTITARGET_TEST_USE_NOBOUNDSQUERY_FEATURE=1 $(CURDIR)/$< @-echo # gpu_multi_context_threaded has additional deps to link in $(BIN_DIR)/$(TARGET)/generator_aot_gpu_multi_context_threaded: $(ROOT_DIR)/test/generator/gpu_multi_context_threaded_aottest.cpp \ $(FILTERS_DIR)/gpu_multi_context_threaded_add.a \ $(FILTERS_DIR)/gpu_multi_context_threaded_mul.a \ $(RUNTIME_EXPORTED_INCLUDES) $(BIN_DIR)/$(TARGET)/runtime.a @mkdir -p $(@D) $(CXX) $(GEN_AOT_CXX_FLAGS) $(filter %.cpp %.o %.a,$^) $(GEN_AOT_INCLUDES) $(GEN_AOT_LD_FLAGS) $(OPENCL_LD_FLAGS) $(CUDA_LD_FLAGS) -o $@ $(BIN_DIR)/$(TARGET)/generator_aotcpp_gpu_multi_context_threaded: $(ROOT_DIR)/test/generator/gpu_multi_context_threaded_aottest.cpp \ $(FILTERS_DIR)/gpu_multi_context_threaded_add.halide_generated.cpp \ $(FILTERS_DIR)/gpu_multi_context_threaded_mul.halide_generated.cpp \ $(RUNTIME_EXPORTED_INCLUDES) $(BIN_DIR)/$(TARGET)/runtime.a @mkdir -p $(@D) $(CXX) $(GEN_AOT_CXX_FLAGS) $(filter %.cpp %.o %.a,$^) $(GEN_AOT_INCLUDES) $(GEN_AOT_LD_FLAGS) $(OPENCL_LD_FLAGS) $(CUDA_LD_FLAGS) -o $@ # nested externs doesn't actually contain a generator named # "nested_externs", and has no internal tests in any case. test_generator_nested_externs: @echo "Skipping" # gpu_multi doesn't actually contain a generator named # "gpu_multi", and has no internal tests in any case. test_generator_gpu_multi: @echo "Skipping" # gpu_multi_context_threaded doesn't actually contain a generator named # "gpu_multi_context_threaded", and has no internal tests in any case.
test_generator_gpu_multi_context_threaded: @echo "Skipping" $(BUILD_DIR)/RunGenMain.o: $(ROOT_DIR)/tools/RunGenMain.cpp $(RUNTIME_EXPORTED_INCLUDES) $(ROOT_DIR)/tools/RunGen.h @mkdir -p $(@D) $(CXX) -c $< $(filter-out -g, $(TEST_CXX_FLAGS)) $(OPTIMIZE) -Os $(IMAGE_IO_CXX_FLAGS) -I$(INCLUDE_DIR) -I $(SRC_DIR)/runtime -I$(ROOT_DIR)/tools -o $@ $(FILTERS_DIR)/%.registration.o: $(FILTERS_DIR)/%.registration.cpp @mkdir -p $(@D) $(CXX) -c $< $(TEST_CXX_FLAGS) -o $@ $(FILTERS_DIR)/%.rungen: $(BUILD_DIR)/RunGenMain.o $(BIN_DIR)/$(TARGET)/runtime.a $(FILTERS_DIR)/%.registration.o $(FILTERS_DIR)/%.a @mkdir -p $(@D) $(CXX) -std=c++17 -I$(FILTERS_DIR) \ $(BUILD_DIR)/RunGenMain.o \ $(BIN_DIR)/$(TARGET)/runtime.a \ $(call alwayslink,$(FILTERS_DIR)/$*.registration.o) \ $(FILTERS_DIR)/$*.a \ $(GEN_AOT_LD_FLAGS) $(IMAGE_IO_LIBS) -o $@ RUNARGS ?= $(FILTERS_DIR)/%.run: $(FILTERS_DIR)/%.rungen $(CURDIR)/$< $(RUNARGS) @-echo $(FILTERS_DIR)/%.registration_extra.o: $(FILTERS_DIR)/%.registration.cpp @mkdir -p $(@D) $(CXX) -c $< $(TEST_CXX_FLAGS) -DHALIDE_REGISTER_EXTRA_KEY_VALUE_PAIRS_FUNC=halide_register_extra_key_value_pairs_$* -o $@ # Test the registration mechanism, independent of RunGen. # Note that this depends on the registration_extra.o (rather than registration.o) # because it compiles with HALIDE_REGISTER_EXTRA_KEY_VALUE_PAIRS_FUNC defined. $(FILTERS_DIR)/registration_test: $(ROOT_DIR)/test/generator/registration_test.cpp \ $(BIN_DIR)/$(TARGET)/runtime.a \ $(FILTERS_DIR)/blur2x2.registration_extra.o $(FILTERS_DIR)/blur2x2.a \ $(FILTERS_DIR)/cxx_mangling.registration_extra.o $(FILTERS_DIR)/cxx_mangling.a \ $(FILTERS_DIR)/pyramid.registration_extra.o $(FILTERS_DIR)/pyramid.a @mkdir -p $(@D) $(CXX) $(GEN_AOT_CXX_FLAGS) $(GEN_AOT_INCLUDES) \ $(ROOT_DIR)/test/generator/registration_test.cpp \ $(FILTERS_DIR)/blur2x2.registration_extra.o \ $(FILTERS_DIR)/cxx_mangling.registration_extra.o \ $(FILTERS_DIR)/pyramid.registration_extra.o \ $(FILTERS_DIR)/blur2x2.a \ $(FILTERS_DIR)/cxx_mangling.a \ $(FILTERS_DIR)/pyramid.a \ $(BIN_DIR)/$(TARGET)/runtime.a \ $(GEN_AOT_LD_FLAGS) $(IMAGE_IO_LIBS) -o $@ # Test RunGen itself $(FILTERS_DIR)/rungen_test: $(ROOT_DIR)/test/generator/rungen_test.cpp \ $(BIN_DIR)/$(TARGET)/runtime.a \ $(FILTERS_DIR)/example.registration.o \ $(FILTERS_DIR)/example.a @mkdir -p $(@D) $(CXX) $(GEN_AOT_CXX_FLAGS) $(IMAGE_IO_CXX_FLAGS) $(GEN_AOT_INCLUDES) \ $(ROOT_DIR)/test/generator/rungen_test.cpp \ $(BIN_DIR)/$(TARGET)/runtime.a \ $(call alwayslink,$(FILTERS_DIR)/example.registration.o) \ $(FILTERS_DIR)/example.a \ $(GEN_AOT_LD_FLAGS) $(IMAGE_IO_LIBS) -o $@ # Test linking multiple filters into a single RunGen instance $(FILTERS_DIR)/multi_rungen: $(BUILD_DIR)/RunGenMain.o $(BIN_DIR)/$(TARGET)/runtime.a \ $(FILTERS_DIR)/blur2x2.registration.o $(FILTERS_DIR)/blur2x2.a \ $(FILTERS_DIR)/cxx_mangling.registration.o $(FILTERS_DIR)/cxx_mangling.a \ $(FILTERS_DIR)/pyramid.registration.o $(FILTERS_DIR)/pyramid.a @mkdir -p $(@D) $(CXX) -std=c++17 -I$(FILTERS_DIR) \ $(BUILD_DIR)/RunGenMain.o \ $(BIN_DIR)/$(TARGET)/runtime.a \ $(call alwayslink,$(FILTERS_DIR)/blur2x2.registration.o) \ $(call alwayslink,$(FILTERS_DIR)/cxx_mangling.registration.o) \ $(call alwayslink,$(FILTERS_DIR)/pyramid.registration.o) \ $(FILTERS_DIR)/blur2x2.a \ $(FILTERS_DIR)/cxx_mangling.a \ $(FILTERS_DIR)/pyramid.a \ $(GEN_AOT_LD_FLAGS) $(IMAGE_IO_LIBS) -o $@ # Test concatenating multiple registration files as well, which should also work $(FILTERS_DIR)/multi_rungen2.registration.cpp: $(FILTERS_DIR)/blur2x2.registration.cpp 
$(FILTERS_DIR)/cxx_mangling.registration.cpp $(FILTERS_DIR)/pyramid.registration.cpp cat $^ > $@ $(FILTERS_DIR)/multi_rungen2: $(BUILD_DIR)/RunGenMain.o $(BIN_DIR)/$(TARGET)/runtime.a \ $(FILTERS_DIR)/multi_rungen2.registration.cpp \ $(FILTERS_DIR)/blur2x2.a \ $(FILTERS_DIR)/cxx_mangling.a \ $(FILTERS_DIR)/pyramid.a @mkdir -p $(@D) $(CXX) -std=c++17 -I$(FILTERS_DIR) $^ $(GEN_AOT_LD_FLAGS) $(IMAGE_IO_LIBS) -o $@ $(BIN_DIR)/tutorial_%: $(ROOT_DIR)/tutorial/%.cpp $(BIN_DIR)/libHalide.$(SHARED_EXT) $(INCLUDE_DIR)/Halide.h $(INCLUDE_DIR)/HalideRuntime.h @ if [[ $@ == *_run ]]; then \ export TUTORIAL=$* ;\ export LESSON=`echo $${TUTORIAL} | cut -b1-9`; \ make -f $(THIS_MAKEFILE) tutorial_$${TUTORIAL/run/generate}; \ $(CXX) $(TUTORIAL_CXX_FLAGS) $(IMAGE_IO_CXX_FLAGS) $(OPTIMIZE_FOR_BUILD_TIME) $< \ -I$(TMP_DIR) -I$(INCLUDE_DIR) $(TMP_DIR)/$${LESSON}_*.a $(GEN_AOT_LD_FLAGS) $(IMAGE_IO_LIBS) -lz -o $@; \ else \ $(CXX) $(TUTORIAL_CXX_FLAGS) $(IMAGE_IO_CXX_FLAGS) $(OPTIMIZE_FOR_BUILD_TIME) $< \ -I$(INCLUDE_DIR) -I$(ROOT_DIR)/tools $(TEST_LD_FLAGS) $(IMAGE_IO_LIBS) -o $@;\ fi $(BIN_DIR)/tutorial_lesson_15_generators: $(ROOT_DIR)/tutorial/lesson_15_generators.cpp $(BIN_DIR)/libHalide.$(SHARED_EXT) $(INCLUDE_DIR)/Halide.h $(BUILD_DIR)/GenGen.o $(CXX) $(TUTORIAL_CXX_FLAGS) $(IMAGE_IO_CXX_FLAGS) $(OPTIMIZE_FOR_BUILD_TIME) $< $(BUILD_DIR)/GenGen.o \ -I$(INCLUDE_DIR) $(TEST_LD_FLAGS) $(IMAGE_IO_LIBS) -o $@ tutorial_lesson_15_generators: $(ROOT_DIR)/tutorial/lesson_15_generators_usage.sh $(BIN_DIR)/tutorial_lesson_15_generators @-mkdir -p $(TMP_DIR) cp $(BIN_DIR)/tutorial_lesson_15_generators $(TMP_DIR)/lesson_15_generate; \ cd $(TMP_DIR); \ PATH="$${PATH}:$(CURDIR)/$(BIN_DIR)" source $(ROOT_DIR)/tutorial/lesson_15_generators_usage.sh @-echo $(BIN_DIR)/tutorial_lesson_16_rgb_generate: $(ROOT_DIR)/tutorial/lesson_16_rgb_generate.cpp $(BIN_DIR)/libHalide.$(SHARED_EXT) $(INCLUDE_DIR)/Halide.h $(BUILD_DIR)/GenGen.o $(CXX) $(TUTORIAL_CXX_FLAGS) $(IMAGE_IO_CXX_FLAGS) $(OPTIMIZE_FOR_BUILD_TIME) $< $(BUILD_DIR)/GenGen.o \ -I$(INCLUDE_DIR) $(TEST_LD_FLAGS) $(IMAGE_IO_LIBS) -o $@ $(BIN_DIR)/tutorial_lesson_16_rgb_run: $(ROOT_DIR)/tutorial/lesson_16_rgb_run.cpp $(BIN_DIR)/tutorial_lesson_16_rgb_generate @-mkdir -p $(TMP_DIR) # Run the generator $(BIN_DIR)/tutorial_lesson_16_rgb_generate -g brighten -o $(TMP_DIR) -f brighten_planar target=host layout=planar $(BIN_DIR)/tutorial_lesson_16_rgb_generate -g brighten -o $(TMP_DIR) -f brighten_interleaved target=host-no_runtime layout=interleaved $(BIN_DIR)/tutorial_lesson_16_rgb_generate -g brighten -o $(TMP_DIR) -f brighten_either target=host-no_runtime layout=either $(BIN_DIR)/tutorial_lesson_16_rgb_generate -g brighten -o $(TMP_DIR) -f brighten_specialized target=host-no_runtime layout=specialized # Compile the runner $(CXX) $(TUTORIAL_CXX_FLAGS) $(IMAGE_IO_CXX_FLAGS) $(OPTIMIZE_FOR_BUILD_TIME) $< \ -I$(INCLUDE_DIR) -L$(BIN_DIR) -I $(TMP_DIR) $(TMP_DIR)/brighten_*.a \ -lHalide $(TEST_LD_FLAGS) $(COMMON_LD_FLAGS) $(IMAGE_IO_LIBS) -o $@ @-echo $(BIN_DIR)/tutorial_lesson_21_auto_scheduler_generate: $(ROOT_DIR)/tutorial/lesson_21_auto_scheduler_generate.cpp $(BIN_DIR)/libHalide.$(SHARED_EXT) $(INCLUDE_DIR)/Halide.h $(BUILD_DIR)/GenGen.o $(CXX) $(TUTORIAL_CXX_FLAGS) $(IMAGE_IO_CXX_FLAGS) $(OPTIMIZE_FOR_BUILD_TIME) $< $(BUILD_DIR)/GenGen.o \ -I$(INCLUDE_DIR) $(TEST_LD_FLAGS) $(IMAGE_IO_LIBS) -o $@ # The values are: # - the maximum level of parallelism available, # - the size of the last-level cache (in bytes), # - the ratio between the cost of a miss at the last level cache and 
the cost # of arithmetic on the target architecture # ...in that order. LESSON_21_AUTOSCHEDULER_PARAMS=\ autoscheduler=Mullapudi2016 \ autoscheduler.parallelism=32 \ autoscheduler.last_level_cache_size=16777216 \ autoscheduler.balance=40 $(BIN_DIR)/tutorial_lesson_21_auto_scheduler_run: $(ROOT_DIR)/tutorial/lesson_21_auto_scheduler_run.cpp $(BIN_DIR)/tutorial_lesson_21_auto_scheduler_generate $(BIN_MULLAPUDI2016) @-mkdir -p $(TMP_DIR) # Run the generator $(BIN_DIR)/tutorial_lesson_21_auto_scheduler_generate -g auto_schedule_gen -o $(TMP_DIR) -e static_library,c_header,schedule -f auto_schedule_false target=host $(BIN_DIR)/tutorial_lesson_21_auto_scheduler_generate -g auto_schedule_gen -o $(TMP_DIR) -e static_library,c_header,schedule -f auto_schedule_true target=host-no_runtime $(LESSON_21_AUTOSCHEDULER_PARAMS) -p $(BIN_MULLAPUDI2016) # Compile the runner $(CXX) $(TUTORIAL_CXX_FLAGS) $(IMAGE_IO_CXX_FLAGS) $(OPTIMIZE_FOR_BUILD_TIME) $< \ -I$(INCLUDE_DIR) -L$(BIN_DIR) -I $(TMP_DIR) $(TMP_DIR)/auto_schedule_*.a \ -lHalide $(TEST_LD_FLAGS) $(COMMON_LD_FLAGS) $(IMAGE_IO_LIBS) -o $@ @-echo test_internal: $(BIN_DIR)/test_internal @-mkdir -p $(TMP_DIR) cd $(TMP_DIR) ; $(CURDIR)/$< @-echo correctness_%: $(BIN_DIR)/correctness_% @-mkdir -p $(TMP_DIR) cd $(TMP_DIR) ; $(CURDIR)/$< @-echo correctness_opencl_runtime: $(BIN_DIR)/$(TARGET)/correctness_opencl_runtime @-mkdir -p $(TMP_DIR) cd $(TMP_DIR) ; $(CURDIR)/$< @-echo quiet_correctness_%: $(BIN_DIR)/correctness_% @-mkdir -p $(TMP_DIR) @cd $(TMP_DIR) ; ( $(CURDIR)/$< 2>stderr_$*.txt > stdout_$*.txt && echo -n . ) || ( echo ; echo FAILED TEST: $* ; cat stdout_$*.txt stderr_$*.txt ; false ) valgrind_%: $(BIN_DIR)/correctness_% @-mkdir -p $(TMP_DIR) cd $(TMP_DIR) ; valgrind --error-exitcode=-1 $(CURDIR)/$< @-echo # Use Intel SDE to emulate an avx 512 processor. 
avx512_%: $(BIN_DIR)/correctness_% @-mkdir -p $(TMP_DIR) cd $(TMP_DIR) ; sde -cnl -- $(CURDIR)/$< cd $(TMP_DIR) ; sde -knl -- $(CURDIR)/$< @-echo # This test is *supposed* to do an out-of-bounds read, so skip it when testing under valgrind valgrind_tracing_stack: $(BIN_DIR)/correctness_tracing_stack @-mkdir -p $(TMP_DIR) cd $(TMP_DIR) ; $(CURDIR)/$(BIN_DIR)/correctness_tracing_stack @-echo performance_%: $(BIN_DIR)/performance_% @-mkdir -p $(TMP_DIR) cd $(TMP_DIR) ; $(CURDIR)/$< @-echo error_%: $(BIN_DIR)/error_% @-mkdir -p $(TMP_DIR) cd $(TMP_DIR) ; $(CURDIR)/$< 2>&1 | egrep --q "terminating with uncaught exception|terminating due to uncaught exception|^terminate called|^Error|Assertion.*failed" @-echo warning_%: $(BIN_DIR)/warning_% @-mkdir -p $(TMP_DIR) cd $(TMP_DIR) ; $(CURDIR)/$< 2>&1 | egrep --q "^Warning" @-echo runtime_common: # nothing runtime_%: $(BIN_DIR)/runtime_% @-mkdir -p $(TMP_DIR) cd $(TMP_DIR) ; $(CURDIR)/$< @-echo generator_jit_%: $(BIN_DIR)/generator_jit_% @-mkdir -p $(TMP_DIR) cd $(TMP_DIR) ; $(CURDIR)/$< @-echo generator_aot_%: $(BIN_DIR)/$(TARGET)/generator_aot_% @-mkdir -p $(TMP_DIR) cd $(TMP_DIR) ; $(CURDIR)/$< @-echo generator_aotcpp_%: $(BIN_DIR)/$(TARGET)/generator_aotcpp_% @-mkdir -p $(TMP_DIR) cd $(TMP_DIR) ; $(CURDIR)/$< @-echo $(TMP_DIR)/images/%.png: $(ROOT_DIR)/tutorial/images/%.png @-mkdir -p $(TMP_DIR)/images cp $< $(TMP_DIR)/images/ tutorial_%: $(BIN_DIR)/tutorial_% $(TMP_DIR)/images/rgb.png $(TMP_DIR)/images/gray.png @-mkdir -p $(TMP_DIR) cd $(TMP_DIR) ; $(CURDIR)/$< @-echo # Skip the serialization tutorial, if we didn't build -DWITH_SERIALIZATION ifeq (,$(shell which flatc)) tutorial_lesson_23_serialization: @echo "Skipping tutorial lesson 23 (serialization not enabled) ..." endif test_mullapudi2016: $(MULLAPUDI2016_TESTS:$(ROOT_DIR)/test/autoschedulers/mullapudi2016/%.cpp=mullapudi2016_%) mullapudi2016_%: $(BIN_DIR)/mullapudi2016_% $(BIN_MULLAPUDI2016) @-mkdir -p $(TMP_DIR) cd $(TMP_DIR) ; $(CURDIR)/$< $(realpath $(BIN_MULLAPUDI2016)) @-echo test_li2018: $(LI2018_TESTS:$(ROOT_DIR)/test/autoschedulers/li2018/%.cpp=li2018_%) li2018_%: $(BIN_DIR)/li2018_% $(BIN_LI2018) @-mkdir -p $(TMP_DIR) cd $(TMP_DIR) ; $(CURDIR)/$< $(realpath $(BIN_LI2018)) @-echo test_adams2019: $(ADAMS2019_TESTS:$(ROOT_DIR)/test/autoschedulers/adams2019/%.cpp=adams2019_%) adams2019_test: $(BIN_DIR)/adams2019_test $(BIN_ADAMS2019) $(SRC_DIR)/autoschedulers/adams2019/baseline.weights @-mkdir -p $(TMP_DIR) cd $(TMP_DIR) ; $(CURDIR)/$< $(realpath $(BIN_ADAMS2019)) $(realpath $(SRC_DIR)/autoschedulers/adams2019/baseline.weights) @-echo time_compilation_test_%: $(BIN_DIR)/test_% $(TIME_COMPILATION) compile_times_correctness.csv make -f $(THIS_MAKEFILE) $(@:time_compilation_test_%=test_%) time_compilation_performance_%: $(BIN_DIR)/performance_% $(TIME_COMPILATION) compile_times_performance.csv make -f $(THIS_MAKEFILE) $(@:time_compilation_performance_%=performance_%) time_compilation_generator_%: $(BIN_DIR)/%.generator $(TIME_COMPILATION) compile_times_generator.csv make -f $(THIS_MAKEFILE) $(@:time_compilation_generator_%=$(FILTERS_DIR)/%.a) TEST_APPS=\ bilateral_grid \ bgu \ blur \ c_backend \ camera_pipe \ conv_layer \ fft \ hist \ interpolate \ lens_blur \ linear_algebra \ local_laplacian \ max_filter \ nl_means \ onnx \ resize \ resnet_50 \ stencil_chain \ wavelet TEST_APPS_DEPS=$(TEST_APPS:%=%_test_app) BUILD_APPS_DEPS=$(TEST_APPS:%=%_build_app) $(BUILD_APPS_DEPS): distrib @echo Building app $(@:%_build_app=%) for ${HL_TARGET}... 
@$(MAKE) -C $(ROOT_DIR)/apps/$(@:%_build_app=%) build \ HALIDE_DISTRIB_PATH=$(CURDIR)/$(DISTRIB_DIR) \ BIN_DIR=$(CURDIR)/$(BIN_DIR)/apps/$(@:%_build_app=%)/bin \ HL_TARGET=$(HL_TARGET) \ || exit 1 ; \ $(TEST_APPS_DEPS): distrib @echo Testing app $(@:%_test_app=%) for ${HL_TARGET}... @$(MAKE) -C $(ROOT_DIR)/apps/$(@:%_test_app=%) test \ HALIDE_DISTRIB_PATH=$(CURDIR)/$(DISTRIB_DIR) \ BIN_DIR=$(CURDIR)/$(BIN_DIR)/apps/$(@:%_test_app=%)/bin \ HL_TARGET=$(HL_TARGET) \ || exit 1 ; \ .PHONY: test_apps build_apps $(BUILD_APPS_DEPS) build_apps: $(BUILD_APPS_DEPS) test_apps: $(BUILD_APPS_DEPS) $(MAKE) -f $(THIS_MAKEFILE) -j1 $(TEST_APPS_DEPS) build_hannk: distrib @echo Building apps/hannk for ${HL_TARGET}... @$(MAKE) -C $(ROOT_DIR)/apps/hannk build \ HALIDE_DISTRIB_PATH=$(CURDIR)/$(DISTRIB_DIR) \ BIN_DIR=$(CURDIR)/$(BIN_DIR)/apps/hannk/bin \ HL_TARGET=$(HL_TARGET) \ || exit 1 ; \ test_hannk: build_hannk @echo Testing apps/hannk for ${HL_TARGET}... @$(MAKE) -C $(ROOT_DIR)/apps/hannk test \ HALIDE_DISTRIB_PATH=$(CURDIR)/$(DISTRIB_DIR) \ BIN_DIR=$(CURDIR)/$(BIN_DIR)/apps/hannk/bin \ HL_TARGET=$(HL_TARGET) \ || exit 1 ; \ BENCHMARK_APPS=\ bilateral_grid \ camera_pipe \ lens_blur \ local_laplacian \ nl_means \ stencil_chain $(BENCHMARK_APPS): distrib @echo Building $@ for ${HL_TARGET}... @$(MAKE) -C $(ROOT_DIR)/apps/$@ \ $(CURDIR)/$(BIN_DIR)/apps/$@/bin/$(HL_TARGET)/$@.rungen \ HALIDE_DISTRIB_PATH=$(CURDIR)/$(DISTRIB_DIR) \ BIN_DIR=$(CURDIR)/$(BIN_DIR)/apps/$@/bin \ HL_TARGET=$(HL_TARGET) \ > /dev/null \ || exit 1 .PHONY: benchmark_apps $(BENCHMARK_APPS) benchmark_apps: $(BENCHMARK_APPS) @for APP in $(BENCHMARK_APPS); do \ echo ;\ echo Benchmarking $${APP} for ${HL_TARGET}... ; \ make -C $(ROOT_DIR)/apps/$${APP} \ $${APP}.benchmark \ HALIDE_DISTRIB_PATH=$(CURDIR)/$(DISTRIB_DIR) \ BIN_DIR=$(CURDIR)/$(BIN_DIR)/apps/$${APP}/bin \ HL_TARGET=$(HL_TARGET) \ || exit 1 ; \ done # It's just for compiling the runtime, so earlier clangs *might* work, # but best to peg it to the minimum llvm version. 
ifneq (,$(findstring clang version 3.7,$(CLANG_VERSION))) CLANG_OK=yes endif ifneq (,$(findstring clang version 3.8,$(CLANG_VERSION))) CLANG_OK=yes endif ifneq (,$(findstring clang version 4.0,$(CLANG_VERSION))) CLANG_OK=yes endif ifneq (,$(findstring clang version 5.0,$(CLANG_VERSION))) CLANG_OK=yes endif ifneq (,$(findstring clang version 6.0,$(CLANG_VERSION))) CLANG_OK=yes endif ifneq (,$(findstring clang version 7.0,$(CLANG_VERSION))) CLANG_OK=yes endif ifneq (,$(findstring clang version 7.1,$(CLANG_VERSION))) CLANG_OK=yes endif ifneq (,$(findstring clang version 8.0,$(CLANG_VERSION))) CLANG_OK=yes endif ifneq (,$(findstring clang version 9.0,$(CLANG_VERSION))) CLANG_OK=yes endif ifneq (,$(findstring clang version 10.0,$(CLANG_VERSION))) CLANG_OK=yes endif ifneq (,$(findstring clang version 11.0,$(CLANG_VERSION))) CLANG_OK=yes endif ifneq (,$(findstring clang version 11.1,$(CLANG_VERSION))) CLANG_OK=yes endif ifneq (,$(findstring clang version 12.0,$(CLANG_VERSION))) CLANG_OK=yes endif ifneq (,$(findstring clang version 13.0,$(CLANG_VERSION))) CLANG_OK=yes endif ifneq (,$(findstring clang version 14.0,$(CLANG_VERSION))) CLANG_OK=yes endif ifneq (,$(findstring clang version 15.0,$(CLANG_VERSION))) CLANG_OK=yes endif ifneq (,$(findstring clang version 16.0,$(CLANG_VERSION))) CLANG_OK=yes endif ifneq (,$(findstring clang version 17.0,$(CLANG_VERSION))) CLANG_OK=yes endif ifneq (,$(findstring clang version 18.0,$(CLANG_VERSION))) CLANG_OK=yes endif ifneq (,$(findstring Apple LLVM version 5.0,$(CLANG_VERSION))) CLANG_OK=yes endif ifneq ($(CLANG_OK), ) $(BUILD_DIR)/clang_ok: @echo "Found a new enough version of clang" mkdir -p $(BUILD_DIR) touch $(BUILD_DIR)/clang_ok else $(BUILD_DIR)/clang_ok: @echo "Can't find clang or version of clang too old (we need 3.7 or greater):" @echo "You can override this check by setting CLANG_OK=y" echo '$(CLANG_VERSION)' echo $(findstring version 3,$(CLANG_VERSION)) echo $(findstring version 3.0,$(CLANG_VERSION)) $(CLANG) --version @exit 1 endif ifneq (,$(findstring $(LLVM_VERSION_TIMES_10), 160 170 180)) LLVM_OK=yes endif ifneq ($(LLVM_OK), ) $(BUILD_DIR)/llvm_ok: $(BUILD_DIR)/rtti_ok @echo "Found a new enough version of llvm" mkdir -p $(BUILD_DIR) touch $(BUILD_DIR)/llvm_ok else $(BUILD_DIR)/llvm_ok: @echo "Can't find llvm or version of llvm too old (we need 9.0 or greater):" @echo "You can override this check by setting LLVM_OK=y" $(LLVM_CONFIG) --version @exit 1 endif ifneq ($(WITH_RTTI), ) ifneq ($(LLVM_HAS_NO_RTTI), ) else RTTI_OK=yes # Enabled in Halide and LLVM endif else RTTI_OK=yes # Enabled in LLVM but not in Halide endif ifneq ($(RTTI_OK), ) $(BUILD_DIR)/rtti_ok: mkdir -p $(BUILD_DIR) touch $(BUILD_DIR)/rtti_ok else $(BUILD_DIR)/rtti_ok: @echo "Can't enable RTTI - llvm was compiled without it." 
@echo "LLVM c++ flags: " $(LLVM_CXX_FLAGS) @exit 1 endif install: $(LIB_DIR)/libHalide.a $(BIN_DIR)/libHalide.$(SHARED_EXT) $(INCLUDE_DIR)/Halide.h $(RUNTIME_EXPORTED_INCLUDES) mkdir -p $(PREFIX)/include $(PREFIX)/bin $(PREFIX)/lib $(PREFIX)/share/halide/tutorial/images $(PREFIX)/share/halide/tools $(PREFIX)/share/halide/tutorial/figures cp $(LIB_DIR)/libHalide.a $(BIN_DIR)/libHalide.$(SHARED_EXT) $(PREFIX)/lib cp $(INCLUDE_DIR)/Halide.h $(PREFIX)/include cp $(INCLUDE_DIR)/HalideBuffer.h $(PREFIX)/include cp $(INCLUDE_DIR)/HalideRuntim*.h $(PREFIX)/include cp $(ROOT_DIR)/tutorial/images/*.png $(PREFIX)/share/halide/tutorial/images cp $(ROOT_DIR)/tutorial/figures/*.gif $(PREFIX)/share/halide/tutorial/figures cp $(ROOT_DIR)/tutorial/figures/*.jpg $(PREFIX)/share/halide/tutorial/figures cp $(ROOT_DIR)/tutorial/figures/*.mp4 $(PREFIX)/share/halide/tutorial/figures cp $(ROOT_DIR)/tutorial/*.cpp $(PREFIX)/share/halide/tutorial cp $(ROOT_DIR)/tutorial/*.h $(PREFIX)/share/halide/tutorial cp $(ROOT_DIR)/tutorial/*.sh $(PREFIX)/share/halide/tutorial cp $(ROOT_DIR)/tools/GenGen.cpp $(PREFIX)/share/halide/tools cp $(ROOT_DIR)/tools/RunGen.h $(PREFIX)/share/halide/tools cp $(ROOT_DIR)/tools/RunGenMain.cpp $(PREFIX)/share/halide/tools cp $(ROOT_DIR)/tools/halide_image.h $(PREFIX)/share/halide/tools cp $(ROOT_DIR)/tools/halide_image_io.h $(PREFIX)/share/halide/tools cp $(ROOT_DIR)/tools/halide_image_info.h $(PREFIX)/share/halide/tools cp $(ROOT_DIR)/tools/halide_malloc_trace.h $(PREFIX)/share/halide/tools cp $(ROOT_DIR)/tools/halide_thread_pool.h $(PREFIX)/share/halide/tools ifeq ($(UNAME), Darwin) install_name_tool -id $(PREFIX)/lib/libHalide.$(SHARED_EXT) $(PREFIX)/lib/libHalide.$(SHARED_EXT) endif # This is a specialized 'install' for users who need Hexagon support libraries as well. install_qc: install $(HEXAGON_RUNTIME_LIBS) mkdir -p $(PREFIX)/bin $(PREFIX)/tools $(PREFIX)/support mkdir -p $(PREFIX)/lib/arm-32-android mkdir -p $(PREFIX)/lib/arm-64-android mkdir -p $(PREFIX)/lib/adsp/arm-32-android mkdir -p $(PREFIX)/lib/adsp/arm-64-android mkdir -p $(PREFIX)/lib/cdsp/arm-32-android mkdir -p $(PREFIX)/lib/cdsp/arm-64-android mkdir -p $(PREFIX)/lib/host mkdir -p $(PREFIX)/lib/v65 cp $(HEXAGON_RUNTIME_LIBS_DIR)/arm-32-android/* $(PREFIX)/lib/arm-32-android cp $(HEXAGON_RUNTIME_LIBS_DIR)/arm-64-android/* $(PREFIX)/lib/arm-64-android cp $(HEXAGON_RUNTIME_LIBS_DIR)/cdsp/arm-32-android/* $(PREFIX)/lib/cdsp/arm-32-android cp $(HEXAGON_RUNTIME_LIBS_DIR)/cdsp/arm-64-android/* $(PREFIX)/lib/cdsp/arm-64-android cp $(HEXAGON_RUNTIME_LIBS_DIR)/adsp/arm-32-android/* $(PREFIX)/lib/adsp/arm-32-android cp $(HEXAGON_RUNTIME_LIBS_DIR)/adsp/arm-64-android/* $(PREFIX)/lib/adsp/arm-64-android cp $(HEXAGON_RUNTIME_LIBS_DIR)/host/* $(PREFIX)/lib/host cp -r $(HEXAGON_RUNTIME_LIBS_DIR)/v65/* $(PREFIX)/lib/v65 ln -sf ../share/halide/tools/GenGen.cpp $(PREFIX)/tools/GenGen.cpp ln -sf ../lib/v65/hexagon_sim_remote $(PREFIX)/bin/hexagon_sim_remote ln -sf v65/libsim_qurt.a $(PREFIX)/lib/libsim_qurt.a ln -sf v65/libhalide_hexagon_remote_skel.so $(PREFIX)/lib/libhalide_hexagon_remote_skel.so # We need to capture the system libraries that we'll need to link # against, so that downstream consumers of our build rules don't # have to guess what's necessary on their system; call # llvm-config and capture the result in config files that # we include in our distribution. 
HALIDE_RTTI_RAW=$(if $(WITH_RTTI),1,0) $(BUILD_DIR)/halide_config.%: $(ROOT_DIR)/tools/halide_config.%.tpl @mkdir -p $(@D) cat $< | sed -e 's/@HALIDE_SYSTEM_LIBS_RAW@/${LLVM_SYSTEM_LIBS}/g' \ | sed -e 's/@HALIDE_RTTI_RAW@/${HALIDE_RTTI_RAW}/g' \ | sed -e 's;@HALIDE_LLVM_CXX_FLAGS_RAW@;${LLVM_CXX_FLAGS};g' > $@ $(DISTRIB_DIR)/lib/libHalide.$(SHARED_EXT): \ $(LIB_DIR)/libHalide.a \ $(BIN_DIR)/libHalide.$(SHARED_EXT) \ $(INCLUDE_DIR)/Halide.h \ $(RUNTIME_EXPORTED_INCLUDES) \ $(ROOT_DIR)/README*.md \ $(BUILD_DIR)/halide_config.make rm -rf $(DISTRIB_DIR) mkdir -p $(DISTRIB_DIR)/include \ $(DISTRIB_DIR)/bin \ $(DISTRIB_DIR)/lib \ $(DISTRIB_DIR)/tutorial \ $(DISTRIB_DIR)/tutorial/images \ $(DISTRIB_DIR)/tools \ $(DISTRIB_DIR)/tutorial/figures cp $(BIN_DIR)/libHalide.$(SHARED_EXT) $(DISTRIB_DIR)/lib cp $(LIB_DIR)/libHalide.a $(DISTRIB_DIR)/lib cp $(INCLUDE_DIR)/Halide.h $(DISTRIB_DIR)/include cp $(INCLUDE_DIR)/HalideBuffer.h $(DISTRIB_DIR)/include cp $(INCLUDE_DIR)/HalideRuntim*.h $(DISTRIB_DIR)/include cp $(INCLUDE_DIR)/HalidePyTorch*.h $(DISTRIB_DIR)/include cp $(ROOT_DIR)/tutorial/images/*.png $(DISTRIB_DIR)/tutorial/images cp $(ROOT_DIR)/tutorial/figures/*.gif $(DISTRIB_DIR)/tutorial/figures cp $(ROOT_DIR)/tutorial/figures/*.jpg $(DISTRIB_DIR)/tutorial/figures cp $(ROOT_DIR)/tutorial/figures/*.mp4 $(DISTRIB_DIR)/tutorial/figures cp $(ROOT_DIR)/tutorial/*.cpp $(DISTRIB_DIR)/tutorial cp $(ROOT_DIR)/tutorial/*.h $(DISTRIB_DIR)/tutorial cp $(ROOT_DIR)/tutorial/*.sh $(DISTRIB_DIR)/tutorial cp $(ROOT_DIR)/tools/GenGen.cpp $(DISTRIB_DIR)/tools cp $(ROOT_DIR)/tools/RunGen.h $(DISTRIB_DIR)/tools cp $(ROOT_DIR)/tools/RunGenMain.cpp $(DISTRIB_DIR)/tools cp $(ROOT_DIR)/tools/halide_benchmark.h $(DISTRIB_DIR)/tools cp $(ROOT_DIR)/tools/halide_image.h $(DISTRIB_DIR)/tools cp $(ROOT_DIR)/tools/halide_image_io.h $(DISTRIB_DIR)/tools cp $(ROOT_DIR)/tools/halide_image_info.h $(DISTRIB_DIR)/tools cp $(ROOT_DIR)/tools/halide_malloc_trace.h $(DISTRIB_DIR)/tools cp $(ROOT_DIR)/tools/halide_thread_pool.h $(DISTRIB_DIR)/tools cp $(ROOT_DIR)/tools/halide_trace_config.h $(DISTRIB_DIR)/tools cp $(ROOT_DIR)/README*.md $(DISTRIB_DIR) cp $(BUILD_DIR)/halide_config.* $(DISTRIB_DIR) ifeq ($(UNAME), Darwin) install_name_tool -id @rpath/libHalide.$(SHARED_EXT) $(DISTRIB_DIR)/lib/libHalide.$(SHARED_EXT) endif $(BIN_DIR)/libautoschedule_%.$(PLUGIN_EXT): $(DISTRIB_DIR)/lib/libHalide.$(SHARED_EXT) $(MAKE) -f $(SRC_DIR)/autoschedulers/$*/Makefile $@ HALIDE_DISTRIB_PATH=$(CURDIR)/$(DISTRIB_DIR) ifeq ($(UNAME), Darwin) install_name_tool -id @rpath/$(@F) $(CURDIR)/$@ endif $(DISTRIB_DIR)/lib/libautoschedule_%.$(PLUGIN_EXT): $(BIN_DIR)/libautoschedule_%.$(PLUGIN_EXT) @mkdir -p $(@D) cp $< $(DISTRIB_DIR)/lib ifeq ($(UNAME), Darwin) install_name_tool -id @rpath/$(@F) $(CURDIR)/$@ endif # Build some common tools $(DISTRIB_DIR)/bin/featurization_to_sample $(DISTRIB_DIR)/bin/get_host_target: $(DISTRIB_DIR)/lib/libHalide.$(SHARED_EXT) @mkdir -p $(@D) $(MAKE) -f $(SRC_DIR)/autoschedulers/common/Makefile $(BIN_DIR)/featurization_to_sample $(BIN_DIR)/get_host_target HALIDE_DISTRIB_PATH=$(CURDIR)/$(DISTRIB_DIR) for TOOL in featurization_to_sample get_host_target; do \ cp $(BIN_DIR)/$${TOOL} $(DISTRIB_DIR)/bin/; \ done # Adams2019 also includes autotuning tools $(DISTRIB_DIR)/lib/libautoschedule_adams2019.$(PLUGIN_EXT): $(BIN_DIR)/libautoschedule_adams2019.$(PLUGIN_EXT) @mkdir -p $(@D) $(MAKE) -f $(SRC_DIR)/autoschedulers/adams2019/Makefile $(BIN_DIR)/adams2019_retrain_cost_model $(BIN_DIR)/adams2019_weightsdir_to_weightsfile 
HALIDE_DISTRIB_PATH=$(CURDIR)/$(DISTRIB_DIR) cp $< $(DISTRIB_DIR)/lib/ for TOOL in adams2019_retrain_cost_model adams2019_weightsdir_to_weightsfile; do \ cp $(BIN_DIR)/$${TOOL} $(DISTRIB_DIR)/bin/; \ done cp $(SRC_DIR)/autoschedulers/adams2019/adams2019_autotune_loop.sh $(DISTRIB_DIR)/tools/ ifeq ($(UNAME), Darwin) install_name_tool -id @rpath/$(@F) $(CURDIR)/$@ endif autoschedulers: \ $(DISTRIB_DIR)/lib/libautoschedule_mullapudi2016.$(PLUGIN_EXT) \ $(DISTRIB_DIR)/lib/libautoschedule_li2018.$(PLUGIN_EXT) \ $(DISTRIB_DIR)/lib/libautoschedule_adams2019.$(PLUGIN_EXT) \ $(DISTRIB_DIR)/bin/featurization_to_sample \ $(DISTRIB_DIR)/bin/get_host_target .PHONY: distrib distrib: $(DISTRIB_DIR)/lib/libHalide.$(SHARED_EXT) autoschedulers $(DISTRIB_DIR)/halide.tgz: distrib ln -sf $(DISTRIB_DIR) halide tar -czf $(BUILD_DIR)/halide.tgz \ halide/bin \ halide/lib \ halide/include \ halide/tools \ halide/tutorial \ halide/README*.md \ halide/halide_config.* rm -rf halide mv $(BUILD_DIR)/halide.tgz $(DISTRIB_DIR)/halide.tgz $(BIN_DIR)/HalideTraceViz: $(ROOT_DIR)/util/HalideTraceViz.cpp $(INCLUDE_DIR)/HalideRuntime.h $(ROOT_DIR)/tools/halide_image_io.h $(ROOT_DIR)/tools/halide_trace_config.h $(CXX) $(OPTIMIZE) -std=c++17 $(filter %.cpp,$^) -I$(INCLUDE_DIR) -I$(ROOT_DIR)/tools -L$(BIN_DIR) -o $@ $(BIN_DIR)/HalideTraceDump: $(ROOT_DIR)/util/HalideTraceDump.cpp $(ROOT_DIR)/util/HalideTraceUtils.cpp $(INCLUDE_DIR)/HalideRuntime.h $(ROOT_DIR)/tools/halide_image_io.h $(CXX) $(OPTIMIZE) -std=c++17 $(filter %.cpp,$^) -I$(INCLUDE_DIR) -I$(ROOT_DIR)/tools -I$(ROOT_DIR)/src/runtime -L$(BIN_DIR) $(IMAGE_IO_CXX_FLAGS) $(IMAGE_IO_LIBS) -o $@ # Note: you must have CLANG_FORMAT_LLVM_INSTALL_DIR set for this rule to work. # Let's default to the Ubuntu install location. CLANG_FORMAT_LLVM_INSTALL_DIR ?= /usr/lib/llvm-12 .PHONY: format format: @CLANG_FORMAT_LLVM_INSTALL_DIR=$(CLANG_FORMAT_LLVM_INSTALL_DIR) ${ROOT_DIR}/run-clang-format.sh # Note: you must have CLANG_TIDY_LLVM_INSTALL_DIR set for these rules to work. # Let's default to the Ubuntu install location. CLANG_TIDY_LLVM_INSTALL_DIR ?= /usr/lib/llvm-12 .PHONY: clang-tidy clang-tidy: @CLANG_TIDY_LLVM_INSTALL_DIR=$(CLANG_TIDY_LLVM_INSTALL_DIR) ${ROOT_DIR}/run-clang-tidy.sh .PHONY: clang-tidy-fix clang-tidy-fix: @CLANG_TIDY_LLVM_INSTALL_DIR=$(CLANG_TIDY_LLVM_INSTALL_DIR) ${ROOT_DIR}/run-clang-tidy.sh -fix # Build the documentation. Be sure to keep this synchronized with doc/CMakeLists.txt # if you choose to edit it. # Copy ROOT_DIR to keep the following Doxyfile closer to CMake Halide_SOURCE_DIR=${ROOT_DIR} define Doxyfile # Keep the following in sync with doc/CMakeLists.txt ALPHABETICAL_INDEX = NO BUILTIN_STL_SUPPORT = YES CASE_SENSE_NAMES = NO CLASS_DIAGRAMS = NO DISTRIBUTE_GROUP_DOC = YES EXAMPLE_PATH = "${Halide_SOURCE_DIR}/tutorial" EXCLUDE = bin EXTRACT_ALL = YES EXTRACT_LOCAL_CLASSES = NO FILE_PATTERNS = *.h GENERATE_TREEVIEW = YES HIDE_FRIEND_COMPOUNDS = YES HIDE_IN_BODY_DOCS = YES HIDE_UNDOC_CLASSES = YES HIDE_UNDOC_MEMBERS = YES JAVADOC_AUTOBRIEF = YES QT_AUTOBRIEF = YES QUIET = YES RECURSIVE = YES REFERENCED_BY_RELATION = YES REFERENCES_RELATION = YES SORT_BY_SCOPE_NAME = YES SORT_MEMBER_DOCS = NO SOURCE_BROWSER = YES STRIP_CODE_COMMENTS = NO # Makefile-specific options GENERATE_LATEX = NO HAVE_DOT = NO HTML_OUTPUT = . INPUT = "${Halide_SOURCE_DIR}/src" "${Halide_SOURCE_DIR}/test" OUTPUT_DIRECTORY = ${DOC_DIR} PROJECT_NAME = Halide endef # Make the above Doxyfile variable available to the doc target. 
export Doxyfile .PHONY: doc doc: @-mkdir -p $(TMP_DIR) echo "$$Doxyfile" > $(TMP_DIR)/Doxyfile @-mkdir -p ${DOC_DIR} doxygen $(TMP_DIR)/Doxyfile Halide-17.0.1/README.md000066400000000000000000000410401456515664200142620ustar00rootroot00000000000000# Halide Halide is a programming language designed to make it easier to write high-performance image and array processing code on modern machines. Halide currently targets: - CPU architectures: X86, ARM, Hexagon, PowerPC, RISC-V - Operating systems: Linux, Windows, macOS, Android, iOS, Qualcomm QuRT - GPU Compute APIs: CUDA, OpenCL, OpenGL Compute Shaders, Apple Metal, Microsoft Direct X 12, Vulkan Rather than being a standalone programming language, Halide is embedded in C++. This means you write C++ code that builds an in-memory representation of a Halide pipeline using Halide's C++ API. You can then compile this representation to an object file, or JIT-compile it and run it in the same process. Halide also provides a Python binding that provides full support for writing Halide embedded in Python without C++. Halide requires C++17 (or later) to use. For more detail about what Halide is, see http://halide-lang.org. For API documentation see http://halide-lang.org/docs To see some example code, look in the tutorials directory. If you've acquired a full source distribution and want to build Halide, see the [notes below](#building-halide-with-cmake). # Getting Halide ## Binary tarballs The latest version of Halide can always be found on GitHub at https://github.com/halide/Halide/releases We provide binary releases for many popular platforms and architectures, including 32/64-bit x86 Windows, 64-bit macOS, and 32/64-bit x86/ARM Ubuntu Linux. ## Vcpkg If you use [vcpkg](https://github.com/microsoft/vcpkg) to manage dependencies, you can install Halide via: ``` $ vcpkg install halide:x64-windows # or x64-linux/x64-osx ``` One caveat: vcpkg installs only the minimum Halide backends required to compile code for the active platform. If you want to include all the backends, you should install `halide[target-all]:x64-windows` instead. Note that since this will build LLVM, it will take a _lot_ of disk space (up to 100GB). ## Homebrew Alternatively, if you use macOS, you can install Halide via [Homebrew](https://brew.sh/) like so: ``` $ brew install halide ``` ## Other package managers We are interested in bringing Halide to other popular package managers and Linux distribution repositories including, but not limited to, Conan, Debian, [Ubuntu (or PPA)](https://github.com/halide/Halide/issues/5285), CentOS/Fedora, and Arch. If you have experience publishing packages we would be happy to work with you! If you are a maintainer of any other package distribution platform, we would be excited to work with you, too. # Platform Support There are two sets of platform requirements relevant to Halide: those required to run the compiler library in either JIT or AOT mode, and those required to run the _binary outputs_ of the AOT compiler. These are the **tested** host toolchain and platform combinations for building and running the Halide compiler library. 
| Compiler   | Version      | OS                     | Architectures   |
|------------|--------------|------------------------|-----------------|
| GCC        | 9.4          | Ubuntu Linux 20.04 LTS | x86, x64, ARM32 |
| GCC        | 9.4          | Ubuntu Linux 18.04 LTS | ARM32, ARM64    |
| MSVC       | 2019 (19.28) | Windows 10 (20H2)      | x86, x64        |
| AppleClang | 14.0.3       | macOS 13.4             | x86_64          |
| AppleClang | 14.0.3       | macOS 13.4             | ARM64           |

Some users have successfully built Halide for Linux using Clang 9.0.0+, for Windows using ClangCL 11.0.0+, and for Windows ARM64 by cross-compiling with MSVC. We do not actively test these scenarios, however, so your mileage may vary.

Beyond these, we are willing to support (by accepting PRs for) platform and toolchain combinations that still receive _active, first-party, public support_ from their original vendors. For instance, at time of writing, this excludes Windows 7 and includes Ubuntu 18.04 LTS.

Compiled AOT pipelines are expected to have much broader platform support. The binaries use the C ABI, and we expect any compliant C compiler to be able to use the generated headers correctly. The C++ bindings currently require C++17. If you discover a compatibility problem with a generated pipeline, please open an issue.

# Building Halide with Make

### TL;DR

Have llvm-16.0 (or greater) installed and run `make` in the root directory of the repository (where this README is).

### Acquiring LLVM

At any point in time, building Halide requires either the latest stable version of LLVM, the previous stable version of LLVM, or trunk. At the time of writing, this means versions 18, 17, and 16 are supported, but 15 is not. The commands `llvm-config` and `clang` must be somewhere in the path.

If your OS does not have packages for LLVM, you can find binaries for it at http://llvm.org/releases/download.html. Download an appropriate package and then either install it, or at least put the `bin` subdirectory in your path. (This works well on OS X and Ubuntu.)

If you want to build it yourself, first check it out from GitHub:

```
% git clone --depth 1 --branch llvmorg-16.0.6 https://github.com/llvm/llvm-project.git
```

(If you want to build LLVM 17.x, use branch `release/17.x`; for current trunk, use `main`)

Then build it like so:

```
% cmake -DCMAKE_BUILD_TYPE=Release \
        -DLLVM_ENABLE_PROJECTS="clang;lld;clang-tools-extra" \
        -DLLVM_TARGETS_TO_BUILD="X86;ARM;NVPTX;AArch64;Hexagon;WebAssembly;RISCV" \
        -DLLVM_ENABLE_TERMINFO=OFF -DLLVM_ENABLE_ASSERTIONS=ON \
        -DLLVM_ENABLE_EH=ON -DLLVM_ENABLE_RTTI=ON -DLLVM_BUILD_32_BITS=OFF \
        -DLLVM_ENABLE_RUNTIMES="compiler-rt" \
        -S llvm-project/llvm -B llvm-build
% cmake --build llvm-build
% cmake --install llvm-build --prefix llvm-install
```

Running a serial build will be slow. To improve speed, try running a parallel build. That's done by default in Ninja; for make, use the option -j NNN, where NNN is the number of parallel jobs, e.g. the number of CPUs you have.

Then, point Halide to it:

```
% export LLVM_ROOT=$PWD/llvm-install
% export LLVM_CONFIG=$LLVM_ROOT/bin/llvm-config
```

Note that you _must_ add `clang` to `LLVM_ENABLE_PROJECTS`; adding `lld` to `LLVM_ENABLE_PROJECTS` is only required when using WebAssembly, `LLVM_ENABLE_RUNTIMES="compiler-rt"` is only required if building the fuzz tests, and adding `clang-tools-extra` is only necessary if you plan to contribute code to Halide (so that you can run `clang-tidy` on your pull requests). We recommend enabling all of them in all cases to simplify builds. You can disable exception handling (EH) and RTTI if you don't want the Python bindings.
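Before running `make`, it can be worth a quick sanity check that the LLVM Halide will pick up is the one you intend. For example (a minimal check, assuming the `llvm-install` layout created above):

```
% $LLVM_CONFIG --version
% $LLVM_CONFIG --prefix
% $LLVM_ROOT/bin/clang --version
```

If these report an unexpected version or location, double-check that `LLVM_ROOT` and `LLVM_CONFIG` are exported in the shell you will run `make` from.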
### Building Halide with make With `LLVM_CONFIG` set (or `llvm-config` in your path), you should be able to just run `make` in the root directory of the Halide source tree. `make run_tests` will run the JIT test suite, and `make test_apps` will make sure all the apps compile and run (but won't check their output). There is no `make install`. If you want to make an install package, use CMake. ### Building Halide out-of-tree with make If you wish to build Halide in a separate directory, you can do that like so: % cd .. % mkdir halide_build % cd halide_build % make -f ../Halide/Makefile # Building Halide with CMake ### MacOS and Linux Follow the above instructions to build LLVM or acquire a suitable binary release. Then change directory to the Halide repository and run: ``` % cmake -G Ninja -DCMAKE_BUILD_TYPE=Release -DLLVM_DIR=$LLVM_ROOT/lib/cmake/llvm -S . -B build % cmake --build build ``` `LLVM_DIR` is the folder in the LLVM installation tree **(do not use the build tree by mistake)** that contains `LLVMConfig.cmake`. It is not required to set this variable if you have a suitable system-wide version installed. If you have multiple system-wide versions installed, you can specify the version with `Halide_REQUIRE_LLVM_VERSION`. Remove `-G Ninja` if you prefer to build with a different generator. ### Windows We suggest building with Visual Studio 2019. Your mileage may vary with earlier versions. Be sure to install the "C++ CMake tools for Windows" in the Visual Studio installer. For older versions of Visual Studio, do not install the CMake tools, but instead acquire CMake and Ninja from their respective project websites. These instructions start from the `D:` drive. We assume this git repo is cloned to `D:\Halide`. We also assume that your shell environment is set up correctly. For a 64-bit build, run: ``` D:\> "C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\VC\Auxiliary\Build\vcvarsall.bat" x64 ``` For a 32-bit build, run: ``` D:\> "C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\VC\Auxiliary\Build\vcvarsall.bat" x64_x86 ``` #### Managing dependencies with vcpkg The best way to get compatible dependencies on Windows is to use [vcpkg](https://github.com/Microsoft/vcpkg). Install it like so: ``` D:\> git clone https://github.com/Microsoft/vcpkg.git D:\> cd vcpkg D:\> .\bootstrap-vcpkg.bat D:\vcpkg> .\vcpkg integrate install ... CMake projects should use: "-DCMAKE_TOOLCHAIN_FILE=D:/vcpkg/scripts/buildsystems/vcpkg.cmake" ``` Then install the libraries. For a 64-bit build, run: ``` D:\vcpkg> .\vcpkg install libpng:x64-windows libjpeg-turbo:x64-windows llvm[target-all,clang-tools-extra]:x64-windows ``` To support 32-bit builds, also run: ``` D:\vcpkg> .\vcpkg install libpng:x86-windows libjpeg-turbo:x86-windows llvm[target-all,clang-tools-extra]:x86-windows ``` #### Building Halide Create a separate build tree and call CMake with vcpkg's toolchain. This will build in either 32-bit or 64-bit depending on the environment script (`vcvars`) that was run earlier. ``` D:\Halide> cmake -G Ninja ^ -DCMAKE_BUILD_TYPE=Release ^ -DCMAKE_TOOLCHAIN_FILE=D:/vcpkg/scripts/buildsystems/vcpkg.cmake ^ -S . -B build ``` **Note:** If building with Python bindings on 32-bit (enabled by default), be sure to point CMake to the installation path of a 32-bit Python 3. You can do this by specifying, for example: `"-DPython3_ROOT_DIR=C:\Program Files (x86)\Python38-32"`. 
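For instance, a complete 32-bit configure command with that hint added might look like the following (the Python path here is illustrative; substitute your own installation):

```
D:\Halide> cmake -G Ninja ^
                 -DCMAKE_BUILD_TYPE=Release ^
                 -DCMAKE_TOOLCHAIN_FILE=D:/vcpkg/scripts/buildsystems/vcpkg.cmake ^
                 "-DPython3_ROOT_DIR=C:\Program Files (x86)\Python38-32" ^
                 -S . -B build
```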
Then run the build with: ``` D:\Halide> cmake --build build --config Release ``` To run all the tests: ``` D:\Halide> cd build D:\Halide\build> ctest -C Release ``` Subsets of the tests can be selected with `-L` and include `correctness`, `python`, `error`, and the other directory names under `/tests`. #### Building LLVM (optional) Follow these steps if you want to build LLVM yourself. First, download LLVM's sources (these instructions use the latest 17.0 release) ``` D:\> git clone --depth 1 --branch release/17.x https://github.com/llvm/llvm-project.git ``` For a 64-bit build, run: ``` D:\> cmake -G Ninja ^ -DCMAKE_BUILD_TYPE=Release ^ -DLLVM_ENABLE_PROJECTS=clang;lld;clang-tools-extra ^ -DLLVM_ENABLE_TERMINFO=OFF ^ -DLLVM_TARGETS_TO_BUILD=X86;ARM;NVPTX;AArch64;Hexagon;RISCV ^ -DLLVM_ENABLE_ASSERTIONS=ON ^ -DLLVM_ENABLE_EH=ON ^ -DLLVM_ENABLE_RTTI=ON ^ -DLLVM_BUILD_32_BITS=OFF ^ -S llvm-project\llvm -B llvm-build ``` For a 32-bit build, run: ``` D:\> cmake -G Ninja ^ -DCMAKE_BUILD_TYPE=Release ^ -DLLVM_ENABLE_PROJECTS=clang;lld;clang-tools-extra ^ -DLLVM_ENABLE_TERMINFO=OFF ^ -DLLVM_TARGETS_TO_BUILD=X86;ARM;NVPTX;AArch64;Hexagon;RISCV ^ -DLLVM_ENABLE_ASSERTIONS=ON ^ -DLLVM_ENABLE_EH=ON ^ -DLLVM_ENABLE_RTTI=ON ^ -DLLVM_BUILD_32_BITS=ON ^ -S llvm-project\llvm -B llvm32-build ``` Finally, run: ``` D:\> cmake --build llvm-build --config Release D:\> cmake --install llvm-build --prefix llvm-install ``` You can substitute `Debug` for `Release` in the above `cmake` commands if you want a debug build. Make sure to add `-DLLVM_DIR=D:/llvm-install/lib/cmake/llvm` to the Halide CMake command to override `vcpkg`'s LLVM. **MSBuild:** If you want to build LLVM with MSBuild instead of Ninja, use `-G "Visual Studio 16 2019" -Thost=x64 -A x64` or `-G "Visual Studio 16 2019" -Thost=x64 -A Win32` in place of `-G Ninja`. #### If all else fails... Do what the build-bots do: https://buildbot.halide-lang.org/master/#/builders If the column that best matches your system is red, then maybe things aren't just broken for you. If it's green, then you can click the "stdio" links in the latest build to see what commands the build bots run, and what the output was. # Some useful environment variables `HL_TARGET=...` will set Halide's AOT compilation target. `HL_JIT_TARGET=...` will set Halide's JIT compilation target. `HL_DEBUG_CODEGEN=1` will print out pseudocode for what Halide is compiling. Higher numbers will print more detail. `HL_NUM_THREADS=...` specifies the number of threads to create for the thread pool. When the async scheduling directive is used, more threads than this number may be required and thus allocated. A maximum of 256 threads is allowed. (By default, the number of cores on the host is used.) `HL_TRACE_FILE=...` specifies a binary target file to dump tracing data into (ignored unless at least one `trace_` feature is enabled in `HL_TARGET` or `HL_JIT_TARGET`). The output can be parsed programmatically by starting from the code in `utils/HalideTraceViz.cpp`. # Using Halide on OSX Precompiled Halide distributions are built using XCode's command-line tools with Apple clang 500.2.76. This means that we link against libc++ instead of libstdc++. You may need to adjust compiler options accordingly if you're using an older XCode which does not default to libc++. # Halide for Hexagon HVX Halide supports offloading work to Qualcomm Hexagon DSP on Qualcomm Snapdragon 845/710 devices or newer. The Hexagon DSP provides a set of 128 byte vector instruction extensions - the Hexagon Vector eXtensions (HVX). 
HVX is well suited for image processing, and Halide for Hexagon HVX will generate the appropriate HVX vector instructions from a program authored in Halide. Halide can be used to compile Hexagon object files directly, by using a target such as `hexagon-32-qurt-hvx`. Halide can also be used to offload parts of a pipeline to Hexagon using the `hexagon` scheduling directive. To enable the `hexagon` scheduling directive, include the `hvx` target feature in your target. The currently supported combination of targets is to use the HVX target features with an x86 linux host (to use the simulator) or with an ARM android target (to use Hexagon DSP hardware). For examples of using the `hexagon` scheduling directive on both the simulator and a Hexagon DSP, see the blur example app. To build and run an example app using the Hexagon target, 1. Obtain and build trunk LLVM and Clang. (Earlier versions of LLVM may work but are not actively tested and thus not recommended.) 2. Download and install the Hexagon SDK and Hexagon Tools. Hexagon SDK 4.3.0 or later is needed. Hexagon Tools 8.4 or later is needed. 3. Build and run an example for Hexagon HVX ### 1. Obtain and build trunk LLVM and Clang (Follow the instructions given previously, just be sure to check out the `main` branch.) ### 2. Download and install the Hexagon SDK and Hexagon Tools Go to https://developer.qualcomm.com/software/hexagon-dsp-sdk/tools 1. Select the Hexagon Series 600 Software and download & run QPM and install the Hexagon SDK 4.3.0 version or later for Linux. 2. untar the installer 3. Run the extracted installer to install the Hexagon SDK and Hexagon Tools, selecting Installation of Hexagon SDK into `/location/of/SDK/Hexagon_SDK/4.x` and the Hexagon tools into `/location/of/SDK/Hexagon_Tools/8.x` 4. Set an environment variable to point to the SDK installation location ``` export SDK_LOC=/location/of/SDK ``` ### 3. Build and run an example for Hexagon HVX In addition to running Hexagon code on device, Halide also supports running Hexagon code on the simulator from the Hexagon tools. To build and run the blur example in Halide/apps/blur on the simulator: ``` cd apps/blur export HL_HEXAGON_SIM_REMOTE=../../src/runtime/hexagon_remote/bin/v65/hexagon_sim_remote export HL_HEXAGON_TOOLS=$SDK_LOC/Hexagon_Tools/8.x/Tools/ LD_LIBRARY_PATH=../../src/runtime/hexagon_remote/bin/host/:$HL_HEXAGON_TOOLS/lib/iss/:. HL_TARGET=host-hvx make test ``` ### To build and run the blur example in Halide/apps/blur on Android: To build the example for Android, first ensure that you have Android NDK r19b or later installed, and the ANDROID_NDK_ROOT environment variable points to it. (Note that Qualcomm Hexagon SDK v4.3.0 includes Android NDK r19c, which is fine.) Now build and run the blur example using the script to run it on device: ``` export HL_HEXAGON_TOOLS=$SDK_LOC/HEXAGON_Tools/8.4.11/Tools/ HL_TARGET=arm-64-android-hvx ./adb_run_on_device.sh ``` Halide-17.0.1/README_cmake.md000066400000000000000000002336161456515664200154360ustar00rootroot00000000000000# Halide and CMake This is a comprehensive guide to the three main usage stories of the Halide CMake build. 1. Compiling or packaging Halide from source. 2. Building Halide programs using the official CMake package. 3. Contributing to Halide and updating the build files. The following sections cover each in detail. 
## Table of Contents - [Halide and CMake](#halide-and-cmake) - [Table of Contents](#table-of-contents) - [Getting started](#getting-started) - [Installing CMake](#installing-cmake) - [Cross-platform](#cross-platform) - [Windows](#windows) - [macOS](#macos) - [Ubuntu Linux](#ubuntu-linux) - [Installing dependencies](#installing-dependencies) - [Windows](#windows-1) - [macOS](#macos-1) - [Ubuntu](#ubuntu) - [Building Halide with CMake](#building-halide-with-cmake) - [Basic build](#basic-build) - [Windows](#windows-2) - [macOS and Linux](#macos-and-linux) - [CMake Presets](#cmake-presets) - [Installing](#installing) - [Build options](#build-options) - [Find module options](#find-module-options) - [Using Halide from your CMake build](#using-halide-from-your-cmake-build) - [A basic CMake project](#a-basic-cmake-project) - [JIT mode](#jit-mode) - [AOT mode](#aot-mode) - [Autoschedulers](#autoschedulers) - [RunGenMain](#rungenmain) - [Halide package documentation](#halide-package-documentation) - [Components](#components) - [Variables](#variables) - [Imported targets](#imported-targets) - [Functions](#functions) - [`add_halide_library`](#add_halide_library) - [`add_halide_generator`](#add_halide_generator) - [`add_halide_python_extension_library`](#add_halide_python_extension_library) - [`add_halide_runtime`](#add_halide_runtime) - [Cross compiling](#cross-compiling) - [Use `add_halide_generator`](#use-add_halide_generator) - [Use a super-build](#use-a-super-build) - [Use `ExternalProject` directly](#use-externalproject-directly) - [Use an emulator or run on device](#use-an-emulator-or-run-on-device) - [Bypass CMake](#bypass-cmake) - [Contributing CMake code to Halide](#contributing-cmake-code-to-halide) - [General guidelines and best practices](#general-guidelines-and-best-practices) - [Prohibited commands list](#prohibited-commands-list) - [Prohibited variables list](#prohibited-variables-list) - [Adding tests](#adding-tests) - [Adding apps](#adding-apps) # Getting started This section covers installing a recent version of CMake and the correct dependencies for building and using Halide. If you have not used CMake before, we strongly suggest reading through the [CMake documentation][cmake-docs] first. ## Installing CMake Halide requires at least version 3.22, which was released in November 2021. Fortunately, getting a recent version of CMake couldn't be easier, and there are multiple good options on any system to do so. Generally, one should always have the most recent version of CMake installed system-wide. CMake is committed to backwards compatibility and even the most recent release can build projects over a decade old. ### Cross-platform The Python package manager `pip3` has the newest version of CMake at all times. This might be the most convenient method since Python 3 is an optional dependency for Halide, anyway. ``` $ pip3 install --upgrade cmake ``` See the [PyPI website][pypi-cmake] for more details. ### Windows On Windows, there are three primary methods for installing an up-to-date CMake: 1. If you have Visual Studio 2019 installed, you can get CMake 3.17 through the Visual Studio installer. This is the recommended way of getting CMake if you are able to use Visual Studio 2019. See Microsoft's [documentation][vs2019-cmake-docs] for more details. 2. If you use [Chocolatey][chocolatey], its [CMake package][choco-cmake] is kept up to date. It should be as simple as `choco install cmake`. 3. Otherwise, you should install CMake from [Kitware's website][cmake-download]. 
### macOS On macOS, the [Homebrew][homebrew] [CMake package][brew-cmake] is kept up to date. Simply run: ``` $ brew update $ brew install cmake ``` to install the newest version of CMake. If your environment prevents you from installing Homebrew, the binary release on [Kitware's website][cmake-download] is also a viable option. ### Ubuntu Linux There are a few good ways to install a modern CMake on Ubuntu: 1. If you're on Ubuntu Linux 22.04 (Jammy Jellyfish), then simply running `sudo apt install cmake` will get you CMake 3.22. 2. If you are on an older Ubuntu release or would like to use the newest CMake, try installing via the snap store: `snap install cmake`. Be sure you do not already have `cmake` installed via APT. The snap package automatically stays up to date. 3. For older versions of Debian, Ubuntu, Mint, and derivatives, Kitware provides an [APT repository][cmake-apt] with up-to-date releases. Note that this is still useful for Ubuntu 20.04 because it will remain up to date. 4. If all else fails, you might need to build CMake from source (eg. on old Ubuntu versions running on ARM). In that case, follow the directions posted on [Kitware's website][cmake-from-source]. For other Linux distributions, check with your distribution's package manager or use pip as detailed above. Snap packages might also be available. **Note:** On WSL 1, the snap service is not available; in this case, prefer to use the APT repository. On WSL 2, all methods are available. ## Installing dependencies We generally recommend using a package manager to fetch Halide's dependencies. Except where noted, we recommend using [vcpkg][vcpkg] on Windows, [Homebrew][homebrew] on macOS, and APT on Ubuntu 20.04 LTS. Only LLVM and Clang are _absolutely_ required to build Halide. Halide always supports three LLVM versions: the current major version, the previous major version, and trunk. The LLVM and Clang versions must match exactly. For most users, we recommend using a binary release of LLVM rather than building it yourself. However, to run all of the tests and apps, an extended set is needed. This includes [lld][lld], [Python 3][python], [libpng][libpng], [libjpeg][libjpeg], [Doxygen][doxygen], [OpenBLAS][openblas], [ATLAS][atlas], and [Eigen3][eigen]. While not required to build any part of Halide, we find that [Ninja][ninja] is the best backend build tool across all platforms. Note that CMake has many special variables for overriding the locations of packages and executables. A partial list can be found in the ["find module options"](#find-module-options) section below, and more can be found in the documentation for the CMake [find_package][find_package] command. Normally, you should prefer to make sure your environment is set up so that CMake can find dependencies automatically. For instance, if you want CMake to use a particular version of Python, create a [virtual environment][venv] and activate it _before_ configuring Halide. ### Windows We assume you have vcpkg installed at `D:\vcpkg`. Follow the instructions in the [vcpkg README][vcpkg] to install. Start by installing LLVM. ``` D:\vcpkg> .\vcpkg install llvm[target-all,enable-assertions,clang-tools-extra]:x64-windows D:\vcpkg> .\vcpkg install llvm[target-all,enable-assertions,clang-tools-extra]:x86-windows ``` This will also install Clang and LLD. The `enable-assertions` option is not strictly necessary but will make debugging during development much smoother. These builds will take a long time and a lot of disk space. 
After they are built, it is safe to delete the intermediate build files and caches in `D:\vcpkg\buildtrees` and `%APPDATA%\local\vcpkg`. Then install the other libraries: ``` D:\vcpkg> .\vcpkg install libpng:x64-windows libjpeg-turbo:x64-windows openblas:x64-windows eigen3:x64-windows D:\vcpkg> .\vcpkg install libpng:x86-windows libjpeg-turbo:x86-windows openblas:x86-windows eigen3:x86-windows ``` To build the documentation, you will need to install [Doxygen][doxygen]. This can be done either through [Chocolatey][choco-doxygen] or from the [Doxygen website][doxygen-download]. ``` > choco install doxygen ``` To build the Python bindings, you will need to install Python 3. This should be done by running the official installer from the [Python website][python]. Be sure to download the debugging symbols through the installer. This will require using the "Advanced Installation" workflow. Although it is not strictly necessary, it is convenient to install Python system-wide on Windows (ie. `C:\Program Files`). This makes it easy for CMake to find without needing to manually set the `PATH`. Once Python is installed, you can install the Python module dependencies either globally or in a [virtual environment][venv] by running ``` > pip3 install -r .\python_bindings\requirements.txt ``` from the root of the repository. If you would like to use [Ninja][ninja], note that it is installed alongside CMake when using the Visual Studio 2019 installer. Alternatively, you can install via [Chocolatey][choco-ninja] or place the [pre-built binary][ninja-download] from their website in the PATH. ``` > choco install ninja ``` ### macOS On macOS, it is possible to install all dependencies via [Homebrew][homebrew]: ``` $ brew install llvm libpng libjpeg python@3.8 openblas doxygen ninja ``` The `llvm` package includes `clang`, `clang-format`, and `lld`, too. Don't forget to install the Python module dependencies: ``` $ pip3 install -r python_bindings/requirements.txt ``` ### Ubuntu Finally, on Ubuntu 20.04 LTS, you should install the following packages (this includes the Python module dependencies): ``` dev@ubuntu:~$ sudo apt install \ clang-tools lld llvm-dev libclang-dev liblld-10-dev \ libpng-dev libjpeg-dev libgl-dev \ python3-dev python3-numpy python3-scipy python3-imageio python3-pybind11 \ libopenblas-dev libeigen3-dev libatlas-base-dev \ doxygen ninja-build ``` # Building Halide with CMake ## Basic build These instructions assume that your working directory is the Halide repo root. ### Windows If you plan to use the Ninja generator, be sure to be in the developer command prompt corresponding to your intended environment. Note that whatever your intended target system (x86, x64, or arm), you must use the 64-bit _host tools_ because the 32-bit tools run out of memory during the linking step with LLVM. More information is available from [Microsoft's documentation][msvc-cmd]. You should either open the correct Developer Command Prompt directly or run the [`vcvarsall.bat`][vcvarsall] script with the correct argument, ie. 
one of the following: ``` D:\> "C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\VC\Auxiliary\Build\vcvarsall.bat" x64 D:\> "C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\VC\Auxiliary\Build\vcvarsall.bat" x64_x86 D:\> "C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\VC\Auxiliary\Build\vcvarsall.bat" x64_arm ``` Then, assuming that vcpkg is installed to `D:\vcpkg`, simply run: ``` > cmake -G Ninja -DCMAKE_BUILD_TYPE=Release -DCMAKE_TOOLCHAIN_FILE=D:\vcpkg\scripts\buildsystems\vcpkg.cmake -S . -B build > cmake --build .\build ``` Valid values of [`CMAKE_BUILD_TYPE`][cmake_build_type] are `Debug`, `RelWithDebInfo`, `MinSizeRel`, and `Release`. When using a single-configuration generator (like Ninja) you must specify a build type when configuring Halide (or any other CMake project). Otherwise, if you wish to create a Visual Studio based build system, you can configure with: ``` > cmake -G "Visual Studio 16 2019" -Thost=x64 -A x64 ^ -DCMAKE_TOOLCHAIN_FILE=D:\vcpkg\scripts\buildsystems\vcpkg.cmake ^ -S . -B build > cmake --build .\build --config Release -j %NUMBER_OF_PROCESSORS% ``` Because the Visual Studio generator is a _multi-config generator_, you don't set `CMAKE_BUILD_TYPE` at configure-time, but instead pass the configuration to the build (and test/install) commands with the `--config` flag. More documentation is available in the [CMake User Interaction Guide][cmake-user-interaction]. The process is similar for 32-bit: ``` > cmake -G "Visual Studio 16 2019" -Thost=x64 -A Win32 ^ -DCMAKE_TOOLCHAIN_FILE=D:\vcpkg\scripts\buildsystems\vcpkg.cmake ^ -S . -B build > cmake --build .\build --config Release -j %NUMBER_OF_PROCESSORS% ``` In both cases, the `-Thost=x64` flag ensures that the correct host tools are used. **Note:** due to limitations in MSBuild, incremental builds using the VS generators will not detect changes to headers in the `src/runtime` folder. We recommend using Ninja for day-to-day development and use Visual Studio only if you need it for packaging. ### macOS and Linux The instructions here are straightforward. Assuming your environment is set up correctly, just run: ``` dev@host:~/Halide$ cmake -G Ninja -DCMAKE_BUILD_TYPE=Release -S . -B build dev@host:~/Halide$ cmake --build ./build ``` If you omit `-G Ninja`, a Makefile-based generator will likely be used instead. In either case, [`CMAKE_BUILD_TYPE`][cmake_build_type] must be set to one of the standard types: `Debug`, `RelWithDebInfo`, `MinSizeRel`, or `Release`. ### CMake Presets If you are using CMake 3.21+, we provide several [presets][cmake_presets] to make the above commands more convenient. The following CMake preset commands correspond to the longer ones above. ``` > cmake --preset=win64 # VS 2019 generator, 64-bit build, vcpkg deps > cmake --preset=win32 # VS 2019 generator, 32-bit build, vcpkg deps > cmake --preset=release # Release mode, any single-config generator / compiler $ cmake --list-presets # Get full list of presets. ``` The Windows presets assume that the environment variable `VCPKG_ROOT` is set and points to the root of the vcpkg installation. There are also presets to use some Clang sanitizers with the CMake build; at present, only Fuzzer and ASAN (Address Sanitizer) are supported, and only on linux-x86-64. To use these, you must build LLVM with additional options: ``` -D LLVM_ENABLE_PROJECTS="clang;lld;clang-tools-extra" -D LLVM_ENABLE_RUNTIMES="compiler-rt;libcxx;libcxxabi;libunwind" ``` To build / test with ASAN, use `--preset linux-x64-asan`. 
To build / test with the Fuzzer, use `--preset linux-x64-fuzzer`. ## Installing Once built, Halide will need to be installed somewhere before using it in a separate project. On any platform, this means running the [`cmake --install`][cmake-install] command in one of two ways. For a single-configuration generator (like Ninja), run either: ``` dev@host:~/Halide$ cmake --install ./build --prefix /path/to/Halide-install > cmake --install .\build --prefix X:\path\to\Halide-install ``` For a multi-configuration generator (like Visual Studio) run: ``` dev@host:~/Halide$ cmake --install ./build --prefix /path/to/Halide-install --config Release > cmake --install .\build --prefix X:\path\to\Halide-install --config Release ``` Of course, make sure that you build the corresponding config before attempting to install it. ## Build options Halide reads and understands several options that can configure the build. The following are the most consequential and control how Halide is actually compiled. | Option | Default | Description | |------------------------------------------|-----------------------|------------------------------------------------------------------------------------------------------------------| | [`BUILD_SHARED_LIBS`][build_shared_libs] | `ON` | Standard CMake variable that chooses whether to build as a static or shared library. | | `Halide_BUNDLE_LLVM` | `OFF` | When building Halide as a static library, unpack the LLVM static libraries and add those objects to libHalide.a. | | `Halide_SHARED_LLVM` | `OFF` | Link to the shared version of LLVM. Not available on Windows. | | `Halide_ENABLE_RTTI` | _inherited from LLVM_ | Enable RTTI when building Halide. Recommended to be set to `ON` | | `Halide_ENABLE_EXCEPTIONS` | `ON` | Enable exceptions when building Halide | | `Halide_TARGET` | _empty_ | The default target triple to use for `add_halide_library` (and the generator tests, by extension) | The following options are _advanced_ and should not be required in typical workflows. Generally, these are used by Halide's own CI infrastructure, or as escape hatches for third-party packagers. | Option | Default | Description | |-----------------------------|--------------------------------------------------------------------|------------------------------------------------------------------------------------------| | `Halide_CLANG_TIDY_BUILD` | `OFF` | Used internally to generate fake compile jobs for runtime files when running clang-tidy. | | `Halide_CCACHE_BUILD` | `OFF` | Use ccache with Halide-recommended settings to accelerate rebuilds. | | `Halide_CCACHE_PARAMS` | `CCACHE_CPP2=yes CCACHE_HASHDIR=yes CCACHE_SLOPPINESS=pch_defines` | Options to pass to `ccache` when using `Halide_CCACHE_BUILD`. | | `Halide_SOVERSION_OVERRIDE` | `${Halide_VERSION_MAJOR}` | Override the SOVERSION for libHalide. Expects a positive integer (i.e. not a version). | The following options are only available when building Halide directly, ie. not through the [`add_subdirectory`][add_subdirectory] or [`FetchContent`][fetchcontent] mechanisms. They control whether non-essential targets (like tests and documentation) are built. 
| Option | Default | Description | |------------------------|----------------------|------------------------------------------------------------------| | `WITH_TESTS` | `ON` | Enable building unit and integration tests | | `WITH_PYTHON_BINDINGS` | `ON` if Python found | Enable building Python 3.x bindings | | `WITH_DOCS` | `OFF` | Enable building the documentation via Doxygen | | `WITH_UTILS` | `ON` | Enable building various utilities including the trace visualizer | | `WITH_TUTORIALS` | `ON` | Enable building the tutorials | The following options control whether to build certain test subsets. They only apply when `WITH_TESTS=ON`: | Option | Default | Description | |---------------------------|---------|-----------------------------------| | `WITH_TEST_AUTO_SCHEDULE` | `ON` | enable the auto-scheduling tests | | `WITH_TEST_CORRECTNESS` | `ON` | enable the correctness tests | | `WITH_TEST_ERROR` | `ON` | enable the expected-error tests | | `WITH_TEST_WARNING` | `ON` | enable the expected-warning tests | | `WITH_TEST_PERFORMANCE` | `ON` | enable performance testing | | `WITH_TEST_GENERATOR` | `ON` | enable the AOT generator tests | The following options enable/disable various LLVM backends (they correspond to LLVM component names): | Option | Default | Description | |----------------------|----------------------|-------------------------------------| | `TARGET_AARCH64` | `ON`, _if available_ | Enable the AArch64 backend | | `TARGET_AMDGPU` | `ON`, _if available_ | Enable the AMD GPU backend | | `TARGET_ARM` | `ON`, _if available_ | Enable the ARM backend | | `TARGET_HEXAGON` | `ON`, _if available_ | Enable the Hexagon backend | | `TARGET_NVPTX` | `ON`, _if available_ | Enable the NVidia PTX backend | | `TARGET_POWERPC` | `ON`, _if available_ | Enable the PowerPC backend | | `TARGET_RISCV` | `ON`, _if available_ | Enable the RISC V backend | | `TARGET_WEBASSEMBLY` | `ON`, _if available_ | Enable the WebAssembly backend. | | `TARGET_X86` | `ON`, _if available_ | Enable the x86 (and x86_64) backend | The following options enable/disable various Halide-specific backends: | Option | Default | Description | |-----------------------|---------|----------------------------------------| | `TARGET_OPENCL` | `ON` | Enable the OpenCL-C backend | | `TARGET_METAL` | `ON` | Enable the Metal backend | | `TARGET_D3D12COMPUTE` | `ON` | Enable the Direct3D 12 Compute backend | The following options are WebAssembly-specific. They only apply when `TARGET_WEBASSEMBLY=ON`: | Option | Default | Description | |-------------|---------|-------------------------------------------| | `WITH_WABT` | `ON` | Include WABT Interpreter for WASM testing | ### Find module options Halide uses the following find modules to search for certain dependencies. These modules accept certain variables containing hints for the search process. Before setting any of these variables, closely study the [`find_package`][find_package] documentation. All of these variables should be set at the CMake command line via the `-D` flag. First, Halide expects to find LLVM and Clang through the `CONFIG` mode of `find_package`. You can tell Halide where to find these dependencies by setting the corresponding `_DIR` variables: | Variable | Description | |-------------|------------------------------------------------| | `LLVM_DIR` | `$LLVM_ROOT/lib/cmake/LLVM/LLVMConfig.cmake` | | `Clang_DIR` | `$LLVM_ROOT/lib/cmake/Clang/ClangConfig.cmake` | Here, `$LLVM_ROOT` is assumed to point to the root of an LLVM installation tree. 
This is either a system path or one produced by running `cmake --install` (as detailed in the main README.md). When building LLVM (and any other `CONFIG` packages) manually, it is a common mistake to point CMake to a _build tree_ rather than an _install tree_. Doing so often produces inscrutable errors. When using CMake 3.18 or above, some of Halide's tests will search for CUDA using the [`FindCUDAToolkit`][findcudatoolkit] module. If it doesn't find your CUDA installation automatically, you can point it to it by setting: | Variable | Description | |--------------------|---------------------------------------------------| | `CUDAToolkit_ROOT` | Path to the directory containing `bin/nvcc[.exe]` | | `CUDA_PATH` | _Environment_ variable, same as above. | If the CMake version is lower than 3.18, the deprecated [`FindCUDA`][findcuda] module will be used instead. It reads the variable `CUDA_TOOLKIT_ROOT_DIR` instead of `CUDAToolkit_ROOT` above. TODO(https://github.com/halide/Halide/issues/5633): update this section for OpenGLCompute, which needs some (but maybe not all) of this. When targeting OpenGL, the [`FindOpenGL`][findopengl] and [`FindX11`][findx11] modules will be used to link AOT generated binaries. These modules can be overridden by setting the following variables: | Variable | Description | |-------------------------|----------------------------------| | `OPENGL_egl_LIBRARY` | Path to the EGL library. | | `OPENGL_glu_LIBRARY` | Path to the GLU library. | | `OPENGL_glx_LIBRARY` | Path to the GLVND GLX library. | | `OPENGL_opengl_LIBRARY` | Path to the GLVND OpenGL library | | `OPENGL_gl_LIBRARY` | Path to the OpenGL library. | The OpenGL paths will need to be set if you intend to use OpenGL with X11 on macOS. Halide also searches for `libpng` and `libjpeg-turbo` through the [`FindPNG`][findpng] and [`FindJPEG`][findjpeg] modules, respectively. They can be overridden by setting the following variables. | Variable | Description | |---------------------|----------------------------------------------------| | `PNG_LIBRARIES` | Paths to the libraries to link against to use PNG. | | `PNG_INCLUDE_DIRS` | Path to `png.h`, etc. | | `JPEG_LIBRARIES` | Paths to the libraries needed to use JPEG. | | `JPEG_INCLUDE_DIRS` | Paths to `jpeglib.h`, etc. | When `WITH_DOCS` is set to `ON`, Halide searches for Doxygen using the [`FindDoxygen`][finddoxygen] module. It can be overridden by setting the following variable. | Variable | Description | |----------------------|---------------------------------| | `DOXYGEN_EXECUTABLE` | Path to the Doxygen executable. | When compiling for an OpenCL target, Halide uses the [`FindOpenCL`][findopencl] target to locate the libraries and include paths. These can be overridden by setting the following variables: | Variable | Description | |-----------------------|-------------------------------------------------------| | `OpenCL_LIBRARIES` | Paths to the libraries to link against to use OpenCL. | | `OpenCL_INCLUDE_DIRS` | Include directories for OpenCL. | Lastly, Halide searches for Python 3 using the [`FindPython3`][findpython3] module, _not_ the deprecated `FindPythonInterp` and `FindPythonLibs` modules, like other projects you might have encountered. You can select which Python installation to use by setting the following variable. | Variable | Description | |--------------------|-------------------------------------------------------| | `Python3_ROOT_DIR` | Define the root directory of a Python 3 installation. 
| # Using Halide from your CMake build This section assumes some basic familiarity with CMake but tries to be explicit in all its examples. To learn more about CMake, consult the [documentation][cmake-docs] and engage with the community on the [CMake Discourse][cmake-discourse]. Note: previous releases bundled a `halide.cmake` module that was meant to be [`include()`][include]-ed into your project. This has been removed. Please upgrade to the new package config module. ## A basic CMake project There are two main ways to use Halide in your application: as a **JIT compiler** for dynamic pipelines or an **ahead-of-time (AOT) compiler** for static pipelines. CMake provides robust support for both use cases. No matter how you intend to use Halide, you will need some basic CMake boilerplate. ```cmake cmake_minimum_required(VERSION 3.22) project(HalideExample) set(CMAKE_CXX_STANDARD 17) # or newer set(CMAKE_CXX_STANDARD_REQUIRED YES) set(CMAKE_CXX_EXTENSIONS NO) find_package(Halide REQUIRED) ``` The [`cmake_minimum_required`][cmake_minimum_required] command is required to be the first command executed in a CMake program. It disables all of the deprecated behavior ("policies" in CMake lingo) from earlier versions. The [`project`][project] command sets the name of the project (and has arguments for versioning, language support, etc.) and is required by CMake to be called immediately after setting the minimum version. The next three variables set the project-wide C++ standard. The first, [`CMAKE_CXX_STANDARD`][cmake_cxx_standard], simply sets the standard version. Halide requires at least C++17. The second, [`CMAKE_CXX_STANDARD_REQUIRED`][cmake_cxx_standard_required], tells CMake to fail if the compiler cannot provide the requested standard version. Lastly, [`CMAKE_CXX_EXTENSIONS`][cmake_cxx_extensions] tells CMake to disable vendor-specific extensions to C++. This is not necessary to simply use Halide, but we require it when authoring new code in the Halide repo. Finally, we use [`find_package`][find_package] to locate Halide on your system. If Halide is not globally installed, you will need to add the root of the Halide installation directory to [`CMAKE_PREFIX_PATH`][cmake_prefix_path] at the CMake command line. ``` dev@ubuntu:~/myproj$ cmake -G Ninja -DCMAKE_BUILD_TYPE=Release -DCMAKE_PREFIX_PATH="/path/to/Halide-install" -S . -B build ``` ## JIT mode To use Halide in JIT mode (like the [tutorials][halide-tutorials] do, for example), you can simply link to `Halide::Halide`. ```cmake # ... same project setup as before ... add_executable(my_halide_app main.cpp) target_link_libraries(my_halide_app PRIVATE Halide::Halide) ``` Then `Halide.h` will be available to your code and everything should just work. That's it! ## AOT mode Using Halide in AOT mode is more complicated so we'll walk through it step by step. Note that this only applies to Halide generators, so it might be useful to re-read the [tutorial][halide-generator-tutorial] on generators. Assume (like in the tutorial) that you have a source file named `my_generators.cpp` and that in it you have generator classes `MyFirstGenerator` and `MySecondGenerator` with registered names `my_first_generator` and `my_second_generator` respectively. Then the first step is to add a **generator executable** to your build: ```cmake # ... same project setup as before ... 
add_executable(my_generators my_generators.cpp)
target_link_libraries(my_generators PRIVATE Halide::Generator)
```

Using the generator executable, we can add a Halide library corresponding to `MyFirstGenerator`.

```cmake
# ... continuing from above
add_halide_library(my_first_generator FROM my_generators)
```

This will create a static library target in CMake that corresponds to the output of running your generator. The second generator in the file requires generator parameters to be passed to it. These are also easy to handle:

```cmake
# ... continuing from above
add_halide_library(my_second_generator FROM my_generators
                   PARAMS parallel=false scale=3.0 rotation=ccw output.type=uint16)
```

Adding multiple configurations is easy, too:

```cmake
# ... continuing from above
add_halide_library(my_second_generator_2 FROM my_generators
                   GENERATOR my_second_generator
                   PARAMS scale=9.0 rotation=ccw output.type=float32)

add_halide_library(my_second_generator_3 FROM my_generators
                   GENERATOR my_second_generator
                   PARAMS parallel=false output.type=float64)
```

Here, we had to specify which generator to use (`my_second_generator`) since `add_halide_library` uses the target name by default. The functions in these libraries will be named after the target names, `my_second_generator_2` and `my_second_generator_3`, by default, but it is possible to control this via the `FUNCTION_NAME` parameter.

Each of these library targets carries an associated `.runtime` target (e.g. `my_first_generator.runtime`), which is also a static library containing the Halide runtime. It is transitively linked to targets that link to the main library. On an operating system like Linux, where weak linking is available, this is not an issue. However, on Windows, this can fail due to symbol redefinitions. In these cases, you must declare that two Halide libraries share a runtime, like so:

```cmake
# ... updating above
add_halide_library(my_second_generator_2 FROM my_generators
                   GENERATOR my_second_generator
                   USE_RUNTIME my_first_generator.runtime
                   PARAMS scale=9.0 rotation=ccw output.type=float32)

add_halide_library(my_second_generator_3 FROM my_generators
                   GENERATOR my_second_generator
                   USE_RUNTIME my_first_generator.runtime
                   PARAMS parallel=false output.type=float64)
```

This will even work correctly when different combinations of targets are specified for each Halide library. A "greatest common denominator" target will be chosen that is compatible with all of them (or the build will fail).

### Autoschedulers

When the autoschedulers are included in the release package, they are very simple to apply to your own generators. For example, we could update the definition of the `my_second_generator` library above to use the `Adams2019` autoscheduler:

```cmake
add_halide_library(my_second_generator FROM my_generators
                   AUTOSCHEDULER Halide::Adams2019)
```

### RunGenMain

Halide provides a generic driver for generators to be used during development for benchmarking and debugging. Suppose you have a generator executable called `my_gen` and a generator within called `my_filter`. Then you can pass a variable name to the `REGISTRATION` parameter of `add_halide_library`, which will be set to the name of a generated C++ source file that should be linked to `Halide::RunGenMain` and `my_filter`.

For example:

```cmake
add_halide_library(my_filter FROM my_gen
                   REGISTRATION filter_reg_cpp)
add_executable(runner ${filter_reg_cpp})
target_link_libraries(runner PRIVATE my_filter Halide::RunGenMain)
```

Then you can run, debug, and benchmark your generator through the `runner` executable.
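As a sketch of how the runner can fit into a test workflow, you could also register it with CTest. This assumes CTest is enabled in your project, that your generator declares estimates for all of its inputs and outputs, and that the RunGen flags below behave as expected on your version (consult `runner --help` for the authoritative flag list):

```cmake
# Hypothetical smoke test: run the filter once using the estimates baked
# into the generator, then benchmark it. Adjust the flags to taste.
add_test(NAME my_filter_rungen
         COMMAND runner --estimate_all --benchmarks=all)
```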
## Halide package documentation Halide provides a CMake _package configuration_ module. The intended way to use the CMake build is to run `find_package(Halide ...)` in your `CMakeLists.txt` file. Closely read the [`find_package` documentation][find_package] before proceeding. ### Components The Halide package script understands a handful of optional components when loading the package. First, if you plan to use the Halide Image IO library, you will want to include the `png` and `jpeg` components when loading Halide. Second, Halide releases can contain a variety of configurations: static, shared, debug, release, etc. CMake handles Debug/Release configurations automatically, but generally only allows one type of library to be loaded. The package understands two components, `static` and `shared`, that specify which type of library you would like to load. For example, if you want to make sure that you link against shared Halide, you can write: ```cmake find_package(Halide REQUIRED COMPONENTS shared) ``` If the shared libraries are not available, this will result in a failure. If no component is specified, then the `Halide_SHARED_LIBS` variable is checked. If it is defined and set to true, then the shared libraries will be loaded or the package loading will fail. Similarly, if it is defined and set to false, the static libraries will be loaded. If no component is specified and `Halide_SHARED_LIBS` is _not_ defined, then the [`BUILD_SHARED_LIBS`][build_shared_libs] variable will be inspected. If it is **not defined** or **defined and set to true**, then it will attempt to load the shared libs and fall back to the static libs if they are not available. Similarly, if `BUILD_SHARED_LIBS` is **defined and set to false**, then it will try the static libs first then fall back to the shared libs. ### Variables Variables that control package loading: | Variable | Description | |----------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | `Halide_SHARED_LIBS` | override `BUILD_SHARED_LIBS` when loading the Halide package via `find_package`. Has no effect when using Halide via `add_subdirectory` as a Git or `FetchContent` submodule. | | `Halide_RUNTIME_NO_THREADS` | skip linking of Threads library to runtime. Should be set if your toolchain does not support it (e.g. baremetal). | | `Halide_RUNTIME_NO_DL_LIBS` | skip linking of DL library to runtime. Should be set if your toolchain does not support it (e.g. baremetal). | Variables set by the package: | Variable | Description | |----------------------------|--------------------------------------------------------------------| | `Halide_VERSION` | The full version string of the loaded Halide package | | `Halide_VERSION_MAJOR` | The major version of the loaded Halide package | | `Halide_VERSION_MINOR` | The minor version of the loaded Halide package | | `Halide_VERSION_PATCH` | The patch version of the loaded Halide package | | `Halide_VERSION_TWEAK` | The tweak version of the loaded Halide package | | `Halide_HOST_TARGET` | The Halide target triple corresponding to "host" for this build. | | `Halide_CMAKE_TARGET` | The Halide target triple corresponding to the active CMake target. 
| | `Halide_ENABLE_EXCEPTIONS` | Whether Halide was compiled with exception support | | `Halide_ENABLE_RTTI` | Whether Halide was compiled with RTTI | Variables that control package behavior: | Variable | Description | |----------------------------|-------------| | `Halide_PYTHON_LAUNCHER` | Semicolon separated list containing a command to launch the Python interpreter. Can be used to set environment variables for Python generators. | | `Halide_NO_DEFAULT_FLAGS` | Off by default. When enabled, suppresses recommended compiler flags that would be added by `add_halide_generator` | ### Imported targets Halide defines the following targets that are available to users: | Imported target | Description | |----------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | `Halide::Halide` | this is the JIT-mode library to use when using Halide from C++. | | `Halide::Generator` | this is the target to use when defining a generator executable. It supplies a `main()` function. | | `Halide::Runtime` | adds include paths to the Halide runtime headers | | `Halide::Tools` | adds include paths to the Halide tools, including the benchmarking utility. | | `Halide::ImageIO` | adds include paths to the Halide image IO utility. Depends on `PNG::PNG` and `JPEG::JPEG` if they exist or were loaded through the corresponding package components. | | `Halide::ThreadPool` | adds include paths to the Halide _simple_ thread pool utility library. This is not the same as the runtime's thread pool and is intended only for use by tests. Depends on `Threads::Threads`. | | `Halide::RunGenMain` | used with the `REGISTRATION` parameter of `add_halide_library` to create simple runners and benchmarking tools for Halide libraries. | The following targets are not guaranteed to be available: | Imported target | Description | |-------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------| | `Halide::Python` | this is a Python 3 package that can be referenced as `$/..` when setting up `PYTHONPATH` for Python tests or the like from CMake. | | `Halide::Adams19` | the Adams et.al. 2019 autoscheduler (no GPU support) | | `Halide::Li18` | the Li et.al. 2018 gradient autoscheduler (limited GPU support) | | `Halide::Mullapudi2016` | the Mullapudi et.al. 2016 autoscheduler (no GPU support) | ### Functions Currently, only two functions are defined: #### `add_halide_library` This is the main function for managing generators in AOT compilation. The full signature follows: ``` add_halide_library( FROM [GENERATOR generator-name] [FUNCTION_NAME function-name] [NAMESPACE cpp-namespace] [USE_RUNTIME hl-target] [PARAMS param1 [param2 ...]] [TARGETS target1 [target2 ...]] [FEATURES feature1 [feature2 ...]] [PLUGINS plugin1 [plugin2 ...]] [AUTOSCHEDULER scheduler-name] [GRADIENT_DESCENT] [C_BACKEND] [REGISTRATION OUTVAR] [HEADER OUTVAR] [FUNCTION_INFO_HEADER OUTVAR] [ OUTVAR]) extra-output = ASSEMBLY | BITCODE | COMPILER_LOG | FEATURIZATION | LLVM_ASSEMBLY | PYTHON_EXTENSION | PYTORCH_WRAPPER | SCHEDULE | STMT | STMT_HTML ``` This function creates a called `` corresponding to running the `` (an executable target which links to `Halide::Generator`) one time, using command line arguments derived from the other parameters. The arguments `GENERATOR` and `FUNCTION_NAME` default to ``. 
They correspond to the `-g` and `-f` command line flags, respectively. `NAMESPACE` is syntactic sugar to specify the C++ namespace (if any) of the generated function; you can also specify the C++ namespace (if any) directly in the `FUNCTION_NAME` argument, but for repeated declarations or very long namespaces, specifying this separately can provide more readable build files. If `USE_RUNTIME` is not specified, this function will create another target called `.runtime` which corresponds to running the generator with `-r` and a compatible list of targets. This runtime target is an INTERFACE dependency of ``. If multiple runtime targets need to be linked together, setting `USE_RUNTIME` to another Halide library, `` will prevent the generation of `.runtime` and instead use `.runtime`. This argument is most commonly used in conjunction with [`add_halide_runtime`](#add_halide_runtime). Parameters can be passed to a generator via the `PARAMS` argument. Parameters should be space-separated. Similarly, `TARGETS` is a space-separated list of targets for which to generate code in a single function. They must all share the same platform/bits/os triple (eg. `arm-32-linux`). Features that are in common among all targets, including device libraries (like `cuda`) should go in `FEATURES`. If `TARGETS` is not specified, the value of `Halide_TARGET` specified at configure time will be used. Every element of `TARGETS` must begin with the same `arch-bits-os` triple. This function understands two _meta-triples_, `host` and `cmake`. The meta-triple `host` is equal to the `arch-bits-os` triple used to compile Halide along with all of the supported instruction set extensions. On platforms that support running both 32 and 64-bit programs, this will not necessarily equal the platform the compiler is running on or that CMake is targeting. The meta-triple `cmake` is equal to the `arch-bits-os` of the current CMake target. This is useful if you want to make sure you are not unintentionally cross-compiling, which would result in an [`IMPORTED` target][imported-target] being created. When `TARGETS` is empty and the `host` target would not cross-compile, then `host` will be used. Otherwise, `cmake` will be used and an author warning will be issued. To use an autoscheduler, set the `AUTOSCHEDULER` argument to a target named like `Namespace::Scheduler`, for example `Halide::Adams19`. This will set the `autoscheduler` GeneratorParam on the generator command line to `Scheduler` and add the target to the list of plugins. Additional plugins can be loaded by setting the `PLUGINS` argument. If the argument to `AUTOSCHEDULER` does not contain `::` or it does not name a target, it will be passed to the `-s` flag verbatim. If `GRADIENT_DESCENT` is set, then the module will be built suitably for gradient descent calculation in TensorFlow or PyTorch. See `Generator::build_gradient_module()` for more documentation. This corresponds to passing `-d 1` at the generator command line. If the `C_BACKEND` option is set, this command will invoke the configured C++ compiler on a generated source. Note that a `.runtime` target is _not_ created in this case, and the `USE_RUNTIME` option is ignored. Other options work as expected. If `REGISTRATION` is set, the path (relative to `CMAKE_CURRENT_BINARY_DIR`) to the generated `.registration.cpp` file will be set in `OUTVAR`. This can be used to generate a runner for a Halide library that is useful for benchmarking and testing, as documented above. 
This is equivalent to setting `-e registration` at the generator command line. If `HEADER` is set, the path (relative to `CMAKE_CURRENT_BINARY_DIR`) to the generated `.h` header file will be set in `OUTVAR`. This can be used with `install(FILES)` to conveniently deploy the generated header along with your library. If `FUNCTION_INFO_HEADER` is set, the path (relative to `CMAKE_CURRENT_BINARY_DIR`) to the generated `.function_info.h` header file will be set in `OUTVAR`. This produces a file that contains `constexpr` descriptions of information about the generated functions (e.g., argument type and information). It is generated separately from the normal `HEADER` file because `HEADER` is intended to work with basic `extern "C"` linkage, while `FUNCTION_INFO_HEADER` requires C++17 or later to use effectively. (This can be quite useful for advanced usages, such as producing automatic call wrappers, etc.) Examples of usage can be found in the generated file. Lastly, each of the `extra-output` arguments directly correspond to an extra output (via `-e`) from the generator. The value `OUTVAR` names a variable into which a path (relative to [`CMAKE_CURRENT_BINARY_DIR`][cmake_current_binary_dir]) to the extra file will be written. #### `add_halide_generator` This function aids in creating cross-compilable builds that use Halide generators. ``` add_halide_generator( target [PACKAGE_NAME package-name] [PACKAGE_NAMESPACE namespace] [EXPORT_FILE export-file] [PYSTUB generator-name] [[SOURCES] source1 ...] ) ``` Every named argument is optional, and the function uses the following default arguments: - If `PACKAGE_NAME` is not provided, it defaults to `${PROJECT_NAME}-halide_generators`. - If `PACKAGE_NAMESPACE` is not provided, it defaults to `${PROJECT_NAME}::halide_generators::`. - If `EXPORT_FILE` is not provided, it defaults to `${PROJECT_BINARY_DIR}/cmake/${ARG_PACKAGE_NAME}-config.cmake` The `SOURCES` keyword marks the beginning of sources to be used to build ``, if it is not loaded. All unparsed arguments will be interpreted as sources. This function guarantees that a Halide generator target named `` is available. It will first search for a package named `` using `find_package`; if it is found, it is assumed that it provides the target. Otherwise, it will create an executable target named `target` and an `ALIAS` target ``. This function also creates a custom target named `` if it does not exist and `` would exist. In this case, `` will depend on ``, this enables easy building of _just_ the Halide generators managed by this function. After the call, `_FOUND` will be set to true if the host generators were imported (and hence won't be built). Otherwise, it will be set to false. This variable may be used to conditionally set properties on ``. Please see [test/integration/xc](https://github.com/halide/Halide/tree/main/test/integration/xc) for a simple example and [apps/hannk](https://github.com/halide/Halide/tree/main/apps/hannk) for a complete app that uses it extensively. If `PYSTUB` is specified, then a Python Extension will be built that wraps the Generator with CPython glue to allow use of the Generator Python 3.x. The result will be a a shared library of the form `_pystub..so`, where describes the specific Python version and platform (e.g., `cpython-310-darwin` for Python 3.10 on macOS.). See `README_python.md` for examples of use. 
#### `add_halide_python_extension_library` This function wraps the outputs of one or more `add_halide_library` targets with glue code to produce a Python Extension library. ``` add_halide_python_extension_library( target [MODULE_NAME module-name] HALIDE_LIBRARIES library1 ... ) ``` `FROM` specifies any valid Generator target. If omitted, `HALIDE_LIBRARIES` is a list of one of more `add_halide_library` targets. Each will be added to the extension as a callable method of the module. Note that every library specified must be built with the `PYTHON_EXTENSION` keyword specified, and all libraries must use the same Halide runtime. The result will be a a shared library of the form `..so`, where describes the specific Python version and platform (e.g., `cpython-310-darwin` for Python 3.10 on macOS.) #### `add_halide_runtime` This function generates a library containing a Halide runtime. Most user code will never need to use this, as `add_halide_library()` will call it for you if necessary. The most common use case is usually in conjunction with `add_halide_python_extension_library()`, as a way to ensure that all the halide libraries share an identical runtime. ``` add_halide_runtime( target [TARGETS target1 [target2 ...]] ) ``` The `TARGETS` argument has identical semantics to the argument of the same name for [`add_halide_library`](#add_halide_library). ## Cross compiling Cross-compiling in CMake can be tricky, since CMake doesn't easily support compiling for both the host platform and the cross-platform within the same build. Unfortunately, Halide generator executables are just about always designed to run on the host platform. Each project will be set up differently and have different requirements, but here are some suggestions for effective use of CMake in these scenarios. ### Use `add_halide_generator` If you are writing new programs that use Halide, you might wish to use our helper, `add_halide_generator`. When using this helper, you are expected to build your project twice: once for your build host and again for your intended target. When building the host build, you can use the `` (see the documentation above) target to build _just_ the generators. Then, in the target build, set `_ROOT` to the host build directory. For example: ``` $ cmake -G Ninja -S . -B build-host -DCMAKE_BUILD_TYPE=Release $ cmake --build build-host --target $ cmake -G Ninja -S . -B build-target -DCMAKE_BUILD_TYPE=Release \ -DCMAKE_TOOLCHAIN_FILE=/path/to/target-tc.cmake \ -D_ROOT:FILEPATH=$PWD/build-host $ cmake --build build-target ``` ### Use a super-build A CMake super-build consists of breaking down a project into sub-projects that are isolated by [toolchain][cmake-toolchains]. The basic structure is to have an outermost project that only coordinates the sub-builds via the [`ExternalProject`][externalproject] module. One would then use Halide to build a generator executable in one self-contained project, then export that target to be used in a separate project. The second project would be configured with the target [toolchain][cmake-toolchains] and would call `add_halide_library` with no `TARGETS` option and set `FROM` equal to the name of the imported generator executable. Obviously, this is a significant increase in complexity over a typical CMake project. This is very compatible with the `add_halide_generator` strategy above. ### Use `ExternalProject` directly A lighter weight alternative to the above is to use [`ExternalProject`][externalproject] directly in your parent build. 
Configure the parent build with the target [toolchain][cmake-toolchains], and configure the inner project to use the host toolchain. Then, manually create an [`IMPORTED` target][imported-executable] for your generator executable and call `add_halide_library` as described above. The main drawback of this approach is that creating accurate `IMPORTED` targets is difficult since predicting the names and locations of your binaries across all possible platform and CMake project generators is difficult. In particular, it is hard to predict executable extensions in cross-OS builds. ### Use an emulator or run on device The [`CMAKE_CROSSCOMPILING_EMULATOR`][cmake_crosscompiling_emulator] variable allows one to specify a command _prefix_ to run a target-system binary on the host machine. One could set this to a custom shell script that uploads the generator executable, runs it on the device and copies back the results. ### Bypass CMake The previous two options ensure that the targets generated by `add_halide_library` will be _normal_ static libraries. This approach does not use [`ExternalProject`][externalproject], but instead produces `IMPORTED` targets. The main drawback of `IMPORTED` targets is that they are considered second-class in CMake. In particular, they cannot be installed with the typical [`install(TARGETS)` command][install-targets]. Instead, they must be installed using [`install(FILES)`][install-files] and the [`$`][target-file] generator expression. # Contributing CMake code to Halide When contributing new CMake code to Halide, keep in mind that the minimum version is 3.22. Therefore, it is possible (and indeed required) to use modern CMake best practices. Like any large and complex system with a dedication to preserving backwards compatibility, CMake is difficult to learn and full of traps. While not comprehensive, the following serves as a guide for writing quality CMake code and outlines the code quality expectations we have as they apply to CMake. ## General guidelines and best practices The following are some common mistakes that lead to subtly broken builds. - **Reading the build directory.** While setting up the build, the build directory should be considered _write only_. Using the build directory as a read/write temporary directory is acceptable as long as all temp files are cleaned up by the end of configuration. - **Not using [generator expressions][cmake-genex].** Declarative is better than imperative and this is no exception. Conditionally adding to a target property can leak unwanted details about the build environment into packages. Some information is not accurate or available except via generator expressions, eg. the build configuration. - **Using the wrong variable.** `CMAKE_SOURCE_DIR` doesn't always point to the Halide source root. When someone uses Halide via [`FetchContent`][fetchcontent], it will point to _their_ source root instead. The correct variable is [`Halide_SOURCE_DIR`][project-name_source_dir]. If you want to know if the compiler is MSVC, check it directly with the [`MSVC`][msvc] variable; don't use [`WIN32`][win32]. That will be wrong when compiling with clang on Windows. In most cases, however, a generator expression will be more appropriate. - **Using directory properties.** Directory properties have vexing behavior and are essentially deprecated from CMake 3.0+. Propagating target properties is the way of the future. - **Using the wrong visibility.** Target properties can be `PRIVATE`, `INTERFACE`, or both (aka `PUBLIC`). 
Pick the most conservative one for each scenario. Refer to the [transitive usage requirements][cmake-propagation] docs for more information. - **Needlessly expanding variables** The [`if`][cmake_if] and [`foreach`][cmake_foreach] commands generally expand variables when provided by name. Expanding such variables manually can unintentionally change the behavior of the command. Use `foreach (item IN LISTS list)` instead of `foreach (item ${list})`. Similarly, use `if (varA STREQUAL varB)` instead of `if ("${varA}" STREQUAL "${varB}")` and _definitely_ don't use `if (${varA} STREQUAL ${varB})` since that will fail (in the best case) if either variable's value contains a semi-colon (due to argument expansion). ### Prohibited commands list As mentioned above, using directory properties is brittle and they are therefore _not allowed_. The following functions may not appear in any new CMake code. | Command | Alternative | |-------------------------------------|----------------------------------------------------------------------------------------------------| | `add_compile_definitions` | Use [`target_compile_definitions`][target_compile_definitions] | | `add_compile_options` | Use [`target_compile_options`][target_compile_options] | | `add_definitions` | Use [`target_compile_definitions`][target_compile_definitions] | | `add_link_options` | Use [`target_link_options`][target_link_options], but prefer not to use either | | `get_directory_property` | Use cache variables or target properties | | `get_property(... DIRECTORY)` | Use cache variables or target properties | | `include_directories` | Use [`target_include_directories`][target_include_directories] | | `link_directories` | Use [`target_link_libraries`][target_link_libraries] | | `link_libraries` | Use [`target_link_libraries`][target_link_libraries] | | `remove_definitions` | [Generator expressions][cmake-genex] in [`target_compile_definitions`][target_compile_definitions] | | `set_directory_properties` | Use cache variables or target properties | | `set_property(... DIRECTORY)` | Use cache variables or target properties | | `target_link_libraries(target lib)` | Use [`target_link_libraries`][target_link_libraries] _with a visibility specifier_ (eg. `PRIVATE`) | As an example, it was once common practice to write code similar to this: ```cmake # WRONG: do not do this include_directories(include) add_library(my_lib source1.cpp ..) ``` However, this has two major pitfalls. First, it applies to _all_ targets created in that directory, even those before the call to `include_directories` and those created in [`include()`][include]-ed CMake files. As CMake files get larger and more complex, this behavior gets harder to pinpoint. This is particularly vexing when using the `link_libraries` or `add_defintions` commands. Second, this form does not provide a way to _propagate_ the include directory to consumers of `my_lib`. The correct way to do this is: ```cmake # CORRECT add_library(my_lib source1.cpp ...) target_include_directories(my_lib PUBLIC $) ``` This is better in many ways. It only affects the target in question. It propagates the include path to the targets linking to it (via `PUBLIC`). It also does not incorrectly export the host-filesystem-specific include path when installing or packaging the target (via `$`). If common properties need to be grouped together, use an INTERFACE target (better) or write a function (worse). 
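For example, a minimal sketch of the INTERFACE-target approach (all names here are illustrative) might look like:

```cmake
# Group shared usage requirements on a single INTERFACE target...
add_library(my_project_settings INTERFACE)
target_compile_features(my_project_settings INTERFACE cxx_std_17)
target_include_directories(my_project_settings INTERFACE
                           "$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>")

# ...and link it into each real target that needs them, with an explicit
# visibility specifier so the requirements propagate (or not) as intended.
add_library(my_lib source1.cpp)
target_link_libraries(my_lib PUBLIC my_project_settings)
```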
There are also several functions that are disallowed for other reasons: | Command | Reason | Alternative | |---------------------------------|-----------------------------------------------------------------------------------|----------------------------------------------------------------------------------------| | `aux_source_directory` | Interacts poorly with incremental builds and Git | List source files explicitly | | `build_command` | CTest internal function | Use CTest build-and-test mode via [`CMAKE_CTEST_COMMAND`][cmake_ctest_command] | | `cmake_host_system_information` | Usually misleading information. | Inspect [toolchain][cmake-toolchains] variables and use generator expressions. | | `cmake_policy(... OLD)` | OLD policies are deprecated by definition. | Instead, fix the code to work with the new policy. | | `create_test_sourcelist` | We use our own unit testing solution | See the [adding tests](#adding-tests) section. | | `define_property` | Adds unnecessary complexity | Use a cache variable. Exceptions under special circumstances. | | `enable_language` | Halide is C/C++ only | [`FindCUDAToolkit`][findcudatoolkit] or [`FindCUDA`][findcuda], appropriately guarded. | | `file(GLOB ...)` | Interacts poorly with incremental builds and Git | List source files explicitly. Allowed if not globbing for source files. | | `fltk_wrap_ui` | Halide does not use FLTK | None | | `include_external_msproject` | Halide must remain portable | Write a CMake package config file or find module. | | `include_guard` | Use of recursive inclusion is not allowed | Write (recursive) functions. | | `include_regular_expression` | Changes default dependency checking behavior | None | | `load_cache` | Superseded by [`FetchContent`][fetchcontent]/[`ExternalProject`][externalproject] | Use aforementioned modules | | `macro` | CMake macros are not hygienic and are therefore error-prone | Use functions instead. | | `site_name` | Privacy: do not want leak host name information | Provide a cache variable, generate a unique name. | | `variable_watch` | Debugging helper | None. Not needed in production. | Lastly, do not introduce any dependencies via [`find_package`][find_package] without broader approval. Confine dependencies to the `dependencies/` subtree. ### Prohibited variables list Any variables that are specific to languages that are not enabled should, of course, be avoided. But of greater concern are variables that are easy to misuse or should not be overridden for our end-users. The following (non-exhaustive) list of variables shall not be used in code merged into main. | Variable | Reason | Alternative | |---------------------------------|-----------------------------------------------|---------------------------------------------------------------------------------------------------------| | `CMAKE_ROOT` | Code smell | Rely on `find_package` search options; include `HINTS` if necessary | | `CMAKE_DEBUG_TARGET_PROPERTIES` | Debugging helper | None | | `CMAKE_FIND_DEBUG_MODE` | Debugging helper | None | | `CMAKE_RULE_MESSAGES` | Debugging helper | None | | `CMAKE_VERBOSE_MAKEFILE` | Debugging helper | None | | `CMAKE_BACKWARDS_COMPATIBILITY` | Deprecated | None | | `CMAKE_BUILD_TOOL` | Deprecated | `${CMAKE_COMMAND} --build` or [`CMAKE_MAKE_PROGRAM`][cmake_make_program] (but see below) | | `CMAKE_CACHEFILE_DIR` | Deprecated | [`CMAKE_BINARY_DIR`][cmake_binary_dir], but see below | | `CMAKE_CFG_INTDIR` | Deprecated | `$`, `$`, target resolution of [`add_custom_command`][add_custom_command], etc. 
| | `CMAKE_CL_64` | Deprecated | [`CMAKE_SIZEOF_VOID_P`][cmake_sizeof_void_p] | | `CMAKE_COMPILER_IS_*` | Deprecated | [`CMAKE__COMPILER_ID`][cmake_lang_compiler_id] | | `CMAKE_HOME_DIRECTORY` | Deprecated | [`CMAKE_SOURCE_DIR`][cmake_source_dir], but see below | | `CMAKE_DIRECTORY_LABELS` | Directory property | None | | `CMAKE_BUILD_TYPE` | Only applies to single-config generators. | `$` | | `CMAKE_*_FLAGS*` (w/o `_INIT`) | User-only | Write a [toolchain][cmake-toolchains] file with the corresponding `_INIT` variable | | `CMAKE_COLOR_MAKEFILE` | User-only | None | | `CMAKE_ERROR_DEPRECATED` | User-only | None | | `CMAKE_CONFIGURATION_TYPES` | We only support the four standard build types | None | Of course feel free to insert debugging helpers _while developing_ but please remove them before review. Finally, the following variables are allowed, but their use must be motivated: | Variable | Reason | Alternative | |------------------------------------------------|-----------------------------------------------------|----------------------------------------------------------------------------------------------| | [`CMAKE_SOURCE_DIR`][cmake_source_dir] | Points to global source root, not Halide's. | [`Halide_SOURCE_DIR`][project-name_source_dir] or [`PROJECT_SOURCE_DIR`][project_source_dir] | | [`CMAKE_BINARY_DIR`][cmake_binary_dir] | Points to global build root, not Halide's | [`Halide_BINARY_DIR`][project-name_binary_dir] or [`PROJECT_BINARY_DIR`][project_binary_dir] | | [`CMAKE_MAKE_PROGRAM`][cmake_make_program] | CMake abstracts over differences in the build tool. | Prefer CTest's build and test mode or CMake's `--build` mode | | [`CMAKE_CROSSCOMPILING`][cmake_crosscompiling] | Often misleading. | Inspect relevant variables directly, eg. [`CMAKE_SYSTEM_NAME`][cmake_system_name] | | [`BUILD_SHARED_LIBS`][build_shared_libs] | Could override user setting | None, but be careful to restore value when overriding for a dependency | Any use of these functions and variables will block a PR. ## Adding tests When adding a file to any of the folders under `test`, be aware that CI expects that every `.c` and `.cpp` appears in the `CMakeLists.txt` file _on its own line_, possibly as a comment. This is to avoid globbing and also to ensure that added files are not missed. For most test types, it should be as simple as adding to the existing lists, which must remain in alphabetical order. Generator tests are trickier, but following the existing examples is a safe way to go. ## Adding apps If you're contributing a new app to Halide: great! Thank you! There are a few guidelines you should follow when writing a new app. - Write the app as if it were a top-level project. You should call `find_package(Halide)` and set the C++ version to 11. - Call [`enable_testing()`][enable_testing] and add a small test that runs the app. - Don't assume your app will have access to a GPU. Write your schedules to be robust to varying buildbot hardware. - Don't assume your app will be run on a specific OS, architecture, or bitness. Write your apps to be robust (ideally efficient) on all supported platforms. - If you rely on any additional packages, don't include them as `REQUIRED`, instead test to see if their targets are available and, if not, call `return()` before creating any targets. In this case, print a `message(STATUS "[SKIP] ...")`, too. - Look at the existing apps for examples. - Test your app with ctest before opening a PR. Apps are built as part of the test, rather than the main build. 
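Putting those guidelines together, the skeleton of a new app's `CMakeLists.txt` might look roughly like the following. This is a sketch only: `my_app` and `SomeOptionalDep` are placeholder names, and the minimum CMake version shown is illustrative.

```cmake
cmake_minimum_required(VERSION 3.22)
project(my_app)

enable_testing()

# Per the guidelines above: behave like a top-level project and use C++11.
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_CXX_STANDARD_REQUIRED YES)
set(CMAKE_CXX_EXTENSIONS NO)

find_package(Halide REQUIRED)

# Hypothetical optional dependency: skip the app (do not fail) when it is absent.
find_package(SomeOptionalDep QUIET)
if (NOT TARGET SomeOptionalDep::SomeOptionalDep)
    message(STATUS "[SKIP] my_app requires SomeOptionalDep")
    return()
endif ()

add_executable(my_app main.cpp)
target_link_libraries(my_app PRIVATE Halide::Halide SomeOptionalDep::SomeOptionalDep)

# A small test that runs the app.
add_test(NAME my_app COMMAND my_app)
```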
[add_custom_command]: https://cmake.org/cmake/help/latest/command/add_custom_command.html [add_library]: https://cmake.org/cmake/help/latest/command/add_library.html [add_subdirectory]: https://cmake.org/cmake/help/latest/command/add_subdirectory.html [atlas]: http://math-atlas.sourceforge.net/ [brew-cmake]: https://formulae.brew.sh/cask/cmake#default [build_shared_libs]: https://cmake.org/cmake/help/latest/variable/BUILD_SHARED_LIBS.html [choco-cmake]: https://chocolatey.org/packages/cmake/ [choco-doxygen]: https://chocolatey.org/packages/doxygen.install [choco-ninja]: https://chocolatey.org/packages/ninja [chocolatey]: https://chocolatey.org/ [cmake-apt]: https://apt.kitware.com/ [cmake-discourse]: https://discourse.cmake.org/ [cmake-docs]: https://cmake.org/cmake/help/latest/ [cmake-download]: https://cmake.org/download/ [cmake-from-source]: https://cmake.org/install/ [cmake-genex]: https://cmake.org/cmake/help/latest/manual/cmake-generator-expressions.7.html [cmake-install]: https://cmake.org/cmake/help/latest/manual/cmake.1.html#install-a-project [cmake-propagation]: https://cmake.org/cmake/help/latest/manual/cmake-buildsystem.7.html#transitive-usage-requirements [cmake-toolchains]: https://cmake.org/cmake/help/latest/manual/cmake-toolchains.7.html [cmake-user-interaction]: https://cmake.org/cmake/help/latest/guide/user-interaction/index.html#setting-build-variables [cmake_binary_dir]: https://cmake.org/cmake/help/latest/variable/CMAKE_BINARY_DIR.html [cmake_build_type]: https://cmake.org/cmake/help/latest/variable/CMAKE_BUILD_TYPE.html [cmake_crosscompiling]: https://cmake.org/cmake/help/latest/variable/CMAKE_CROSSCOMPILING.html [cmake_crosscompiling_emulator]: https://cmake.org/cmake/help/latest/variable/CMAKE_CROSSCOMPILING_EMULATOR.html [cmake_ctest_command]: https://cmake.org/cmake/help/latest/variable/CMAKE_CTEST_COMMAND.html [cmake_current_binary_dir]: https://cmake.org/cmake/help/latest/variable/CMAKE_CURRENT_BINARY_DIR.html [cmake_cxx_extensions]: https://cmake.org/cmake/help/latest/variable/CMAKE_CXX_EXTENSIONS.html [cmake_cxx_standard]: https://cmake.org/cmake/help/latest/variable/CMAKE_CXX_STANDARD.html [cmake_cxx_standard_required]: https://cmake.org/cmake/help/latest/variable/CMAKE_CXX_STANDARD_REQUIRED.html [cmake_foreach]: https://cmake.org/cmake/help/latest/command/foreach.html [cmake_if]: https://cmake.org/cmake/help/latest/command/if.html [cmake_lang_compiler_id]: https://cmake.org/cmake/help/latest/variable/CMAKE_LANG_COMPILER_ID.html [cmake_make_program]: https://cmake.org/cmake/help/latest/variable/CMAKE_MAKE_PROGRAM.html [cmake_minimum_required]: https://cmake.org/cmake/help/latest/command/cmake_minimum_required.html [cmake_prefix_path]: https://cmake.org/cmake/help/latest/variable/CMAKE_PREFIX_PATH.html [cmake_presets]: https://cmake.org/cmake/help/latest/manual/cmake-presets.7.html [cmake_sizeof_void_p]: https://cmake.org/cmake/help/latest/variable/CMAKE_SIZEOF_VOID_P.html [cmake_source_dir]: https://cmake.org/cmake/help/latest/variable/CMAKE_SOURCE_DIR.html [cmake_system_name]: https://cmake.org/cmake/help/latest/variable/CMAKE_SYSTEM_NAME.html [doxygen-download]: https://www.doxygen.nl/download.html [doxygen]: https://www.doxygen.nl/index.html [eigen]: http://eigen.tuxfamily.org/index.php?title=Main_Page [enable_testing]: https://cmake.org/cmake/help/latest/command/enable_testing.html [externalproject]: https://cmake.org/cmake/help/latest/module/ExternalProject.html [fetchcontent]: https://cmake.org/cmake/help/latest/module/FetchContent.html [find_package]: 
https://cmake.org/cmake/help/latest/command/find_package.html [findcuda]: https://cmake.org/cmake/help/latest/module/FindCUDA.html [findcudatoolkit]: https://cmake.org/cmake/help/latest/module/FindCUDAToolkit.html [finddoxygen]: https://cmake.org/cmake/help/latest/module/FindDoxygen.html [findjpeg]: https://cmake.org/cmake/help/latest/module/FindJPEG.html [findopencl]: https://cmake.org/cmake/help/latest/module/FindOpenCL.html [findopengl]: https://cmake.org/cmake/help/latest/module/FindOpenGL.html [findpng]: https://cmake.org/cmake/help/latest/module/FindPNG.html [findpython3]: https://cmake.org/cmake/help/latest/module/FindPython3.html [findx11]: https://cmake.org/cmake/help/latest/module/FindX11.html [halide-generator-tutorial]: https://halide-lang.org/tutorials/tutorial_lesson_15_generators.html [halide-tutorials]: https://halide-lang.org/tutorials/tutorial_introduction.html [homebrew]: https://brew.sh [imported-executable]: https://cmake.org/cmake/help/latest/command/add_executable.html#imported-executables [imported-target]: https://cmake.org/cmake/help/latest/manual/cmake-buildsystem.7.html#imported-targets [include]: https://cmake.org/cmake/help/latest/command/include.html [install-files]: https://cmake.org/cmake/help/latest/command/install.html#files [install-targets]: https://cmake.org/cmake/help/latest/command/install.html#targets [libjpeg]: https://www.libjpeg-turbo.org/ [libpng]: http://www.libpng.org/pub/png/libpng.html [lld]: https://lld.llvm.org/ [msvc]: https://cmake.org/cmake/help/latest/variable/MSVC.html [msvc-cmd]: https://docs.microsoft.com/en-us/cpp/build/building-on-the-command-line?view=vs-2019 [ninja-download]: https://github.com/ninja-build/ninja/releases [ninja]: https://ninja-build.org/ [openblas]: https://www.openblas.net/ [project]: https://cmake.org/cmake/help/latest/command/project.html [project-name_binary_dir]: https://cmake.org/cmake/help/latest/variable/PROJECT-NAME_BINARY_DIR.html [project-name_source_dir]: https://cmake.org/cmake/help/latest/variable/PROJECT-NAME_SOURCE_DIR.html [project_source_dir]: https://cmake.org/cmake/help/latest/variable/PROJECT_SOURCE_DIR.html [project_binary_dir]: https://cmake.org/cmake/help/latest/variable/PROJECT_BINARY_DIR.html [pypi-cmake]: https://pypi.org/project/cmake/ [python]: https://www.python.org/downloads/ [target-file]: https://cmake.org/cmake/help/latest/manual/cmake-generator-expressions.7.html#target-dependent-queries [target_compile_definitions]: https://cmake.org/cmake/help/latest/command/target_compile_definitions.html [target_compile_options]: https://cmake.org/cmake/help/latest/command/target_compile_options.html [target_include_directories]: https://cmake.org/cmake/help/latest/command/target_include_directories.html [target_link_libraries]: https://cmake.org/cmake/help/latest/command/target_link_libraries.html [target_link_options]: https://cmake.org/cmake/help/latest/command/target_link_options.html [vcpkg]: https://github.com/Microsoft/vcpkg [vcvarsall]: https://docs.microsoft.com/en-us/cpp/build/building-on-the-command-line?view=vs-2019#vcvarsall-syntax [venv]: https://docs.python.org/3/tutorial/venv.html [vs2019-cmake-docs]: https://docs.microsoft.com/en-us/cpp/build/cmake-projects-in-visual-studio?view=vs-2019 [win32]: https://cmake.org/cmake/help/latest/variable/WIN32.html Halide-17.0.1/README_fuzz_testing.md000066400000000000000000000076011456515664200171020ustar00rootroot00000000000000# Fuzz testing Halide has a set of fuzz-testing harnesses that can be used to find those tricky to find, edge 
cases and bugs that would otherwise not be caught by a regular unit-testing suite. At the moment these fuzz tests are housed in the `test/fuzz` directory. The fuzz-testing suite uses the common [libfuzzer](https://www.llvm.org/docs/LibFuzzer.html) interface for fuzz tests.

## Building fuzz tests

Fuzz testing requires specific instrumentation across the entire build; to do this we make use of a fuzzing-specific toolchain/preset, e.g.

```
cmake -B build --preset linux-x64-fuzzer -DLLVM_ROOT=/path/to/llvminstall
cmake --build ./build -j$(nproc)
```

Note that the LLVM install that you use must be built with `-D LLVM_ENABLE_RUNTIMES="compiler-rt"` set if you want to build the fuzzer tests (failing to do so will fail at configure time); not all prebuilt LLVM installs include this, so you may need to build LLVM from source to run the fuzz tests locally.

## Using the fuzz-harnesses

Fuzz-testing harnesses are a little different from a more traditional unit test and don't have a definitive end of test. In other words, a fuzz test will run:

- for an infinite amount of time (the default),
- for a user-specified maximum amount of time,
- until the fuzzer finds a bug and crashes, or
- until you manually kill the process (e.g. Ctrl-C).

Once you have built the fuzz-testing suite using the commands listed above, you can list the fuzz-testing harnesses using the command:

```
ls ./build/test/fuzz/fuzz_*
```

To run a fuzzer, simply run the fuzz-testing harness with no arguments, e.g. `./build/test/fuzz/fuzz_simplify`

By default this will run the fuzz test on a single core and discard whatever temporary corpus is created. To reuse a given corpus (recommended), create a new directory to store the corpus generated by your fuzz-testing harness and pass that directory into your fuzzer, e.g.

```
mkdir fuzz_simplify_corpus -p
./build/test/fuzz/fuzz_simplify fuzz_simplify_corpus
```

This will save the state of the fuzzer between runs; any progress that your fuzzer makes improving code coverage will remain persistent on your disk.

Up until this point the fuzzer has only been running on a single core. To speed things up a little, let's run the fuzzer in parallel across all available cores on our machine.

```
./build/test/fuzz/fuzz_simplify fuzz_simplify_corpus -fork=$(nproc)
```

## Reproducing crashes

An important part of fuzz testing is reproducing the crashing input. To handle this, a libfuzzer-based fuzz harness will create a crash file whenever the fuzzer exits unexpectedly. This will look something like: `crash-`

To reproduce a crash, we simply rerun our fuzz harness with our crash file as the first argument.

`./build/test/fuzz/fuzz_simplify crash-`

So long as your fuzz harness and library are deterministic, this should reproduce the original crash.

## Adding new fuzz tests

A bare-bones fuzzer will look something like the following:

```cpp
#include <cstddef>
#include <cstdint>
// ...plus whatever header declares the function under test (foo, below).

extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
    // Randomly throw data at our function and hope it doesn't crash.
    foo(data, size);
    return 0;
}
```

This assumes that our function `foo` takes in a buffer and the size of said buffer. But in many cases we would like to make use of more structured data, e.g. a string or a vector of integers, etc. Thankfully, libfuzzer provides a handy helper to convert a raw buffer into common structured data types, the [FuzzedDataProvider class](https://github.com/llvm/llvm-project/blob/main/compiler-rt/include/fuzzer/FuzzedDataProvider.h). For examples on how to use this class see `test/fuzz/simplify.cpp`.
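For reference, a minimal sketch of a harness using `FuzzedDataProvider` might look like the following. This is illustrative only (it is not the contents of `test/fuzz/simplify.cpp`), and `do_something` is a hypothetical function under test:

```cpp
#include <cstddef>
#include <cstdint>
#include <string>
#include <vector>

#include <fuzzer/FuzzedDataProvider.h>

// Hypothetical function under test; it should not crash on any input.
void do_something(int depth, bool flag, const std::string &name,
                  const std::vector<uint8_t> &rest);

extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
    FuzzedDataProvider fdp(data, size);

    // Carve structured values out of the raw fuzzer input.
    const int depth = fdp.ConsumeIntegralInRange<int>(0, 10);
    const bool flag = fdp.ConsumeBool();
    const std::string name = fdp.ConsumeRandomLengthString();

    // Whatever bytes remain can still be used as a plain buffer.
    const std::vector<uint8_t> rest = fdp.ConsumeRemainingBytes<uint8_t>();

    do_something(depth, flag, name, rest);
    return 0;
}
```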
## Other useful materials - [The official libfuzzer docs](https://www.llvm.org/docs/LibFuzzer.html) - [The libfuzzer tutorial](https://github.com/google/fuzzing/blob/master/tutorial/libFuzzerTutorial.md) Halide-17.0.1/README_python.md000066400000000000000000000775121456515664200157000ustar00rootroot00000000000000# Halide Bindings for Python - [Python Requirements](#python-requirements) - [Compilation Instructions](#compilation-instructions) - [Documentation and Examples](#documentation-and-examples) - [Differences from C++ API](#differences-from-c-api) - [Example of Simple Usage](#example-of-simple-usage) - [Halide Generators In Python](#halide-generators-in-python) - [Writing a Generator in Python](#writing-a-generator-in-python) - [@hl.generator\("name"\)](#hlgeneratorname) - [hl.GeneratorParam](#hlgeneratorparam) - [hl.InputBuffer, hl.InputScalar](#hlinputbuffer-hlinputscalar) - [hl.OutputBuffer, hl.OutputScalar](#hloutputbuffer-hloutputscalar) - [Names](#names) - [generate\(\) method](#generate-method) - [Types for Inputs and Outputs](#types-for-inputs-and-outputs) - [Using a Generator for JIT compilation](#using-a-generator-for-jit-compilation) - [Using a Generator for AOT compilation](#using-a-generator-for-aot-compilation) - [Calling Generator-Produced code from Python](#calling-generator-produced-code-from-python) - [Advanced Generator-Related Topics](#advanced-generator-related-topics) - [Generator Aliases](#generator-aliases) - [Dynamic Inputs and Outputs](#dynamic-inputs-and-outputs) - [Calling a Generator Directly](#calling-a-generator-directly) - [The Lifecycle Of A Generator](#the-lifecycle-of-a-generator) - [Notable Differences Between C++ and Python Generators](#notable-differences-between-c-and-python-generators) - [Keeping Up To Date](#keeping-up-to-date) - [License](#license) Halide provides Python bindings for most of its public API. Python 3.8 (or higher) is required. The Python bindings are supported on 64-bit Linux, OSX, and Windows systems. In addition to the ability to write just-in-time Halide code using Python, you can write [Generators](#halide-generators-in-python) using the Python bindings, which can simplify build-system integration (since no C++ metacompilation step is required). You can also use existing Halide Generators (written in either C++ or Python) to produce Python extensions that can be used within Python code. ## Python Requirements Before building, you should ensure you have prerequite packages installed in your local Python environment. The best way to get set up is to use a virtual environment: ```console $ python3 -m venv venv $ . venv/bin/activate $ python3 -m pip install -U setuptools wheel $ python3 -m pip install -r requirements.txt ``` ## Compilation Instructions Build as part of the CMake build with `-DWITH_PYTHON_BINDINGS=ON` (this is the default). Note that this requires both Halide and LLVM to be built with RTTI and exceptions **enabled**, which is not the default for LLVM. ## Documentation and Examples As mentioned elsewhere, the Python API attempts to mimic the [C++ Halide API](http://halide-lang.org/docs) as directly as possible; there isn't separate Python-specific documentation for the API at this time. For now, examine the code for the example applications in the `test/apps/` and `tutorial/` subdirectories. The tests run as part of the standard CTest infrastructure and are labeled with the `python` label. You can run the Python tests specifically by running: ``` $ ctest -L python ``` from the Halide build directory. 
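For instance, a typical configure/build/test sequence might look something like the sketch below (the LLVM path is a placeholder, and your LLVM must be built with RTTI and exceptions enabled, as noted above):

```
$ cmake -G Ninja -S . -B build -DCMAKE_BUILD_TYPE=Release \
    -DWITH_PYTHON_BINDINGS=ON -DLLVM_DIR=/path/to/llvm-install/lib/cmake/llvm
$ cmake --build build
$ cd build && ctest -L python
```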
## Differences from C++ API The Python bindings attempt to mimic the Halide C++ API as closely as possible, with some differences where the C++ idiom is either inappropriate or impossible: - Most APIs that take a variadic argument list of ints in C++ take an explicit list in Python. For instance, the usual version of the `Buffer` ctor in C++ offers both variadic and list versions: ``` Buffer<>(Type t, int extent_dim_0, int extent_dim_1, ...., extent_dim_N, string name = ""); Buffer<>(Type t, vector extents, string name = ""); ``` In Python, only the second variant is provided. - `Func` and `Buffer` access is done using `[]` rather than `()` - For zero-dimensional `Func` and `Buffer`, you must explicitly specify `[()]` -- that is, use an empty tuple as the index -- because `[]` is not syntactically acceptable in Python. - Some classes in the Halide API aren't provided because standard Python idioms are a better fit: - `Halide::Tuple` doesn't exist in the Python bindings; an ordinary Python tuple of `Halide::Expr` is used instead. - `Halide::Realization` doesn't exist in the Python bindings; an ordinary Python tuple of `Halide::Buffer` is used instead. - static and instance method overloads with the same name in the same class aren't allowed, so some convenience methods are missing from `Halide::Var` - Templated types (notably `Halide::Buffer<>` and `Halide::Param<>`) aren't provided, for obvious reasons; only the equivalents of `Halide::Buffer` and `Halide::Param` are supported. - The functions in `Halide::ConciseCasts` are present in the toplevel Halide module in Python, rather than a submodule: e.g., use `halide.i8_sat()`, not `halide.ConciseCasts.i8_sat()`. - Only things in the `Halide` namespace are supported; classes and methods that involve using the `Halide::Internal` namespace are not provided. - No mechanism is provided for overriding any runtime functions from Python for JIT-compiled code. (Runtime functions for AOT-compiled code can be overridden by building and linking a custom runtime, but not currently via any runtime API, e.g. halide_set_custom_print() does not exist.) - No mechanism is provided for supporting `Func::define_extern`. - `Buffer::for_each_value()` isn't supported yet. - `Func::in` becomes `Func.in_` because `in` is a Python keyword. - `Func::async` becomes `Func.async_` because `async` is a Python keyword. - The `not` keyword cannot be used to negate boolean Halide expressions. Instead, the `logical_not` function can be used and is equivalent to using `operator!` in C++. - There is no way to override the logical `and`/`or` operators in Python to work with `Expr`: you must use the bitwise `|` and `&` instead. (Note that incorrectly using using `and`/`or` just short-circuits weirdly, rather than failing with some helpful error; this is an issue that we have not yet found any way to improve, unfortunately.) - Some error messages need to be made more informative. - Some exceptions are the "incorrect" type (compared to C++ expectations). - Many hooks to override runtime functions (e.g. Func::set_error_handler) aren't yet implemented. 
- The following parts of the Halide public API are currently missing entirely from the Python bindings (but are all likely to be supported at some point in the future): - `DeviceInterface` - `evaluate()` ## Example of Simple Usage The Python bindings for Halide are built as a standard part of the `install` target, and are present in the Halide install location at `$HALIDE_INSTALL/lib/python3/site-packages`; adding that to your `PYTHONPATH` should allow you to simply `import halide`: ``` # By convention, we import halide as 'hl' for terseness import halide as hl # Some constants edge = 512 k = 20.0 / float(edge) # Simple formula x, y, c = hl.Var('x'), hl.Var('y'), hl.Var('c') f = hl.Func('f') e = hl.sin(x * ((c + 1) / 3.0) * k) * hl.cos(y * ((c + 1) / 3.0) * k) f[x, y, c] = hl.cast(hl.UInt(8), e * 255.0) f.vectorize(x, 8).parallel(y) # Realize into a Buffer. buf = f.realize([edge, edge, 3]) # Do something with the image. We'll just save it to a PNG. from halide import imageio imageio.imwrite("/tmp/example.png", buf) ``` It's worth noting in the example above that the Halide `Buffer` object supports the Python Buffer Protocol (https://www.python.org/dev/peps/pep-3118) and thus is converted to and from other compatible objects (e.g., NumPy's `ndarray`), at essentially zero cost, with storage being shared. Thus, we can usually pass it directly to existing Python APIs (like `imsave()`) that expect 'image-like' objects without any explicit conversion necessary. ## Halide Generators In Python In Halide, a "Generator" is a unit of encapsulation for Halide code. It is a self-contained piece of code that can: - Produce a chunk of Halide IR (in the form of an `hl.Pipeline`) that is appropriate for compilation (via either JIT or AOT) - Expose itself to the build system in a discoverable way - Fully describe itself for the build system with metadata for (at least) the type and number of inputs and outputs expected - Allow for build-time customization of coder-specified parameters in a way that doesn't require editing of source code Originally, Halide only supported writing Generators in C++. In this document, we'll use the term "C++ Generator" to mean "Generator written in C++ using the classic API", the term "Python Generator" to mean "Generator written in Halide's Python bindings", and just plain "Generator" when the discussion is relatively neutral with respect to the implementation language/API. ### Writing a Generator in Python A Python Generator is a class that: - has the `@hl.generator` decorator applied to it - declares zero or more member fields that are initialized with values of `hl.InputBuffer` or `hl.InputScalar`, which specify the expected input(s) of the resulting `Pipeline`. - declares one or more member fields that are initialized with values of `hl.OutputBuffer` or `hl.OutputScalar`, which specify the expected output(s) of the resulting `Pipeline`. - declares zero or more member fields that are initialized with values of `hl.GeneratorParam`, which can be used to pass arbitrary information from the build system to the Generator. A GeneratorParam can carry a value of type `bool`, `int`, `float`, `str`, or `hl.Type`. - declares a `generate()` method that fill in the Halide IR needed to define all of the Outputs - optionally declares a `configure()` method to dynamically add Inputs or Outputs to the pipeline, based on (e.g.) 
the values of `GeneratorParam` values or other external inputs Let's look at a fairly simple example: > **TODO:** this example is pretty contrived; is there an equally simple > Generator to use here that would demonstrate the basics? ``` import halide as hl x = hl.Var('x') y = hl.Var('y') _operators = { 'xor': lambda a, b: a ^ b, 'and': lambda a, b: a & b, 'or': lambda a, b: a | b } # Apply a mask value to a 2D image using a logical operator that is selected at compile-time. @hl.generator(name = "logical_op_generator") class LogicalOpGenerator: op = hl.GeneratorParam("xor") input = hl.InputBuffer(hl.UInt(8), 2) mask = hl.InputScalar(hl.UInt(8)) output = hl.OutputBuffer(hl.UInt(8), 2) def generate(g): # Algorithm operator = _operators[g.op] g.output[x, y] = operator(g.input[x, y], g.mask) # Schedule v = g.natural_vector_size(hl.UInt(8)) g.output.vectorize(x, v) if __name__ == "__main__": hl.main() ``` If you've worked with Halide Generators written in C++, the "shape" of this will likely look familiar. (If not, no worries; you shouldn't need any knowledge of C++ Generators for the following to make sense.) Let's take the details here one at a time. #### @hl.generator("name") This decorator adds appropriate "glue" machinery to the class to enforce various invariants. It also serves as the declares a "registered name" for the Generator, which is a unique name that the build system will use to identify the Generator. If you omit the name, it defaults to defaults to `module.classname`; if module is `__main__` then we omit it and just use the plain classname. Note that the registered name need not match the classname. (Inside Halide, we use the convention of `CamelCase` for class names and `snake_case` for registered names, but you can use whatever convention you like.) #### hl.GeneratorParam Each `GeneratorParam` is an arbitrary key-value pair that can be used to provide configurable options at compile time. You provide the name and a default value. The default value can be overridden by the build machinery, which will replace the value (based on user specified text). Note that the type of the default value *is* used to define the expected type of the `GeneratorParam`, and trying to set it to an incompatible value will throw an exception. The types that are acceptable to use in a `GeneratorParam` are: - Python's `bool`, `int`, `float`, or `str` - Halide's `hl.Type` - ...that's all Note that the value of a `GeneratorParam` is read-only from the point of view of the Generator; they are set at Generator construction time and attempting to change their value will throw an exception. #### hl.InputBuffer, hl.InputScalar These declare the inputs to the `hl.Pipeline` that the Generator will produce. An `hl.InputScalar` is, essentially, a "factory" that produces an `hl.Param` in the existing Python API, while an `hl.InputBuffer` is a factory for `hl.ImageParam`. From the Generator author's perspective, a field initialized with `InputScalar` **is** a `Param` – not kinda-like-one, not a magic wrapper that forwards everything; it is literally just `hl.Param`. Similarly, an `InputBuffer` produces `ImageParam`, and an `InputFunc` is a wrapper around `Func`. You won't be able to assign a new value to the member field for Inputs – as with GeneratorParams, they are "read-only" to the Generator – but you will be able to set constraints on them. 
Note that in addition to specifying a concrete type and dimensionality for the inputs, these factory classes support the ability to specify either (or both) `None`, which means the type/dimensionality will be provided by GeneratorParams in the build system. #### hl.OutputBuffer, hl.OutputScalar These declare the output(s) of the Pipeline that the Generator will produce. An `hl.OutputBuffer` is, essentially, a "factory" that produces an `hl.Func` in the existing Python API. (`hl.OutputScalar` is just an `hl.OutputBuffer` that always has zero dimensions.) From the Generator author's perspective, a field declared with `OutputBuffer` **is** a `Func` – not kinda-like-one, not a magic wrapper that forwards everything; it is literally just `hl.Func` (with type-and-dimensionality set to match, see recent PR https://github.com/halide/Halide/pull/6734) . You won't be able to assign a new value to the member field for Inputs – as with GeneratorParams, they are "read-only" to the Generator – but you will be able to set constraints on them. Note that in addition to specifying a concrete type and dimensionality for the inputs, these factory classes support the ability to specify either (or both) as `None`, which means the type/dimensionality will be provided by GeneratorParams in the build system. #### Names Note that all of the GeneratorParams, Inputs, and Outputs have names that are implicitly filled in based on the fieldname of their initial assignment; unlike in C++ Generators, there isn't a way to "override" this name (i.e., the name in the IR will always exactly match the Python field name). Names have the same constraints as for C++ Generators (essentially, a C identifier, but without an initial underscore, and without any double underscore anywhere). #### generate() method This will be called by the Generator machinery to build the Pipeline. As with C++ Generators, the only required task is to ensure that all Output fields are fully defined, in a way that matches the type-and-dimension constraints specified. It is required that the `generate()` method be defined by the Generator. (Note that, by convention, Halide Generators use `g` instead of `self` in their `generate()` method to make the expression language terser; this is not in any way required, but is recommended to improve readability.) #### Types for Inputs and Outputs For all of the Input and Output fields of Generators, you can specify native Python types (instead of `hl.Type`) for certain cases that are unambiguous. At present, we allow `bool` as an alias for `hl.Bool()`, `int` as an alias for `hl.Int(32)`, and `float` as an alias for `hl.Float(32)`. ### Using a Generator for JIT compilation You can use the `compile_to_callable()` method to JIT-compile a Generator into a `hl.Callable`, which is (essentially) just a dynamically-created function. 
``` import LogicalOpGenerator from halide import imageio import numpy as np # Instantiate a Generator -- we can only set the GeneratorParams # by passing in a dict to the Generator's constructor or_op_generator = LogicalOpGenerator({"op": "or"}) # Now compile the Generator into a Callable or_filter = or_op_generator.compile_to_callable() # Read in some file for input input_buf = imageio.imread("/path/to/some/file.png") assert input_buf.ndim == 2 assert input_buf.dtype == np.uint8 # create a Buffer-compatible object for the output; we'll use np.array output_buf = np.empty(input_buf.shape, dtype=input_buf.dtype) # Note, Python code throws exception for error conditions rather than returning an int or_filter(input_buf, 0x7f, output_buf) # Note also that we can use named arguments for any/all, in the Python manner: or_filter(mask=0x7f, input=input_buf, output=output_buf) imageio.imwrite("/tmp/or.png", output_buf) ``` By default, a Generator will produce code targeted at `Target("host")` (or the value of the `HL_JIT_TARGET` environment variable, if set); you can override this behavior selectively by activating a `GeneratorContext` when the Generator is *created*: ``` import LogicalOpGenerator # Compile with debugging enabled t = hl.Target("host-debug") with hl.GeneratorContext(t): or_op_generator = LogicalOpGenerator({"op": "or"}) or_filter = or_op_generator.compile_to_callable() ``` ### Using a Generator for AOT compilation If you are using CMake, the simplest thing is to use `add_halide_library` and `add_halide_python_extension_library()`: ``` # Build a Halide library as you usually would, but be sure to include `PYTHON_EXTENSION` add_halide_library(xor_filter FROM logical_op_generator PARAMS op=xor PYTHON_EXTENSION output_path_var [ FEATURES ... ] [ PARAMS ... ]) # Now wrap the generated code with a Python extension. # (Note that module name defaults to match the target name; we only # need to specify MODULE_NAME if we need a name that may differ) add_halide_python_extension_library(my_extension MODULE_NAME my_module HALIDE_LIBRARIES xor_filter) ``` (Note that this rule works for both C++ and Python Generators.) This compiles the Generator code in `logical_op_generator.py` with the registered name `logical_op_generator` to produce the target `xor_filter`, and then wraps the compiled output with a Python extension. The result will be a shared library of the form `..so`, where describes the specific Python version and platform (e.g., `cpython-310-darwin` for Python 3.10 on OSX.) Note that you can combine multiple Halide libraries into a single Python module; this is convenient for packagaing, but also because all the libraries in a single extension module share the same Halide runtime (and thus, the same caches, thread pools, etc.). ``` add_halide_library(xor_filter ...) add_halide_library(and_filter ...) add_halide_library(or_filter ...) add_halide_python_extension_library(my_extension MODULE_NAME my_module HALIDE_LIBRARIES xor_filter and_filter or_filter) ``` Note that you must take care to ensure that all of the `add_halide_library` targets specified use the same Halide runtime; it may be necessary to use `add_halide_runtime` to define an explicit runtime that is shared by all of the targets: ``` add_halide_runtime(my_runtime) add_halide_library(xor_filter USE_RUNTIME my_runtime ...) add_halide_library(and_filter USE_RUNTIME my_runtime ...) add_halide_library(or_filter USE_RUNTIME my_runtime ...) 
add_halide_python_extension_library(my_extension MODULE_NAME my_module HALIDE_LIBRARIES xor_filter and_filter or_filter) ``` If you're not using CMake, you can "drive" a Generator directly from your build system via command-line flags. The most common, minimal set looks something like this: ``` python3 /path/to/my/generator.py -g \ -o \ target= \ [generator-param=value ...] ``` The argument to `-g` is the name supplied to the `@hl.generator` decorator. The argument to -o is a directory to use for the output files; by default, we'll produce a static library containing the object code, and a C++ header file with a forward declaration. `target` specifies a Halide `Target` string decribing the OS, architecture, features, etc that should be used for compilation. Any other arguments to the command line that don't begin with `-` are presumed to name `GeneratorParam` values to set. There are other flags and options too, of course; use `python3 /path/to/my/generator.py -help` to see a list with explanations. (Unfortunately, there isn't (yet) a way to produce a Python Extension just by running a Generator; the logic for `add_halide_python_extension_library` is currently all in the CMake helper files.) ### Calling Generator-Produced code from Python As long as the shared library is in `PYTHONPATH`, it can be imported and used directly. For the example above: ``` from my_module import xor_filter from halide import imageio import numpy as np # Read in some file for input input_buf = imageio.imread("/path/to/some/file.png") assert input_buf.ndim == 2 assert input_buf.dtype == np.uint8 # create a Buffer-compatible object for the output; we'll use np.array output_buf = np.empty(input_buf.shape, dtype=input_buf.dtype) # Note, Python code throws exception for error conditions rather than returning an int xor_filter(input_buf, 0xff, output_buf) # Note also that we can use named arguments for any/all, in the Python manner: # xor_filter(input=input_buf, mask=0xff, output=output_buf) imageio.imwrite("/tmp/xored.png", output_buf) ``` Above, we're using common Python utilities (`numpy`) to construct the input/output buffers we want to pass to Halide. **Note**: Getting the memory order correct can be a little confusing for numpy. By default numpy uses "C-style" [row-major](https://docs.scipy.org/doc/numpy-1.13.0/reference/internals.html) order, which sounds like the right option for Halide; however, this nomenclature assumes the matrix-math convention of ordering axes as `[rows, cols]`, whereas Halide (and imaging code in general) generally assumes `[x, y]` (i.e., `[cols, rows]`). Thus what you usually want in Halide is column-major ordering. This means numpy arrays, by default, come with the wrong memory layout for Halide. But if you construct the numpy arrays yourself (like above), you can pass `order='F'` to make numpy use the Halide-compatible memory layout. If you're passing in an array constructed somewhere else, the easiest thing to do is to `.transpose()` it before passing it to your Halide code.) ### Advanced Generator-Related Topics #### Generator Aliases A Generator alias is a way to associate a Generator with one (or more) specific sets of GeneratorParams; the 'alias' is just another registered name. This offers a convenient alternative to specifying multiple sets of GeneratorParams via the build system. 
To define alias(es) for a Generator, just add the `@hl.alias` decorator before `@hl.generator` decorator: ``` @hl.alias( xor_generator={"op": "xor"}, and_generator={"op": "and"}, or_generator={"op": "or"} ) @hl.generator("logical_op_generator") class LogicalOpGenerator: ... ``` #### Dynamic Inputs and Outputs If you need to build `Input` and/or `Output` dynamically, you can define a `configure()` method. It will always be called after all `GeneratorParam` values are valid, but before `generate()` is called. Let's take our example and add an option to pass an offset to be added after the logical operator is done: ``` import halide as hl x = hl.Var('x') y = hl.Var('y') _operators = { 'xor': lambda a, b: a ^ b, 'and': lambda a, b: a & b, 'or': lambda a, b: a | b } # Apply a mask value to a 2D image using a logical operator that is selected at compile-time. @hl.generator(name = "logical_op_generator") class LogicalOpGenerator: op = hl.GeneratorParam("xor") with_offset = hl.GeneratorParam(False) input = hl.InputBuffer(hl.UInt(8), 2) mask = hl.InputScalar(hl.UInt(8)) output = hl.OutputBuffer(hl.UInt(8), 2) def configure(g): # If with_offset is specified, we if g.with_offset: g.add_input("offset", hl.InputScalar(hl.Int(32))) # See note the use of 'g' instead of 'self' here def generate(g): # Algorithm operator = _operators[g.op] if hasattr(g, "offset"): g.output[x, y] = operator(g.input[x, y], g.mask) + g.offset else: g.output[x, y] = operator(g.input[x, y], g.mask) # Schedule v = g.natural_vector_size(hl.UInt(8)) g.output.vectorize(x, v) if __name__ == "__main__": hl.main() ``` The only thing you can (usefully) do from `configure()` is to call `add_input()` or `add_output()`, which accept only the appropriate `Input` or `Output` classes. The resulting value is stored as a member variable with the name specified (if there is already a member with the given name, an exception is thrown). #### Calling a Generator Directly Each Generator has a class method (injected by `@hl.generator`) that allows you to "call" the Generator like an ordinary function; this allows you to directly take the Halide IR produced by the Generator and do anything you want to with it. This can be especially useful when writing library code, as you can 'compose' more complex pipelines this way. This method is named `call()` and looks like this: ``` @classmethod def call(cls, *args, **kwargs): ... ``` It takes the inputs (specified either by-name or by-position in the usual Python way). It also allows for an optional by-name-only argument, `generator_params`, which is a simple Python dict that allows for overriding `GeneratorParam`s. It returns a tuple of the Output values. For the earlier example, usage might be something like: ``` import LogicalOpFilter x, y = hl.Var(), hl.Var() input_buf = hl.Buffer(hl.UInt(8), [2, 2]) mask_value = 0x7f # Inputs by-position func_out = LogicalOpFilter.call(input_buf, mask_value) # Inputs by-name func_out = LogicalOpFilter.call(mask=mask_value, input=input_buf) # Above again, but with generator_params func_out = LogicalOpFilter.call(input_buf, mask_value, generator_params = {"op": "and"}) func_out = LogicalOpFilter.call(generator_params = {"op": and}, input=input_buf, mask=mask_value) ``` #### The Lifecycle Of A Generator Whether being driven by a build system (for AOT use) or by another piece of Python code (typically for JIT use), the lifecycle of a Generator looks something like this: - An instance of the Generator in question is created. 
It uses the currently-active `GeneratorContext` (which contains the `Target` to be used for code generation), which is stored in a thread-local stack. - Some (or all) of the default values of the `GeneratorParam` members may be replaced based on (e.g.) command-line arguments in the build system - All `GeneratorParam` members are made immutable. - The `configure()` method is called, allowing the Generator to use `add_input()` or `add_output()` to dynamically add inputs and/or outputs. - If any `Input` or `Output` members were defined with unspecified type or dimensions (e.g. `some_input = hl.InputBuffer(None, 3)`), those types and dimensions are filled in from `GeneratorParam` values (e.g. `some_input.type` in this case). If any types or dimensions are left unspecified after this step, an exception will be thrown. - If the Generator is being invoked via its `call()` method (see below), the default values for `Inputs` will be replaced by the values from the argument list. - The Generator instance has its `generate()` method called. - The calling code will extract the values of all `Output` values and validate that they match the type, dimensions, etc of the declarations. - The calling code will then either call `compile_to_file()` and friends (for AOT use), or return the output values to the caller (for JIT use). - Finally, the Generator instance will be discarded, never to be used again. Note that almost all of the code doing the hand-wavy bits above is injected by the `@hl.generator` decorator – the Generator author doesn't need to know or care about the specific details, only that they happen. All Halide Generators are **single-use** instances – that is, any given Generator instance should be used at most once. If a Generator is to be executed multiple times (e.g. for different `GeneratorParam` values, or a different `Target`), a new one must be constructed each time. #### Notable Differences Between C++ and Python Generators If you have written C++ Generators in Halide in the past, you might notice some features are missing and/or different for Python Generators. Among the differences are: - In C++, you can create a Generator, then call `set_generatorparam_value()` to alter the values of GeneratorParams. In Python, there is no public method to alter a GeneratorParam after the Generator is created; instead, you must pass a dict of GeneratorParam values to the constructor, after which the values are immutable for that Generator instance. - Array Inputs/Outputs: in our experience, they are pretty rarely used, it complicates the implementation in nontrivial ways, and the majority of use cases for them can all be reasonably supported by dynamically adding inputs or outputs (and saving the results in a local array). - `Input` and `Output`: these were deliberately left out in order to simplify Python Generators. It's possible that something similar might be added in the future. - GeneratorParams with LoopLevel types: these aren't useful without `Input`/`Output`. - GeneratorParams with Enum types: using a plain `str` type in Python is arguably just as easy, if not easier. - `get_externs_map()`: this allows registering ExternalCode objects to be appended to the Generator's code. In our experience, this feature is very rarely used. We will consider adding this in the future if necessary. 
- Lazy Binding of Unspecified Input/Output Types: for C++ Generators, if you left an Output's type (or dimensionality) unspecified, you didn't always have to specify a `GeneratorParam` to make it into a concrete type: if the type was always fully specified by the contents of the `generate()` method, that was good enough. In Python Generators, by contrast, **all** types and dimensions must be **explicitly** specified by either code declaration or by `GeneratorParam` setting. This simplifies the internal code in nontrivial ways, and also allows for (arguably) more readable code, since there are no longer cases that require the reader to execute the code in their head in order to deduce the output types. ## Keeping Up To Date If you use the Halide Bindings for Python inside Google, you are *strongly* encouraged to [subscribe to announcements for new releases of Halide](https://github.blog/changelog/2018-11-27-watch-releases/), as it is likely that enhancements and tweaks to our Python support will be made in future releases. ## License The Python bindings use the same [MIT license](https://github.com/halide/Halide/blob/main/LICENSE.txt) as Halide. Python bindings provided by Connelly Barnes (2012-2013), Fred Rotbart (2014), Rodrigo Benenson (2015) and the Halide open-source community. Halide-17.0.1/README_rungen.md000066400000000000000000000301021456515664200156350ustar00rootroot00000000000000# Running and Benchmarking Halide Generators ## Overview `RunGen` is a simple(ish) wrapper that allows an arbitrary Generator to be built into a single executable that can be run directly from bash, without needing to wrap it in your own custom main() driver. It also implements a rudimentary benchmarking and memory-usage functionality. If you use the standard CMake rules for Generators, you get RunGen functionality automatically. (If you use Make, you might need to add an extra rule or two to your Makefile; all the examples in `apps/` already have these rules.) For every `halide_library` (or `halide_library_from_generator`) rule, there is an implicit `name.rungen` rule that generates an executable that wraps the Generator library: ``` # In addition to defining a static library named "local_laplacian", this rule # also implicitly defines an executable target named "local_laplacian.rungen" halide_library( local_laplacian SRCS local_laplacian_generator.cc ) ``` You can build and run this like any other executable: ``` $ make bin/local_laplacian.rungen && ./bin/local_laplacian.rungen Usage: local_laplacian.rungen argument=value [argument=value... ] [flags] ...typical "usage" text... ``` To be useful, you need to pass in values for the Generator's inputs (and locations for the output(s)) on the command line, of course. You can use the `--describe` flag to see the names and expected types: ``` # ('make bin/local_laplacian.rungen && ' prefix omitted henceforth for clarity) $ ./bin/local_laplacian.rungen --describe Filter name: "local_laplacian" Input "input" is of type Buffer with 3 dimensions Input "levels" is of type int32 Input "alpha" is of type float32 Input "beta" is of type float32 Output "local_laplacian" is of type Buffer with 3 dimensions ``` Warning: Outputs may have `$X` (where `X` is a small integer) appended to their names in some cases (or, in the case of Generators that don't explicitly declare outputs via `Output<>`, an autogenerated name of the form `fX`). If this happens, don't forget to escape the `$` with a backslash as necessary. 
These are both bugs we intend to fix; see https://github.com/halide/Halide/issues/2194 As a convenience, there is also an implicit target that builds-and-runs, named simply "NAME.run": ``` # This is equivalent to "make bin/local_laplacian.rungen && ./bin/local_laplacian.rungen" $ make bin/local_laplacian.run Usage: local_laplacian.rungen argument=value [argument=value... ] [flags] # To pass arguments to local_laplacian.rungen, set the RUNARGS var: $ make bin/local_laplacian.run RUNARGS=--describe Filter name: "local_laplacian" Input "input" is of type Buffer with 3 dimensions Input "levels" is of type int32 Input "alpha" is of type float32 Input "beta" is of type float32 Output "local_laplacian" is of type Buffer with 3 dimensions ``` Inputs are specified as `name=value` pairs, in any order. Scalar inputs are specified the typical text form, while buffer inputs (and outputs) are specified via paths to image files. RunGen currently can read/write image files in any format supported by halide_image_io.h; at this time, that means .png, .jpg, .ppm, .pgm, and .tmp formats. (We plan to add .tiff and .mat (level 5) in the future.) ``` $ ./bin/local_laplacian.rungen input=../images/rgb_small16.png levels=8 alpha=1 beta=1 output=/tmp/out.png $ display /tmp/out.png ``` You can also specify any scalar input as `default` or `estimate`, which will use the default value specified for the input, or the value specified by `set_estimate` for that input. (If the relevant value isn't set for that input, a runtime error occurs.) ``` $ ./bin/local_laplacian.rungen input=../images/rgb_small16.png levels=8 alpha=estimate beta=default output=/tmp/out.png $ display /tmp/out.png ``` If you specify an input or output file format that doesn't match the required type/dimensions for an argument (e.g., using an 8-bit PNG for an Input, or a grayscale image for a 3-dimensional input), RunGen will try to coerce the inputs to something sensible; that said, it's hard to always get this right, so warnings are **always** issued whenever an input or output is modified in any way. ``` # This filter expects a 16-bit RGB image as input, but we're giving it an 8-bit grayscale image: $ ./bin/local_laplacian.rungen input=../images/gray.png levels=8 alpha=1 beta=1 output=/tmp/out.png Warning: Image for Input "input" has 2 dimensions, but this argument requires at least 3 dimensions: adding dummy dimensions of extent 1. Warning: Image loaded for argument "input" is type uint8 but this argument expects type uint16; data loss may have occurred. ``` By default, we try to guess a suitable size for the output image(s), based mainly on the size of the input images (if any); you can also specify explicit output extents. (Note that output_extents are subject to constraints already imposed by the particular Generator's logic, so arbitrary values for --output_extents may produce runtime errors.) ``` # Constrain output extents to 100x200x3 $ ./bin/local_laplacian.rungen --output_extents=[100,200,3] input=../images/rgb_small16.png levels=8 alpha=1 beta=1 output=/tmp/out.png ``` Sometimes you don't care what the particular element values for an input are (e.g. 
for benchmarking), and you just want an image of a particular size; in that case, you can use the `zero:[]` pseudo-file; it infers the _type_ from the Generator, and inits every element to zero: ``` # Input is a 3-dimensional image with extent 123, 456, and 3 # (bluring an image of all zeroes isn't very interesting, of course) $ ./bin/local_laplacian.rungen --output_extents=[100,200,3] input=zero:[123,456,3] levels=8 alpha=1 beta=1 output=/tmp/out.png ``` You can also specify arbitrary (nonzero) constants: ``` # Input is a 3-dimensional image with extent 123, 456, and 3, # filled with a constant value of 42 $ ./bin/local_laplacian.rungen --output_extents=[100,200,3] input=constant:42:[123,456,3] levels=8 alpha=1 beta=1 output=/tmp/out.png ``` Similarly, you can create identity images where only the diagonal elements are 1-s (rest are 0-s) by invoking `identity:[]`. Diagonal elements are defined as those whose first two coordinates are equal. There's also a `random:SEED:[]` pseudo-file, which fills the image with uniform noise based on a specific random-number seed: ``` # Input is a 3-dimensional image with extent 123, 456, and 3 $ ./bin/local_laplacian.rungen --output_extents=[100,200,3] input=random:42:[123,456,3] levels=8 alpha=1 beta=1 output=/tmp/out.png ``` Instead of specifying an explicit set of extents for a pseudo-input, you can use the string `auto`, which will run a bounds query to choose a legal set of extents for that input given the known output extents. (This is only useful when used in conjunction with the `--output_extents` flag.) ``` $ ./bin/local_laplacian.rungen --output_extents=[100,200,3] input=zero:auto levels=8 alpha=1 beta=1 output=/tmp/out.png ``` You can also specify `estimate` for the extents, which will use the estimate values provided, typically (but not necessarily) for auto_schedule. (If there aren't estimates for all of the buffer's dimensions, a runtime error occurs.) ``` $ ./bin/local_laplacian.rungen --output_extents=[100,200,3] input=zero:auto levels=8 alpha=1 beta=1 output=/tmp/out.png ``` You can combine the two and specify `estimate_then_auto` for the extents, which will attempt to use the estimate values; if a given input buffer has no estimates, it will fall back to the bounds-query result for that input: ``` $ ./bin/local_laplacian.rungen --output_extents=[100,200,3] input=zero:estimate_then_auto levels=8 alpha=1 beta=1 output=/tmp/out.png ``` Similarly, you can use `estimate` for `--output_extents`, which will use the estimate values for each output. (If there aren't estimates for all of the outputs, a runtime error occurs.) ``` $ ./bin/local_laplacian.rungen --output_extents=estimate input=zero:auto levels=8 alpha=1 beta=1 output=/tmp/out.png ``` If you don't want to explicitly specify all (or any!) of the input values, you can use the `--default_input_buffers` and `--default_input_scalars` flags, which provide wildcards for any omitted inputs: ``` $ ./bin/local_laplacian.rungen --output_extents=[100,200,3] --default_input_buffers=random:0:auto --default_input_scalars=estimate output=/tmp/out.png ``` In this case, all input buffers will be sized according to bounds query, and filled with a random seed; all input scalars will be initialized to their declared default values. (If they have no declared default value, a zero of the appropriate type will be used.) Note: `--default_input_buffers` can produce surprising sizes! 
For instance, any input that uses `BoundaryConditions::repeat_edge` to wrap itself can legally be set to almost any size, so you may legitimately get an input with extent=1 in all dimensions; whether this is useful to you or not depends on the code. It's highly recommended you do testing with the `--verbose` flag (which will log the calculated sizes) to reality-check that you are getting what you expect, especially for benchmarking. A common case (especially for benchmarking) is to specify using estimates for all inputs and outputs; for this, you can specify `--estimate_all`, which is just a shortcut for `--default_input_buffers=estimate_then_auto --default_input_scalars=estimate --output_extents=estimate`. ## Benchmarking To run a benchmark, use the `--benchmarks=all` flag: ``` $ ./bin/local_laplacian.rungen --benchmarks=all input=zero:[1920,1080,3] levels=8 alpha=1 beta=1 --output_extents=[100,200,3] Benchmark for local_laplacian produces best case of 0.0494629 sec/iter, over 3 blocks of 10 iterations. Best output throughput is 39.9802 mpix/sec. ``` You can use `--default_input_buffers` and `--default_input_scalars` here as well: ``` $ ./bin/local_laplacian.rungen --benchmarks=all --default_input_buffers --default_input_scalars --output_extents=estimate Benchmark for local_laplacian produces best case of 0.0494629 sec/iter, over 3 blocks of 10 iterations. Best output throughput is 39.9802 mpix/sec. ``` Note: `halide_benchmark.h` is known to be inaccurate for GPU filters; see https://github.com/halide/Halide/issues/2278 ## Measuring Memory Usage To track memory usage, use the `--track_memory` flag, which measures the high-water-mark of CPU memory usage. ``` $ ./bin/local_laplacian.rungen --track_memory input=zero:[1920,1080,3] levels=8 alpha=1 beta=1 --output_extents=[100,200,3] Maximum Halide memory: 82688420 bytes for output of 1.97754 mpix. ``` Warning: `--track_memory` may degrade performance; don't combine it with `--benchmark` or expect meaningful timing measurements when using it. ## Using RunGen in Make To add support for RunGen to your Makefile, you need to add rules something like this (see `apps/support/Makefile.inc` for an example): ``` HALIDE_DISTRIB ?= /path/to/halide/distrib/folder $(BIN)/RunGenMain.o: $(HALIDE_DISTRIB)/tools/RunGenMain.cpp @mkdir -p $(@D) @$(CXX) -c $< $(CXXFLAGS) $(LIBPNG_CXX_FLAGS) $(LIBJPEG_CXX_FLAGS) -I$(BIN) -o $@ .PRECIOUS: $(BIN)/%.rungen $(BIN)/%.rungen: $(BIN)/%.a $(BIN)/%.registration.cpp $(BIN)/RunGenMain.o $(CXX) $(CXXFLAGS) $^ -o $@ $(LIBPNG_LIBS) $(LIBJPEG_LIBS) $(LDFLAGS) RUNARGS ?= $(BIN)/%.run: $(BIN)/%.rungen @$(CURDIR)/$< $(RUNARGS) ``` Note that the `%.registration.cpp` file is created by running a generator and specifying `registration` in the comma-separated list of files to emit; these are also generated by default if `-e` is not used on the generator command line. ## Known Issues & Caveats - If your Generator uses `define_extern()`, you must have all link-time dependencies declared properly via `FILTER_DEPS`; otherwise, you'll fail to link. - The code does its best to detect when inputs or outputs need to be chunky/interleaved (rather than planar), but in unusual cases it might guess wrong; if your Generator uses buffers with unusual stride setups, RunGen might fail at runtime. (If this happens, please file a bug!) - The code for deducing good output sizes is rudimentary and needs to be smartened; it will sometimes make bad decisions which will prevent the filter from executing. (If this happens, please file a bug!) 
Halide-17.0.1/README_vulkan.md000066400000000000000000000266041456515664200156530ustar00rootroot00000000000000# Vulkan Support for Halide Halide supports the Khronos Vulkan framework as a compute API backend for GPU-like devices, and compiles directly to a binary SPIR-V representation as part of its code generation before submitting it to the Vulkan API. Both JIT and AOT usage are supported via the `vulkan` target flag (e.g. `HL_JIT_TARGET=host-vulkan`). Vulkan support is actively under development, and considered *BETA* quality at this stage. Tests are passing, but performance tuning and user testing is needed to identify potential issues before rolling this into production. See [below](#current-status) for details. # Compiling Halide w/Vulkan Support You'll need to configure Halide and enable the cmake option TARGET_VULKAN (which is now ON by default). For example, on Linux & OSX: ``` % cmake -G Ninja -DTARGET_VULKAN=ON -DCMAKE_BUILD_TYPE=Release -DLLVM_DIR=$LLVM_ROOT/lib/cmake/llvm % cmake --build build --config Release ``` On Windows, you may need to specify the location of the Vulkan SDK if the paths aren't resolved by CMake automatically. For example (assuming the Vulkan SDK is installed in the default path): ``` C:\> cmake -G Ninja -DTARGET_VULKAN=ON -DCMAKE_BUILD_TYPE=Release -DLLVM_DIR=$LLVM_ROOT/lib/cmake/llvm -DVulkan_LIBRARY=C:\VulkanSDK\1.3.231.1\Lib\vulkan-1.lib -DVulkan_INCLUDE_DIR=C:\VulkanSDK\1.3.231.1\Include\vulkan -S . -B build C:\> cmake --build build --config Release ``` # Vulkan Runtime Environment: Halide has no direct dependency on Vulkan for code-generation, but the runtime requires a working Vulkan environment to run Halide generated code. Any valid Vulkan v1.0+ device driver should work. Specifically, you'll need: - A vendor specific Vulkan device driver - The generic Vulkan loader library For AMD & NVIDIA & Intel devices, download and install the latest graphics driver for your platform. Vulkan support should be included. ## Windows To build Halide AOT generators, you'll need the Vulkan SDK (specifically the Vulkan loader library and headers): https://sdk.lunarg.com/sdk/download/latest/windows/vulkan-sdk.exe For Vulkan device drivers, consult the appropriate hardware vendor for your device. A few common ones are listed below. - [AMD Vulkan Driver](https://www.amd.com/en/technologies/vulkan) - [NVIDIA Vulkan Driver](https://developer.nvidia.com/vulkan-driver) - [INTEL Vulkan Driver](https://www.intel.com/content/www/us/en/download-center/home.html) ## Linux On Ubuntu Linux v22.04, the vulkan runtime is distributed in the `vulkan-tools` package. For earlier versions of Ubuntu (e.g. v20.x or v18.x) the contents of the `vulkan-tools` package was distributed as `vulkan-utils` so use that package instead. Proprietary drivers can be installed via 'apt' using PPA's for each vendor. Examples for AMD and NVIDIA are provided below. For AMD on Ubuntu v22.04: ``` $ sudo add-apt-repository ppa:oibaf/graphics-drivers $ sudo apt update $ sudo apt upgrade $ sudo apt install libvulkan1 mesa-vulkan-drivers vulkan-tools ``` For NVIDIA on Ubuntu v22.04: ``` $ sudo add-apt-repository ppa:graphics-drivers/ppa $ sudo apt update $ sudo apt upgrade # - replace ### with latest driver release (e.g. 515) $ sudo apt install nvidia-driver-### nvidia-settings vulkan vulkan-tools ``` Note that only valid drivers for your system should be installed since there are reports of the Vulkan loader segfaulting just by having a non-supported driver present. 
Specifically, the seemingly generic `mesa-vulkan-drivers` actually includes the AMD graphics driver, which can cause problems if installed on an NVIDIA-only system. ## Mac You're better off using Halide's Metal backend instead, but it is possible to run Vulkan apps on a Mac via the MoltenVK library: - [MoltenVK Project](https://github.com/KhronosGroup/MoltenVK) The easiest way to get the necessary dependencies is to use the official MoltenVK SDK installer provided by LunarG: - [MoltenVK SDK (Latest Release)](https://sdk.lunarg.com/sdk/download/latest/mac/vulkan-sdk.dmg) Alternatively, if you have the [Homebrew](https://brew.sh/) package manager installed for MacOS, you can use it to install the Vulkan Loader and MoltenVK compatibility layer: ``` $ brew install vulkan-loader molten-vk ``` # Testing Your Vulkan Environment You can validate that everything is configured correctly by running the `vulkaninfo` app (bundled in the vulkan-utils package) to make sure your device is detected (eg): ``` $ vulkaninfo ========== VULKANINFO ========== Vulkan Instance Version: 1.3.224 Instance Extensions: count = 19 =============================== ... Layers: count = 10 ================== VK_LAYER_KHRONOS_profiles (Khronos Profiles layer) Vulkan version 1.3.224, layer version 1: Layer Extensions: count = 0 Devices: count = 1 GPU id = 0 (NVIDIA GeForce RTX 3070 Ti) Layer-Device Extensions: count = 1 ... ``` Make sure everything looks correct before continuing! # Targetting Vulkan To generate Halide code for Vulkan, simply add the `vulkan` flag to your target as well as any other optional device specific features you wish to enable for Halide: | Target Feature | Description | | -- | -- | | `vulkan` | Enables the vulkan backend | | `vk_int8` | Allows 8-bit integer storage types to be used | | `vk_int16` | Allows 16-bit integer storage types to be used | | `vk_int64` | Allows 64-bit integer storage types to be used | | `vk_float16` | Allows 16-bit floating-point values to be used for computation | | `vk_float64` | Allows 64-bit floating-point values to be used for computation | | `vk_v10` | Generates code compatible with the Vulkan v1.0+ API | | `vk_v12` | Generates code compatible with the Vulkan v1.2+ API | | `vk_v13` | Generates code compatible with the Vulkan v1.3+ API | Note that 32-bit integer and floating-point types are always available. All other optional device features are off by default (since they are not required by the Vulkan API, and thus must be explicitly enabled to ensure that the code being generated will be compatible with the device and API version being used for execution). For AOT generators add `vulkan` (and any other flags you wish to use) to the target command line option: ``` $ ./lesson_15_generate -g my_first_generator -o . target=host-vulkan-vk_int8-vk_int16 ``` For JIT apps use the `HL_JIT_TARGET` environment variable: ``` $ HL_JIT_TARGET=host-vulkan-vk_int8-vk_int16 ./tutorial/lesson_01_basics ``` # Useful Runtime Environment Variables To modify the default behavior of the runtime, the following environment variables can be used to adjust the configuration of the Vulkan backend at execution time: `HL_VK_LAYERS=...` will tell Halide to choose a suitable Vulkan instance that supports the given list of layers. If not set, `VK_INSTANCE_LAYERS=...` will be used instead. If neither are present, Halide will use the first Vulkan compute device it can find. 
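For example, a minimal sketch (the pipeline binary name is hypothetical, and the layer name is only an illustration; any layer reported by `vulkaninfo` can be requested): ``` $ HL_VK_LAYERS=VK_LAYER_KHRONOS_validation ./bin/my_vulkan_pipeline ```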
Multiple layers can be specified using the appropriate environment variable list delimiter (`:` on Linux/OSX/Posix, or `;` on Windows). `HL_VK_DEVICE_TYPE=...` will tell Halide to choose which type of device to select for creating the Vulkan instance. Valid options are 'gpu', 'discrete-gpu', 'integrated-gpu', 'virtual-gpu', or 'cpu'. If not set, Halide will search for the first 'gpu'-like device it can find, or fall back to the first compute device it can find. `HL_VK_ALLOC_CONFIG=...` will tell Halide to configure the Vulkan memory allocator to use the given constraints, specified as five integer values separated by the appropriate environment variable list delimiter (e.g. `N:N:N:N:N` on Linux/OSX/Posix, or `N;N;N;N;N` on Windows). These values correspond to `maximum_pool_size`, `minimum_block_size`, `maximum_block_size`, `maximum_block_count` and `nearest_multiple`. The `maximum_pool_size` constraint will tell Halide to configure the Vulkan memory allocator to never request more than N megabytes for the entire pool of allocations for the context. This includes all resource blocks used for suballocations. Setting this to a non-zero value will limit the amount of device memory used by Halide, which may be useful when other applications and frameworks are competing for resources. Default is 0 ... meaning no limit. The `minimum_block_size` constraint will tell Halide to configure the Vulkan memory allocator to always request a minimum of N megabytes for a resource block, which will be used as a pool for suballocations. Increasing this value may improve performance while sacrificing the amount of available device memory. Default is 32MB. The `maximum_block_size` constraint will tell Halide to configure the Vulkan memory allocator to never exceed a maximum of N megabytes for a resource block. Decreasing this value may free up more memory but may impact performance, and/or restrict allocations to be unusably small. Default is 0 ... meaning no limit. The `maximum_block_count` constraint will tell Halide to configure the Vulkan memory allocator to never exceed a total of N block allocations. Decreasing this value may free up more memory but may impact performance, and/or restrict allocations. Default is 0 ... meaning no limit. The `nearest_multiple` constraint will tell Halide to configure the Vulkan memory allocator to always round up the requested allocation sizes to the given integer value. This is useful for architectures that require specific alignments for subregions allocated within a block. Default is 32 ... setting this to zero means no constraint. # Debug Environment Variables The following environment variables may be useful for tracking down potential issues related to Vulkan: `HL_DEBUG_CODEGEN=3` will print out debug info that includes the SPIR-V code generator used for Vulkan while it is compiling. `HL_SPIRV_DUMP_FILE=...` specifies a file to dump the binary SPIR-V generated during compilation. Useful for debugging CodeGen issues. Can be inspected, validated and disassembled via the SPIR-V tools: https://github.com/KhronosGroup/SPIRV-Tools # Current Status All correctness tests are now passing on tested configs for Linux & Windows using the target `host-vulkan-vk_int8-vk_int16-vk_int64-vk_float16-vk_float64-vk_v13` on LLVM v14.x. MacOS passes most tests but encounters internal MoltenVK code translation issues for wide vectors, and ambiguous function calls. 
Python apps, tutorials and correctness tests are now passing, but the AOT cases are skipped since the runtime environment needs to be customized to locate the platform specific Vulkan loader library. Android platform support is currently being worked on. # Caveats: - Other than 32-bit floats and integers, every other data type is optional per the Vulkan spec - Float 64-bit types can be enabled, but there aren't any native math functions available in SPIR-V - Only one dynamically sized shared memory allocation can be used, but any number of fixed sized allocation are supported (up to the maximum amount allowed by the device) # Known TODO: - Performance tuning of CodeGen and Runtime - More platform support (Android is work-in-progress, RISC-V, etc) - Adapt unsupported types to supported types (if missing vk_int8 then promote to uint32_t)? - Better debugging utilities using the Vulkan debug hooks. - Allow debug symbols to be stripped from SPIR-V during codegen to reduce memory overhead for large kernels. - Investigate floating point rounding and precision (v1.3 adds more controls) - Investigate memory model usage (can Halide gain anything from these?) Halide-17.0.1/README_webassembly.md000066400000000000000000000247111456515664200166650ustar00rootroot00000000000000# WebAssembly Support for Halide Halide supports WebAssembly (Wasm) code generation from Halide using the LLVM backend. As WebAssembly itself is still under active development, Halide's support has some limitations. Some of the most important: - Sign-extension operations are enabled by default (but can be avoided via Target::WasmMvpOnly). - Non-trapping float-to-int conversions are enabled by default (but can be avoided via Target::WasmMvpOnly). - Fixed-width SIMD (128 bit) can be enabled via Target::WasmSimd128. - Threads have very limited support via Target::WasmThreads; see [below](#using-threads) for more details. - Halide's JIT for Wasm is extremely limited and really useful only for internal testing purposes. # Additional Tooling Requirements: - In additional to the usual install of LLVM and clang, you'll need lld. - Locally-installed version of Emscripten, 1.39.19+ Note that for all of the above, earlier versions might work, but have not been tested. # AOT Limitations Halide outputs a Wasm object (.o) or static library (.a) file, much like any other architecture; to use it, of course, you must link it to suitable calling code. Additionally, you must link to something that provides an implementation of `libc`; as a practical matter, this means using the Emscripten tool to do your linking, as it provides the most complete such implementation we're aware of at this time. - Halide ahead-of-time tests assume/require that you have Emscripten installed and available on your system, with the `EMSDK` environment variable set properly. # JIT Limitations It's important to reiterate that the WebAssembly JIT mode is not (and will never be) appropriate for anything other than limited self tests, for a number of reasons: - It actually uses an interpreter (from the WABT toolkit [https://github.com/WebAssembly/wabt]) to execute wasm bytecode; not surprisingly, this can be *very* slow. - Wasm effectively runs in a private, 32-bit memory address space; while the host has access to that entire space, the reverse is not true, and thus any `define_extern` calls require copying all `halide_buffer_t` data across the Wasm<->host boundary in both directions. 
This has severe implications for existing benchmarks, which don't currently attempt to account for this extra overhead. (This could possibly be improved by modeling the Wasm JIT's buffer support as a `device` model that would allow lazy copy-on-demand.) - Host functions used via `define_extern` or `HalideExtern` cannot accept or return values that are pointer types or 64-bit integer types; this includes things like `const char *` and `user_context`. Fixing this is tractable, but is currently omitted as the fix is nontrivial and the tests that are affected are mostly non-critical. (Note that `halide_buffer_t*` is explicitly supported as a special case, however.) - Threading isn't supported at all (yet); all `parallel()` schedules will be run serially. - The `.async()` directive isn't supported at all, not even in serial-emulation mode. - You can't use `Param` (or any other arbitrary pointer type) with the Wasm jit. - You can't use `Func.debug_to_file()`, `Func.set_custom_do_par_for()`, `Func.set_custom_do_task()`, or `Func.set_custom_allocator()`. - The implementation of `malloc()` used by the JIT is incredibly simpleminded and unsuitable for anything other than the most basic of tests. - GPU usage (or any buffer usage that isn't 100% host-memory) isn't supported at all yet. (This should be doable, just omitted for now.) Note that while some of these limitations may be improved in the future, some are effectively intrinsic to the nature of this problem. Realistically, this JIT implementation is intended solely for running Halide self-tests (and even then, a number of them are fundamentally impractical to support in a hosted-Wasm environment and are disabled). In sum: don't plan on using Halide JIT mode with Wasm unless you are working on the Halide library itself. ## Using V8 as the interpreter There is experimental support for using V8 as the interpreter in JIT mode, rather than WABT. This is enabled by the CMake command line options `-DWITH_V8=ON -DWITH_WABT=OFF` (only one of them can be used at a time). You must build V8 locally V8, then specify the path to the library and headers as CMake options. This is currently only tested on x86-64-Linux and requires v8 version 9.8.177 as a minimum. The canonical instructions to build V8 are at [v8.dev](https://v8.dev/docs/build), and [there are examples for embedding v8](https://v8.dev/docs/embed). The process for Halide is summarized below. - Install [`depot_tools`](https://commondatastorage.googleapis.com/chrome-infra-docs/flat/depot_tools/docs/html/depot_tools_tutorial.html#_setting_up) - Fetch v8 source code (and install required dependencies): ``` $ gclient $ mkdir ~/v8 && cd ~/v8 $ fetch v8 $ cd ~/v8/v8 $ git checkout origin/9.8.177 ``` - Create a build configuration: `tools/dev/v8gen.py x64.release.sample` - Turn off pointer compression: `echo 'v8_enable_pointer_compression = false' >> out.gn/x64.release.sample/args.gn` - Disable the GDB-JIT interface (conflicts with LLVM): `echo 'v8_enable_gdbjit = false' >> out.gn/x64.release.sample/args.gn` - Build the static library: `autoninja -C out.gn/x64.release.sample v8_monolith` With V8 built, we can pass the CMake options: - `V8_INCLUDE_PATH`, path to V8 includes, e.g. `$HOME/v8/v8/include` - `V8_LIB_PATH`, path to V8 static library, e.g. 
`$HOME/v8/v8/out.gn/x64.release.sample/obj/libv8_monolith.a` An example to configure Halide with V8 support, build and run an example test: ``` $ cd /path/to/halide $ export HL_TARGET=wasm-32-wasmrt-wasm_simd128 $ export HL_JIT_TARGET=${HL_TARGET} $ cmake -G Ninja \ -DWITH_WABT=OFF \ -DWITH_V8=ON \ -DV8_INCLUDE_PATH=$HOME/v8/v8/include \ -DV8_LIB_PATH=$HOME/v8/v8/out.gn/x64.release.sample/obj/libv8_monolith.a \ -DHalide_TARGET=${HL_TARGET} \ /* other cmake settings here as appropriate */ $ cmake --build . $ ctest -L "correctness|generator" -j ``` # To Use Halide For WebAssembly: - Ensure WebAssembly is in LLVM_TARGETS_TO_BUILD; if you use the default (`"all"`) then it's already present, but otherwise, add it explicitly: ``` -DLLVM_TARGETS_TO_BUILD="X86;ARM;NVPTX;AArch64;PowerPC;Hexagon;WebAssembly" ``` ## Enabling wasm JIT If you want to run `test_correctness` and other interesting parts of the Halide test suite (and you almost certainly will), you'll need to ensure that LLVM is built with wasm-ld: - Ensure that you have lld in LLVM_ENABLE_PROJECTS: ``` cmake -DLLVM_ENABLE_PROJECTS="clang;lld" ... ``` - To run the JIT tests, set `HL_JIT_TARGET=wasm-32-wasmrt` (possibly adding `wasm_simd128`) and run CMake/CTest normally. Note that wasm testing is only supported under CMake (not via Make). ## Enabling wasm AOT If you want to test ahead-of-time code generation (and you almost certainly will), you need to install Emscripten locally. - The simplest way to install is probably via the Emscripten emsdk (https://emscripten.org/docs/getting_started/downloads.html). - To run the AOT tests, set `HL_TARGET=wasm-32-wasmrt` (possibly adding `wasm_simd128`) and run CMake/CTest normally. Note that wasm testing is only supported under CMake (not via Make). # Running benchmarks The `test_performance` benchmarks are misleading (and thus useless) for Wasm, as they include JIT overhead as described elsewhere. Suitable benchmarks for Wasm will be provided at a later date. (See https://github.com/halide/Halide/issues/5119 and https://github.com/halide/Halide/issues/5047 to track progress.) # Using Threads You can use the `wasm_threads` feature to enable use of a normal pthread-based thread pool in Halide code, but with some important caveats: - This requires that you use a wasm runtime environment that provides pthread-compatible wrappers. At the time of this writing, the only environment known to support this well is Emscripten (when using the `-pthread` flag, and compiling for a Web environment). In this configuration, Emscripten goes to great lengths to make WebWorkers available via the pthreads API. (You can see an example of this usage in apps/HelloWasm.) Note that not all wasm runtimes support WebWorkers; generally, you need a full browser environment to make this work (though some versions of some shell tools may also support this, e.g. nodejs). - There is currently no support for using threads in a WASI environment, due to current limitations in the WASI specification. (We hope that this will improve in the future.) - There is no support for using threads in the Halide JIT environment, and no plans to add them anytime in the near-term future. # Known Limitations And Caveats - Current trunk LLVM (as of July 2020) doesn't reliably generate all of the Wasm SIMD ops that are available; see https://github.com/halide/Halide/issues/5130 for tracking information as these are fixed. - Using the JIT requires that we link the `wasm-ld` tool into libHalide; with some work this need could possibly be eliminated. 
- OSX and Linux-x64 have been tested. Windows hasn't; it should be supportable with some work. (Patches welcome.) - None of the `apps/` folder has been investigated yet. Many of them should be supportable with some work. (Patches welcome.) - We currently use v8/d8 as a test environment for AOT code; we may want to consider using Node or (better yet) headless Chrome instead (which is probably required to allow for using threads in AOT code). # Known TODO: - There's some invasive hackiness in Codgen_LLVM to support the JIT trampolines; this really should be refactored to be less hacky. - Can we rework JIT to avoid the need to link in wasm-ld? This might be doable, as the wasm object files produced by the LLVM backend are close enough to an executable form that we could likely make it work with some massaging on our side, but it's not clear whether this would be a bad idea or not (i.e., would it be unreasonably fragile). - Buffer-copying overhead in the JIT could possibly be dramatically improved by modeling the copy as a "device" (i.e. `copy_to_device()` would copy from host -> wasm); this would make the performance benchmarks much more useful. - Can we support threads in the JIT without an unreasonable amount of work? Unknown at this point. Halide-17.0.1/README_webgpu.md000066400000000000000000000123211456515664200156330ustar00rootroot00000000000000# WebGPU support for Halide Halide has work-in-progress support for generating and running WebGPU shaders. This can be used in conjunction with the WebAssembly backend to bring GPU-accelerated Halide pipelines to the web. As the first version of the WebGPU standard is itself still being developed, Halide's support has some limitations and may only work with certain browsers and versions of Emscripten. ## Known limitations The following is a non-comprehensive list of known limitations: - Only 32-bit integers and floats have efficient support. * 8-bit and 16-bit integers are implemented using emulation. Future extensions to WGSL will allow them to be implemented more efficiently. * 64-bit integers and floats will likely remain unsupported until WGSL gains extensions to support them. - Wrapping native device buffer handles is not yet implemented. - You must use CMake/CTest to build/test Halide for WebGPU; using the Makefile is not supported for WebGPU testing (and probably never will be). In addition to these functional limitations, the performance of the WebGPU backend has not yet been evaluated, and so optimizations in the runtime or device codegen may be required before it becomes profitable to use. ## Running with WebAssembly via Emscripten: `HL_TARGET=wasm-32-wasmrt-webgpu` > _Tested with top-of-tree Emscripten as of 2023-02-23, against Chrome v113._ Halide can generate WebGPU code that can be integrated with WASM code using Emscripten. When invoking `emcc` to link Halide-generated objects, include these flags: `-s USE_WEBGPU=1 -s ASYNCIFY`. Tests that use AOT compilation can be run using a native WebGPU implementation that has Node.js bindings, such as [Dawn](https://dawn.googlesource.com/dawn/). You must set an environment variable named `HL_WEBGPU_NODE_BINDINGS` that has an absolute path to the bindings to run these tests, e.g. `HL_WEBGPU_NODE_BINDINGS=/path/to/dawn.node`. See [below](#setting-up-dawn) for instructions on building the Dawn Node.js bindings. JIT compilation is not supported when using WebGPU with WASM. 
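To make the workflow concrete, here is a rough sketch (paths, file names, and the runner module are hypothetical; the essential pieces are the Emscripten flags above and the `HL_WEBGPU_NODE_BINDINGS` variable, which Halide's CMake/CTest harness normally sets up for you): ``` # Link a Halide-generated object into a WASM module with Emscripten $ emcc my_pipeline.o run_my_pipeline.cpp -I ${HALIDE_ROOT}/include -s USE_WEBGPU=1 -s ASYNCIFY -o run_my_pipeline.js # Make Dawn's Node.js bindings available when running the result under Node.js $ HL_WEBGPU_NODE_BINDINGS=/path/to/dawn.node node run_my_pipeline.js ```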
## Running natively: `HL_TARGET=host-webgpu` > _Tested with top-of-tree Dawn as of 2023-11-27 [commit b5d38fc7dc2a20081312c95e379c4a918df8b7d4]._ For testing purposes, Halide can also target native WebGPU libraries, such as [Dawn](https://dawn.googlesource.com/dawn/) or [wgpu](https://github.com/gfx-rs/wgpu). This is currently the only path that can run the JIT correctness tests. See [below](#setting-up-dawn) for instructions on building Dawn. > Note that as of 2023-11-27, wgpu is not supported due to > [lacking `override` support for WGSL](https://github.com/gfx-rs/wgpu/issues/1762) > which we require > in order to set GPU block sizes. When targeting WebGPU with a native target, Halide defaults to looking for a build of Dawn (with several common names and suffixes); you can override this by setting the `HL_WEBGPU_NATIVE_LIB` environment variable to the absolute path to the library you want. Note that it is explicitly legal to define both `HL_WEBGPU_NATIVE_LIB` and `HL_WEBGPU_NODE_BINDINGS` at the same time; the correct executable environment will be selected based on the Halide target specified. ## Setting up Dawn Building Dawn's Node.js bindings currently requires using CMake. First, [install `depot_tools`](https://commondatastorage.googleapis.com/chrome-infra-docs/flat/depot_tools/docs/html/depot_tools_tutorial.html#_setting_up) and add it to the `PATH` environment variable. Next, get Dawn and its dependencies: # Clone the repo git clone https://dawn.googlesource.com/dawn cd dawn # Bootstrap the gclient configuration with Node.js bindings enabled cp scripts/standalone-with-node.gclient .gclient # Fetch external dependencies and toolchains with gclient gclient sync # Other dependencies that must be installed manually: # - golang Finally, build Dawn, enabling both the Node.js bindings and shared libraries: mkdir -p <build_dir> cd <build_dir> cmake -G Ninja \ -DCMAKE_BUILD_TYPE=Release \ -DDAWN_BUILD_NODE_BINDINGS=1 \ -DDAWN_ENABLE_PIC=1 \ -DBUILD_SHARED_LIBS=ON ninja dawn.node webgpu_dawn This will produce the following artifacts: - Node.js bindings: `<build_dir>/dawn.node` - Native library: `<build_dir>/src/dawn/native/libwebgpu_dawn.{so,dylib,dll}` These paths can then be used for the `HL_WEBGPU_NODE_BINDINGS` and `HL_WEBGPU_NATIVE_LIB` environment variables when using Halide. ## Updating mini_webgpu.h The recommended method for updating `mini_webgpu.h` is to copy the `gen/include/dawn/webgpu.h` file from the Dawn build directory, then: - Restore the `// clang-format {off,on}` lines. - Comment out the `#include` lines. - Remove the `void` parameter from the `WGPUProc` declaration. This guarantees a version of the WebGPU header that is compatible with Dawn. When the native API eventually stabilizes, it should be possible to obtain a header from the `webgpu-native` GitHub organization that will be compatible with Dawn, wgpu, and Emscripten. Halide-17.0.1/apps/000077500000000000000000000000001456515664200137475ustar00rootroot00000000000000Halide-17.0.1/apps/CMakeLists.txt000066400000000000000000000037001456515664200165070ustar00rootroot00000000000000## # Test apps from the perspective of a consuming project. 
## cmake_minimum_required(VERSION 3.22) project(Halide_apps) enable_testing() if (WIN32) option(ENABLE_APPS_HANNK "Build apps/hannk" OFF) else () option(ENABLE_APPS_HANNK "Build apps/hannk" ON) endif () function(add_app app_name) string(TOUPPER "ENABLE_APPS_${app_name}" opt) option(${opt} "Build apps/${app_name}" ON) if (${opt}) add_subdirectory(${app_name}) endif () endfunction() # TODO: most of the apps need to be smartened to be crosscompilable under wasm. message(STATUS "Halide_TARGET ${Halide_TARGET}") if (Halide_TARGET MATCHES "wasm") message(WARNING "Skipping apps when building under wasm") return() endif() # add_app(HelloAndroid) # TODO(#5374): missing CMake build # add_app(HelloAndroidCamera2) # TODO(#5374): missing CMake build # add_app(HelloPyTorch) # TODO(#5374): missing CMake build # add_app(HelloiOS) # TODO(#5374): missing CMake build # add_app(auto_viz) # TODO(#5374): missing CMake build add_app(bgu) add_app(bilateral_grid) add_app(blur) add_app(c_backend) add_app(camera_pipe) add_app(compositing) add_app(conv_layer) add_app(cuda_mat_mul) add_app(depthwise_separable_conv) add_app(fft) add_app(hannk) add_app(harris) # add_app(hexagon_benchmarks) # TODO(#5374): missing CMake build # add_app(hexagon_dma) # TODO(#5374): missing CMake build add_app(hist) add_app(iir_blur) add_app(interpolate) add_app(lens_blur) add_app(linear_algebra) # add_app(linear_blur) # TODO(#5374): missing CMake build add_app(local_laplacian) add_app(max_filter) add_app(nl_means) # add_app(nn_ops) # TODO(#5374): missing CMake build # add_app(onnx) # TODO(#5374): missing CMake build # add_app(openglcompute) # TODO(#5374): missing CMake build add_app(resize) # add_app(resnet_50) # TODO(#5374): missing CMake build # add_app(simd_op_check) # TODO(#5374): missing CMake build add_app(stencil_chain) add_app(unsharp) add_app(wavelet) add_app(HelloBaremetal) Halide-17.0.1/apps/CMakePresets.json000066400000000000000000000047401456515664200171750ustar00rootroot00000000000000{ "version": 3, "cmakeMinimumRequired": { "major": 3, "minor": 22, "patch": 0 }, "configurePresets": [ { "name": "base", "hidden": true, "binaryDir": "${sourceDir}/../build/apps/${presetName}" }, { "name": "ci", "hidden": true, "inherits": "base", "toolchainFile": "${sourceDir}/../cmake/toolchain.${presetName}.cmake", "cacheVariables": { "CMAKE_BUILD_TYPE": "RelWithDebInfo" } }, { "name": "debug", "inherits": "base", "displayName": "Debug", "description": "Debug build with no special settings", "cacheVariables": { "CMAKE_BUILD_TYPE": "Debug" } }, { "name": "release", "inherits": "base", "displayName": "Release", "description": "Release build with no special settings", "cacheVariables": { "CMAKE_BUILD_TYPE": "Release" } }, { "name": "linux-x64-asan", "inherits": "ci", "displayName": "ASAN (Linux x64)", "description": "Build everything with ASAN enabled", "cacheVariables": { "LLVM_ROOT": "$penv{LLVM_ROOT}" } } ], "buildPresets": [ { "name": "debug", "configurePreset": "debug", "displayName": "Debug", "description": "Debug build with no special settings" }, { "name": "release", "configurePreset": "release", "displayName": "Release", "description": "Release build with no special settings" }, { "name": "linux-x64-asan", "configurePreset": "linux-x64-asan", "displayName": "ASAN (Linux x64)", "description": "Build everything with ASAN enabled" } ], "testPresets": [ { "name": "debug", "configurePreset": "debug", "displayName": "Debug", "description": "Test everything with Debug build", "output": { "outputOnFailure": true } }, { "name": "release", 
"configurePreset": "release", "displayName": "Release", "description": "Test everything with Release build", "output": { "outputOnFailure": true } }, { "name": "linux-x64-asan", "configurePreset": "linux-x64-asan", "displayName": "ASAN (Linux x64)", "description": "Test everything with ASAN enabled", "environment": { "ASAN_OPTIONS": "detect_leaks=0:detect_container_overflow=0" }, "output": { "outputOnFailure": true } } ] } Halide-17.0.1/apps/HelloAndroid/000077500000000000000000000000001456515664200163135ustar00rootroot00000000000000Halide-17.0.1/apps/HelloAndroid/.gitignore000066400000000000000000000001631456515664200203030ustar00rootroot00000000000000.gradle/** gen/** gradle_build/** HelloAndroid.iml local.properties obj/** proguard-project.txt project.properties Halide-17.0.1/apps/HelloAndroid/AndroidManifest.xml000066400000000000000000000015371456515664200221120ustar00rootroot00000000000000 Halide-17.0.1/apps/HelloAndroid/README.md000066400000000000000000000056301456515664200175760ustar00rootroot00000000000000HelloHalide is a simple application which applies a tone curve and sharpening to a video preview from the camera on a phone or tablet. This application builds for multiple native ABIs. (At present armeabi, armeabi-v7a, arm64-v8a, x86_64, and x86 are supported.) Halide code is generated for each architecture. This build is meant to use Android command line tools. (An IDE is not required.) In order to build, the following will be required: - Android NDK -- This can be downloaded here: https://developer.android.com/tools/sdk/ndk/index.html After installing, make sure the top-level directory of the install is in the PATH. (It should contain an executable ndk-build file.) - Android SDK -- This can be downloaded here: http://developer.android.com/sdk/index.html The standalone SDK is desired. Once downloaded, the "android" program in the tools directory of the install will need to be run. It should bring up a UI allowing one to choose components to install. HelloAndroid currently depends on the android-17 release. (It can easily be made to run on others, but that is what the scripts are setup to build against.) Make sure the tools directory is on one's PATH. - Apache Ant -- which can be downloaded here: http://ant.apache.org/bindownload.cgi make sure the bin directory is on one's PATH. If everything is setup correctly, running the build.sh script in this directory, with the current directory set to here, whould build the HelloAndroid apk and install it on a connected Android device. # Gradle To use Gradle create local.properties file in this folder with sdk.dir and ndk.dir variables defined like so: ``` sdk.dir=/Users/joe/Downloads/android-sdk ndk.dir=/Users/joe/Downloads/android-ndk ``` After that run `gradlew build` which will produce .apk file ready for deployment to the Android device. On Linux/Mac you can use `build-gradle.sh` to build, deploy and run this sample application. Pay attention to the list of platforms supported by your Halide installation. They are listed in jni/Application.mk APP_ABI variable and in build.gradle archs map. For example, if your Halide installation was built without arm64-v8a, remove it from APP_ABI and archs. Both list and map should match, otherwise you will be getting compilation errors complaining about a missing hello.h file: ``` :compileDebugNdkClassic FAILED FAILURE: Build failed with an exception. * What went wrong: Execution failed for task ':compileDebugNdkClassic'. ... 
Output: /private/tmp/7/halide/apps/HelloAndroid/jni/native.cpp:9:30: fatal error: hello.h: No such file or directory #include "hello.h" ``` # Android Studio To load project into Android Studio use "File/Import Project..." in Android Studio and point to apps/HelloAndroid/build.gradle file. You will have to edit automatically-generated local.properties file to add ndk.dir property so it points to your Android NDK installation as described in Gradle section above. Halide-17.0.1/apps/HelloAndroid/ant.properties000066400000000000000000000012721456515664200212150ustar00rootroot00000000000000# This file is used to override default values used by the Ant build system. # # This file must be checked into Version Control Systems, as it is # integral to the build system of your project. # This file is only used by the Ant script. # You can use this to override default values such as # 'source.dir' for the location of your java source folder and # 'out.dir' for the location of your output folder. # You can also use it define how the release builds are signed by declaring # the following properties: # 'key.store' for the location of your keystore and # 'key.alias' for the name of the key to use. # The password will be asked during the build when you use the 'release' target. Halide-17.0.1/apps/HelloAndroid/build-gradle.sh000077500000000000000000000007611456515664200212110ustar00rootroot00000000000000#!/bin/bash # Gradle needs to know where the NDK is. # The easiest way is to set the ANDROID_NDK_HOME environment variable. # Otherwise, set ndk.dir in local.properties (even though the file itself says # that it's only used by ant). # However, if you run "android update" (say, via build.sh), this variable will # be clobbered. ./gradlew build && adb install -r gradle_build/outputs/apk/HelloAndroid-debug.apk && adb shell am start com.example.hellohalide/com.example.hellohalide.CameraActivity Halide-17.0.1/apps/HelloAndroid/build.gradle000066400000000000000000000126041456515664200205750ustar00rootroot00000000000000import org.apache.tools.ant.taskdefs.condition.Os // Avoid conflicts with Bazel on case-insensitive filesystems buildDir = 'gradle_build' repositories { jcenter() } buildscript { repositories { jcenter() } dependencies { classpath 'com.android.tools.build:gradle:1.2.2' } } //////////////////////////////////////////////////////////////////////////////// // Use gradle's native C++ plugin to build the Halide generator. // // sources: defines all the C++ source files. We only have one SourceSet called // hello_generator. // // executables: we only make one binary called hello_generator. Here is where // we pass compiler and linker flags. // // binaries.withType: binaries is a collection, which in our case is just the // hello_generator executable. withType() filters the collection by type. // binary is the iteration variable. -> defines the body of the lambda: // for each binary: // for each halide_target / Android ABI mapping: // for each generator: // run the generator with -g and target set // make the later ndkBuild task depend on this task. 
apply plugin: "cpp" sources { hello_generator { cpp(CppSourceSet) { source { srcDirs "jni/", "${projectDir}/../../tools/" include "hello_generator.cpp", "GenGen.cpp" } } } } executables { hello_generator { binaries.all { cppCompiler.args "-std=c++17", "-g", "-Wall", "-fno-rtti", "-I", "${projectDir}/../../include", "-I", "${projectDir}/../../build/include" // "/bin" assumes Makefile build for Halide; "/build/lib" assumes CMake build linker.args "-lHalide", "-ldl", "-lpthread", "-lz", "-L", "${projectDir}/../../bin", "-L", "${projectDir}/../../build/lib" } } } binaries.withType(NativeExecutableBinary) { binary -> def bin = "${projectDir}/bin" def linkTask = binary.tasks.link println "linktask output file is " + linkTask.outputFile Map archs = [ // armeabi and armeabi-v7a are the same as far as Halide is concerned "armeabi": "arm-32-android", "armeabi-v7a": "arm-32-android", "arm64-v8a": "arm-64-android", "x86_64": "x86-64-android-sse41", "x86": "x86-32-android" ] archs.each { arch -> println "creating task for: " + arch.key + " -> " + arch.value def android_abi = arch.key def hl_target = arch.value def task_name = "generate_halide_binary_${binary.name.capitalize()}_${android_abi}" def destDir = new File(bin, "${android_abi}") def generateHalideTask = task(task_name) { dependsOn linkTask doFirst { println "Executing: " + linkTask.outputFile + " ..." destDir.mkdirs() def envVars = [ "DYLD_LIBRARY_PATH=${projectDir}/../../bin", "LD_LIBRARY_PATH=${projectDir}/../../bin" ] def proc = [linkTask.outputFile, "-g", "hello", "-o", "${destDir}", "target=${hl_target}"].execute(envVars, destDir) proc.waitFor() if (proc.exitValue() != 0) { println "return code: ${proc.exitValue()}" println "stderr: ${proc.err.text}" println "stdout: ${proc.in.text}" } } } // Call this task generateHalideTask. binary.builtBy generateHalideTask // Tell gradle that the task called "ndkBuild" below depends // on generateHalideTask. ndkBuild.dependsOn generateHalideTask } println "done with archs" } //////////////////////////////////////////////////////////////////////////////// apply plugin: 'com.android.application' android { compileSdkVersion 21 buildToolsVersion "21.1.2" defaultConfig { applicationId "com.example.hellohalide" minSdkVersion 21 targetSdkVersion 21 versionCode 1 versionName "1.0" } compileOptions { sourceCompatibility JavaVersion.VERSION_1_7 targetCompatibility JavaVersion.VERSION_1_7 } sourceSets { main { java.srcDirs = ["src/"] // Setting jni.srcDirs to [] disables the automatic ndk-build call // which would use parameters defined in build.gradle. Use our own // task (ndkBuild) below. jni.srcDirs = [] jniLibs.srcDirs = ["bin/lib/"] // default is src/main/jniLibs manifest.srcFile "AndroidManifest.xml" res.srcDirs = ["res/"] // default is src/main/res } } // Call regular ndk-build (ndk-build.cmd on Windows) script from // app directory. 
task ndkBuild(type: Exec) { def ndkDir = project.android.ndkDirectory def ndkBuildCmd = "" if (Os.isFamily(Os.FAMILY_WINDOWS)) { ndkBuildCmd = "ndk-build.cmd" } else { ndkBuildCmd = "ndk-build" } commandLine "$ndkDir/$ndkBuildCmd", "NDK_GEN_OUT=./bin/gen", "NDK_LIBS_OUT=./bin/lib", "NDK_OUT=./bin/obj" } tasks.withType(JavaCompile) { compileTask -> compileTask.dependsOn ndkBuild } buildTypes { release { minifyEnabled false proguardFiles getDefaultProguardFile('proguard-android.txt'), 'proguard-rules.pro' } } } task wrapper(type: Wrapper) { gradleVersion = '2.2' } Halide-17.0.1/apps/HelloAndroid/build.sh000077500000000000000000000013711456515664200177530ustar00rootroot00000000000000#!/bin/bash set -e android update project -p . --target android-17 mkdir -p bin c++ jni/hello_generator.cpp ../../tools/GenGen.cpp \ -g -fno-rtti -Wall -std=c++17 \ -I ../../include -I ../../build/include \ -L ../../bin -lHalide -ldl -lpthread -lz \ -o bin/hello_generator for archs in arm-32-android,armeabi arm-32-android-armv7s,armeabi-v7a arm-64-android,arm64-v8a x86-64-android-sse41,x86_64 x86-32-android,x86 ; do IFS=, set $archs HL_TARGET=$1 ANDROID_ABI=$2 mkdir -p bin/$ANDROID_ABI ./bin/hello_generator -g hello -o bin/$ANDROID_ABI target=$HL_TARGET unset IFS done pwd ndk-build NDK_GEN_OUT=./bin/gen NDK_LIBS_OUT=./bin/lib NDK_OUT=./bin/obj ant debug adb install -r bin/HelloAndroid-debug.apk adb logcat Halide-17.0.1/apps/HelloAndroid/build.xml000066400000000000000000000076301456515664200201420ustar00rootroot00000000000000 Halide-17.0.1/apps/HelloAndroid/gradle/000077500000000000000000000000001456515664200175515ustar00rootroot00000000000000Halide-17.0.1/apps/HelloAndroid/gradle/wrapper/000077500000000000000000000000001456515664200212315ustar00rootroot00000000000000Halide-17.0.1/apps/HelloAndroid/gradle/wrapper/gradle-wrapper.jar000066400000000000000000001435121456515664200246510ustar00rootroot00000000000000PK UxE META-INF/PK UxE{MAVMETA-INF/MANIFEST.MFMLK-. K-*ϳR03-IM+I, dZ)%bµrrPK UxEorg/PK UxE org/gradle/PK UxEorg/gradle/wrapper/PK UxEhdf#org/gradle/wrapper/Download$1.class}M 0h5Z+v/ ׆p!.3̳{?~~&(P0MHa3e2&p lÐ|e;D-l ׽C!C"v=lrKOx RhO]!'"՞@yMB` !k>"APݶ-_}ɻDu_yks~ r[=*ek€a)? rg'U ewRĎw s'⥧Ǔ9JZ Y >HH,θ1Ppt1prUNN!;$ i}On->+ſC Of$#5;#*uJID)6j -5}+kY_1}h璥>C0EZQl\!@1JQ!NbN)R_p槩r'GڸS6[Kn0֢\V7pM^E\dMPK UxEXs"org/gradle/wrapper/IDownload.classE 0  ^b AP^26J;t>;ɗ|{z~+%5O&WΔ(a_4[gR#!XbQVg={}1AYCX'R5c/J$S@pP\mKulPK UxEz\Q-org/gradle/wrapper/GradleUserHomeLookup.classS[OAF]R(j[[ZU˪T Od .dYlW$jj>G5=R+ȃɹws??~XªQx)I)`^F\F ṂzQFRhMK K [A*_ɮoANϖvӟtp854˰ZsM0ݍ+e錞K{zahӱa{jr⿅ >4fڦ?(06 %L7k}8e*)v0 DqZ5*>F]m4xqNuj}g'-mZ0Zjw䜦[b!ڋ3)UD0A\>yIA$Rf MxfFӴ*e]ӫxwԯ x wuH𘗽P`{!!}%nx/ q}Jhͮ0,މ=q@{,Qzii“G7 !8CH3 `_[(`+8$U)<$4OZd4}/z@:CYׅ"D "Vv I (&%꿮)[|SW/9s ,n%BrUv/PK UxE] 3org/gradle/wrapper/ExclusiveFileAccessManager.classVKpem&im!@!PlP(ӂbhZ 6]ݍMK} >u蝣2#-XGGǃ://߷6?  
Gw^D>tIq WiA9% NߏQ#583x6HSq0(3dD-*pWb~~֤':u$SzOut$%R1%+F.k[kP0vU 6E AJJΛ4lStAjfPM&$xf5)PVuU33ާX^{XXʴrdu56n)j/dbAS;4]mdBK1jOqڢnr-hkz,ceK(.<4̋y5cӘ![vnfF$Թ gPaL13 aД 5R96YK,Ap/]l \ak:rAC}ѫ .ZjMǗf}uUQebPMfA=YT[ {[K8Lb<sHi"3+/a2FXY22֯u{(2NS)G//T> 3h ڸ7B(^:Ga*{Pl6\;#4͠v )l y&ͳ`sua$&&s"\^&x3^9O&39qߝ}~9z(sL7R <7o :ǐG3ăVL=W_ߠ(~׉_hy?;Cr|1C_JV,9fx6'2!Ƽ!09'($)^RV-=wj$\7ud/j-'I%4۵ 2ۤG*1 <PK UxEc`-org/gradle/wrapper/WrapperConfiguration.classn@g8N7Ρ--@[QU6@QB[I*ؑSDC!fN, ?y~:9w(w7͋fzusY.xCCF4Ĩ75%c0.w*&UVVqF IJrN kxB \»'/;'k00#IR,TkA i CeG4G۳B1p3r중2SL6 zl'K9lF#N,wEc"Ұ(v 6[+U32vٸfL;6bfDt,f&6nh4# T;j nID5/G'"皶EY(sJhfZ)Ԭ&Yv숔XH`8BVXnə"Rd`KQW8kKtedTgȄejrm)3.oer$K.uڡz9Gp]LY[pcxb٧ NRzFKKSA7sȧy+:2`ligh鰑qUܠt~`fVT;j:V03v; glcJusyI\\+ؘ_b?ڭbA">qHǸY|u*qSr*-=~PqKǏIAtK.XڮG&`ۊ5K 2c3EݾH}tױHk/u5kXlH"5 "M;|L^x|I;Y%79R, [rng}zlj2 t᳼w2iIa%EVFi.OoΔu9mX<VM{ 7J/Q7و 70_C<.ȿ&(r>)U>'K۱T/) nEh<AXBC4E%/ADo_݃>蓛='O/6.6my">FA_ ,b` ("TCj;'JT|p3Q}4}gx[C8AP"AB;&ꇈ.JziE\"&?nqAKO8:aN>GbF'-&=eh=q /ӋONOp 7gBC8\QJ Xzc^sw PK UxErn&org/gradle/wrapper/PathAssembler.classVcWy8sصb',IS(ID*Nڸkk#o*ﺻĆr(}Z)Ĵ~][*7i73ߛ7ƿ8Rn) CQ(LٶdŋqQ[$'WRQܔ[2FA|$gR q|Sq|gEd| _Q|)/хUQ _1|Ko}q'C9y' XZqp\X(e*PsۃEqtGKC#3SÓG秦'sc$Fnh7lQ3 )0 LLwV+t&&ǟ:`k>ekY4|3}(4y0 TРJ =-u/g5ېur Bwek++ܥ~FLMvo 3zgF{(lMtwoIlquu 9&ilb\cEx{q֩|d/k%З˺h}(莫WrŒmkS? ;(PmVW ?suS^uu2(̆c΂9 s.nxSWXvq`w*CRPJ.d pQy<ŏT *5sxZSEnYO,Jע(żn ?gF*~! _k̻gMU1YN$K\E1pC_tCFGŸUU_xG.;`rkp Ⲃu;Y2]cYgU*擦&HܤaܤO>]aF(TdžKF Irfח7(In 3%I$K u`obM%Tw-5ִp-UhWf^A;]Tp!joCb1 Cݾ۫S(#-S:^p>if~y"c<{Fw12~C %FJ+yͥpD"s( 6]eL/-yk63u1Y;)-lé\}j_,iEfӚ~'lpv2:oFx *W p}nۺ?vV1r WpV8(z t ~J rC')q1μ}$i8ӳt,$0m6KOQηә2+{*{3D΅6b@|&2&qh q(c›v&vzROPdҭH-[!1PiaуUfWB 6.q\Kȱ [Fͅ=$Oq|}t]s,v玬s@14:,v-21)hbf1C,]gM>|C _kdqB]32Rռ$hbFhy'[ZpU<ћ!/b机A==HW=fd`TaF2fƓ jUڶaGy\Qy}ĿT8yN22IAүa=VBD=d+xﺔ%|4#j>K>GXy B:|P;XLJ|PK UxE org/gradle/wrapper/Install.classX |uO{j5t!-Xc tKX`l]%xGڑ#΂D8qppl'ihzM:n&8d[ѦuӴM&=N'=~mziCofOi1}o×_%- EO4ߒOKx4+)H=VuA^ Rȷ@+pfin (7KRƭ&MANٸE o Qr`AŻswo.# e a؊L@"1 ƃ_׸ˈ"2VV#QcFaB`@cIog|bMl,$LZo>=9i3ިaAU :gVBan&{ @UKmZ;h,. mBFh.ܰA!cfʚ]}MT JDϪQJVtA%YȡnP>)ܣr/)/~*.nVkE=AB=*!vV ,)gګ9~*_PPS:!?926_VU>/yD(]ꖑYKr^~T ?!֯.pHn{u׎C+\_(KƦOd:y#w\"s +A˾,uA+կoyEk.'S.~vG/8He*v^U`$36b?D\vG9XS fM 8iUၩd@Q RyՈKi;He=jAqݸ8H j 3a Ƣzn,rvB.&% i<[XO撶ϿIjLɃɗM=zI3uWǀmfWUfpFbr.oEBݹM\T[Jd6x{C\X̾S#ޑd\FyRe[(yE*#OQ >;2upIMEs~ޘo‡TMᔖ2-?Ǒ9IJ>k?B? >Pܟ3B9.{_ux-T%)֭*r H%:#v oMt3E"i_%t*Nq\ɝs+Ǥ+N>м@J"" @e-/R8D-q( C[(:Ȧ3%g(@~[iPbP>6O+hWu0Y-RE9 y[r^/gF*&LkՑgK?+v9:Gr|>!aG2=xMER0K4ܲ@ᶗ-HexZ^y 7N/]y֕PZ#Gp]TOcN;Z5ܳZ ;r3z(z 'yRqz ̔{/G^0/6t7xp[ok?䟧Hm _[Jy_֜Gc88u iQ4R}bafH? nҟH\۝U;hDz1?xoK)/"UGZ*h/jx*"汈uZS[/䛧'isԱHՖʭ 2_]E.P/^KtޠڐRy P`l8rwCs3/U;p8vp$L5Ч3p3wFO;~+賎۫k_LSMQ|HTx{[*iiy<GH~X M i*/:ID~*I,`jqgߺWl p-g$<ϞH`HN0!hF4  'üBLClxpf}8"ʚk\#`P'npHd\\5&#X?|jEV8]n`F oyexȿ@''/ 'Ck~R+g vt>\@N~d *$mpq84dB``E2h=@/:ib֎tZ V]D@Gߛ-=X36Xǜ@K_ iOQykO\}aA|2W[̿e(Hk" fEl__sDl/A7 b9 'fE\ߤ/Q܏=RUk BfW6Q跥ԀIv5)۾UwKJ>ܕ^SUvӟ:_C-+B'/?]Xh+E }3a1!8k?r1-I{H?PK UxEL -org/gradle/wrapper/BootstrapMainStarter.classVY[V=²l1pNSJ4 `:/FD,.iڗ6/|M/vd7w;s̽co/eqWF/f$/aVF+dü,aI1XzwMڴYV5#nͬI,_ m5F:-HO͌i:tV"0 ! 
a\~X@]T-B4 3L5HpO̎JTpP;"%5)%YҎ:l/O~1yn$۷5JьTL]f9dzByd.hNuT'[UzifJ9TV"܃ )xt\:JFvhؓDA| KAt'zEܪE=gϱ()xCM{T G >Ed4Ѕ׵m _+5rÏÏFyK7 wT92`fg̜ ԩ6 ڻ,R}5(Nimm;==荜+AU*tS(\z&+PUYȲ"Z?,S*O8TTp#]p7ZLgw)⾵,+̞;CנӺBͧ`Z%ܮqfLUE:jښq`>;|4{Dj6R\a(\K$"\&T)3զ@j1-2 X#5E|@-봋,:u;O`L!&;͡cԭ“?G1&1_҄7_&ƄM<'|@3LZ p0b CyJ&_1@$CҊC4}xWnM+vuon$"I&/OP1EѼtİyt,mbiory\ ˿+Z$|cqVv (Aq=:Mawxq1):L%>=ZYJDvPK UxEE C (org/gradle/wrapper/WrapperExecutor.classXw`#Grqlg;e3KHBh<^XNRYd82H[hX-mInh @Kҽ.{M}wg\߻7xO~p 0^k¸-ZBkeua>7(nQ(`ob֑2E6KCwλP=27~>e ø'ΏQtX5-_3u_1+4ɝ0|!c|E/ Jŗe1+Q|_FT/5:׵eohpǖ#Cý#c bZgVMtlMWl-goղE] ]R> K HVT o<$̼nن^Pĵh>m/Sm64ޥ4NY{4+GvhD9n\FwNcB~Ӛ蜰LVci$:sCS40sdjiZ4h{ iSpEk*Ow.RU [oԸn Lk٭e9G-=YzQ` EPpJWAN;I fJ]> (-ҵ#beJӚgi \l[JN9HlۦLYX}2þDm9)4cTIyrQw{}f'tۏNE*hk}:)ȉUW[Adg]i>-?G%(!͚ " :hT|" FMaFdSy"xzqkU>oP)kNVRFߩsMv;71M0 ҕ-Dwh:DUן Ze[v۪4CA,gߕ!{>H1gSz{6;iLBKEuWr7r[9h蘄,f3 Y{)w*Lq"?ן#SH́D.NԮHVvu/+}gn=(Izު[ǣ%iPQ7<>FeU 󅷵2/3sTExK{-q-<)[ z|O`pVgłg6^2g$_?T"/K8_wQ_}Ɠ*@((l4Nנ9dj}bnfv,SIG(Ұt vrek TP"!e~,%v#2F {|ĉ/ISzQL*=KE.{2n*IM"iKOAnIePp6碼iVcгS9~ݓү)깴^aV~qU:/|% 8 @)a[q Ø&,z1cڤ|j! Wܹ;1z^DѧDmGАHAcCS^`4:e2Mc VLTw[ŝ3ܽUޙCh^r IFK3گXJ-g?랪{1܏xI.)^%:Fi%T@ϟ ‰6Mڅ|.) '5Nq `ih1p`(j9N0{p;&[ЀW0J OM4 :Rfػ_5űS-ޭPm>K= W>PK UxE”" *org/gradle/wrapper/GradleWrapperMain.classX x^y#lA0,HBlBbllml됖VHWzwh6iӻ$me7nBϤMJ,-̼ ?#*v[|9@/T"r/V/^ŭr2/+REܯƫUUp :xD]wTNw VF/oRoVp&ܧbޢ*JpyvĻw=rxx@9ާ fU*¼)xXťxDi|؋3c >.~Bn>*7@zI )/Wi-r7g ŗ/ *:@Pუ}# ;<4|hPN?czb*8j%.35҆@xo߁P_obG4v xz. E`:>i$1C3zlLOF<豦) !39JꑘgUГ;F Ŝt KOkRE4_ڊƂ RO cƥf;aӖI1JڳDYG" 4,6qDi,"o-v4}F/_xoNU.A/],&eVB%(ӄG+B*RΒ\4BxV4n7)QRTjeF`gHV bq@ S5pLXGf<fSF`ƎEQ94y4\$ZMIvvNG+A3`$\JSt"B.unUA~JRbd C aW@jGDH 8~"VkzLb l!]FM4O\X,S,ձy`h>+,q*̬8@ jYPjZOu'q#aKhǝz qC.gܳ嬝vXXӎFJ.np8W}ɭj[؋u'1SbGwLc店 ;6%64Xvܽ2ͣ\-` (سJVvB<`Lpq G!!\o%E`*ÒPq Σ|b'e)މ2_ǧfP6>jR-,Ke~O[5sXީVG 2XN߻yDʶy sX3y4My%º tۅA3Y؈z` Zq q6Ѥ/q$ 7rKCKqta<_klĝ"ӓ(#0xC)=Ca頣AḝI!h36^(06߆Y9iKK /8U؀n[Rd WVE,mHZ⥷ ^lkypw`RZl`kepqաy\:>Ntށ<emsb;OgrWy6۲40ȵ\B(# QdfWGG A'N}]Xeugkocqe)ٷ)G/)6 rk`N'~^4ћEK#_M7Gud:W3M fڵ23T Ͻ Pc9Sӛ*x&nzWЕYE <YХ| ގ:'X9`X|<xrG(77}DBɓG:[ 8 V (dc' ޭ OiY(ԌZÊj!=]71mP VelӓNn l/薹6& EN]-$bpb Z"!pt4`&ԝfLr[(V<Mٙ>}=<+TӔv%$Kg`T+FΗ cp8p%zאg;ikefk>۔2lLb1sDDJr@ʧD(Ŷ,G`aÉ5K)73Q&}x243wLFyYzԜM- hʶ[)RLضVʎJy56QJf|t/x/t֪Xz ܄Z:LS7PqM5RW:e: o⬊ye]oCB҆Tn pNGX'ԇT|/T|T|o|Vy|? B'Mae܏B<'L8)~/>K*~e&vuL5KS/F&L[C|Fu&uVyice"?nOEWug]x5='_B{g`]bݯ%;IPLi)>ō-hS=iw/N9"KPodzzC밞7Ш %R8r "j %1xBUx/Q[Jk@SfƋtǒ!$R _Ge+iUԤ1 }!ԓTL| I X*捣b]Y~><%#8:# 4^Be֨b!RIcYq7*_b];{sLƱT** `# M)5_%.Uq{S,vݩp),jC٤dЕqK[[f A% R l"PS4 ianPoa`XD;Z>~l'C #S]~LEU+F^k%-؋m8,њ)·c;<<;Hla}$ IvPE#oD43ԧ7K`]N{7B7U2PK UxEj jV8org/gradle/wrapper/PathAssembler$LocalDistribution.classR[KAftq[yk[KA!|(L!N MJP?U<3Ҩ|ؙs.s˿ū9`OM}Q&m5ȏT0芟"VY~ܪ+Mc56-ډiK` {DjyT򏄽Ilֲ$,4T*? {M Fɡkmjs M%ydw-~[ؑ%Ru<[5'_n$6E<`=fAxwZIDډ?7 U46?S@h=W-ĝPK UxEۅ'9z !org/gradle/wrapper/Download.classVi[~GH0 0E%ncGNSHeuef䥛itItI׸mة-\C>/ m;I`=.s9,瞹:!"Ģt:.ࢊEl%$Gqux(Ⲏ6\U K&/ 7i F[r߫RokxMw4|W4|_*~xCś*~ HYp& WA_cٕ'޲|lGAtf3gO+PR vLؖ뙖7oʢEu|ԩI+h3Ki(Ħ.тi-='o-aۼp\>[y︂Cm+N9h[b\\ Bjg¼* zywvF3W"ѓe`9&'k[#ْ\@jFfST|Aif.[ÜpUj@%tUDAfؔ 2 ̧9e ĒRFxʋ΍3%\2c/&tJŸW)sfރĈ5bL;ywSVydsY*J7jD慕ں:zIFFlQx.Y.;Y!foVc qGpr8]HIV[O ?Wp:mE/ Uko I347FZHGc4d*h]Y#=}T_p_qCMo^-VĈ*XUqqV KX|F.UY1*mb]峢sElpG1a#P3 Xo*7>h xMz8SY[;[p, U4uޗxP_?wyֵuWx'ŢY.$4t-|Fce=cXB\qwvA)y6a u$oDROjFzWQ{[-݆̊u^|톦ࡆڭ:}Sn61nӪ$WjVD4=r2h*Z|Ge([]"}ov&=$ܹY}\yBy9züQ?ְ~189g$q|+{x]@* F W8vCމ0^D0/P)0m)L ~] w+=DנfUhwNj>rЏclU(I>[o@ŁU in$S3x)΃<`? j֕i%`G2hf mFm_Et<\A,$I5u vƃt'C׵9Cq2Gon}geޭwaW1[|c=E>ZvF*{ CCCp\G_hV:n|x5=V[!e(WcqS(AbHQ4gx3KE"RzHB,_#7 eP.e7(E(1J+S\EL33s4j?YKjܭ׸[r]~+eaN0OA*s,?75<2c}>{tl?d0v@>B5hGVt&8΄t&3j\Mgt=<́53 |IZ? 
)kOg^>Y S8Y^YBuG 0 ] 4GtnhרGPK UxEB=PN#gradle-wrapper-classpath.propertiesSO)IUHIM,RS/S02Q042126Vpv Q0204*(JM.)M/JLIM**+MPK UxE qbuild-receipt.properties=n JBv\zTU!&*@$7o<3]gL,ԏKf<Ε7RCNŷ,/z%~8Lǖy➏R7q=2^{?yZPK UxEorg/gradle/cli/PK UxE<S1org/gradle/cli/AbstractCommandLineConverter.classT]oA= +mt>BHhBem` Cߢ/42YPa9{s?p# 5ϕBy9yEx\Ye ZpՆaEjw I7|ZU c:tiw]HaٲzQdu;BrQfظ&DV\%6# ǹƙun]s̷X3ػe9H%ҲIk.=,ȍ-1ak^1蔕)9(id0^}y_7бp:%`i6t*㰊jBL F vUWm_ImuוD I8H7Լ&%DDՄ3qONB :ACvsπb0\8y 3CEw&T*kY+$@B!K͡5IYF6VANwh`+&XEKH,έQFV# J#hRq +dAy ~#TN*)oްOOSxΑ 86'??'k<ų@zV`PK UxE2_e(org/gradle/cli/CommandLineParser$1.classA 0EhZ v庈kCPEv-iIp.<S\p>?fxCDlnmMJ]k'iu#0BWՔ!f,By@wZ͕t!BI]#HI9|g|{ -|PK UxERB <org/gradle/cli/CommandLineParser$MissingOptionArgState.class]O`6d 2 c&/1N4`W5S/ ^H#2-sKƖ&9O״ 1UQ4%ի*gE)D15u KD10ؓp :@^3\ٲŪ+^1UqQ)B&@)ʊ! B@n!fU(AiW|ۤᲵMnR# yd8!>] ZEGO*%UoulGQ/\TMn˹-02#/hAlu@t%-q*2Nw e&BgsQ\gݧ-F͌& nX{~ϐ2zfZC R6A Ӱἆ%GAa3@Lv0Lc8~csC7]Saf=/b !i!WiXN$!9!:">i!z&|'d&w`8Ig9NOXFiХ uD5x Rl( 38qhKCO p:mO.6p'%0o %rC?7o@Ӹ<=@QO b쫇|CMa"!˰߰d^7Uܰ^t^EY3G5Dh8%h+aVE3D*PK UxEM2=org/gradle/cli/CommandLineParser$OptionStringComparator.classTOAfeam`ZPd) HJ  ަ Yvɛ{&ƳvBkj8tޛyo/ jѱcр%:UlQNJU-`!?`nbHzT(gHuO:{|"<Cbq,l ߷YqDݶ̚-M oyޕJ z-Qӹ|H Nl#Kݑ]:Պ}^!C ެ\kW=jA;W{D9!ŒeAS/@%ئRƾj n5nIJ dOcy_e[o9 =}&fH]ͰqiR-x/ rahf2=BF=Y \ KaڙdiZ_L*SHhpPsED?!Fq`pZl{ݺ=1#[s!B{./zi|CG1ǐIVy&$nR$B;C'r."{i"Llr".03m`Y(N͒V055QM<3t Ʃ`1< mcd3B= `OPK UxE# GK1org/gradle/cli/CommandLineArgumentException.classJ1O3Zm+Uו0tD23k*|(1IK-Y  2aEE9炧=OQzTqvK9fU v#tiˀFCb!ė*BE{2ȅ 9`)K,Ihh\x &J>p1YITةQcBW~ͿDcD y f@]tӻLA^%6uV18(c]3 2gy=ݴoa̝̩ kqv>PK UxE?h=org/gradle/cli/CommandLineParser$KnownOptionParserState.classXkx~dY&Vdsд!@LawL73ۙJ[^6V{zJB^֢Oy<}ߟ=gfvݰ?ܾs@;ޕw%!cL!XIy^l^^\OIcB w10X/LtL{$|Uz|={el}2oɷd|a=_P8? cu 'yy(agL K%yT¯$PkXk戄ٗmivOBsh['Xh43g~vt&@A˰qlŀ1bjn&5 wfbqԴ]L0 KB}qeF;zylXqLҟ;C Dl2XwiMU9^܀¬[2ԘnհT ky|qG\R-VlFtw[^ZpwL&tJ%Fڬuu[s-ά }A*I/m^E> @YK#m^:y}` b(`Vʎ7hZ8h W '̇۶g vGVlWpv(؉)BA 4.nS:vA :WV:ujմ\UTBq>:ȲI6)Oݦ%ݣ6uBsԤmq=ajl-N+8*7+xC bH”.k8e9tTij$ *9 cNsxF ^ ~ڛJjFN趾d/?BQ2xEE*~!Ů656ד#f"zHxS[,mUbj%pAMi /}2h4i-lӨo/F[Xulxz o,<#^Z<;^=F݉5y豖 g7eÿ%8#^+XLw6j=K֕ xr^ i/=Tqܼ42b6vc&>ܰh0g|,eŨCgm}C~Rqᦊl>hAdf^]2gbV% %jO2!ǃL&~spfc9OvtYSr\w 7V*9dpŮ[ [3Gҩ7~vOrDx (Sl}izKg:$D)ahJb9gP6hhVD +$D땐h<Y<*6JBx M'X>bAlwĶ%Y9!e} n/*XO߯i;BPK UxEk7org/gradle/cli/CommandLineParser$OptionComparator.classUmOP~nQ:oSD oe b0!c,Zvğ_Hb"6sJC{=9Ϲ~P4dW%,(c ^JX^IXu E ^3tmm2$r}d }w5C}5\ޱgVnkR,)XN];ڡ5P ЬC渺3y  ,#e}0Ђ/]i'jjV]zaՋ S6^~ u- TtY+y&duK< Y$jj7cv 2W[VAۛ+mS*(uB1lņ,i^\r 8;c =x\8npﹸvr CQo *]] 4|[Tuޒ%`anot.AAiU WGŵ S/co4<}u!TZi>Jv!a<@*QȒ̜!q ;G'.!v {nD<|ڊ֋1(Н0Q I9b|b$m:ݳtBHO\g3(W`iL9I k aXtβE$i,:AyLS)""J?QAX_PK UxEb'n?org/gradle/cli/CommandLineParser$UnknownOptionParserState.classURA==$a2@(""UZ`$SdL__\H*~7= B%q}q[A :0A0iDq5k rCw˺& ezabq [vJ%nWL[F ! ɕgY6rkk X>O 6E=}9A{gN)Ҧ>曖.^]S`{jHO~噎-pe}^^rJI=R7٠cSޒ{2iOcinی1^#r޲t1b7?4yvo=)L^)aFZ֜[c9*z'̼n$T BAa F&00 mGd6a4j3 C'/n{._VƵձ+1DX UR%H}4+.L_c=:"L32}F7>L`g%.dJ\)+QLgq.:K^dI׵{""¾j4e8 ț#u`@Ts䥓?|pP{fr mEhQr!"XOٯUVH/.\ 鋘6?9R?SͩUÕ@FcH:U1-NcjXc4Uy<blfq=PK UxE"zZ &org/gradle/cli/CommandLineOption.classV[sV˱E  cP@HICEUGT\Y S¤tCg'C~GVdٲKΞ՞ow=m;0%c:6Lu`; fx%p -xK ܆*cAF>ꩂ4!#x,ƠpG(ߍ%qKFI{dY5~G*n4gXBǔ^4Ubkֿn ,[7'yBQbeI3%5r&u#q2Qy[wHHSYJWɬjJf@H7h[Ҭ,J8ڲ -7ܨq&nԝR1GC (ONT4{Zj̨.2jyi~\PSDz5,žcKBO}p,̮V!^ԜUmH(Q"lbQ-#a("Zrw6 ivΝ=w _|dŦmTҜ1d{mC);BNDfQkikɄaXq2ЯܵRB7"$(XAy/A7A#>;:|(F ͹=U&KcEZP|yq- S+·\U|HQS$dú݄31TMZ@wťK) sAChU0^]<. 
uʓGa(M"%z.업cg"W=d1fIٔd}F6,O).a9Ţ,se,~Bٿŋ#H r."Y,3❂Ĕh%aG1C Bրަd0vN_y)]}C[m#U mO]ծ7!VWT{z,jVT, AfDm dUO kjUj38QESV-J-墨l۱n`KFO{3FDe4uI\6%Bl5ܸ`e۠<3-ԡLԖ!J9v ]F(=75sٛ5}?{^?ꎹbxm%;sf횳hmNIxoM&?: C]G(سbqu_u|C7u| Wuo+H-ekg{nl]u?Tp{j.g-ٯ:bJǏc~NC: u *~8%sӽ(z,>`u(]tѼLQq :~ :~ D OV\#ZFg,~!3CwHEnZG$]hec9gڝfl;+}<p;/o'_ cuլ7[s_+a {0{&`vN7s0ċʼnrsqf&8 *BW?=qc4㽔(M(D:"cסRS-t*݂*Rإ蚹n[H(^ZHRqh,{DRTzØ۸ύ;{W"9:L!YDaL^X8Џq/!G=If11<&|x'#ې8 H1"G)\MRtqItboOH·Ĥľ5!ߑ? !1Z\c\웎{F]ʇS=6\w.8 Ti߸ #W%#'!yr#1S8S06|38-I")XZ V z'*=#7ppNPJGfFM*vX! Bg鴏 ]5HgUR(sKX d1߰6kGDg6+! \<$/$Ӟæ<~xQ~L%s@T}wCG4x̯.Psn)"{T@wX ˌ菜58o]$EZЈ/g"%1B-)i~G-QFBJFn^$mPK UxEA5l| :org/gradle/cli/ProjectPropertiesCommandLineConverter.classKO@D|?Pâu#Q+$C;1m  JW&.(1D,9vo/[@yl汕G)v }FHWkwLS!]nY7ZK:̿cJDZRysV;H+-)nkS#cruLXgh|BjFYDΏ%L%񎅎*_?ֈ:("<ڄbJՍ ؊tf^*K ߵ XUVi01k p8wZ8T0g?PaΛm=C Ss | 1\Zq-}C_JEˉjE+ w'PK UxE2lWJForg/gradle/cli/CommandLineParser$CaseInsensitiveStringComparator.classS]oA=3|,bYŊ/b JbB$jBfmvƿŗ`|G,L C{ι=wo4<O2H%= Ttjnੁ}dK9*8b+' ; ]wI_zǢoS$u>(>*A[Ue/3n3̎Hm ߗzck쉡-,N3U?ϗ^i e;^{*΅e gl͑HC9Y\Y,X f.Ggab@HFSϢ/@tOL]u(#_k#ܩ7onaܾN25h8)Jeb[11_b7İ#j1lFד>^I>7ʢ0uI;, W/pdiM O: 䡟 I%'/;Lق)<$O#30VQ+qvC#B7>8^C~,rPK UxE( ]UT*&org/gradle/cli/CommandLineParser.classY |Tչrgn."Y"d% l&̄ *JZ^( 1uֶu}^m{k&ޙL.&Ǐ{Μ||͏;NDs ϟUN>:&5I֣AOk9ȺN9_64SQ\ϣP>hU>bNWOb&#S< Rt*eTRiS.+*S) f *9պ9<ϐOg괄i\2괜ktEBX%:#VXy2i|NZ>#*u2svv.+sy[/* [{m/ߍ2a|Ho\zoŢWN|6xXgΗ/c3[EY*m32u3LcWw&ñhc2::͸53M3hŠ&N+^nۣ]Q{rOXƤ4ގ/8\ipSQ}8lmMMA֤GGƨsDk+0-`jŭqci2V־LYH #ZL^n- +]nŅ LU̝fUW2[m5fRE1.NA]ov&nɮ84\=1mU6Eb񶪶#*GUe h)x6#خn1_ x0SbuƭfS@x;RCa )%jᄹ5bh8SR[kDYձՊBl*lF֛v0Ę ٞKr9"~LNY4d`+ndl? V* a%T+!h8 %/3A.~+ڂW"b˻/iDs61C^x:2,#">El5wMy]-t|Ъg+=6;;S Y5lays@HJFqz Fn+Fޭ f76^3;+fts.l|r9ВA,h%/S0碄'2=Sى oIJzL,bSIdXVHAX Գl4#]gEqnyU@C8fqNj4#pafkA𝞣Iv̸7λ-F-.X_Dfqv+ qr&޺ Ld"=.d[Z['m\vR>d b5a*3:l%͞4dh(E{TX,Ώ9Jg8,]H荱x3j8XIwt0 tA_;&xJvv%aS찷8a9i!‡3P'4x6!zx`gd_W{䳗zϵ՟J}_m5򵀸ܰ2s|B!7,ex|l-oۀe%ŒYe_; >wռL,;cg!#34Uhcq NpErA>d%C˷|o|?_anh毤b!%tuq˂ vIuW ?v%(mfunWt Vxp، [[pGY"ŋbdCMƒfKKq,cHMHdpxfYLh /kd~XFcѤBv3 uvx!@|^2s'|_55dbɦe{6u^Q߽ӽ-J ~߀Q% hȗC8#piB O<1oxp:bfTiIӻ^yӝHZx@EsFWxBI7RRl\.GdaƗ< 8ƹ1}šsn@v,2녃GfڝT6 QcLI[]avoz<1Ď.3!H +Yퟷø]LtmM8[Tҵ!ѝ%.*37l׷fbd&3{W -mv VW+,RYɴ5v|pJ\ 7c.W')ˤOWGYI7vi+ږlWa Ǣ hʁ2 &%n`j7nS[<չ!,fB|ô eC%DrϮOOGߍ4=6S-]JZGi+uS3]I-t#YȰVDY=K;~ )BQ(ND;\JIO]H;y#m͝t9+Ï^~ >\Wi?Ct=n?-W?$%{`Ȇ𽃦c8z#dLd= Yl: M'I9 +j `ߠ+ii^fZ~AG.,qTR=ϛC|4o[ǐG)QTgep/^t@gF$2/)3pKKwyX؇df^H  n^OW})Y[HVM]G?L 6иJKT'>(ɩ6de6jxa-~~WrWtD7(/oKi84\ -Ig!UDk~^SʈoҷlxVДeg>k˛utvCe_hLOtx+5ja=ޒcTX i*Pn+t >_arwsG&+_Ch}RO;AUQ[SsjP?69 k~E{-~uh h?wPK UxE_>ң)3org/gradle/cli/CommandLineParser$AfterOptions.classmOP( @ 1Q|Dt1wkծ5wE#]|!Je<1Mz99s۟PƊ'IW1!y &TU,2~=x6x !fu'\lkug!s6tL![/Hvfe׹[‘zdL[N!e`oyXuyej&!&)bA02]yr^5Vf6R\4dr] C0ԖQi26{{J.-ߏ޺&Ww{Ֆ/IU0ᘓ*3Le o13/#|]ˣ%D3zaS2u7XMc qaA78yWN*%1|avť-N x!*JB)>#iIShln|AjTDVИF(J9DRR; %܊1C!˩1(|GFJfKmt$F5kj|EFߧjr dY+!kmrQ>э=ep2n&0ft9\9E'tN* iϤK49hPK UxEGf3org/gradle/cli/CommandLineParser$OptionString.classSNA=]ʗ(U-e)'#  XlwD w+Bs;s=??~ &cAYs)TҘa! 
ei,pW]p_8 =[wfnsak璳bv0[=כ)m1$j^C0dlWv8Bv-(wAP ;e(|¯9<Zkߴ>o8ª;UZ-6d7_8'_AyKBkxy>Eӂ4Ui 1 z]A@Z Moz~]{*Ka:r摎, b^hxc+ ݥNC:`U L1r) ;{2̜?3zE9nB;;esy-w'i< MI^NN֤ʍЛh8Yq|9wvc 91Ȩ(` (OaʓbmDm>X1#Ka ~B>FJg!6l|;4 kaCԍx0r/iA';`CG G¡c#'30@ÌnmaF⺙igM1*[Ĥ=\E7i PK UxEx&T` ;org/gradle/cli/AbstractPropertiesCommandLineConverter.classV[wU L2悡5&I  i-4J`Ei+v fpfD{_ m\KWu dY\go}_p?H8"%ެHO%"ͧ= IX6@7=q|';y !H"r k7b0tuVɫ}%_&I cbj| f1\f)^2bYZ ڦbZ:c듶r*&2 M n8&M2jV)DĜ2W5( ,MIzn)嚺Re9hst4%D  Tmgv֭ttKzAm" e7tv٧) ٹW v"Qys;]^W/ypF,w5h}ے6&FOڕ.e`ZcCQ@Q,_Q+WDh2DDYFCF0-6j qdqțt+#y:*[0JZEf~]r~N WScXD\/ô.'Ƕk_4ЫTS!Wn[0( 5R%պ<a՝ISؾꢦ.Ci$h{wfЪCMx(t'9%f9o=,N=M柃B&; ﭣ/ N?gyO:aeJ^/F}o b b4kt"tZ3 яU0fx'ySr 6 aadB3 /)LF}Atc^l?$ͼA_ !7Vkg렾@ ‡CGGDoЅph.` (rroYJ]kKIrIn#)`,r|4 mYYBY-B1>q?PK UxE ,org/gradle/cli/ParsedCommandLineOption.classS[OA-"\A(-ʊP1!iĤoC;Ylw- &JD}G.I|9|93|`+)dPPQL!B0冊I,-,&q[~器+䲤⾊ {lqWAW-+ b^ f~,Cz2$J^YA4_T[Wjls6)5fn2Gsy;+c&k_2U`V]Rm4=a[T.ipoSP0/DND=t=Z+jpS0? KOrbjrjd0ajWKp &N&7YָE4Wl/xY92,BS*jx<Iy2=4yt Zce[ZҚE̙swy' :iB&1!CŴ3*f5$0b^E!YsMKc2\(4ff[\xf}f!?\t]($zr,.épx| luj6%;fOxզPɭq…% v2^l"P+ܧ8ش癄[2۟kVMa*W`sQ(r Q.$uv\ y;>ަVv[fnYx2=Y#X13:ncTŢ%,Xѱ5Pg),V5H3 K; 3'<<JڵSΐ!/g&W;;^ݸ v,ly/x-/TQZ]܉\E/˕=lk<RwgrŁtZ.=EPhh+"J6mL+"F~OA-, f1B 娆E$5"y  3@AU'2P*= QE3D5@}C2Pw'3y=RHI8[-vtn5@ $}_+:)+C EoPK UxE'H g)org/gradle/cli/CommandLineConverter.classQMK@}ԯ'"4 FM)HQ ޷lI7ݔ6MBya=t$S l)8A {Oyb :˄3I5' JXdT"qx{a/4OR1=Q615 ڹ6ƇEWbRh{'qj1M8zta,gZTRwZ\SYVSݜnhUYB/n/1Oz_G86\fjAʚajJuG-UʙeZ7^[u(5}2(Y٪Zk$N(@yA(تeٺip|0Xh5UUJ`[QtZw&~ժ:ѷ3<MSn9j*ߌq\6 G J*yN;{2#hfc f*SL Èaq!y" Ǘ axIbtxh.x%0&X}?:Ui4^}R֚Be*"nE& ̗he;*J"b68pÕ7rC.?V-0fO~^y뱉6{*B^*Y-z=:7CRyʍհjHC*dfN͔W{M ^pCMPzkt_gBR,]x;^Vw+eM+" ;zf7i^/O3}xb]C[B]d5c|Q%G7GO=m~} ҈'ҮOC.i^yb;30rؑ^!yڢ{f U5$:2lދ9eX Lm1 }IbR$Yy|M7PK UxE;|9org/gradle/cli/SystemPropertiesCommandLineConverter.classJ@ثmjE5BDąR/P~ӑ$&BJW 'iAY3͜l "lYlE <& d@HgL{:rRs:C*X4NĬQ ۴;hZ3a ѽG!]Gv7S"5eb o}ɸGtFMz9y~X{()spL`7e.KV, TXxɢfDTEGPWJmh~49AjxѰ sh gԙn85].FԒs9Q΢*s/@Ug J*ce+s+1 $p6/t-,;h-.Z >kZPK UxE-h2org/gradle/cli/CommandLineParser$ParserState.classSoPN) sT4706|3M$m f{@ú[s-?&>GmdiҞ|~pF62΢jv,-4WׄTqB0;%v=^(>{J` aŴ9,D!PK UxEF= ;org/gradle/cli/CommandLineParser$AfterFirstSubCommand.classVRPN[J;be-x)U.R(XE_( |gtPq猏C8I K0㟳{v|gw9| E͈4тHA/pK\3*B;ExqOH0ԛoB Нҍ\$gȫy%ͫ!k)USdqW5ռ0l%^--1x[+T^ɓğҳr~I6T/ =]kS1&U`fV Ҍ)F2/ Y88f-IMy$ M1eLo H2ڼ})MzٛZʑ"P\\y6O\}9ݲ'JBKeվm gËYzK/ Z1}**2 ~CG&1%`Z 3t9eaёgںokǔ@g%Ku`|G%E5e uS閻5ϗ4ڪ۲b}B\EU6:YPFJjh>3L578")&COH)YslH#?nA;ΙZwQW+S ^ͼ8W'AE[=|>>Јs6MMT > wH{im::^A:HG8d"Fx&O+\AX!vѸq HW69Z e+˝}с; \G7Ĺ(( ]LSh`cB p.9Bj1 D'($U<"z@'bj']GZ f<_PK UxE٥FDgradle-cli-classpath.propertiesSO)IUHIM,RS/S02Q042125Wpv Q0204*(JM.)**+MPK UxE AMETA-INF/PK UxE{MAV)META-INF/MANIFEST.MFPK UxEAorg/PK UxE Aorg/gradle/PK UxEAorg/gradle/wrapper/PK UxEhdf#org/gradle/wrapper/Download$1.classPK UxE[pDorg/gradle/wrapper/Download$SystemPropertiesProxyAuthenticator.classPK UxEXs"zorg/gradle/wrapper/IDownload.classPK UxEz\Q-dorg/gradle/wrapper/GradleUserHomeLookup.classPK UxE] 3 org/gradle/wrapper/ExclusiveFileAccessManager.classPK UxEc`-!org/gradle/wrapper/WrapperConfiguration.classPK UxEQ}i 0org/gradle/wrapper/SystemPropertiesHandler.classPK UxErn&org/gradle/wrapper/PathAssembler.classPK UxE Xorg/gradle/wrapper/Install.classPK UxEL -,org/gradle/wrapper/BootstrapMainStarter.classPK UxEE C (1org/gradle/wrapper/WrapperExecutor.classPK UxE”" *<org/gradle/wrapper/GradleWrapperMain.classPK UxE{x "Forg/gradle/wrapper/Install$1.classPK UxEj jV8Lorg/gradle/wrapper/PathAssembler$LocalDistribution.classPK UxEۅ'9z !Norg/gradle/wrapper/Download.classPK 
UxEB=PN#dVgradle-wrapper-classpath.propertiesPK UxE qVbuild-receipt.propertiesPK UxEAWorg/gradle/cli/PK UxE<S1)Xorg/gradle/cli/AbstractCommandLineConverter.classPK UxE2_e(Zorg/gradle/cli/CommandLineParser$1.classPK UxERB <[org/gradle/cli/CommandLineParser$MissingOptionArgState.classPK UxEM2=^org/gradle/cli/CommandLineParser$OptionStringComparator.classPK UxE# GK1aorg/gradle/cli/CommandLineArgumentException.classPK UxE?h=$corg/gradle/cli/CommandLineParser$KnownOptionParserState.classPK UxEk7Fkorg/gradle/cli/CommandLineParser$OptionComparator.classPK UxEb'n?hnorg/gradle/cli/CommandLineParser$UnknownOptionParserState.classPK UxE"zZ &qorg/gradle/cli/CommandLineOption.classPK UxEl\ϧ8|worg/gradle/cli/CommandLineParser$OptionParserState.classPK UxE#4P*&yyorg/gradle/cli/ParsedCommandLine.classPK UxEA5l| :org/gradle/cli/ProjectPropertiesCommandLineConverter.classPK UxE2lWJForg/gradle/cli/CommandLineParser$CaseInsensitiveStringComparator.classPK UxE( ]UT*&iorg/gradle/cli/CommandLineParser.classPK UxE_>ң)3org/gradle/cli/CommandLineParser$AfterOptions.classPK UxEGf3org/gradle/cli/CommandLineParser$OptionString.classPK UxEx&T` ;؝org/gradle/cli/AbstractPropertiesCommandLineConverter.classPK UxE ,org/gradle/cli/ParsedCommandLineOption.classPK UxEs=org/gradle/cli/CommandLineParser$OptionAwareParserState.classPK UxE'H g)org/gradle/cli/CommandLineConverter.classPK UxEC| <org/gradle/cli/CommandLineParser$BeforeFirstSubCommand.classPK UxE;|9.org/gradle/cli/SystemPropertiesCommandLineConverter.classPK UxE-h2org/gradle/cli/CommandLineParser$ParserState.classPK UxEF= ;Xorg/gradle/cli/CommandLineParser$AfterFirstSubCommand.classPK UxE٥FDgradle-cli-classpath.propertiesPK00qHalide-17.0.1/apps/HelloAndroid/gradle/wrapper/gradle-wrapper.properties000066400000000000000000000003461456515664200262660ustar00rootroot00000000000000#Mon Jan 05 14:23:44 PST 2015 distributionBase=GRADLE_USER_HOME distributionPath=wrapper/dists zipStoreBase=GRADLE_USER_HOME zipStorePath=wrapper/dists distributionUrl=https\://services.gradle.org/distributions/gradle-2.2-bin.zip Halide-17.0.1/apps/HelloAndroid/gradlew000077500000000000000000000115001456515664200176630ustar00rootroot00000000000000#!/usr/bin/env bash ############################################################################## ## ## Gradle start up script for UN*X ## ############################################################################## # Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. DEFAULT_JVM_OPTS="" APP_NAME="Gradle" APP_BASE_NAME=`basename "$0"` # Use the maximum available, or set MAX_FD != -1 to use that value. MAX_FD="maximum" warn ( ) { echo "$*" } die ( ) { echo echo "$*" echo exit 1 } # OS specific support (must be 'true' or 'false'). cygwin=false darwin=false case "`uname`" in CYGWIN* ) cygwin=true ;; Darwin* ) darwin=true ;; esac # Attempt to set APP_HOME # Resolve links: $0 may be a link PRG="$0" # Need this for relative symlinks. while [ -h "$PRG" ] ; do ls=`ls -ld "$PRG"` link=`expr "$ls" : '.*-> \(.*\)$'` if expr "$link" : '/.*' > /dev/null; then PRG="$link" else PRG=`dirname "$PRG"`"/$link" fi done SAVED="`pwd`" cd "`dirname \"$PRG\"`/" >/dev/null APP_HOME="`pwd -P`" cd "$SAVED" >/dev/null CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar # Determine the Java command to use to start the JVM. 
if [ -n "$JAVA_HOME" ] ; then if [ -x "$JAVA_HOME/jre/sh/java" ] ; then # IBM's JDK on AIX uses strange locations for the executables JAVACMD="$JAVA_HOME/jre/sh/java" else JAVACMD="$JAVA_HOME/bin/java" fi if [ ! -x "$JAVACMD" ] ; then die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME Please set the JAVA_HOME variable in your environment to match the location of your Java installation." fi else JAVACMD="java" which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. Please set the JAVA_HOME variable in your environment to match the location of your Java installation." fi # Increase the maximum file descriptors if we can. if [ "$cygwin" = "false" -a "$darwin" = "false" ] ; then MAX_FD_LIMIT=`ulimit -H -n` if [ $? -eq 0 ] ; then if [ "$MAX_FD" = "maximum" -o "$MAX_FD" = "max" ] ; then MAX_FD="$MAX_FD_LIMIT" fi ulimit -n $MAX_FD if [ $? -ne 0 ] ; then warn "Could not set maximum file descriptor limit: $MAX_FD" fi else warn "Could not query maximum file descriptor limit: $MAX_FD_LIMIT" fi fi # For Darwin, add options to specify how the application appears in the dock if $darwin; then GRADLE_OPTS="$GRADLE_OPTS \"-Xdock:name=$APP_NAME\" \"-Xdock:icon=$APP_HOME/media/gradle.icns\"" fi # For Cygwin, switch paths to Windows format before running java if $cygwin ; then APP_HOME=`cygpath --path --mixed "$APP_HOME"` CLASSPATH=`cygpath --path --mixed "$CLASSPATH"` JAVACMD=`cygpath --unix "$JAVACMD"` # We build the pattern for arguments to be converted via cygpath ROOTDIRSRAW=`find -L / -maxdepth 1 -mindepth 1 -type d 2>/dev/null` SEP="" for dir in $ROOTDIRSRAW ; do ROOTDIRS="$ROOTDIRS$SEP$dir" SEP="|" done OURCYGPATTERN="(^($ROOTDIRS))" # Add a user-defined pattern to the cygpath arguments if [ "$GRADLE_CYGPATTERN" != "" ] ; then OURCYGPATTERN="$OURCYGPATTERN|($GRADLE_CYGPATTERN)" fi # Now convert the arguments - kludge to limit ourselves to /bin/sh i=0 for arg in "$@" ; do CHECK=`echo "$arg"|egrep -c "$OURCYGPATTERN" -` CHECK2=`echo "$arg"|egrep -c "^-"` ### Determine if an option if [ $CHECK -ne 0 ] && [ $CHECK2 -eq 0 ] ; then ### Added a condition eval `echo args$i`=`cygpath --path --ignore --mixed "$arg"` else eval `echo args$i`="\"$arg\"" fi i=$((i+1)) done case $i in (0) set -- ;; (1) set -- "$args0" ;; (2) set -- "$args0" "$args1" ;; (3) set -- "$args0" "$args1" "$args2" ;; (4) set -- "$args0" "$args1" "$args2" "$args3" ;; (5) set -- "$args0" "$args1" "$args2" "$args3" "$args4" ;; (6) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" ;; (7) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" ;; (8) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" ;; (9) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" "$args8" ;; esac fi # Split up the JVM_OPTS And GRADLE_OPTS values into an array, following the shell quoting and substitution rules function splitJvmOpts() { JVM_OPTS=("$@") } eval splitJvmOpts $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS JVM_OPTS[${#JVM_OPTS[*]}]="-Dorg.gradle.appname=$APP_BASE_NAME" exec "$JAVACMD" "${JVM_OPTS[@]}" -classpath "$CLASSPATH" org.gradle.wrapper.GradleWrapperMain "$@" Halide-17.0.1/apps/HelloAndroid/gradlew.bat000066400000000000000000000045441456515664200204370ustar00rootroot00000000000000@if "%DEBUG%" == "" @echo off @rem ########################################################################## @rem @rem Gradle startup script for Windows @rem @rem 
########################################################################## @rem Set local scope for the variables with windows NT shell if "%OS%"=="Windows_NT" setlocal @rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. set DEFAULT_JVM_OPTS= set DIRNAME=%~dp0 if "%DIRNAME%" == "" set DIRNAME=. set APP_BASE_NAME=%~n0 set APP_HOME=%DIRNAME% @rem Find java.exe if defined JAVA_HOME goto findJavaFromJavaHome set JAVA_EXE=java.exe %JAVA_EXE% -version >NUL 2>&1 if "%ERRORLEVEL%" == "0" goto init echo. echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. echo. echo Please set the JAVA_HOME variable in your environment to match the echo location of your Java installation. goto fail :findJavaFromJavaHome set JAVA_HOME=%JAVA_HOME:"=% set JAVA_EXE=%JAVA_HOME%/bin/java.exe if exist "%JAVA_EXE%" goto init echo. echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME% echo. echo Please set the JAVA_HOME variable in your environment to match the echo location of your Java installation. goto fail :init @rem Get command-line arguments, handling Windowz variants if not "%OS%" == "Windows_NT" goto win9xME_args if "%@eval[2+2]" == "4" goto 4NT_args :win9xME_args @rem Slurp the command line arguments. set CMD_LINE_ARGS= set _SKIP=2 :win9xME_args_slurp if "x%~1" == "x" goto execute set CMD_LINE_ARGS=%* goto execute :4NT_args @rem Get arguments from the 4NT Shell from JP Software set CMD_LINE_ARGS=%$ :execute @rem Setup the command line set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar @rem Execute Gradle "%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %CMD_LINE_ARGS% :end @rem End local scope for the variables with windows NT shell if "%ERRORLEVEL%"=="0" goto mainEnd :fail rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of rem the _cmd.exe /c_ return code! 
if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1 exit /b 1 :mainEnd if "%OS%"=="Windows_NT" endlocal :omega Halide-17.0.1/apps/HelloAndroid/jni/000077500000000000000000000000001456515664200170735ustar00rootroot00000000000000Halide-17.0.1/apps/HelloAndroid/jni/Android.mk000066400000000000000000000010251456515664200210020ustar00rootroot00000000000000LOCAL_PATH := $(call my-dir) include $(CLEAR_VARS) LOCAL_MODULE := HelloAndroid LOCAL_ARM_MODE := arm LOCAL_SRC_FILES := hello_wrapper.cpp LOCAL_LDFLAGS := -L$(LOCAL_PATH)/../jni LOCAL_LDLIBS := -lm -llog -landroid $(LOCAL_PATH)/../bin/$(TARGET_ARCH_ABI)/hello.a LOCAL_STATIC_LIBRARIES := android_native_app_glue LOCAL_C_INCLUDES := $(LOCAL_PATH)/../../../include $(LOCAL_PATH)/../../../build/include $(LOCAL_PATH)/../bin/$(TARGET_ARCH_ABI)/ include $(BUILD_SHARED_LIBRARY) $(call import-module,android/native_app_glue) Halide-17.0.1/apps/HelloAndroid/jni/Application.mk000066400000000000000000000002031456515664200216620ustar00rootroot00000000000000APP_ABI := armeabi armeabi-v7a arm64-v8a x86_64 x86 APP_PLATFORM := android-17 APP_STL := gnustl_static APP_CPPFLAGS := -std=c++17 Halide-17.0.1/apps/HelloAndroid/jni/hello_generator.cpp000066400000000000000000000025111456515664200227470ustar00rootroot00000000000000#include "Halide.h" using namespace Halide; namespace { class Hello : public Generator { public: Input> input{"input"}; Output> result{"result"}; void generate() { tone_curve(x) = cast(pow(cast(x) / 256.0f, 1.8f) * 256.0f); Func clamped = BoundaryConditions::repeat_edge(input); curved(x, y) = tone_curve(clamped(x, y)); Func sharper; sharper(x, y) = 9 * curved(x, y) - 2 * (curved(x - 1, y) + curved(x + 1, y) + curved(x, y - 1) + curved(x, y + 1)); result(x, y) = cast(clamp(sharper(x, y), 0, 255)); } void schedule() { Var yi; tone_curve.compute_root(); result.split(y, y, yi, 60).vectorize(x, 8).parallel(y); curved.store_at(result, y).compute_at(result, yi); // We want to handle inputs that may be rotated 180 due to camera module placement. // Unset the default stride constraint input.dim(0).set_stride(Expr()); // Make specialized versions for input stride +/-1 to get dense vector loads curved.specialize(input.dim(0).stride() == 1); curved.specialize(input.dim(0).stride() == -1); } private: Var x{"x"}, y{"y"}; Func tone_curve, curved; }; } // namespace HALIDE_REGISTER_GENERATOR(Hello, hello) Halide-17.0.1/apps/HelloAndroid/jni/hello_wrapper.cpp000066400000000000000000000076171456515664200224550ustar00rootroot00000000000000#include #include #include #include #include #include #include #include "HalideRuntime.h" #include "HalideRuntimeOpenCL.h" #include "hello.h" #define LOGD(...) __android_log_print(ANDROID_LOG_DEBUG, "halide_native", __VA_ARGS__) #define LOGE(...) 
__android_log_print(ANDROID_LOG_ERROR, "halide_native", __VA_ARGS__) #define DEBUG 1 extern "C" int halide_host_cpu_count(); extern "C" int halide_start_clock(void *user_context); extern "C" int64_t halide_current_time_ns(); void handler(void * /* user_context */, const char *msg) { LOGE("%s", msg); } extern "C" { JNIEXPORT void JNICALL Java_com_example_hellohalide_CameraPreview_processFrame( JNIEnv *env, jobject obj, jbyteArray jSrc, jint j_w, jint j_h, jint j_orientation, jobject surf) { const int w = j_w, h = j_h, orientation = j_orientation; halide_start_clock(NULL); halide_set_error_handler(handler); unsigned char *src = (unsigned char *)env->GetByteArrayElements(jSrc, NULL); if (!src) { LOGD("src is null\n"); return; } LOGD("[output window size] j_w = %d, j_h = %d", j_w, j_h); LOGD("[src array length] jSrc.length = %d", env->GetArrayLength(jSrc)); ANativeWindow *win = ANativeWindow_fromSurface(env, surf); static bool first_call = true; static unsigned counter = 0; static unsigned times[16]; if (first_call) { LOGD("According to Halide, host system has %d cpus\n", halide_host_cpu_count()); LOGD("Resetting buffer format"); ANativeWindow_setBuffersGeometry(win, w, h, 0); first_call = false; for (int t = 0; t < 16; t++) { times[t] = 0; } } ANativeWindow_Buffer buf; ARect rect = {0, 0, w, h}; if (int err = ANativeWindow_lock(win, &buf, NULL)) { LOGD("ANativeWindow_lock failed with error code %d\n", err); return; } uint8_t *dst = (uint8_t *)buf.bits; // If we're using opencl, use the gpu backend for it. #if COMPILING_FOR_OPENCL halide_opencl_set_device_type("gpu"); #endif // Make these static so that we can reuse device allocations across frames. static halide_buffer_t srcBuf = {0}; static halide_dimension_t srcDim[2]; static halide_buffer_t dstBuf = {0}; static halide_dimension_t dstDim[2]; if (dst) { srcBuf.host = (uint8_t *)src; srcBuf.set_host_dirty(); srcBuf.dim = srcDim; srcBuf.dim[0].min = 0; srcBuf.dim[0].extent = w; srcBuf.dim[0].stride = 1; srcBuf.dim[1].min = 0; srcBuf.dim[1].extent = h; srcBuf.dim[1].stride = w; srcBuf.type = halide_type_of(); if (orientation >= 180) { // Camera sensor is probably upside down (e.g. Nexus 5x) srcBuf.host += w * h - 1; srcBuf.dim[0].stride = -1; srcBuf.dim[1].stride = -w; } dstBuf.host = dst; dstBuf.dim = dstDim; dstBuf.dim[0].min = 0; dstBuf.dim[0].extent = w; dstBuf.dim[0].stride = 1; dstBuf.dim[1].min = 0; dstBuf.dim[1].extent = h; dstBuf.dim[1].stride = w; dstBuf.type = halide_type_of(); // Just set chroma to gray. 
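        // In the YV12 layout used for this output buffer, a full-resolution
        // luma (Y) plane of w * h bytes is followed by quarter-resolution V
        // and U chroma planes totalling (w * h) / 2 bytes. Writing 128 (the
        // neutral chroma value) to that region produces a grayscale image;
        // the Halide pipeline below fills in only the luma plane. This
        // assumes the chroma planes are tightly packed with no row padding.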
memset(dst + w * h, 128, (w * h) / 2); int64_t t1 = halide_current_time_ns(); hello(&srcBuf, &dstBuf); halide_copy_to_host(NULL, &dstBuf); int64_t t2 = halide_current_time_ns(); unsigned elapsed_us = (t2 - t1) / 1000; times[counter & 15] = elapsed_us; counter++; unsigned min = times[0]; for (int i = 1; i < 16; i++) { if (times[i] < min) min = times[i]; } LOGD("Time taken: %d (%d)", elapsed_us, min); } ANativeWindow_unlockAndPost(win); ANativeWindow_release(win); env->ReleaseByteArrayElements(jSrc, (jbyte *)src, 0); } } Halide-17.0.1/apps/HelloAndroid/res/000077500000000000000000000000001456515664200171045ustar00rootroot00000000000000Halide-17.0.1/apps/HelloAndroid/res/drawable-hdpi/000077500000000000000000000000001456515664200216075ustar00rootroot00000000000000Halide-17.0.1/apps/HelloAndroid/res/drawable-hdpi/ic_launcher.png000066400000000000000000000222651456515664200246000ustar00rootroot00000000000000PNG  IHDRHHUGbKGDC pHYsgR vpAgHHy#IDATx͜y%Uuk:s瞛dFGDE(јhL$*O4qbFx*4Dno\U{:Ksڿ[kPW8LJl.zw8}j_ǘo,\; }s^!yC29VA p8:d1thf/ +Žp/%!:y*ܲrN]|<˨8*b!]zc9%9(!8 `ĝ{\s6)/QO6ů ):t6*.Yelz cܰÿ 9ÒP} |ճl" ӽGX?N֞!nذu_4mh0^w'eg1xu{{MyToFX0}r;V:4IIҢ:̑qA^x^֦ЀEś cå[ݚS$ԆRqїPVO>X0c)C=ass8zH5#Ǔ C"-3r2urO\NqNq4g<:d4Ż`3#H?z]L}ICԗtH;;_*5= vw~ؚrv-Y'ymYF:GW>Ōr<C1U*^^.x 8qyx,k/?aK^h–59OnAJw޻od2eԡ3^ c.=4} = u{ا>7nಛ_RCq 4fz/QG6xcWw|Q>v jm4f oh6ֱc*<ˡ#Ya, t/?=;6\QwuPA =5<թGj/\ &PRM,V8YY' mPb>sa^dJrjρb`SV%3SS1"qHge_%ë#rtHb6>C"%ƺ+Ũ@0bQT1R  Jfy`J9SvRݻ-s{‹Tb1HFq:ǣؾQf;S^rgt>vdDY2j2JCT3`2IT#щiFHĘ",-`=:@h WB'i(WZ|!ry8Xw^X%2)M2Djiǭ[w<][o|O/;+uytiTES*jVq25<ɡ.gZDv&l;7nS{vN>64Zr~懧?/rpxm/pj2{F ;E)Ҟ;ro[sw>Դ[j F>pWZ.aeh8R5DF̀2EyB2syJ :N˓ y'ѴY\w=oe}_@O[lɒhvlo>@{ffj{z[_ \[fR7xF]ԵiUss{]Z~ O|WrZ;&|O;o,Mc&%Ms(*5KeHCSY`!N,Qb |Lu#O=iWI;<<X$KI4z_?Y+\IIjBi֋UJj]_&ccd#Bt^gxe  yiεS$wlMY&$mwhvh{{zLh洛]fxҞ'K 훍WR0X! IxUj uhhxDnHcvl_?~CXQYQiL:˱{[VuO\6Ϫ)6M`IQ)ҞٜmڿOy=7RH c˄kǰ8q\! A^.RFr|^#N_:"Fw,dianvnuu݊>)gBReEu 8!xBe_ +&bDئRLi!Pf1@] `D-W>YE88B\$ (Oy/ j_ a9[0Ct E҇ξ B)WOkYLBCELEZ!A\ F .Kiu2fz(f7`R@x1a^ji4բL#Ì }>* DDv3 (xny2J;:WF *Hg1rץvWr!q3>^M#[ SMF 9ZѺ od { Z %t PBՀu11UD"XgA]oi_%i>@Yj!C_вXs:l6%6k}E"@|g W<17=}C+Y6PYFC=ThufxUG{W+,B8ƇWs֩M/a{Q*[V8fp} (q/-H/9t|o T'N/sA,.g͒S|?oZn-j= p?G(} +yward=Y3)(Fbl{{a2V`M5|[_%W?Kؽ#kcT[Hp͋ڟ_oaQ.zh?zW.bffc?=B(ɛ.c˄/|ɵxJϧ (憆x^̣XDAUNG1e*i-Ņ,h 4nB>8sW5_y^}'.]yFuEQ5Z]Kva^Ji&?@CL6`D;jq@S\y0OY)bPOdyyI7/ Gu<_8g/cE`qdk鳧>@L[lkb)L̈)^qAK] O2}VR`lwǯ O>eO 9YOHq?Ed^Kž-"T.@Uy{HlْP Y`2}3!/GY}0 g\j!0ɾߑiqFĠ0!M{upoyyFР!_k":|F 1*k2}Ml@Bhx~n\nS֐M_U~AthP@uPT#E!?@c)l`>"lXѲ8W5{~+)/U6J4DZ"o"5h*ƈ.d"'].y| fP;R>ۏ:s)TBGCH?|fvE13*]. %wiH[5&$ B[T6&BY5[WG5dT,Z"@ig,p |(NQtE;X=x_&BPW#\+Uե+\h[}W`Mҕl2굱R 0ZuAl~UQ6 FE>$5~tW"Q)hgC1+eXk-i>྇nV.94H|ˎ/y| LTC xO:}sJ'>"gW cD_wE_|L3slwhc֫Xl:N˯:߿Gt6DffJgYUiT]ZIK'kII6[규LR5z~߹fO.~Q :=jahueH~̻=ZXlE, sC~;p/U缛c:Pwz)14M׫~vϊ:l=W|kj,?/:F Y&,m\;hZY)Ǯ8'\w WV кpQcuP}Æ%+O$~h<δ]h^淿kt~~/Wu,g՚_<}9# +ʿ_g>s+C(lcTBTBT1V%o*Cg'#M=pûmڣD DWp$z#?fӃYrlmټG6O{A'?XӖgm|~^uUf-vըL1c1U֬[Ay~ix$I&|۵w?7rO^1/,/ueE UKaHM:E_?훭:љ㧵2@{.gbu̷>;?O }%/ᡇ 2vwegbWr_]lf]'ѹyKogv8ԕ8 _AMLttdesNh0Z?mC7HDŽ~!H1,_>$f;K5KX|1mk)̪ X42N|vvεv~Zׯ-[X,YfIz:8 ^r1SK&cػgF4n}Ulq5jf"+&P ?a|]OёtyK?P T|T 339"MdI%:TRIVjT*NJn5ƨ6N{yyI夽NGӣlS12Ku G.x%å#Z' /!X0=4XWjDLJz@SZ=OcsM5K!oͥ?i|kGڽMg|o]UBYQamAJ$tX 섇GC1'~XO聺y~D2qP=f%4juXDҘ% #cpLm(RIV&$V* **CjJ-"J,H`gk`z[%+oogPx9_K_5^I*Zp%J(YEu)´ `id];.wе "jm1B0! 
[binary PNG data omitted]
Halide-17.0.1/apps/HelloAndroid/res/drawable-ldpi/ic_launcher.png [binary PNG data omitted]
Halide-17.0.1/apps/HelloAndroid/res/drawable-mdpi/ic_launcher.png [binary PNG data omitted]
Halide-17.0.1/apps/HelloAndroid/res/drawable-xhdpi/ic_launcher.png [binary PNG data omitted]
im?\}/o~$H' ** )}F2 `N۪+]SpK"T7]J|`7+U"Ö;DT)*zz=DP,B_GET!9[W86zu*bb9BXQ,(HB>u3;4k<7w9mvۿ-ڃ{[ȄxfH%;\:f XyфH Halide-17.0.1/apps/HelloAndroid/res/values/000077500000000000000000000000001456515664200204035ustar00rootroot00000000000000Halide-17.0.1/apps/HelloAndroid/res/values/strings.xml000066400000000000000000000001611456515664200226140ustar00rootroot00000000000000 HelloHalide Halide-17.0.1/apps/HelloAndroid/src/000077500000000000000000000000001456515664200171025ustar00rootroot00000000000000Halide-17.0.1/apps/HelloAndroid/src/com/000077500000000000000000000000001456515664200176605ustar00rootroot00000000000000Halide-17.0.1/apps/HelloAndroid/src/com/example/000077500000000000000000000000001456515664200213135ustar00rootroot00000000000000Halide-17.0.1/apps/HelloAndroid/src/com/example/hellohalide/000077500000000000000000000000001456515664200235655ustar00rootroot00000000000000Halide-17.0.1/apps/HelloAndroid/src/com/example/hellohalide/CameraActivity.java000066400000000000000000000030031456515664200273310ustar00rootroot00000000000000package com.example.hellohalide; import android.app.Activity; import android.os.Bundle; import android.hardware.Camera; import android.util.Log; import android.widget.FrameLayout; import android.view.SurfaceView; public class CameraActivity extends Activity { private static final String TAG = "CameraActivity"; private Camera camera; private CameraPreview preview; private SurfaceView filtered; public static Camera getCameraInstance() { Camera c = null; try { c = Camera.open(); } catch (Exception e) { Log.d(TAG, "Could not open camera"); } return c; } @Override public void onCreate(Bundle b) { super.onCreate(b); setContentView(R.layout.main); // Create a canvas for drawing stuff on filtered = new SurfaceView(this); // Create our Preview view and set it as the content of our activity. preview = new CameraPreview(this, filtered); FrameLayout layout = (FrameLayout) findViewById(R.id.camera_preview); layout.addView(preview); layout.addView(filtered); filtered.setZOrderOnTop(true); } @Override public void onResume() { super.onResume(); camera = getCameraInstance(); preview.setCamera(camera); } @Override public void onPause() { super.onPause(); if (camera != null) { preview.setCamera(null); camera.release(); camera = null; } } }Halide-17.0.1/apps/HelloAndroid/src/com/example/hellohalide/CameraPreview.java000066400000000000000000000100651456515664200271640ustar00rootroot00000000000000package com.example.hellohalide; import android.hardware.Camera; import android.util.Log; import java.io.IOException; import android.view.SurfaceHolder; import android.view.SurfaceView; import android.view.Surface; import android.content.Context; import android.graphics.Canvas; import android.graphics.ImageFormat; /** A basic Camera preview class */ public class CameraPreview extends SurfaceView implements SurfaceHolder.Callback, Camera.PreviewCallback { private static final String TAG = "CameraPreview"; private Camera mCamera; private SurfaceView mFiltered; private byte[] mPreviewData; private int mCameraOrientation; // Link to native Halide code static { System.loadLibrary("HelloAndroid"); } private static native void processFrame(byte[] src, int w, int h, int orientation, Surface dst); public CameraPreview(Context context, SurfaceView filtered) { super(context); mFiltered = filtered; mFiltered.getHolder().setFormat(ImageFormat.YV12); mPreviewData = null; // Install a SurfaceHolder.Callback so we get notified when the // underlying surface is created and destroyed. 
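        // The surfaceCreated/surfaceChanged/surfaceDestroyed callbacks below
        // start, reconfigure, and stop the camera preview. The separate
        // "filtered" SurfaceView was set to ImageFormat.YV12 above so that
        // the native processFrame() can write Halide output into it directly
        // through the ANativeWindow API.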
getHolder().addCallback(this); } public void onPreviewFrame(byte[] data, Camera camera) { if (camera != mCamera) { Log.d(TAG, "Unknown Camera!"); return; } if (mFiltered.getHolder().getSurface().isValid()) { Camera.Size s = camera.getParameters().getPreviewSize(); processFrame(data, s.width, s.height, mCameraOrientation, mFiltered.getHolder().getSurface()); } else { Log.d(TAG, "Invalid Surface!"); } // re-enqueue this buffer camera.addCallbackBuffer(data); } private void startPreview(SurfaceHolder holder) { if (mCamera == null) { return; } try { configureCamera(); mCamera.setPreviewCallbackWithBuffer(this); mCamera.setPreviewDisplay(holder); mCamera.startPreview(); } catch (Exception e){ Log.d(TAG, "Error starting camera preview: " + e.getMessage()); } } private void stopPreview() { if (mCamera == null) { return; } try { mCamera.stopPreview(); } catch (Exception e){ // ignore: tried to stop a non-existent preview Log.d(TAG, "tried to stop a non-existent preview"); } } private void configureCamera() { Camera.Parameters p = mCamera.getParameters(); Camera.Size s = p.getPreviewSize(); Log.d(TAG, "Camera Preview Size: " + s.width + "x" + s.height); p.setPreviewFormat(ImageFormat.YV12); if (mPreviewData == null) { int stride = ((s.width + 15) / 16) * 16; int y_size = stride * s.height; int c_stride = ((stride/2 + 15) / 16) * 16; int c_size = c_stride * s.height/2; int size = y_size + c_size * 2; mPreviewData = new byte[size]; } mCamera.addCallbackBuffer(mPreviewData); mCamera.setParameters(p); } public void surfaceCreated(SurfaceHolder holder) { Log.d(TAG, "surfaceCreated"); startPreview(holder); } public void surfaceDestroyed(SurfaceHolder holder) { Log.d(TAG, "surfaceDestroyed"); stopPreview(); } public void surfaceChanged(SurfaceHolder holder, int format, int w, int h) { Log.d(TAG, "surfaceChanged"); stopPreview(); configureCamera(); startPreview(holder); } public void setCamera(Camera c) { if (mCamera != null) { mCamera.stopPreview(); } mCamera = c; android.hardware.Camera.CameraInfo info = new android.hardware.Camera.CameraInfo(); // Assume that we opened camera 0 android.hardware.Camera.getCameraInfo(0, info); mCameraOrientation = info.orientation; if (mCamera != null) { startPreview(getHolder()); } } } Halide-17.0.1/apps/HelloAndroid/src/com/example/hellohalide/FrameHandler.java000066400000000000000000000004711456515664200267620ustar00rootroot00000000000000package com.example.hellohalide; import android.hardware.Camera; import android.util.Log; public class FrameHandler implements Camera.PreviewCallback { private static final String TAG = "FrameHandler"; public void onPreviewFrame(byte[] data, Camera camera) { Log.d(TAG, "Got a frame!"); } }Halide-17.0.1/apps/HelloAndroidCamera2/000077500000000000000000000000001456515664200175065ustar00rootroot00000000000000Halide-17.0.1/apps/HelloAndroidCamera2/.gitignore000066400000000000000000000001501456515664200214720ustar00rootroot00000000000000.gradle/** gen/** gradle_build/** *.iml local.properties obj/** proguard-project.txt project.properties Halide-17.0.1/apps/HelloAndroidCamera2/AndroidManifest.xml000066400000000000000000000027741456515664200233110ustar00rootroot00000000000000 Halide-17.0.1/apps/HelloAndroidCamera2/README.md000066400000000000000000000077661456515664200210050ustar00rootroot00000000000000HelloAndroidCamera2 is a simple application which uses Halide to process images streamed from the Android camera2 API. 
It reads every frame into the CPU via an ImageReader and uses Halide to either blit the frame to the output surface (converting between YUV formats), or apply an edge detector on the luma channel.

This example requires a phone or tablet that supports the camera2 API (Android API level 21 or above). This sample has been tested on Nexus 5, Nexus 6 and Nexus 9.

CAVEAT: This example uses the not-so-well-documented ANativeWindow C API to directly write into the graphics buffers that back the Java "Surface" and "SurfaceView" classes. In particular, we rely on the YV12 format and use the ANativeWindow API to "reconfigure" buffers so that they do not have to match the resolution of the display. This exploits the hardware scaler to resample the displayed image. However, although camera2 reports a set of supported resolutions for ImageReader, there is no such enumeration for the display. On untested devices, chooseOptimalSize() may return a camera resolution for which there is no matching graphics resolution. This will lead to a green screen with a logcat error message that looks something like:

E/halide_native( 6146): ANativeWindow buffer locked but its size was 1920 x 1440, expected 1440 x 1080

This application builds for multiple native ABIs. (At present armeabi, armeabi-v7a, arm64-v8a, x86_64, and x86 are supported.) Halide code is generated for each architecture.

This build is meant to use the Android command line tools. (An IDE is not required.) In order to build, the following will be required:

- Android NDK -- This can be downloaded here: https://developer.android.com/tools/sdk/ndk/index.html After installing, make sure the top-level directory of the install is in the PATH. (It should contain an executable ndk-build file.)

- Android SDK -- This can be downloaded here: http://developer.android.com/sdk/index.html The standalone SDK is desired. Once downloaded, the "android" program in the tools directory of the install will need to be run. It should bring up a UI allowing one to choose components to install. HelloAndroidCamera2 currently depends on the android-21 release. Make sure the tools directory is on one's PATH.

- Apache Ant -- This can be downloaded here: http://ant.apache.org/bindownload.cgi Make sure the bin directory is on one's PATH.

If everything is set up correctly, running the build.sh script in this directory, with the current directory set to here, should build the HelloAndroidCamera2 apk and install it on a connected Android device.

# Gradle

To use Gradle, create a local.properties file in this folder with sdk.dir and ndk.dir variables defined like so:

```
sdk.dir=/Users/joe/Downloads/android-sdk
ndk.dir=/Users/joe/Downloads/android-ndk
```

After that, run `gradlew build`, which will produce an .apk file ready for deployment to the Android device. On Linux/Mac you can use `build-gradle.sh` to build, deploy and run this sample application.

Pay attention to the list of platforms supported by your Halide installation. They are listed in the jni/Application.mk APP_ABI variable and in the build.gradle archs map. For example, if your Halide installation was built without arm64-v8a, remove it from APP_ABI and archs. Both the list and the map should match; otherwise you will get compilation errors complaining about a missing halide_generated.h file:

```
:ndkBuild FAILED
FAILURE: Build failed with an exception.
* What went wrong:
Execution failed for task ':ndkBuild'.
...
Output: /private/tmp/7/halide/apps/HelloAndroidCamera2/jni/native.cpp:11:26: fatal error: deinterleave.h: No such file or directory #include "deinterleave.h" ``` # Android Studio To load project into Android Studio use "File/Import Project..." in Android Studio and point to apps/HelloAndroidCamera2/build.gradle file. You will have to edit automatically-generated local.properties file to add ndk.dir property so it points to your Android NDK installation as described in Gradle section above. Halide-17.0.1/apps/HelloAndroidCamera2/ant.properties000066400000000000000000000013251456515664200224070ustar00rootroot00000000000000# This file is used to override default values used by the Ant build system. # # This file must be checked into Version Control Systems, as it is # integral to the build system of your project. # This file is only used by the Ant script. # You can use this to override default values such as # 'source.dir' for the location of your java source folder and # 'out.dir' for the location of your output folder. # You can also use it define how the release builds are signed by declaring # the following properties: # 'key.store' for the location of your keystore and # 'key.alias' for the name of the key to use. # The password will be asked during the build when you use the 'release' target. java.source=7 java.target=7Halide-17.0.1/apps/HelloAndroidCamera2/build-gradle.sh000077500000000000000000000010101456515664200223700ustar00rootroot00000000000000#!/bin/bash # Gradle needs to know where the NDK is. # The easiest way is to set the ANDROID_NDK_HOME environment variable. # Otherwise, set ndk.dir in local.properties (even though the file itself says # that it's only used by ant). # However, if you run "android update" (say, via build.sh), this variable will # be clobbered. ./gradlew build && adb install -r gradle_build/outputs/apk/HelloAndroidCamera2-debug.apk && adb shell am start com.example.helloandroidcamera2/com.example.helloandroidcamera2.CameraActivity Halide-17.0.1/apps/HelloAndroidCamera2/build.gradle000066400000000000000000000132601456515664200217670ustar00rootroot00000000000000import org.apache.tools.ant.taskdefs.condition.Os // Avoid conflicts with Bazel on case-insensitive filesystems buildDir = 'gradle_build' repositories { jcenter() } buildscript { repositories { jcenter() } dependencies { classpath 'com.android.tools.build:gradle:1.2.2' } } //////////////////////////////////////////////////////////////////////////////// // Use gradle's native C++ plugin to build the Halide generator. // // sources: defines all the C++ source files. We only have one SourceSet called // halide_generator. // // executables: we only make one binary called halide_generator. Here is where // we pass compiler and linker flags. // // binaries.withType: binaries is a collection, which in our case is just the // halide_generator executable. withType() filters the collection by type. // binary is the iteration variable. -> defines the body of the lambda: // for each binary: // for each halide_target / Android ABI mapping: // for each generator: // run the generator with -g and target set // make the later ndkBuild task depend on this task. 
apply plugin: "cpp" sources { halide_generator { cpp(CppSourceSet) { source { srcDirs "jni/" include "deinterleave_generator.cpp" include "edge_detect_generator.cpp" } source { srcDirs "../../tools" include "GenGen.cpp" } } } } executables { halide_generator { binaries.all { cppCompiler.args "-std=c++17", "-g", "-Wall", "-fno-rtti", "-I", "${projectDir}/../../include", "-I", "${projectDir}/../../build/include" // "/bin" assumes Makefile build for Halide; "/build/lib" assumes CMake build linker.args "-lHalide", "-ldl", "-lpthread", "-lz", "-L", "${projectDir}/../../bin", "-L", "${projectDir}/../../build/lib" } } } binaries.withType(NativeExecutableBinary) { binary -> def bin = "${projectDir}/bin" def linkTask = binary.tasks.link println "linktask output file is " + linkTask.outputFile Map archs = [ // armeabi and armeabi-v7a are the same as far as Halide is concerned "armeabi": "arm-32-android", "armeabi-v7a": "arm-32-android", "arm64-v8a": "arm-64-android", "x86_64": "x86-64-android-sse41", "x86": "x86-32-android" ] def generators = ["deinterleave", "edge_detect"] archs.each { arch -> println "creating task for: " + arch.key + " -> " + arch.value def android_abi = arch.key def hl_target = arch.value def task_name = "generate_halide_binary_${binary.name.capitalize()}_${android_abi}" def destDir = new File(bin, "${android_abi}") def generateHalideTask = task(task_name) { dependsOn linkTask doFirst { println "Executing: " + linkTask.outputFile + " ..." destDir.mkdirs() def envVars = [ "DYLD_LIBRARY_PATH=${projectDir}/../../bin", "LD_LIBRARY_PATH=${projectDir}/../../bin" ] generators.each { generator -> def proc = [linkTask.outputFile, "-g", generator, "-o", ".", "target=$hl_target"] .execute(envVars, destDir) proc.waitFor() if (proc.exitValue() != 0) { println "return code: ${proc.exitValue()}" println "stderr: ${proc.err.text}" println "stdout: ${proc.in.text}" } } } } // Call this task generateHalideTask. binary.builtBy generateHalideTask // Tell gradle that the task called "ndkBuild" below depends // on generateHalideTask. ndkBuild.dependsOn generateHalideTask } println "done with archs" } //////////////////////////////////////////////////////////////////////////////// apply plugin: 'com.android.application' android { compileSdkVersion 21 buildToolsVersion "21.1.2" defaultConfig { applicationId "com.example.helloandroidcamera2" minSdkVersion 21 targetSdkVersion 21 versionCode 1 versionName "1.0" } compileOptions { sourceCompatibility JavaVersion.VERSION_1_7 targetCompatibility JavaVersion.VERSION_1_7 } sourceSets { main { java.srcDirs = ["src/"] // Setting jni.srcDirs to [] disables the automatic ndk-build call // which would use parameters defined in build.gradle. Use our own // task (ndkBuild) below. jni.srcDirs = [] jniLibs.srcDirs = ["bin/lib/"] // default is src/main/jniLibs manifest.srcFile "AndroidManifest.xml" res.srcDirs = ["res/"] // default is src/main/res } } // Call regular ndk-build (ndk-build.cmd on Windows) script from // app directory. 
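    // The NDK_GEN_OUT/NDK_LIBS_OUT/NDK_OUT arguments below redirect
    // ndk-build's generated files, intermediate objects, and final .so
    // libraries into ./bin/, so the Java build can pick the libraries up
    // from bin/lib/ (see jniLibs.srcDirs above).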
task ndkBuild(type: Exec) { def ndkDir = project.android.ndkDirectory def ndkBuildCmd = "" if (Os.isFamily(Os.FAMILY_WINDOWS)) { ndkBuildCmd = "ndk-build.cmd" } else { ndkBuildCmd = "ndk-build" } commandLine "$ndkDir/$ndkBuildCmd", "NDK_GEN_OUT=./bin/gen", "NDK_LIBS_OUT=./bin/lib", "NDK_OUT=./bin/obj" } tasks.withType(JavaCompile) { compileTask -> compileTask.dependsOn ndkBuild } buildTypes { release { minifyEnabled false proguardFiles getDefaultProguardFile('proguard-android.txt'), 'proguard-rules.pro' } } } task wrapper(type: Wrapper) { gradleVersion = '2.2' } Halide-17.0.1/apps/HelloAndroidCamera2/build.sh000077500000000000000000000023551456515664200211510ustar00rootroot00000000000000#!/bin/bash set -e android update project -p . --subprojects --target android-21 if [ -z "$ANDROID_NDK_HOME" ]; then echo "Set ANDROID_NDK_HOME to point to your android ndk root directory" exit 1 fi mkdir -p bin c++ jni/edge_detect_generator.cpp ../../tools/GenGen.cpp \ -g -fno-rtti -Wall -std=c++17 \ -I ../../include -I ../../build/include \ -L ../../bin -lHalide -ldl -lpthread -lz \ -o bin/edge_detect_generator c++ jni/deinterleave_generator.cpp ../../tools/GenGen.cpp \ -g -fno-rtti -Wall -std=c++17 \ -I ../../include -I ../../build/include \ -L ../../bin -lHalide -ldl -lpthread -lz \ -o bin/deinterleave_generator for archs in arm-32-android,armeabi arm-32-android-armv7s,armeabi-v7a arm-64-android,arm64-v8a x86-64-android-sse41,x86_64 x86-32-android,x86 ; do IFS=, set $archs HL_TARGET=$1 ANDROID_ABI=$2 mkdir -p bin/$ANDROID_ABI ./bin/edge_detect_generator -g edge_detect -o bin/$ANDROID_ABI target=$HL_TARGET ./bin/deinterleave_generator -g deinterleave -o bin/$ANDROID_ABI target=$HL_TARGET unset IFS done ${ANDROID_NDK_HOME}/ndk-build NDK_GEN_OUT=./bin/gen NDK_LIBS_OUT=./bin/lib NDK_OUT=./bin/obj ant debug adb install -r bin/HelloAndroidCamera2-debug.apk adb logcat Halide-17.0.1/apps/HelloAndroidCamera2/build.xml000066400000000000000000000076371456515664200213440ustar00rootroot00000000000000 Halide-17.0.1/apps/HelloAndroidCamera2/gradle/000077500000000000000000000000001456515664200207445ustar00rootroot00000000000000Halide-17.0.1/apps/HelloAndroidCamera2/gradle/wrapper/000077500000000000000000000000001456515664200224245ustar00rootroot00000000000000Halide-17.0.1/apps/HelloAndroidCamera2/gradle/wrapper/gradle-wrapper.jar000066400000000000000000001435121456515664200260440ustar00rootroot00000000000000PK UxE META-INF/PK UxE{MAVMETA-INF/MANIFEST.MFMLK-. K-*ϳR03-IM+I, dZ)%bµrrPK UxEorg/PK UxE org/gradle/PK UxEorg/gradle/wrapper/PK UxEhdf#org/gradle/wrapper/Download$1.class}M 0h5Z+v/ ׆p!.3̳{?~~&(P0MHa3e2&p lÐ|e;D-l ׽C!C"v=lrKOx RhO]!'"՞@yMB` !k>"APݶ-_}ɻDu_yks~ r[=*ek€a)? rg'U ewRĎw s'⥧Ǔ9JZ Y >HH,θ1Ppt1prUNN!;$ i}On->+ſC Of$#5;#*uJID)6j -5}+kY_1}h璥>C0EZQl\!@1JQ!NbN)R_p槩r'GڸS6[Kn0֢\V7pM^E\dMPK UxEXs"org/gradle/wrapper/IDownload.classE 0  ^b AP^26J;t>;ɗ|{z~+%5O&WΔ(a_4[gR#!XbQVg={}1AYCX'R5c/J$S@pP\mKulPK UxEz\Q-org/gradle/wrapper/GradleUserHomeLookup.classS[OAF]R(j[[ZU˪T Od .dYlW$jj>G5=R+ȃɹws??~XªQx)I)`^F\F ṂzQFRhMK K [A*_ɮoANϖvӟtp854˰ZsM0ݍ+e錞K{zahӱa{jr⿅ >4fڦ?(06 %L7k}8e*)v0 DqZ5*>F]m4xqNuj}g'-mZ0Zjw䜦[b!ڋ3)UD0A\>yIA$Rf MxfFӴ*e]ӫxwԯ x wuH𘗽P`{!!}%nx/ q}Jhͮ0,މ=q@{,Qzii“G7 !8CH3 `_[(`+8$U)<$4OZd4}/z@:CYׅ"D "Vv I (&%꿮)[|SW/9s ,n%BrUv/PK UxE] 3org/gradle/wrapper/ExclusiveFileAccessManager.classVKpem&im!@!PlP(ӂbhZ 6]ݍMK} >u蝣2#-XGGǃ://߷6?  
UxEB=PN#dVgradle-wrapper-classpath.propertiesPK UxE qVbuild-receipt.propertiesPK UxEAWorg/gradle/cli/PK UxE<S1)Xorg/gradle/cli/AbstractCommandLineConverter.classPK UxE2_e(Zorg/gradle/cli/CommandLineParser$1.classPK UxERB <[org/gradle/cli/CommandLineParser$MissingOptionArgState.classPK UxEM2=^org/gradle/cli/CommandLineParser$OptionStringComparator.classPK UxE# GK1aorg/gradle/cli/CommandLineArgumentException.classPK UxE?h=$corg/gradle/cli/CommandLineParser$KnownOptionParserState.classPK UxEk7Fkorg/gradle/cli/CommandLineParser$OptionComparator.classPK UxEb'n?hnorg/gradle/cli/CommandLineParser$UnknownOptionParserState.classPK UxE"zZ &qorg/gradle/cli/CommandLineOption.classPK UxEl\ϧ8|worg/gradle/cli/CommandLineParser$OptionParserState.classPK UxE#4P*&yyorg/gradle/cli/ParsedCommandLine.classPK UxEA5l| :org/gradle/cli/ProjectPropertiesCommandLineConverter.classPK UxE2lWJForg/gradle/cli/CommandLineParser$CaseInsensitiveStringComparator.classPK UxE( ]UT*&iorg/gradle/cli/CommandLineParser.classPK UxE_>ң)3org/gradle/cli/CommandLineParser$AfterOptions.classPK UxEGf3org/gradle/cli/CommandLineParser$OptionString.classPK UxEx&T` ;؝org/gradle/cli/AbstractPropertiesCommandLineConverter.classPK UxE ,org/gradle/cli/ParsedCommandLineOption.classPK UxEs=org/gradle/cli/CommandLineParser$OptionAwareParserState.classPK UxE'H g)org/gradle/cli/CommandLineConverter.classPK UxEC| <org/gradle/cli/CommandLineParser$BeforeFirstSubCommand.classPK UxE;|9.org/gradle/cli/SystemPropertiesCommandLineConverter.classPK UxE-h2org/gradle/cli/CommandLineParser$ParserState.classPK UxEF= ;Xorg/gradle/cli/CommandLineParser$AfterFirstSubCommand.classPK UxE٥FDgradle-cli-classpath.propertiesPK00qHalide-17.0.1/apps/HelloAndroidCamera2/gradle/wrapper/gradle-wrapper.properties000066400000000000000000000003461456515664200274610ustar00rootroot00000000000000#Wed Jul 15 16:34:43 PDT 2015 distributionBase=GRADLE_USER_HOME distributionPath=wrapper/dists zipStoreBase=GRADLE_USER_HOME zipStorePath=wrapper/dists distributionUrl=https\://services.gradle.org/distributions/gradle-2.2-all.zip Halide-17.0.1/apps/HelloAndroidCamera2/gradlew000077500000000000000000000116551456515664200210710ustar00rootroot00000000000000#!/usr/bin/env bash ############################################################################## ## ## Gradle start up script for UN*X ## ############################################################################## # Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. DEFAULT_JVM_OPTS="" APP_NAME="Gradle" APP_BASE_NAME=`basename "$0"` # Use the maximum available, or set MAX_FD != -1 to use that value. MAX_FD="maximum" warn ( ) { echo "$*" } die ( ) { echo echo "$*" echo exit 1 } # OS specific support (must be 'true' or 'false'). cygwin=false darwin=false case "`uname`" in CYGWIN* ) cygwin=true ;; Darwin* ) darwin=true ;; esac # For Cygwin, ensure paths are in UNIX format before anything is touched. if $cygwin ; then [ -n "$JAVA_HOME" ] && JAVA_HOME=`cygpath --unix "$JAVA_HOME"` fi # Attempt to set APP_HOME # Resolve links: $0 may be a link PRG="$0" # Need this for relative symlinks. while [ -h "$PRG" ] ; do ls=`ls -ld "$PRG"` link=`expr "$ls" : '.*-> \(.*\)$'` if expr "$link" : '/.*' > /dev/null; then PRG="$link" else PRG=`dirname "$PRG"`"/$link" fi done SAVED="`pwd`" cd "`dirname \"$PRG\"`/" >&- APP_HOME="`pwd -P`" cd "$SAVED" >&- CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar # Determine the Java command to use to start the JVM. 
if [ -n "$JAVA_HOME" ] ; then if [ -x "$JAVA_HOME/jre/sh/java" ] ; then # IBM's JDK on AIX uses strange locations for the executables JAVACMD="$JAVA_HOME/jre/sh/java" else JAVACMD="$JAVA_HOME/bin/java" fi if [ ! -x "$JAVACMD" ] ; then die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME Please set the JAVA_HOME variable in your environment to match the location of your Java installation." fi else JAVACMD="java" which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. Please set the JAVA_HOME variable in your environment to match the location of your Java installation." fi # Increase the maximum file descriptors if we can. if [ "$cygwin" = "false" -a "$darwin" = "false" ] ; then MAX_FD_LIMIT=`ulimit -H -n` if [ $? -eq 0 ] ; then if [ "$MAX_FD" = "maximum" -o "$MAX_FD" = "max" ] ; then MAX_FD="$MAX_FD_LIMIT" fi ulimit -n $MAX_FD if [ $? -ne 0 ] ; then warn "Could not set maximum file descriptor limit: $MAX_FD" fi else warn "Could not query maximum file descriptor limit: $MAX_FD_LIMIT" fi fi # For Darwin, add options to specify how the application appears in the dock if $darwin; then GRADLE_OPTS="$GRADLE_OPTS \"-Xdock:name=$APP_NAME\" \"-Xdock:icon=$APP_HOME/media/gradle.icns\"" fi # For Cygwin, switch paths to Windows format before running java if $cygwin ; then APP_HOME=`cygpath --path --mixed "$APP_HOME"` CLASSPATH=`cygpath --path --mixed "$CLASSPATH"` # We build the pattern for arguments to be converted via cygpath ROOTDIRSRAW=`find -L / -maxdepth 1 -mindepth 1 -type d 2>/dev/null` SEP="" for dir in $ROOTDIRSRAW ; do ROOTDIRS="$ROOTDIRS$SEP$dir" SEP="|" done OURCYGPATTERN="(^($ROOTDIRS))" # Add a user-defined pattern to the cygpath arguments if [ "$GRADLE_CYGPATTERN" != "" ] ; then OURCYGPATTERN="$OURCYGPATTERN|($GRADLE_CYGPATTERN)" fi # Now convert the arguments - kludge to limit ourselves to /bin/sh i=0 for arg in "$@" ; do CHECK=`echo "$arg"|egrep -c "$OURCYGPATTERN" -` CHECK2=`echo "$arg"|egrep -c "^-"` ### Determine if an option if [ $CHECK -ne 0 ] && [ $CHECK2 -eq 0 ] ; then ### Added a condition eval `echo args$i`=`cygpath --path --ignore --mixed "$arg"` else eval `echo args$i`="\"$arg\"" fi i=$((i+1)) done case $i in (0) set -- ;; (1) set -- "$args0" ;; (2) set -- "$args0" "$args1" ;; (3) set -- "$args0" "$args1" "$args2" ;; (4) set -- "$args0" "$args1" "$args2" "$args3" ;; (5) set -- "$args0" "$args1" "$args2" "$args3" "$args4" ;; (6) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" ;; (7) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" ;; (8) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" ;; (9) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" "$args8" ;; esac fi # Split up the JVM_OPTS And GRADLE_OPTS values into an array, following the shell quoting and substitution rules function splitJvmOpts() { JVM_OPTS=("$@") } eval splitJvmOpts $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS JVM_OPTS[${#JVM_OPTS[*]}]="-Dorg.gradle.appname=$APP_BASE_NAME" exec "$JAVACMD" "${JVM_OPTS[@]}" -classpath "$CLASSPATH" org.gradle.wrapper.GradleWrapperMain "$@" Halide-17.0.1/apps/HelloAndroidCamera2/gradlew.bat000066400000000000000000000045441456515664200216320ustar00rootroot00000000000000@if "%DEBUG%" == "" @echo off @rem ########################################################################## @rem @rem Gradle startup script for Windows @rem @rem ########################################################################## @rem Set local 
scope for the variables with windows NT shell if "%OS%"=="Windows_NT" setlocal @rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. set DEFAULT_JVM_OPTS= set DIRNAME=%~dp0 if "%DIRNAME%" == "" set DIRNAME=. set APP_BASE_NAME=%~n0 set APP_HOME=%DIRNAME% @rem Find java.exe if defined JAVA_HOME goto findJavaFromJavaHome set JAVA_EXE=java.exe %JAVA_EXE% -version >NUL 2>&1 if "%ERRORLEVEL%" == "0" goto init echo. echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. echo. echo Please set the JAVA_HOME variable in your environment to match the echo location of your Java installation. goto fail :findJavaFromJavaHome set JAVA_HOME=%JAVA_HOME:"=% set JAVA_EXE=%JAVA_HOME%/bin/java.exe if exist "%JAVA_EXE%" goto init echo. echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME% echo. echo Please set the JAVA_HOME variable in your environment to match the echo location of your Java installation. goto fail :init @rem Get command-line arguments, handling Windowz variants if not "%OS%" == "Windows_NT" goto win9xME_args if "%@eval[2+2]" == "4" goto 4NT_args :win9xME_args @rem Slurp the command line arguments. set CMD_LINE_ARGS= set _SKIP=2 :win9xME_args_slurp if "x%~1" == "x" goto execute set CMD_LINE_ARGS=%* goto execute :4NT_args @rem Get arguments from the 4NT Shell from JP Software set CMD_LINE_ARGS=%$ :execute @rem Setup the command line set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar @rem Execute Gradle "%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %CMD_LINE_ARGS% :end @rem End local scope for the variables with windows NT shell if "%ERRORLEVEL%"=="0" goto mainEnd :fail rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of rem the _cmd.exe /c_ return code! if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1 exit /b 1 :mainEnd if "%OS%"=="Windows_NT" endlocal :omega Halide-17.0.1/apps/HelloAndroidCamera2/jni/000077500000000000000000000000001456515664200202665ustar00rootroot00000000000000Halide-17.0.1/apps/HelloAndroidCamera2/jni/Android.mk000066400000000000000000000013151456515664200221770ustar00rootroot00000000000000LOCAL_PATH := $(call my-dir) include $(CLEAR_VARS) LOCAL_MODULE := HelloAndroidCamera2 LOCAL_ARM_MODE := arm LOCAL_SRC_FILES := \ AndroidBufferUtilities.cpp \ HalideFilters.cpp \ LockedSurface.cpp \ YuvBufferT.cpp LOCAL_LDFLAGS := -L$(LOCAL_PATH)/../jni LOCAL_LDLIBS := -lm -llog -landroid -latomic LOCAL_LDLIBS += $(LOCAL_PATH)/../bin/$(TARGET_ARCH_ABI)/deinterleave.a LOCAL_LDLIBS += $(LOCAL_PATH)/../bin/$(TARGET_ARCH_ABI)/edge_detect.a LOCAL_STATIC_LIBRARIES := android_native_app_glue LOCAL_C_INCLUDES := $(LOCAL_PATH)/../../../include $(LOCAL_PATH)/../../../build/include $(LOCAL_PATH)/../bin/$(TARGET_ARCH_ABI)/ include $(BUILD_SHARED_LIBRARY) $(call import-module,android/native_app_glue) Halide-17.0.1/apps/HelloAndroidCamera2/jni/AndroidBufferUtilities.cpp000066400000000000000000000066341456515664200254110ustar00rootroot00000000000000#include "AndroidBufferUtilities.h" #include #include #include "LockedSurface.h" #include "YuvBufferT.h" #define LOGD(...) __android_log_print(ANDROID_LOG_DEBUG, "AndroidBufferUtilities", __VA_ARGS__) #define LOGE(...) 
__android_log_print(ANDROID_LOG_ERROR, "AndroidBufferUtilities", __VA_ARGS__) extern "C" { JNIEXPORT jlong JNICALL Java_com_example_helloandroidcamera2_AndroidBufferUtilities_allocNativeYuvBufferT( JNIEnv *env, jobject, jint srcWidth, jint srcHeight, jobject srcLumaByteBuffer, jint srcLumaRowStrideBytes, jobject srcChromaUByteBuffer, jobject srcChromaVByteBuffer, jint srcChromaElementStrideBytes, jint srcChromaRowStrideBytes) { uint8_t *srcLumaPtr = reinterpret_cast( env->GetDirectBufferAddress(srcLumaByteBuffer)); uint8_t *srcChromaUPtr = reinterpret_cast( env->GetDirectBufferAddress(srcChromaUByteBuffer)); uint8_t *srcChromaVPtr = reinterpret_cast( env->GetDirectBufferAddress(srcChromaVByteBuffer)); if (srcLumaPtr == nullptr || srcChromaUPtr == nullptr || srcChromaVPtr == nullptr) { return 0L; } YuvBufferT *buffer = new YuvBufferT(srcLumaPtr, srcWidth, srcHeight, 1 /* srcLumaElementStrideBytes */, srcLumaRowStrideBytes, srcChromaUPtr, srcWidth / 2, srcHeight / 2, srcChromaElementStrideBytes, srcChromaRowStrideBytes, srcChromaVPtr, srcWidth / 2, srcHeight / 2, srcChromaElementStrideBytes, srcChromaRowStrideBytes); return reinterpret_cast(buffer); } JNIEXPORT jboolean JNICALL Java_com_example_helloandroidcamera2_AndroidBufferUtilities_freeNativeYuvBufferT( JNIEnv *env, jobject obj, jlong handle) { if (handle == 0L) { return false; } YuvBufferT *yuvBufferT = reinterpret_cast(handle); delete yuvBufferT; return true; } JNIEXPORT jboolean JNICALL Java_com_example_helloandroidcamera2_AndroidBufferUtilities_rotateNativeYuvBufferT180( JNIEnv *env, jobject obj, jlong handle) { if (handle == 0L) { return false; } YuvBufferT *yuvBufferT = reinterpret_cast(handle); yuvBufferT->rotate180(); return true; } JNIEXPORT jlong JNICALL Java_com_example_helloandroidcamera2_AndroidBufferUtilities_lockSurface( JNIEnv *env, jobject obj, jobject surface) { return reinterpret_cast(LockedSurface::lock(env, surface)); } JNIEXPORT jlong JNICALL Java_com_example_helloandroidcamera2_AndroidBufferUtilities_allocNativeYuvBufferTFromSurfaceHandle( JNIEnv *env, jobject obj, jlong lockedSurfaceHandle) { if (lockedSurfaceHandle == 0L) { return 0L; } LockedSurface *ls = reinterpret_cast(lockedSurfaceHandle); YuvBufferT tmp = ls->yuvView(); if (tmp.isNull()) { return 0L; } YuvBufferT *yuvBufferT = new YuvBufferT(tmp); return reinterpret_cast(yuvBufferT); } JNIEXPORT jboolean JNICALL Java_com_example_helloandroidcamera2_AndroidBufferUtilities_unlockSurface( JNIEnv *env, jobject obj, jlong lockedSurfaceHandle) { if (lockedSurfaceHandle == 0L) { return false; } LockedSurface *ls = reinterpret_cast(lockedSurfaceHandle); delete ls; return true; } } // extern "C" Halide-17.0.1/apps/HelloAndroidCamera2/jni/AndroidBufferUtilities.h000066400000000000000000000022131456515664200250430ustar00rootroot00000000000000#ifndef ANDROID_BUFFER_UTILITIES_H #define ANDROID_BUFFER_UTILITIES_H #include extern "C" { JNIEXPORT jlong JNICALL Java_com_example_helloandroidcamera2_AndroidBufferUtilities_allocNativeYuvBufferT( JNIEnv *env, jobject, jint srcWidth, jint srcHeight, jobject srcLumaByteBuffer, jint srcLumaRowStrideBytes, jobject srcChromaUByteBuffer, jobject srcChromaVByteBuffer, jint srcChromaElementStrideBytes, jint srcChromaRowStrideBytes); JNIEXPORT jboolean JNICALL Java_com_example_helloandroidcamera2_AndroidBufferUtilities_freeNativeYuvBufferT( JNIEnv *env, jobject obj, jlong handle); JNIEXPORT jlong JNICALL Java_com_example_helloandroidcamera2_AndroidBufferUtilities_lockSurface( JNIEnv *env, jobject obj, jobject surface); JNIEXPORT jlong 
JNICALL Java_com_example_helloandroidcamera2_AndroidBufferUtilities_allocNativeYuvBufferTFromSurfaceHandle( JNIEnv *env, jobject obj, jlong surfaceWrapperHandle); JNIEXPORT jboolean JNICALL Java_com_example_helloandroidcamera2_AndroidBufferUtilities_unlockSurface( JNIEnv *env, jobject obj, jlong surfaceWrapperHandle); } // extern "C" #endif // ANDROID_BUFFER_UTILITIES_H Halide-17.0.1/apps/HelloAndroidCamera2/jni/Application.mk000066400000000000000000000002271456515664200230630ustar00rootroot00000000000000APP_ABI := armeabi armeabi-v7a arm64-v8a x86_64 x86 APP_PLATFORM := android-21 APP_STL := c++_static APP_CPPFLAGS := -std=c++17 -fno-rtti -fexceptions Halide-17.0.1/apps/HelloAndroidCamera2/jni/HalideFilters.cpp000066400000000000000000000136351456515664200235210ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #define LOGD(...) __android_log_print(ANDROID_LOG_DEBUG, "native", __VA_ARGS__) #define LOGE(...) __android_log_print(ANDROID_LOG_ERROR, "native", __VA_ARGS__) #include "AndroidBufferUtilities.h" #include "HalideRuntime.h" #include "YuvBufferT.h" #include "deinterleave.h" #include "edge_detect.h" #define DEBUG 1 // Extern functions from the Halide runtime that are not exposed in // HalideRuntime.h. extern "C" int halide_host_cpu_count(); extern "C" int64_t halide_current_time_ns(); // Override Halide's print to use LOGD and also print the time. extern "C" void halide_print(void *, const char *msg) { static int64_t t0 = halide_current_time_ns(); int64_t t1 = halide_current_time_ns(); LOGD("%d: %s\n", (int)(t1 - t0) / 1000000, msg); t0 = t1; } extern "C" { bool checkEqualExtents(YuvBufferT *src, YuvBufferT *dst) { if (src->luma().width() != dst->luma().width() || src->luma().height() != dst->luma().height() || src->chromaU().width() != dst->chromaU().width() || src->chromaU().height() != dst->chromaU().height() || src->chromaV().width() != dst->chromaV().width() || src->chromaV().height() != dst->chromaV().height()) { LOGE("failed: src and dst extents must be equal.\n\t" "src extents: luma: %d, %d, chromaU: %d, %d, chromaV: %d, %d.\n\t" "dst extents: luma: %d, %d, chromaU: %d, %d, chromaV: %d, %d.", src->luma().width(), src->luma().height(), src->chromaU().width(), src->chromaU().height(), src->chromaV().width(), src->chromaV().height(), dst->luma().width(), dst->luma().height(), dst->chromaU().width(), dst->chromaU().height(), dst->chromaV().width(), dst->chromaV().height()); return false; } else { return true; } } JNIEXPORT bool JNICALL Java_com_example_helloandroidcamera2_HalideFilters_copyHalide( JNIEnv *env, jobject obj, jlong srcYuvBufferTHandle, jlong dstYuvBufferTHandle) { if (srcYuvBufferTHandle == 0L || dstYuvBufferTHandle == 0L) { LOGE("copyHalide failed: src and dst must not be null"); return false; } YuvBufferT *src = reinterpret_cast(srcYuvBufferTHandle); YuvBufferT *dst = reinterpret_cast(dstYuvBufferTHandle); if (!checkEqualExtents(src, dst)) { return false; } YuvBufferT::ChromaStorage srcChromaStorage = src->chromaStorage(); YuvBufferT::ChromaStorage dstChromaStorage = dst->chromaStorage(); bool succeeded; int halideErrorCode; // Use Halide deinterleave if the source chroma is interleaved and destination chroma is planar. // Other, fall back to slow copy. 
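// (Illustrative note, assuming an NV21-style camera source as a hypothetical case: its chroma
// plane is stored VUVUVU..., i.e. kInterleavedVFirst. The branch below then hands
// interleavedChromaView(), a single buffer twice the chroma width, to the deinterleave
// generator, which fills the destination's separate U and V planes in one pass instead of
// taking the generic copy_from() fallback in the final else.)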
if ((srcChromaStorage == YuvBufferT::ChromaStorage::kInterleavedUFirst || srcChromaStorage == YuvBufferT::ChromaStorage::kInterleavedVFirst) && (dstChromaStorage == YuvBufferT::ChromaStorage::kPlanarPackedUFirst || dstChromaStorage == YuvBufferT::ChromaStorage::kPlanarPackedVFirst || dstChromaStorage == YuvBufferT::ChromaStorage::kPlanarGeneric)) { // Copy the luma channel directly. dst->luma().copy_from(src->luma()); // Use Halide to deinterleave the chroma channels. auto srcInterleavedChroma = src->interleavedChromaView(); auto dstPlanarChromaU = dst->chromaU(); auto dstPlanarChromaV = dst->chromaV(); if (srcChromaStorage == YuvBufferT::ChromaStorage::kInterleavedUFirst) { halideErrorCode = deinterleave(srcInterleavedChroma, dstPlanarChromaU, dstPlanarChromaV); } else { halideErrorCode = deinterleave(srcInterleavedChroma, dstPlanarChromaV, dstPlanarChromaU); } succeeded = (halideErrorCode != halide_error_code_success); if (halideErrorCode != halide_error_code_success) { LOGE("deinterleave failed with error code: %d", halideErrorCode); } } else { (*dst).copy_from(*src); } return succeeded; } JNIEXPORT bool JNICALL Java_com_example_helloandroidcamera2_HalideFilters_edgeDetectHalide( JNIEnv *env, jobject obj, jlong srcYuvBufferTHandle, jlong dstYuvBufferTHandle) { if (srcYuvBufferTHandle == 0L || dstYuvBufferTHandle == 0L) { LOGE("edgeDetectHalide failed: src and dst must not be null"); return false; } YuvBufferT *src = reinterpret_cast(srcYuvBufferTHandle); YuvBufferT *dst = reinterpret_cast(dstYuvBufferTHandle); if (!checkEqualExtents(src, dst)) { return false; } static bool first_call = true; static unsigned counter = 0; static unsigned times[16]; if (first_call) { LOGD("According to Halide, host system has %d cpus\n", halide_host_cpu_count()); first_call = false; for (int t = 0; t < 16; t++) { times[t] = 0; } } // Set chrominance to 128 to appear grayscale. dst->fillUV(128, 128); auto srcLuma = src->luma(); auto dstLuma = dst->luma(); int64_t t1 = halide_current_time_ns(); int err = edge_detect(srcLuma, dstLuma); if (err != halide_error_code_success) { LOGE("edge_detect failed with error code: %d", err); } int64_t t2 = halide_current_time_ns(); unsigned elapsed_us = (t2 - t1) / 1000; times[counter & 15] = elapsed_us; counter++; unsigned min = times[0]; for (int i = 1; i < 16; i++) { if (times[i] < min) { min = times[i]; } } LOGD("Time taken: %d us (minimum: %d us)", elapsed_us, min); return (err != halide_error_code_success); } } // extern "C" Halide-17.0.1/apps/HelloAndroidCamera2/jni/LockedSurface.cpp000066400000000000000000000041451456515664200235100ustar00rootroot00000000000000#include "LockedSurface.h" // Defined in http://developer.android.com/reference/android/graphics/ImageFormat.html #define IMAGE_FORMAT_YV12 842094169 // Round x up to a multiple of mask. // E.g., ALIGN(x, 16) means round x up to the nearest multiple of 16. 
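// Worked example (illustrative values only): ALIGN(37, 16) evaluates to
// ((37 + 15) & ~15) = (52 & ~15) = 48, and ALIGN(48, 16) stays 48.
// The bit-masking form assumes mask is a power of two.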
#define ALIGN(x, mask) (((x) + (mask)-1) & ~((mask)-1)) LockedSurface *LockedSurface::lock(JNIEnv *env, jobject surface) { LockedSurface *output = new LockedSurface; output->window_ = ANativeWindow_fromSurface(env, surface); if (int err = ANativeWindow_lock(output->window_, &(output->buffer_), NULL)) { ANativeWindow_release(output->window_); delete output; output = nullptr; } return output; } LockedSurface::~LockedSurface() { ANativeWindow_unlockAndPost(window_); ANativeWindow_release(window_); window_ = nullptr; } const ANativeWindow_Buffer &LockedSurface::buffer() const { return buffer_; } YuvBufferT LockedSurface::yuvView() const { if (buffer_.format != IMAGE_FORMAT_YV12) { return YuvBufferT(); } // This is guaranteed by the YV12 format, see android.graphics.ImageFormat. uint8_t *lumaPtr = reinterpret_cast(buffer_.bits); uint32_t lumaRowStrideBytes = buffer_.stride; uint32_t lumaSizeBytes = lumaRowStrideBytes * buffer_.height; uint32_t chromaRowStrideBytes = ALIGN(buffer_.stride / 2, 16); // Size of one chroma plane. uint32_t chromaSizeBytes = chromaRowStrideBytes * buffer_.height / 2; // Yes, V is actually first. uint8_t *chromaVPtr = lumaPtr + lumaSizeBytes; uint8_t *chromaUPtr = lumaPtr + lumaSizeBytes + chromaSizeBytes; return YuvBufferT(lumaPtr, buffer_.width, buffer_.height, 1 /* lumaElementStrideBytes */, lumaRowStrideBytes, chromaUPtr, buffer_.width / 2, buffer_.height / 2, 1 /* chromaUElementStrideBytes */, chromaRowStrideBytes, chromaVPtr, buffer_.width / 2, buffer_.height / 2, 1 /* chromaVElementStrideBytes */, chromaRowStrideBytes); } Halide-17.0.1/apps/HelloAndroidCamera2/jni/LockedSurface.h000066400000000000000000000013561456515664200231560ustar00rootroot00000000000000#ifndef LOCKED_SURFACE_H #define LOCKED_SURFACE_H #include #include #include #include "YuvBufferT.h" // Wraps an RAII pattern around locking an ANativeWindow. class LockedSurface { public: // Lock a Surface, returning a lock object, or nullptr if it failed. static LockedSurface *lock(JNIEnv *env, jobject surface); ~LockedSurface(); const ANativeWindow_Buffer &buffer() const; // If buffer() is a compatible YUV format, returns a non-null YuvBufferT. // Otherwise, output.isNull() will be true. YuvBufferT yuvView() const; private: LockedSurface() = default; ANativeWindow *window_; ANativeWindow_Buffer buffer_; }; #endif // LOCKED_SURFACE_HHalide-17.0.1/apps/HelloAndroidCamera2/jni/YuvBufferT.cpp000066400000000000000000000164301456515664200230370ustar00rootroot00000000000000#include "YuvBufferT.h" #include #define LOGD(...) __android_log_print(ANDROID_LOG_DEBUG, "YuvBufferT", __VA_ARGS__) #define LOGE(...) 
__android_log_print(ANDROID_LOG_ERROR, "YuvBufferT", __VA_ARGS__) YuvBufferT::YuvBufferT(uint8_t *lumaPointer, int32_t lumaWidth, int32_t lumaHeight, int32_t lumaElementStrideBytes, int32_t lumaRowStrideBytes, uint8_t *chromaUPointer, int32_t chromaUWidth, int32_t chromaUHeight, int32_t chromaUElementStrideBytes, int32_t chromaURowStrideBytes, uint8_t *chromaVPointer, int32_t chromaVWidth, int32_t chromaVHeight, int32_t chromaVElementStrideBytes, int32_t chromaVRowStrideBytes) { assert(lumaPointer != nullptr); assert(chromaUPointer != nullptr); assert(chromaVPointer != nullptr); halide_dimension_t lumaShape[] = { {0, lumaWidth, lumaElementStrideBytes}, {0, lumaHeight, lumaRowStrideBytes}}; luma_ = Halide::Runtime::Buffer(lumaPointer, 2, lumaShape); halide_dimension_t chromaUShape[] = { {0, chromaUWidth, chromaUElementStrideBytes}, {0, chromaUHeight, chromaURowStrideBytes}}; chromaU_ = Halide::Runtime::Buffer(chromaUPointer, 2, chromaUShape); halide_dimension_t chromaVShape[] = { {0, chromaVWidth, chromaVElementStrideBytes}, {0, chromaVHeight, chromaVRowStrideBytes}}; chromaV_ = Halide::Runtime::Buffer(chromaVPointer, 2, chromaVShape); // See if chroma is stored according to a well known format. chromaStorage_ = ChromaStorage::kOther; // U and V must have the same extents and strides. if (chromaU_.width() == chromaV_.width() && chromaU_.height() == chromaV_.height() && chromaU_.dim(0).stride() == chromaV_.dim(0).stride() && chromaU_.dim(1).stride() == chromaV_.dim(1).stride()) { // If strides are exactly 2, check if they are interleaved. if (chromaU_.dim(0).stride() == 2 && chromaV_.dim(0).stride() == 2) { if (chromaU_.data() == chromaV_.data() - 1) { chromaStorage_ = ChromaStorage::kInterleavedUFirst; } else if (chromaV_.data() == chromaU_.data() - 1) { chromaStorage_ = ChromaStorage::kInterleavedVFirst; } } else if (chromaU_.dim(0).stride() == 1 && chromaV_.dim(0).stride() == 1) { // If element stride is 1, then they're planar. // If there is no space at the end of each row, they might be packed. // Check if one directly follows the other. 
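// For illustration (hypothetical plane sizes): with 160x120 chroma planes and no row padding,
// &chromaU_(0, chromaU_.height()) is the first byte past the U plane; if it equals
// &chromaV_(0, 0), the layout is kPlanarPackedUFirst and the two planes can later be viewed
// as one 160x240 packed buffer.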
if (chromaU_.width() == chromaU_.dim(1).stride() && chromaV_.width() == chromaV_.dim(1).stride()) { if (&chromaU_(0, chromaU_.height()) == &chromaV_(0, 0)) { chromaStorage_ = ChromaStorage::kPlanarPackedUFirst; } else if (&chromaV_(0, chromaU_.height()) == &chromaU_(0, 0)) { chromaStorage_ = ChromaStorage::kPlanarPackedVFirst; } } else { chromaStorage_ = ChromaStorage::kPlanarGeneric; } } } interleavedChromaView_ = Halide::Runtime::Buffer(); if (chromaStorage_ == ChromaStorage::kInterleavedUFirst) { halide_dimension_t chromaShape[] = { {0, chromaUWidth * 2, 1}, {0, chromaUHeight, chromaURowStrideBytes}}; interleavedChromaView_ = Halide::Runtime::Buffer(chromaUPointer, 2, chromaShape); } else if (chromaStorage_ == ChromaStorage::kInterleavedVFirst) { halide_dimension_t chromaShape[] = { {0, chromaVWidth * 2, 1}, {0, chromaVHeight, chromaVRowStrideBytes}}; interleavedChromaView_ = Halide::Runtime::Buffer(chromaVPointer, 2, chromaShape); } else if (chromaStorage_ == ChromaStorage::kPlanarPackedUFirst) { packedPlanarChromaView_ = chromaU_; packedPlanarChromaView_.crop(1, 0, chromaUHeight * 2); } else if (chromaStorage_ == ChromaStorage::kPlanarPackedVFirst) { packedPlanarChromaView_ = chromaV_; packedPlanarChromaView_.crop(1, 0, chromaVHeight * 2); } interleavedChromaView_.set_host_dirty(); packedPlanarChromaView_.set_host_dirty(); chromaU_.set_host_dirty(); chromaV_.set_host_dirty(); luma_.set_host_dirty(); } bool YuvBufferT::isNull() const { return luma_.data() == nullptr; } Halide::Runtime::Buffer YuvBufferT::luma() const { return luma_; } Halide::Runtime::Buffer YuvBufferT::chromaU() const { return chromaU_; } Halide::Runtime::Buffer YuvBufferT::chromaV() const { return chromaV_; } YuvBufferT::ChromaStorage YuvBufferT::chromaStorage() const { return chromaStorage_; } Halide::Runtime::Buffer YuvBufferT::interleavedChromaView() const { return interleavedChromaView_; } Halide::Runtime::Buffer YuvBufferT::packedPlanarChromaView() const { return packedPlanarChromaView_; } void YuvBufferT::copy_from(const YuvBufferT &other) { luma_.copy_from(other.luma_); if (interleavedChromaView_.data() && other.interleavedChromaView_.data()) { interleavedChromaView_.copy_from(other.interleavedChromaView_); } else if (packedPlanarChromaView_.data() && other.packedPlanarChromaView_.data()) { packedPlanarChromaView_.copy_from(other.packedPlanarChromaView_); } else { chromaU_.copy_from(other.chromaU_); chromaV_.copy_from(other.chromaV_); } } void YuvBufferT::fill(uint8_t y, uint8_t u, uint8_t v) { luma_.fill(y); fillUV(u, v); } void YuvBufferT::fillUV(uint8_t u, uint8_t v) { if (interleavedChromaView_.data() && u == v) { interleavedChromaView_.fill(u); } else if (packedPlanarChromaView_.data() && u == v) { packedPlanarChromaView_.fill(v); } else { chromaU_.fill(u); chromaV_.fill(v); } } namespace { Halide::Runtime::Buffer rotateBuffer180(Halide::Runtime::Buffer buf) { if (buf.data() == nullptr) return buf; halide_dimension_t shape[] = { {0, buf.dim(0).extent(), -buf.dim(0).stride()}, {0, buf.dim(1).extent(), -buf.dim(1).stride()}, }; return Halide::Runtime::Buffer(&buf(buf.width() - 1, buf.height() - 1), 2, shape); } }; // namespace void YuvBufferT::rotate180() { luma_ = rotateBuffer180(luma_); chromaU_ = rotateBuffer180(chromaU_); chromaV_ = rotateBuffer180(chromaV_); packedPlanarChromaView_ = rotateBuffer180(packedPlanarChromaView_); interleavedChromaView_ = rotateBuffer180(interleavedChromaView_); // Rotating the above two views effectively swaps U and V. 
switch (chromaStorage_) { case ChromaStorage::kPlanarPackedUFirst: chromaStorage_ = ChromaStorage::kPlanarPackedVFirst; break; case ChromaStorage::kPlanarPackedVFirst: chromaStorage_ = ChromaStorage::kPlanarPackedUFirst; break; case ChromaStorage::kInterleavedUFirst: chromaStorage_ = ChromaStorage::kInterleavedVFirst; break; case ChromaStorage::kInterleavedVFirst: chromaStorage_ = ChromaStorage::kInterleavedUFirst; break; default: // nothing break; }; } Halide-17.0.1/apps/HelloAndroidCamera2/jni/YuvBufferT.h000066400000000000000000000072301456515664200225020ustar00rootroot00000000000000#ifndef YUV_BUFFER_T_H #define YUV_BUFFER_T_H #include "HalideBuffer.h" #include "HalideRuntime.h" #include #include class YuvBufferT { public: enum class ChromaStorage { // UVUVUV... Interleaved U and V with element stride 2 // UVUVUV... and arbitrary row stride. // U and V have the same extents and strides. kInterleavedUFirst, // VUVUVU... Interleaved V and U with element stride 2 // VUVUVU... and arbitrary row stride. // U and V have the same extents and strides. kInterleavedVFirst, // U and V and stored in separate planes, with U first, followed // immediately by V. Element stride = 1, row stride = width. // U and V have the same extents and strides. kPlanarPackedUFirst, // V and U and stored in separate planes, with V first, followed // immediately by U. Element stride = 1, row stride = width. // U and V have the same extents and strides. kPlanarPackedVFirst, // U and V are stored in separate planes. // Element stride = 1, row stride = arbitrary. // U and V have the same extents and strides. kPlanarGeneric, // Some other arbitrary interleaving of chroma not easily classified. kOther }; // Make a null YuvBufferT. YuvBufferT() = default; YuvBufferT(uint8_t *lumaPointer, int32_t lumaWidth, int32_t lumaHeight, int32_t lumaElementStrideBytes, int32_t lumaRowStrideBytes, uint8_t *chromaUPointer, int32_t chromaUWidth, int32_t chromaUHeight, int32_t chromaUElementStrideBytes, int32_t chromaURowStrideBytes, uint8_t *chromaVPointer, int32_t chromaVWidth, int32_t chromaVHeight, int32_t chromaVElementStrideBytes, int32_t chromaVRowStrideBytes); YuvBufferT(const YuvBufferT ©) = default; bool isNull() const; Halide::Runtime::Buffer luma() const; Halide::Runtime::Buffer chromaU() const; Halide::Runtime::Buffer chromaV() const; ChromaStorage chromaStorage() const; // If chroma channels are interleaved, return an interleaved // Halide::Runtime::Buffer with: // - The host pointer pointing to whichever chroma buffer is first in // memory. // - Twice the width. // Otherwise, returns a Halide::Runtime::Buffer pointing to nullptr. Halide::Runtime::Buffer interleavedChromaView() const; // If chroma channels are planar and tightly packed (one directly // follows the other, with the same size and strides), then // returns a Halide::Runtime::Buffer with: // - The host pointer pointing to whichever chroma buffer is first in // memory. // - Twice the height. // Otherwise, returns a Halide::Runtime::Buffer pointing to nullptr. Halide::Runtime::Buffer packedPlanarChromaView() const; // Rotate the buffer 180 degrees. Cheap. Just messes with the strides. 
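    // A minimal sketch of the idea (illustrative, matching rotateBuffer180 in the .cpp):
    // for a W x H plane with strides (1, W), the rotated view keeps the same allocation
    // but re-bases at element (W-1, H-1) with strides (-1, -W), so that
    // rotated(x, y) aliases original(W - 1 - x, H - 1 - y).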
void rotate180(); // Copy the contents from another YuvBufferT void copy_from(const YuvBufferT &other); // Fill the buffer with a fixed YUV value void fill(uint8_t y, uint8_t u, uint8_t v); // Fill the chroma with a fixed UV value void fillUV(uint8_t u, uint8_t v); private: Halide::Runtime::Buffer luma_; Halide::Runtime::Buffer chromaU_; Halide::Runtime::Buffer chromaV_; ChromaStorage chromaStorage_; Halide::Runtime::Buffer interleavedChromaView_; Halide::Runtime::Buffer packedPlanarChromaView_; }; #endif // YUV_BUFFER_T_H Halide-17.0.1/apps/HelloAndroidCamera2/jni/deinterleave_generator.cpp000066400000000000000000000017021456515664200255070ustar00rootroot00000000000000#include "Halide.h" namespace { class Deinterleave : public Halide::Generator { public: Input> uvInterleaved{"uvInterleaved"}; // There is no way to declare a Buffer, so we must use Output instead Output result{"result", {UInt(8), UInt(8)}, 2}; void generate() { Var x, y; result(x, y) = {uvInterleaved(2 * x, y), uvInterleaved(2 * x + 1, y)}; // CPU schedule: // Parallelize over scan lines, 4 scanlines per task. // Independently, vectorize over x. result .parallel(y, 4) .vectorize(x, natural_vector_size(UInt(8))); // Cope with rotated inputs uvInterleaved.dim(0).set_stride(Expr()); result.specialize(uvInterleaved.dim(0).stride() == 1); result.specialize(uvInterleaved.dim(0).stride() == -1); } }; } // namespace HALIDE_REGISTER_GENERATOR(Deinterleave, deinterleave) Halide-17.0.1/apps/HelloAndroidCamera2/jni/edge_detect_generator.cpp000066400000000000000000000024431456515664200252770ustar00rootroot00000000000000#include "Halide.h" namespace { class EdgeDetect : public Halide::Generator { public: Input> input{"input"}; Output> result{"result"}; void generate() { Var x, y; Func clamped = Halide::BoundaryConditions::repeat_edge(input); // Upcast to 16-bit Func in16; in16(x, y) = cast(clamped(x, y)); // Gradients in x and y. Func gx; Func gy; gx(x, y) = (in16(x + 1, y) - in16(x - 1, y)) / 2; gy(x, y) = (in16(x, y + 1) - in16(x, y - 1)) / 2; // Gradient magnitude. Func grad_mag; grad_mag(x, y) = (gx(x, y) * gx(x, y) + gy(x, y) * gy(x, y)); // Draw the result result(x, y) = cast(clamp(grad_mag(x, y), 0, 255)); // CPU schedule: // Parallelize over scan lines, 4 scanlines per task. // Independently, vectorize in x. 
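        // (Worked example for the gradient math above, using hypothetical pixel values:
        // across a hard step where in16 jumps from 0 to 255, gx is (255 - 0) / 2 = 127,
        // grad_mag is 127 * 127 = 16129, and the final clamp maps it to 255, i.e. a
        // bright edge pixel.)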
result .compute_root() .vectorize(x, 8) .parallel(y, 8); // Cope with rotated inputs input.dim(0).set_stride(Expr()); result.specialize(input.dim(0).stride() == 1); result.specialize(input.dim(0).stride() == -1); } }; } // namespace HALIDE_REGISTER_GENERATOR(EdgeDetect, edge_detect)
[binary image data omitted: Halide-17.0.1/apps/HelloAndroidCamera2/res/drawable-hdpi/ic_launcher.png, drawable-ldpi/ic_launcher.png, drawable-mdpi/ic_launcher.png and drawable-xhdpi/ic_launcher.png; the next entry, Halide-17.0.1/apps/HelloAndroidCamera2/res/layout/fragment_camera2_basic.xml, is listed but its markup is not recoverable from this dump]
\n"; stream << " \n"; stream << "
\n"; stream << " \n"; stream << "
\n"; stream << " \n"; stream << "\n"; } /* Misc helper methods */ // Load assembly code from file std::ostringstream asm_stream; AssemblyInfo host_asm_info; AssemblyInfo device_asm_info; void load_asm_code(const std::string &asm_file) { user_assert(file_exists(asm_file)) << "Unable to open assembly file: " << asm_file << "\n"; // Open assembly file std::ifstream assembly; assembly.open(asm_file.c_str()); // Slurp the code into asm_stream std::string line; while (getline(assembly, line)) { asm_stream << line << "\n"; } } }; // The external interface to this module void print_to_stmt_html(const std::string &html_output_filename, const Module &m, const std::string &assembly_input_filename) { PipelineHTMLInspector inspector(html_output_filename, m, assembly_input_filename, false); inspector.generate_html(m); debug(1) << "Done generating HTML IR Inspector - printed to: " << html_output_filename << "\n"; } void print_to_conceptual_stmt_html(const std::string &html_output_filename, const Module &m, const std::string &assembly_input_filename) { PipelineHTMLInspector inspector(html_output_filename, m, assembly_input_filename, true); inspector.generate_html(m); debug(1) << "Done generating HTML Conceptual IR Inspector - printed to: " << html_output_filename << "\n"; } } // namespace Internal } // namespace Halide Halide-17.0.1/src/StmtToHTML.h000066400000000000000000000025351456515664200156700ustar00rootroot00000000000000#ifndef HALIDE_STMT_TO_HTML #define HALIDE_STMT_TO_HTML /** \file * Defines a function to dump an HTML-formatted visualization to a file. */ #include namespace Halide { class Module; namespace Internal { struct Stmt; /** Dump an HTML-formatted visualization of a Module to filename. * If assembly_input_filename is not empty, it is expected to be the path * to assembly output. If empty, the code will attempt to find such a * file based on output_filename (replacing ".stmt.html" with ".s"), * and will assert-fail if no such file is found. */ void print_to_stmt_html(const std::string &html_output_filename, const Module &m, const std::string &assembly_input_filename = ""); /** Dump an HTML-formatted visualization of a Module's conceptual Stmt code to filename. * If assembly_input_filename is not empty, it is expected to be the path * to assembly output. If empty, the code will attempt to find such a * file based on output_filename (replacing ".stmt.html" with ".s"), * and will assert-fail if no such file is found. */ void print_to_conceptual_stmt_html(const std::string &html_output_filename, const Module &m, const std::string &assembly_input_filename = ""); } // namespace Internal } // namespace Halide #endif Halide-17.0.1/src/StorageFlattening.cpp000066400000000000000000000575411456515664200177330ustar00rootroot00000000000000#include "StorageFlattening.h" #include "Bounds.h" #include "CSE.h" #include "Function.h" #include "FuseGPUThreadLoops.h" #include "IRMutator.h" #include "IROperator.h" #include "IRPrinter.h" #include "Parameter.h" #include "Scope.h" #include "Simplify.h" #include "Substitute.h" #include namespace Halide { namespace Internal { using std::map; using std::ostringstream; using std::pair; using std::set; using std::string; using std::vector; namespace { class ExpandExpr : public IRMutator { using IRMutator::visit; const Scope &scope; Expr visit(const Variable *var) override { if (scope.contains(var->name)) { Expr expr = scope.get(var->name); // Mutate the expression, so lets can get replaced recursively. 
expr = mutate(expr); debug(4) << "Fully expanded " << var->name << " -> " << expr << "\n"; return expr; } else { return var; } } public: ExpandExpr(const Scope &s) : scope(s) { } }; // Perform all the substitutions in a scope Expr expand_expr(const Expr &e, const Scope &scope) { ExpandExpr ee(scope); Expr result = ee.mutate(e); debug(4) << "Expanded " << e << " into " << result << "\n"; return result; } class FlattenDimensions : public IRMutator { public: FlattenDimensions(const map> &e, const vector &o, const Target &t) : env(e), target(t) { for (const auto &f : o) { outputs.insert(f.name()); } } private: struct HoistedAllocationInfo { string name; Type type; MemoryType memory_type; vector extents; Expr condition; HoistedAllocationInfo(const string &name, Type type, MemoryType memory_type, const vector &extents, Expr condition) : name(name), type(type), memory_type(memory_type), extents(extents), condition(std::move(condition)) { } }; struct HoistedStorageData { string name; vector hoisted_allocations; Scope loop_vars; Scope scope; HoistedStorageData(const string &n) : name(n) { } }; const map> &env; set outputs; set textures; const Target ⌖ Scope<> realizations; bool in_gpu = false; vector hoisted_storages; map hoisted_storages_map; Expr make_shape_var(string name, const string &field, size_t dim, const Buffer<> &buf, const Parameter ¶m) { ReductionDomain rdom; name = name + "." + field + "." + std::to_string(dim); return Variable::make(Int(32), name, buf, param, rdom); } Expr flatten_args(const string &name, vector args, const Buffer<> &buf, const Parameter ¶m) { bool internal = realizations.contains(name); Expr idx = target.has_large_buffers() ? make_zero(Int(64)) : 0; vector mins(args.size()), strides(args.size()); for (size_t i = 0; i < args.size(); i++) { strides[i] = make_shape_var(name, "stride", i, buf, param); mins[i] = make_shape_var(name, "min", i, buf, param); if (target.has_large_buffers()) { strides[i] = cast(strides[i]); } } Expr zero = target.has_large_buffers() ? make_zero(Int(64)) : 0; // We peel off constant offsets so that multiple stencil // taps can share the same base address. Expr constant_term = zero; for (size_t i = 0; i < args.size(); i++) { const Add *add = args[i].as(); if (add && is_const(add->b)) { constant_term += strides[i] * add->b; args[i] = add->a; } } if (internal) { // f(x, y) -> f[(x-xmin)*xstride + (y-ymin)*ystride] This // strategy makes sense when we expect x to cancel with // something in xmin. We use this for internal allocations. for (size_t i = 0; i < args.size(); i++) { idx += (args[i] - mins[i]) * strides[i]; } } else { // f(x, y) -> f[x*stride + y*ystride - (xstride*xmin + // ystride*ymin)]. The idea here is that the last term // will be pulled outside the inner loop. We use this for // external buffers, where the mins and strides are likely // to be symbolic Expr base = zero; for (size_t i = 0; i < args.size(); i++) { idx += args[i] * strides[i]; base += mins[i] * strides[i]; } idx -= base; } if (!is_const_zero(constant_term)) { idx += constant_term; } return idx; } using IRMutator::visit; Stmt visit(const HoistedStorage *op) override { hoisted_storages.emplace_back(op->name); // Record index in the stack. 
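        // (Explanatory sketch, hypothetical sizes: recording the index lets the Realize nodes
        // visited inside the body below find this entry through hoisted_storages_map and append
        // their allocation requests; the single Allocate emitted afterwards takes the
        // per-dimension maximum, e.g. requests of 16x16 and 8x32 merge into a 16x32 allocation.)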
hoisted_storages_map[op->name] = hoisted_storages.size() - 1; Stmt body = mutate(op->body); internal_assert(!hoisted_storages.back().hoisted_allocations.empty()) << "Couldn't find a matching Realize node for Hoisted storage " << op->name << "\n"; const auto &alloc_info = hoisted_storages.back().hoisted_allocations.front(); vector extents = alloc_info.extents; for (int i = 1; i < (int)hoisted_storages.back().hoisted_allocations.size(); i++) { const auto &ai = hoisted_storages.back().hoisted_allocations[i]; internal_assert(ai.extents.size() == alloc_info.extents.size()); for (int j = 0; j < (int)extents.size(); j++) { extents[j] = Max::make(extents[j], ai.extents[j]); } } body = Allocate::make(alloc_info.name, alloc_info.type, alloc_info.memory_type, extents, alloc_info.condition, body); hoisted_storages_map.erase(op->name); hoisted_storages.pop_back(); return body; } Stmt visit(const Realize *op) override { realizations.push(op->name); if (op->memory_type == MemoryType::GPUTexture) { textures.insert(op->name); debug(2) << "found texture " << op->name << "\n"; } Stmt body = mutate(op->body); // Compute the size vector extents(op->bounds.size()); for (size_t i = 0; i < op->bounds.size(); i++) { extents[i] = mutate(op->bounds[i].extent); } Expr condition = mutate(op->condition); realizations.pop(op->name); // The allocation extents of the function taken into account of // the align_storage directives. It is only used to determine the // host allocation size and the strides in halide_buffer_t objects (which // also affects the device allocation in some backends). vector allocation_extents(extents.size()); vector storage_permutation; vector bound_asserts; { auto iter = env.find(op->name); internal_assert(iter != env.end()) << "Realize node refers to function not in environment.\n"; Function f = iter->second.first; const vector &storage_dims = f.schedule().storage_dims(); const vector &args = f.args(); for (size_t i = 0; i < storage_dims.size(); i++) { for (size_t j = 0; j < args.size(); j++) { if (args[j] == storage_dims[i].var) { storage_permutation.push_back((int)j); Expr bound = storage_dims[i].bound; if (bound.defined()) { if (can_prove(extents[j] > bound)) { user_error << "Explicit storage bound (" << bound << ") for variable " << args[j] << " of function " << op->name << " is smaller than required (" << extents[j] << ")\n"; } Expr bound_too_small_error = Call::make(Int(32), "halide_error_storage_bound_too_small", {StringImm::make(op->name), StringImm::make(args[j]), bound, extents[j]}, Call::Extern); Stmt size_to_small_check = AssertStmt::make(extents[j] <= bound, bound_too_small_error); bound_asserts.push_back(size_to_small_check); extents[j] = bound; } Expr alignment = storage_dims[i].alignment; if (alignment.defined()) { allocation_extents[j] = ((extents[j] + alignment - 1) / alignment) * alignment; } else { allocation_extents[j] = extents[j]; } } } internal_assert(storage_permutation.size() == i + 1); } } internal_assert(storage_permutation.size() == op->bounds.size()); Stmt stmt = body; internal_assert(op->types.size() == 1); // Make the names for the mins, extents, and strides int dims = op->bounds.size(); vector min_name(dims), extent_name(dims), stride_name(dims); for (int i = 0; i < dims; i++) { string d = std::to_string(i); min_name[i] = op->name + ".min." + d; stride_name[i] = op->name + ".stride." + d; extent_name[i] = op->name + ".extent." 
+ d; } vector min_var(dims), extent_var(dims), stride_var(dims); for (int i = 0; i < dims; i++) { min_var[i] = Variable::make(Int(32), min_name[i]); extent_var[i] = Variable::make(Int(32), extent_name[i]); stride_var[i] = Variable::make(Int(32), stride_name[i]); } // Create a halide_buffer_t object for this allocation. BufferBuilder builder; builder.host = Variable::make(Handle(), op->name); builder.type = op->types[0]; builder.dimensions = dims; for (int i = 0; i < dims; i++) { builder.mins.push_back(min_var[i]); builder.extents.push_back(extent_var[i]); builder.strides.push_back(stride_var[i]); } stmt = LetStmt::make(op->name + ".buffer", builder.build(), stmt); if (hoisted_storages_map.count(op->name) > 0) { HoistedStorageData &hoisted_storage_data = hoisted_storages[hoisted_storages_map[op->name]]; vector bounded_extents; for (const auto &e : allocation_extents) { Expr expanded_extent = e; // Iterate from innermost outwards for (auto it = hoisted_storages.rbegin(); it != hoisted_storages.rend(); it++) { expanded_extent = expand_expr(expanded_extent, it->scope); if (it->name == op->name) { break; } } expanded_extent = simplify(common_subexpression_elimination(expanded_extent)); Interval bounds = bounds_of_expr_in_scope(expanded_extent, hoisted_storage_data.loop_vars); user_assert(bounds.max.defined()) << "Couldn't infer the upper bound for the storage size of " << op->name << ", consider using bound_storage.\n"; bounded_extents.push_back(bounds.max); } HoistedAllocationInfo hoisted_alloc(op->name, op->types[0], op->memory_type, bounded_extents, condition); hoisted_storage_data.hoisted_allocations.push_back(hoisted_alloc); } else { // Make the allocation node stmt = Allocate::make(op->name, op->types[0], op->memory_type, allocation_extents, condition, stmt); } // Wrap it into storage bound asserts. if (!bound_asserts.empty()) { stmt = Block::make(Block::make(bound_asserts), stmt); } // Compute the strides for (int i = (int)op->bounds.size() - 1; i > 0; i--) { int prev_j = storage_permutation[i - 1]; int j = storage_permutation[i]; Expr stride = stride_var[prev_j] * allocation_extents[prev_j]; stmt = LetStmt::make(stride_name[j], stride, stmt); } // Innermost stride is one if (dims > 0) { int innermost = storage_permutation.empty() ? 0 : storage_permutation[0]; stmt = LetStmt::make(stride_name[innermost], 1, stmt); } // Assign the mins and extents stored for (size_t i = op->bounds.size(); i > 0; i--) { stmt = LetStmt::make(min_name[i - 1], op->bounds[i - 1].min, stmt); stmt = LetStmt::make(extent_name[i - 1], extents[i - 1], stmt); } return stmt; } Stmt visit(const Provide *op) override { internal_assert(op->values.size() == 1); Parameter output_buf; auto it = env.find(op->name); if (it != env.end()) { const Function &f = it->second.first; int idx = it->second.second; // We only want to do this for actual pipeline outputs, // even though every Function has an output buffer. Any // constraints you set on the output buffer of a Func that // isn't actually an output is ignored. This is a language // wart. 
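// Illustrative sketch of what this Provide lowering produces for a
// hypothetical internal 2-D func "f" (names shown only for illustration):
//
//   f(x, y) = v
//     -->
//   f[(x - f.min.0) * f.stride.0 + (y - f.min.1) * f.stride.1] = v
//
// For external/output buffers, flatten_args() instead folds the mins into a
// single loop-invariant base term that is subtracted once, so each tap's
// index is just x * stride0 + y * stride1 - base.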
if (outputs.count(f.name())) { output_buf = f.output_buffers()[idx]; } } if (output_buf.defined()) { if (output_buf.memory_type() == MemoryType::GPUTexture) { textures.insert(op->name); } } Expr value = mutate(op->values[0]); Expr predicate = mutate(op->predicate); if (in_gpu && textures.count(op->name)) { Expr buffer_var = Variable::make(type_of(), op->name + ".buffer", output_buf); vector args(2); args[0] = op->name; args[1] = buffer_var; for (size_t i = 0; i < op->args.size(); i++) { Expr min = Variable::make(Int(32), op->name + ".min." + std::to_string(i)); args.push_back(op->args[i] - min); } args.push_back(value); Expr store = Call::make(value.type(), Call::image_store, args, Call::Intrinsic); Stmt result = Evaluate::make(store); if (!is_const_one(op->predicate)) { result = IfThenElse::make(predicate, result); } return result; } else { Expr idx = mutate(flatten_args(op->name, op->args, Buffer<>(), output_buf)); return Store::make(op->name, value, idx, output_buf, predicate, ModulusRemainder()); } } Expr visit(const Call *op) override { if (op->call_type == Call::Halide || op->call_type == Call::Image) { debug(2) << " load call to " << op->name << " " << textures.count(op->name) << "\n"; if (op->param.defined()) { debug(2) << " is param: " << " " << op->param.name() << " " << op->param.memory_type() << "\n"; if (op->param.memory_type() == MemoryType::GPUTexture) { textures.insert(op->name); } } internal_assert(op->value_index == 0); if (in_gpu && textures.count(op->name)) { ReductionDomain rdom; Expr buffer_var = Variable::make(type_of(), op->name + ".buffer", op->image, op->param, rdom); // Create image_load("name", name.buffer, x - x_min, x_extent, // y - y_min, y_extent, ...). Extents can be used by // successive passes. OpenGL, for example, uses them // for coordinate normalization. vector args(2); args[0] = op->name; args[1] = buffer_var; for (size_t i = 0; i < op->args.size(); i++) { Expr min = make_shape_var(op->name, "min", i, op->image, op->param); Expr extent = make_shape_var(op->name, "extent", i, op->image, op->param); args.push_back(mutate(op->args[i]) - min); args.push_back(extent); } return Call::make(op->type, Call::image_load, args, Call::PureIntrinsic, FunctionPtr(), 0, op->image, op->param); } else { Expr idx = mutate(flatten_args(op->name, op->args, op->image, op->param)); return Load::make(op->type, op->name, idx, op->image, op->param, const_true(op->type.lanes()), ModulusRemainder()); } } else { return IRMutator::visit(op); } } Stmt visit(const Prefetch *op) override { internal_assert(op->types.size() == 1) << "Prefetch from multi-dimensional halide tuple should have been split\n"; Expr condition = mutate(op->condition); vector prefetch_min(op->bounds.size()); vector prefetch_extent(op->bounds.size()); vector prefetch_stride(op->bounds.size()); for (size_t i = 0; i < op->bounds.size(); i++) { prefetch_min[i] = mutate(op->bounds[i].min); prefetch_extent[i] = mutate(op->bounds[i].extent); prefetch_stride[i] = Variable::make(Int(32), op->name + ".stride." + std::to_string(i), op->prefetch.param); } Expr base_offset = mutate(flatten_args(op->name, prefetch_min, Buffer<>(), op->prefetch.param)); Expr base_address = Variable::make(Handle(), op->name); vector args = {base_address, base_offset}; auto iter = env.find(op->name); if (iter != env.end()) { // Order the args based on the storage dims // (i.e. 
innermost dimension should be first in args) vector storage_permutation; { Function f = iter->second.first; const vector &storage_dims = f.schedule().storage_dims(); const vector &args = f.args(); for (size_t i = 0; i < storage_dims.size(); i++) { for (size_t j = 0; j < args.size(); j++) { if (args[j] == storage_dims[i].var) { storage_permutation.push_back((int)j); } } internal_assert(storage_permutation.size() == i + 1); } } internal_assert(storage_permutation.size() == op->bounds.size()); for (size_t i = 0; i < op->bounds.size(); i++) { internal_assert(storage_permutation[i] < (int)op->bounds.size()); args.push_back(prefetch_extent[storage_permutation[i]]); args.push_back(prefetch_stride[storage_permutation[i]]); } } else { for (size_t i = 0; i < op->bounds.size(); i++) { args.push_back(prefetch_extent[i]); args.push_back(prefetch_stride[i]); } } // TODO: Consider generating a prefetch call for each tuple element. Stmt prefetch_call = Evaluate::make(Call::make(op->types[0], Call::prefetch, args, Call::Intrinsic)); if (!is_const_one(condition)) { prefetch_call = IfThenElse::make(condition, prefetch_call); } Stmt body = mutate(op->body); return Block::make(prefetch_call, body); } Stmt visit(const For *op) override { Expr expanded_min = op->min; Expr expanded_extent = op->extent; // Iterate from innermost outwards for (auto it = hoisted_storages.rbegin(); it != hoisted_storages.rend(); it++) { expanded_min = simplify(expand_expr(expanded_min, it->scope)); expanded_extent = expand_expr(expanded_extent, it->scope); Interval loop_bounds = Interval(expanded_min, simplify(expanded_min + expanded_extent - 1)); it->loop_vars.push(op->name, loop_bounds); } bool old_in_gpu = in_gpu; if (op->for_type == ForType::GPUBlock || op->for_type == ForType::GPUThread) { in_gpu = true; } Stmt stmt = IRMutator::visit(op); in_gpu = old_in_gpu; for (auto &p : hoisted_storages) { p.loop_vars.pop(op->name); } return stmt; } Stmt visit(const LetStmt *op) override { if (!hoisted_storages.empty()) { hoisted_storages.back().scope.push(op->name, op->value); } Stmt stmt = IRMutator::visit(op); if (!hoisted_storages.empty()) { hoisted_storages.back().scope.pop(op->name); } return stmt; } }; // Realizations, stores, and loads must all be on types that are // multiples of 8-bits. This really only affects bools class PromoteToMemoryType : public IRMutator { using IRMutator::visit; Type upgrade(Type t) { return t.with_bits(((t.bits() + 7) / 8) * 8); } Expr visit(const Load *op) override { Type t = upgrade(op->type); if (t != op->type) { return Cast::make(op->type, Load::make(t, op->name, mutate(op->index), op->image, op->param, mutate(op->predicate), ModulusRemainder())); } else { return IRMutator::visit(op); } } Stmt visit(const Store *op) override { Type t = upgrade(op->value.type()); if (t != op->value.type()) { return Store::make(op->name, Cast::make(t, mutate(op->value)), mutate(op->index), op->param, mutate(op->predicate), ModulusRemainder()); } else { return IRMutator::visit(op); } } Stmt visit(const Allocate *op) override { Type t = upgrade(op->type); if (t != op->type) { return Allocate::make(op->name, t, op->memory_type, mutate(op->extents), mutate(op->condition), mutate(op->body), mutate(op->new_expr), op->free_function, op->padding); } else { return IRMutator::visit(op); } } }; } // namespace Stmt storage_flattening(Stmt s, const vector &outputs, const map &env, const Target &target) { // The OpenGL backend requires loop mins to be zero'd at this point. 
s = zero_gpu_loop_mins(s); // Make an environment that makes it easier to figure out which // Function corresponds to a tuple component. foo.0, foo.1, foo.2, // all point to the function foo. map> tuple_env; for (const auto &p : env) { if (p.second.outputs() > 1) { for (int i = 0; i < p.second.outputs(); i++) { tuple_env[p.first + "." + std::to_string(i)] = {p.second, i}; } } else { tuple_env[p.first] = {p.second, 0}; } } s = FlattenDimensions(tuple_env, outputs, target).mutate(s); s = PromoteToMemoryType().mutate(s); return s; } } // namespace Internal } // namespace Halide Halide-17.0.1/src/StorageFlattening.h000066400000000000000000000014331456515664200173650ustar00rootroot00000000000000#ifndef HALIDE_STORAGE_FLATTENING_H #define HALIDE_STORAGE_FLATTENING_H /** \file * Defines the lowering pass that flattens multi-dimensional storage * into single-dimensional array access */ #include #include #include #include "Expr.h" namespace Halide { struct Target; namespace Internal { class Function; /** Take a statement with multi-dimensional Realize, Provide, and Call * nodes, and turn it into a statement with single-dimensional * Allocate, Store, and Load nodes respectively. */ Stmt storage_flattening(Stmt s, const std::vector &outputs, const std::map &env, const Target &target); } // namespace Internal } // namespace Halide #endif Halide-17.0.1/src/StorageFolding.cpp000066400000000000000000001344071456515664200172170ustar00rootroot00000000000000#include "StorageFolding.h" #include "Bounds.h" #include "CSE.h" #include "Debug.h" #include "ExprUsesVar.h" #include "IRMutator.h" #include "IROperator.h" #include "IRPrinter.h" #include "Monotonic.h" #include "Simplify.h" #include "Substitute.h" #include namespace Halide { namespace Internal { namespace { int64_t next_power_of_two(int64_t x) { return static_cast(1) << static_cast(std::ceil(std::log2(x))); } using std::map; using std::string; using std::vector; // Count the number of producers of a particular func. class CountProducers : public IRVisitor { const std::string &name; void visit(const ProducerConsumer *op) override { if (op->is_producer && (op->name == name)) { count++; } else { IRVisitor::visit(op); } } using IRVisitor::visit; public: int count = 0; CountProducers(const std::string &name) : name(name) { } }; int count_producers(const Stmt &in, const std::string &name) { CountProducers counter(name); in.accept(&counter); return counter.count; } // Fold the storage of a function in a particular dimension by a particular factor class FoldStorageOfFunction : public IRMutator { string func; int dim; Expr factor; string dynamic_footprint; using IRMutator::visit; Expr visit(const Call *op) override { Expr expr = IRMutator::visit(op); op = expr.as(); internal_assert(op); if (op->name == func && op->call_type == Call::Halide) { vector args = op->args; internal_assert(dim < (int)args.size()); args[dim] = is_const_one(factor) ? 0 : (args[dim] % factor); expr = Call::make(op->type, op->name, args, op->call_type, op->func, op->value_index, op->image, op->param); } else if (op->name == Call::buffer_crop) { Expr source = op->args[2]; const Variable *buf_var = source.as(); if (buf_var && starts_with(buf_var->name, func + ".") && ends_with(buf_var->name, ".buffer")) { // We are taking a crop of a folded buffer. For now // we'll just assert that the crop doesn't wrap // around, so that the crop doesn't need to be treated // as a folded buffer too. 
But to take the crop, we // need to use folded coordinates, and then restore // the non-folded min after the crop operation. // Pull out the expressions we need internal_assert(op->args.size() >= 5); Expr mins_arg = op->args[3]; Expr extents_arg = op->args[4]; const Call *mins_call = mins_arg.as(); const Call *extents_call = extents_arg.as(); internal_assert(mins_call && extents_call); vector mins = mins_call->args; const vector &extents = extents_call->args; internal_assert(dim < (int)mins.size() && dim < (int)extents.size()); Expr old_min = mins[dim]; Expr old_extent = extents[dim]; // Rewrite the crop args mins[dim] = old_min % factor; Expr new_mins = Call::make(type_of(), Call::make_struct, mins, Call::Intrinsic); vector new_args = op->args; new_args[3] = new_mins; expr = Call::make(op->type, op->name, new_args, op->call_type); // Inject the assertion Expr no_wraparound = mins[dim] + extents[dim] <= factor; Expr valid_min = old_min; if (!dynamic_footprint.empty()) { // If the footprint is being tracked dynamically, it's // not enough to just check we don't overlap a // fold. We also need to check the min against the // valid min. // TODO: dynamic footprint is no longer the min, and may be tracked separately on producer and consumer sides (head vs tail) valid_min = Load::make(Int(32), dynamic_footprint, 0, Buffer<>(), Parameter(), const_true(), ModulusRemainder()); Expr check = (old_min >= valid_min && (old_min + old_extent - 1) < valid_min + factor); no_wraparound = no_wraparound && check; } Expr error = Call::make(Int(32), "halide_error_bad_extern_fold", {Expr(func), Expr(dim), old_min, old_extent, valid_min, factor}, Call::Extern); expr = Call::make(op->type, Call::require, {no_wraparound, expr, error}, Call::Intrinsic); // Restore the correct min coordinate expr = Call::make(op->type, Call::buffer_set_bounds, {expr, dim, old_min, old_extent}, Call::Extern); } } return expr; } Stmt visit(const Provide *op) override { Stmt stmt = IRMutator::visit(op); op = stmt.as(); internal_assert(op); if (op->name == func) { vector args = op->args; args[dim] = is_const_one(factor) ? 0 : (args[dim] % factor); stmt = Provide::make(op->name, op->values, args, op->predicate); } return stmt; } public: FoldStorageOfFunction(string f, int d, Expr e, string p) : func(std::move(f)), dim(d), factor(std::move(e)), dynamic_footprint(std::move(p)) { } }; // Inject dynamic folding checks against a tracked live range. class InjectFoldingCheck : public IRMutator { Function func; string head, tail, loop_var; Expr sema_var; int dim; bool in_produce; const StorageDim &storage_dim; using IRMutator::visit; Stmt visit(const ProducerConsumer *op) override { if (op->name == func.name()) { Stmt body = op->body; if (op->is_producer) { if (func.has_extern_definition()) { // We'll update the valid min at the buffer_crop call. in_produce = true; body = mutate(op->body); } else { // Update valid range based on bounds written to. Box b = box_provided(body, func.name()); Expr old_leading_edge = Load::make(Int(32), head + "_next", 0, Buffer<>(), Parameter(), const_true(), ModulusRemainder()); internal_assert(!b.empty()); // Track the logical address range the memory // currently represents. 
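// Minimal sketch of the forward-folding bookkeeping this sets up, with a
// hypothetical fold_factor of 4: if the producer has previously written up
// to logical index `head`, the new leading edge is
//
//   head' = max(box_provided.max, head)
//
// and only indices strictly above head' - 4 are still backed by distinct
// storage, so the validity check below is box_provided.min > head' - 4.
// head' is then stored to both `head` and `head_next` so subsequent checks
// observe the advanced edge.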
Expr new_leading_edge; if (storage_dim.fold_forward) { new_leading_edge = max(b[dim].max, old_leading_edge); } else { new_leading_edge = min(b[dim].min, old_leading_edge); } string new_leading_edge_var_name = unique_name('t'); Expr new_leading_edge_var = Variable::make(Int(32), new_leading_edge_var_name); Stmt update_leading_edge = Store::make(head, new_leading_edge_var, 0, Parameter(), const_true(), ModulusRemainder()); Stmt update_next_leading_edge = Store::make(head + "_next", new_leading_edge_var, 0, Parameter(), const_true(), ModulusRemainder()); // Check the region being written to in this // iteration lies within the range of coordinates // currently represented. Expr fold_non_monotonic_error = Call::make(Int(32), "halide_error_bad_fold", {func.name(), storage_dim.var, loop_var}, Call::Extern); Expr in_valid_range; if (storage_dim.fold_forward) { in_valid_range = b[dim].min > new_leading_edge - storage_dim.fold_factor; } else { in_valid_range = b[dim].max < new_leading_edge + storage_dim.fold_factor; } Stmt check_in_valid_range = AssertStmt::make(in_valid_range, fold_non_monotonic_error); Expr extent = b[dim].max - b[dim].min + 1; // Separately check the extent for *this* loop iteration fits. Expr fold_too_small_error = Call::make(Int(32), "halide_error_fold_factor_too_small", {func.name(), storage_dim.var, storage_dim.fold_factor, loop_var, extent}, Call::Extern); Stmt check_extent = AssertStmt::make(extent <= storage_dim.fold_factor, fold_too_small_error); Stmt checks = Block::make({check_extent, check_in_valid_range, update_leading_edge, update_next_leading_edge}); if (func.schedule().async()) { Expr to_acquire; if (storage_dim.fold_forward) { to_acquire = new_leading_edge_var - old_leading_edge; } else { to_acquire = old_leading_edge - new_leading_edge_var; } body = Block::make(checks, body); body = Acquire::make(sema_var, to_acquire, body); body = LetStmt::make(new_leading_edge_var_name, new_leading_edge, body); } else { checks = LetStmt::make(new_leading_edge_var_name, new_leading_edge, checks); body = Block::make(checks, body); } } } else { // Check the accessed range against the valid range. Box b = box_required(body, func.name()); if (b.empty()) { // Must be used in an extern call (TODO: // assert this, TODO: What if it's used in an // extern call and native Halide). We'll // update the valid min at the buffer_crop // call. in_produce = false; body = mutate(op->body); } else { Expr leading_edge = Load::make(Int(32), tail + "_next", 0, Buffer<>(), Parameter(), const_true(), ModulusRemainder()); if (func.schedule().async()) { Expr new_leading_edge; if (storage_dim.fold_forward) { new_leading_edge = b[dim].min - 1 + storage_dim.fold_factor; } else { new_leading_edge = b[dim].max + 1 - storage_dim.fold_factor; } string new_leading_edge_name = unique_name('t'); Expr new_leading_edge_var = Variable::make(Int(32), new_leading_edge_name); Expr to_release; if (storage_dim.fold_forward) { to_release = new_leading_edge_var - leading_edge; } else { to_release = leading_edge - new_leading_edge_var; } Expr release_producer = Call::make(Int(32), "halide_semaphore_release", {sema_var, to_release}, Call::Extern); // The consumer is going to get its own forked copy of the footprint, so it needs to update it too. 
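// Sketch of the consumer-side accounting set up below (hypothetical
// numbers): when folding forward the consumer's new leading edge is
//
//   tail' = box_required.min - 1 + fold_factor
//
// i.e. the highest index whose storage slot the producer may now reuse,
// and (tail' - tail) counts are released on the semaphore. With a
// fold_factor of 4 and a required min that advances by 1 per iteration,
// one slot is handed back to the producer each time around the loop.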
Stmt update_leading_edge = Store::make(tail, new_leading_edge_var, 0, Parameter(), const_true(), ModulusRemainder()); update_leading_edge = Block::make(Store::make(tail + "_next", new_leading_edge_var, 0, Parameter(), const_true(), ModulusRemainder()), update_leading_edge); update_leading_edge = Block::make(Evaluate::make(release_producer), update_leading_edge); update_leading_edge = LetStmt::make(new_leading_edge_name, new_leading_edge, update_leading_edge); body = Block::make(update_leading_edge, body); } else { Expr check; if (storage_dim.fold_forward) { check = (b[dim].min > leading_edge - storage_dim.fold_factor && b[dim].max <= leading_edge); } else { check = (b[dim].max < leading_edge + storage_dim.fold_factor && b[dim].min >= leading_edge); } Expr bad_fold_error = Call::make(Int(32), "halide_error_bad_fold", {func.name(), storage_dim.var, loop_var}, Call::Extern); body = Block::make(AssertStmt::make(check, bad_fold_error), body); } } } return ProducerConsumer::make(op->name, op->is_producer, body); } else { return IRMutator::visit(op); } } Stmt visit(const LetStmt *op) override { if (starts_with(op->name, func.name() + ".") && ends_with(op->name, ".tmp_buffer")) { Stmt body = op->body; Expr buf = Variable::make(type_of(), op->name); if (in_produce) { // We're taking a crop of the buffer to act as an output // to an extern stage. Update the valid min or max // coordinate accordingly. Expr leading_edge; if (storage_dim.fold_forward) { leading_edge = Call::make(Int(32), Call::buffer_get_max, {buf, dim}, Call::Extern); } else { leading_edge = Call::make(Int(32), Call::buffer_get_min, {buf, dim}, Call::Extern); } Stmt update_leading_edge = Store::make(head, leading_edge, 0, Parameter(), const_true(), ModulusRemainder()); body = Block::make(update_leading_edge, body); // We don't need to make sure the min is moving // monotonically, because we can't do sliding window on // extern stages, so we don't have to worry about whether // we're preserving valid values from previous loop // iterations. if (func.schedule().async()) { Expr old_leading_edge = Load::make(Int(32), head, 0, Buffer<>(), Parameter(), const_true(), ModulusRemainder()); Expr to_acquire; if (storage_dim.fold_forward) { to_acquire = leading_edge - old_leading_edge; } else { to_acquire = old_leading_edge - leading_edge; } body = Acquire::make(sema_var, to_acquire, body); } } else { // We're taking a crop of the buffer to act as an input // to an extern stage. Update the valid min or max // coordinate accordingly. 
Expr leading_edge; if (storage_dim.fold_forward) { leading_edge = Call::make(Int(32), Call::buffer_get_min, {buf, dim}, Call::Extern) - 1 + storage_dim.fold_factor; } else { leading_edge = Call::make(Int(32), Call::buffer_get_max, {buf, dim}, Call::Extern) + 1 - storage_dim.fold_factor; } Stmt update_leading_edge = Store::make(tail, leading_edge, 0, Parameter(), const_true(), ModulusRemainder()); body = Block::make(update_leading_edge, body); if (func.schedule().async()) { Expr old_leading_edge = Load::make(Int(32), tail, 0, Buffer<>(), Parameter(), const_true(), ModulusRemainder()); Expr to_release; if (storage_dim.fold_forward) { to_release = leading_edge - old_leading_edge; } else { to_release = old_leading_edge - leading_edge; } Expr release_producer = Call::make(Int(32), "halide_semaphore_release", {sema_var, to_release}, Call::Extern); body = Block::make(Evaluate::make(release_producer), body); } } return LetStmt::make(op->name, op->value, body); } else { return LetStmt::make(op->name, op->value, mutate(op->body)); } } public: InjectFoldingCheck(Function func, string head, string tail, string loop_var, Expr sema_var, int dim, const StorageDim &storage_dim) : func(std::move(func)), head(std::move(head)), tail(std::move(tail)), loop_var(std::move(loop_var)), sema_var(std::move(sema_var)), dim(dim), storage_dim(storage_dim) { } }; struct Semaphore { string name; Expr var; Expr init; }; class HasExternConsumer : public IRVisitor { using IRVisitor::visit; void visit(const Variable *op) override { if (op->name == func + ".buffer") { result = true; } } const std::string &func; public: HasExternConsumer(const std::string &func) : func(func) { } bool result = false; }; class VectorAccessOfFoldedDim : public IRVisitor { using IRVisitor::visit; void visit(const Provide *op) override { if (op->name == func) { internal_assert(dim < (int)op->args.size()); if (expr_uses_vars(op->args[dim], vector_vars)) { result = true; } } else { IRVisitor::visit(op); } } void visit(const Call *op) override { if (op->name == func && op->call_type == Call::Halide) { internal_assert(dim < (int)op->args.size()); if (expr_uses_vars(op->args[dim], vector_vars)) { result = true; } } else { IRVisitor::visit(op); } } template void visit_let(const LetOrLetStmt *op) { op->value.accept(this); bool is_vec = expr_uses_vars(op->value, vector_vars); ScopedBinding<> bind(is_vec, vector_vars, op->name); op->body.accept(this); } void visit(const Let *op) override { visit_let(op); } void visit(const LetStmt *op) override { visit_let(op); } void visit(const For *op) override { ScopedBinding<> bind(op->for_type == ForType::Vectorized, vector_vars, op->name); IRVisitor::visit(op); } Scope<> vector_vars; const string &func; int dim; public: bool result = false; VectorAccessOfFoldedDim(const string &func, int dim) : func(func), dim(dim) { } }; // Attempt to fold the storage of a particular function in a statement class AttemptStorageFoldingOfFunction : public IRMutator { Function func; bool explicit_only; using IRMutator::visit; Stmt visit(const ProducerConsumer *op) override { if (op->name == func.name()) { // Can't proceed into the pipeline for this func return op; } else { return IRMutator::visit(op); } } bool found_sliding_marker = false; Expr visit(const Call *op) override { if (op->is_intrinsic(Call::sliding_window_marker)) { internal_assert(op->args.size() == 2); const StringImm *name = op->args[0].as(); internal_assert(name); if (name->value == func.name()) { found_sliding_marker = true; } } return op; } Stmt visit(const Block 
*op) override { Stmt first = mutate(op->first); if (found_sliding_marker) { return Block::make(first, op->rest); } else { return Block::make(first, mutate(op->rest)); } } Stmt visit(const For *op) override { if (op->for_type != ForType::Serial && op->for_type != ForType::Unrolled) { // We can't proceed into a parallel for loop. // TODO: If there's no overlap between the region touched // by the threads as this loop counter varies // (i.e. there's no cross-talk between threads), then it's // safe to proceed. return op; } Stmt stmt; Stmt body = op->body; Box provided = box_provided(body, func.name()); Box required = box_required(body, func.name()); // For storage folding, we don't care about conditional reads. required.used = Expr(); Box box = box_union(provided, required); Expr loop_var = Variable::make(Int(32), op->name); Expr loop_min = Variable::make(Int(32), op->name + ".loop_min"); Expr loop_max = Variable::make(Int(32), op->name + ".loop_max"); string dynamic_footprint; Scope bounds; bounds.push(op->name, Interval(op->min, simplify(op->min + op->extent - 1))); Scope steady_bounds; steady_bounds.push(op->name, Interval(simplify(op->min + 1), simplify(op->min + op->extent - 1))); HasExternConsumer has_extern_consumer(func.name()); body.accept(&has_extern_consumer); // Try each dimension in turn from outermost in for (size_t i = box.size(); i > 0; i--) { int dim = (int)(i - 1); if (!box[dim].is_bounded()) { continue; } Expr min = simplify(common_subexpression_elimination(box[dim].min)); Expr max = simplify(common_subexpression_elimination(box[dim].max)); if (is_const(min) || is_const(max)) { debug(3) << "\nNot considering folding " << func.name() << " over for loop over " << op->name << " dimension " << i - 1 << "\n" << " because the min or max are constants." << "Min: " << min << "\n" << "Max: " << max << "\n"; continue; } Expr min_provided, max_provided, min_required, max_required; if (func.schedule().async() && !explicit_only) { if (!provided.empty()) { min_provided = simplify(provided[dim].min); max_provided = simplify(provided[dim].max); } if (!required.empty()) { min_required = simplify(required[dim].min); max_required = simplify(required[dim].max); } } string sema_name = func.name() + ".folding_semaphore." + unique_name('_'); Expr sema_var = Variable::make(type_of(), sema_name); // Consider the initial iteration and steady state // separately for all these proofs. Expr loop_var = Variable::make(Int(32), op->name); Expr steady_state = (op->min < loop_var); Expr min_steady = simplify(substitute(steady_state, const_true(), min), true, steady_bounds); Expr max_steady = simplify(substitute(steady_state, const_true(), max), true, steady_bounds); Expr min_initial = simplify(substitute(steady_state, const_false(), min), true, bounds); Expr max_initial = simplify(substitute(steady_state, const_false(), max), true, bounds); Expr extent_initial = simplify(substitute(loop_var, op->min, max_initial - min_initial + 1), true, bounds); Expr extent_steady = simplify(max_steady - min_steady + 1, true, steady_bounds); Expr extent = Max::make(extent_initial, extent_steady); extent = simplify(common_subexpression_elimination(extent), true, bounds); // Find the StorageDim corresponding to dim. 
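// Worked example of the extent analysis above (an illustrative pipeline,
// not one appearing in this file):
//
//   g(x) = f(x - 1) + f(x);
//   f.store_root().compute_at(g, x);
//
// The box of f touched at iteration x is [x - 1, x]: its min and max are
// both monotonically increasing in the loop variable, and its extent is 2
// on the initial and steady-state iterations alike, so the automatic path
// below folds f's storage by the next power of two, i.e. a factor of 2.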
const std::vector &storage_dims = func.schedule().storage_dims(); auto storage_dim_i = std::find_if(storage_dims.begin(), storage_dims.end(), [&](const StorageDim &i) { return i.var == func.args()[dim]; }); internal_assert(storage_dim_i != storage_dims.end()); const StorageDim &storage_dim = *storage_dim_i; Expr explicit_factor; if (!is_pure(min) || !is_pure(max) || has_extern_consumer.result || expr_uses_var(min, op->name) || expr_uses_var(max, op->name)) { // We only use the explicit fold factor if the fold is // relevant for this loop. If the fold isn't relevant // for this loop, the added asserts will be too // conservative. explicit_factor = storage_dim.fold_factor; } debug(3) << "\nConsidering folding " << func.name() << " over for loop over " << op->name << " dimension " << i - 1 << "\n" << "Min: " << min << "\n" << "Max: " << max << "\n" << "Extent: " << extent << "\n" << "explicit_factor: " << explicit_factor << "\n"; // First, attempt to detect if the loop is monotonically // increasing or decreasing (if we allow automatic folding). bool can_fold_forwards = false, can_fold_backwards = false; if (!explicit_only) { // We can't clobber data that will be read later. If // async, the producer can't un-release slots in the // circular buffer. can_fold_forwards = (is_monotonic(min, op->name) == Monotonic::Increasing); can_fold_backwards = (is_monotonic(max, op->name) == Monotonic::Decreasing); if (func.schedule().async()) { // Our semaphore acquire primitive can't take // negative values, so we can't un-acquire slots // in the circular buffer. can_fold_forwards &= (is_monotonic(max_provided, op->name) == Monotonic::Increasing); can_fold_backwards &= (is_monotonic(min_provided, op->name) == Monotonic::Decreasing); // We need to be able to analyze the required footprint to know how much to release can_fold_forwards &= min_required.defined(); can_fold_backwards &= max_required.defined(); } } // Uncomment to pretend that static analysis always fails (for testing) // can_fold_forwards = can_fold_backwards = false; if (!can_fold_forwards && !can_fold_backwards) { if (explicit_factor.defined()) { // If we didn't find a monotonic dimension, and we // have an explicit fold factor, we need to // dynamically check that the min/max do in fact // monotonically increase/decrease. We'll allocate // some stack space to store the valid footprint, // update it outside produce nodes, and check it // outside consume nodes. string head, tail; if (func.schedule().async()) { // If we're async, we need to keep a separate // counter for the producer and consumer. They // are coupled by a semaphore. The counter // represents the max index the producer may // write to. The invariant is that the // semaphore count is the difference between // the counters. So... // // when folding forwards, semaphore == head - tail // when folding backwards, semaphore == tail - head // // We'll initialize to head = tail, and // semaphore = 0. Every time the producer or // consumer wants to move the counter, it must // also acquire or release the semaphore to // prevent them from diverging too far. dynamic_footprint = func.name() + ".folding_semaphore." + op->name + unique_name('_'); head = dynamic_footprint + ".head"; tail = dynamic_footprint + ".tail"; } else { dynamic_footprint = func.name() + "." 
+ op->name + unique_name('_') + ".head"; head = tail = dynamic_footprint; } body = InjectFoldingCheck(func, head, tail, op->name, sema_var, dim, storage_dim) .mutate(body); if (storage_dim.fold_forward) { can_fold_forwards = true; } else { can_fold_backwards = true; } } else { // Can't do much with this dimension if (!explicit_only) { debug(3) << "Not folding because loop min or max not monotonic in the loop variable\n" << "min_initial = " << min_initial << "\n" << "min_steady = " << min_steady << "\n" << "max_initial = " << max_initial << "\n" << "max_steady = " << max_steady << "\n"; } else { debug(3) << "Not folding because there is no explicit storage folding factor\n"; } continue; } } internal_assert(can_fold_forwards || can_fold_backwards); Expr factor; if (explicit_factor.defined()) { if (dynamic_footprint.empty() && !func.schedule().async()) { // We were able to prove monotonicity // statically, but we may need a runtime // assertion for maximum extent. In many cases // it will simplify away. For async schedules // it gets dynamically tracked anyway. Expr error = Call::make(Int(32), "halide_error_fold_factor_too_small", {func.name(), storage_dim.var, explicit_factor, op->name, extent}, Call::Extern); body = Block::make(AssertStmt::make(extent <= explicit_factor, error), body); } factor = explicit_factor; } else { // The max of the extent over all values of the loop variable must be a constant Scope scope; scope.push(op->name, Interval(loop_min, loop_max)); Expr max_extent = find_constant_bound(extent, Direction::Upper, scope); scope.pop(op->name); const int max_fold = 1024; const int64_t *const_max_extent = as_const_int(max_extent); if (const_max_extent && *const_max_extent <= max_fold) { factor = static_cast(next_power_of_two(*const_max_extent)); } else { // Try a little harder to find a bounding power of two int e = max_fold * 2; bool success = false; while (e > 0 && can_prove(extent <= e / 2)) { success = true; e /= 2; } if (success) { factor = e; } else { debug(3) << "Not folding because extent not bounded by a constant not greater than " << max_fold << "\n" << "extent = " << extent << "\n" << "max extent = " << max_extent << "\n"; // Try the next dimension continue; } } } internal_assert(factor.defined()); if (!explicit_factor.defined()) { VectorAccessOfFoldedDim vector_access_of_folded_dim{func.name(), dim}; body.accept(&vector_access_of_folded_dim); if (vector_access_of_folded_dim.result) { user_warning << "Not folding Func " << func.name() << " along dimension " << func.args()[dim] << " because there is vectorized access to that Func in that dimension and " << "storage folding was not explicitly requested in the schedule. In previous " << "versions of Halide this would have folded with factor " << factor << ". To restore the old behavior add " << func.name() << ".fold_storage(" << func.args()[dim] << ", " << factor << ") to your schedule.\n"; // Try the next dimension continue; } } debug(3) << "Proceeding with factor " << factor << "\n"; Fold fold = {(int)i - 1, factor}; dims_folded.push_back(fold); { string head; if (!dynamic_footprint.empty() && func.schedule().async()) { head = dynamic_footprint + ".head"; } else { head = dynamic_footprint; } body = FoldStorageOfFunction(func.name(), (int)i - 1, factor, head).mutate(body); } // If the producer is async, it can run ahead by // some amount controlled by a semaphore. 
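// Sketch of the static semaphore accounting used below, for a hypothetical
// forward fold where max_provided = x and min_required = x - 1:
//
//   to_acquire = max_provided(x) - max_provided(x - 1) = 1   // slots newly claimed
//   to_release = min_required(x + 1) - min_required(x) = 1   // slots retired
//
// The semaphore is initialized to the fold factor, the first iteration
// acquires the whole shared extent and the last iteration releases it, so
// the producer never gets more than one fold's worth of entries ahead.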
if (func.schedule().async()) { Semaphore sema; sema.name = sema_name; sema.var = sema_var; sema.init = 0; if (dynamic_footprint.empty()) { // We are going to do the sem acquires and releases using static analysis of the boxes accessed. sema.init = factor; // Do the analysis of how much to acquire and release statically Expr to_acquire, to_release; if (can_fold_forwards) { Expr max_provided_prev = substitute(op->name, loop_var - 1, max_provided); Expr min_required_next = substitute(op->name, loop_var + 1, min_required); to_acquire = max_provided - max_provided_prev; // This is the first time we use these entries to_release = min_required_next - min_required; // This is the last time we use these entries } else { internal_assert(can_fold_backwards); Expr min_provided_prev = substitute(op->name, loop_var - 1, min_provided); Expr max_required_next = substitute(op->name, loop_var + 1, max_required); to_acquire = min_provided_prev - min_provided; // This is the first time we use these entries to_release = max_required - max_required_next; // This is the last time we use these entries } // On the first iteration, we need to acquire the extent of the region shared // between the producer and consumer, and we need to release it on the last // iteration. to_acquire = select(loop_var > loop_min, to_acquire, extent); to_release = select(loop_var < loop_max, to_release, extent); // We may need dynamic assertions that a positive // amount of the semaphore is acquired/released, // and that the semaphore is initialized to a // positive value. If we are able to prove it, // these checks will simplify away. string to_release_name = unique_name('t'); Expr to_release_var = Variable::make(Int(32), to_release_name); string to_acquire_name = unique_name('t'); Expr to_acquire_var = Variable::make(Int(32), to_acquire_name); Expr bad_fold_error = Call::make(Int(32), "halide_error_bad_fold", {func.name(), storage_dim.var, op->name}, Call::Extern); Expr release_producer = Call::make(Int(32), "halide_semaphore_release", {sema.var, to_release_var}, Call::Extern); Stmt release = Evaluate::make(release_producer); Stmt check_release = AssertStmt::make(to_release_var >= 0 && to_release <= factor, bad_fold_error); release = Block::make(check_release, release); release = LetStmt::make(to_release_name, to_release, release); Stmt check_acquire = AssertStmt::make(to_acquire_var >= 0 && to_acquire_var <= factor, bad_fold_error); body = Block::make(body, release); body = Acquire::make(sema.var, to_acquire_var, body); body = Block::make(check_acquire, body); body = LetStmt::make(to_acquire_name, to_acquire, body); } else { // We injected runtime tracking and semaphore logic already } dims_folded.back().semaphore = sema; } if (!dynamic_footprint.empty()) { if (func.schedule().async()) { dims_folded.back().head = dynamic_footprint + ".head"; dims_folded.back().tail = dynamic_footprint + ".tail"; } else { dims_folded.back().head = dynamic_footprint; dims_folded.back().tail.clear(); } dims_folded.back().fold_forward = storage_dim.fold_forward; } Expr min_next = substitute(op->name, loop_var + 1, min); if (can_prove(max < min_next)) { // There's no overlapping usage between loop // iterations, so we can continue to search // for further folding opportunities // recursively. 
} else if (!body.same_as(op->body)) { stmt = For::make(op->name, op->min, op->extent, op->for_type, op->partition_policy, op->device_api, body); break; } else { stmt = op; debug(3) << "Not folding because loop min or max not monotonic in the loop variable\n" << "min = " << min << "\n" << "max = " << max << "\n"; break; } } // Attempt to fold an inner loop. This will bail out if it encounters a // ProducerConsumer node for the func, or if it hits a sliding window // marker. body = mutate(body); if (body.same_as(op->body)) { stmt = op; } else { stmt = For::make(op->name, op->min, op->extent, op->for_type, op->partition_policy, op->device_api, body); } if (func.schedule().async() && !dynamic_footprint.empty()) { // Step the counters backwards over the entire extent of // the realization, in case we're in an inner loop and are // going to run this loop again with the same // semaphore. Our invariant is that the difference between // the two counters is the semaphore. // // Doing this instead of synchronizing and resetting the // counters and semaphores lets producers advance to the // next scanline while a consumer is still on the last few // pixels of the previous scanline. Expr head = Load::make(Int(32), dynamic_footprint + ".head", 0, Buffer<>(), Parameter(), const_true(), ModulusRemainder()); Expr tail = Load::make(Int(32), dynamic_footprint + ".tail", 0, Buffer<>(), Parameter(), const_true(), ModulusRemainder()); Expr step = Variable::make(Int(32), func.name() + ".extent." + std::to_string(dims_folded.back().dim)) + dims_folded.back().factor; Stmt reset_head = Store::make(dynamic_footprint + ".head_next", head - step, 0, Parameter(), const_true(), ModulusRemainder()); Stmt reset_tail = Store::make(dynamic_footprint + ".tail_next", tail - step, 0, Parameter(), const_true(), ModulusRemainder()); stmt = Block::make({stmt, reset_head, reset_tail}); } return stmt; } public: struct Fold { int dim; Expr factor; Semaphore semaphore; string head, tail; bool fold_forward; }; vector dims_folded; AttemptStorageFoldingOfFunction(Function f, bool explicit_only) : func(std::move(f)), explicit_only(explicit_only) { } }; // Look for opportunities for storage folding in a statement class StorageFolding : public IRMutator { const map &env; using IRMutator::visit; Stmt visit(const Realize *op) override { Stmt body = mutate(op->body); // Get the function associated with this realization, which // contains the explicit fold directives from the schedule. auto func_it = env.find(op->name); Function func = func_it != env.end() ? func_it->second : Function(); // Don't attempt automatic storage folding if there is // more than one produce node for this func. 
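// Folding can still happen in that case when the schedule asks for it
// explicitly. A hypothetical schedule, shown only to illustrate the API:
//
//   f.store_root().compute_at(g, y).fold_storage(y, 2);
//
// supplies StorageDim::fold_factor = 2, and AttemptStorageFoldingOfFunction
// then uses the explicit factor, falling back to the runtime checks
// injected by InjectFoldingCheck when monotonicity can't be proven.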
bool explicit_only = count_producers(body, op->name) != 1; AttemptStorageFoldingOfFunction folder(func, explicit_only); if (explicit_only) { debug(3) << "Attempting to fold " << op->name << " explicitly\n"; } else { debug(3) << "Attempting to fold " << op->name << " automatically or explicitly\n"; } body = folder.mutate(body); if (body.same_as(op->body)) { return op; } else if (folder.dims_folded.empty()) { return Realize::make(op->name, op->types, op->memory_type, op->bounds, op->condition, body); } else { Region bounds = op->bounds; // Collapse down the extent in the folded dimension for (const auto &dim : folder.dims_folded) { int d = dim.dim; Expr f = dim.factor; internal_assert(d >= 0 && d < (int)bounds.size()); bounds[d] = Range(0, f); } Stmt stmt = Realize::make(op->name, op->types, op->memory_type, bounds, op->condition, body); // Each fold may have an associated semaphore that needs initialization, along with some counters for (const auto &fold : folder.dims_folded) { auto sema = fold.semaphore; if (sema.var.defined()) { Expr sema_space = Call::make(type_of(), "halide_make_semaphore", {sema.init}, Call::Extern); stmt = LetStmt::make(sema.name, sema_space, stmt); } Expr init; if (fold.fold_forward) { init = op->bounds[fold.dim].min; } else { init = op->bounds[fold.dim].min + op->bounds[fold.dim].extent - 1; } if (!fold.head.empty()) { stmt = Block::make(Store::make(fold.head + "_next", init, 0, Parameter(), const_true(), ModulusRemainder()), stmt); stmt = Allocate::make(fold.head + "_next", Int(32), MemoryType::Stack, {}, const_true(), stmt); stmt = Block::make(Store::make(fold.head, init, 0, Parameter(), const_true(), ModulusRemainder()), stmt); stmt = Allocate::make(fold.head, Int(32), MemoryType::Stack, {}, const_true(), stmt); } if (!fold.tail.empty()) { internal_assert(func.schedule().async()) << "Expected a single counter for synchronous folding"; stmt = Block::make(Store::make(fold.tail + "_next", init, 0, Parameter(), const_true(), ModulusRemainder()), stmt); stmt = Allocate::make(fold.tail + "_next", Int(32), MemoryType::Stack, {}, const_true(), stmt); stmt = Block::make(Store::make(fold.tail, init, 0, Parameter(), const_true(), ModulusRemainder()), stmt); stmt = Allocate::make(fold.tail, Int(32), MemoryType::Stack, {}, const_true(), stmt); } } return stmt; } } public: StorageFolding(const map &env) : env(env) { } }; class RemoveSlidingWindowMarkers : public IRMutator { using IRMutator::visit; Expr visit(const Call *op) override { if (op->is_intrinsic(Call::sliding_window_marker)) { return make_zero(op->type); } else { return IRMutator::visit(op); } } }; } // namespace Stmt storage_folding(const Stmt &s, const std::map &env) { Stmt stmt = StorageFolding(env).mutate(s); stmt = RemoveSlidingWindowMarkers().mutate(stmt); return stmt; } } // namespace Internal } // namespace Halide Halide-17.0.1/src/StorageFolding.h000066400000000000000000000015011456515664200166500ustar00rootroot00000000000000#ifndef HALIDE_STORAGE_FOLDING_H #define HALIDE_STORAGE_FOLDING_H /** \file * Defines the lowering optimization pass that reduces large buffers * down to smaller circular buffers when possible */ #include #include #include "Expr.h" namespace Halide { namespace Internal { class Function; /** Fold storage of functions if possible. This means reducing one of * the dimensions module something for the purpose of storage, if we * can prove that this is safe to do. E.g consider: * \code f(x) = ... 
g(x) = f(x-1) + f(x) f.store_root().compute_at(g, x); \endcode * * We can store f as a circular buffer of size two, instead of * allocating space for all of it. */ Stmt storage_folding(const Stmt &s, const std::map &env); } // namespace Internal } // namespace Halide #endif Halide-17.0.1/src/StrictifyFloat.cpp000066400000000000000000000040001456515664200172370ustar00rootroot00000000000000#include "StrictifyFloat.h" #include "Function.h" #include "IRMutator.h" #include "IROperator.h" namespace Halide { namespace Internal { namespace { class StrictifyFloat : public IRMutator { enum Strictness { FastMath, StrictFloat, } strictness; using IRMutator::visit; Expr visit(const Call *call) override { Strictness new_strictness = strictness; if (call->is_intrinsic(Call::strict_float)) { new_strictness = StrictFloat; any_strict_float |= true; } ScopedValue save_strictness(strictness, new_strictness); return IRMutator::visit(call); } using IRMutator::mutate; Expr mutate(const Expr &expr) override { if (!expr.defined()) { return expr; } Expr e = IRMutator::mutate(expr); if (e.type().is_float()) { switch (strictness) { case FastMath: return e; case StrictFloat: return strict_float(e); } } return e; } public: enum StrictnessMode { Allowed, Forced }; bool any_strict_float{false}; StrictifyFloat(StrictnessMode mode) : strictness((mode == Forced) ? StrictFloat : FastMath) { any_strict_float |= (mode == Forced); } }; } // namespace bool strictify_float(std::map &env, const Target &t) { bool any_strict_float = false; for (auto &iter : env) { Function &func = iter.second; StrictifyFloat::StrictnessMode mode = StrictifyFloat::Allowed; if (t.has_feature(Target::StrictFloat)) { mode = StrictifyFloat::Forced; } // TODO(zalman): Some targets don't allow strict float and we can provide errors for these. StrictifyFloat strictify(mode); func.mutate(&strictify); any_strict_float |= strictify.any_strict_float; } return any_strict_float; } } // namespace Internal } // namespace Halide Halide-17.0.1/src/StrictifyFloat.h000066400000000000000000000015101456515664200167070ustar00rootroot00000000000000#ifndef HALIDE_STRICTIFY_FLOAT_H #define HALIDE_STRICTIFY_FLOAT_H /** \file * Defines a lowering pass to make all floating-point strict for all top-level Exprs. */ #include #include namespace Halide { struct Target; namespace Internal { class Function; /** Propagate strict_float intrinisics such that they immediately wrap * all floating-point expressions. This makes the IR nodes context * independent. If the Target::StrictFloat flag is specified in * target, starts in strict_float mode so all floating-point type * Exprs in the compilation will be marked with strict_float. Returns * whether any strict floating-point is used in any function in the * passed in env. 
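 *
 * A hypothetical use, for illustration:
 *
 * \code
 * Func f;
 * Var x;
 * f(x) = strict_float(sqrt(cast<float>(x)) * 0.5f);
 * \endcode
 *
 * Inside the strict_float call every floating-point sub-expression gets
 * wrapped, so later passes must preserve the written evaluation order
 * rather than applying fast-math style rewrites.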
*/ bool strictify_float(std::map &env, const Target &t); } // namespace Internal } // namespace Halide #endif Halide-17.0.1/src/Substitute.cpp000066400000000000000000000142271456515664200164600ustar00rootroot00000000000000#include "Substitute.h" #include "IREquality.h" #include "IRMutator.h" #include "Scope.h" namespace Halide { namespace Internal { using std::map; using std::string; namespace { class Substitute : public IRMutator { const map &replace; Scope<> hidden; Expr find_replacement(const string &s) { map::const_iterator iter = replace.find(s); if (iter != replace.end() && !hidden.contains(s)) { return iter->second; } else { return Expr(); } } public: Substitute(const map &m) : replace(m) { } using IRMutator::visit; Expr visit(const Variable *v) override { Expr r = find_replacement(v->name); if (r.defined()) { return r; } else { return v; } } template auto visit_let(const T *op) -> decltype(op->body) { decltype(op->body) orig = op; struct Frame { const T *op; Expr new_value; ScopedBinding<> bind; }; std::vector frames; decltype(op->body) body; bool values_unchanged = true; do { Expr new_value = mutate(op->value); values_unchanged &= new_value.same_as(op->value); frames.push_back(Frame{op, std::move(new_value), ScopedBinding<>(hidden, op->name)}); body = op->body; op = body.template as(); } while (op); auto new_body = mutate(body); if (values_unchanged && new_body.same_as(body)) { return orig; } else { for (auto it = frames.rbegin(); it != frames.rend(); it++) { new_body = T::make(it->op->name, it->new_value, new_body); } return new_body; } } Expr visit(const Let *op) override { return visit_let(op); } Stmt visit(const LetStmt *op) override { return visit_let(op); } Stmt visit(const For *op) override { Expr new_min = mutate(op->min); Expr new_extent = mutate(op->extent); hidden.push(op->name); Stmt new_body = mutate(op->body); hidden.pop(op->name); if (new_min.same_as(op->min) && new_extent.same_as(op->extent) && new_body.same_as(op->body)) { return op; } else { return For::make(op->name, new_min, new_extent, op->for_type, op->partition_policy, op->device_api, new_body); } } }; } // namespace Expr substitute(const string &name, const Expr &replacement, const Expr &expr) { map m; m[name] = replacement; Substitute s(m); return s.mutate(expr); } Stmt substitute(const string &name, const Expr &replacement, const Stmt &stmt) { map m; m[name] = replacement; Substitute s(m); return s.mutate(stmt); } Expr substitute(const map &m, const Expr &expr) { Substitute s(m); return s.mutate(expr); } Stmt substitute(const map &m, const Stmt &stmt) { Substitute s(m); return s.mutate(stmt); } namespace { class SubstituteExpr : public IRMutator { public: Expr find, replacement; using IRMutator::mutate; Expr mutate(const Expr &e) override { if (equal(e, find)) { return replacement; } else { return IRMutator::mutate(e); } } }; } // namespace Expr substitute(const Expr &find, const Expr &replacement, const Expr &expr) { SubstituteExpr s; s.find = find; s.replacement = replacement; return s.mutate(expr); } Stmt substitute(const Expr &find, const Expr &replacement, const Stmt &stmt) { SubstituteExpr s; s.find = find; s.replacement = replacement; return s.mutate(stmt); } namespace { /** Substitute an expr for a var in a graph. 
*/ class GraphSubstitute : public IRGraphMutator { string var; Expr value; using IRGraphMutator::visit; Expr visit(const Variable *op) override { if (op->name == var) { return value; } else { return op; } } Expr visit(const Let *op) override { Expr new_value = mutate(op->value); if (op->name == var) { return Let::make(op->name, new_value, op->body); } else { return Let::make(op->name, new_value, mutate(op->body)); } } public: GraphSubstitute(const string &var, const Expr &value) : var(var), value(value) { } }; /** Substitute an Expr for another Expr in a graph. Unlike substitute, * this only checks for shallow equality. */ class GraphSubstituteExpr : public IRGraphMutator { Expr find, replace; public: using IRGraphMutator::mutate; Expr mutate(const Expr &e) override { if (e.same_as(find)) { return replace; } else { return IRGraphMutator::mutate(e); } } GraphSubstituteExpr(const Expr &find, const Expr &replace) : find(find), replace(replace) { } }; } // namespace Expr graph_substitute(const string &name, const Expr &replacement, const Expr &expr) { return GraphSubstitute(name, replacement).mutate(expr); } Stmt graph_substitute(const string &name, const Expr &replacement, const Stmt &stmt) { return GraphSubstitute(name, replacement).mutate(stmt); } Expr graph_substitute(const Expr &find, const Expr &replacement, const Expr &expr) { return GraphSubstituteExpr(find, replacement).mutate(expr); } Stmt graph_substitute(const Expr &find, const Expr &replacement, const Stmt &stmt) { return GraphSubstituteExpr(find, replacement).mutate(stmt); } namespace { class SubstituteInAllLets : public IRGraphMutator { using IRGraphMutator::visit; Expr visit(const Let *op) override { Expr value = mutate(op->value); Expr body = mutate(op->body); return graph_substitute(op->name, value, body); } }; } // namespace Expr substitute_in_all_lets(const Expr &expr) { return SubstituteInAllLets().mutate(expr); } Stmt substitute_in_all_lets(const Stmt &stmt) { return SubstituteInAllLets().mutate(stmt); } } // namespace Internal } // namespace Halide Halide-17.0.1/src/Substitute.h000066400000000000000000000043331456515664200161220ustar00rootroot00000000000000#ifndef HALIDE_SUBSTITUTE_H #define HALIDE_SUBSTITUTE_H /** \file * * Defines methods for substituting out variables in expressions and * statements. */ #include #include "Expr.h" namespace Halide { namespace Internal { /** Substitute variables with the given name with the replacement * expression within expr. This is a dangerous thing to do if variable * names have not been uniquified. While it won't traverse inside let * statements with the same name as the first argument, moving a piece * of syntax around can change its meaning, because it can cross lets * that redefine variable names that it includes references to. */ Expr substitute(const std::string &name, const Expr &replacement, const Expr &expr); /** Substitute variables with the given name with the replacement * expression within stmt. */ Stmt substitute(const std::string &name, const Expr &replacement, const Stmt &stmt); /** Substitute variables with names in the map. */ // @{ Expr substitute(const std::map &replacements, const Expr &expr); Stmt substitute(const std::map &replacements, const Stmt &stmt); // @} /** Substitute expressions for other expressions. */ // @{ Expr substitute(const Expr &find, const Expr &replacement, const Expr &expr); Stmt substitute(const Expr &find, const Expr &replacement, const Stmt &stmt); // @} /** Substitutions where the IR may be a general graph (and not just a * DAG). 
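 *
 * Illustrative sketch (hypothetical expressions):
 *
 * \code
 * Expr x = Variable::make(Int(32), "x");
 * Expr shared = x * x;
 * Expr e = shared + shared;               // the same node referenced twice
 * Expr r = graph_substitute("x", Expr(7), e);
 * \endcode
 *
 * The mutation of the shared node is computed once and reused, so heavily
 * shared graphs are visited per node rather than per reference.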
*/ // @{ Expr graph_substitute(const std::string &name, const Expr &replacement, const Expr &expr); Stmt graph_substitute(const std::string &name, const Expr &replacement, const Stmt &stmt); Expr graph_substitute(const Expr &find, const Expr &replacement, const Expr &expr); Stmt graph_substitute(const Expr &find, const Expr &replacement, const Stmt &stmt); // @} /** Substitute in all let Exprs in a piece of IR. Doesn't substitute * in let stmts, as this may change the meaning of the IR (e.g. by * moving a load after a store). Produces graphs of IR, so don't use * non-graph-aware visitors or mutators on it until you've CSE'd the * result. */ // @{ Expr substitute_in_all_lets(const Expr &expr); Stmt substitute_in_all_lets(const Stmt &stmt); // @} } // namespace Internal } // namespace Halide #endif Halide-17.0.1/src/Target.cpp000066400000000000000000001474301456515664200155360ustar00rootroot00000000000000#include #include #include #include "Target.h" #include "Debug.h" #include "DeviceInterface.h" #include "Error.h" #include "Util.h" #include "WasmExecutor.h" #if defined(__powerpc__) && (defined(__FreeBSD__) || defined(__linux__)) #if defined(__FreeBSD__) #include #include #endif // This uses elf.h and must be included after "LLVM_Headers.h", which // uses llvm/support/Elf.h. #include #endif #ifdef _MSC_VER #include #endif // _MSC_VER namespace Halide { using std::string; using std::vector; namespace { #ifdef _MSC_VER static void cpuid(int info[4], int infoType, int extra) { __cpuidex(info, infoType, extra); } #else #if defined(__x86_64__) || defined(__i386__) // CPU feature detection code taken from ispc // (https://github.com/ispc/ispc/blob/master/builtins/dispatch.ll) void cpuid(int info[4], int infoType, int extra) { __asm__ __volatile__( "cpuid \n\t" : "=a"(info[0]), "=b"(info[1]), "=c"(info[2]), "=d"(info[3]) : "0"(infoType), "2"(extra)); } #endif #endif #if defined(__x86_64__) || defined(__i386__) || defined(_MSC_VER) enum class VendorSignatures { Unknown, GenuineIntel, AuthenticAMD, }; VendorSignatures get_vendor_signature() { int info[4]; cpuid(info, 0, 0); if (info[0] < 1) { return VendorSignatures::Unknown; } // "Genu ineI ntel" if (info[1] == 0x756e6547 && info[3] == 0x49656e69 && info[2] == 0x6c65746e) { return VendorSignatures::GenuineIntel; } // "Auth enti cAMD" if (info[1] == 0x68747541 && info[3] == 0x69746e65 && info[2] == 0x444d4163) { return VendorSignatures::AuthenticAMD; } return VendorSignatures::Unknown; } void detect_family_and_model(int info0, unsigned &family, unsigned &model) { family = (info0 >> 8) & 0xF; // Bits 8..11 model = (info0 >> 4) & 0xF; // Bits 4..7 if (family == 0x6 || family == 0xF) { if (family == 0xF) { // Examine extended family ID if family ID is 0xF. family += (info0 >> 20) & 0xFf; // Bits 20..27 } // Examine extended model ID if family ID is 0x6 or 0xF. 
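// Worked example with a representative raw EAX value (for illustration
// only): info0 = 0x00A20F10 has base family 0xF and extended family 0x0A,
// giving family 0x19; base model 0x1 and extended model 0x2 give model
// (0x2 << 4) | 0x1 = 0x21, which get_amd_processor() below maps to
// Target::Processor::ZnVer3.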
model += ((info0 >> 16) & 0xF) << 4; // Bits 16..19 } } Target::Processor get_amd_processor(unsigned family, unsigned model, bool have_sse3) { switch (family) { case 0xF: // AMD Family 0Fh if (have_sse3) { return Target::Processor::K8_SSE3; // Hammer (modern, with SSE3) } return Target::Processor::K8; // Hammer (original, without SSE3) case 0x10: // AMD Family 10h return Target::Processor::AMDFam10; // Barcelona case 0x14: // AMD Family 14h return Target::Processor::BtVer1; // Bobcat case 0x15: // AMD Family 15h if (model >= 0x60 && model <= 0x7f) { return Target::Processor::BdVer4; // 60h-7Fh: Excavator } if (model >= 0x30 && model <= 0x3f) { return Target::Processor::BdVer3; // 30h-3Fh: Steamroller } if ((model >= 0x10 && model <= 0x1f) || model == 0x02) { return Target::Processor::BdVer2; // 02h, 10h-1Fh: Piledriver } if (model <= 0x0f) { return Target::Processor::BdVer1; // 00h-0Fh: Bulldozer } break; case 0x16: // AMD Family 16h return Target::Processor::BtVer2; // Jaguar case 0x17: // AMD Family 17h if ((model >= 0x30 && model <= 0x3f) || model == 0x71) { return Target::Processor::ZnVer2; // 30h-3Fh, 71h: Zen2 } if (model <= 0x0f) { return Target::Processor::ZnVer1; // 00h-0Fh: Zen1 } break; case 0x19: // AMD Family 19h if ((model & 0xf0) == 0 || model == 0x21) { return Target::Processor::ZnVer3; // 00h-0Fh, 21h: Zen3 } else if (model == 0x61) { return Target::Processor::ZnVer4; // 61h: Zen4 } break; default: break; // Unknown AMD CPU. } return Target::Processor::ProcessorGeneric; } #endif // defined(__x86_64__) || defined(__i386__) || defined(_MSC_VER) Target calculate_host_target() { Target::OS os = Target::OSUnknown; #ifdef __linux__ os = Target::Linux; #endif #ifdef _WIN32 os = Target::Windows; #endif #ifdef __APPLE__ os = Target::OSX; #endif bool use_64_bits = (sizeof(size_t) == 8); int bits = use_64_bits ? 64 : 32; int vector_bits = 0; Target::Processor processor = Target::Processor::ProcessorGeneric; std::vector initial_features; #if __riscv Target::Arch arch = Target::RISCV; #else #if defined(__arm__) || defined(__aarch64__) Target::Arch arch = Target::ARM; #else #if defined(__powerpc__) && (defined(__FreeBSD__) || defined(__linux__)) Target::Arch arch = Target::POWERPC; #if defined(__linux__) unsigned long hwcap = getauxval(AT_HWCAP); unsigned long hwcap2 = getauxval(AT_HWCAP2); #elif defined(__FreeBSD__) unsigned long hwcap, hwcap2; elf_aux_info(AT_HWCAP, &hwcap, sizeof(hwcap)); elf_aux_info(AT_HWCAP2, &hwcap2, sizeof(hwcap2)); #endif bool have_altivec = (hwcap & PPC_FEATURE_HAS_ALTIVEC) != 0; bool have_vsx = (hwcap & PPC_FEATURE_HAS_VSX) != 0; bool arch_2_07 = (hwcap2 & PPC_FEATURE2_ARCH_2_07) != 0; user_assert(have_altivec) << "The POWERPC backend assumes at least AltiVec support. 
This machine does not appear to have AltiVec.\n"; if (have_vsx) initial_features.push_back(Target::VSX); if (arch_2_07) initial_features.push_back(Target::POWER_ARCH_2_07); #else Target::Arch arch = Target::X86; VendorSignatures vendor_signature = get_vendor_signature(); int info[4]; cpuid(info, 1, 0); unsigned family = 0, model = 0; detect_family_and_model(info[0], family, model); bool have_sse41 = (info[2] & (1 << 19)) != 0; // ECX[19] bool have_sse2 = (info[3] & (1 << 26)) != 0; // EDX[26] bool have_sse3 = (info[2] & (1 << 0)) != 0; // ECX[0] bool have_avx = (info[2] & (1 << 28)) != 0; // ECX[28] bool have_f16c = (info[2] & (1 << 29)) != 0; // ECX[29] bool have_rdrand = (info[2] & (1 << 30)) != 0; // ECX[30] bool have_fma = (info[2] & (1 << 12)) != 0; // ECX[12] user_assert(have_sse2) << "The x86 backend assumes at least sse2 support. This machine does not appear to have sse2.\n" << "cpuid returned: " << std::hex << info[0] << ", " << info[1] << ", " << info[2] << ", " << info[3] << std::dec << "\n"; if (vendor_signature == VendorSignatures::AuthenticAMD) { processor = get_amd_processor(family, model, have_sse3); if (processor == Target::Processor::ZnVer4) { Target t{os, arch, bits, processor, initial_features, vector_bits}; t.set_features({Target::SSE41, Target::AVX, Target::F16C, Target::FMA, Target::AVX2, Target::AVX512, Target::AVX512_Skylake, Target::AVX512_Cannonlake, Target::AVX512_Zen4}); return t; } } // Processors not specifically detected by model number above use the cpuid // feature bits to determine what flags are supported. For future models, // detect them explicitly above rather than extending the code below. if (have_sse41) { initial_features.push_back(Target::SSE41); } if (have_avx) { initial_features.push_back(Target::AVX); } if (have_f16c) { initial_features.push_back(Target::F16C); } if (have_fma) { initial_features.push_back(Target::FMA); } if (use_64_bits && have_avx && have_f16c && have_rdrand) { // So far, so good. AVX2/512? // Call cpuid with eax=7, ecx=0 int info2[4]; cpuid(info2, 7, 0); const uint32_t avx2 = 1U << 5; const uint32_t avx512f = 1U << 16; const uint32_t avx512dq = 1U << 17; const uint32_t avx512pf = 1U << 26; const uint32_t avx512er = 1U << 27; const uint32_t avx512cd = 1U << 28; const uint32_t avx512bw = 1U << 30; const uint32_t avx512vl = 1U << 31; const uint32_t avx512ifma = 1U << 21; const uint32_t avx512 = avx512f | avx512cd; const uint32_t avx512_knl = avx512 | avx512pf | avx512er; const uint32_t avx512_skylake = avx512 | avx512vl | avx512bw | avx512dq; const uint32_t avx512_cannonlake = avx512_skylake | avx512ifma; // Assume ifma => vbmi if ((info2[1] & avx2) == avx2) { initial_features.push_back(Target::AVX2); } if ((info2[1] & avx512) == avx512) { initial_features.push_back(Target::AVX512); // TODO: port to family/model -based detection. if ((info2[1] & avx512_knl) == avx512_knl) { initial_features.push_back(Target::AVX512_KNL); } // TODO: port to family/model -based detection. if ((info2[1] & avx512_skylake) == avx512_skylake) { initial_features.push_back(Target::AVX512_Skylake); } // TODO: port to family/model -based detection. if ((info2[1] & avx512_cannonlake) == avx512_cannonlake) { initial_features.push_back(Target::AVX512_Cannonlake); const uint32_t avxvnni = 1U << 4; // avxvnni (note, not avx512vnni) result in eax const uint32_t avx512bf16 = 1U << 5; // bf16 result in eax, with cpuid(eax=7, ecx=1) int info3[4]; cpuid(info3, 7, 1); // TODO: port to family/model -based detection. 
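// Each (reg & mask) == mask test above requires every bit in the mask to be
// present: for instance, a Knights Landing part sets avx512f/cd/pf/er and so
// matches avx512_knl, but without avx512vl/bw/dq it never matches
// avx512_skylake. The same all-bits requirement applies to the
// avxvnni / avx512bf16 pair checked next.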
if ((info3[0] & avxvnni) == avxvnni && (info3[0] & avx512bf16) == avx512bf16) { initial_features.push_back(Target::AVX512_SapphireRapids); } } } } #endif #endif #endif return {os, arch, bits, processor, initial_features, vector_bits}; } bool is_using_hexagon(const Target &t) { return (t.has_feature(Target::HVX) || t.has_feature(Target::HVX_v62) || t.has_feature(Target::HVX_v65) || t.has_feature(Target::HVX_v66) || t.has_feature(Target::HexagonDma) || t.arch == Target::Hexagon); } int get_hvx_lower_bound(const Target &t) { if (!is_using_hexagon(t)) { return -1; } if (t.has_feature(Target::HVX_v62)) { return 62; } if (t.has_feature(Target::HVX_v65)) { return 65; } if (t.has_feature(Target::HVX_v66)) { return 66; } return 60; } } // namespace Target get_host_target() { // Calculating the host target isn't slow but it isn't free, // and it's pointless to recalculate it every time we (e.g.) parse // an arbitrary Target string. It won't ever change, so cache on first // use. static Target host_target = calculate_host_target(); return host_target; } namespace { Target::Feature calculate_host_cuda_capability(Target t) { const auto *interface = get_device_interface_for_device_api(DeviceAPI::CUDA, t); internal_assert(interface->compute_capability); int major, minor; int err = interface->compute_capability(nullptr, &major, &minor); internal_assert(err == 0) << "Failed to query cuda compute capability\n"; int ver = major * 10 + minor; if (ver < 30) { return Target::FeatureEnd; } else if (ver < 32) { return Target::CUDACapability30; } else if (ver < 35) { return Target::CUDACapability32; } else if (ver < 50) { return Target::CUDACapability35; } else if (ver < 61) { return Target::CUDACapability50; } else if (ver < 70) { return Target::CUDACapability61; } else if (ver < 75) { return Target::CUDACapability70; } else if (ver < 80) { return Target::CUDACapability75; } else if (ver < 86) { return Target::CUDACapability80; } else { return Target::CUDACapability86; } } Target::Feature get_host_cuda_capability(Target t) { static Target::Feature cap = calculate_host_cuda_capability(t); return cap; } Target::Feature calculate_host_vulkan_capability(Target t) { const auto *interface = get_device_interface_for_device_api(DeviceAPI::Vulkan, t); internal_assert(interface->compute_capability); int major, minor; int err = interface->compute_capability(nullptr, &major, &minor); internal_assert(err == 0) << "Failed to query vulkan compute capability\n"; int ver = major * 10 + minor; if (ver < 10) { return Target::FeatureEnd; } else if (ver < 12) { return Target::VulkanV10; } else if (ver < 13) { return Target::VulkanV12; } else { return Target::VulkanV13; } } Target::Feature get_host_vulkan_capability(Target t) { static Target::Feature cap = calculate_host_vulkan_capability(t); return cap; } const std::map os_name_map = { {"os_unknown", Target::OSUnknown}, {"linux", Target::Linux}, {"windows", Target::Windows}, {"osx", Target::OSX}, {"android", Target::Android}, {"ios", Target::IOS}, {"qurt", Target::QuRT}, {"noos", Target::NoOS}, {"fuchsia", Target::Fuchsia}, {"wasmrt", Target::WebAssemblyRuntime}}; bool lookup_os(const std::string &tok, Target::OS &result) { auto os_iter = os_name_map.find(tok); if (os_iter != os_name_map.end()) { result = os_iter->second; return true; } return false; } const std::map arch_name_map = { {"arch_unknown", Target::ArchUnknown}, {"x86", Target::X86}, {"arm", Target::ARM}, {"powerpc", Target::POWERPC}, {"hexagon", Target::Hexagon}, {"wasm", Target::WebAssembly}, {"riscv", Target::RISCV}, }; 
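// These tables drive Target-string parsing in merge_string() below: a string
// such as "x86-64-linux-avx2" is split on '-' and each token is resolved
// through lookup_os(), lookup_arch(), lookup_processor(), or lookup_feature().
// A minimal sketch of the lookup helpers in isolation:
//
//     Target::Arch a;
//     Target::OS o;
//     bool ok = lookup_arch("x86", a) && lookup_os("linux", o);
//     // ok == true, a == Target::X86, o == Target::Linux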
bool lookup_arch(const std::string &tok, Target::Arch &result) { auto arch_iter = arch_name_map.find(tok); if (arch_iter != arch_name_map.end()) { result = arch_iter->second; return true; } return false; } /// Important design consideration: currently, the string key is /// effectively identical to the LLVM CPU string, and it would be really really /// good to keep it that way, so the proper tune_* can be autogenerated easily /// from the LLVM CPU string (currently, by replacing "-" with "_", /// and prepending "tune_" prefix) /// /// Please keep sorted. const std::map processor_name_map = { {"tune_amdfam10", Target::Processor::AMDFam10}, {"tune_bdver1", Target::Processor::BdVer1}, {"tune_bdver2", Target::Processor::BdVer2}, {"tune_bdver3", Target::Processor::BdVer3}, {"tune_bdver4", Target::Processor::BdVer4}, {"tune_btver1", Target::Processor::BtVer1}, {"tune_btver2", Target::Processor::BtVer2}, {"tune_generic", Target::Processor::ProcessorGeneric}, {"tune_k8", Target::Processor::K8}, {"tune_k8_sse3", Target::Processor::K8_SSE3}, {"tune_znver1", Target::Processor::ZnVer1}, {"tune_znver2", Target::Processor::ZnVer2}, {"tune_znver3", Target::Processor::ZnVer3}, {"tune_znver4", Target::Processor::ZnVer4}, }; bool lookup_processor(const std::string &tok, Target::Processor &result) { auto processor_iter = processor_name_map.find(tok); if (processor_iter != processor_name_map.end()) { result = processor_iter->second; return true; } return false; } const std::map feature_name_map = { {"jit", Target::JIT}, {"debug", Target::Debug}, {"no_asserts", Target::NoAsserts}, {"no_bounds_query", Target::NoBoundsQuery}, {"sse41", Target::SSE41}, {"avx", Target::AVX}, {"avx2", Target::AVX2}, {"fma", Target::FMA}, {"fma4", Target::FMA4}, {"f16c", Target::F16C}, {"armv7s", Target::ARMv7s}, {"no_neon", Target::NoNEON}, {"vsx", Target::VSX}, {"power_arch_2_07", Target::POWER_ARCH_2_07}, {"cuda", Target::CUDA}, {"cuda_capability_30", Target::CUDACapability30}, {"cuda_capability_32", Target::CUDACapability32}, {"cuda_capability_35", Target::CUDACapability35}, {"cuda_capability_50", Target::CUDACapability50}, {"cuda_capability_61", Target::CUDACapability61}, {"cuda_capability_70", Target::CUDACapability70}, {"cuda_capability_75", Target::CUDACapability75}, {"cuda_capability_80", Target::CUDACapability80}, {"cuda_capability_86", Target::CUDACapability86}, {"opencl", Target::OpenCL}, {"cl_doubles", Target::CLDoubles}, {"cl_half", Target::CLHalf}, {"cl_atomics64", Target::CLAtomics64}, {"openglcompute", Target::OpenGLCompute}, {"egl", Target::EGL}, {"user_context", Target::UserContext}, {"profile", Target::Profile}, {"no_runtime", Target::NoRuntime}, {"metal", Target::Metal}, {"c_plus_plus_name_mangling", Target::CPlusPlusMangling}, {"large_buffers", Target::LargeBuffers}, {"hvx", Target::HVX_128}, {"hvx_128", Target::HVX_128}, {"hvx_v62", Target::HVX_v62}, {"hvx_v65", Target::HVX_v65}, {"hvx_v66", Target::HVX_v66}, {"fuzz_float_stores", Target::FuzzFloatStores}, {"soft_float_abi", Target::SoftFloatABI}, {"msan", Target::MSAN}, {"avx512", Target::AVX512}, {"avx512_knl", Target::AVX512_KNL}, {"avx512_skylake", Target::AVX512_Skylake}, {"avx512_cannonlake", Target::AVX512_Cannonlake}, {"avx512_sapphirerapids", Target::AVX512_SapphireRapids}, {"avx512_zen4", Target::AVX512_Zen4}, {"trace_loads", Target::TraceLoads}, {"trace_stores", Target::TraceStores}, {"trace_realizations", Target::TraceRealizations}, {"trace_pipeline", Target::TracePipeline}, {"d3d12compute", Target::D3D12Compute}, {"strict_float", 
Target::StrictFloat}, {"tsan", Target::TSAN}, {"asan", Target::ASAN}, {"check_unsafe_promises", Target::CheckUnsafePromises}, {"hexagon_dma", Target::HexagonDma}, {"embed_bitcode", Target::EmbedBitcode}, {"enable_llvm_loop_opt", Target::EnableLLVMLoopOpt}, {"wasm_simd128", Target::WasmSimd128}, {"wasm_mvponly", Target::WasmMvpOnly}, {"wasm_threads", Target::WasmThreads}, {"wasm_bulk_memory", Target::WasmBulkMemory}, {"webgpu", Target::WebGPU}, {"sve", Target::SVE}, {"sve2", Target::SVE2}, {"arm_dot_prod", Target::ARMDotProd}, {"arm_fp16", Target::ARMFp16}, {"llvm_large_code_model", Target::LLVMLargeCodeModel}, {"rvv", Target::RVV}, {"armv81a", Target::ARMv81a}, {"sanitizer_coverage", Target::SanitizerCoverage}, {"profile_by_timer", Target::ProfileByTimer}, {"spirv", Target::SPIRV}, {"vulkan", Target::Vulkan}, {"vk_int8", Target::VulkanInt8}, {"vk_int16", Target::VulkanInt16}, {"vk_int64", Target::VulkanInt64}, {"vk_float16", Target::VulkanFloat16}, {"vk_float64", Target::VulkanFloat64}, {"vk_v10", Target::VulkanV10}, {"vk_v12", Target::VulkanV12}, {"vk_v13", Target::VulkanV13}, {"semihosting", Target::Semihosting}, // NOTE: When adding features to this map, be sure to update PyEnums.cpp as well. }; bool lookup_feature(const std::string &tok, Target::Feature &result) { auto feature_iter = feature_name_map.find(tok); if (feature_iter != feature_name_map.end()) { result = feature_iter->second; return true; } return false; } int parse_vector_bits(const std::string &tok) { if (tok.find("vector_bits_") == 0) { std::string num = tok.substr(sizeof("vector_bits_") - 1, std::string::npos); size_t end_index; int parsed = std::stoi(num, &end_index); if (end_index == num.size()) { return parsed; } } return -1; } void set_sanitizer_bits(Target &t) { // Note, we must include Util.h for these to be defined properly (or not) #ifdef HALIDE_INTERNAL_USING_ASAN t.set_feature(Target::ASAN); #endif #ifdef HALIDE_INTERNAL_USING_MSAN t.set_feature(Target::MSAN); #endif #ifdef HALIDE_INTERNAL_USING_TSAN t.set_feature(Target::TSAN); #endif #ifdef HALIDE_INTERNAL_USING_COVSAN t.set_feature(Target::SanitizerCoverage); #endif } } // End anonymous namespace Target get_target_from_environment() { string target = Internal::get_env_variable("HL_TARGET"); if (target.empty()) { return get_host_target(); } else { return Target(target); } } Target get_jit_target_from_environment() { Target host = get_host_target(); host.set_feature(Target::JIT); string target = Internal::get_env_variable("HL_JIT_TARGET"); if (target.empty()) { set_sanitizer_bits(host); return host; } else { Target t(target); t.set_feature(Target::JIT); user_assert((t.os == host.os && t.arch == host.arch && t.bits == host.bits) || Internal::WasmModule::can_jit_target(t)) << "HL_JIT_TARGET must match the host OS, architecture, and bit width.\n" << "HL_JIT_TARGET was " << target << ". 
" << "Host is " << host.to_string() << ".\n"; user_assert(!t.has_feature(Target::NoBoundsQuery)) << "The Halide JIT requires the use of bounds query, but HL_JIT_TARGET was specified with no_bounds_query: " << target; set_sanitizer_bits(t); return t; } } namespace { bool merge_string(Target &t, const std::string &target) { string rest = target; vector tokens; size_t first_dash; while ((first_dash = rest.find('-')) != string::npos) { // Internal::debug(0) << first_dash << ", " << rest << "\n"; tokens.push_back(rest.substr(0, first_dash)); rest = rest.substr(first_dash + 1); } tokens.push_back(rest); bool os_specified = false, arch_specified = false, bits_specified = false, processor_specified = false, features_specified = false; bool is_host = false; for (size_t i = 0; i < tokens.size(); i++) { const string &tok = tokens[i]; Target::Feature feature; int vector_bits; if (tok == "host") { if (i > 0) { // "host" is now only allowed as the first token. return false; } is_host = true; t = get_host_target(); } else if (tok == "32" || tok == "64" || tok == "0") { if (bits_specified) { return false; } bits_specified = true; t.bits = std::stoi(tok); } else if (lookup_arch(tok, t.arch)) { if (arch_specified) { return false; } arch_specified = true; } else if (lookup_os(tok, t.os)) { if (os_specified) { return false; } os_specified = true; } else if (lookup_processor(tok, t.processor_tune)) { if (processor_specified) { return false; } processor_specified = true; } else if (lookup_feature(tok, feature)) { t.set_feature(feature); features_specified = true; } else if (tok == "trace_all") { t.set_features({Target::TraceLoads, Target::TraceStores, Target::TraceRealizations}); features_specified = true; } else if ((vector_bits = parse_vector_bits(tok)) >= 0) { t.vector_bits = vector_bits; } else { return false; } } if (is_host && t.has_feature(Target::CUDA) && !t.has_feature(Target::CUDACapability30) && !t.has_feature(Target::CUDACapability32) && !t.has_feature(Target::CUDACapability35) && !t.has_feature(Target::CUDACapability50) && !t.has_feature(Target::CUDACapability61) && !t.has_feature(Target::CUDACapability70) && !t.has_feature(Target::CUDACapability75) && !t.has_feature(Target::CUDACapability80) && !t.has_feature(Target::CUDACapability86)) { // Detect host cuda capability t.set_feature(get_host_cuda_capability(t)); } if (is_host && t.has_feature(Target::Vulkan) && !t.has_feature(Target::VulkanV10) && !t.has_feature(Target::VulkanV12) && !t.has_feature(Target::VulkanV13)) { // Detect host vulkan capability t.set_feature(get_host_vulkan_capability(t)); } if (arch_specified && !bits_specified) { return false; } if (bits_specified && t.bits == 0) { // bits == 0 is allowed iff arch and os are "unknown" and no features are set, // to allow for roundtripping the string for default Target() ctor. 
if (!(arch_specified && t.arch == Target::ArchUnknown) || !(os_specified && t.os == Target::OSUnknown) || features_specified) { return false; } } return true; } void bad_target_string(const std::string &target) { const char *separator = ""; std::string architectures; for (const auto &arch_entry : arch_name_map) { architectures += separator + arch_entry.first; separator = ", "; } separator = ""; std::string oses; for (const auto &os_entry : os_name_map) { oses += separator + os_entry.first; separator = ", "; } separator = ""; std::string processors; for (const auto &processor_entry : processor_name_map) { processors += separator + processor_entry.first; separator = ", "; } separator = ""; // Format the features to go one feature over 70 characters per line, // assume the first line starts with "Features are ". int line_char_start = -(int)sizeof("Features are"); std::string features; for (const auto &feature_entry : feature_name_map) { features += separator + feature_entry.first; if (features.length() - line_char_start > 70) { separator = "\n"; line_char_start = features.length(); } else { separator = ", "; } } user_error << "Did not understand Halide target " << target << "\n" << "Expected format is arch-bits-os-processor-feature1-feature2-...\n" << "Where arch is: " << architectures << ".\n" << "bits is either 32 or 64.\n" << "os is: " << oses << ".\n" << "processor is: " << processors << ".\n" << "\n" << "If arch, bits, or os are omitted, they default to the host.\n" << "\n" << "If processor is omitted, it defaults to tune_generic.\n" << "\n" << "Features are: " << features << ".\n" << "\n" << "The target can also begin with \"host\", which sets the " << "host's architecture, os, and feature set, with the " << "exception of the GPU runtimes, which default to off.\n" << "\n" << "On this platform, the host target is: " << get_host_target().to_string() << "\n"; } void do_check_bad(const Target &t, const std::initializer_list &v) { for (Target::Feature f : v) { user_assert(!t.has_feature(f)) << "Target feature " << Target::feature_to_name(f) << " is incompatible with the Target's architecture. (" << t << ")\n"; } } } // namespace void Target::validate_features() const { // Note that the features don't have to be exhaustive, but enough to avoid obvious mistakes is good. if (arch == X86) { do_check_bad(*this, { ARMDotProd, ARMFp16, ARMv7s, ARMv81a, NoNEON, POWER_ARCH_2_07, RVV, SVE, SVE2, VSX, WasmBulkMemory, WasmMvpOnly, WasmSimd128, WasmThreads, }); } else if (arch == ARM) { do_check_bad(*this, { AVX, AVX2, AVX512, AVX512_Cannonlake, AVX512_KNL, AVX512_SapphireRapids, AVX512_Skylake, AVX512_Zen4, F16C, FMA, FMA4, POWER_ARCH_2_07, RVV, SSE41, VSX, WasmBulkMemory, WasmMvpOnly, WasmSimd128, WasmThreads, }); } else if (arch == WebAssembly) { do_check_bad(*this, { ARMDotProd, ARMFp16, ARMv7s, ARMv81a, AVX, AVX2, AVX512, AVX512_Cannonlake, AVX512_KNL, AVX512_SapphireRapids, AVX512_Skylake, AVX512_Zen4, F16C, FMA, FMA4, HVX_128, HVX_128, HVX_v62, HVX_v65, HVX_v66, NoNEON, POWER_ARCH_2_07, RVV, SSE41, SVE, SVE2, VSX, }); } } Target::Target(const std::string &target) { Target host = get_host_target(); if (target.empty()) { // If nothing is specified, use the full host target. 
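// (Equivalently, Target("") is exactly get_host_target(), as documented in
// Target.h.)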
*this = host; } else { if (!merge_string(*this, target) || has_unknowns()) { bad_target_string(target); } } validate_features(); } Target::Target(const char *s) : Target(std::string(s)) { } bool Target::validate_target_string(const std::string &s) { Target t; return merge_string(t, s) && !t.has_unknowns(); } std::string Target::feature_to_name(Target::Feature feature) { for (const auto &feature_entry : feature_name_map) { if (feature == feature_entry.second) { return feature_entry.first; } } internal_error; return ""; } Target::Feature Target::feature_from_name(const std::string &name) { Target::Feature feature; if (lookup_feature(name, feature)) { return feature; } return Target::FeatureEnd; } std::string Target::to_string() const { string result; for (const auto &arch_entry : arch_name_map) { if (arch_entry.second == arch) { result += arch_entry.first; break; } } result += "-" + std::to_string(bits); for (const auto &os_entry : os_name_map) { if (os_entry.second == os) { result += "-" + os_entry.first; break; } } if (processor_tune != ProcessorGeneric) { for (const auto &processor_entry : processor_name_map) { if (processor_entry.second == processor_tune) { result += "-" + processor_entry.first; break; } } } for (const auto &feature_entry : feature_name_map) { if (has_feature(feature_entry.second)) { result += "-" + feature_entry.first; } } // Use has_feature() multiple times (rather than features_any_of()) // to avoid constructing a temporary vector for this rather-common call. if (has_feature(Target::TraceLoads) && has_feature(Target::TraceStores) && has_feature(Target::TraceRealizations)) { result = Internal::replace_all(result, "trace_loads-trace_realizations-trace_stores", "trace_all"); } if (vector_bits != 0) { result += "-vector_bits_" + std::to_string(vector_bits); } return result; } /** Was libHalide compiled with support for this target? 
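 * This reflects the WITH_* backends Halide was built with: for example, a
 * build configured without WITH_NVPTX reports any Target carrying the CUDA
 * feature as unsupported.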
*/ bool Target::supported() const { bool bad = false; #if !defined(WITH_ARM) bad |= arch == Target::ARM && bits == 32; #endif #if !defined(WITH_AARCH64) bad |= arch == Target::ARM && bits == 64; #endif #if !defined(WITH_X86) bad |= arch == Target::X86; #endif #if !defined(WITH_POWERPC) bad |= arch == Target::POWERPC; #endif #if !defined(WITH_HEXAGON) bad |= arch == Target::Hexagon; #endif #if !defined(WITH_WEBASSEMBLY) bad |= arch == Target::WebAssembly; #endif #if !defined(WITH_RISCV) bad |= arch == Target::RISCV; #endif #if !defined(WITH_NVPTX) bad |= has_feature(Target::CUDA); #endif #if !defined(WITH_OPENCL) bad |= has_feature(Target::OpenCL); #endif #if !defined(WITH_METAL) bad |= has_feature(Target::Metal); #endif #if !defined(WITH_OPENGLCOMPUTE) bad |= has_feature(Target::OpenGLCompute); #endif #if !defined(WITH_D3D12) bad |= has_feature(Target::D3D12Compute); #endif #if !defined(WITH_VULKAN) bad |= has_feature(Target::Vulkan); #endif #if !defined(WITH_WEBGPU) bad |= has_feature(Target::WebGPU); #endif return !bad; } bool Target::has_unknowns() const { return os == OSUnknown || arch == ArchUnknown || bits == 0; } void Target::set_feature(Feature f, bool value) { if (f == FeatureEnd) { return; } user_assert(f < FeatureEnd) << "Invalid Target feature.\n"; features.set(f, value); } void Target::set_features(const std::vector &features_to_set, bool value) { for (Feature f : features_to_set) { set_feature(f, value); } } bool Target::has_feature(Feature f) const { if (f == FeatureEnd) { return true; } user_assert(f < FeatureEnd) << "Invalid Target feature.\n"; return features[f]; } bool Target::features_any_of(const std::vector &test_features) const { for (Feature f : test_features) { if (has_feature(f)) { return true; } } return false; } bool Target::features_all_of(const std::vector &test_features) const { for (Feature f : test_features) { if (!has_feature(f)) { return false; } } return true; } Target Target::with_feature(Feature f) const { Target copy = *this; copy.set_feature(f); return copy; } Target Target::without_feature(Feature f) const { Target copy = *this; copy.set_feature(f, false); return copy; } bool Target::has_gpu_feature() const { return (has_feature(CUDA) || has_feature(OpenCL) || has_feature(Metal) || has_feature(D3D12Compute) || has_feature(OpenGLCompute) || has_feature(Vulkan) || has_feature(WebGPU)); } int Target::get_cuda_capability_lower_bound() const { if (!has_feature(Target::CUDA)) { return -1; } if (has_feature(Target::CUDACapability30)) { return 30; } if (has_feature(Target::CUDACapability32)) { return 32; } if (has_feature(Target::CUDACapability35)) { return 35; } if (has_feature(Target::CUDACapability50)) { return 50; } if (has_feature(Target::CUDACapability61)) { return 61; } if (has_feature(Target::CUDACapability70)) { return 70; } if (has_feature(Target::CUDACapability75)) { return 75; } if (has_feature(Target::CUDACapability80)) { return 80; } if (has_feature(Target::CUDACapability86)) { return 86; } return 20; } int Target::get_vulkan_capability_lower_bound() const { if (!has_feature(Target::Vulkan)) { return -1; } if (has_feature(Target::VulkanV10)) { return 10; } if (has_feature(Target::VulkanV12)) { return 12; } if (has_feature(Target::VulkanV13)) { return 13; } return 10; } bool Target::supports_type(const Type &t) const { if (t.bits() == 64) { if (t.is_float()) { return (!has_feature(Metal) && !has_feature(OpenGLCompute) && !has_feature(D3D12Compute) && (!has_feature(Target::OpenCL) || has_feature(Target::CLDoubles)) && (!has_feature(Vulkan) || 
has_feature(Target::VulkanFloat64)) && !has_feature(WebGPU)); } else { return (!has_feature(Metal) && !has_feature(OpenGLCompute) && !has_feature(D3D12Compute) && (!has_feature(Vulkan) || has_feature(Target::VulkanInt64)) && !has_feature(WebGPU)); } } return true; } bool Target::supports_type(const Type &t, DeviceAPI device) const { if (device == DeviceAPI::Default_GPU) { device = get_default_device_api_for_target(*this); } if (device == DeviceAPI::Hexagon) { // HVX supports doubles and long long in the scalar unit only. if (t.is_float() || t.bits() == 64) { return t.lanes() == 1; } } else if (device == DeviceAPI::Metal) { // Metal spec says no double or long long. if (t.bits() == 64) { return false; } } else if (device == DeviceAPI::OpenCL) { if (t.is_float() && t.bits() == 64) { return has_feature(Target::CLDoubles); } } else if (device == DeviceAPI::D3D12Compute) { // Shader Model 5.x can optionally support double-precision; 64-bit int // types are not supported. return t.bits() < 64; } else if (device == DeviceAPI::OpenGLCompute) { return t.bits() < 64; } else if (device == DeviceAPI::Vulkan) { if (t.is_float() && t.bits() == 64) { return has_feature(Target::VulkanFloat64); } else if (t.is_float() && t.bits() == 16) { return has_feature(Target::VulkanFloat16); } else if (t.is_int_or_uint() && t.bits() == 64) { return has_feature(Target::VulkanInt64); } else if (t.is_int_or_uint() && t.bits() == 16) { return has_feature(Target::VulkanInt16); } else if (t.is_int_or_uint() && t.bits() == 8) { return has_feature(Target::VulkanInt8); } } else if (device == DeviceAPI::WebGPU) { return t.bits() < 64; } return true; } bool Target::supports_device_api(DeviceAPI api) const { switch (api) { case DeviceAPI::None: return true; case DeviceAPI::Host: return true; case DeviceAPI::Default_GPU: return has_gpu_feature(); case DeviceAPI::Hexagon: return has_feature(Target::HVX); case DeviceAPI::HexagonDma: return has_feature(Target::HexagonDma); default: return has_feature(target_feature_for_device_api(api)); } } DeviceAPI Target::get_required_device_api() const { if (has_feature(Target::CUDA)) { return DeviceAPI::CUDA; } if (has_feature(Target::D3D12Compute)) { return DeviceAPI::D3D12Compute; } if (has_feature(Target::HVX)) { return DeviceAPI::Hexagon; } if (has_feature(Target::HexagonDma)) { return DeviceAPI::HexagonDma; } if (has_feature(Target::Metal)) { return DeviceAPI::Metal; } if (has_feature(Target::OpenCL)) { return DeviceAPI::OpenCL; } if (has_feature(Target::OpenGLCompute)) { return DeviceAPI::OpenGLCompute; } if (has_feature(Target::Vulkan)) { return DeviceAPI::Vulkan; } if (has_feature(Target::WebGPU)) { return DeviceAPI::WebGPU; } return DeviceAPI::None; } Target::Feature target_feature_for_device_api(DeviceAPI api) { switch (api) { case DeviceAPI::CUDA: return Target::CUDA; case DeviceAPI::OpenCL: return Target::OpenCL; case DeviceAPI::OpenGLCompute: return Target::OpenGLCompute; case DeviceAPI::Metal: return Target::Metal; case DeviceAPI::Hexagon: return Target::HVX; case DeviceAPI::D3D12Compute: return Target::D3D12Compute; case DeviceAPI::Vulkan: return Target::Vulkan; case DeviceAPI::WebGPU: return Target::WebGPU; default: return Target::FeatureEnd; } } int Target::natural_vector_size(const Halide::Type &t) const { user_assert(!has_unknowns()) << "natural_vector_size cannot be used on a Target with Unknown values.\n"; const bool is_integer = t.is_int() || t.is_uint(); const int data_size = t.bytes(); if (arch == Target::ARM) { if (vector_bits != 0 && (has_feature(Halide::Target::SVE2) 
|| (t.is_float() && has_feature(Halide::Target::SVE)))) { return vector_bits / (data_size * 8); } else { return 16 / data_size; } } else if (arch == Target::Hexagon) { if (is_integer) { if (has_feature(Halide::Target::HVX)) { return 128 / data_size; } else { user_error << "Target uses hexagon arch without target feature hvx set.\n"; return 0; } } else { // HVX does not have vector float instructions. return 1; } } else if (arch == Target::X86) { if (is_integer && (has_feature(Halide::Target::AVX512_Skylake) || has_feature(Halide::Target::AVX512_Cannonlake))) { // AVX512BW exists on Skylake and Cannonlake return 64 / data_size; } else if (t.is_float() && (has_feature(Halide::Target::AVX512) || has_feature(Halide::Target::AVX512_KNL) || has_feature(Halide::Target::AVX512_Skylake) || has_feature(Halide::Target::AVX512_Cannonlake))) { // AVX512F is on all AVX512 architectures return 64 / data_size; } else if (has_feature(Halide::Target::AVX2)) { // AVX2 uses 256-bit vectors for everything. return 32 / data_size; } else if (!is_integer && has_feature(Halide::Target::AVX)) { // AVX 1 has 256-bit vectors for float, but not for // integer instructions. return 32 / data_size; } else { // SSE was all 128-bit. We ignore MMX. return 16 / data_size; } } else if (arch == Target::WebAssembly) { if (has_feature(Halide::Target::WasmSimd128)) { // 128-bit vectors for other types. return 16 / data_size; } else { // No vectors, sorry. return 1; } } else if (arch == Target::RISCV) { if (vector_bits != 0 && has_feature(Halide::Target::RVV)) { return vector_bits / (data_size * 8); } else { return 1; } } else { // Assume 128-bit vectors on other targets. return 16 / data_size; } } bool Target::get_runtime_compatible_target(const Target &other, Target &result) { // Create mask to select features that: // (a) must be included if either target has the feature (union) // (b) must be included if both targets have the feature (intersection) // (c) must match across both targets; it is an error if one target has the feature and the other doesn't // clang-format off const std::array union_features = {{ // These are true union features. CUDA, D3D12Compute, Metal, NoNEON, OpenCL, OpenGLCompute, Vulkan, WebGPU, // These features are actually intersection-y, but because targets only record the _highest_, // we have to put their union in the result and then take a lower bound. CUDACapability30, CUDACapability32, CUDACapability35, CUDACapability50, CUDACapability61, CUDACapability70, CUDACapability75, CUDACapability80, CUDACapability86, HVX_v62, HVX_v65, HVX_v66, VulkanV10, VulkanV12, VulkanV13, }}; // clang-format on // clang-format off const std::array intersection_features = {{ ARMv7s, ARMv81a, AVX, AVX2, AVX512, AVX512_Cannonlake, AVX512_KNL, AVX512_SapphireRapids, AVX512_Skylake, AVX512_Zen4, F16C, FMA, FMA4, SSE41, VSX, }}; // clang-format on // clang-format off const std::array matching_features = {{ ASAN, Debug, HexagonDma, HVX, MSAN, SoftFloatABI, TSAN, WasmThreads, SanitizerCoverage, }}; // clang-format on // bitsets need to be the same width. 
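// For example (mirroring cases exercised in target_test() below), merging
// "x86-64-linux-avx2-sse41" with "x86-64-linux-sse41-fma" keeps only the
// intersected SIMD features and yields "x86-64-linux-sse41", whereas merging
// "x86-64-linux-cuda" with "x86-64-linux" keeps CUDA, since the GPU runtimes
// are union features.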
decltype(result.features) union_mask; decltype(result.features) intersection_mask; decltype(result.features) matching_mask; for (const auto &feature : union_features) { union_mask.set(feature); } for (const auto &feature : intersection_features) { intersection_mask.set(feature); } for (const auto &feature : matching_features) { matching_mask.set(feature); } if (arch != other.arch || bits != other.bits || os != other.os) { Internal::debug(1) << "runtime targets must agree on platform (arch-bits-os)\n" << " this: " << *this << "\n" << " other: " << other << "\n"; return false; } if ((features & matching_mask) != (other.features & matching_mask)) { Internal::debug(1) << "runtime targets must agree on SoftFloatABI, Debug, TSAN, ASAN, MSAN, HVX, HexagonDma, SanitizerCoverage\n" << " this: " << *this << "\n" << " other: " << other << "\n"; return false; } // Union of features is computed through bitwise-or, and masked away by the features we care about // Intersection of features is computed through bitwise-and and masked away, too. // We merge the bits via bitwise or. Target output = Target{os, arch, bits, processor_tune}; output.features = ((features | other.features) & union_mask) | ((features | other.features) & matching_mask) | ((features & other.features) & intersection_mask); // Pick tight lower bound for CUDA capability. Use fall-through to clear redundant features int cuda_a = get_cuda_capability_lower_bound(); int cuda_b = other.get_cuda_capability_lower_bound(); // get_cuda_capability_lower_bound returns -1 when unused. Casting to unsigned makes this // large, so min selects the true lower bound when one target doesn't specify a capability, // and the other doesn't use CUDA at all. int cuda_capability = std::min((unsigned)cuda_a, (unsigned)cuda_b); if (cuda_capability < 30) { output.features.reset(CUDACapability30); } if (cuda_capability < 32) { output.features.reset(CUDACapability32); } if (cuda_capability < 35) { output.features.reset(CUDACapability35); } if (cuda_capability < 50) { output.features.reset(CUDACapability50); } if (cuda_capability < 61) { output.features.reset(CUDACapability61); } if (cuda_capability < 70) { output.features.reset(CUDACapability70); } if (cuda_capability < 75) { output.features.reset(CUDACapability75); } if (cuda_capability < 80) { output.features.reset(CUDACapability80); } if (cuda_capability < 86) { output.features.reset(CUDACapability86); } // Pick tight lower bound for Vulkan capability. Use fall-through to clear redundant features int vulkan_a = get_vulkan_capability_lower_bound(); int vulkan_b = other.get_vulkan_capability_lower_bound(); // Same trick as above for CUDA int vulkan_capability = std::min((unsigned)vulkan_a, (unsigned)vulkan_b); if (vulkan_capability < 10) { output.features.reset(VulkanV10); } if (vulkan_capability < 12) { output.features.reset(VulkanV12); } if (vulkan_capability < 13) { output.features.reset(VulkanV13); } // Pick tight lower bound for HVX version. 
Use fall-through to clear redundant features int hvx_a = get_hvx_lower_bound(*this); int hvx_b = get_hvx_lower_bound(other); // Same trick as above for CUDA int hvx_version = std::min((unsigned)hvx_a, (unsigned)hvx_b); if (hvx_version < 62) { output.features.reset(HVX_v62); } if (hvx_version < 65) { output.features.reset(HVX_v65); } if (hvx_version < 66) { output.features.reset(HVX_v66); } result = output; return true; } namespace Internal { void target_test() { Target t; for (const auto &feature : feature_name_map) { t.set_feature(feature.second); } for (int i = 0; i < (int)(Target::FeatureEnd); i++) { internal_assert(t.has_feature((Target::Feature)i)) << "Feature " << i << " not in feature_names_map.\n"; } // 3 targets: {A,B,C}. Want gcd(A,B)=C std::vector> gcd_tests = { {{"x86-64-linux-sse41-fma", "x86-64-linux-sse41-fma", "x86-64-linux-sse41-fma"}}, {{"x86-64-linux-sse41-fma-no_asserts-no_runtime", "x86-64-linux-sse41-fma", "x86-64-linux-sse41-fma"}}, {{"x86-64-linux-avx2-sse41", "x86-64-linux-sse41-fma", "x86-64-linux-sse41"}}, {{"x86-64-linux-avx2-sse41", "x86-32-linux-sse41-fma", ""}}, {{"x86-64-linux-cuda", "x86-64-linux", "x86-64-linux-cuda"}}, {{"x86-64-linux-cuda-cuda_capability_50", "x86-64-linux-cuda", "x86-64-linux-cuda"}}, {{"x86-64-linux-cuda-cuda_capability_50", "x86-64-linux-cuda-cuda_capability_30", "x86-64-linux-cuda-cuda_capability_30"}}, {{"x86-64-linux-vulkan", "x86-64-linux", "x86-64-linux-vulkan"}}, {{"x86-64-linux-vulkan-vk_v13", "x86-64-linux-vulkan", "x86-64-linux-vulkan"}}, {{"x86-64-linux-vulkan-vk_v13", "x86-64-linux-vulkan-vk_v10", "x86-64-linux-vulkan-vk_v10"}}, {{"hexagon-32-qurt-hvx_v65", "hexagon-32-qurt-hvx_v62", "hexagon-32-qurt-hvx_v62"}}, {{"hexagon-32-qurt-hvx_v62", "hexagon-32-qurt", "hexagon-32-qurt"}}, {{"hexagon-32-qurt-hvx_v62-hvx", "hexagon-32-qurt", ""}}, {{"hexagon-32-qurt-hvx_v62-hvx", "hexagon-32-qurt-hvx", "hexagon-32-qurt-hvx"}}, }; for (const auto &test : gcd_tests) { Target result{}; Target a{test[0]}; Target b{test[1]}; if (a.get_runtime_compatible_target(b, result)) { internal_assert(!test[2].empty() && result == Target{test[2]}) << "Targets " << a.to_string() << " and " << b.to_string() << " were computed to have gcd " << result.to_string() << " but expected '" << test[2] << "'\n"; } else { internal_assert(test[2].empty()) << "Targets " << a.to_string() << " and " << b.to_string() << " were computed to have no gcd " << "but " << test[2] << " was expected."; } } internal_assert(Target().vector_bits == 0) << "Default Target vector_bits not 0.\n"; internal_assert(Target("arm-64-linux-sve2-vector_bits_512").vector_bits == 512) << "Vector bits not parsed correctly.\n"; Target with_vector_bits(Target::Linux, Target::ARM, 64, Target::ProcessorGeneric, {Target::SVE}, 512); internal_assert(with_vector_bits.vector_bits == 512) << "Vector bits not populated in constructor.\n"; internal_assert(Target(with_vector_bits.to_string()).vector_bits == 512) << "Vector bits not round tripped properly.\n"; std::cout << "Target test passed" << std::endl; } } // namespace Internal } // namespace Halide Halide-17.0.1/src/Target.h000066400000000000000000000410621456515664200151750ustar00rootroot00000000000000#ifndef HALIDE_TARGET_H #define HALIDE_TARGET_H /** \file * Defines the structure that describes a Halide target. */ #include #include #include #include "DeviceAPI.h" #include "Type.h" #include "runtime/HalideRuntime.h" namespace Halide { /** A struct representing a target machine and os to generate code for. 
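 * A Target is typically written as a string of the form
 * "arch-bits-os-feature1-feature2-...", e.g. "x86-64-linux-avx2"; see the
 * Target(const std::string &) constructor and to_string() below for the
 * exact format.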
*/ struct Target { /** The operating system used by the target. Determines which * system calls to generate. * Corresponds to os_name_map in Target.cpp. */ enum OS { OSUnknown = 0, Linux, Windows, OSX, Android, IOS, QuRT, NoOS, Fuchsia, WebAssemblyRuntime } os = OSUnknown; /** The architecture used by the target. Determines the * instruction set to use. * Corresponds to arch_name_map in Target.cpp. */ enum Arch { ArchUnknown = 0, X86, ARM, Hexagon, POWERPC, WebAssembly, RISCV } arch = ArchUnknown; /** The bit-width of the target machine. Must be 0 for unknown, or 32 or 64. */ int bits = 0; /** The bit-width of a vector register for targets where this is configurable and * targeting a fixed size is desired. The default of 0 indicates no assumption of * fixed size is allowed. */ int vector_bits = 0; /** The specific processor to be targeted, tuned for. * Corresponds to processor_name_map in Target.cpp. * * New entries should be added to the end. */ enum Processor { /// Do not tune for any specific CPU. In practice, this means that halide will decide the tune CPU based on the enabled features. ProcessorGeneric = 0, K8, /// Tune for AMD K8 Hammer CPU (AMD Family 0Fh, launched 2003). K8_SSE3, /// Tune for later versions of AMD K8 CPU, with SSE3 support. AMDFam10, /// Tune for AMD K10 "Barcelona" CPU (AMD Family 10h, launched 2007). BtVer1, /// Tune for AMD Bobcat CPU (AMD Family 14h, launched 2011). BdVer1, /// Tune for AMD Bulldozer CPU (AMD Family 15h, launched 2011). BdVer2, /// Tune for AMD Piledriver CPU (AMD Family 15h (2nd-gen), launched 2012). BdVer3, /// Tune for AMD Steamroller CPU (AMD Family 15h (3nd-gen), launched 2014). BdVer4, /// Tune for AMD Excavator CPU (AMD Family 15h (4th-gen), launched 2015). BtVer2, /// Tune for AMD Jaguar CPU (AMD Family 16h, launched 2013). ZnVer1, /// Tune for AMD Zen CPU (AMD Family 17h, launched 2017). ZnVer2, /// Tune for AMD Zen 2 CPU (AMD Family 17h, launched 2019). ZnVer3, /// Tune for AMD Zen 3 CPU (AMD Family 19h, launched 2020). ZnVer4, /// Tune for AMD Zen 4 CPU (AMD Family 19h, launched 2022). } processor_tune = ProcessorGeneric; /** Optional features a target can have. * Corresponds to feature_name_map in Target.cpp. * See definitions in HalideRuntime.h for full information. 
*/ enum Feature { JIT = halide_target_feature_jit, Debug = halide_target_feature_debug, NoAsserts = halide_target_feature_no_asserts, NoBoundsQuery = halide_target_feature_no_bounds_query, SSE41 = halide_target_feature_sse41, AVX = halide_target_feature_avx, AVX2 = halide_target_feature_avx2, FMA = halide_target_feature_fma, FMA4 = halide_target_feature_fma4, F16C = halide_target_feature_f16c, ARMv7s = halide_target_feature_armv7s, NoNEON = halide_target_feature_no_neon, VSX = halide_target_feature_vsx, POWER_ARCH_2_07 = halide_target_feature_power_arch_2_07, CUDA = halide_target_feature_cuda, CUDACapability30 = halide_target_feature_cuda_capability30, CUDACapability32 = halide_target_feature_cuda_capability32, CUDACapability35 = halide_target_feature_cuda_capability35, CUDACapability50 = halide_target_feature_cuda_capability50, CUDACapability61 = halide_target_feature_cuda_capability61, CUDACapability70 = halide_target_feature_cuda_capability70, CUDACapability75 = halide_target_feature_cuda_capability75, CUDACapability80 = halide_target_feature_cuda_capability80, CUDACapability86 = halide_target_feature_cuda_capability86, OpenCL = halide_target_feature_opencl, CLDoubles = halide_target_feature_cl_doubles, CLHalf = halide_target_feature_cl_half, CLAtomics64 = halide_target_feature_cl_atomic64, OpenGLCompute = halide_target_feature_openglcompute, // NOTE: This feature is deprecated and will be removed in Halide 17. EGL = halide_target_feature_egl, UserContext = halide_target_feature_user_context, Profile = halide_target_feature_profile, NoRuntime = halide_target_feature_no_runtime, Metal = halide_target_feature_metal, CPlusPlusMangling = halide_target_feature_c_plus_plus_mangling, LargeBuffers = halide_target_feature_large_buffers, HexagonDma = halide_target_feature_hexagon_dma, HVX_128 = halide_target_feature_hvx_128, HVX = HVX_128, HVX_v62 = halide_target_feature_hvx_v62, HVX_v65 = halide_target_feature_hvx_v65, HVX_v66 = halide_target_feature_hvx_v66, FuzzFloatStores = halide_target_feature_fuzz_float_stores, SoftFloatABI = halide_target_feature_soft_float_abi, MSAN = halide_target_feature_msan, AVX512 = halide_target_feature_avx512, AVX512_KNL = halide_target_feature_avx512_knl, AVX512_Skylake = halide_target_feature_avx512_skylake, AVX512_Cannonlake = halide_target_feature_avx512_cannonlake, AVX512_SapphireRapids = halide_target_feature_avx512_sapphirerapids, AVX512_Zen4 = halide_target_feature_avx512_zen4, TraceLoads = halide_target_feature_trace_loads, TraceStores = halide_target_feature_trace_stores, TraceRealizations = halide_target_feature_trace_realizations, TracePipeline = halide_target_feature_trace_pipeline, D3D12Compute = halide_target_feature_d3d12compute, StrictFloat = halide_target_feature_strict_float, TSAN = halide_target_feature_tsan, ASAN = halide_target_feature_asan, CheckUnsafePromises = halide_target_feature_check_unsafe_promises, EmbedBitcode = halide_target_feature_embed_bitcode, EnableLLVMLoopOpt = halide_target_feature_enable_llvm_loop_opt, WasmMvpOnly = halide_target_feature_wasm_mvponly, WasmSimd128 = halide_target_feature_wasm_simd128, WasmThreads = halide_target_feature_wasm_threads, WasmBulkMemory = halide_target_feature_wasm_bulk_memory, WebGPU = halide_target_feature_webgpu, SVE = halide_target_feature_sve, SVE2 = halide_target_feature_sve2, ARMDotProd = halide_target_feature_arm_dot_prod, ARMFp16 = halide_target_feature_arm_fp16, LLVMLargeCodeModel = halide_llvm_large_code_model, RVV = halide_target_feature_rvv, ARMv81a = halide_target_feature_armv81a, 
SanitizerCoverage = halide_target_feature_sanitizer_coverage, ProfileByTimer = halide_target_feature_profile_by_timer, SPIRV = halide_target_feature_spirv, Vulkan = halide_target_feature_vulkan, VulkanInt8 = halide_target_feature_vulkan_int8, VulkanInt16 = halide_target_feature_vulkan_int16, VulkanInt64 = halide_target_feature_vulkan_int64, VulkanFloat16 = halide_target_feature_vulkan_float16, VulkanFloat64 = halide_target_feature_vulkan_float64, VulkanV10 = halide_target_feature_vulkan_version10, VulkanV12 = halide_target_feature_vulkan_version12, VulkanV13 = halide_target_feature_vulkan_version13, Semihosting = halide_target_feature_semihosting, FeatureEnd = halide_target_feature_end }; Target() = default; Target(OS o, Arch a, int b, Processor pt, const std::vector &initial_features = std::vector(), int vb = 0) : os(o), arch(a), bits(b), vector_bits(vb), processor_tune(pt) { for (const auto &f : initial_features) { set_feature(f); } validate_features(); } Target(OS o, Arch a, int b, const std::vector &initial_features = std::vector()) : Target(o, a, b, ProcessorGeneric, initial_features) { } /** Given a string of the form used in HL_TARGET * (e.g. "x86-64-avx"), construct the Target it specifies. Note * that this always starts with the result of get_host_target(), * replacing only the parts found in the target string, so if you * omit (say) an OS specification, the host OS will be used * instead. An empty string is exactly equivalent to * get_host_target(). * * Invalid target strings will fail with a user_error. */ // @{ explicit Target(const std::string &s); explicit Target(const char *s); // @} /** Check if a target string is valid. */ static bool validate_target_string(const std::string &s); /** Return true if any of the arch/bits/os fields are "unknown"/0; return false otherwise. */ bool has_unknowns() const; void set_feature(Feature f, bool value = true); void set_features(const std::vector &features_to_set, bool value = true); bool has_feature(Feature f) const; inline bool has_feature(halide_target_feature_t f) const { return has_feature((Feature)f); } bool features_any_of(const std::vector &test_features) const; bool features_all_of(const std::vector &test_features) const; /** Return a copy of the target with the given feature set. * This is convenient when enabling certain features (e.g. NoBoundsQuery) * in an initialization list, where the target to be mutated may be * a const reference. */ Target with_feature(Feature f) const; /** Return a copy of the target with the given feature cleared. * This is convenient when disabling certain features (e.g. NoBoundsQuery) * in an initialization list, where the target to be mutated may be * a const reference. */ Target without_feature(Feature f) const; /** Is a fully feature GPU compute runtime enabled? I.e. is * Func::gpu_tile and similar going to work? Currently includes * CUDA, OpenCL, Metal and D3D12Compute. We do not include OpenGL, * because it is not capable of gpgpu, and is not scheduled via * Func::gpu_tile. * TODO: Should OpenGLCompute be included here? */ bool has_gpu_feature() const; /** Does this target allow using a certain type. Generally all * types except 64-bit float and int/uint should be supported by * all backends. * * It is likely better to call the version below which takes a DeviceAPI. */ bool supports_type(const Type &t) const; /** Does this target allow using a certain type on a certain device. * This is the prefered version of this routine. 
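 * For example, Float(64) on DeviceAPI::Vulkan is supported only when the
 * vk_float64 feature is present, and 64-bit types are never supported on
 * DeviceAPI::WebGPU.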
*/ bool supports_type(const Type &t, DeviceAPI device) const; /** Returns whether a particular device API can be used with this * Target. */ bool supports_device_api(DeviceAPI api) const; /** If this Target (including all Features) requires a specific DeviceAPI, * return it. If it doesn't, return DeviceAPI::None. If the Target has * features with multiple (different) DeviceAPI requirements, the result * will be an arbitrary DeviceAPI. */ DeviceAPI get_required_device_api() const; bool operator==(const Target &other) const { return os == other.os && arch == other.arch && bits == other.bits && processor_tune == other.processor_tune && features == other.features; } bool operator!=(const Target &other) const { return !(*this == other); } /** * Create a "greatest common denominator" runtime target that is compatible with * both this target and \p other. Used by generators to conveniently select a suitable * runtime when linking together multiple functions. * * @param other The other target from which we compute the gcd target. * @param[out] result The gcd target if we return true, otherwise unmodified. Can be the same as *this. * @return Whether it was possible to find a compatible target (true) or not. */ bool get_runtime_compatible_target(const Target &other, Target &result); /** Convert the Target into a string form that can be reconstituted * by merge_string(), which will always be of the form * * arch-bits-os-processor-feature1-feature2...featureN. * * Note that is guaranteed that Target(t1.to_string()) == t1, * but not that Target(s).to_string() == s (since there can be * multiple strings that parse to the same Target)... * *unless* t1 contains 'unknown' fields (in which case you'll get a string * that can't be parsed, which is intentional). */ std::string to_string() const; /** Given a data type, return an estimate of the "natural" vector size * for that data type when compiling for this Target. */ int natural_vector_size(const Halide::Type &t) const; /** Given a data type, return an estimate of the "natural" vector size * for that data type when compiling for this Target. */ template int natural_vector_size() const { return natural_vector_size(type_of()); } /** Return true iff 64 bits and has_feature(LargeBuffers). */ bool has_large_buffers() const { return bits == 64 && has_feature(LargeBuffers); } /** Return the maximum buffer size in bytes supported on this * Target. This is 2^31 - 1 except on 64-bit targets when the LargeBuffers * feature is enabled, which expands the maximum to 2^63 - 1. */ int64_t maximum_buffer_size() const { if (has_large_buffers()) { return (((uint64_t)1) << 63) - 1; } else { return (((uint64_t)1) << 31) - 1; } } /** Get the minimum cuda capability found as an integer. Returns * 20 (our minimum supported cuda compute capability) if no cuda * features are set. */ int get_cuda_capability_lower_bound() const; /** Get the minimum Vulkan capability found as an integer. Returns * 10 (our minimum supported Vulkan compute capability) if no Vulkan * features are set. */ int get_vulkan_capability_lower_bound() const; /** Was libHalide compiled with support for this target? */ bool supported() const; /** Return a bitset of the Featuress set in this Target (set = 1). * Note that while this happens to be the current internal representation, * that might not always be the case. 
*/ const std::bitset &get_features_bitset() const { return features; } /** Return the name corresponding to a given Feature, in the form * used to construct Target strings (e.g., Feature::Debug is "debug" and not "Debug"). */ static std::string feature_to_name(Target::Feature feature); /** Return the feature corresponding to a given name, in the form * used to construct Target strings (e.g., Feature::Debug is "debug" and not "Debug"). * If the string is not a known feature name, return FeatureEnd. */ static Target::Feature feature_from_name(const std::string &name); private: /** A bitmask that stores the active features. */ std::bitset features; /** Attempt to validate that all features set are sensible for the base Target. * This is *not* guaranteed to get all invalid combinations, but is intended * to catch at least the most common (e.g., setting arm-specific features on x86). */ void validate_features() const; }; /** Return the target corresponding to the host machine. */ Target get_host_target(); /** Return the target that Halide will use. If HL_TARGET is set it * uses that. Otherwise calls \ref get_host_target */ Target get_target_from_environment(); /** Return the target that Halide will use for jit-compilation. If * HL_JIT_TARGET is set it uses that. Otherwise calls \ref * get_host_target. Throws an error if the architecture, bit width, * and OS of the target do not match the host target, so this is only * useful for controlling the feature set. */ Target get_jit_target_from_environment(); /** Get the Target feature corresponding to a DeviceAPI. For device * apis that do not correspond to any single target feature, returns * Target::FeatureEnd */ Target::Feature target_feature_for_device_api(DeviceAPI api); namespace Internal { void target_test(); } } // namespace Halide #endif Halide-17.0.1/src/Tracing.cpp000066400000000000000000000462521456515664200156770ustar00rootroot00000000000000#include "Tracing.h" #include "Bounds.h" #include "Function.h" #include "IRMutator.h" #include "IROperator.h" #include "RealizationOrder.h" #include "runtime/HalideRuntime.h" #include namespace Halide { namespace Internal { using std::map; using std::pair; using std::set; using std::string; using std::vector; namespace { struct TraceEventBuilder { string func; Expr trace_tag_expr = Expr(""); vector value; vector coordinates; Type type; enum halide_trace_event_code_t event; Expr parent_id, value_index; Expr build() { Expr values = Call::make(type_of(), Call::make_struct, value, Call::Intrinsic); Expr coords = Call::make(type_of(), Call::make_struct, coordinates, Call::Intrinsic); Expr idx = value_index; if (!idx.defined()) { idx = 0; } // Note: if these arguments are changed in any meaningful way, // VectorizeLoops will likely need attention; it does nontrivial // special-casing of this call to get appropriate results. vector args = {Expr(func), values, coords, (int)type.code(), (int)type.bits(), (int)type.lanes(), (int)event, parent_id, idx, (int)coordinates.size(), trace_tag_expr}; return Call::make(Int(32), Call::trace, args, Call::Extern); } }; class InjectTracing : public IRMutator { public: const map &env; const bool trace_all_loads, trace_all_stores, trace_all_realizations; // We want to preserve the order, so use a vector rather than a map vector>> trace_tags; set trace_tags_added; // The funcs that will have any tracing info emitted (not just trace tags), // and the Type(s) of their elements. 
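// (For a Tuple-valued Func "f" producing an Int(32) and a Float(32), for
// instance, the entry would be "f" -> {Int(32), Float(32)}, filled in one
// value_index at a time by add_func_touched() below.)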
map> funcs_touched; InjectTracing(const map &e, const Target &t) : env(e), trace_all_loads(t.has_feature(Target::TraceLoads)), trace_all_stores(t.has_feature(Target::TraceStores)), // Set trace_all_realizations to true if either trace_loads or trace_stores is on too: // They don't work without trace_all_realizations being on (and the errors are missing symbol mysterious nonsense). trace_all_realizations(t.features_any_of({Target::TraceLoads, Target::TraceStores, Target::TraceRealizations})) { } private: void add_trace_tags(const string &name, const vector &t) { if (!t.empty() && !trace_tags_added.count(name)) { trace_tags.emplace_back(name, t); trace_tags_added.insert(name); } } void add_func_touched(const string &name, int value_index, const Type &type) { auto it = funcs_touched.find(name); if (it == funcs_touched.end()) { vector types(value_index + 1); types[value_index] = type; funcs_touched[name] = types; } else { // If the type already present is missing, or "handle0" (aka "we don't know yet", // replace it with the given type. Otherwise, assert the types match. vector &types = it->second; if ((int)types.size() <= value_index) { types.resize(value_index + 1); types[value_index] = type; } else { internal_assert(type == Type() || type == types[value_index]) << "Type was already specified as " << types[value_index] << " but now is " << type; } } } using IRMutator::visit; Expr visit(const Call *op) override { Expr expr = IRMutator::visit(op); op = expr.as(); internal_assert(op); bool trace_it = false; Expr trace_parent; if (op->call_type == Call::Halide) { auto it = env.find(op->name); internal_assert(it != env.end()) << op->name << " not in environment\n"; Function f = it->second; internal_assert(!f.can_be_inlined() || !f.schedule().compute_level().is_inlined()); trace_it = f.is_tracing_loads() || trace_all_loads; trace_parent = Variable::make(Int(32), op->name + ".trace_id"); if (trace_it) { add_trace_tags(op->name, f.get_trace_tags()); } } else if (op->call_type == Call::Image) { trace_it = trace_all_loads; // If there is a Function in the env named "name_im", assume that // this image is an ImageParam, so sniff that Function to see // if we want to trace loads on it. (This allows us to trace // loads on inputs without having to enable them globally.) auto it = env.find(op->name + "_im"); if (it != env.end()) { Function f = it->second; // f could be scheduled and have actual loads from it (via ImageParam::in), // so only honor trace the loads if it is inlined. 
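// (Illustrative: an ImageParam named "input" appears in the environment under the wrapper
// name "input_im", which is what the "_im" lookup above relies on.)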
if ((f.is_tracing_loads() || trace_all_loads) && f.can_be_inlined() && f.schedule().compute_level().is_inlined()) { trace_it = true; add_trace_tags(op->name, f.get_trace_tags()); } } trace_parent = Variable::make(Int(32), "pipeline.trace_id"); } if (trace_it) { add_func_touched(op->name, op->value_index, op->type); string value_var_name = unique_name('t'); Expr value_var = Variable::make(op->type, value_var_name); TraceEventBuilder builder; builder.func = op->name; builder.value = {value_var}; builder.coordinates = op->args; builder.type = op->type; builder.event = halide_trace_load; builder.parent_id = trace_parent; builder.value_index = op->value_index; Expr trace = builder.build(); expr = Let::make(value_var_name, op, Call::make(op->type, Call::return_second, {trace, value_var}, Call::PureIntrinsic)); } return expr; } Stmt visit(const Provide *op) override { Stmt stmt = IRMutator::visit(op); op = stmt.as(); internal_assert(op); map::const_iterator iter = env.find(op->name); if (iter == env.end()) { return stmt; } Function f = iter->second; internal_assert(!f.can_be_inlined() || !f.schedule().compute_level().is_inlined()); if (f.is_tracing_stores() || trace_all_stores) { // Wrap each expr in a tracing call const vector &values = op->values; vector traces(op->values.size()); TraceEventBuilder builder; builder.func = f.name(); builder.coordinates = op->args; builder.event = halide_trace_store; builder.parent_id = Variable::make(Int(32), op->name + ".trace_id"); for (size_t i = 0; i < values.size(); i++) { Type t = values[i].type(); add_func_touched(f.name(), (int)i, t); string value_var_name = unique_name('t'); Expr value_var = Variable::make(t, value_var_name); builder.type = t; builder.value_index = (int)i; builder.value = {value_var}; Expr trace = builder.build(); if (!is_const_one(op->predicate)) { trace = Call::make(trace.type(), Call::if_then_else, {op->predicate, trace}, Call::PureIntrinsic); } traces[i] = Let::make(value_var_name, values[i], Call::make(t, Call::return_second, {trace, value_var}, Call::PureIntrinsic)); } // Lift the args out into lets so that the order of // evaluation is right for scatters. Otherwise the store // is traced before any loads in the index. vector args = op->args; vector> lets; for (auto &arg : args) { if (!arg.as() && !is_const(arg)) { string name = unique_name('t'); lets.emplace_back(name, arg); arg = Variable::make(arg.type(), name); } } stmt = Provide::make(op->name, traces, args, op->predicate); for (const auto &p : lets) { stmt = LetStmt::make(p.first, p.second, stmt); } } return stmt; } Stmt visit(const Realize *op) override { Stmt stmt = IRMutator::visit(op); op = stmt.as(); internal_assert(op); map::const_iterator iter = env.find(op->name); if (iter == env.end()) { return stmt; } Function f = iter->second; if (f.is_tracing_realizations() || trace_all_realizations) { add_trace_tags(op->name, f.get_trace_tags()); for (size_t i = 0; i < op->types.size(); i++) { add_func_touched(op->name, i, op->types[i]); } // Throw a tracing call before and after the realize body TraceEventBuilder builder; builder.func = op->name; builder.parent_id = Variable::make(Int(32), "pipeline.trace_id"); builder.event = halide_trace_begin_realization; for (const auto &bound : op->bounds) { builder.coordinates.push_back(bound.min); builder.coordinates.push_back(bound.extent); } // Begin realization returns a unique token to pass to further trace calls affecting this buffer. 
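// (Illustrative shape of the result assembled just below, for a Func named "f":
//   let f.trace_id = trace("f", begin_realization, ...) in
//     { original body; evaluate(trace("f", end_realization, parent = f.trace_id)) }.)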
Expr call_before = builder.build(); builder.event = halide_trace_end_realization; builder.parent_id = Variable::make(Int(32), op->name + ".trace_id"); Expr call_after = builder.build(); Stmt new_body = op->body; new_body = Block::make(new_body, Evaluate::make(call_after)); new_body = LetStmt::make(op->name + ".trace_id", call_before, new_body); stmt = Realize::make(op->name, op->types, op->memory_type, op->bounds, op->condition, new_body); // Warning: 'op' may be invalid at this point } else if (f.is_tracing_stores() || f.is_tracing_loads()) { // We need a trace id defined to pass to the loads and stores Stmt new_body = op->body; new_body = LetStmt::make(op->name + ".trace_id", 0, new_body); stmt = Realize::make(op->name, op->types, op->memory_type, op->bounds, op->condition, new_body); } return stmt; } Stmt visit(const ProducerConsumer *op) override { Stmt stmt = IRMutator::visit(op); op = stmt.as(); internal_assert(op); map::const_iterator iter = env.find(op->name); if (iter == env.end()) { return stmt; } Function f = iter->second; if (f.is_tracing_realizations() || trace_all_realizations) { // Throw a tracing call around each pipeline event TraceEventBuilder builder; builder.func = op->name; builder.parent_id = Variable::make(Int(32), op->name + ".trace_id"); // Use the size of the pure step const vector &f_args = f.args(); for (int i = 0; i < f.dimensions(); i++) { Expr min = Variable::make(Int(32), f.name() + ".s0." + f_args[i] + ".min"); Expr max = Variable::make(Int(32), f.name() + ".s0." + f_args[i] + ".max"); Expr extent = (max + 1) - min; builder.coordinates.push_back(min); builder.coordinates.push_back(extent); } builder.event = (op->is_producer ? halide_trace_produce : halide_trace_consume); Expr begin_op_call = builder.build(); builder.event = (op->is_producer ? halide_trace_end_produce : halide_trace_end_consume); Expr end_op_call = builder.build(); Stmt new_body = Block::make(op->body, Evaluate::make(end_op_call)); stmt = LetStmt::make(f.name() + ".trace_id", begin_op_call, ProducerConsumer::make(op->name, op->is_producer, new_body)); } return stmt; } }; class RemoveRealizeOverOutput : public IRMutator { using IRMutator::visit; const vector &outputs; Stmt visit(const Realize *op) override { for (const Function &f : outputs) { if (op->name == f.name()) { return mutate(op->body); } } return IRMutator::visit(op); } public: RemoveRealizeOverOutput(const vector &o) : outputs(o) { } }; } // namespace Stmt inject_tracing(Stmt s, const string &pipeline_name, bool trace_pipeline, const map &env, const vector &outputs, const Target &t) { Stmt original = s; InjectTracing tracing(env, t); // Add a dummy realize block for the output buffers for (const Function &output : outputs) { Region output_region; Parameter output_buf = output.output_buffers()[0]; internal_assert(output_buf.is_buffer()); for (int i = 0; i < output.dimensions(); i++) { string d = std::to_string(i); Expr min = Variable::make(Int(32), output_buf.name() + ".min." + d); Expr extent = Variable::make(Int(32), output_buf.name() + ".extent." 
+ d); output_region.emplace_back(min, extent); } s = Realize::make(output.name(), output.output_types(), MemoryType::Auto, output_region, const_true(), s); } // Inject tracing calls s = tracing.mutate(s); // Strip off the dummy realize blocks s = RemoveRealizeOverOutput(outputs).mutate(s); if (!s.same_as(original) || trace_pipeline || t.has_feature(Target::TracePipeline)) { // Add pipeline start and end events TraceEventBuilder builder; builder.func = pipeline_name; builder.event = halide_trace_begin_pipeline; builder.parent_id = 0; Expr pipeline_start = builder.build(); builder.event = halide_trace_end_pipeline; builder.parent_id = Variable::make(Int(32), "pipeline.trace_id"); Expr pipeline_end = builder.build(); s = Block::make(s, Evaluate::make(pipeline_end)); // All trace_tag events go at the start, immediately after begin_pipeline. // For a given realization/input/output, we output them in the order // we encounter them (which is to say, the order they were added); however, // we don't attempt to preserve a particular order between functions. for (const auto &trace_tags : tracing.trace_tags) { // builder.parent_id is already set correctly builder.func = trace_tags.first; // func name builder.event = halide_trace_tag; // We must reverse-iterate to preserve order for (auto it = trace_tags.second.rbegin(); it != trace_tags.second.rend(); ++it) { user_assert(it->find('\0') == string::npos) << "add_trace_tag() may not contain the null character."; builder.trace_tag_expr = Expr(*it); s = Block::make(Evaluate::make(builder.build()), s); } } builder.event = halide_trace_tag; vector<string> order = topological_order(outputs, env); // Compute boxes_touched and send a func_type_and_dim trace-tag for // everything that we actually touched, in topological order. // We include the type(s) of each Func (could be multiple for Tuple-valued // Funcs), and the dimensions and guess-at-ranges-touched. Note that the // dimensions should be exact, but the ranges-touched is a conservative estimate; // that's ok, as we just want to send these as rough guesses for a tracing tool to use for // automatic layout. (Note that we deliberately send these // before any user-specified trace-tags.)
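// (Illustrative: for a single-valued Int(32) Func realized over [0, 16) x [0, 8), the string
// assembled below is "func_type_and_dim: 1 0 32 1 2 0 16 0 8": the number of types, then a
// (code bits lanes) triple per type, then the number of dimensions, then (min extent) per dimension.)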
Expr space = Expr(" "); std::map bt = boxes_touched(s); for (auto topo_it = order.rbegin(); topo_it != order.rend(); ++topo_it) { const string &o = *topo_it; auto p = tracing.funcs_touched.find(*topo_it); if (p == tracing.funcs_touched.end() && ends_with(o, "_im")) { p = tracing.funcs_touched.find(o.substr(0, o.size() - 3)); } if (p == tracing.funcs_touched.end()) { continue; } const string &func_name = p->first; const vector &func_types = p->second; builder.func = func_name; vector strings; strings.emplace_back("func_type_and_dim:"); strings.push_back(space); strings.emplace_back((int)func_types.size()); for (const auto &func_type : func_types) { strings.push_back(space); strings.emplace_back((int)func_type.code()); strings.push_back(space); strings.emplace_back(func_type.bits()); strings.push_back(space); strings.emplace_back(func_type.lanes()); } auto it = bt.find(func_name); internal_assert(it != bt.end()); const Box &box = it->second; strings.push_back(space); strings.emplace_back((int)box.bounds.size()); for (const Interval &i : box.bounds) { internal_assert(i.min.defined() && i.max.defined()); if (i.is_bounded()) { strings.push_back(space); strings.push_back(i.min); strings.push_back(space); // Emit as (min, extent) rather than (min, max) strings.push_back(i.max - i.min + Expr(1)); } else { // This should really only happen for unusual cases // that we won't end up realizing, so we can just // use any numeric values. strings.push_back(space); strings.emplace_back(0); strings.push_back(space); strings.emplace_back(0); } } builder.trace_tag_expr = Internal::Call::make(type_of(), Internal::Call::stringify, strings, Internal::Call::PureIntrinsic); s = Block::make(Evaluate::make(builder.build()), s); } s = LetStmt::make("pipeline.trace_id", pipeline_start, s); } return s; } } // namespace Internal } // namespace Halide Halide-17.0.1/src/Tracing.h000066400000000000000000000015051456515664200153340ustar00rootroot00000000000000#ifndef HALIDE_TRACING_H #define HALIDE_TRACING_H /** \file * Defines the lowering pass that injects print statements when tracing is turned on */ #include #include #include #include "Expr.h" namespace Halide { struct Target; namespace Internal { class Function; /** Take a statement representing a halide pipeline, inject calls to * tracing functions at interesting points, such as * allocations. Should be done before storage flattening, but after * all bounds inference. */ Stmt inject_tracing(Stmt, const std::string &pipeline_name, bool trace_pipeline, const std::map &env, const std::vector &outputs, const Target &Target); } // namespace Internal } // namespace Halide #endif Halide-17.0.1/src/TrimNoOps.cpp000066400000000000000000000363321456515664200162000ustar00rootroot00000000000000#include #include "CSE.h" #include "CodeGen_GPU_Dev.h" #include "ExprUsesVar.h" #include "IREquality.h" #include "IRMutator.h" #include "IROperator.h" #include "Simplify.h" #include "Solve.h" #include "Substitute.h" #include "TrimNoOps.h" #include "Var.h" namespace Halide { namespace Internal { using std::string; using std::vector; namespace { /** Remove identity functions, even if they have side-effects. */ class StripIdentities : public IRMutator { using IRMutator::visit; Expr visit(const Call *op) override { if (Call::as_tag(op) || op->is_intrinsic(Call::return_second)) { return mutate(op->args.back()); } else { return IRMutator::visit(op); } } }; /** Check if an Expr loads from the given buffer. 
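 * (Illustrative: this lets the Store visitor below cheaply rule out stores like f[x] = g[x] + 1,
 * whose right-hand side never reads f, as candidates for being no-ops.)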
*/ class LoadsFromBuffer : public IRVisitor { using IRVisitor::visit; void visit(const Load *op) override { if (op->name == buffer) { result = true; } else { IRVisitor::visit(op); } } string buffer; public: bool result = false; LoadsFromBuffer(const string &b) : buffer(b) { } }; bool loads_from_buffer(const Expr &e, const string &buf) { LoadsFromBuffer l(buf); e.accept(&l); return l.result; } /** Construct a sufficient condition for the visited stmt to be a no-op. */ class IsNoOp : public IRVisitor { using IRVisitor::visit; Expr make_and(Expr a, Expr b) { if (is_const_zero(a) || is_const_one(b)) { return a; } if (is_const_zero(b) || is_const_one(a)) { return b; } return a && b; } Expr make_or(Expr a, Expr b) { if (is_const_zero(a) || is_const_one(b)) { return b; } if (is_const_zero(b) || is_const_one(a)) { return a; } return a || b; } void visit(const Store *op) override { if (op->value.type().is_handle() || is_const_zero(op->predicate)) { condition = const_false(); } else { if (is_const_zero(condition)) { return; } // If the value being stored is the same as the value loaded, // this is a no-op debug(3) << "Considering store: " << Stmt(op) << "\n"; // Early-out: There's no way for that to be true if the // RHS does not load from the buffer being stored to. if (!loads_from_buffer(op->value, op->name)) { condition = const_false(); return; } Expr equivalent_load = Load::make(op->value.type(), op->name, op->index, Buffer<>(), Parameter(), op->predicate, op->alignment); Expr is_no_op = equivalent_load == op->value; is_no_op = StripIdentities().mutate(is_no_op); // We need to call CSE since sometimes we have "let" stmt on the RHS // that makes the expr harder to solve, i.e. the solver will just give up // and return a conservative false on call to and_condition_over_domain(). is_no_op = simplify(common_subexpression_elimination(is_no_op)); debug(3) << "Anding condition over domain... " << is_no_op << "\n"; is_no_op = and_condition_over_domain(is_no_op, Scope::empty_scope()); condition = make_and(condition, is_no_op); debug(3) << "Condition is now " << condition << "\n"; } } void visit(const For *op) override { if (is_const_zero(condition)) { return; } Expr old_condition = condition; condition = const_true(); op->body.accept(this); Scope varying; varying.push(op->name, Interval(op->min, op->min + op->extent - 1)); condition = simplify(common_subexpression_elimination(condition)); debug(3) << "About to relax over " << op->name << " : " << condition << "\n"; condition = and_condition_over_domain(condition, varying); debug(3) << "Relaxed: " << condition << "\n"; condition = make_and(old_condition, make_or(condition, simplify(op->extent <= 0))); } void visit(const IfThenElse *op) override { if (is_const_zero(condition)) { return; } Expr total_condition = condition; condition = const_true(); op->then_case.accept(this); // This is a no-op if we're previously a no-op, and the // condition is false or the if body is a no-op. total_condition = make_and(total_condition, make_or(!op->condition, condition)); condition = const_true(); if (op->else_case.defined()) { op->else_case.accept(this); total_condition = make_and(total_condition, make_or(op->condition, condition)); } condition = total_condition; } void visit(const Call *op) override { // If the loop calls an impure function, we can't remove the // call to it. Most notably: image_store. 
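// (Illustrative: a pure self-assignment such as f[x] = f[x] is recognized as a no-op by the
// Store visitor above, but a body that only performs image_store() calls still has observable
// effects, so the condition is forced to false here.)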
if (!op->is_pure()) { condition = const_false(); return; } IRVisitor::visit(op); } void visit(const Acquire *op) override { condition = const_false(); } template void visit_let(const LetOrLetStmt *op) { IRVisitor::visit(op); if (expr_uses_var(condition, op->name)) { condition = Let::make(op->name, op->value, condition); } } void visit(const LetStmt *op) override { visit_let(op); } void visit(const Let *op) override { visit_let(op); } public: Expr condition = const_true(); }; class SimplifyUsingBounds : public IRMutator { struct ContainingLoop { string var; Interval i; }; vector containing_loops; using IRMutator::visit; // Can we prove a condition over the non-rectangular domain of the for loops we're in? bool provably_true_over_domain(Expr test) { debug(3) << "Attempting to prove: " << test << "\n"; for (size_t i = containing_loops.size(); i > 0; i--) { // Because the domain is potentially non-rectangular, we // need to take each variable one-by-one, simplifying in // between to allow for cancellations of the bounds of // inner loops with outer loop variables. auto loop = containing_loops[i - 1]; if (is_const(test)) { break; } else if (!expr_uses_var(test, loop.var)) { continue; } else if (loop.i.is_bounded() && can_prove(loop.i.min == loop.i.max) && expr_uses_var(test, loop.var)) { // If min == max then either the domain only has one correct value, which we // can substitute directly. // Need to call CSE here since simplify() is sometimes unable to simplify expr with // non-trivial 'let' value, e.g. (let x = min(10, y-1) in (x < y)) test = common_subexpression_elimination(Let::make(loop.var, loop.i.min, test)); } else if (loop.i.is_bounded() && can_prove(loop.i.min >= loop.i.max) && expr_uses_var(test, loop.var)) { // If min >= max then either the domain only has one correct value, // or the domain is empty, which implies both min/max are true under // the domain. // Need to call CSE here since simplify() is sometimes unable to simplify expr with // non-trivial 'let' value, e.g. (let x = 10 in x < y) || (let x = min(10, y-1) in (x < y)) test = common_subexpression_elimination(Let::make(loop.var, loop.i.min, test) || Let::make(loop.var, loop.i.max, test)); } else { Scope s; // Rearrange the expression if possible so that the // loop var only occurs once. 
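// (Illustrative: a test such as x + 3 < y + 2*x, with loop var x, is rearranged so that x
// appears only once, roughly 3 - y < x, before being relaxed over the loop's interval.)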
SolverResult solved = solve_expression(test, loop.var); if (solved.fully_solved) { test = solved.result; } s.push(loop.var, loop.i); test = and_condition_over_domain(test, s); } test = simplify(test); debug(3) << " -> " << test << "\n"; } return is_const_one(test); } Expr visit(const Min *op) override { if (!op->type.is_int() || op->type.bits() < 32) { return IRMutator::visit(op); } else { Expr a = mutate(op->a); Expr b = mutate(op->b); Expr test = a <= b; if (provably_true_over_domain(a <= b)) { return a; } else if (provably_true_over_domain(b <= a)) { return b; } else { return Min::make(a, b); } } } Expr visit(const Max *op) override { if (!op->type.is_int() || op->type.bits() < 32) { return IRMutator::visit(op); } else { Expr a = mutate(op->a); Expr b = mutate(op->b); if (provably_true_over_domain(a >= b)) { return a; } else if (provably_true_over_domain(b >= a)) { return b; } else { return Max::make(a, b); } } } template Expr visit_cmp(const Cmp *op) { Expr expr = IRMutator::visit(op); if (provably_true_over_domain(expr)) { expr = make_one(op->type); } else if (provably_true_over_domain(!expr)) { expr = make_zero(op->type); } return expr; } Expr visit(const LE *op) override { return visit_cmp(op); } Expr visit(const LT *op) override { return visit_cmp(op); } Expr visit(const GE *op) override { return visit_cmp(op); } Expr visit(const GT *op) override { return visit_cmp(op); } Expr visit(const EQ *op) override { return visit_cmp(op); } Expr visit(const NE *op) override { return visit_cmp(op); } template StmtOrExpr visit_let(const LetStmtOrLet *op) { Expr value = mutate(op->value); StmtOrExpr body; if (value.type() == Int(32) && is_pure(value)) { containing_loops.push_back({op->name, {value, value}}); body = mutate(op->body); containing_loops.pop_back(); } else { body = mutate(op->body); } return LetStmtOrLet::make(op->name, value, body); } Expr visit(const Let *op) override { return visit_let(op); } Stmt visit(const LetStmt *op) override { return visit_let(op); } Stmt visit(const For *op) override { // Simplify the loop bounds. 
Expr min = mutate(op->min); Expr extent = mutate(op->extent); containing_loops.push_back({op->name, {min, min + extent - 1}}); Stmt body = mutate(op->body); containing_loops.pop_back(); return For::make(op->name, min, extent, op->for_type, op->partition_policy, op->device_api, body); } public: SimplifyUsingBounds(const string &v, const Interval &i) { containing_loops.push_back({v, i}); } SimplifyUsingBounds() = default; }; class TrimNoOps : public IRMutator { using IRMutator::visit; Stmt visit(const For *op) override { // Bounds of GPU loops can't depend on outer gpu loop vars if (CodeGen_GPU_Dev::is_gpu_var(op->name)) { debug(3) << "TrimNoOps found gpu loop var: " << op->name << "\n"; return IRMutator::visit(op); } Stmt body = mutate(op->body); debug(3) << "\n\n ***** Trim no ops in loop over " << op->name << "\n"; IsNoOp is_no_op; body.accept(&is_no_op); debug(3) << "Condition is " << is_no_op.condition << "\n"; is_no_op.condition = simplify(simplify(common_subexpression_elimination(is_no_op.condition))); debug(3) << "Simplified condition is " << is_no_op.condition << "\n"; if (is_const_one(is_no_op.condition)) { // This loop is definitely useless debug(3) << "Removed empty loop.\n" << "Old: " << Stmt(op) << "\n"; return Evaluate::make(0); } else if (is_const_zero(is_no_op.condition)) { // This loop is definitely needed if (body.same_as(op->body)) { return op; } else { return For::make(op->name, op->min, op->extent, op->for_type, op->partition_policy, op->device_api, body); } } // The condition is something interesting. Try to see if we // can trim the loop bounds over which the loop does // something. Interval i = solve_for_outer_interval(!is_no_op.condition, op->name); debug(3) << "Interval is: " << i.min << ", " << i.max << "\n"; if (i.is_everything()) { // Nope. return For::make(op->name, op->min, op->extent, op->for_type, op->partition_policy, op->device_api, body); } if (i.is_empty()) { // Empty loop debug(3) << "Removed empty loop.\n" << "Old: " << Stmt(op) << "\n"; return Evaluate::make(0); } // Simplify the body to take advantage of the fact that the // loop range is now truncated body = simplify(SimplifyUsingBounds(op->name, i).mutate(body)); string new_min_name = unique_name(op->name + ".new_min"); string new_max_name = unique_name(op->name + ".new_max"); string old_max_name = unique_name(op->name + ".old_max"); Expr new_min_var = Variable::make(Int(32), new_min_name); Expr new_max_var = Variable::make(Int(32), new_max_name); Expr old_max_var = Variable::make(Int(32), old_max_name); // Convert max to max-plus-one if (i.has_upper_bound()) { i.max = i.max + 1; } // Truncate the loop bounds to the region over which it's not // a no-op. 
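// (Illustrative shape of the rewrite assembled below, for a loop over x; i.max is already
// one-past-the-end at this point:
//   let x.old_max = min + extent in
//   let x.new_min = clamp(i.min, min, x.old_max) in
//   let x.new_max = clamp(i.max, x.new_min, x.old_max) in
//     for (x, x.new_min, x.new_max - x.new_min) { simplified body }.)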
Expr old_max = op->min + op->extent; Expr new_min, new_max; if (i.has_lower_bound()) { new_min = clamp(i.min, op->min, old_max_var); } else { new_min = op->min; } if (i.has_upper_bound()) { new_max = clamp(i.max, new_min_var, old_max_var); } else { new_max = old_max; } Expr new_extent = new_max_var - new_min_var; Stmt stmt = For::make(op->name, new_min_var, new_extent, op->for_type, op->partition_policy, op->device_api, body); stmt = LetStmt::make(new_max_name, new_max, stmt); stmt = LetStmt::make(new_min_name, new_min, stmt); stmt = LetStmt::make(old_max_name, old_max, stmt); stmt = simplify(stmt); debug(3) << "Rewrote loop.\n" << "Old: " << Stmt(op) << "\n" << "New: " << stmt << "\n"; return stmt; } }; } // namespace Stmt trim_no_ops(Stmt s) { s = TrimNoOps().mutate(s); return s; } } // namespace Internal } // namespace Halide Halide-17.0.1/src/TrimNoOps.h000066400000000000000000000006711456515664200156420ustar00rootroot00000000000000#ifndef TRIM_NO_OPS_H #define TRIM_NO_OPS_H /** \file * Defines a lowering pass that truncates loops to the region over * which they actually do something. */ #include "Expr.h" namespace Halide { namespace Internal { /** Truncate loop bounds to the region over which they actually do * something. For examples see test/correctness/trim_no_ops.cpp */ Stmt trim_no_ops(Stmt s); } // namespace Internal } // namespace Halide #endif Halide-17.0.1/src/Tuple.cpp000066400000000000000000000006211456515664200153670ustar00rootroot00000000000000#include "Tuple.h" #include "Debug.h" #include "Func.h" namespace Halide { Tuple::Tuple(const FuncRef &f) : exprs(f.size()) { user_assert(f.size() > 1) << "Can't construct a Tuple from a call to Func \"" << f.function().name() << "\" because it does not return a Tuple.\n"; for (size_t i = 0; i < f.size(); i++) { exprs[i] = f[i]; } } } // namespace Halide Halide-17.0.1/src/Tuple.h000066400000000000000000000032061456515664200150360ustar00rootroot00000000000000#ifndef HALIDE_TUPLE_H #define HALIDE_TUPLE_H /** \file * * Defines Tuple - the front-end handle on small arrays of expressions. */ #include #include "Expr.h" namespace Halide { class FuncRef; /** Create a small array of Exprs for defining and calling functions * with multiple outputs. */ class Tuple { private: std::vector exprs; public: /** The number of elements in the tuple. */ size_t size() const { return exprs.size(); } /** Get a reference to an element. */ Expr &operator[](size_t x) { user_assert(x < exprs.size()) << "Tuple access out of bounds\n"; return exprs[x]; } /** Get a copy of an element. */ Expr operator[](size_t x) const { user_assert(x < exprs.size()) << "Tuple access out of bounds\n"; return exprs[x]; } /** Construct a Tuple of a single Expr */ explicit Tuple(Expr e) { exprs.emplace_back(std::move(e)); } /** Construct a Tuple from some Exprs. */ //@{ template Tuple(const Expr &a, const Expr &b, Args &&...args) { exprs = std::vector{a, b, std::forward(args)...}; } //@} /** Construct a Tuple from a vector of Exprs */ explicit HALIDE_NO_USER_CODE_INLINE Tuple(const std::vector &e) : exprs(e) { user_assert(!e.empty()) << "Tuples must have at least one element\n"; } /** Construct a Tuple from a function reference. 
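 * For example (illustrative): if f is a Tuple-valued Func, e.g. f(x) = Tuple(x, x * 2.0f),
 * then Tuple t = f(x); yields the call to f as a two-element Tuple of Exprs, t[0] and t[1].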
*/ Tuple(const FuncRef &); /** Treat the tuple as a vector of Exprs */ const std::vector &as_vector() const { return exprs; } }; } // namespace Halide #endif Halide-17.0.1/src/Type.cpp000066400000000000000000000255221456515664200152260ustar00rootroot00000000000000#include "IR.h" #include #include namespace Halide { using std::ostringstream; namespace { uint64_t max_uint(int bits) { uint64_t max_val = 0xffffffffffffffffULL; return max_val >> (64 - bits); } int64_t max_int(int bits) { int64_t max_val = 0x7fffffffffffffffLL; return max_val >> (64 - bits); } int64_t min_int(int bits) { return -max_int(bits) - 1; } } // namespace /** Return an expression which is the maximum value of this type */ Halide::Expr Type::max() const { if (is_vector()) { return Internal::Broadcast::make(element_of().max(), lanes()); } else if (is_int()) { return Internal::IntImm::make(*this, max_int(bits())); } else if (is_uint()) { return Internal::UIntImm::make(*this, max_uint(bits())); } else { internal_assert(is_float()); if (bits() == 16) { return Internal::FloatImm::make(*this, 65504.0); } else if (bits() == 32) { return Internal::FloatImm::make(*this, std::numeric_limits::infinity()); } else if (bits() == 64) { return Internal::FloatImm::make(*this, std::numeric_limits::infinity()); } else { internal_error << "Unknown float type: " << (*this) << "\n"; return 0; } } } /** Return an expression which is the minimum value of this type */ Halide::Expr Type::min() const { if (is_vector()) { return Internal::Broadcast::make(element_of().min(), lanes()); } else if (is_int()) { return Internal::IntImm::make(*this, min_int(bits())); } else if (is_uint()) { return Internal::UIntImm::make(*this, 0); } else { internal_assert(is_float()); if (bits() == 16) { return Internal::FloatImm::make(*this, -65504.0); } else if (bits() == 32) { return Internal::FloatImm::make(*this, -std::numeric_limits::infinity()); } else if (bits() == 64) { return Internal::FloatImm::make(*this, -std::numeric_limits::infinity()); } else { internal_error << "Unknown float type: " << (*this) << "\n"; return 0; } } } bool Type::is_max(int64_t x) const { return x > 0 && is_max((uint64_t)x); } bool Type::is_max(uint64_t x) const { if (is_int()) { return x == (uint64_t)max_int(bits()); } else if (is_uint()) { return x == max_uint(bits()); } else { return false; } } bool Type::is_min(int64_t x) const { if (is_int()) { return x == min_int(bits()); } else if (is_uint()) { return x == 0; } else { return false; } } bool Type::is_min(uint64_t x) const { return false; } bool Type::can_represent(Type other) const { if (*this == other) { return true; } if (lanes() != other.lanes()) { return false; } if (is_int()) { return ((other.is_int() && other.bits() <= bits()) || (other.is_uint() && other.bits() < bits())); } else if (is_uint()) { return other.is_uint() && other.bits() <= bits(); } else if (is_bfloat()) { return (other.is_bfloat() && other.bits() <= bits()); } else if (is_float()) { if (other.is_bfloat()) { return bits() > other.bits(); } else { return ((other.is_float() && other.bits() <= bits()) || (bits() == 64 && other.bits() <= 32) || (bits() == 32 && other.bits() <= 16)); } } else { return false; } } bool Type::can_represent(int64_t x) const { if (is_int()) { return x >= min_int(bits()) && x <= max_int(bits()); } else if (is_uint()) { return x >= 0 && (uint64_t)x <= max_uint(bits()); } else if (is_bfloat()) { switch (bits()) { case 16: // Round-trip from int64_t to bfloat16_t and back to see // if the value was preserved. 
This round-tripping must be // done via float in both directions, which gives us the // following ridiculous chain of casts: return (int64_t)(float)(bfloat16_t)(float)x == x; default: return false; } } else if (is_float()) { switch (bits()) { case 16: return (int64_t)(float)(float16_t)(float)x == x; case 32: return (int64_t)(float)x == x; case 64: return (int64_t)(double)x == x; default: return false; } } else { return false; } } bool Type::can_represent(uint64_t x) const { if (is_int()) { return x <= (uint64_t)(max_int(bits())); } else if (is_uint()) { return x <= max_uint(bits()); } else if (is_bfloat()) { switch (bits()) { case 16: return (uint64_t)(float)(bfloat16_t)(float)x == x; default: return false; } } else if (is_float()) { switch (bits()) { case 16: return (uint64_t)(float)(float16_t)(float)x == x; case 32: return (uint64_t)(float)x == x; case 64: return (uint64_t)(double)x == x; default: return false; } } else { return false; } } bool Type::can_represent(double x) const { if (is_int()) { int64_t i = Internal::safe_numeric_cast(x); return (x >= min_int(bits())) && (x <= max_int(bits())) && (x == (double)i); } else if (is_uint()) { uint64_t u = Internal::safe_numeric_cast(x); return (x >= 0) && (x <= max_uint(bits())) && (x == (double)u); } else if (is_bfloat()) { switch (bits()) { case 16: return (double)(bfloat16_t)x == x; default: return false; } } else if (is_float()) { switch (bits()) { case 16: return (double)(float16_t)x == x; case 32: return (double)(float)x == x; case 64: return true; default: return false; } } else { return false; } } bool Type::same_handle_type(const Type &other) const { const halide_handle_cplusplus_type *first = handle_type; const halide_handle_cplusplus_type *second = other.handle_type; if (first == second) { return true; } if (first == nullptr) { first = halide_handle_traits::type_info(); } if (second == nullptr) { second = halide_handle_traits::type_info(); } return first->inner_name == second->inner_name && first->namespaces == second->namespaces && first->enclosing_types == second->enclosing_types && first->cpp_type_modifiers == second->cpp_type_modifiers && first->reference_type == second->reference_type; } std::string type_to_c_type(Type type, bool include_space, bool c_plus_plus) { bool needs_space = true; ostringstream oss; if (type.is_bfloat()) { oss << "bfloat" << type.bits() << "_t"; } else if (type.is_float()) { if (type.bits() == 32) { oss << "float"; } else if (type.bits() == 64) { oss << "double"; } else { oss << "float" << type.bits() << "_t"; } if (type.is_vector()) { oss << type.lanes(); } } else if (type.is_handle()) { needs_space = false; // If there is no type info or is generating C (not C++) and // the type is a class or in an inner scope, just use void *. 
if (type.handle_type == nullptr || (!c_plus_plus && (!type.handle_type->namespaces.empty() || !type.handle_type->enclosing_types.empty() || type.handle_type->inner_name.cpp_type_type == halide_cplusplus_type_name::Class))) { oss << "void *"; } else { if (type.handle_type->inner_name.cpp_type_type == halide_cplusplus_type_name::Struct) { oss << "struct "; } if (!type.handle_type->namespaces.empty() || !type.handle_type->enclosing_types.empty()) { oss << "::"; for (const auto &ns : type.handle_type->namespaces) { oss << ns << "::"; } for (const auto &enclosing_type : type.handle_type->enclosing_types) { oss << enclosing_type.name << "::"; } } oss << type.handle_type->inner_name.name; if (type.handle_type->reference_type == halide_handle_cplusplus_type::LValueReference) { oss << " &"; } else if (type.handle_type->reference_type == halide_handle_cplusplus_type::RValueReference) { oss << " &&"; } for (auto modifier : type.handle_type->cpp_type_modifiers) { if (modifier & halide_handle_cplusplus_type::Const) { oss << " const"; } if (modifier & halide_handle_cplusplus_type::Volatile) { oss << " volatile"; } if (modifier & halide_handle_cplusplus_type::Restrict) { oss << " restrict"; } if ((modifier & halide_handle_cplusplus_type::Pointer) && !(modifier & halide_handle_cplusplus_type::FunctionTypedef)) { oss << " *"; } } } } else { // This ends up using different type names than OpenCL does // for the integer vector types. E.g. uint16x8_t rather than // OpenCL's short8. Should be fine as CodeGen_C introduces // typedefs for them and codegen always goes through this // routine or its override in CodeGen_OpenCL to make the // names. This may be the better bet as the typedefs are less // likely to collide with built-in types (e.g. the OpenCL // ones for a C compiler that decides to compile OpenCL). // This code also supports arbitrary vector sizes where the // OpenCL ones must be one of 2, 3, 4, 8, 16, which is too // restrictive for already existing architectures. switch (type.bits()) { case 1: // bool vectors are always emitted as uint8 in the C++ backend if (type.is_vector()) { oss << "uint8x" << type.lanes() << "_t"; } else { oss << "bool"; } break; default: if (type.is_uint()) { oss << "u"; } oss << "int" << type.bits(); if (type.is_vector()) { oss << "x" << type.lanes(); } oss << "_t"; } } if (include_space && needs_space) { oss << " "; } return oss.str(); } } // namespace Halide Halide-17.0.1/src/Type.h000066400000000000000000000506571456515664200147020ustar00rootroot00000000000000#ifndef HALIDE_TYPE_H #define HALIDE_TYPE_H #include "Error.h" #include "Float16.h" #include "Util.h" #include "runtime/HalideRuntime.h" #include /** \file * Defines halide types */ /** A set of types to represent a C++ function signature. This allows * two things. First, proper prototypes can be provided for Halide * generated functions, giving better compile time type * checking. Second, C++ name mangling can be done to provide link * time type checking for both Halide generated functions and calls * from Halide to external functions. * * These are intended to be constexpr producable. * * halide_handle_traits has to go outside the Halide namespace due to template * resolution rules. TODO(zalman): Do all types need to be in global namespace? */ //@{ /** A structure to represent the (unscoped) name of a C++ composite type for use * as a single argument (or return value) in a function signature. * * Currently does not support the restrict qualifier, references, or * r-value references. 
These features cannot be used in extern * function calls from Halide or in the generated function from * Halide, but their applicability seems limited anyway. * * Although this is in the global namespace, it should be considered "Halide Internal" * and subject to change; code outside Halide should avoid referencing it. */ struct halide_cplusplus_type_name { /// An enum to indicate whether a C++ type is non-composite, a struct, class, or union enum CPPTypeType { Simple, ///< "int" Struct, ///< "struct Foo" Class, ///< "class Foo" Union, ///< "union Foo" Enum, ///< "enum Foo" } cpp_type_type; // Note: order is reflected in map_to_name table in CPlusPlusMangle.cpp std::string name; halide_cplusplus_type_name(CPPTypeType cpp_type_type, const std::string &name) : cpp_type_type(cpp_type_type), name(name) { } bool operator==(const halide_cplusplus_type_name &rhs) const { return cpp_type_type == rhs.cpp_type_type && name == rhs.name; } bool operator!=(const halide_cplusplus_type_name &rhs) const { return !(*this == rhs); } bool operator<(const halide_cplusplus_type_name &rhs) const { return cpp_type_type < rhs.cpp_type_type || (cpp_type_type == rhs.cpp_type_type && name < rhs.name); } }; /** A structure to represent the fully scoped name of a C++ composite * type for use in generating function signatures that use that type. * * This is intended to be a constexpr usable type. * * Although this is in the global namespace, it should be considered "Halide Internal" * and subject to change; code outside Halide should avoid referencing it. */ struct halide_handle_cplusplus_type { halide_cplusplus_type_name inner_name; std::vector namespaces; std::vector enclosing_types; /// One set of modifiers on a type. /// The const/volatile/restrict properties are "inside" the pointer property. enum Modifier : uint8_t { Const = 1 << 0, ///< Bitmask flag for "const" Volatile = 1 << 1, ///< Bitmask flag for "volatile" Restrict = 1 << 2, ///< Bitmask flag for "restrict" Pointer = 1 << 3, ///< Bitmask flag for a pointer "*" FunctionTypedef = 1 << 4, ///< Bitmask flag for a function typedef; when this is set, Pointer should also always be set }; /// Qualifiers and indirections on type. 0 is innermost. std::vector cpp_type_modifiers; /// References are separate because they only occur at the outermost level. /// No modifiers are needed for references as they are not allowed to apply /// to the reference itself. (This isn't true for restrict, but that is a C++ /// extension anyway.) If modifiers are needed, the last entry in the above /// array would be the modifers for the reference. enum ReferenceType : uint8_t { NotReference = 0, LValueReference = 1, // "&" RValueReference = 2, // "&&" }; ReferenceType reference_type; halide_handle_cplusplus_type(const halide_cplusplus_type_name &inner_name, const std::vector &namespaces = {}, const std::vector &enclosing_types = {}, const std::vector &modifiers = {}, ReferenceType reference_type = NotReference) : inner_name(inner_name), namespaces(namespaces), enclosing_types(enclosing_types), cpp_type_modifiers(modifiers), reference_type(reference_type) { } template static halide_handle_cplusplus_type make(); }; //@} /** halide_c_type_to_name is a utility class used to provide a user-extensible * way of naming Handle types. * * Although this is in the global namespace, it should be considered "Halide Internal" * and subject to change; code outside Halide should avoid referencing it * directly (use the HALIDE_DECLARE_EXTERN_xxx macros instead). 
*/ template struct halide_c_type_to_name { static constexpr bool known_type = false; static halide_cplusplus_type_name name() { return {halide_cplusplus_type_name::Simple, "void"}; } }; #define HALIDE_DECLARE_EXTERN_TYPE(TypeType, Type) \ template<> \ struct halide_c_type_to_name { \ static constexpr bool known_type = true; \ static halide_cplusplus_type_name name() { \ return {halide_cplusplus_type_name::TypeType, #Type}; \ } \ } #define HALIDE_DECLARE_EXTERN_SIMPLE_TYPE(T) HALIDE_DECLARE_EXTERN_TYPE(Simple, T) #define HALIDE_DECLARE_EXTERN_STRUCT_TYPE(T) HALIDE_DECLARE_EXTERN_TYPE(Struct, T) #define HALIDE_DECLARE_EXTERN_CLASS_TYPE(T) HALIDE_DECLARE_EXTERN_TYPE(Class, T) #define HALIDE_DECLARE_EXTERN_UNION_TYPE(T) HALIDE_DECLARE_EXTERN_TYPE(Union, T) HALIDE_DECLARE_EXTERN_SIMPLE_TYPE(char); HALIDE_DECLARE_EXTERN_SIMPLE_TYPE(bool); HALIDE_DECLARE_EXTERN_SIMPLE_TYPE(int8_t); HALIDE_DECLARE_EXTERN_SIMPLE_TYPE(uint8_t); HALIDE_DECLARE_EXTERN_SIMPLE_TYPE(int16_t); HALIDE_DECLARE_EXTERN_SIMPLE_TYPE(uint16_t); HALIDE_DECLARE_EXTERN_SIMPLE_TYPE(int32_t); HALIDE_DECLARE_EXTERN_SIMPLE_TYPE(uint32_t); HALIDE_DECLARE_EXTERN_SIMPLE_TYPE(int64_t); HALIDE_DECLARE_EXTERN_SIMPLE_TYPE(uint64_t); HALIDE_DECLARE_EXTERN_SIMPLE_TYPE(Halide::float16_t); HALIDE_DECLARE_EXTERN_SIMPLE_TYPE(Halide::bfloat16_t); HALIDE_DECLARE_EXTERN_SIMPLE_TYPE(halide_task_t); HALIDE_DECLARE_EXTERN_SIMPLE_TYPE(halide_loop_task_t); HALIDE_DECLARE_EXTERN_SIMPLE_TYPE(float); HALIDE_DECLARE_EXTERN_SIMPLE_TYPE(double); HALIDE_DECLARE_EXTERN_STRUCT_TYPE(halide_buffer_t); HALIDE_DECLARE_EXTERN_STRUCT_TYPE(halide_dimension_t); HALIDE_DECLARE_EXTERN_STRUCT_TYPE(halide_device_interface_t); HALIDE_DECLARE_EXTERN_STRUCT_TYPE(halide_filter_metadata_t); HALIDE_DECLARE_EXTERN_STRUCT_TYPE(halide_semaphore_t); HALIDE_DECLARE_EXTERN_STRUCT_TYPE(halide_semaphore_acquire_t); HALIDE_DECLARE_EXTERN_STRUCT_TYPE(halide_parallel_task_t); // You can make arbitrary user-defined types be "Known" using the // macro above. This is useful for making Param<> arguments for // Generators type safe. e.g., // // struct MyFunStruct { ... }; // // ... // // HALIDE_DECLARE_EXTERN_STRUCT_TYPE(MyFunStruct); // // ... // // class MyGenerator : public Generator { // Param my_struct_ptr; // ... // }; template /*static*/ halide_handle_cplusplus_type halide_handle_cplusplus_type::make() { constexpr bool is_ptr = std::is_pointer::value; constexpr bool is_lvalue_reference = std::is_lvalue_reference::value; constexpr bool is_rvalue_reference = std::is_rvalue_reference::value; using TNoRef = typename std::remove_reference::type; using TNoRefNoPtr = typename std::remove_pointer::type; constexpr bool is_function_pointer = std::is_pointer::value && std::is_function::value; // Don't remove the pointer-ness from a function pointer. using TBase = typename std::conditional::type; constexpr bool is_const = std::is_const::value; constexpr bool is_volatile = std::is_volatile::value; constexpr uint8_t modifiers = static_cast( (is_function_pointer ? halide_handle_cplusplus_type::FunctionTypedef : 0) | (is_ptr ? halide_handle_cplusplus_type::Pointer : 0) | (is_const ? halide_handle_cplusplus_type::Const : 0) | (is_volatile ? halide_handle_cplusplus_type::Volatile : 0)); // clang-format off constexpr halide_handle_cplusplus_type::ReferenceType ref_type = (is_lvalue_reference ? halide_handle_cplusplus_type::LValueReference : is_rvalue_reference ? 
halide_handle_cplusplus_type::RValueReference : halide_handle_cplusplus_type::NotReference); // clang-format on using TNonCVBase = typename std::remove_cv::type; constexpr bool known_type = halide_c_type_to_name::known_type; static_assert(!(!known_type && !is_ptr), "Unknown types must be pointers"); halide_handle_cplusplus_type info = { halide_c_type_to_name::name(), {}, {}, {modifiers}, ref_type}; // Pull off any namespaces info.inner_name.name = Halide::Internal::extract_namespaces(info.inner_name.name, info.namespaces); return info; } /** A type traits template to provide a halide_handle_cplusplus_type * value from a C++ type. * * Note the type represented is implicitly a pointer. * * A NULL pointer of type halide_handle_traits represents "void *". * This is chosen for compactness or representation as Type is a very * widely used data structure. * * Although this is in the global namespace, it should be considered "Halide Internal" * and subject to change; code outside Halide should avoid referencing it directly. */ template struct halide_handle_traits { // This trait must return a pointer to a global structure. I.e. it should never be freed. // A return value of nullptr here means "void *". HALIDE_ALWAYS_INLINE static const halide_handle_cplusplus_type *type_info() { if (std::is_pointer::value || std::is_lvalue_reference::value || std::is_rvalue_reference::value) { static const halide_handle_cplusplus_type the_info = halide_handle_cplusplus_type::make(); return &the_info; } return nullptr; } }; namespace Halide { struct Expr; /** Types in the halide type system. They can be ints, unsigned ints, * or floats of various bit-widths (the 'bits' field). They can also * be vectors of the same (by setting the 'lanes' field to something * larger than one). Front-end code shouldn't use vector * types. Instead vectorize a function. */ struct Type { private: halide_type_t type; public: /** Aliases for halide_type_code_t values for legacy compatibility * and to match the Halide internal C++ style. */ // @{ static const halide_type_code_t Int = halide_type_int; static const halide_type_code_t UInt = halide_type_uint; static const halide_type_code_t Float = halide_type_float; static const halide_type_code_t BFloat = halide_type_bfloat; static const halide_type_code_t Handle = halide_type_handle; // @} /** The number of bytes required to store a single scalar value of this type. Ignores vector lanes. */ int bytes() const { return (bits() + 7) / 8; } // Default ctor initializes everything to predictable-but-unlikely values Type() : type(Handle, 0, 0) { } /** Construct a runtime representation of a Halide type from: * code: The fundamental type from an enum. * bits: The bit size of one element. * lanes: The number of vector elements in the type. */ Type(halide_type_code_t code, int bits, int lanes, const halide_handle_cplusplus_type *handle_type = nullptr) : type(code, (uint8_t)bits, (uint16_t)lanes), handle_type(handle_type) { user_assert(lanes == type.lanes) << "Halide only supports vector types with up to 65535 lanes. " << lanes << " lanes requested."; user_assert(bits == type.bits) << "Halide only supports types with up to 255 bits. " << bits << " bits requested."; } /** Trivial copy constructor. */ Type(const Type &that) = default; /** Trivial copy assignment operator. */ Type &operator=(const Type &that) = default; /** Type is a wrapper around halide_type_t with more methods for use * inside the compiler. This simply constructs the wrapper around * the runtime value. 
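 * For example (illustrative): Type t(halide_type_t(halide_type_uint, 8, 16)) wraps a 16-lane
 * uint8 runtime type, and assigning t back to a halide_type_t uses the implicit conversion below.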
*/ HALIDE_ALWAYS_INLINE Type(const halide_type_t &that, const halide_handle_cplusplus_type *handle_type = nullptr) : type(that), handle_type(handle_type) { } /** Unwrap the runtime halide_type_t for use in runtime calls, etc. * Representation is exactly equivalent. */ HALIDE_ALWAYS_INLINE operator halide_type_t() const { return type; } /** Return the underlying data type of an element as an enum value. */ HALIDE_ALWAYS_INLINE halide_type_code_t code() const { return (halide_type_code_t)type.code; } /** Return the bit size of a single element of this type. */ HALIDE_ALWAYS_INLINE int bits() const { return type.bits; } /** Return the number of vector elements in this type. */ HALIDE_ALWAYS_INLINE int lanes() const { return type.lanes; } /** Return Type with same number of bits and lanes, but new_code for a type code. */ Type with_code(halide_type_code_t new_code) const { return Type(new_code, bits(), lanes(), (new_code == code()) ? handle_type : nullptr); } /** Return Type with same type code and lanes, but new_bits for the number of bits. */ Type with_bits(int new_bits) const { return Type(code(), new_bits, lanes(), (new_bits == bits()) ? handle_type : nullptr); } /** Return Type with same type code and number of bits, * but new_lanes for the number of vector lanes. */ Type with_lanes(int new_lanes) const { return Type(code(), bits(), new_lanes, handle_type); } /** Return Type with the same type code and number of lanes, but with at least twice as many bits. */ Type widen() const { if (bits() == 1) { // Widening a 1-bit type should produce an 8-bit type. return with_bits(8); } else { return with_bits(bits() * 2); } } /** Return Type with the same type code and number of lanes, but with at most half as many bits. */ Type narrow() const { internal_assert(bits() != 1) << "Attempting to narrow a 1-bit type\n"; if (bits() == 8) { // Narrowing an 8-bit type should produce a 1-bit type. return with_bits(1); } else { return with_bits(bits() / 2); } } /** Type to be printed when declaring handles of this type. */ const halide_handle_cplusplus_type *handle_type = nullptr; /** Is this type boolean (represented as UInt(1))? */ HALIDE_ALWAYS_INLINE bool is_bool() const { return code() == UInt && bits() == 1; } /** Is this type a vector type? (lanes() != 1). * TODO(abadams): Decide what to do for lanes() == 0. */ HALIDE_ALWAYS_INLINE bool is_vector() const { return lanes() != 1; } /** Is this type a scalar type? (lanes() == 1). * TODO(abadams): Decide what to do for lanes() == 0. */ HALIDE_ALWAYS_INLINE bool is_scalar() const { return lanes() == 1; } /** Is this type a floating point type (float or double). */ HALIDE_ALWAYS_INLINE bool is_float() const { return code() == Float || code() == BFloat; } /** Is this type a floating point type (float or double). */ HALIDE_ALWAYS_INLINE bool is_bfloat() const { return code() == BFloat; } /** Is this type a signed integer type? */ HALIDE_ALWAYS_INLINE bool is_int() const { return code() == Int; } /** Is this type an unsigned integer type? */ HALIDE_ALWAYS_INLINE bool is_uint() const { return code() == UInt; } /** Is this type an integer type of any sort? */ HALIDE_ALWAYS_INLINE bool is_int_or_uint() const { return code() == Int || code() == UInt; } /** Is this type an opaque handle type (void *) */ HALIDE_ALWAYS_INLINE bool is_handle() const { return code() == Handle; } // Returns true iff type is a signed integral type where overflow is defined. 
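// (Illustrative: Int(8) and Int(16) qualify, i.e. they wrap on overflow; overflow of Int(32)
// and Int(64) is treated as undefined by Halide rather than wrapping.)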
HALIDE_ALWAYS_INLINE bool can_overflow_int() const { return is_int() && bits() <= 16; } // Returns true iff type does have a well-defined overflow behavior. HALIDE_ALWAYS_INLINE bool can_overflow() const { return is_uint() || can_overflow_int(); } /** Check that the type name of two handles matches. */ bool same_handle_type(const Type &other) const; /** Compare two types for equality */ bool operator==(const Type &other) const { return type == other.type && (code() != Handle || same_handle_type(other)); } /** Compare two types for inequality */ bool operator!=(const Type &other) const { return type != other.type || (code() == Handle && !same_handle_type(other)); } /** Compare two types for equality */ bool operator==(const halide_type_t &other) const { return type == other; } /** Compare two types for inequality */ bool operator!=(const halide_type_t &other) const { return type != other; } /** Compare ordering of two types so they can be used in certain containers and algorithms */ bool operator<(const Type &other) const { if (type < other.type) { return true; } if (code() == Handle) { return handle_type < other.handle_type; } return false; } /** Produce the scalar type (that of a single element) of this vector type */ Type element_of() const { return with_lanes(1); } /** Can this type represent all values of another type? */ bool can_represent(Type other) const; /** Can this type represent a particular constant? */ // @{ bool can_represent(double x) const; bool can_represent(int64_t x) const; bool can_represent(uint64_t x) const; // @} /** Check if an integer constant value is the maximum or minimum * representable value for this type. */ // @{ bool is_max(uint64_t) const; bool is_max(int64_t) const; bool is_min(uint64_t) const; bool is_min(int64_t) const; // @} /** Return an expression which is the maximum value of this type. * Returns infinity for types which can represent it. */ Expr max() const; /** Return an expression which is the minimum value of this type. * Returns -infinity for types which can represent it. */ Expr min() const; }; /** Constructing a signed integer type */ inline Type Int(int bits, int lanes = 1) { return Type(Type::Int, bits, lanes); } /** Constructing an unsigned integer type */ inline Type UInt(int bits, int lanes = 1) { return Type(Type::UInt, bits, lanes); } /** Construct a floating-point type */ inline Type Float(int bits, int lanes = 1) { return Type(Type::Float, bits, lanes); } /** Construct a floating-point type in the bfloat format. Only 16-bit currently supported. 
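 * For example (illustrative): BFloat(16) is the scalar bfloat16 type, BFloat(16, 8) an 8-lane
 * vector of it, and the scalar form matches type_of<bfloat16_t>().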
*/ inline Type BFloat(int bits, int lanes = 1) { return Type(Type::BFloat, bits, lanes); } /** Construct a boolean type */ inline Type Bool(int lanes = 1) { return UInt(1, lanes); } /** Construct a handle type */ inline Type Handle(int lanes = 1, const halide_handle_cplusplus_type *handle_type = nullptr) { return Type(Type::Handle, 64, lanes, handle_type); } /** Construct the halide equivalent of a C type */ template inline Type type_of() { return Type(halide_type_of(), halide_handle_traits::type_info()); } /** Halide type to a C++ type */ std::string type_to_c_type(Type type, bool include_space, bool c_plus_plus = true); } // namespace Halide #endif Halide-17.0.1/src/UnifyDuplicateLets.cpp000066400000000000000000000060441456515664200200600ustar00rootroot00000000000000#include "UnifyDuplicateLets.h" #include "IREquality.h" #include "IRMutator.h" #include namespace Halide { namespace Internal { using std::map; using std::string; namespace { class UnifyDuplicateLets : public IRMutator { using IRMutator::visit; map scope; map rewrites; string producing; public: using IRMutator::mutate; Expr mutate(const Expr &e) override { if (e.defined()) { map::iterator iter = scope.find(e); if (iter != scope.end()) { return Variable::make(e.type(), iter->second); } else { return IRMutator::mutate(e); } } else { return Expr(); } } protected: Expr visit(const Variable *op) override { map::iterator iter = rewrites.find(op->name); if (iter != rewrites.end()) { return Variable::make(op->type, iter->second); } else { return op; } } // Can't unify lets where the RHS might be not be pure bool is_impure; Expr visit(const Call *op) override { is_impure |= !op->is_pure(); return IRMutator::visit(op); } Expr visit(const Load *op) override { is_impure = true; return IRMutator::visit(op); } Stmt visit(const ProducerConsumer *op) override { if (op->is_producer) { string old_producing = producing; producing = op->name; Stmt stmt = IRMutator::visit(op); producing = old_producing; return stmt; } else { return IRMutator::visit(op); } } template auto visit_let(const LetStmtOrLet *op) -> decltype(op->body) { is_impure = false; Expr value = mutate(op->value); auto body = op->body; bool should_pop = false; bool should_erase = false; if (!is_impure) { map::iterator iter = scope.find(value); if (iter == scope.end()) { scope[value] = op->name; should_pop = true; } else { value = Variable::make(value.type(), iter->second); rewrites[op->name] = iter->second; should_erase = true; } } body = mutate(op->body); if (should_pop) { scope.erase(value); } if (should_erase) { rewrites.erase(op->name); } if (value.same_as(op->value) && body.same_as(op->body)) { return op; } else { return LetStmtOrLet::make(op->name, value, body); } } Expr visit(const Let *op) override { return visit_let(op); } Stmt visit(const LetStmt *op) override { return visit_let(op); } }; } // namespace Stmt unify_duplicate_lets(const Stmt &s) { return UnifyDuplicateLets().mutate(s); } } // namespace Internal } // namespace Halide Halide-17.0.1/src/UnifyDuplicateLets.h000066400000000000000000000007031456515664200175210ustar00rootroot00000000000000#ifndef HALIDE_UNIFY_DUPLICATE_LETS_H #define HALIDE_UNIFY_DUPLICATE_LETS_H /** \file * Defines the lowering pass that coalesces redundant let statements */ #include "Expr.h" namespace Halide { namespace Internal { /** Find let statements that all define the same value, and make later * ones just reuse the symbol names of the earlier ones. 
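 * For example (illustrative): (let a = x + 1 in (let b = x + 1 in f(a, b))) becomes
 * (let a = x + 1 in (let b = a in f(a, a))), so later passes see a single symbol for the value.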
*/ Stmt unify_duplicate_lets(const Stmt &s); } // namespace Internal } // namespace Halide #endif Halide-17.0.1/src/UniquifyVariableNames.cpp000066400000000000000000000161051456515664200205450ustar00rootroot00000000000000#include "UniquifyVariableNames.h" #include "IREquality.h" #include "IRMutator.h" #include "IROperator.h" #include "IRVisitor.h" #include "Scope.h" #include "Var.h" #include namespace Halide { namespace Internal { using std::pair; using std::string; using std::vector; namespace { class UniquifyVariableNames : public IRMutator { using IRMutator::visit; // The mapping from old names to new names Scope renaming; // Get a new previously unused name for a let binding or for loop, // and push it onto the renaming. Will return the original name if // possible, but pushes unconditionally to simplify cleanup. string make_new_name(const string &base) { if (!renaming.contains(base)) { renaming.push(base, base); return base; } for (size_t i = std::max((size_t)1, renaming.count(base));; i++) { string candidate = base + "_" + std::to_string(i); if (!renaming.contains(candidate)) { // Reserve this name for this base name renaming.push(base, candidate); // And reserve the generated name forever more (will not be popped) renaming.push(candidate, candidate); return candidate; } } } template auto visit_let(const LetOrLetStmt *op) -> decltype(op->body) { struct Frame { const LetOrLetStmt *op; Expr value; string new_name; }; vector frames; decltype(op->body) result; while (op) { frames.emplace_back(); auto &f = frames.back(); f.op = op; f.value = mutate(op->value); f.new_name = make_new_name(op->name); result = op->body; op = result.template as(); } result = mutate(result); for (auto it = frames.rbegin(); it != frames.rend(); it++) { renaming.pop(it->op->name); if (it->new_name == it->op->name && result.same_as(it->op->body) && it->op->value.same_as(it->value)) { result = it->op; } else { result = LetOrLetStmt::make(it->new_name, it->value, result); } } return result; } Stmt visit(const LetStmt *op) override { return visit_let(op); } Expr visit(const Let *op) override { return visit_let(op); } Stmt visit(const For *op) override { Expr min = mutate(op->min); Expr extent = mutate(op->extent); string new_name = make_new_name(op->name); Stmt body = mutate(op->body); renaming.pop(op->name); if (new_name == op->name && body.same_as(op->body) && min.same_as(op->min) && extent.same_as(op->extent)) { return op; } else { return For::make(new_name, min, extent, op->for_type, op->partition_policy, op->device_api, body); } } Expr visit(const Variable *op) override { if (renaming.contains(op->name)) { string new_name = renaming.get(op->name); if (new_name != op->name) { return Variable::make(op->type, new_name); } } return op; } public: UniquifyVariableNames(const Scope *free_vars) { renaming.set_containing_scope(free_vars); } }; class FindFreeVars : public IRVisitor { using IRVisitor::visit; Scope<> scope; void visit(const Variable *op) override { if (!scope.contains(op->name)) { free_vars.push(op->name, op->name); } } template void visit_let(const T *op) { vector> frame; decltype(op->body) body; do { op->value.accept(this); frame.emplace_back(scope, op->name); body = op->body; op = body.template as(); } while (op); body.accept(this); } void visit(const Let *op) override { visit_let(op); } void visit(const LetStmt *op) override { visit_let(op); } void visit(const For *op) override { op->min.accept(this); op->extent.accept(this); { ScopedBinding<> bind(scope, op->name); op->body.accept(this); } } public: 
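    // Names that occur free in the statement (i.e. referenced without an
    // enclosing Let/LetStmt/For binding). They are treated as already taken,
    // so bound variables are never renamed onto them.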
Scope free_vars; }; } // namespace Stmt uniquify_variable_names(const Stmt &s) { FindFreeVars finder; s.accept(&finder); UniquifyVariableNames u(&finder.free_vars); return u.mutate(s); } void check(vector> in, vector> out) { Stmt in_stmt = Evaluate::make(0), out_stmt = Evaluate::make(0); for (auto it = in.rbegin(); it != in.rend(); it++) { in_stmt = LetStmt::make(it->first.name(), it->second, in_stmt); } for (auto it = out.rbegin(); it != out.rend(); it++) { out_stmt = LetStmt::make(it->first.name(), it->second, out_stmt); } Stmt s = uniquify_variable_names(in_stmt); internal_assert(equal(s, out_stmt)) << "Failure in uniquify_variable_names\n" << "Input:\n" << in_stmt << "\n" << "Produced:\n" << s << "\n" << "Correct output:\n" << out_stmt << "\n"; } void uniquify_variable_names_test() { Var x("x"), x_1("x_1"), x_2("x_2"), x_3{"x_3"}; Var y("y"), y_1("y_1"), y_2("y_2"), y_3{"y_3"}; // Stmts with all names already unique should be unchanged check({{x, 3}, {y, x}}, {{x, 3}, {y, x}}); // Shadowed definitions of Vars should be given unique names check({{x, 3}, {y, x}, {x, x + y}, {y, x + y}, {x, x + y}, {y, x + y}}, {{x, 3}, {y, x}, {x_1, x + y}, {y_1, x_1 + y}, {x_2, x_1 + y_1}, {y_2, x_2 + y_1}}); // Check a case with a free var after then end of the scope of a let of the same name check({{x, Let::make(y.name(), 3, y)}, // y is bound {x, y}}, // This is not the same y. It's free and can't be renamed. {{x, Let::make(y_1.name(), 3, y_1)}, // We rename the bound one {x_1, y}}); // An existing in-scope use of one of the names that would be // autogenerated should be skipped over check({{x_1, 8}, {x, 3}, {y, x}, {x, x + y}, {y, x + y}, {x, x + y}, {y, x + y}}, {{x_1, 8}, {x, 3}, {y, x}, {x_2, x + y}, {y_1, x_2 + y}, {x_3, x_2 + y_1}, {y_2, x_3 + y_1}}); // Check parallel bindings. The scope doesn't overlap so they can keep their name check({{x, Let::make(y.name(), 3, y)}, {x, Let::make(y.name(), 4, y)}}, {{x, Let::make(y.name(), 3, y)}, {x_1, Let::make(y.name(), 4, y)}}); std::cout << "uniquify_variable_names test passed" << std::endl; } } // namespace Internal } // namespace Halide Halide-17.0.1/src/UniquifyVariableNames.h000066400000000000000000000010331456515664200202040ustar00rootroot00000000000000#ifndef HALIDE_UNIQUIFY_VARIABLE_NAMES #define HALIDE_UNIQUIFY_VARIABLE_NAMES /** \file * Defines the lowering pass that renames all variables to have unique names. */ #include "Expr.h" namespace Halide { namespace Internal { /** Modify a statement so that every internally-defined variable name * is unique. This lets later passes assume syntactic equivalence is * semantic equivalence. 
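 *
 * A rough sketch of the renaming (pseudocode; see
 * uniquify_variable_names_test for the precise cases that are covered):
 * \code
 * let x = 3 in (let x = x + 1 in x)
 * // becomes
 * let x = 3 in (let x_1 = x + 1 in x_1)
 * \endcode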
*/ Stmt uniquify_variable_names(const Stmt &s); void uniquify_variable_names_test(); } // namespace Internal } // namespace Halide #endif Halide-17.0.1/src/UnpackBuffers.cpp000066400000000000000000000115271456515664200170430ustar00rootroot00000000000000#include "UnpackBuffers.h" #include "IROperator.h" #include "IRVisitor.h" #include namespace Halide { namespace Internal { using std::map; using std::pair; using std::set; using std::string; using std::vector; namespace { struct BufferInfo { Expr handle; int dimensions; }; class FindBufferSymbols : public IRVisitor { using IRVisitor::visit; void visit_param(const string &ref_name, const Parameter ¶m) { if (param.defined() && param.is_buffer()) { const string &name = param.name(); buffers[name] = BufferInfo{Variable::make(type_of(), name + ".buffer", param), param.dimensions()}; } } void visit_buffer(const string &ref_name, const Buffer<> &buffer) { if (buffer.defined()) { const string &name = buffer.name(); buffers[name] = BufferInfo{Variable::make(type_of(), name + ".buffer", buffer), buffer.dimensions()}; } } void visit(const Variable *op) override { visit_param(op->name, op->param); visit_buffer(op->name, op->image); symbols.insert(op->name); } void visit(const Load *op) override { visit_param(op->name, op->param); visit_buffer(op->name, op->image); symbols.insert(op->name); IRVisitor::visit(op); } void visit(const Store *op) override { visit_param(op->name, op->param); symbols.insert(op->name); IRVisitor::visit(op); } public: set symbols; map buffers; }; } // namespace Stmt unpack_buffers(Stmt s) { FindBufferSymbols finder; s.accept(&finder); vector> lets; for (auto &p : finder.buffers) { const string &name = p.first; const BufferInfo &info = p.second; vector args = {info.handle}; string host_var = name; Expr host_val = Call::make(type_of(), Call::buffer_get_host, args, Call::Extern); lets.emplace_back(host_var, host_val); string dev_var = name + ".device"; Expr dev_val = Call::make(type_of(), Call::buffer_get_device, args, Call::Extern); lets.emplace_back(dev_var, dev_val); string dev_interface_var = name + ".device_interface"; Expr dev_interface_val = Call::make(type_of(), Call::buffer_get_device_interface, args, Call::Extern); lets.emplace_back(dev_interface_var, dev_interface_val); string type_code_var = name + ".type"; Expr type_code_val = Call::make(UInt(32), Call::buffer_get_type, args, Call::Extern); lets.emplace_back(type_code_var, type_code_val); string host_dirty_var = name + ".host_dirty"; Expr host_dirty_val = Call::make(Bool(), Call::buffer_get_host_dirty, args, Call::Extern); lets.emplace_back(host_dirty_var, host_dirty_val); string dev_dirty_var = name + ".device_dirty"; Expr dev_dirty_val = Call::make(Bool(), Call::buffer_get_device_dirty, args, Call::Extern); lets.emplace_back(dev_dirty_var, dev_dirty_val); string dimensions_var = name + ".dimensions"; Expr dimensions_val = Call::make(Int(32), Call::buffer_get_dimensions, args, Call::Extern); lets.emplace_back(dimensions_var, dimensions_val); for (int i = 0; i < info.dimensions; i++) { vector args = {info.handle, i}; string min_var = name + ".min." + std::to_string(i); Expr min_val = Call::make(Int(32), Call::buffer_get_min, args, Call::Extern); lets.emplace_back(min_var, min_val); string extent_var = name + ".extent." + std::to_string(i); Expr extent_val = Call::make(Int(32), Call::buffer_get_extent, args, Call::Extern); lets.emplace_back(extent_var, extent_val); string stride_var = name + ".stride." 
+ std::to_string(i); Expr stride_val = Call::make(Int(32), Call::buffer_get_stride, args, Call::Extern); lets.emplace_back(stride_var, stride_val); } } while (!lets.empty()) { pair l = lets.back(); lets.pop_back(); if (finder.symbols.count(l.first)) { s = LetStmt::make(l.first, l.second, s); } } // Create buffer is not null assertions for (auto &p : finder.buffers) { Expr buf = p.second.handle; Expr cond = reinterpret(buf) != 0; Expr error = Call::make(Int(32), "halide_error_buffer_argument_is_null", {p.first}, Call::Extern); Stmt check = AssertStmt::make(cond, error); s = Block::make(check, s); } return s; } } // namespace Internal } // namespace Halide Halide-17.0.1/src/UnpackBuffers.h000066400000000000000000000010671456515664200165060ustar00rootroot00000000000000#ifndef HALIDE_UNPACK_BUFFERS_H #define HALIDE_UNPACK_BUFFERS_H /** \file * Defines the lowering pass that unpacks buffer arguments onto the symbol table */ #include "Expr.h" namespace Halide { namespace Internal { /** Creates let stmts for the various buffer components * (e.g. foo.extent.0) in any referenced concrete buffers or buffer * parameters. After this pass, the only undefined symbols should * scalar parameters and the buffers themselves (e.g. foo.buffer). */ Stmt unpack_buffers(Stmt s); } // namespace Internal } // namespace Halide #endif Halide-17.0.1/src/UnrollLoops.cpp000066400000000000000000000034101456515664200165650ustar00rootroot00000000000000#include "UnrollLoops.h" #include "IRMutator.h" #include "IROperator.h" #include "Simplify.h" #include "Substitute.h" #include "UniquifyVariableNames.h" namespace Halide { namespace Internal { namespace { class UnrollLoops : public IRMutator { using IRMutator::visit; Stmt visit(const For *for_loop) override { if (for_loop->for_type == ForType::Unrolled) { Stmt body = for_loop->body; const IntImm *e = for_loop->extent.as(); internal_assert(e) << "Loop over " << for_loop->name << " should have had a constant extent\n"; body = mutate(body); if (e->value == 1) { user_warning << "Warning: Unrolling a for loop of extent 1: " << for_loop->name << "\n"; } Stmt iters; for (int i = e->value - 1; i >= 0; i--) { Stmt iter = substitute(for_loop->name, for_loop->min + i, body); // It's necessary to eagerly simplify this iteration // here to resolve things like muxes down to a single // item before we go and make N copies of something of // size N. iter = simplify(iter); if (!iters.defined()) { iters = iter; } else { iters = Block::make(iter, iters); } } return iters; } else { return IRMutator::visit(for_loop); } } }; } // namespace Stmt unroll_loops(const Stmt &s) { Stmt stmt = UnrollLoops().mutate(s); // Unrolling duplicates variable names. Other passes assume variable names are unique. return uniquify_variable_names(stmt); } } // namespace Internal } // namespace Halide Halide-17.0.1/src/UnrollLoops.h000066400000000000000000000006731456515664200162420ustar00rootroot00000000000000#ifndef HALIDE_UNROLL_LOOPS_H #define HALIDE_UNROLL_LOOPS_H /** \file * Defines the lowering pass that unrolls loops marked as such */ #include "Expr.h" namespace Halide { namespace Internal { /** Take a statement with for loops marked for unrolling, and convert * each into several copies of the innermost statement. I.e. unroll * the loop. 
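 *
 * A rough sketch of the effect (pseudocode; the loop extent must be a
 * compile-time constant):
 * \code
 * unrolled for (x, 0, 3) { f(x); }
 * // becomes
 * f(0); f(1); f(2);
 * \endcode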
*/ Stmt unroll_loops(const Stmt &); } // namespace Internal } // namespace Halide #endif Halide-17.0.1/src/UnsafePromises.cpp000066400000000000000000000037221456515664200172460ustar00rootroot00000000000000#include "UnsafePromises.h" #include "IRMutator.h" #include "IROperator.h" namespace Halide { namespace Internal { namespace { class LowerUnsafePromises : public IRMutator { using IRMutator::visit; Expr visit(const Call *op) override { if (op->is_intrinsic(Call::unsafe_promise_clamped)) { if (check) { Expr is_clamped = op->args[0] >= op->args[1] && op->args[0] <= op->args[2]; std::ostringstream promise_expr_text; promise_expr_text << is_clamped; Expr cond_as_string = StringImm::make(promise_expr_text.str()); Expr promise_broken_error = Call::make(Int(32), "halide_error_requirement_failed", {cond_as_string, StringImm::make("from unsafe_promise_clamped")}, Call::Extern); return Call::make(op->args[0].type(), Call::require, {mutate(is_clamped), mutate(op->args[0]), promise_broken_error}, Call::PureIntrinsic); } else { return mutate(op->args[0]); } } else { return IRMutator::visit(op); } } bool check; public: LowerUnsafePromises(bool check) : check(check) { } }; class LowerSafePromises : public IRMutator { using IRMutator::visit; Expr visit(const Call *op) override { if (op->is_intrinsic(Call::promise_clamped)) { return mutate(op->args[0]); } else { return IRMutator::visit(op); } } }; } // namespace Stmt lower_unsafe_promises(const Stmt &s, const Target &t) { return LowerUnsafePromises(t.has_feature(Target::CheckUnsafePromises)).mutate(s); } Stmt lower_safe_promises(const Stmt &s) { return LowerSafePromises().mutate(s); } } // namespace Internal } // namespace Halide Halide-17.0.1/src/UnsafePromises.h000066400000000000000000000011561456515664200167120ustar00rootroot00000000000000#ifndef HALIDE_UNSAFE_PROMISES_H #define HALIDE_UNSAFE_PROMISES_H /** \file * Defines the lowering pass that removes unsafe promises */ #include "Expr.h" namespace Halide { struct Target; namespace Internal { /** Lower all unsafe promises into either assertions or unchecked code, depending on the target. */ Stmt lower_unsafe_promises(const Stmt &s, const Target &t); /** Lower all safe promises by just stripping them. This is a good * idea once no more lowering stages are going to use * boxes_touched. */ Stmt lower_safe_promises(const Stmt &s); } // namespace Internal } // namespace Halide #endif Halide-17.0.1/src/Util.cpp000066400000000000000000000724241456515664200152250ustar00rootroot00000000000000#ifdef __APPLE__ // This needs to be defined before any other includes in translation // units that use the getcontext/swapcontext family of functions #define _XOPEN_SOURCE #endif #include "Util.h" #include "Debug.h" #include "Error.h" #include "Introspection.h" #include #include #include #include #include #include #include #include #ifdef _MSC_VER #include #else #include #include // For mmap #include #endif #include #include #ifdef __linux__ #define CAN_GET_RUNNING_PROGRAM_NAME #include // For PATH_MAX #include // For swapcontext #endif #if defined(_MSC_VER) && !defined(NOMINMAX) #define NOMINMAX #endif #ifdef _WIN32 #include // needed for CoCreateGuid #include // needed for SHGetFolderPath #include #else #include #endif #ifdef __APPLE__ #define CAN_GET_RUNNING_PROGRAM_NAME #include // Get swapcontext/makecontext etc. 
// // Apple gets cranky about people using these (because at least some // part of passing a pointer to a function that takes some arguments // as if it's a function that takes no args and then calling it as a // variadic function is deprecated in C) but provides no // alternatives. It's likely they'll continue to have to allow them on // macos for a long time, and these are the entrypoints that tools // like tsan know about, so rolling your own asm is worse. We can // switch to an alternative when one exists. Meanwhile, we work around // their pesky deprecation macro. This is the last include in this // file, so there's no need to restore the value of the macro. #undef __OSX_AVAILABLE_BUT_DEPRECATED #define __OSX_AVAILABLE_BUT_DEPRECATED(...) #undef __API_DEPRECATED #define __API_DEPRECATED(...) #include #endif #ifdef _WIN32 namespace { std::string from_utf16(LPCWSTR pStr) { int len = wcslen(pStr); int mblen = WideCharToMultiByte(CP_UTF8, 0, pStr, len, nullptr, 0, nullptr, nullptr); internal_assert(mblen) << "WideCharToMultiByte() failed; error " << GetLastError() << "\n"; std::string str(mblen, 0); mblen = WideCharToMultiByte(CP_UTF8, 0, pStr, len, &str[0], (int)str.size(), nullptr, nullptr); internal_assert(mblen) << "WideCharToMultiByte() failed; error " << GetLastError() << "\n"; return str; } std::wstring from_utf8(const std::string &str) { int wlen = MultiByteToWideChar(CP_UTF8, 0, str.c_str(), (int)str.size(), nullptr, 0); internal_assert(wlen) << "MultiByteToWideChar() failed; error " << GetLastError() << "\n"; std::wstring wstr(wlen, 0); wlen = MultiByteToWideChar(CP_UTF8, 0, str.c_str(), (int)str.size(), &wstr[0], (int)wstr.size()); internal_assert(wlen) << "MultiByteToWideChar() failed; error " << GetLastError() << "\n"; return wstr; } } // namespace #endif namespace Halide { namespace Internal { using std::ostringstream; using std::string; using std::vector; std::string get_env_variable(char const *env_var_name) { if (!env_var_name) { return ""; } #ifdef _MSC_VER // call getenv_s without a buffer to determine the correct string length: size_t length = 0; if ((getenv_s(&length, nullptr, 0, env_var_name) != 0) || (length == 0)) { return ""; } // call it again to retrieve the value of the environment variable; // note that 'length' already accounts for the null-terminator std::string lvl(length - 1, '@'); size_t read = 0; if ((getenv_s(&read, &lvl[0], length, env_var_name) != 0) || (read != length)) { return ""; } return lvl; #else char *lvl = getenv(env_var_name); if (lvl) { return std::string(lvl); } return ""; #endif } string running_program_name() { #ifndef CAN_GET_RUNNING_PROGRAM_NAME return ""; #else string program_name; char path[PATH_MAX] = {0}; uint32_t size = sizeof(path); #if defined(__linux__) ssize_t len = ::readlink("/proc/self/exe", path, size - 1); #elif defined(__APPLE__) ssize_t len = ::_NSGetExecutablePath(path, &size); #endif if (len != -1) { #if defined(__linux__) path[len] = '\0'; #endif string tmp = std::string(path); program_name = tmp.substr(tmp.find_last_of('/') + 1); } else { return ""; } return program_name; #endif } namespace { // We use 64K of memory to store unique counters for the purpose of // making names unique. Using less memory increases the likelihood of // hash collisions. This wouldn't break anything, but makes stmts // slightly confusing to read because names that are actually unique // will get suffixes that falsely hint that they are not. 
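// ((1 << 14) std::atomic<int> counters, at four bytes each, account for the
// 64K of memory mentioned above.)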
const int num_unique_name_counters = (1 << 14); // We want to init these to zero, but cannot use = {0} because that // would invoke a (deleted) copy ctor. The default initialization for // atomics doesn't guarantee any actual initialization. Fortunately // this is a global, which is always zero-initialized. std::atomic unique_name_counters[num_unique_name_counters] = {}; int unique_count(size_t h) { h = h & (num_unique_name_counters - 1); return unique_name_counters[h]++; } } // namespace // There are three possible families of names returned by the methods below: // 1) char pattern: (char that isn't '$') + number (e.g. v234) // 2) string pattern: (string without '$') + '$' + number (e.g. fr#nk82$42) // 3) a string that does not match the patterns above // There are no collisions within each family, due to the unique_count // done above, and there can be no collisions across families by // construction. string unique_name(char prefix) { if (prefix == '$') { prefix = '_'; } return prefix + std::to_string(unique_count((size_t)(prefix))); } string unique_name(const std::string &prefix) { string sanitized = prefix; // Does the input string look like something returned from unique_name(char)? bool matches_char_pattern = true; // Does the input string look like something returned from unique_name(string)? bool matches_string_pattern = true; // Rewrite '$' to '_'. This is a many-to-one mapping, but that's // OK, we're about to hash anyway. It just means that some names // will share the same counter. int num_dollars = 0; for (size_t i = 0; i < sanitized.size(); i++) { if (sanitized[i] == '$') { num_dollars++; sanitized[i] = '_'; } if (i > 0 && !isdigit(sanitized[i])) { // Found a non-digit after the first char matches_char_pattern = false; if (num_dollars) { // Found a non-digit after a '$' matches_string_pattern = false; } } } matches_string_pattern &= num_dollars == 1; matches_char_pattern &= prefix.size() > 1; // Then add a suffix that's globally unique relative to the hash // of the sanitized name. int count = unique_count(std::hash()(sanitized)); if (count == 0) { // We can return the name as-is if there's no risk of it // looking like something unique_name has ever returned in the // past or will ever return in the future. if (!matches_char_pattern && !matches_string_pattern) { return prefix; } } return sanitized + "$" + std::to_string(count); } bool starts_with(const string &str, const string &prefix) { if (str.size() < prefix.size()) { return false; } for (size_t i = 0; i < prefix.size(); i++) { if (str[i] != prefix[i]) { return false; } } return true; } bool ends_with(const string &str, const string &suffix) { if (str.size() < suffix.size()) { return false; } size_t off = str.size() - suffix.size(); for (size_t i = 0; i < suffix.size(); i++) { if (str[off + i] != suffix[i]) { return false; } } return true; } string replace_all(const string &str, const string &find, const string &replace) { size_t pos = 0; string result = str; while ((pos = result.find(find, pos)) != string::npos) { result.replace(pos, find.length(), replace); pos += replace.length(); } return result; } string make_entity_name(void *stack_ptr, const string &type, char prefix) { string name = Introspection::get_variable_name(stack_ptr, type); if (name.empty()) { return unique_name(prefix); } else { // Halide names may not contain '.' 
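        // so map any '.' in the introspected name to ':' before uniquifying.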
for (char &c : name) { if (c == '.') { c = ':'; } } return unique_name(name); } } std::vector split_string(const std::string &source, const std::string &delim) { std::vector elements; size_t start = 0; size_t found = 0; while ((found = source.find(delim, start)) != std::string::npos) { elements.push_back(source.substr(start, found - start)); start = found + delim.size(); } // If start is exactly source.size(), the last thing in source is a // delimiter, in which case we want to add an empty string to elements. if (start <= source.size()) { elements.push_back(source.substr(start, std::string::npos)); } return elements; } std::string extract_namespaces(const std::string &name, std::vector &namespaces) { namespaces = split_string(name, "::"); std::string result = namespaces.back(); namespaces.pop_back(); return result; } std::string strip_namespaces(const std::string &name) { std::vector unused; return extract_namespaces(name, unused); } bool file_exists(const std::string &name) { #ifdef _MSC_VER return _access(name.c_str(), 0) == 0; #else return ::access(name.c_str(), F_OK) == 0; #endif } void assert_file_exists(const std::string &name) { internal_assert(file_exists(name)) << "File not found: " << name; } void assert_no_file_exists(const std::string &name) { internal_assert(!file_exists(name)) << "File (wrongly) found: " << name; } void file_unlink(const std::string &name) { #ifdef _MSC_VER _unlink(name.c_str()); #else ::unlink(name.c_str()); #endif } void ensure_no_file_exists(const std::string &name) { if (file_exists(name)) { file_unlink(name); } assert_no_file_exists(name); } void dir_rmdir(const std::string &name) { #ifdef _MSC_VER std::wstring wname = from_utf8(name); internal_assert(RemoveDirectoryW(wname.c_str())) << "RemoveDirectoryW() failed to remove " << name << "; error " << GetLastError() << "\n"; #else int r = ::rmdir(name.c_str()); internal_assert(r == 0) << "Unable to remove dir: " << name << "\n"; #endif } FileStat file_stat(const std::string &name) { #ifdef _MSC_VER struct _stat a; if (_stat(name.c_str(), &a) != 0) { user_error << "Could not stat " << name << "\n"; } #else struct stat a; if (::stat(name.c_str(), &a) != 0) { user_error << "Could not stat " << name << "\n"; } #endif return {static_cast(a.st_size), static_cast(a.st_mtime), static_cast(a.st_uid), static_cast(a.st_gid), static_cast(a.st_mode)}; } #ifdef _WIN32 namespace { // GetTempPath() will fail rudely if env vars aren't set properly, // which is the case when we run under a tool in Bazel. Instead, // look for the current user's AppData/Local/Temp path, which // should be valid and writable in all versions of Windows that // we support for compilation purposes. std::string get_windows_tmp_dir() { // Allow overriding of the tmpdir on Windows via an env var; // some Windows configs can (apparently) lock down AppData/Local/Temp // via policy, making various things break. (Note that this is intended // to be a short-lived workaround; we would prefer to be able to avoid // requiring this sort of band-aid if possible.) 
std::string tmp_dir = get_env_variable("HL_WINDOWS_TMP_DIR"); if (!tmp_dir.empty()) { return tmp_dir; } PWSTR wlocal_path; HRESULT ret = SHGetKnownFolderPath(FOLDERID_LocalAppData, 0, nullptr, &wlocal_path); internal_assert(ret == S_OK) << "Unable to get Local AppData folder; error " << GetLastError() << "\n"; std::string tmp = from_utf16(wlocal_path); CoTaskMemFree(wlocal_path); tmp = replace_all(tmp, "\\", "/"); if (tmp.back() != '/') tmp += '/'; tmp += "Temp/"; return tmp; } } // namespace #endif std::string file_make_temp(const std::string &prefix, const std::string &suffix) { internal_assert(prefix.find('/') == string::npos && prefix.find('\\') == string::npos && suffix.find('/') == string::npos && suffix.find('\\') == string::npos); #ifdef _WIN32 // Windows implementations of mkstemp() try to create the file in the root // directory Unfortunately, that requires ADMIN privileges, which are not // guaranteed here. std::wstring tmp_dir = from_utf8(get_windows_tmp_dir()); std::wstring wprefix = from_utf8(prefix); WCHAR tmp_file[MAX_PATH]; // Note that GetTempFileNameW() actually creates the file. DWORD ret = GetTempFileNameW(tmp_dir.c_str(), wprefix.c_str(), 0, tmp_file); internal_assert(ret != 0) << "GetTempFileNameW() failed; error " << GetLastError() << "\n"; return from_utf16(tmp_file); #else std::string templ = "/tmp/" + prefix + "XXXXXX" + suffix; // Copy into a temporary buffer, since mkstemp modifies the buffer in place. std::vector buf(templ.size() + 1); strcpy(&buf[0], templ.c_str()); int fd = mkstemps(&buf[0], suffix.size()); internal_assert(fd != -1) << "Unable to create temp file for (" << &buf[0] << ")\n"; close(fd); return std::string(&buf[0]); #endif } std::string dir_make_temp() { #ifdef _WIN32 std::string tmp_dir = get_windows_tmp_dir(); // There's no direct API to do this in Windows; // our clunky-but-adequate approach here is to use // CoCreateGuid() to create a probably-unique name. // Add a limit on the number of tries just in case. for (int tries = 0; tries < 100; ++tries) { GUID guid; HRESULT hr = CoCreateGuid(&guid); internal_assert(hr == S_OK); std::ostringstream name; name << std::hex << std::setfill('0') << std::setw(8) << guid.Data1 << std::setw(4) << guid.Data2 << guid.Data3 << std::setw(2); for (int i = 0; i < 8; i++) { name << (int)guid.Data4[i]; } std::string dir = tmp_dir + name.str(); std::wstring wdir = from_utf8(dir); BOOL success = CreateDirectoryW(wdir.c_str(), nullptr); if (success) { debug(1) << "temp dir is: " << dir << "\n"; return dir; } // If name already existed, just loop and try again. // Any other error, break from loop and fail. if (GetLastError() != ERROR_ALREADY_EXISTS) { break; } } internal_error << "Unable to create temp directory in " << tmp_dir << "\n"; return ""; #else std::string templ = "/tmp/XXXXXX"; // Copy into a temporary buffer, since mkdtemp modifies the buffer in place. 
std::vector buf(templ.size() + 1); strcpy(&buf[0], templ.c_str()); char *result = mkdtemp(&buf[0]); internal_assert(result != nullptr) << "Unable to create temp directory.\n"; return std::string(result); #endif } std::vector read_entire_file(const std::string &pathname) { std::ifstream f(pathname, std::ios::in | std::ios::binary); std::vector result; f.seekg(0, std::ifstream::end); size_t size = f.tellg(); result.resize(size); f.seekg(0, std::ifstream::beg); f.read(result.data(), result.size()); internal_assert(f.good()) << "Unable to read file: " << pathname; f.close(); return result; } void write_entire_file(const std::string &pathname, const void *source, size_t source_len) { std::ofstream f(pathname, std::ios::out | std::ios::binary); f.write(reinterpret_cast(source), source_len); f.flush(); internal_assert(f.good()) << "Unable to write file: " << pathname; f.close(); } bool add_would_overflow(int bits, int64_t a, int64_t b) { int64_t max_val = 0x7fffffffffffffffLL >> (64 - bits); int64_t min_val = -max_val - 1; return ((b > 0 && a > max_val - b) || // (a + b) > max_val, rewritten to avoid overflow (b < 0 && a < min_val - b)); // (a + b) < min_val, rewritten to avoid overflow } bool add_with_overflow(int bits, int64_t a, int64_t b, int64_t *result) { #ifndef _MSC_VER if (bits == 64) { static_assert(sizeof(long long) == sizeof(int64_t)); bool flag = __builtin_saddll_overflow(a, b, (long long *)result); if (flag) { // Overflowed 64 bits *result = 0; } return !flag; } #endif if (add_would_overflow(bits, a, b)) { *result = 0; return false; } else { *result = a + b; return true; } } bool sub_would_overflow(int bits, int64_t a, int64_t b) { int64_t max_val = 0x7fffffffffffffffLL >> (64 - bits); int64_t min_val = -max_val - 1; return ((b < 0 && a > max_val + b) || // (a - b) > max_val, rewritten to avoid overflow (b > 0 && a < min_val + b)); // (a - b) < min_val, rewritten to avoid overflow } bool sub_with_overflow(int bits, int64_t a, int64_t b, int64_t *result) { #ifndef _MSC_VER if (bits == 64) { static_assert(sizeof(long long) == sizeof(int64_t)); bool flag = __builtin_ssubll_overflow(a, b, (long long *)result); if (flag) { // Overflowed 64 bits *result = 0; } return !flag; } #endif if (sub_would_overflow(bits, a, b)) { *result = 0; return false; } else { *result = a - b; return true; } } bool mul_would_overflow(int bits, int64_t a, int64_t b) { int64_t max_val = 0x7fffffffffffffffLL >> (64 - bits); int64_t min_val = -max_val - 1; if (a == 0) { return false; } else if (a == -1) { return b == min_val; } else { // Do the multiplication as a uint64, for which overflow is // well defined, then cast the bits back to int64 to get // multiplication modulo 2^64. int64_t ab = (int64_t)((uint64_t)a) * ((uint64_t)b); // The first two clauses catch overflow mod 2^bits, assuming // no 64-bit overflow occurs, and the third clause catches // 64-bit overflow. 
return ab < min_val || ab > max_val || (ab / a != b); } } bool mul_with_overflow(int bits, int64_t a, int64_t b, int64_t *result) { #ifndef _MSC_VER if (bits == 64) { static_assert(sizeof(long long) == sizeof(int64_t)); bool flag = __builtin_smulll_overflow(a, b, (long long *)result); if (flag) { // Overflowed 64 bits *result = 0; } return !flag; } #endif if (mul_would_overflow(bits, a, b)) { *result = 0; return false; } else { *result = a * b; return true; } } struct TickStackEntry { std::chrono::time_point time; string file; int line; }; namespace { vector tick_stack; } // namespace void halide_tic_impl(const char *file, int line) { string f = file; f = split_string(f, "/").back(); tick_stack.push_back({std::chrono::high_resolution_clock::now(), f, line}); } void halide_toc_impl(const char *file, int line) { auto t1 = tick_stack.back(); auto t2 = std::chrono::high_resolution_clock::now(); std::chrono::duration diff = t2 - t1.time; tick_stack.pop_back(); for (size_t i = 0; i < tick_stack.size(); i++) { debug(1) << " "; } string f = file; f = split_string(f, "/").back(); debug(1) << t1.file << ":" << t1.line << " ... " << f << ":" << line << " : " << diff.count() * 1000 << " ms\n"; } std::string c_print_name(const std::string &name, bool prefix_underscore) { ostringstream oss; // Prefix an underscore to avoid reserved words (e.g. a variable named "while") if (prefix_underscore && isalpha(name[0])) { oss << "_"; } for (char c : name) { if (c == '.') { oss << "_"; } else if (c == '$') { oss << "__"; } else if (c != '_' && !isalnum(c)) { oss << "___"; } else { oss << c; } } return oss.str(); } int get_llvm_version() { static_assert(LLVM_VERSION > 0, "LLVM_VERSION is not defined"); return LLVM_VERSION; } #ifdef _WIN32 namespace { struct GenericFiberArgs { const std::function &run; LPVOID main_fiber; #ifdef HALIDE_WITH_EXCEPTIONS std::exception_ptr exception = nullptr; // NOLINT - clang-tidy complains this isn't thrown #endif }; void WINAPI generic_fiber_entry_point(LPVOID argument) { auto *action = reinterpret_cast(argument); #ifdef HALIDE_WITH_EXCEPTIONS try { #endif action->run(); #ifdef HALIDE_WITH_EXCEPTIONS } catch (...) { action->exception = std::current_exception(); } #endif SwitchToFiber(action->main_fiber); } } // namespace #endif } // namespace Internal namespace { struct CompilerStackSize { CompilerStackSize() { std::string stack_size = Internal::get_env_variable("HL_COMPILER_STACK_SIZE"); if (stack_size.empty()) { size = default_compiler_stack_size; } else { size = std::atoi(stack_size.c_str()); } } size_t size; } stack_size; } // namespace void set_compiler_stack_size(size_t sz) { stack_size.size = sz; } size_t get_compiler_stack_size() { return stack_size.size; } namespace Internal { #if defined(HALIDE_INTERNAL_USING_ASAN) || defined(__ANDROID__) // If we are compiling under ASAN, we will get a zillion warnings about // ASAN not supporting makecontext/swapcontext and the possibility of // false positives. // // If we are building for Android, well, it apparently doesn't provide // makecontext() / swapcontext(), despite being posixy #define MAKECONTEXT_OK 0 #else #define MAKECONTEXT_OK 1 #endif #if MAKECONTEXT_OK namespace { // We can't reliably pass arguments through makecontext, because // the calling convention involves an invalid function pointer // cast which passes different numbers of bits on different // platforms, so we use a thread local to pass arguments. 
thread_local void *run_with_large_stack_arg = nullptr; } // namespace #endif void run_with_large_stack(const std::function &action) { if (stack_size.size == 0) { // User has requested no stack swapping action(); return; } #if _WIN32 // Only exists for its address, which is used to compute remaining stack space. ULONG_PTR approx_stack_pos; ULONG_PTR stack_low, stack_high; GetCurrentThreadStackLimits(&stack_low, &stack_high); ptrdiff_t stack_remaining = (char *)&approx_stack_pos - (char *)stack_low; if (stack_remaining < stack_size.size) { debug(1) << "Insufficient stack space (" << stack_remaining << " bytes). Switching to fiber with " << stack_size.size << "-byte stack.\n"; auto was_a_fiber = IsThreadAFiber(); auto *main_fiber = was_a_fiber ? GetCurrentFiber() : ConvertThreadToFiber(nullptr); internal_assert(main_fiber) << "ConvertThreadToFiber failed with code: " << GetLastError() << "\n"; GenericFiberArgs fiber_args{action, main_fiber}; auto *lower_fiber = CreateFiber(stack_size.size, generic_fiber_entry_point, &fiber_args); internal_assert(lower_fiber) << "CreateFiber failed with code: " << GetLastError() << "\n"; SwitchToFiber(lower_fiber); DeleteFiber(lower_fiber); debug(1) << "Returned from fiber.\n"; #ifdef HALIDE_WITH_EXCEPTIONS if (fiber_args.exception) { debug(1) << "Fiber threw exception. Rethrowing...\n"; std::rethrow_exception(fiber_args.exception); } #endif if (!was_a_fiber) { BOOL success = ConvertFiberToThread(); internal_assert(success) << "ConvertFiberToThread failed with code: " << GetLastError() << "\n"; } return; } #else // On posixy systems we have makecontext / swapcontext #if !MAKECONTEXT_OK action(); return; #else #ifdef HALIDE_WITH_EXCEPTIONS struct Args { const std::function &run; std::exception_ptr exception = nullptr; // NOLINT - clang-tidy complains this isn't thrown } args{action}; auto trampoline = []() { Args *arg = (Args *)run_with_large_stack_arg; try { arg->run(); } catch (...) { arg->exception = std::current_exception(); } }; #else struct Args { const std::function &run; } args{action}; auto trampoline = []() { ((Args *)run_with_large_stack_arg)->run(); }; #endif ucontext_t context, calling_context; // We'll allocate some protected guard pages at the end of the // stack we're making to catch stack overflows when they happen, // as opposed to having them cause silent corruption. We pick an // amount of memory that should be comfortably larger than most // stack frames - 64k. const size_t guard_band = 64 * 1024; void *stack = mmap(nullptr, stack_size.size + guard_band, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); internal_assert(stack) << "mmap failed with error " << strerror(errno); int err = mprotect((char *)stack + stack_size.size, guard_band, PROT_NONE); internal_assert(err == 0) << "mprotect failed with error " << strerror(errno); err = getcontext(&context); internal_assert(err == 0) << "getcontext failed with error " << strerror(errno); context.uc_stack.ss_sp = stack; context.uc_stack.ss_size = stack_size.size; context.uc_stack.ss_flags = 0; context.uc_link = &calling_context; run_with_large_stack_arg = &args; makecontext(&context, trampoline, 0); err = swapcontext(&calling_context, &context); internal_assert(err == 0) << "swapcontext failed with error " << strerror(errno); err = munmap(stack, stack_size.size + guard_band); internal_assert(err == 0) << "munmap failed with error " << strerror(errno); #ifdef HALIDE_WITH_EXCEPTIONS if (args.exception) { debug(1) << "Subcontext threw exception. 
Rethrowing...\n"; std::rethrow_exception(args.exception); } #endif #endif // not ADDRESS_SANITIZER #endif } // Portable bit-counting methods int popcount64(uint64_t x) { #ifdef _MSC_VER #if defined(_WIN64) return __popcnt64(x); #else return __popcnt((uint32_t)(x >> 32)) + __popcnt((uint32_t)(x & 0xffffffff)); #endif #else static_assert(sizeof(unsigned long long) >= sizeof(uint64_t), ""); return __builtin_popcountll(x); #endif } int clz64(uint64_t x) { internal_assert(x != 0); #ifdef _MSC_VER unsigned long r = 0; #if defined(_WIN64) return _BitScanReverse64(&r, x) ? (63 - r) : 64; #else if (_BitScanReverse(&r, (uint32_t)(x >> 32))) { return (63 - (r + 32)); } else if (_BitScanReverse(&r, (uint32_t)(x & 0xffffffff))) { return 63 - r; } else { return 64; } #endif #else static_assert(sizeof(unsigned long long) >= sizeof(uint64_t), ""); constexpr int offset = (sizeof(unsigned long long) - sizeof(uint64_t)) * 8; return __builtin_clzll(x) + offset; #endif } int ctz64(uint64_t x) { internal_assert(x != 0); #ifdef _MSC_VER unsigned long r = 0; #if defined(_WIN64) return _BitScanForward64(&r, x) ? r : 64; #else if (_BitScanForward(&r, (uint32_t)(x & 0xffffffff))) { return r; } else if (_BitScanForward(&r, (uint32_t)(x >> 32))) { return r + 32; } else { return 64; } #endif #else static_assert(sizeof(unsigned long long) >= sizeof(uint64_t), ""); return __builtin_ctzll(x); #endif } } // namespace Internal void load_plugin(const std::string &lib_name) { #ifdef _WIN32 std::string lib_path = lib_name; if (lib_path.find('.') == std::string::npos) { lib_path += ".dll"; } std::wstring wide_lib = from_utf8(lib_path); HMODULE library = LoadLibraryW(wide_lib.c_str()); if (!library) { DWORD error = GetLastError(); LPWSTR message = nullptr; FormatMessageW(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS, nullptr, error, 0, reinterpret_cast(&message), 0, nullptr); user_assert(message) << "Failed to load: " << lib_path << ".\n" << "FormatMessage failed while processing error in LoadLibraryW (errno " << error << ").\n"; std::string err_msg = from_utf16(message); LocalFree(message); user_error << "Failed to load: " << lib_path << ";\n" << "LoadLibraryW failed with error " << error << ": " << err_msg << "\n"; } #else std::string lib_path = lib_name; if (lib_path.find('.') == std::string::npos) { lib_path = "lib" + lib_path + ".so"; } if (dlopen(lib_path.c_str(), RTLD_LAZY) == nullptr) { user_error << "Failed to load: " << lib_path << ": " << dlerror() << "\n"; } #endif } } // namespace Halide Halide-17.0.1/src/Util.h000066400000000000000000000465011456515664200146670ustar00rootroot00000000000000// Always use assert, even if llvm-config defines NDEBUG #ifdef NDEBUG #undef NDEBUG #include #define NDEBUG #else #include #endif #ifndef HALIDE_UTIL_H #define HALIDE_UTIL_H /** \file * Various utility functions used internally Halide. */ #include #include #include #include #include #include #include #include #include "runtime/HalideRuntime.h" #ifdef Halide_STATIC_DEFINE #define HALIDE_EXPORT #else #if defined(_MSC_VER) // Halide_EXPORTS is quietly defined by CMake when building a shared library #ifdef Halide_EXPORTS #define HALIDE_EXPORT __declspec(dllexport) #else #define HALIDE_EXPORT __declspec(dllimport) #endif #else #define HALIDE_EXPORT __attribute__((visibility("default"))) #endif #endif // If we're in user code, we don't want certain functions to be inlined. 
#if defined(COMPILING_HALIDE) || defined(BUILDING_PYTHON) #define HALIDE_NO_USER_CODE_INLINE #else #define HALIDE_NO_USER_CODE_INLINE HALIDE_NEVER_INLINE #endif // Clang uses __has_feature() for sanitizers... #if defined(__has_feature) #if __has_feature(address_sanitizer) #define HALIDE_INTERNAL_USING_ASAN #endif #if __has_feature(memory_sanitizer) #define HALIDE_INTERNAL_USING_MSAN #endif #if __has_feature(thread_sanitizer) #define HALIDE_INTERNAL_USING_TSAN #endif #if __has_feature(coverage_sanitizer) #define HALIDE_INTERNAL_USING_COVSAN #endif #if __has_feature(undefined_behavior_sanitizer) #define HALIDE_INTERNAL_USING_UBSAN #endif #endif // ...but GCC/MSVC don't like __has_feature, so handle them separately. // (Only AddressSanitizer for now, not sure if any others are well-supported // outside of Clang. #if defined(__SANITIZE_ADDRESS__) && !defined(HALIDE_INTERNAL_USING_ASAN) #define HALIDE_INTERNAL_USING_ASAN #endif namespace Halide { /** Load a plugin in the form of a dynamic library (e.g. for custom autoschedulers). * If the string doesn't contain any . characters, the proper prefix and/or suffix * for the platform will be added: * * foo -> libfoo.so (Linux/OSX/etc -- note that .dylib is not supported) * foo -> foo.dll (Windows) * * otherwise, it is assumed to be an appropriate pathname. * * Any error in loading will assert-fail. */ void load_plugin(const std::string &lib_name); namespace Internal { /** Some numeric conversions are UB if the value won't fit in the result; * safe_numeric_cast<>() is meant as a drop-in replacement for a C/C++ cast * that adds well-defined behavior for the UB cases, attempting to mimic * common implementation behavior as much as possible. */ template::value>::type * = nullptr> DST safe_numeric_cast(SRC s) { if (std::is_integral::value) { // Treat float -> int as a saturating cast; this is handled // in different ways by different compilers, so an arbitrary but safe // choice like this is reasonable. if (s < (SRC)std::numeric_limits::min()) { return std::numeric_limits::min(); } if (s > (SRC)std::numeric_limits::max()) { return std::numeric_limits::max(); } } return (DST)s; } template::value>::type * = nullptr> DST safe_numeric_cast(SRC s) { if (std::is_integral::value) { // any-int -> signed-int is technically UB if value won't fit; // in practice, common compilers implement such conversions as done below // (as verified by exhaustive testing on Clang for x86-64). We could // probably continue to rely on that behavior, but making it explicit // avoids possible wrather of UBSan and similar debug helpers. // (Yes, using sizeof for this comparison is a little odd for the uint->int // case, but the intent is to match existing common behavior, which this does.) if (std::is_integral::value && std::is_signed::value && sizeof(DST) < sizeof(SRC)) { using UnsignedSrc = typename std::make_unsigned::type; return (DST)(s & (UnsignedSrc)(-1)); } } return (DST)s; } /** An aggressive form of reinterpret cast used for correct type-punning. */ template DstType reinterpret_bits(const SrcType &src) { static_assert(sizeof(SrcType) == sizeof(DstType), "Types must be same size"); DstType dst; memcpy(&dst, &src, sizeof(SrcType)); return dst; } /** Make a unique name for an object based on the name of the stack * variable passed in. If introspection isn't working or there are no * debug symbols, just uses unique_name with the given prefix. */ std::string make_entity_name(void *stack_ptr, const std::string &type, char prefix); /** Get value of an environment variable. 
Returns its value * is defined in the environment. If the var is not defined, an empty string * is returned. */ std::string get_env_variable(char const *env_var_name); /** Get the name of the currently running executable. Platform-specific. * If program name cannot be retrieved, function returns an empty string. */ std::string running_program_name(); /** Generate a unique name starting with the given prefix. It's unique * relative to all other strings returned by unique_name in this * process. * * The single-character version always appends a numeric suffix to the * character. * * The string version will either return the input as-is (with high * probability on the first time it is called with that input), or * replace any existing '$' characters with underscores, then add a * '$' sign and a numeric suffix to it. * * Note that unique_name('f') therefore differs from * unique_name("f"). The former returns something like f123, and the * latter returns either f or f$123. */ // @{ std::string unique_name(char prefix); std::string unique_name(const std::string &prefix); // @} /** Test if the first string starts with the second string */ bool starts_with(const std::string &str, const std::string &prefix); /** Test if the first string ends with the second string */ bool ends_with(const std::string &str, const std::string &suffix); /** Replace all matches of the second string in the first string with the last string */ std::string replace_all(const std::string &str, const std::string &find, const std::string &replace); /** Split the source string using 'delim' as the divider. */ std::vector split_string(const std::string &source, const std::string &delim); /** Join the source vector using 'delim' as the divider. */ template std::string join_strings(const std::vector &sources, const std::string &delim) { size_t sz = 0; if (!sources.empty()) { sz += delim.size() * (sources.size() - 1); } for (const auto &s : sources) { sz += s.size(); } std::string result; result.reserve(sz); bool need_delim = false; for (const auto &s : sources) { if (need_delim) { result += delim; } result += s; need_delim = true; } return result; } /** Perform a left fold of a vector. Returns a default-constructed * vector element if the vector is empty. Similar to std::accumulate * but with a less clunky syntax. */ template T fold_left(const std::vector &vec, Fn f) { T result; if (vec.empty()) { return result; } result = vec[0]; for (size_t i = 1; i < vec.size(); i++) { result = f(result, vec[i]); } return result; } /** Returns a right fold of a vector. Returns a default-constructed * vector element if the vector is empty. */ template T fold_right(const std::vector &vec, Fn f) { T result; if (vec.empty()) { return result; } result = vec.back(); for (size_t i = vec.size() - 1; i > 0; i--) { result = f(vec[i - 1], result); } return result; } template struct meta_and : std::true_type {}; template struct meta_and : std::integral_constant::value> {}; template struct meta_or : std::false_type {}; template struct meta_or : std::integral_constant::value> {}; template struct all_are_convertible : meta_and...> {}; /** Returns base name and fills in namespaces, outermost one first in vector. 
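 *
 * For example (illustrative; the name "Halide::Internal::Foo" is arbitrary):
 * \code
 * std::vector<std::string> namespaces;
 * std::string base = extract_namespaces("Halide::Internal::Foo", namespaces);
 * // base == "Foo", namespaces == {"Halide", "Internal"}
 * \endcode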
*/ std::string extract_namespaces(const std::string &name, std::vector &namespaces); /** Like extract_namespaces(), but strip and discard the namespaces, returning base name only */ std::string strip_namespaces(const std::string &name); struct FileStat { uint64_t file_size; uint32_t mod_time; // Unix epoch time uint32_t uid; uint32_t gid; uint32_t mode; }; /** Create a unique file with a name of the form prefixXXXXXsuffix in an arbitrary * (but writable) directory; this is typically /tmp, but the specific * location is not guaranteed. (Note that the exact form of the file name * may vary; in particular, the suffix may be ignored on Windows.) * The file is created (but not opened), thus this can be called from * different threads (or processes, e.g. when building with parallel make) * without risking collision. Note that if this file is used as a temporary * file, the caller is responsibly for deleting it. Neither the prefix nor suffix * may contain a directory separator. */ std::string file_make_temp(const std::string &prefix, const std::string &suffix); /** Create a unique directory in an arbitrary (but writable) directory; this is * typically somewhere inside /tmp, but the specific location is not guaranteed. * The directory will be empty (i.e., this will never return /tmp itself, * but rather a new directory inside /tmp). The caller is responsible for removing the * directory after use. */ std::string dir_make_temp(); /** Wrapper for access(). Quietly ignores errors. */ bool file_exists(const std::string &name); /** assert-fail if the file doesn't exist. useful primarily for testing purposes. */ void assert_file_exists(const std::string &name); /** assert-fail if the file DOES exist. useful primarily for testing purposes. */ void assert_no_file_exists(const std::string &name); /** Wrapper for unlink(). Asserts upon error. */ void file_unlink(const std::string &name); /** Wrapper for unlink(). Quietly ignores errors. */ void file_unlink(const std::string &name); /** Ensure that no file with this path exists. If such a file * exists and cannot be removed, assert-fail. */ void ensure_no_file_exists(const std::string &name); /** Wrapper for rmdir(). Asserts upon error. */ void dir_rmdir(const std::string &name); /** Wrapper for stat(). Asserts upon error. */ FileStat file_stat(const std::string &name); /** Read the entire contents of a file into a vector. The file * is read in binary mode. Errors trigger an assertion failure. */ std::vector read_entire_file(const std::string &pathname); /** Create or replace the contents of a file with a given pointer-and-length * of memory. If the file doesn't exist, it is created; if it does exist, it * is completely overwritten. Any error triggers an assertion failure. */ void write_entire_file(const std::string &pathname, const void *source, size_t source_len); inline void write_entire_file(const std::string &pathname, const std::vector &source) { write_entire_file(pathname, source.data(), source.size()); } /** A simple utility class that creates a temporary file in its ctor and * deletes that file in its dtor; this is useful for temporary files that you * want to ensure are deleted when exiting a certain scope. Since this is essentially * just an RAII wrapper around file_make_temp() and file_unlink(), it has the same * failure modes (i.e.: assertion upon error). 
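 *
 * A rough usage sketch (the prefix, suffix, and 'bytes' below are arbitrary
 * placeholders):
 * \code
 * {
 *     TemporaryFile tmp("halide_test_", ".o");
 *     std::vector<char> bytes = ...;
 *     write_entire_file(tmp.pathname(), bytes);
 * }   // the file is deleted here, when 'tmp' goes out of scope
 * \endcode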
*/ class TemporaryFile final { public: TemporaryFile(const std::string &prefix, const std::string &suffix) : temp_path(file_make_temp(prefix, suffix)) { } const std::string &pathname() const { return temp_path; } ~TemporaryFile() { if (do_unlink) { file_unlink(temp_path); } } // You can call this if you want to defeat the automatic deletion; // this is rarely what you want to do (since it defeats the purpose // of this class), but can be quite handy for debugging purposes. void detach() { do_unlink = false; } private: const std::string temp_path; bool do_unlink = true; public: TemporaryFile(const TemporaryFile &) = delete; TemporaryFile &operator=(const TemporaryFile &) = delete; TemporaryFile(TemporaryFile &&) = delete; TemporaryFile &operator=(TemporaryFile &&) = delete; }; /** Routines to test if math would overflow for signed integers with * the given number of bits. */ // @{ bool add_would_overflow(int bits, int64_t a, int64_t b); bool sub_would_overflow(int bits, int64_t a, int64_t b); bool mul_would_overflow(int bits, int64_t a, int64_t b); // @} /** Routines to perform arithmetic on signed types without triggering signed * overflow. If overflow would occur, sets result to zero, and returns * false. Otherwise set result to the correct value, and returns true. */ // @{ HALIDE_MUST_USE_RESULT bool add_with_overflow(int bits, int64_t a, int64_t b, int64_t *result); HALIDE_MUST_USE_RESULT bool sub_with_overflow(int bits, int64_t a, int64_t b, int64_t *result); HALIDE_MUST_USE_RESULT bool mul_with_overflow(int bits, int64_t a, int64_t b, int64_t *result); // @} /** Helper class for saving/restoring variable values on the stack, to allow * for early-exit that preserves correctness */ template struct ScopedValue { T &var; T old_value; /** Preserve the old value, restored at dtor time */ ScopedValue(T &var) : var(var), old_value(var) { } /** Preserve the old value, then set the var to a new value. */ ScopedValue(T &var, T new_value) : var(var), old_value(var) { var = new_value; } ~ScopedValue() { var = old_value; } operator T() const { return old_value; } // allow move but not copy ScopedValue(const ScopedValue &that) = delete; ScopedValue(ScopedValue &&that) noexcept = default; }; // Helpers for timing blocks of code. Put 'TIC;' at the start and // 'TOC;' at the end. Timing is reported at the toc via // debug(0). The calls can be nested and will pretty-print // appropriately. Took this idea from matlab via Jon Barron. // // Note that this uses global state internally, and is not thread-safe // at all. Only use it for single-threaded debugging sessions. void halide_tic_impl(const char *file, int line); void halide_toc_impl(const char *file, int line); #define HALIDE_TIC Halide::Internal::halide_tic_impl(__FILE__, __LINE__) #define HALIDE_TOC Halide::Internal::halide_toc_impl(__FILE__, __LINE__) #ifdef COMPILING_HALIDE #define TIC HALIDE_TIC #define TOC HALIDE_TOC #endif // statically cast a value from one type to another: this is really just // some syntactic sugar around static_cast<>() to avoid compiler warnings // regarding 'bool' in some compliation configurations. template struct StaticCast { template inline constexpr static TO value(const FROM &from) { if constexpr (std::is_same::value) { return from != 0; } else { return static_cast(from); } } }; // Like std::is_convertible, but with additional tests for arithmetic types: // ensure that the value will roundtrip losslessly (e.g., no integer truncation // or dropping of fractional parts). 
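// For example, IsRoundtrippable<int>::value(3.0f) is true, while
// IsRoundtrippable<int>::value(3.5f) is false (the fractional part would be lost).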
template struct IsRoundtrippable { template inline constexpr static bool value(const FROM &from) { if constexpr (std::is_convertible::value) { if constexpr (std::is_arithmetic::value && std::is_arithmetic::value && !std::is_same::value) { const TO to = static_cast(from); const FROM roundtripped = static_cast(to); return roundtripped == from; } else { return true; } } else { return false; } } }; /** Emit a version of a string that is a valid identifier in C (. is replaced with _) * If prefix_underscore is true (the default), an underscore will be prepended if the * input starts with an alphabetic character to avoid reserved word clashes. */ std::string c_print_name(const std::string &name, bool prefix_underscore = true); /** Return the LLVM_VERSION against which this libHalide is compiled. This is provided * only for internal tests which need to verify behavior; please don't use this outside * of Halide tests. */ int get_llvm_version(); } // namespace Internal /** Set how much stack the compiler should use for compilation in * bytes. This can also be set through the environment variable * HL_COMPILER_STACK_SIZE, though this function takes precedence. A * value of zero causes the compiler to just use the calling stack for * all compilation tasks. * * Calling this or setting the environment variable should not be * necessary. It is provided for three kinds of testing: * * First, Halide uses it in our internal tests to make sure * we're not using a silly amount of stack size on some * canary programs to avoid stack usage regressions. * * Second, if you have a mysterious crash inside a generator, you can * set a larger stack size as a way to test if it's a stack * overflow. Perhaps our default stack size is not large enough for * your program and schedule. Use this call or the environment var as * a workaround, and then open a bug with a reproducer at * github.com/halide/Halide/issues so that we can determine what's * going wrong that is causing your code to use so much stack. * * Third, perhaps using a side-stack is causing problems with * sanitizing, debugging, or profiling tools. If this is a problem, * you can set HL_COMPILER_STACK_SIZE to zero to make Halide stay on * the main thread's stack. */ void set_compiler_stack_size(size_t); /** The default amount of stack used for lowering and codegen. 32 MB * ought to be enough for anyone. */ constexpr size_t default_compiler_stack_size = 32 * 1024 * 1024; /** Return how much stack size the compiler should use for calls that * go through run_with_large_stack below. Currently that's lowering * and codegen. If no call to set_compiler_stack_size has been made, * this checks the value of the environment variable * HL_COMPILER_STACK_SIZE. If that's unset, it returns * default_compiler_stack_size, defined above. */ size_t get_compiler_stack_size(); namespace Internal { /** Call the given action in a platform-specific context that * provides at least the stack space returned by * get_compiler_stack_size. If that value is zero, just calls the * function on the calling thread. Otherwise on Windows this * uses a Fiber, and on other platforms it uses swapcontext. */ void run_with_large_stack(const std::function &action); /** Portable versions of popcount, count-leading-zeros, and count-trailing-zeros. 
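 * For example: popcount64(0xf0) == 4, clz64(1) == 63, and ctz64(8) == 3.
 * Note that clz64 and ctz64 require a nonzero argument.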
*/ // @{ int popcount64(uint64_t x); int clz64(uint64_t x); int ctz64(uint64_t x); // @} } // namespace Internal } // namespace Halide #endif Halide-17.0.1/src/Var.cpp000066400000000000000000000015571456515664200150370ustar00rootroot00000000000000#include "Var.h" #include "IR.h" #include "Util.h" namespace Halide { Var::Var(const std::string &n) : e(Internal::Variable::make(Int(32), n)) { } Var::Var() : e(Internal::Variable::make(Int(32), Internal::make_entity_name(this, "Halide:.*:Var", 'v'))) { } Var Var::implicit(int n) { return Var("_" + std::to_string(n)); } bool Var::is_implicit(const std::string &name) { return Internal::starts_with(name, "_") && name.find_first_not_of("0123456789", 1) == std::string::npos; } const std::string &Var::name() const { return e.as()->name; } namespace Internal { std::vector make_argument_list(int dimensionality) { std::vector args(dimensionality); for (int i = 0; i < dimensionality; i++) { args[i] = Var::implicit(i); } return args; } } // namespace Internal } // namespace Halide Halide-17.0.1/src/Var.h000066400000000000000000000137071456515664200145040ustar00rootroot00000000000000#ifndef HALIDE_VAR_H #define HALIDE_VAR_H /** \file * Defines the Var - the front-end variable */ #include #include #include "Expr.h" namespace Halide { /** A Halide variable, to be used when defining functions. It is just * a name, and can be reused in places where no name conflict will * occur. It can be used in the left-hand-side of a function * definition, or as an Expr. As an Expr, it always has type * Int(32). */ class Var { /* The expression representing the Var. Guaranteed to be an * Internal::Variable of type Int(32). Created once on * construction of the Var to avoid making a fresh Expr every time * the Var is used in a context in which is will be converted to * one. */ Expr e; public: /** Construct a Var with the given name */ Var(const std::string &n); /** Construct a Var with an automatically-generated unique name. */ Var(); /** Get the name of a Var */ const std::string &name() const; /** Test if two Vars are the same. This simply compares the names. */ bool same_as(const Var &other) const { return name() == other.name(); } /** Implicit var constructor. Implicit variables are injected * automatically into a function call if the number of arguments * to the function are fewer than its dimensionality and a * placeholder ("_") appears in its argument list. Defining a * function to equal an expression containing implicit variables * similarly appends those implicit variables, in the same order, * to the left-hand-side of the definition where the placeholder * ('_') appears. * * For example, consider the definition: * \code Func f, g; Var x, y; f(x, y) = 3; \endcode * * A call to f with the placeholder symbol _ * will have implicit arguments injected automatically, so f(2, _) * is equivalent to f(2, _0), where _0 = ImplicitVar<0>(), and f(_) * (and indeed f when cast to an Expr) is equivalent to f(_0, _1). * The following definitions are all equivalent, differing only in the * variable names. 
* \code g(_) = f*3; g(_) = f(_)*3; g(x, _) = f(x, _)*3; g(x, y) = f(x, y)*3; \endcode * * These are expanded internally as follows: * \code g(_0, _1) = f(_0, _1)*3; g(_0, _1) = f(_0, _1)*3; g(x, _0) = f(x, _0)*3; g(x, y) = f(x, y)*3; \endcode * * The following, however, defines g as four dimensional: \code g(x, y, _) = f*3; \endcode * * It is equivalent to: * \code g(x, y, _0, _1) = f(_0, _1)*3; \endcode * * Expressions requiring differing numbers of implicit variables * can be combined. The left-hand-side of a definition injects * enough implicit variables to cover all of them: * \code Func h; h(x) = x*3; g(x) = h + (f + f(x)) * f(x, y); \endcode * * expands to: * \code Func h; h(x) = x*3; g(x, _0, _1) = h(_0) + (f(_0, _1) + f(x, _0)) * f(x, y); \endcode * * The first ten implicits, _0 through _9, are predeclared in this * header and can be used for scheduling. They should never be * used as arguments in a declaration or used in a call. * * While it is possible to use Var::implicit or the predeclared * implicits to create expressions that can be treated as small * anonymous functions (e.g. Func(_0 + _1)) this is considered * poor style. Instead use \ref lambda. */ static Var implicit(int n); /** Return whether a variable name is of the form for an implicit argument. * TODO: This is almost guaranteed to incorrectly fire on user * declared variables at some point. We should likely prevent * user Var declarations from making names of this form. */ //{ static bool is_implicit(const std::string &name); bool is_implicit() const { return is_implicit(name()); } //} /** Return the argument index for a placeholder argument given its * name. Returns 0 for _0, 1 for _1, etc. Returns -1 if * the variable is not of implicit form. */ //{ static int implicit_index(const std::string &name) { return is_implicit(name) ? atoi(name.c_str() + 1) : -1; } int implicit_index() const { return implicit_index(name()); } //} /** Test if a var is the placeholder variable _ */ //{ static bool is_placeholder(const std::string &name) { return name == "_"; } bool is_placeholder() const { return is_placeholder(name()); } //} /** A Var can be treated as an Expr of type Int(32) */ operator const Expr &() const { return e; } /** A Var that represents the location outside the outermost loop. */ static Var outermost() { return Var("__outermost"); } }; template struct ImplicitVar { Var to_var() const { if (N >= 0) { return Var::implicit(N); } else { return Var("_"); } } operator Var() const { return to_var(); } operator Expr() const { return to_var(); } }; /** A placeholder variable for inferred arguments. See \ref Var::implicit */ static constexpr ImplicitVar<> _; /** The first ten implicit Vars for use in scheduling. See \ref Var::implicit */ // @{ static constexpr ImplicitVar<0> _0; static constexpr ImplicitVar<1> _1; static constexpr ImplicitVar<2> _2; static constexpr ImplicitVar<3> _3; static constexpr ImplicitVar<4> _4; static constexpr ImplicitVar<5> _5; static constexpr ImplicitVar<6> _6; static constexpr ImplicitVar<7> _7; static constexpr ImplicitVar<8> _8; static constexpr ImplicitVar<9> _9; // @} namespace Internal { /** Make a list of unique arguments for definitions with unnamed arguments. 
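 *
 * For example (illustrative), make_argument_list(3) yields the implicit
 * Vars {_0, _1, _2}; see Var::implicit above.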
*/ std::vector make_argument_list(int dimensionality); } // namespace Internal } // namespace Halide #endif Halide-17.0.1/src/VectorizeLoops.cpp000066400000000000000000001772321456515664200173020ustar00rootroot00000000000000#include #include #include "CSE.h" #include "CodeGen_GPU_Dev.h" #include "Deinterleave.h" #include "ExprUsesVar.h" #include "IREquality.h" #include "IRMutator.h" #include "IROperator.h" #include "IRPrinter.h" #include "Scope.h" #include "Simplify.h" #include "Solve.h" #include "Substitute.h" #include "VectorizeLoops.h" namespace Halide { namespace Internal { using std::map; using std::pair; using std::string; using std::vector; namespace { Expr get_lane(const Expr &e, int l) { return Shuffle::make_slice(e, l, 0, 1); } /** A helper like .as(), but unwraps arbitrarily many layers of * nested broadcasts. Guaranteed to return either a broadcast of a scalar or * nullptr. */ const Broadcast *as_scalar_broadcast(const Expr &e) { const Broadcast *b = e.as(); if (b && b->value.type().is_scalar()) { return b; } else if (b) { return as_scalar_broadcast(b->value); } else { return nullptr; } }; /** Find the exact scalar max and min lanes of a vector expression. Not * conservative like bounds_of_expr, but uses similar rules for some common node * types where it can be exact. Always returns a scalar, even in the case of * nested vectorization. */ Interval bounds_of_lanes(const Expr &e) { if (e.type().is_scalar()) { return {e, e}; } if (const Add *add = e.as()) { if (const Broadcast *b = as_scalar_broadcast(add->b)) { Interval ia = bounds_of_lanes(add->a); return {ia.min + b->value, ia.max + b->value}; } else if (const Broadcast *b = as_scalar_broadcast(add->a)) { Interval ia = bounds_of_lanes(add->b); return {b->value + ia.min, b->value + ia.max}; } } else if (const Sub *sub = e.as()) { if (const Broadcast *b = as_scalar_broadcast(sub->b)) { Interval ia = bounds_of_lanes(sub->a); return {ia.min - b->value, ia.max - b->value}; } else if (const Broadcast *b = as_scalar_broadcast(sub->a)) { Interval ia = bounds_of_lanes(sub->b); return {b->value - ia.max, b->value - ia.min}; } } else if (const Mul *mul = e.as()) { if (const Broadcast *b = as_scalar_broadcast(mul->b)) { if (is_positive_const(b->value)) { Interval ia = bounds_of_lanes(mul->a); return {ia.min * b->value, ia.max * b->value}; } else if (is_negative_const(b->value)) { Interval ia = bounds_of_lanes(mul->a); return {ia.max * b->value, ia.min * b->value}; } } else if (const Broadcast *b = as_scalar_broadcast(mul->a)) { if (is_positive_const(b->value)) { Interval ia = bounds_of_lanes(mul->b); return {b->value * ia.min, b->value * ia.max}; } else if (is_negative_const(b->value)) { Interval ia = bounds_of_lanes(mul->b); return {b->value * ia.max, b->value * ia.min}; } } } else if (const Div *div = e.as
()) { if (const Broadcast *b = as_scalar_broadcast(div->b)) { if (is_positive_const(b->value)) { Interval ia = bounds_of_lanes(div->a); return {ia.min / b->value, ia.max / b->value}; } else if (is_negative_const(b->value)) { Interval ia = bounds_of_lanes(div->a); return {ia.max / b->value, ia.min / b->value}; } } } else if (const And *and_ = e.as()) { if (const Broadcast *b = as_scalar_broadcast(and_->b)) { Interval ia = bounds_of_lanes(and_->a); return {ia.min && b->value, ia.max && b->value}; } else if (const Broadcast *b = as_scalar_broadcast(and_->a)) { Interval ia = bounds_of_lanes(and_->b); return {ia.min && b->value, ia.max && b->value}; } } else if (const Or *or_ = e.as()) { if (const Broadcast *b = as_scalar_broadcast(or_->b)) { Interval ia = bounds_of_lanes(or_->a); return {ia.min && b->value, ia.max && b->value}; } else if (const Broadcast *b = as_scalar_broadcast(or_->a)) { Interval ia = bounds_of_lanes(or_->b); return {ia.min && b->value, ia.max && b->value}; } } else if (const Min *min = e.as()) { if (const Broadcast *b = as_scalar_broadcast(min->b)) { Interval ia = bounds_of_lanes(min->a); // ia and b->value have both had one nesting layer of vectorization // peeled off, but that doesn't make them the same type. return {Min::make(ia.min, b->value), Min::make(ia.max, b->value)}; } else if (const Broadcast *b = as_scalar_broadcast(min->a)) { Interval ia = bounds_of_lanes(min->b); return {Min::make(ia.min, b->value), Min::make(ia.max, b->value)}; } } else if (const Max *max = e.as()) { if (const Broadcast *b = as_scalar_broadcast(max->b)) { Interval ia = bounds_of_lanes(max->a); return {Max::make(ia.min, b->value), Max::make(ia.max, b->value)}; } else if (const Broadcast *b = as_scalar_broadcast(max->a)) { Interval ia = bounds_of_lanes(max->b); return {Max::make(ia.min, b->value), Max::make(ia.max, b->value)}; } } else if (const Not *not_ = e.as()) { Interval ia = bounds_of_lanes(not_->a); return {!ia.max, !ia.min}; } else if (const Ramp *r = e.as()) { Expr last_lane_idx = make_const(r->base.type().element_of(), r->lanes - 1); Interval ib = bounds_of_lanes(r->base); const Broadcast *b = as_scalar_broadcast(r->stride); Expr stride = b ? b->value : r->stride; if (stride.type().is_scalar()) { if (is_positive_const(stride)) { return {ib.min, ib.max + last_lane_idx * stride}; } else if (is_negative_const(stride)) { return {ib.min + last_lane_idx * stride, ib.max}; } } } else if (const LE *le = e.as()) { // The least true this can be is if we maximize the LHS and minimize the RHS. // The most true this can be is if we minimize the LHS and maximize the RHS. // This is only exact if one of the two sides is a Broadcast. Interval ia = bounds_of_lanes(le->a); Interval ib = bounds_of_lanes(le->b); if (ia.is_single_point() || ib.is_single_point()) { return {ia.max <= ib.min, ia.min <= ib.max}; } } else if (const LT *lt = e.as()) { // The least true this can be is if we maximize the LHS and minimize the RHS. // The most true this can be is if we minimize the LHS and maximize the RHS. // This is only exact if one of the two sides is a Broadcast. 
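// E.g. (x and y are illustrative scalars): for ramp(x, 1, 8) < broadcast(y, 8)
// the least-true lane is x + 7 < y and the most-true lane is x < y.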
Interval ia = bounds_of_lanes(lt->a); Interval ib = bounds_of_lanes(lt->b); if (ia.is_single_point() || ib.is_single_point()) { return {ia.max < ib.min, ia.min < ib.max}; } } else if (const Broadcast *b = as_scalar_broadcast(e)) { return {b->value, b->value}; } else if (const Let *let = e.as()) { Interval ia = bounds_of_lanes(let->value); Interval ib = bounds_of_lanes(let->body); if (expr_uses_var(ib.min, let->name)) { ib.min = Let::make(let->name, let->value, ib.min); } if (expr_uses_var(ib.max, let->name)) { ib.max = Let::make(let->name, let->value, ib.max); } return ib; } // If all else fails, just take the explicit min and max over the // lanes if (e.type().is_bool()) { Expr min_lane = VectorReduce::make(VectorReduce::And, e, 1); Expr max_lane = VectorReduce::make(VectorReduce::Or, e, 1); return {min_lane, max_lane}; } else { Expr min_lane = VectorReduce::make(VectorReduce::Min, e, 1); Expr max_lane = VectorReduce::make(VectorReduce::Max, e, 1); return {min_lane, max_lane}; } }; // A ramp with the lanes repeated inner_repetitions times, and then // the whole vector repeated outer_repetitions times. // E.g: <0 0 2 2 4 4 6 6 0 0 2 2 4 4 6 6>. struct InterleavedRamp { Expr base, stride; int lanes, inner_repetitions, outer_repetitions; }; bool equal_or_zero(int a, int b) { return a == 0 || b == 0 || a == b; } bool is_interleaved_ramp(const Expr &e, const Scope &scope, InterleavedRamp *result) { if (const Ramp *r = e.as()) { const Broadcast *b_base = r->base.as(); const Broadcast *b_stride = r->stride.as(); if (r->base.type().is_scalar()) { result->base = r->base; result->stride = r->stride; result->lanes = r->lanes; result->inner_repetitions = 1; result->outer_repetitions = 1; return true; } else if (b_base && b_stride && b_base->lanes == b_stride->lanes) { // Ramp of broadcast result->base = b_base->value; result->stride = b_stride->value; result->lanes = r->lanes; result->inner_repetitions = b_base->lanes; result->outer_repetitions = 1; return true; } } else if (const Broadcast *b = e.as()) { if (b->value.type().is_scalar()) { result->base = b->value; result->stride = 0; result->lanes = b->lanes; result->inner_repetitions = 0; result->outer_repetitions = 0; return true; } else if (is_interleaved_ramp(b->value, scope, result)) { // Broadcast of interleaved ramp result->outer_repetitions *= b->lanes; return true; } } else if (const Add *add = e.as()) { InterleavedRamp ra; if (is_interleaved_ramp(add->a, scope, &ra) && is_interleaved_ramp(add->b, scope, result) && equal_or_zero(ra.inner_repetitions, result->inner_repetitions) && equal_or_zero(ra.outer_repetitions, result->outer_repetitions)) { result->base = simplify(result->base + ra.base); result->stride = simplify(result->stride + ra.stride); result->inner_repetitions = std::max(result->inner_repetitions, ra.inner_repetitions); result->outer_repetitions = std::max(result->outer_repetitions, ra.outer_repetitions); return true; } } else if (const Sub *sub = e.as()) { InterleavedRamp ra; if (is_interleaved_ramp(sub->a, scope, &ra) && is_interleaved_ramp(sub->b, scope, result) && equal_or_zero(ra.inner_repetitions, result->inner_repetitions) && equal_or_zero(ra.outer_repetitions, result->outer_repetitions)) { result->base = simplify(ra.base - result->base); result->stride = simplify(ra.stride - result->stride); result->inner_repetitions = std::max(result->inner_repetitions, ra.inner_repetitions); result->outer_repetitions = std::max(result->outer_repetitions, ra.outer_repetitions); return true; } } else if (const Mul *mul = e.as()) { const 
int64_t *b = nullptr; if (is_interleaved_ramp(mul->a, scope, result) && (b = as_const_int(mul->b))) { result->base = simplify(result->base * (int)(*b)); result->stride = simplify(result->stride * (int)(*b)); return true; } } else if (const Div *div = e.as<Div>
()) { const int64_t *b = nullptr; if (is_interleaved_ramp(div->a, scope, result) && (b = as_const_int(div->b)) && is_const_one(result->stride) && (result->inner_repetitions == 1 || result->inner_repetitions == 0) && can_prove((result->base % (int)(*b)) == 0)) { // TODO: Generalize this. Currently only matches // ramp(base*b, 1, lanes) / b // broadcast(base * b, lanes) / b result->base = simplify(result->base / (int)(*b)); result->inner_repetitions *= (int)(*b); return true; } } else if (const Mod *mod = e.as()) { const int64_t *b = nullptr; if (is_interleaved_ramp(mod->a, scope, result) && (b = as_const_int(mod->b)) && (result->outer_repetitions == 1 || result->outer_repetitions == 0) && can_prove(((int)(*b) % result->stride) == 0)) { // ramp(base, 2, lanes) % 8 result->base = simplify(result->base % (int)(*b)); result->stride = simplify(result->stride % (int)(*b)); result->outer_repetitions *= (int)(*b); return true; } } else if (const Variable *var = e.as()) { if (scope.contains(var->name)) { return is_interleaved_ramp(scope.get(var->name), scope, result); } } return false; } // Allocations inside vectorized loops grow an additional inner // dimension to represent the separate copy of the allocation per // vector lane. This means loads and stores to them need to be // rewritten slightly. class RewriteAccessToVectorAlloc : public IRMutator { Expr var; string alloc; int lanes; using IRMutator::visit; Expr mutate_index(const string &a, Expr index) { index = mutate(index); if (a == alloc) { return index * lanes + var; } else { return index; } } ModulusRemainder mutate_alignment(const string &a, const ModulusRemainder &align) { if (a == alloc) { return align * lanes; } else { return align; } } Expr visit(const Load *op) override { return Load::make(op->type, op->name, mutate_index(op->name, op->index), op->image, op->param, mutate(op->predicate), mutate_alignment(op->name, op->alignment)); } Stmt visit(const Store *op) override { return Store::make(op->name, mutate(op->value), mutate_index(op->name, op->index), op->param, mutate(op->predicate), mutate_alignment(op->name, op->alignment)); } public: RewriteAccessToVectorAlloc(const string &v, string a, int l) : var(Variable::make(Int(32), v)), alloc(std::move(a)), lanes(l) { } }; class SerializeLoops : public IRMutator { using IRMutator::visit; Stmt visit(const For *op) override { if (op->for_type == ForType::Vectorized) { return For::make(op->name, op->min, op->extent, ForType::Serial, op->partition_policy, op->device_api, mutate(op->body)); } return IRMutator::visit(op); } }; // Wrap a vectorized predicate around a Load/Store node. 
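// For example (names shown for illustration): under the vector predicate p,
// a store f[ramp(x, 1, 8)] = v keeps its index and value but has its
// predicate replaced with simplify(old_predicate && p); loads are handled
// the same way.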
class PredicateLoadStore : public IRMutator { string var; Expr vector_predicate; int lanes; bool valid = true; bool vectorized = false; using IRMutator::visit; Expr merge_predicate(Expr pred, const Expr &new_pred) { if (pred.type().lanes() == new_pred.type().lanes()) { Expr res = simplify(pred && new_pred); return res; } valid = false; return pred; } Expr visit(const Load *op) override { valid = valid && ((op->predicate.type().lanes() == lanes) || (op->predicate.type().is_scalar() && !expr_uses_var(op->index, var))); if (!valid) { return op; } Expr predicate, index; if (!op->index.type().is_scalar()) { internal_assert(op->predicate.type().lanes() == lanes); internal_assert(op->index.type().lanes() == lanes); predicate = mutate(op->predicate); index = mutate(op->index); } else if (expr_uses_var(op->index, var)) { predicate = mutate(Broadcast::make(op->predicate, lanes)); index = mutate(Broadcast::make(op->index, lanes)); } else { return IRMutator::visit(op); } predicate = merge_predicate(predicate, vector_predicate); if (!valid) { return op; } vectorized = true; return Load::make(op->type, op->name, index, op->image, op->param, predicate, op->alignment); } Stmt visit(const Store *op) override { valid = valid && ((op->predicate.type().lanes() == lanes) || (op->predicate.type().is_scalar() && !expr_uses_var(op->index, var))); if (!valid) { return op; } Expr predicate, value, index; if (!op->index.type().is_scalar()) { internal_assert(op->predicate.type().lanes() == lanes); internal_assert(op->index.type().lanes() == lanes); internal_assert(op->value.type().lanes() == lanes); predicate = mutate(op->predicate); value = mutate(op->value); index = mutate(op->index); } else if (expr_uses_var(op->index, var)) { predicate = mutate(Broadcast::make(op->predicate, lanes)); value = mutate(Broadcast::make(op->value, lanes)); index = mutate(Broadcast::make(op->index, lanes)); } else { return IRMutator::visit(op); } predicate = merge_predicate(predicate, vector_predicate); if (!valid) { return op; } vectorized = true; return Store::make(op->name, value, index, op->param, predicate, op->alignment); } Expr visit(const Call *op) override { // We should not vectorize calls with side-effects valid = valid && op->is_pure(); return IRMutator::visit(op); } Expr visit(const VectorReduce *op) override { // We can't predicate vector reductions. valid = valid && is_const_one(vector_predicate); return op; } public: PredicateLoadStore(string v, const Expr &vpred) : var(std::move(v)), vector_predicate(vpred), lanes(vpred.type().lanes()) { internal_assert(lanes > 1); } bool is_vectorized() const { return valid && vectorized; } }; Stmt vectorize_statement(const Stmt &stmt); struct VectorizedVar { string name; Expr min; int lanes; }; // Substitutes a vector for a scalar var in a Stmt. Used on the // body of every vectorized loop. class VectorSubs : public IRMutator { // A list of vectorized loop vars encountered so far. The last // element corresponds to the most inner vectorized loop. std::vector vectorized_vars; // What we're replacing it with. Usually a combination of ramps // and broadcast. It depends on the current loop level and // is updated when vectorized_vars list is updated. std::map replacements; // A scope containing lets and letstmts whose values became // vectors. Contains are original, non-vectorized expressions. Scope scope; // Based on the same set of Exprs, but indexed by the vectorized // var name and holding vectorized expression. Scope vector_scope; // A stack of all containing lets. 
We need to reinject the scalar // version of them if we scalarize inner code. vector> containing_lets; // Widen an expression to the given number of lanes. Expr widen(Expr e, int lanes) { if (e.type().lanes() == lanes) { return e; } else if (lanes % e.type().lanes() == 0) { return Broadcast::make(e, lanes / e.type().lanes()); } else { internal_error << "Mismatched vector lanes in VectorSubs " << e.type().lanes() << " " << lanes << "\n"; } return Expr(); } using IRMutator::visit; Expr visit(const Cast *op) override { Expr value = mutate(op->value); if (value.same_as(op->value)) { return op; } else { Type t = op->type.with_lanes(value.type().lanes()); return Cast::make(t, value); } } Expr visit(const Reinterpret *op) override { Expr value = mutate(op->value); if (value.same_as(op->value)) { return op; } else { Type t = op->type.with_lanes(value.type().lanes()); return Reinterpret::make(t, value); } } string get_widened_var_name(const string &name) { return name + ".widened." + vectorized_vars.back().name; } Expr visit(const Variable *op) override { if (replacements.count(op->name) > 0) { return replacements[op->name]; } else if (scope.contains(op->name)) { string widened_name = get_widened_var_name(op->name); return Variable::make(vector_scope.get(widened_name).type(), widened_name); } else { return op; } } template Expr mutate_binary_operator(const T *op) { Expr a = mutate(op->a), b = mutate(op->b); if (a.same_as(op->a) && b.same_as(op->b)) { return op; } else { int w = std::max(a.type().lanes(), b.type().lanes()); return T::make(widen(a, w), widen(b, w)); } } Expr visit(const Add *op) override { return mutate_binary_operator(op); } Expr visit(const Sub *op) override { return mutate_binary_operator(op); } Expr visit(const Mul *op) override { return mutate_binary_operator(op); } Expr visit(const Div *op) override { return mutate_binary_operator(op); } Expr visit(const Mod *op) override { return mutate_binary_operator(op); } Expr visit(const Min *op) override { return mutate_binary_operator(op); } Expr visit(const Max *op) override { return mutate_binary_operator(op); } Expr visit(const EQ *op) override { return mutate_binary_operator(op); } Expr visit(const NE *op) override { return mutate_binary_operator(op); } Expr visit(const LT *op) override { return mutate_binary_operator(op); } Expr visit(const LE *op) override { return mutate_binary_operator(op); } Expr visit(const GT *op) override { return mutate_binary_operator(op); } Expr visit(const GE *op) override { return mutate_binary_operator(op); } Expr visit(const And *op) override { return mutate_binary_operator(op); } Expr visit(const Or *op) override { return mutate_binary_operator(op); } Expr visit(const Select *op) override { Expr condition = mutate(op->condition); Expr true_value = mutate(op->true_value); Expr false_value = mutate(op->false_value); if (condition.same_as(op->condition) && true_value.same_as(op->true_value) && false_value.same_as(op->false_value)) { return op; } else { int lanes = std::max(true_value.type().lanes(), false_value.type().lanes()); lanes = std::max(lanes, condition.type().lanes()); // Widen the true and false values, but we don't have to widen the condition true_value = widen(true_value, lanes); false_value = widen(false_value, lanes); return Select::make(condition, true_value, false_value); } } Expr visit(const Load *op) override { Expr predicate = mutate(op->predicate); Expr index = mutate(op->index); if (predicate.same_as(op->predicate) && index.same_as(op->index)) { return op; } else { int w = 
index.type().lanes(); predicate = widen(predicate, w); return Load::make(op->type.with_lanes(w), op->name, index, op->image, op->param, predicate, op->alignment); } } Expr visit(const Call *op) override { // Widen the call by changing the lanes of all of its // arguments and its return type // Mutate the args auto [new_args, changed] = mutate_with_changes(op->args); int max_lanes = 0; for (const auto &new_arg : new_args) { max_lanes = std::max(new_arg.type().lanes(), max_lanes); } if (!changed) { return op; } else if (op->name == Call::trace) { const int64_t *event = as_const_int(op->args[6]); internal_assert(event != nullptr); if (*event == halide_trace_begin_realization || *event == halide_trace_end_realization) { // Call::trace vectorizes uniquely for begin/end realization, because the coordinates // for these are actually min/extent pairs; we need to maintain the proper dimensionality // count and instead aggregate the widened values into a single pair. for (size_t i = 1; i <= 2; i++) { const Call *make_struct = Call::as_intrinsic(new_args[i], {Call::make_struct}); internal_assert(make_struct); if (i == 1) { // values should always be empty for these events internal_assert(make_struct->args.empty()); continue; } vector call_args(make_struct->args.size()); for (size_t j = 0; j < call_args.size(); j += 2) { Expr min_v = widen(make_struct->args[j], max_lanes); Expr extent_v = widen(make_struct->args[j + 1], max_lanes); Expr min_scalar = get_lane(min_v, 0); Expr max_scalar = min_scalar + get_lane(extent_v, 0); for (int k = 1; k < max_lanes; ++k) { Expr min_k = get_lane(min_v, k); Expr extent_k = get_lane(extent_v, k); min_scalar = min(min_scalar, min_k); max_scalar = max(max_scalar, min_k + extent_k); } call_args[j] = min_scalar; call_args[j + 1] = max_scalar - min_scalar; } new_args[i] = Call::make(make_struct->type.element_of(), Call::make_struct, call_args, Call::Intrinsic); } } else { // Call::trace vectorizes uniquely, because we want a // single trace call for the entire vector, instead of // scalarizing the call and tracing each element. for (size_t i = 1; i <= 2; i++) { // Each struct should be a struct-of-vectors, not a // vector of distinct structs. const Call *make_struct = Call::as_intrinsic(new_args[i], {Call::make_struct}); internal_assert(make_struct); // Widen the call args to have the same lanes as the max lanes found vector call_args(make_struct->args.size()); for (size_t j = 0; j < call_args.size(); j++) { call_args[j] = widen(make_struct->args[j], max_lanes); } new_args[i] = Call::make(make_struct->type.element_of(), Call::make_struct, call_args, Call::Intrinsic); } // One of the arguments to the trace helper // records the number of vector lanes in the type being // stored. 
new_args[5] = max_lanes; // One of the arguments to the trace helper // records the number entries in the coordinates (which we just widened) if (max_lanes > 1) { new_args[9] = new_args[9] * max_lanes; } } return Call::make(op->type, Call::trace, new_args, op->call_type); } else if (op->is_intrinsic(Call::if_then_else) && op->args.size() == 2) { Expr cond = widen(new_args[0], max_lanes); Expr true_value = widen(new_args[1], max_lanes); const Load *load = true_value.as(); if (load) { return Load::make(op->type.with_lanes(max_lanes), load->name, load->index, load->image, load->param, cond, load->alignment); } } // Widen the args to have the same lanes as the max lanes found for (auto &arg : new_args) { arg = widen(arg, max_lanes); } Type new_op_type = op->type.with_lanes(max_lanes); if (op->is_intrinsic(Call::prefetch)) { // We don't want prefetch args to ve vectorized, but we can't just skip the mutation // (otherwise we can end up with dead loop variables. Instead, use extract_lane() on each arg // to scalarize it again. for (auto &arg : new_args) { if (arg.type().is_vector()) { arg = extract_lane(arg, 0); } } new_op_type = op->type; } return Call::make(new_op_type, op->name, new_args, op->call_type, op->func, op->value_index, op->image, op->param); } Expr visit(const Let *op) override { // Vectorize the let value and check to see if it was vectorized by // this mutator. The type of the expression might already be vector // width. Expr mutated_value = simplify(mutate(op->value)); bool was_vectorized = (!op->value.type().is_vector() && mutated_value.type().is_vector()); // If the value was vectorized by this mutator, add a new name to // the scope for the vectorized value expression. string vectorized_name; if (was_vectorized) { vectorized_name = get_widened_var_name(op->name); scope.push(op->name, op->value); vector_scope.push(vectorized_name, mutated_value); } Expr mutated_body = mutate(op->body); InterleavedRamp ir; if (is_interleaved_ramp(mutated_value, vector_scope, &ir)) { return substitute(vectorized_name, mutated_value, mutated_body); } else if (mutated_value.same_as(op->value) && mutated_body.same_as(op->body)) { return op; } else if (was_vectorized) { scope.pop(op->name); vector_scope.pop(vectorized_name); return Let::make(vectorized_name, mutated_value, mutated_body); } else { return Let::make(op->name, mutated_value, mutated_body); } } Stmt visit(const LetStmt *op) override { Expr mutated_value = simplify(mutate(op->value)); string vectorized_name = op->name; // Check if the value was vectorized by this mutator. bool was_vectorized = (!op->value.type().is_vector() && mutated_value.type().is_vector()); if (was_vectorized) { vectorized_name = get_widened_var_name(op->name); scope.push(op->name, op->value); vector_scope.push(vectorized_name, mutated_value); // Also keep track of the original let, in case inner code scalarizes. containing_lets.emplace_back(op->name, op->value); } Stmt mutated_body = mutate(op->body); if (was_vectorized) { containing_lets.pop_back(); scope.pop(op->name); vector_scope.pop(vectorized_name); } InterleavedRamp ir; if (is_interleaved_ramp(mutated_value, vector_scope, &ir)) { return substitute(vectorized_name, mutated_value, mutated_body); } else if (mutated_value.same_as(op->value) && mutated_body.same_as(op->body)) { return op; } else { return LetStmt::make(vectorized_name, mutated_value, mutated_body); } } Stmt visit(const Provide *op) override { internal_error << "Vectorizing a Provide node is unimplemented. 
" << "Vectorization usually runs after storage flattening.\n"; return Stmt(); } Stmt visit(const Store *op) override { Expr predicate = mutate(op->predicate); Expr value = mutate(op->value); Expr index = mutate(op->index); if (predicate.same_as(op->predicate) && value.same_as(op->value) && index.same_as(op->index)) { return op; } else { int lanes = std::max(predicate.type().lanes(), std::max(value.type().lanes(), index.type().lanes())); return Store::make(op->name, widen(value, lanes), widen(index, lanes), op->param, widen(predicate, lanes), op->alignment); } } Stmt visit(const AssertStmt *op) override { return (mutate(op->condition).type().lanes() > 1) ? scalarize(op) : op; } Stmt visit(const IfThenElse *op) override { Expr cond = mutate(op->condition); int lanes = cond.type().lanes(); debug(3) << "Vectorizing \n" << "Old: " << op->condition << "\n" << "New: " << cond << "\n"; Stmt then_case = mutate(op->then_case); Stmt else_case = mutate(op->else_case); if (lanes > 1) { // We have an if statement with a vector condition, // which would mean control flow divergence within the // SIMD lanes. bool vectorize_predicate = true; Stmt predicated_stmt; if (vectorize_predicate) { PredicateLoadStore p(vectorized_vars.front().name, cond); predicated_stmt = p.mutate(then_case); vectorize_predicate = p.is_vectorized(); } if (vectorize_predicate && else_case.defined()) { PredicateLoadStore p(vectorized_vars.front().name, !cond); predicated_stmt = Block::make(predicated_stmt, p.mutate(else_case)); vectorize_predicate = p.is_vectorized(); } debug(4) << "IfThenElse should vectorize predicate " << "? " << vectorize_predicate << "; cond: " << cond << "\n"; debug(4) << "Predicated stmt:\n" << predicated_stmt << "\n"; // First check if the condition is marked as likely. if (const Call *likely = Call::as_intrinsic(cond, {Call::likely, Call::likely_if_innermost})) { // The meaning of the likely intrinsic is that // Halide should optimize for the case in which // *every* likely value is true. We can do that by // generating a scalar condition that checks if // the least-true lane is true. Expr all_true = bounds_of_lanes(likely->args[0]).min; internal_assert(all_true.type() == Bool()); // Wrap it in the same flavor of likely all_true = Call::make(Bool(), likely->name, {all_true}, Call::PureIntrinsic); if (!vectorize_predicate) { // We should strip the likelies from the case // that's going to scalarize, because it's no // longer likely. Stmt without_likelies = IfThenElse::make(unwrap_tags(op->condition), op->then_case, op->else_case); // scalarize() will put back all vectorized loops around the statement as serial, // but it still may happen that there are vectorized loops inside of the statement // itself which we may want to handle. All the context is invalid though, so // we just start anew for this specific statement. Stmt scalarized = scalarize(without_likelies, false); scalarized = vectorize_statement(scalarized); Stmt stmt = IfThenElse::make(all_true, then_case, scalarized); debug(4) << "...With all_true likely: \n" << stmt << "\n"; return stmt; } else { Stmt stmt = IfThenElse::make(all_true, then_case, predicated_stmt); debug(4) << "...Predicated IfThenElse: \n" << stmt << "\n"; return stmt; } } else { // It's some arbitrary vector condition. 
if (!vectorize_predicate) { debug(4) << "...Scalarizing vector predicate: \n" << Stmt(op) << "\n"; return scalarize(op); } else { Stmt stmt = predicated_stmt; debug(4) << "...Predicated IfThenElse: \n" << stmt << "\n"; return stmt; } } } else { // It's an if statement on a scalar, we're ok to vectorize the innards. debug(3) << "Not scalarizing if then else\n"; if (cond.same_as(op->condition) && then_case.same_as(op->then_case) && else_case.same_as(op->else_case)) { return op; } else { return IfThenElse::make(cond, then_case, else_case); } } } Stmt visit(const For *op) override { ForType for_type = op->for_type; Expr min = mutate(op->min); Expr extent = mutate(op->extent); Stmt body = op->body; if (min.type().is_vector()) { // Rebase the loop to zero and try again Expr var = Variable::make(Int(32), op->name); Stmt body = substitute(op->name, var + op->min, op->body); Stmt transformed = For::make(op->name, 0, op->extent, for_type, op->partition_policy, op->device_api, body); return mutate(transformed); } if (extent.type().is_vector()) { // We'll iterate up to the max over the lanes, but // inject an if statement inside the loop that stops // each lane from going too far. extent = bounds_of_lanes(extent).max; Expr var = Variable::make(Int(32), op->name); body = IfThenElse::make(likely(var < op->min + op->extent), body); } if (op->for_type == ForType::Vectorized) { const IntImm *extent_int = extent.as(); internal_assert(extent_int) << "Vectorized for loop extent should have been rewritten to a constant\n"; if (extent_int->value <= 1) { user_error << "Loop over " << op->name << " has extent " << extent << ". Can only vectorize loops over a " << "constant extent > 1\n"; } vectorized_vars.push_back({op->name, min, (int)extent_int->value}); update_replacements(); // Go over lets which were vectorized in the order of their occurrence and update // them according to the current loop level. for (auto let = containing_lets.begin(); let != containing_lets.end(); let++) { // Skip if this var wasn't vectorized. if (!scope.contains(let->first)) { continue; } string vectorized_name = get_widened_var_name(let->first); Expr vectorized_value = mutate(scope.get(let->first)); vector_scope.push(vectorized_name, vectorized_value); } body = mutate(body); // Append vectorized lets for this loop level. for (auto let = containing_lets.rbegin(); let != containing_lets.rend(); let++) { // Skip if this var wasn't vectorized. if (!scope.contains(let->first)) { continue; } string vectorized_name = get_widened_var_name(let->first); Expr vectorized_value = vector_scope.get(vectorized_name); vector_scope.pop(vectorized_name); InterleavedRamp ir; if (is_interleaved_ramp(vectorized_value, vector_scope, &ir)) { body = substitute(vectorized_name, vectorized_value, body); } else { body = LetStmt::make(vectorized_name, vectorized_value, body); } } vectorized_vars.pop_back(); update_replacements(); return body; } else { body = mutate(body); if (min.same_as(op->min) && extent.same_as(op->extent) && body.same_as(op->body) && for_type == op->for_type) { return op; } else { return For::make(op->name, min, extent, for_type, op->partition_policy, op->device_api, body); } } } Stmt visit(const Allocate *op) override { vector new_extents; Expr new_expr; // The new expanded dimensions are innermost. for (const auto &vv : vectorized_vars) { new_extents.emplace_back(vv.lanes); } for (const auto &e : op->extents) { Expr extent = mutate(e); // For vector sizes, take the max over the lanes. 
Note // that we haven't changed the strides, which also may // vary per lane. This is a bit weird, but the way we // set up the vectorized memory means that lanes can't // clobber each others' memory, so it doesn't matter. if (extent.type().is_vector()) { extent = bounds_of_lanes(extent).max; } new_extents.push_back(extent); } if (op->new_expr.defined()) { new_expr = mutate(op->new_expr); user_assert(new_expr.type().is_scalar()) << "Cannot vectorize an allocation with a varying new_expr per vector lane.\n"; } Stmt body = op->body; // Rewrite loads and stores to this allocation like so: // foo[x] -> foo[x*lanes + v] for (const auto &vv : vectorized_vars) { body = RewriteAccessToVectorAlloc(vv.name + ".from_zero", op->name, vv.lanes).mutate(body); } body = mutate(body); for (const auto &vv : vectorized_vars) { // The variable itself could still exist inside an inner scalarized block. body = substitute(vv.name + ".from_zero", Variable::make(Int(32), vv.name), body); } // Difficult to tell how the padding should grow when vectorizing an // allocation. It's not currently an issue, because vectorization // happens before the only source of padding (lowering strided // loads). Add an assert to enforce it. internal_assert(op->padding == 0) << "Vectorization of padded allocations not yet implemented"; return Allocate::make(op->name, op->type, op->memory_type, new_extents, op->condition, body, new_expr, op->free_function); } Stmt visit(const Atomic *op) override { // Recognize a few special cases that we can handle as within-vector reduction trees. do { if (!op->mutex_name.empty()) { // We can't vectorize over a mutex break; } const Store *store = op->body.as(); if (!store) { break; } // f[x] = y if (!expr_uses_var(store->value, store->name) && !expr_uses_var(store->predicate, store->name)) { // This can be naively vectorized just fine. If there are // repeated values in the vectorized store index, the ordering // of writes may be undetermined and backend-dependent, but // they'll be atomic. Stmt s = mutate(store); // We may still need the atomic node, if there was more // parallelism than just the vectorization. s = Atomic::make(op->producer_name, op->mutex_name, s); return s; } // f[x] = f[x] y VectorReduce::Operator reduce_op = VectorReduce::Add; Expr a, b; if (const Add *add = store->value.as()) { a = add->a; b = add->b; reduce_op = VectorReduce::Add; } else if (const Mul *mul = store->value.as()) { a = mul->a; b = mul->b; reduce_op = VectorReduce::Mul; } else if (const Min *min = store->value.as()) { a = min->a; b = min->b; reduce_op = VectorReduce::Min; } else if (const Max *max = store->value.as()) { a = max->a; b = max->b; reduce_op = VectorReduce::Max; } else if (const Cast *cast_op = store->value.as()) { if (cast_op->type.element_of() == UInt(8) && cast_op->value.type().is_bool()) { if (const And *and_op = cast_op->value.as()) { a = and_op->a; b = and_op->b; reduce_op = VectorReduce::And; } else if (const Or *or_op = cast_op->value.as()) { a = or_op->a; b = or_op->b; reduce_op = VectorReduce::Or; } } } else if (const Call *call_op = store->value.as()) { if (call_op->is_intrinsic(Call::saturating_add)) { a = call_op->args[0]; b = call_op->args[1]; reduce_op = VectorReduce::SaturatingAdd; } } if (!a.defined() || !b.defined()) { break; } // Bools get cast to uint8 for storage. Strip off that // cast around any load. 
if (b.type().is_bool()) { const Cast *cast_op = b.as(); if (cast_op) { b = cast_op->value; } } if (a.type().is_bool()) { const Cast *cast_op = b.as(); if (cast_op) { a = cast_op->value; } } if (a.as() && !b.as()) { std::swap(a, b); } // We require b to be a var, because it should have been lifted. const Variable *var_b = b.as(); const Load *load_a = a.as(); if (!var_b || !scope.contains(var_b->name) || !load_a || load_a->name != store->name || !is_const_one(load_a->predicate) || !is_const_one(store->predicate)) { break; } b = vector_scope.get(get_widened_var_name(var_b->name)); Expr store_index = mutate(store->index); Expr load_index = mutate(load_a->index); // The load and store indices must be the same interleaved // ramp (or the same scalar, in the total reduction case). InterleavedRamp store_ir, load_ir; Expr test; if (store_index.type().is_scalar()) { test = simplify(load_index == store_index); } else if (is_interleaved_ramp(store_index, vector_scope, &store_ir) && is_interleaved_ramp(load_index, vector_scope, &load_ir) && store_ir.inner_repetitions == load_ir.inner_repetitions && store_ir.outer_repetitions == load_ir.outer_repetitions && store_ir.lanes == load_ir.lanes) { test = simplify(store_ir.base == load_ir.base && store_ir.stride == load_ir.stride); } if (!test.defined()) { break; } if (is_const_zero(test)) { break; } else if (!is_const_one(test)) { // TODO: try harder by substituting in more things in scope break; } auto binop = [=](const Expr &a, const Expr &b) { switch (reduce_op) { case VectorReduce::Add: return a + b; case VectorReduce::Mul: return a * b; case VectorReduce::Min: return min(a, b); case VectorReduce::Max: return max(a, b); case VectorReduce::And: return a && b; case VectorReduce::Or: return a || b; case VectorReduce::SaturatingAdd: return saturating_add(a, b); } return Expr(); }; int output_lanes = 1; if (store_index.type().is_scalar()) { // The index doesn't depend on the value being // vectorized, so it's a total reduction. b = VectorReduce::make(reduce_op, b, 1); } else { output_lanes = store_index.type().lanes() / (store_ir.inner_repetitions * store_ir.outer_repetitions); store_index = Ramp::make(store_ir.base, store_ir.stride, output_lanes / store_ir.base.type().lanes()); if (store_ir.inner_repetitions > 1) { b = VectorReduce::make(reduce_op, b, output_lanes * store_ir.outer_repetitions); } // Handle outer repetitions by unrolling the reduction // over slices. if (store_ir.outer_repetitions > 1) { // First remove all powers of two with a binary reduction tree. int reps = store_ir.outer_repetitions; while (reps % 2 == 0) { int l = b.type().lanes() / 2; Expr b0 = Shuffle::make_slice(b, 0, 1, l); Expr b1 = Shuffle::make_slice(b, l, 1, l); b = binop(b0, b1); reps /= 2; } // Then reduce linearly over slices for the rest. if (reps > 1) { Expr v = Shuffle::make_slice(b, 0, 1, output_lanes); for (int i = 1; i < reps; i++) { Expr slice = simplify(Shuffle::make_slice(b, i * output_lanes, 1, output_lanes)); v = binop(v, slice); } b = v; } } } Expr new_load = Load::make(load_a->type.with_lanes(output_lanes), load_a->name, store_index, load_a->image, load_a->param, const_true(output_lanes), ModulusRemainder{}); Expr lhs = cast(b.type(), new_load); b = binop(lhs, b); b = cast(new_load.type(), b); Stmt s = Store::make(store->name, b, store_index, store->param, const_true(b.type().lanes()), store->alignment); // We may still need the atomic node, if there was more // parallelism than just the vectorization. 
s = Atomic::make(op->producer_name, op->mutex_name, s); return s; } while (false); // In the general case, if a whole stmt has to be done // atomically, we need to serialize. return scalarize(op); } Stmt scalarize(Stmt s, bool serialize_inner_loops = true) { // Wrap a serial loop around it. Maybe LLVM will have // better luck vectorizing it. if (serialize_inner_loops) { s = SerializeLoops().mutate(s); } // We'll need the original scalar versions of any containing lets. for (size_t i = containing_lets.size(); i > 0; i--) { const auto &l = containing_lets[i - 1]; s = LetStmt::make(l.first, l.second, s); } for (int ix = vectorized_vars.size() - 1; ix >= 0; ix--) { s = For::make(vectorized_vars[ix].name, vectorized_vars[ix].min, vectorized_vars[ix].lanes, ForType::Serial, Partition::Auto, DeviceAPI::None, s); } return s; } Expr scalarize(Expr e) { // This method returns a select tree that produces a vector lanes // result expression user_assert(replacements.size() == 1) << "Can't scalarize nested vectorization\n"; string var = replacements.begin()->first; Expr replacement = replacements.begin()->second; Expr result; int lanes = replacement.type().lanes(); for (int i = lanes - 1; i >= 0; --i) { // Hide all the vector let values in scope with a scalar version // in the appropriate lane. for (Scope::const_iterator iter = scope.cbegin(); iter != scope.cend(); ++iter) { e = substitute(iter.name(), get_lane(Variable::make(iter.value().type(), iter.name()), i), e); } // Replace uses of the vectorized variable with the extracted // lane expression e = substitute(var, i, e); if (i == lanes - 1) { result = Broadcast::make(e, lanes); } else { Expr cond = (replacement == Broadcast::make(i, lanes)); result = Select::make(cond, Broadcast::make(e, lanes), result); } } return result; } // Recompute all replacements for vectorized vars based on // the current stack of vectorized loops. void update_replacements() { replacements.clear(); for (const auto &var : vectorized_vars) { // Two different replacements are needed for each loop var // one starting from zero and another starting from loop.min. replacements[var.name] = var.min; replacements[var.name + ".from_zero"] = 0; } Expr strided_ones = 1; for (int ix = vectorized_vars.size() - 1; ix >= 0; ix--) { for (int ik = 0; ik < (int)vectorized_vars.size(); ik++) { if (ix == ik) { replacements[vectorized_vars[ik].name] = Ramp::make(replacements[vectorized_vars[ik].name], strided_ones, vectorized_vars[ix].lanes); replacements[vectorized_vars[ik].name + ".from_zero"] = Ramp::make(replacements[vectorized_vars[ik].name + ".from_zero"], strided_ones, vectorized_vars[ix].lanes); } else { replacements[vectorized_vars[ik].name] = Broadcast::make(replacements[vectorized_vars[ik].name], vectorized_vars[ix].lanes); replacements[vectorized_vars[ik].name + ".from_zero"] = Broadcast::make(replacements[vectorized_vars[ik].name + ".from_zero"], vectorized_vars[ix].lanes); } } strided_ones = Broadcast::make(strided_ones, vectorized_vars[ix].lanes); } } public: VectorSubs(const VectorizedVar &vv) { vectorized_vars.push_back(vv); update_replacements(); } }; // namespace class FindVectorizableExprsInAtomicNode : public IRMutator { // An Atomic node protects all accesses to a given buffer. We // consider a name "poisoned" if it depends on an access to this // buffer. We can't lift or vectorize anything that has been // poisoned. 
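// E.g. (illustrative, with the node protecting a buffer f): an expression
// like f[x] + 1 is poisoned, while g[x] * 2, which never touches f, can
// still be lifted.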
Scope<> poisoned_names; bool poison = false; using IRMutator::visit; template const T *visit_let(const T *op) { mutate(op->value); ScopedBinding<> bind_if(poison, poisoned_names, op->name); mutate(op->body); return op; } Stmt visit(const LetStmt *op) override { return visit_let(op); } Expr visit(const Let *op) override { return visit_let(op); } Expr visit(const Load *op) override { // Even if the load is bad, maybe we can lift the index IRMutator::visit(op); poison |= poisoned_names.contains(op->name); return op; } Expr visit(const Variable *op) override { poison = poisoned_names.contains(op->name); return op; } Stmt visit(const Store *op) override { // A store poisons all subsequent loads, but loads before the // first store can be lifted. mutate(op->index); mutate(op->value); poisoned_names.push(op->name); return op; } Expr visit(const Call *op) override { IRMutator::visit(op); // unsafe_promise_clamped and similar isn't pure because it's // not safe to lift it out of if statements. If *is* safe to // lift it out of atomic nodes though. poison |= !(op->is_pure() || op->is_intrinsic(Call::unsafe_promise_clamped) || op->is_intrinsic(Call::promise_clamped)); return op; } public: using IRMutator::mutate; Expr mutate(const Expr &e) override { bool old_poison = poison; poison = false; IRMutator::mutate(e); if (!poison) { liftable.insert(e); } poison |= old_poison; // We're not actually mutating anything. This class is only a // mutator so that we can override a generic mutate() method. return e; } FindVectorizableExprsInAtomicNode(const string &buf, const map &env) { poisoned_names.push(buf); auto it = env.find(buf); if (it != env.end()) { // Handle tuples size_t n = it->second.values().size(); if (n > 1) { for (size_t i = 0; i < n; i++) { poisoned_names.push(buf + "." + std::to_string(i)); } } } } std::set liftable; }; class LiftVectorizableExprsOutOfSingleAtomicNode : public IRMutator { const std::set &liftable; using IRMutator::visit; template StmtOrExpr visit_let(const LetStmtOrLet *op) { if (liftable.count(op->value)) { // Lift it under its current name to avoid having to // rewrite the variables in other lifted exprs. // TODO: duplicate non-overlapping liftable let stmts due to unrolling. 
lifted.emplace_back(op->name, op->value); return mutate(op->body); } else { return IRMutator::visit(op); } } Stmt visit(const LetStmt *op) override { return visit_let(op); } Expr visit(const Let *op) override { return visit_let(op); } public: map already_lifted; vector> lifted; using IRMutator::mutate; Expr mutate(const Expr &e) override { if (liftable.count(e) && !is_const(e) && !e.as()) { auto it = already_lifted.find(e); string name; if (it != already_lifted.end()) { name = it->second; } else { name = unique_name('t'); lifted.emplace_back(name, e); already_lifted.emplace(e, name); } return Variable::make(e.type(), name); } else { return IRMutator::mutate(e); } } LiftVectorizableExprsOutOfSingleAtomicNode(const std::set &liftable) : liftable(liftable) { } }; class LiftVectorizableExprsOutOfAllAtomicNodes : public IRMutator { using IRMutator::visit; Stmt visit(const Atomic *op) override { FindVectorizableExprsInAtomicNode finder(op->producer_name, env); finder.mutate(op->body); LiftVectorizableExprsOutOfSingleAtomicNode lifter(finder.liftable); Stmt new_body = lifter.mutate(op->body); new_body = Atomic::make(op->producer_name, op->mutex_name, new_body); while (!lifter.lifted.empty()) { auto p = lifter.lifted.back(); new_body = LetStmt::make(p.first, p.second, new_body); lifter.lifted.pop_back(); } return new_body; } const map &env; public: LiftVectorizableExprsOutOfAllAtomicNodes(const map &env) : env(env) { } }; // Vectorize all loops marked as such in a Stmt class VectorizeLoops : public IRMutator { using IRMutator::visit; Stmt visit(const For *for_loop) override { Stmt stmt; if (for_loop->for_type == ForType::Vectorized) { const IntImm *extent = for_loop->extent.as(); if (!extent || extent->value <= 1) { user_error << "Loop over " << for_loop->name << " has extent " << for_loop->extent << ". Can only vectorize loops over a " << "constant extent > 1\n"; } VectorizedVar vectorized_var = {for_loop->name, for_loop->min, (int)extent->value}; stmt = VectorSubs(vectorized_var).mutate(for_loop->body); } else { stmt = IRMutator::visit(for_loop); } return stmt; } }; /** Check if all stores in a Stmt are to names in a given scope. Used by RemoveUnnecessaryAtomics below. */ class AllStoresInScope : public IRVisitor { using IRVisitor::visit; void visit(const Store *op) override { result = result && s.contains(op->name); } public: bool result = true; const Scope<> &s; AllStoresInScope(const Scope<> &s) : s(s) { } }; bool all_stores_in_scope(const Stmt &stmt, const Scope<> &scope) { AllStoresInScope checker(scope); stmt.accept(&checker); return checker.result; } /** Drop any atomic nodes protecting buffers that are only accessed * from a single thread. 
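 *
 * For example (illustrative), an Atomic node whose stores all target an
 * allocation made inside the surrounding parallel loop body can be dropped,
 * since no other thread can touch that buffer.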
*/ class RemoveUnnecessaryAtomics : public IRMutator { using IRMutator::visit; // Allocations made from within this same thread bool in_thread = false; Scope<> local_allocs; Stmt visit(const Allocate *op) override { ScopedBinding<> bind(local_allocs, op->name); return IRMutator::visit(op); } Stmt visit(const Atomic *op) override { if (!in_thread || all_stores_in_scope(op->body, local_allocs)) { return mutate(op->body); } else { return op; } } Stmt visit(const For *op) override { if (is_parallel(op->for_type)) { ScopedValue old_in_thread(in_thread, true); Scope<> old_local_allocs; old_local_allocs.swap(local_allocs); Stmt s = IRMutator::visit(op); old_local_allocs.swap(local_allocs); return s; } else { return IRMutator::visit(op); } } }; Stmt vectorize_statement(const Stmt &stmt) { return VectorizeLoops().mutate(stmt); } } // namespace Stmt vectorize_loops(const Stmt &stmt, const map &env) { // Limit the scope of atomic nodes to just the necessary stuff. // TODO: Should this be an earlier pass? It's probably a good idea // for non-vectorizing stuff too. Stmt s = LiftVectorizableExprsOutOfAllAtomicNodes(env).mutate(stmt); s = vectorize_statement(s); s = RemoveUnnecessaryAtomics().mutate(s); return s; } } // namespace Internal } // namespace Halide Halide-17.0.1/src/VectorizeLoops.h000066400000000000000000000011101456515664200167240ustar00rootroot00000000000000#ifndef HALIDE_VECTORIZE_LOOPS_H #define HALIDE_VECTORIZE_LOOPS_H /** \file * Defines the lowering pass that vectorizes loops marked as such */ #include "Expr.h" #include "Function.h" #include namespace Halide { struct Target; namespace Internal { /** Take a statement with for loops marked for vectorization, and turn * them into single statements that operate on vectors. The loops in * question must have constant extent. */ Stmt vectorize_loops(const Stmt &s, const std::map &env); } // namespace Internal } // namespace Halide #endif Halide-17.0.1/src/WasmExecutor.cpp000066400000000000000000003132761456515664200167410ustar00rootroot00000000000000#include "WasmExecutor.h" #include #include #include #include #include #include #include #include "CodeGen_Posix.h" #include "CodeGen_Targets.h" #include "Error.h" #include "Float16.h" #include "Func.h" #include "ImageParam.h" #include "JITModule.h" #if WITH_WABT || WITH_V8 #include "LLVM_Headers.h" #endif #include "LLVM_Output.h" #include "LLVM_Runtime_Linker.h" #include "Target.h" #if WITH_WABT #include "wabt/binary-reader.h" #include "wabt/cast.h" #include "wabt/common.h" #include "wabt/error-formatter.h" #include "wabt/error.h" #include "wabt/feature.h" #include "wabt/interp/binary-reader-interp.h" #include "wabt/interp/interp-util.h" #include "wabt/interp/interp.h" #include "wabt/interp/istream.h" #include "wabt/result.h" #include "wabt/stream.h" #endif // clang-format off // These includes are order-dependent, don't let clang-format reorder them #ifdef WITH_V8 #include "v8.h" #include "libplatform/libplatform.h" #endif // WITH_V8 // clang-format on #if WITH_WABT || WITH_V8 #if LLVM_VERSION >= 170 LLD_HAS_DRIVER(wasm) #endif #endif namespace Halide { namespace Internal { // Trampolines do not use "_argv" as the suffix because // that name may already exist and if so, will return an int // instead of taking a pointer at the end of the args list to // receive the result value. 
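// For a pipeline entry point named "foo" (name used purely for illustration),
// the generated wrapper is therefore named "foo_trampoline" rather than
// reusing any existing "foo_argv" symbol.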
static const char kTrampolineSuffix[] = "_trampoline"; #ifdef WITH_V8 using namespace v8; #define V8_API_VERSION ((V8_MAJOR_VERSION * 10) + V8_MINOR_VERSION) static_assert(V8_API_VERSION >= 98, "Halide requires V8 v9.8 or later when compiling WITH_V8."); #endif #if WITH_WABT || WITH_V8 namespace { // --------------------- // General debug helpers // --------------------- // Debugging the WebAssembly JIT support is usually disconnected from the rest of HL_DEBUG_CODEGEN #define WASM_DEBUG_LEVEL 0 struct debug_sink { debug_sink() = default; template inline debug_sink &operator<<(T &&x) { return *this; } }; #if WASM_DEBUG_LEVEL #define wdebug(x) Halide::Internal::debug(((x) <= WASM_DEBUG_LEVEL) ? 0 : 255) #define wassert(x) internal_assert(x) #else #define wdebug(x) debug_sink() #define wassert(x) debug_sink() #endif // --------------------- // BDMalloc // --------------------- template inline T align_up(T p, int alignment = 32) { return (p + alignment - 1) & ~(alignment - 1); } // Debugging our Malloc is extremely noisy and usually undesired #define BDMALLOC_DEBUG_LEVEL 0 #if BDMALLOC_DEBUG_LEVEL #define bddebug(x) Halide::Internal::debug(((x) <= BDMALLOC_DEBUG_LEVEL) ? 0 : 255) #else #define bddebug(x) debug_sink() #endif // BDMalloc aka BrainDeadMalloc. This is an *extremely* simple-minded implementation // of malloc/free on top of a WasmMemoryObject, and is intended to be just barely adequate // to allow Halide's JIT-based tests to pass. It is neither memory-efficient nor performant, // nor has it been particularly well-vetted for potential buffer overruns and such. class BDMalloc { struct Region { uint32_t size : 31; uint32_t used : 1; }; uint32_t total_size = 0; std::map regions; public: BDMalloc() = default; uint32_t get_total_size() const { return total_size; } bool inited() const { return total_size > 0; } void init(uint32_t total_size, uint32_t heap_start = 1) { this->total_size = total_size; regions.clear(); internal_assert(heap_start < total_size); // Area before heap_start is permanently off-limits regions[0] = {heap_start, true}; // Everything else is free regions[heap_start] = {total_size - heap_start, false}; } void reset() { this->total_size = 0; regions.clear(); } uint32_t alloc_region(uint32_t requested_size) { internal_assert(requested_size > 0); bddebug(1) << "begin alloc_region " << requested_size << "\n"; validate(); // TODO: this would be faster with a basic free list, // but for most Halide test code, there aren't enough allocations // for this to be worthwhile, or at least that's my observation from // a first run-thru. Consider adding it if any of the JIT-based tests // seem unreasonably slow; as it is, a linear search for the first free // block of adequate size is (apparently) performant enough. // alignment and min-block-size are the same for our purposes here. 
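        // (Worked example of the rounding below: a requested_size of 100 becomes
        // align_up(100, 32) = (100 + 31) & ~31 = 128, i.e. four 32-byte blocks.)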
constexpr uint32_t kAlignment = 32; const uint32_t size = std::max(align_up((uint32_t)requested_size, kAlignment), kAlignment); constexpr uint32_t kMaxAllocSize = 0x7fffffff; internal_assert(size <= kMaxAllocSize); bddebug(2) << "size -> " << size << "\n"; for (auto ®ion : regions) { const uint32_t start = region.first; Region &r = region.second; if (!r.used && r.size >= size) { bddebug(2) << "alloc @ " << start << "," << (uint32_t)r.size << "\n"; if (r.size > size + kAlignment) { // Split the block const uint32_t r2_start = start + size; const uint32_t r2_size = r.size - size; regions[r2_start] = {r2_size, false}; r.size = size; bddebug(2) << "split: r-> " << start << "," << (uint32_t)r.size << "," << (start + r.size) << "\n"; bddebug(2) << "split: r2-> " << r2_start << "," << r2_size << "," << (r2_start + r2_size) << "\n"; } // Just return the block r.used = true; bddebug(1) << "end alloc_region " << requested_size << "\n"; validate(); return start; } } bddebug(1) << "fail alloc_region " << requested_size << "\n"; validate(); return 0; } void free_region(uint32_t start) { bddebug(1) << "begin free_region " << start << "\n"; validate(); // Can't free region at zero if (!start) { return; } internal_assert(start > 0); auto it = regions.find(start); internal_assert(it != regions.end()); internal_assert(it->second.used); it->second.used = false; // If prev region is free, combine with it if (it != regions.begin()) { auto prev = std::prev(it); if (!prev->second.used) { bddebug(2) << "combine prev: " << prev->first << " w/ " << it->first << "\n"; prev->second.size += it->second.size; regions.erase(it); it = prev; } } // If next region is free, combine with it auto next = std::next(it); if (next != regions.end() && !next->second.used) { bddebug(2) << "combine next: " << next->first << " w/ " << it->first << " " << "\n"; it->second.size += next->second.size; regions.erase(next); } bddebug(1) << "end free_region " << start << "\n"; validate(); } void grow_total_size(uint32_t new_total_size) { bddebug(1) << "begin grow_total_size " << new_total_size << "\n"; validate(); internal_assert(new_total_size > total_size); auto it = regions.rbegin(); const uint32_t start = it->first; Region &r = it->second; uint32_t r_end = start + r.size; internal_assert(r_end == total_size); uint32_t delta = new_total_size - r_end; if (r.used) { // Add a free region after the last one regions[r_end] = {delta, false}; } else { // Just extend the last (free) region r.size += delta; } // bookkeeping total_size = new_total_size; bddebug(1) << "end grow_total_size " << new_total_size << "\n"; validate(); } void validate() const { internal_assert(total_size > 0); #if (BDMALLOC_DEBUG_LEVEL >= 1) || (WASM_DEBUG_LEVEL >= 1) uint32_t prev_end = 0; bool prev_used = false; for (auto it : regions) { const uint32_t start = it.first; const Region &r = it.second; bddebug(2) << "R: " << start << ".." 
<< (start + r.size - 1) << "," << r.used << "\n"; wassert(start == prev_end) << "start " << start << " prev_end " << prev_end << "\n"; // it's OK to have two used regions in a row, but not two free ones wassert(!(!prev_used && !r.used)); prev_end = start + r.size; prev_used = r.used; } wassert(prev_end == total_size) << "prev_end " << prev_end << " total_size " << total_size << "\n"; bddebug(2) << "\n"; #endif } }; // --------------------- // General Wasm helpers // --------------------- using wasm32_ptr_t = int32_t; const wasm32_ptr_t kMagicJitUserContextValue = -1; // TODO: vector codegen can underead allocated buffers; we need to deliberately // allocate extra and return a pointer partway in to avoid out-of-bounds access // failures. https://github.com/halide/Halide/issues/3738 constexpr size_t kExtraMallocSlop = 32; std::vector compile_to_wasm(const Module &module, const std::string &fn_name) { static std::mutex link_lock; std::lock_guard lock(link_lock); llvm::LLVMContext context; std::unique_ptr fn_module; // Default wasm stack size is ~64k, but schedules with lots of // alloca usage (heavily inlined, or tracing enabled) can blow thru // this, which crashes in amusing ways, so ask for extra stack space // for the alloca usage. size_t stack_size = 65536; { std::unique_ptr cg(new_CodeGen_WebAssembly(module.target())); cg->set_context(context); fn_module = cg->compile(module); stack_size += cg->get_requested_alloca_total(); } stack_size = align_up(stack_size); wdebug(1) << "Requesting stack size of " << stack_size << "\n"; std::unique_ptr llvm_module = link_with_wasm_jit_runtime(&context, module.target(), std::move(fn_module)); llvm::SmallVector object; llvm::raw_svector_ostream object_stream(object); compile_llvm_module_to_object(*llvm_module, object_stream); // TODO: surely there's a better way that doesn't require spooling things // out to temp files TemporaryFile obj_file("", ".o"); write_entire_file(obj_file.pathname(), object.data(), object.size()); #if WASM_DEBUG_LEVEL obj_file.detach(); wdebug(1) << "Dumping obj_file to " << obj_file.pathname() << "\n"; #endif TemporaryFile wasm_output("", ".wasm"); std::string lld_arg_strs[] = { "HalideJITLinker", #if LLVM_VERSION >= 170 "-flavor", "wasm", #endif // For debugging purposes: // "--verbose", // "-error-limit=0", // "--print-gc-sections", "--export=__heap_base", "--allow-undefined", "-zstack-size=" + std::to_string(stack_size), obj_file.pathname(), "--entry=" + fn_name, "-o", wasm_output.pathname() }; constexpr int c = sizeof(lld_arg_strs) / sizeof(lld_arg_strs[0]); const char *lld_args[c]; for (int i = 0; i < c; ++i) { lld_args[i] = lld_arg_strs[i].c_str(); } #if LLVM_VERSION >= 170 // lld will temporarily hijack the signal handlers to ensure that temp files get cleaned up, // but rather than preserving custom handlers in place, it restores the default handlers. // This conflicts with some of our testing infrastructure, which relies on a SIGABRT handler // set at global-ctor time to stay set. Therefore we'll save and restore this ourselves. // Note that we must restore it before using internal_error (and also on the non-error path). auto old_abort_handler = std::signal(SIGABRT, SIG_DFL); llvm::ArrayRef args(lld_args, lld_args + c); auto r = lld::lldMain(args, llvm::outs(), llvm::errs(), {{lld::Wasm, &lld::wasm::link}}); // TODO: https://reviews.llvm.org/D119049 suggests that you should call exitLld() // if canRunAgain is false, but doing do fails with SIGABRT rather than exit(1), which // breaks our error tests. 
For now, just following the old practice. // // if (!r.canRunAgain) { // std::cerr << "lld::wasm::link failed catastrophically, exiting with: " << r.retCode << "\n"; // lld::exitLld(r.retCode); // Exit now, can't re-execute again. // } if (r.retCode != 0) { std::signal(SIGABRT, old_abort_handler); internal_error << "lld::wasm::link failed with: " << r.retCode << "\n"; } std::signal(SIGABRT, old_abort_handler); #else // lld will temporarily hijack the signal handlers to ensure that temp files get cleaned up, // but rather than preserving custom handlers in place, it restores the default handlers. // This conflicts with some of our testing infrastructure, which relies on a SIGABRT handler // set at global-ctor time to stay set. Therefore we'll save and restore this ourselves. // Note that we must restore it before using internal_error (and also on the non-error path). auto old_abort_handler = std::signal(SIGABRT, SIG_DFL); if (!lld::wasm::link(lld_args, llvm::outs(), llvm::errs(), /*canExitEarly*/ false, /*disableOutput*/ false)) { std::signal(SIGABRT, old_abort_handler); internal_error << "lld::wasm::link failed\n"; } std::signal(SIGABRT, old_abort_handler); #endif #if WASM_DEBUG_LEVEL wasm_output.detach(); wdebug(1) << "Dumping linked wasm to " << wasm_output.pathname() << "\n"; #endif return read_entire_file(wasm_output.pathname()); } // dynamic_type_dispatch is a utility for functors that want to be able // to dynamically dispatch a halide_type_t to type-specialized code. // To use it, a functor must be a *templated* class, e.g. // // template class MyFunctor { int operator()(arg1, arg2...); }; // // dynamic_type_dispatch() is called with a halide_type_t as the first argument, // followed by the arguments to the Functor's operator(): // // auto result = dynamic_type_dispatch(some_halide_type, arg1, arg2); // // Note that this means that the functor must be able to instantiate its // operator() for all the Halide scalar types; it also means that all those // variants *will* be instantiated (increasing code size), so this approach // should only be used when strictly necessary. // clang-format off template class Functor, typename... Args> auto dynamic_type_dispatch(const halide_type_t &type, Args &&... 
args) -> decltype(std::declval>()(std::forward(args)...)) { #define HANDLE_CASE(CODE, BITS, TYPE) \ case halide_type_t(CODE, BITS).as_u32(): \ return Functor()(std::forward(args)...); switch (type.element_of().as_u32()) { HANDLE_CASE(halide_type_bfloat, 16, bfloat16_t) HANDLE_CASE(halide_type_float, 16, float16_t) HANDLE_CASE(halide_type_float, 32, float) HANDLE_CASE(halide_type_float, 64, double) HANDLE_CASE(halide_type_int, 8, int8_t) HANDLE_CASE(halide_type_int, 16, int16_t) HANDLE_CASE(halide_type_int, 32, int32_t) HANDLE_CASE(halide_type_int, 64, int64_t) HANDLE_CASE(halide_type_uint, 1, bool) HANDLE_CASE(halide_type_uint, 8, uint8_t) HANDLE_CASE(halide_type_uint, 16, uint16_t) HANDLE_CASE(halide_type_uint, 32, uint32_t) HANDLE_CASE(halide_type_uint, 64, uint64_t) HANDLE_CASE(halide_type_handle, 64, void *) default: internal_error; using ReturnType = decltype(std::declval>()(std::forward(args)...)); return ReturnType(); } #undef HANDLE_CASE } // clang-format on // ----------------------- // extern callback helper code // ----------------------- struct ExternArgType { halide_type_t type; bool is_void; bool is_buffer; bool is_ucon; }; using TrampolineFn = void (*)(void **); bool build_extern_arg_types(const std::string &fn_name, const std::map &jit_externs, const JITModule &trampolines, TrampolineFn &trampoline_fn_out, std::vector &arg_types_out) { const auto it = jit_externs.find(fn_name); if (it == jit_externs.end()) { wdebug(1) << "Extern symbol not found in JIT Externs: " << fn_name << "\n"; return false; } const ExternSignature &sig = it->second.extern_c_function().signature(); const auto &tramp_it = trampolines.exports().find(fn_name + kTrampolineSuffix); if (tramp_it == trampolines.exports().end()) { wdebug(1) << "Extern symbol not found in trampolines: " << fn_name << "\n"; return false; } trampoline_fn_out = (TrampolineFn)tramp_it->second.address; const size_t arg_count = sig.arg_types().size(); std::vector arg_types; arg_types.reserve(arg_count + 1); if (sig.is_void_return()) { const bool is_void = true; const bool is_buffer = false; const bool is_ucon = false; // Specifying a type here with bits == 0 should trigger a proper 'void' return type arg_types.emplace_back(ExternArgType{{halide_type_int, 0, 0}, is_void, is_buffer, is_ucon}); } else { const Type &t = sig.ret_type(); const bool is_void = false; const bool is_buffer = (t == type_of()); const bool is_ucon = false; user_assert(t.lanes() == 1) << "Halide Extern functions cannot return vector values."; user_assert(!is_buffer) << "Halide Extern functions cannot return halide_buffer_t."; // TODO: the assertion below could be removed if we are ever able to marshal int64 // values across the barrier, but that may require wasm codegen changes that are tricky. user_assert(!(t.is_handle() && !is_buffer)) << "Halide Extern functions cannot return arbitrary pointers as arguments."; user_assert(!(t.is_int_or_uint() && t.bits() == 64)) << "Halide Extern functions cannot accept 64-bit values as arguments."; arg_types.emplace_back(ExternArgType{t, is_void, is_buffer, is_ucon}); } for (size_t i = 0; i < arg_count; ++i) { const Type &t = sig.arg_types()[i]; const bool is_void = false; const bool is_buffer = (t == type_of()); // Since arbitrary pointer args aren't legal for extern calls, // assume that anything that is a void* is a user context. 
const bool is_ucon = (t == type_of()); user_assert(t.lanes() == 1) << "Halide Extern functions cannot accept vector values as arguments."; // TODO: the assertion below could be removed if we are ever able to marshal int64 // values across the barrier, but that may require wasm codegen changes that are tricky. user_assert(!(t.is_handle() && !is_buffer)) << "Halide Extern functions cannot accept arbitrary pointers as arguments."; user_assert(!(t.is_int_or_uint() && t.bits() == 64)) << "Halide Extern functions cannot accept 64-bit values as arguments."; arg_types.emplace_back(ExternArgType{t, is_void, is_buffer, is_ucon}); } arg_types_out = std::move(arg_types); return true; } // ----------------------- // halide_buffer_t <-> wasm_halide_buffer_t helpers // ----------------------- struct wasm_halide_buffer_t { uint64_t device; wasm32_ptr_t device_interface; // halide_device_interface_t* wasm32_ptr_t host; // uint8_t* uint64_t flags; halide_type_t type; int32_t dimensions; wasm32_ptr_t dim; // halide_dimension_t* wasm32_ptr_t padding; // always zero }; static_assert(sizeof(halide_type_t) == 4, "halide_type_t"); static_assert(sizeof(halide_dimension_t) == 16, "halide_dimension_t"); static_assert(sizeof(wasm_halide_buffer_t) == 40, "wasm_halide_buffer_t"); #if WITH_WABT std::string to_string(const wabt::MemoryStream &m) { wabt::OutputBuffer &o = const_cast(&m)->output_buffer(); return std::string((const char *)o.data.data(), o.data.size()); } struct WabtContext { JITUserContext *const jit_user_context; wabt::interp::Memory &memory; BDMalloc &bdmalloc; explicit WabtContext(JITUserContext *jit_user_context, wabt::interp::Memory &memory, BDMalloc &bdmalloc) : jit_user_context(jit_user_context), memory(memory), bdmalloc(bdmalloc) { } WabtContext(const WabtContext &) = delete; WabtContext(WabtContext &&) = delete; void operator=(const WabtContext &) = delete; void operator=(WabtContext &&) = delete; }; WabtContext &get_wabt_context(wabt::interp::Thread &thread) { void *host_info = thread.GetCallerInstance()->host_info(); wassert(host_info); return *(WabtContext *)host_info; } uint8_t *get_wasm_memory_base(WabtContext &wabt_context) { return wabt_context.memory.UnsafeData(); } wasm32_ptr_t wabt_malloc(WabtContext &wabt_context, size_t size) { wasm32_ptr_t p = wabt_context.bdmalloc.alloc_region(size); if (!p) { constexpr int kWasmPageSize = 65536; const int32_t pages_needed = (size + kWasmPageSize - 1) / 65536; wdebug(1) << "attempting to grow by pages: " << pages_needed << "\n"; wabt::Result r = wabt_context.memory.Grow(pages_needed); internal_assert(Succeeded(r)) << "Memory::Grow() failed"; wabt_context.bdmalloc.grow_total_size(wabt_context.memory.ByteSize()); p = wabt_context.bdmalloc.alloc_region(size); } wdebug(2) << "allocation of " << size << " at: " << p << "\n"; return p; } void wabt_free(WabtContext &wabt_context, wasm32_ptr_t ptr) { wdebug(2) << "freeing ptr at: " << ptr << "\n"; wabt_context.bdmalloc.free_region(ptr); } // Some internal code can call halide_error(null, ...), so this needs to be resilient to that. // Callers must expect null and not crash. 
JITUserContext *get_jit_user_context(WabtContext &wabt_context, const wabt::interp::Value &arg) { int32_t ucon_magic = arg.Get(); if (ucon_magic == 0) { return nullptr; } wassert(ucon_magic == kMagicJitUserContextValue); JITUserContext *jit_user_context = wabt_context.jit_user_context; wassert(jit_user_context); return jit_user_context; } void dump_hostbuf(WabtContext &wabt_context, const halide_buffer_t *buf, const std::string &label) { #if WASM_DEBUG_LEVEL >= 2 const halide_dimension_t *dim = buf->dim; const uint8_t *host = buf->host; wdebug(1) << label << " = " << (const void *)buf << " = {\n"; wdebug(1) << " device = " << buf->device << "\n"; wdebug(1) << " device_interface = " << buf->device_interface << "\n"; wdebug(1) << " host = " << (const void *)host << " = {\n"; if (host) { wdebug(1) << " " << (int)host[0] << ", " << (int)host[1] << ", " << (int)host[2] << ", " << (int)host[3] << "...\n"; } wdebug(1) << " }\n"; wdebug(1) << " flags = " << buf->flags << "\n"; wdebug(1) << " type = " << (int)buf->type.code << "," << (int)buf->type.bits << "," << buf->type.lanes << "\n"; wdebug(1) << " dimensions = " << buf->dimensions << "\n"; wdebug(1) << " dim = " << (void *)buf->dim << " = {\n"; for (int i = 0; i < buf->dimensions; i++) { const auto &d = dim[i]; wdebug(1) << " {" << d.min << "," << d.extent << "," << d.stride << "," << d.flags << "},\n"; } wdebug(1) << " }\n"; wdebug(1) << " padding = " << buf->padding << "\n"; wdebug(1) << "}\n"; #endif } void dump_wasmbuf(WabtContext &wabt_context, wasm32_ptr_t buf_ptr, const std::string &label) { #if WASM_DEBUG_LEVEL >= 2 wassert(buf_ptr); uint8_t *base = get_wasm_memory_base(wabt_context); wasm_halide_buffer_t *buf = (wasm_halide_buffer_t *)(base + buf_ptr); halide_dimension_t *dim = buf->dim ? (halide_dimension_t *)(base + buf->dim) : nullptr; uint8_t *host = buf->host ? (base + buf->host) : nullptr; wdebug(1) << label << " = " << buf_ptr << " -> " << (void *)buf << " = {\n"; wdebug(1) << " device = " << buf->device << "\n"; wdebug(1) << " device_interface = " << buf->device_interface << "\n"; wdebug(1) << " host = " << buf->host << " -> " << (void *)host << " = {\n"; if (host) { wdebug(1) << " " << (int)host[0] << ", " << (int)host[1] << ", " << (int)host[2] << ", " << (int)host[3] << "...\n"; } wdebug(1) << " }\n"; wdebug(1) << " flags = " << buf->flags << "\n"; wdebug(1) << " type = " << (int)buf->type.code << "," << (int)buf->type.bits << "," << buf->type.lanes << "\n"; wdebug(1) << " dimensions = " << buf->dimensions << "\n"; wdebug(1) << " dim = " << buf->dim << " -> " << (void *)dim << " = {\n"; for (int i = 0; i < buf->dimensions; i++) { const auto &d = dim[i]; wdebug(1) << " {" << d.min << "," << d.extent << "," << d.stride << "," << d.flags << "},\n"; } wdebug(1) << " }\n"; wdebug(1) << " padding = " << buf->padding << "\n"; wdebug(1) << "}\n"; #endif } // Given a halide_buffer_t on the host, allocate a wasm_halide_buffer_t in wasm // memory space and copy all relevant data. The resulting buf is laid out in // contiguous memory, and can be free with a single free(). 
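// Layout sketch (offsets follow the 32-byte alignment used below; e.g. for a
// 2-D buffer): bytes [0, 40) hold the wasm_halide_buffer_t, [40, 72) hold two
// halide_dimension_t entries, [72, 96) is padding so the host pointer is
// 32-byte aligned, and the host data starts at offset 96.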
wasm32_ptr_t hostbuf_to_wasmbuf(WabtContext &wabt_context, const halide_buffer_t *src) { static_assert(sizeof(halide_type_t) == 4, "halide_type_t"); static_assert(sizeof(halide_dimension_t) == 16, "halide_dimension_t"); static_assert(sizeof(wasm_halide_buffer_t) == 40, "wasm_halide_buffer_t"); wdebug(2) << "\nhostbuf_to_wasmbuf:\n"; if (!src) { return 0; } dump_hostbuf(wabt_context, src, "src"); wassert(src->device == 0); wassert(src->device_interface == nullptr); // Assume our malloc() has everything 32-byte aligned, // and insert enough padding for host to also be 32-byte aligned. const size_t dims_size_in_bytes = sizeof(halide_dimension_t) * src->dimensions; const size_t dims_offset = sizeof(wasm_halide_buffer_t); const size_t mem_needed_base = sizeof(wasm_halide_buffer_t) + dims_size_in_bytes; const size_t host_offset = align_up(mem_needed_base); const size_t host_size_in_bytes = src->size_in_bytes(); const size_t mem_needed = host_offset + host_size_in_bytes; const wasm32_ptr_t dst_ptr = wabt_malloc(wabt_context, mem_needed); wassert(dst_ptr); uint8_t *base = get_wasm_memory_base(wabt_context); wasm_halide_buffer_t *dst = (wasm_halide_buffer_t *)(base + dst_ptr); dst->device = 0; dst->device_interface = 0; dst->host = src->host ? (dst_ptr + host_offset) : 0; dst->flags = src->flags; dst->type = src->type; dst->dimensions = src->dimensions; dst->dim = src->dimensions ? (dst_ptr + dims_offset) : 0; dst->padding = 0; if (src->dim) { memcpy(base + dst->dim, src->dim, dims_size_in_bytes); } if (src->host) { memcpy(base + dst->host, src->host, host_size_in_bytes); } dump_wasmbuf(wabt_context, dst_ptr, "dst"); return dst_ptr; } // Given a pointer to a wasm_halide_buffer_t in wasm memory space, // allocate a Buffer<> on the host and copy all relevant data. void wasmbuf_to_hostbuf(WabtContext &wabt_context, wasm32_ptr_t src_ptr, Halide::Runtime::Buffer<> &dst) { wdebug(2) << "\nwasmbuf_to_hostbuf:\n"; dump_wasmbuf(wabt_context, src_ptr, "src"); wassert(src_ptr); uint8_t *base = get_wasm_memory_base(wabt_context); wasm_halide_buffer_t *src = (wasm_halide_buffer_t *)(base + src_ptr); wassert(src->device == 0); wassert(src->device_interface == 0); halide_buffer_t dst_tmp; dst_tmp.device = 0; dst_tmp.device_interface = nullptr; dst_tmp.host = nullptr; // src->host ? (base + src->host) : nullptr; dst_tmp.flags = src->flags; dst_tmp.type = src->type; dst_tmp.dimensions = src->dimensions; dst_tmp.dim = src->dim ? (halide_dimension_t *)(base + src->dim) : nullptr; dst_tmp.padding = nullptr; dump_hostbuf(wabt_context, &dst_tmp, "dst_tmp"); dst = Halide::Runtime::Buffer<>(dst_tmp); if (src->host) { // Don't use dst.copy(); it can tweak strides in ways that matter. dst.allocate(); const size_t host_size_in_bytes = dst.raw_buffer()->size_in_bytes(); memcpy(dst.raw_buffer()->host, base + src->host, host_size_in_bytes); } dump_hostbuf(wabt_context, dst.raw_buffer(), "dst"); } // Given a wasm_halide_buffer_t, copy possibly-changed data into a halide_buffer_t. // Both buffers are asserted to match in type and dimensions. 
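// (The dim[] data is copied back as well as the host data because, e.g., a
// bounds-query call -- one made with a null host pointer -- legitimately
// rewrites the output buffer's mins and extents.)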
void copy_wasmbuf_to_existing_hostbuf(WabtContext &wabt_context, wasm32_ptr_t src_ptr, halide_buffer_t *dst) { wassert(src_ptr && dst); wdebug(2) << "\ncopy_wasmbuf_to_existing_hostbuf:\n"; dump_wasmbuf(wabt_context, src_ptr, "src"); uint8_t *base = get_wasm_memory_base(wabt_context); wasm_halide_buffer_t *src = (wasm_halide_buffer_t *)(base + src_ptr); wassert(src->device == 0); wassert(src->device_interface == 0); wassert(src->dimensions == dst->dimensions); wassert(src->type == dst->type); dump_hostbuf(wabt_context, dst, "dst_pre"); if (src->dimensions) { memcpy(dst->dim, base + src->dim, sizeof(halide_dimension_t) * src->dimensions); } if (src->host) { size_t host_size_in_bytes = dst->size_in_bytes(); memcpy(dst->host, base + src->host, host_size_in_bytes); } dst->device = 0; dst->device_interface = nullptr; dst->flags = src->flags; dump_hostbuf(wabt_context, dst, "dst_post"); } // Given a halide_buffer_t, copy possibly-changed data into a wasm_halide_buffer_t. // Both buffers are asserted to match in type and dimensions. void copy_hostbuf_to_existing_wasmbuf(WabtContext &wabt_context, const halide_buffer_t *src, wasm32_ptr_t dst_ptr) { wassert(src && dst_ptr); wdebug(1) << "\ncopy_hostbuf_to_existing_wasmbuf:\n"; dump_hostbuf(wabt_context, src, "src"); uint8_t *base = get_wasm_memory_base(wabt_context); wasm_halide_buffer_t *dst = (wasm_halide_buffer_t *)(base + dst_ptr); wassert(src->device == 0); wassert(src->device_interface == 0); wassert(src->dimensions == dst->dimensions); wassert(src->type == dst->type); dump_wasmbuf(wabt_context, dst_ptr, "dst_pre"); if (src->dimensions) { memcpy(base + dst->dim, src->dim, sizeof(halide_dimension_t) * src->dimensions); } if (src->host) { size_t host_size_in_bytes = src->size_in_bytes(); memcpy(base + dst->host, src->host, host_size_in_bytes); } dst->device = 0; dst->device_interface = 0; dst->flags = src->flags; dump_wasmbuf(wabt_context, dst_ptr, "dst_post"); } // -------------------------------------------------- // Helpers for converting to/from wabt::interp::Value // -------------------------------------------------- template struct LoadValue { inline wabt::interp::Value operator()(const void *src) { const T val = *(const T *)(src); return wabt::interp::Value::Make(val); } }; template<> inline wabt::interp::Value LoadValue::operator()(const void *src) { // WABT doesn't do bools. Stash as u8 for now. const uint8_t val = *(const uint8_t *)src; return wabt::interp::Value::Make(val); } template<> inline wabt::interp::Value LoadValue::operator()(const void *src) { // Halide 'handle' types are always uint64, even on 32-bit systems const uint64_t val = *(const uint64_t *)src; return wabt::interp::Value::Make(val); } template<> inline wabt::interp::Value LoadValue::operator()(const void *src) { const uint16_t val = *(const uint16_t *)src; return wabt::interp::Value::Make(val); } template<> inline wabt::interp::Value LoadValue::operator()(const void *src) { const uint16_t val = *(const uint16_t *)src; return wabt::interp::Value::Make(val); } inline wabt::interp::Value load_value(const Type &t, const void *src) { return dynamic_type_dispatch(t, src); } template inline wabt::interp::Value load_value(const T &val) { return LoadValue()(&val); } // ----- template struct StoreValue { inline void operator()(const wabt::interp::Value &src, void *dst) { *(T *)dst = src.Get(); } }; template<> inline void StoreValue::operator()(const wabt::interp::Value &src, void *dst) { // WABT doesn't do bools. Stash as u8 for now. 
*(uint8_t *)dst = src.Get(); } template<> inline void StoreValue::operator()(const wabt::interp::Value &src, void *dst) { // Halide 'handle' types are always uint64, even on 32-bit systems *(uint64_t *)dst = src.Get(); } template<> inline void StoreValue::operator()(const wabt::interp::Value &src, void *dst) { *(uint16_t *)dst = src.Get(); } template<> inline void StoreValue::operator()(const wabt::interp::Value &src, void *dst) { *(uint16_t *)dst = src.Get(); } inline void store_value(const Type &t, const wabt::interp::Value &src, void *dst) { dynamic_type_dispatch(t, src, dst); } template inline void store_value(const wabt::interp::Value &src, T *dst) { StoreValue()(src, dst); } // -------------------------------------------------- // Host Callback Functions // -------------------------------------------------- template wabt::Result wabt_posix_math_1(wabt::interp::Thread &thread, const wabt::interp::Values &args, wabt::interp::Values &results, wabt::interp::Trap::Ptr *trap) { wassert(args.size() == 1); const T in = args[0].Get(); const T out = some_func(in); results[0] = wabt::interp::Value::Make(out); return wabt::Result::Ok; } template wabt::Result wabt_posix_math_2(wabt::interp::Thread &thread, const wabt::interp::Values &args, wabt::interp::Values &results, wabt::interp::Trap::Ptr *trap) { wassert(args.size() == 2); const T in1 = args[0].Get(); const T in2 = args[1].Get(); const T out = some_func(in1, in2); results[0] = wabt::interp::Value::Make(out); return wabt::Result::Ok; } #define WABT_HOST_CALLBACK(x) \ wabt::Result wabt_jit_##x##_callback(wabt::interp::Thread &thread, \ const wabt::interp::Values &args, \ wabt::interp::Values &results, \ wabt::interp::Trap::Ptr *trap) #define WABT_HOST_CALLBACK_UNIMPLEMENTED(x) \ WABT_HOST_CALLBACK(x) { \ internal_error << "WebAssembly JIT does not yet support the " #x "() call."; \ return wabt::Result::Ok; \ } WABT_HOST_CALLBACK(__cxa_atexit) { // nothing return wabt::Result::Ok; } WABT_HOST_CALLBACK(__extendhfsf2) { const uint16_t in = args[0].Get(); const float out = (float)float16_t::make_from_bits(in); results[0] = wabt::interp::Value::Make(out); return wabt::Result::Ok; } WABT_HOST_CALLBACK(__truncsfhf2) { const float in = args[0].Get(); const uint16_t out = float16_t(in).to_bits(); results[0] = wabt::interp::Value::Make(out); return wabt::Result::Ok; } WABT_HOST_CALLBACK(abort) { abort(); return wabt::Result::Ok; } WABT_HOST_CALLBACK_UNIMPLEMENTED(fclose) WABT_HOST_CALLBACK_UNIMPLEMENTED(fileno) WABT_HOST_CALLBACK_UNIMPLEMENTED(fopen) WABT_HOST_CALLBACK(free) { WabtContext &wabt_context = get_wabt_context(thread); wasm32_ptr_t p = args[0].Get(); if (p) { p -= kExtraMallocSlop; } wabt_free(wabt_context, p); return wabt::Result::Ok; } WABT_HOST_CALLBACK_UNIMPLEMENTED(fwrite) WABT_HOST_CALLBACK(getenv) { WabtContext &wabt_context = get_wabt_context(thread); const int32_t s = args[0].Get(); uint8_t *base = get_wasm_memory_base(wabt_context); char *e = getenv((char *)base + s); // TODO: this string is leaked if (e) { wasm32_ptr_t r = wabt_malloc(wabt_context, strlen(e) + 1); strcpy((char *)base + r, e); results[0] = wabt::interp::Value::Make(r); } else { results[0] = wabt::interp::Value::Make(0); } return wabt::Result::Ok; } WABT_HOST_CALLBACK(halide_print) { WabtContext &wabt_context = get_wabt_context(thread); wassert(args.size() == 2); JITUserContext *jit_user_context = get_jit_user_context(wabt_context, args[0]); const int32_t str_address = args[1].Get(); uint8_t *p = get_wasm_memory_base(wabt_context); const char *str = (const char *)p 
+ str_address; if (jit_user_context && jit_user_context->handlers.custom_print != nullptr) { (*jit_user_context->handlers.custom_print)(jit_user_context, str); } else { std::cout << str; } return wabt::Result::Ok; } WABT_HOST_CALLBACK(halide_trace_helper) { WabtContext &wabt_context = get_wabt_context(thread); wassert(args.size() == 12); uint8_t *base = get_wasm_memory_base(wabt_context); JITUserContext *jit_user_context = get_jit_user_context(wabt_context, args[0]); const wasm32_ptr_t func_name_ptr = args[1].Get(); const wasm32_ptr_t value_ptr = args[2].Get(); const wasm32_ptr_t coordinates_ptr = args[3].Get(); const int type_code = args[4].Get(); const int type_bits = args[5].Get(); const int type_lanes = args[6].Get(); const int trace_code = args[7].Get(); const int parent_id = args[8].Get(); const int value_index = args[9].Get(); const int dimensions = args[10].Get(); const wasm32_ptr_t trace_tag_ptr = args[11].Get(); wassert(dimensions >= 0 && dimensions < 1024); // not a hard limit, just a sanity check halide_trace_event_t event; event.func = (const char *)(base + func_name_ptr); event.value = value_ptr ? ((void *)(base + value_ptr)) : nullptr; event.coordinates = coordinates_ptr ? ((int32_t *)(base + coordinates_ptr)) : nullptr; event.trace_tag = (const char *)(base + trace_tag_ptr); event.type.code = (halide_type_code_t)type_code; event.type.bits = (uint8_t)type_bits; event.type.lanes = (uint16_t)type_lanes; event.event = (halide_trace_event_code_t)trace_code; event.parent_id = parent_id; event.value_index = value_index; event.dimensions = dimensions; int32_t result = 0; if (jit_user_context && jit_user_context->handlers.custom_trace != nullptr) { result = (*jit_user_context->handlers.custom_trace)(jit_user_context, &event); } else { debug(0) << "Dropping trace event due to lack of trace handler.\n"; } results[0] = wabt::interp::Value::Make(result); return wabt::Result::Ok; } WABT_HOST_CALLBACK(halide_error) { WabtContext &wabt_context = get_wabt_context(thread); wassert(args.size() == 2); JITUserContext *jit_user_context = get_jit_user_context(wabt_context, args[0]); const int32_t str_address = args[1].Get(); uint8_t *p = get_wasm_memory_base(wabt_context); const char *str = (const char *)p + str_address; if (jit_user_context && jit_user_context->handlers.custom_error != nullptr) { (*jit_user_context->handlers.custom_error)(jit_user_context, str); } else { halide_runtime_error << str; } return wabt::Result::Ok; } WABT_HOST_CALLBACK(malloc) { WabtContext &wabt_context = get_wabt_context(thread); size_t size = args[0].Get() + kExtraMallocSlop; wasm32_ptr_t p = wabt_malloc(wabt_context, size); if (p) { p += kExtraMallocSlop; } results[0] = wabt::interp::Value::Make(p); return wabt::Result::Ok; } WABT_HOST_CALLBACK(memcpy) { WabtContext &wabt_context = get_wabt_context(thread); const int32_t dst = args[0].Get(); const int32_t src = args[1].Get(); const int32_t n = args[2].Get(); uint8_t *base = get_wasm_memory_base(wabt_context); memcpy(base + dst, base + src, n); results[0] = wabt::interp::Value::Make(dst); return wabt::Result::Ok; } WABT_HOST_CALLBACK(memmove) { WabtContext &wabt_context = get_wabt_context(thread); const int32_t dst = args[0].Get(); const int32_t src = args[1].Get(); const int32_t n = args[2].Get(); uint8_t *base = get_wasm_memory_base(wabt_context); memmove(base + dst, base + src, n); results[0] = wabt::interp::Value::Make(dst); return wabt::Result::Ok; } WABT_HOST_CALLBACK(memset) { WabtContext &wabt_context = get_wabt_context(thread); const int32_t s = 
args[0].Get(); const int32_t c = args[1].Get(); const int32_t n = args[2].Get(); uint8_t *base = get_wasm_memory_base(wabt_context); memset(base + s, c, n); results[0] = wabt::interp::Value::Make(s); return wabt::Result::Ok; } WABT_HOST_CALLBACK(memcmp) { WabtContext &wabt_context = get_wabt_context(thread); const int32_t s1 = args[0].Get(); const int32_t s2 = args[1].Get(); const int32_t n = args[2].Get(); uint8_t *base = get_wasm_memory_base(wabt_context); const int32_t r = memcmp(base + s1, base + s2, n); results[0] = wabt::interp::Value::Make(r); return wabt::Result::Ok; } WABT_HOST_CALLBACK(strlen) { WabtContext &wabt_context = get_wabt_context(thread); const int32_t s = args[0].Get(); uint8_t *base = get_wasm_memory_base(wabt_context); int32_t r = strlen((char *)base + s); results[0] = wabt::interp::Value::Make(r); return wabt::Result::Ok; } WABT_HOST_CALLBACK_UNIMPLEMENTED(write) // -------------------------------------------------- // Host Callback Functions // -------------------------------------------------- wabt::Result extern_callback_wrapper(const std::vector &arg_types, TrampolineFn trampoline_fn, wabt::interp::Thread &thread, const wabt::interp::Values &args, wabt::interp::Values &results, wabt::interp::Trap::Ptr *trap) { WabtContext &wabt_context = get_wabt_context(thread); wassert(arg_types.size() >= 1); const size_t arg_types_len = arg_types.size() - 1; const ExternArgType &ret_type = arg_types[0]; // There's wasted space here, but that's ok. std::vector> buffers(arg_types_len); std::vector scalars(arg_types_len, 0); std::vector trampoline_args(arg_types_len, nullptr); for (size_t i = 0; i < arg_types_len; ++i) { const auto &a = arg_types[i + 1]; if (a.is_ucon) { // We have to special-case ucon because Halide considers it an int64 everywhere // (even for wasm, where pointers are int32), and trying to extract it as an // int64 from a Value that is int32 will assert-fail. In JIT mode the value // doesn't even matter (except for guarding that it is our predicted constant). wassert(args[i].Get() == 0 || args[i].Get() == kMagicJitUserContextValue); store_value(Int(32), args[i], &scalars[i]); trampoline_args[i] = &scalars[i]; } else if (a.is_buffer) { const wasm32_ptr_t buf_ptr = args[i].Get(); wasmbuf_to_hostbuf(wabt_context, buf_ptr, buffers[i]); trampoline_args[i] = buffers[i].raw_buffer(); } else { store_value(a.type, args[i], &scalars[i]); trampoline_args[i] = &scalars[i]; } } // The return value (if any) is always scalar. uint64_t ret_val = 0; const bool has_retval = !ret_type.is_void; internal_assert(!ret_type.is_buffer); if (has_retval) { trampoline_args.push_back(&ret_val); } (*trampoline_fn)(trampoline_args.data()); if (has_retval) { results[0] = dynamic_type_dispatch(ret_type.type, (void *)&ret_val); } // Progagate buffer data backwards. Note that for arbitrary extern functions, // we have no idea which buffers might be "input only", so we copy all data for all of them. 
for (size_t i = 0; i < arg_types_len; ++i) { const auto &a = arg_types[i + 1]; if (a.is_buffer) { const wasm32_ptr_t buf_ptr = args[i].Get(); copy_hostbuf_to_existing_wasmbuf(wabt_context, buffers[i], buf_ptr); } } return wabt::Result::Ok; } bool should_skip_extern_symbol(const std::string &name) { static std::set symbols = { "halide_print", "halide_error"}; return symbols.count(name) > 0; } wabt::interp::HostFunc::Ptr make_extern_callback(wabt::interp::Store &store, const std::map &jit_externs, const JITModule &trampolines, const wabt::interp::ImportDesc &import) { const std::string &fn_name = import.type.name; if (should_skip_extern_symbol(fn_name)) { wdebug(1) << "Skipping extern symbol: " << fn_name << "\n"; return wabt::interp::HostFunc::Ptr(); } TrampolineFn trampoline_fn; std::vector arg_types; if (!build_extern_arg_types(fn_name, jit_externs, trampolines, trampoline_fn, arg_types)) { return wabt::interp::HostFunc::Ptr(); } const auto callback_wrapper = [arg_types, trampoline_fn](wabt::interp::Thread &thread, const wabt::interp::Values &args, wabt::interp::Values &results, wabt::interp::Trap::Ptr *trap) -> wabt::Result { return extern_callback_wrapper(arg_types, trampoline_fn, thread, args, results, trap); }; auto func_type = *wabt::cast(import.type.type.get()); auto host_func = wabt::interp::HostFunc::New(store, func_type, callback_wrapper); return host_func; } wabt::Features calc_features(const Target &target) { wabt::Features f; if (!target.has_feature(Target::WasmMvpOnly)) { f.enable_sign_extension(); f.enable_sat_float_to_int(); } if (target.has_feature(Target::WasmSimd128)) { f.enable_simd(); } return f; } #endif // WITH_WABT #if WITH_V8 v8::Local NewLocalString(v8::Isolate *isolate, const char *s) { return v8::String::NewFromUtf8(isolate, s).ToLocalChecked(); } // ------------------------------ template struct StoreScalar { void operator()(const Local &context, const Local &val, void *slot) { *(T *)slot = (T)val->NumberValue(context).ToChecked(); } }; template<> inline void StoreScalar::operator()(const Local &context, const Local &val, void *slot) { float16_t f((double)val->NumberValue(context).ToChecked()); *(uint16_t *)slot = f.to_bits(); } template<> inline void StoreScalar::operator()(const Local &context, const Local &val, void *slot) { bfloat16_t b((double)val->NumberValue(context).ToChecked()); *(uint16_t *)slot = b.to_bits(); } template<> inline void StoreScalar::operator()(const Local &context, const Local &val, void *slot) { internal_error << "TODO: 64-bit slots aren't yet supported"; } template<> inline void StoreScalar::operator()(const Local &context, const Local &val, void *slot) { internal_error << "TODO: 64-bit slots aren't yet supported"; } template<> inline void StoreScalar::operator()(const Local &context, const Local &val, void *slot) { internal_error << "TODO: 64-bit slots aren't yet supported"; } void store_scalar(const Local &context, const Type &t, const Local &val, void *slot) { return dynamic_type_dispatch(t, context, val, slot); } template void store_scalar(const Local &context, const Local &val, void *slot) { return StoreScalar()(context, val, slot); } // ------------------------------ template struct LoadAndReturnScalar { void operator()(const Local &context, const void *slot, ReturnValue val) { val.Set(*(const T *)slot); } }; template<> inline void LoadAndReturnScalar::operator()(const Local &context, const void *slot, ReturnValue val) { float16_t f = float16_t::make_from_bits(*(const uint16_t *)slot); val.Set((double)f); } template<> inline 
void LoadAndReturnScalar::operator()(const Local &context, const void *slot, ReturnValue val) { bfloat16_t b = bfloat16_t::make_from_bits(*(const uint16_t *)slot); val.Set((double)b); } template<> inline void LoadAndReturnScalar::operator()(const Local &context, const void *slot, ReturnValue val) { internal_error << "TODO: 64-bit slots aren't yet supported"; } template<> inline void LoadAndReturnScalar::operator()(const Local &context, const void *slot, ReturnValue val) { internal_error << "TODO: 64-bit slots aren't yet supported"; } template<> inline void LoadAndReturnScalar::operator()(const Local &context, const void *slot, ReturnValue val) { internal_error << "TODO: 64-bit slots aren't yet supported"; } // ------------------------------ template struct LoadScalar { Local operator()(const Local &context, const void *val_ptr) { double val = *(const T *)(val_ptr); Isolate *isolate = context->GetIsolate(); return Number::New(isolate, val); } }; template<> inline Local LoadScalar::operator()(const Local &context, const void *val_ptr) { double val = (double)*(const uint16_t *)val_ptr; Isolate *isolate = context->GetIsolate(); return Number::New(isolate, val); } template<> inline Local LoadScalar::operator()(const Local &context, const void *val_ptr) { double val = (double)*(const uint16_t *)val_ptr; Isolate *isolate = context->GetIsolate(); return Number::New(isolate, val); } template<> inline Local LoadScalar::operator()(const Local &context, const void *val_ptr) { internal_error << "TODO: 64-bit slots aren't yet supported"; return Local(); } template<> inline Local LoadScalar::operator()(const Local &context, const void *val_ptr) { internal_error << "TODO: 64-bit slots aren't yet supported"; return Local(); } template<> inline Local LoadScalar::operator()(const Local &context, const void *val_ptr) { internal_error << "TODO: 64-bit slots aren't yet supported"; return Local(); } // ------------------------------ Local load_scalar(const Local &context, const Type &t, const void *val_ptr) { return dynamic_type_dispatch(t, context, val_ptr); } template Local load_scalar(const Local &context, const T &val) { return LoadScalar()(context, &val); } // --------------------------------- enum EmbedderDataSlots { // don't use slot 0 kWasmMemoryObject = 1, kBDMallocPtr, kHeapBase, kJitUserContext, kString_buffer, kString_grow, }; wasm32_ptr_t v8_WasmMemoryObject_malloc(const Local &context, size_t size) { Isolate *isolate = context->GetIsolate(); BDMalloc *bdmalloc = (BDMalloc *)context->GetAlignedPointerFromEmbedderData(kBDMallocPtr); if (!bdmalloc->inited()) { int32_t heap_base = context->GetEmbedderData(kHeapBase)->Int32Value(context).ToChecked(); Local memory_value = context->GetEmbedderData(kWasmMemoryObject).As(); // really a WasmMemoryObject Local buffer_string = context->GetEmbedderData(kString_buffer).As(); Local wasm_memory = Local::Cast(memory_value->Get(context, buffer_string).ToLocalChecked()); wdebug(0) << "heap_base is: " << heap_base << "\n"; wdebug(0) << "initial memory size is: " << wasm_memory->ByteLength() << "\n"; bdmalloc->init(wasm_memory->ByteLength(), heap_base); } wasm32_ptr_t p = bdmalloc->alloc_region(size); if (!p) { Local memory_value = context->GetEmbedderData(kWasmMemoryObject).As(); // really a WasmMemoryObject constexpr int kWasmPageSize = 65536; const int32_t pages_needed = (size + kWasmPageSize - 1) / 65536; wdebug(0) << "attempting to grow by pages: " << pages_needed << "\n"; Local args[1] = {Integer::New(isolate, pages_needed)}; int32_t result = memory_value 
->Get(context, context->GetEmbedderData(kString_grow)) .ToLocalChecked() .As() ->CallAsFunction(context, memory_value, 1, args) .ToLocalChecked() ->Int32Value(context) .ToChecked(); wdebug(0) << "grow result: " << result << "\n"; internal_assert(result == (int)(bdmalloc->get_total_size() / kWasmPageSize)); Local buffer_string = context->GetEmbedderData(kString_buffer).As(); Local wasm_memory = Local::Cast(memory_value->Get(context, buffer_string).ToLocalChecked()); wdebug(0) << "New ArrayBuffer size is: " << wasm_memory->ByteLength() << "\n"; bdmalloc->grow_total_size(wasm_memory->ByteLength()); p = bdmalloc->alloc_region(size); } wdebug(2) << "allocation of " << size << " at: " << p << "\n"; return p; } void v8_WasmMemoryObject_free(const Local &context, wasm32_ptr_t ptr) { wdebug(2) << "freeing ptr at: " << ptr << "\n"; BDMalloc *bdmalloc = (BDMalloc *)context->GetAlignedPointerFromEmbedderData(kBDMallocPtr); bdmalloc->free_region(ptr); } uint8_t *get_wasm_memory_base(const Local &context) { Local memory_value = context->GetEmbedderData(kWasmMemoryObject).As(); // really a WasmMemoryObject Local wasm_memory = Local::Cast(memory_value->Get(context, context->GetEmbedderData(kString_buffer)).ToLocalChecked()); std::shared_ptr backing = wasm_memory->GetBackingStore(); uint8_t *p = (uint8_t *)backing->Data(); return p; } void dump_hostbuf(const Local &context, const halide_buffer_t *buf, const std::string &label) { #if WASM_DEBUG_LEVEL >= 2 const halide_dimension_t *dim = buf->dim; const uint8_t *host = buf->host; wdebug(0) << label << " = " << (const void *)buf << " = {\n"; wdebug(0) << " device = " << buf->device << "\n"; wdebug(0) << " device_interface = " << buf->device_interface << "\n"; wdebug(0) << " host = " << (const void *)host << " = {\n"; if (host) { wdebug(0) << " " << (int)host[0] << ", " << (int)host[1] << ", " << (int)host[2] << ", " << (int)host[3] << "...\n"; } wdebug(0) << " }\n"; wdebug(0) << " flags = " << buf->flags << "\n"; wdebug(0) << " type = " << (int)buf->type.code << "," << (int)buf->type.bits << "," << buf->type.lanes << "\n"; wdebug(0) << " dimensions = " << buf->dimensions << "\n"; wdebug(0) << " dim = " << (void *)buf->dim << " = {\n"; for (int i = 0; i < buf->dimensions; i++) { const auto &d = dim[i]; wdebug(0) << " {" << d.min << "," << d.extent << "," << d.stride << "," << d.flags << "},\n"; } wdebug(0) << " }\n"; wdebug(0) << " padding = " << buf->padding << "\n"; wdebug(0) << "}\n"; #endif } void dump_wasmbuf(const Local &context, wasm32_ptr_t buf_ptr, const std::string &label) { #if WASM_DEBUG_LEVEL >= 2 internal_assert(buf_ptr); uint8_t *base = get_wasm_memory_base(context); wasm_halide_buffer_t *buf = (wasm_halide_buffer_t *)(base + buf_ptr); halide_dimension_t *dim = buf->dim ? (halide_dimension_t *)(base + buf->dim) : nullptr; uint8_t *host = buf->host ? 
(base + buf->host) : nullptr; wdebug(0) << label << " = " << buf_ptr << " -> " << (void *)buf << " = {\n"; wdebug(0) << " device = " << buf->device << "\n"; wdebug(0) << " device_interface = " << buf->device_interface << "\n"; wdebug(0) << " host = " << buf->host << " -> " << (void *)host << " = {\n"; if (host) { wdebug(0) << " " << (int)host[0] << ", " << (int)host[1] << ", " << (int)host[2] << ", " << (int)host[3] << "...\n"; } wdebug(0) << " }\n"; wdebug(0) << " flags = " << buf->flags << "\n"; wdebug(0) << " type = " << (int)buf->type.code << "," << (int)buf->type.bits << "," << buf->type.lanes << "\n"; wdebug(0) << " dimensions = " << buf->dimensions << "\n"; wdebug(0) << " dim = " << buf->dim << " -> " << (void *)dim << " = {\n"; for (int i = 0; i < buf->dimensions; i++) { const auto &d = dim[i]; wdebug(0) << " {" << d.min << "," << d.extent << "," << d.stride << "," << d.flags << "},\n"; } wdebug(0) << " }\n"; wdebug(0) << " padding = " << buf->padding << "\n"; wdebug(0) << "}\n"; #endif } // Given a halide_buffer_t on the host, allocate a wasm_halide_buffer_t in wasm // memory space and copy all relevant data. The resulting buf is laid out in // contiguous memory, and can be free with a single free(). wasm32_ptr_t hostbuf_to_wasmbuf(const Local &context, const halide_buffer_t *src) { static_assert(sizeof(halide_type_t) == 4, "halide_type_t"); static_assert(sizeof(halide_dimension_t) == 16, "halide_dimension_t"); static_assert(sizeof(wasm_halide_buffer_t) == 40, "wasm_halide_buffer_t"); wdebug(0) << "\nhostbuf_to_wasmbuf:\n"; if (!src) { return 0; } dump_hostbuf(context, src, "src"); internal_assert(src->device == 0); internal_assert(src->device_interface == nullptr); // Assume our malloc() has everything 32-byte aligned, // and insert enough padding for host to also be 32-byte aligned. const size_t dims_size_in_bytes = sizeof(halide_dimension_t) * src->dimensions; const size_t dims_offset = sizeof(wasm_halide_buffer_t); const size_t mem_needed_base = sizeof(wasm_halide_buffer_t) + dims_size_in_bytes; const size_t host_offset = align_up(mem_needed_base); const size_t host_size_in_bytes = src->size_in_bytes(); const size_t mem_needed = host_offset + host_size_in_bytes; const wasm32_ptr_t dst_ptr = v8_WasmMemoryObject_malloc(context, mem_needed); internal_assert(dst_ptr); uint8_t *base = get_wasm_memory_base(context); wasm_halide_buffer_t *dst = (wasm_halide_buffer_t *)(base + dst_ptr); dst->device = 0; dst->device_interface = 0; dst->host = src->host ? (dst_ptr + host_offset) : 0; dst->flags = src->flags; dst->type = src->type; dst->dimensions = src->dimensions; dst->dim = src->dimensions ? (dst_ptr + dims_offset) : 0; dst->padding = 0; if (src->dim) { memcpy(base + dst->dim, src->dim, dims_size_in_bytes); } if (src->host) { memcpy(base + dst->host, src->host, host_size_in_bytes); } dump_wasmbuf(context, dst_ptr, "dst"); return dst_ptr; } // Given a pointer to a wasm_halide_buffer_t in wasm memory space, // allocate a Buffer<> on the host and copy all relevant data. void wasmbuf_to_hostbuf(const Local &context, wasm32_ptr_t src_ptr, Halide::Runtime::Buffer<> &dst) { wdebug(0) << "\nwasmbuf_to_hostbuf:\n"; dump_wasmbuf(context, src_ptr, "src"); internal_assert(src_ptr); uint8_t *base = get_wasm_memory_base(context); wasm_halide_buffer_t *src = (wasm_halide_buffer_t *)(base + src_ptr); internal_assert(src->device == 0); internal_assert(src->device_interface == 0); halide_buffer_t dst_tmp; dst_tmp.device = 0; dst_tmp.device_interface = nullptr; dst_tmp.host = nullptr; // src->host ? 
(base + src->host) : nullptr; dst_tmp.flags = src->flags; dst_tmp.type = src->type; dst_tmp.dimensions = src->dimensions; dst_tmp.dim = src->dim ? (halide_dimension_t *)(base + src->dim) : nullptr; dst_tmp.padding = nullptr; dump_hostbuf(context, &dst_tmp, "dst_tmp"); dst = Halide::Runtime::Buffer<>(dst_tmp); if (src->host) { // Don't use dst.copy(); it can tweak strides in ways that matter. dst.allocate(); const size_t host_size_in_bytes = dst.raw_buffer()->size_in_bytes(); memcpy(dst.raw_buffer()->host, base + src->host, host_size_in_bytes); } dump_hostbuf(context, dst.raw_buffer(), "dst"); } // Given a wasm_halide_buffer_t, copy possibly-changed data into a halide_buffer_t. // Both buffers are asserted to match in type and dimensions. void copy_wasmbuf_to_existing_hostbuf(const Local &context, wasm32_ptr_t src_ptr, halide_buffer_t *dst) { internal_assert(src_ptr && dst); wdebug(0) << "\ncopy_wasmbuf_to_existing_hostbuf:\n"; dump_wasmbuf(context, src_ptr, "src"); uint8_t *base = get_wasm_memory_base(context); wasm_halide_buffer_t *src = (wasm_halide_buffer_t *)(base + src_ptr); internal_assert(src->device == 0); internal_assert(src->device_interface == 0); internal_assert(src->dimensions == dst->dimensions); internal_assert(src->type == dst->type); dump_hostbuf(context, dst, "dst_pre"); if (src->dimensions) { memcpy(dst->dim, base + src->dim, sizeof(halide_dimension_t) * src->dimensions); } if (src->host) { size_t host_size_in_bytes = dst->size_in_bytes(); memcpy(dst->host, base + src->host, host_size_in_bytes); } dst->device = 0; dst->device_interface = nullptr; dst->flags = src->flags; dump_hostbuf(context, dst, "dst_post"); } // Given a halide_buffer_t, copy possibly-changed data into a wasm_halide_buffer_t. // Both buffers are asserted to match in type and dimensions. void copy_hostbuf_to_existing_wasmbuf(const Local &context, const halide_buffer_t *src, wasm32_ptr_t dst_ptr) { internal_assert(src && dst_ptr); wdebug(0) << "\ncopy_hostbuf_to_existing_wasmbuf:\n"; dump_hostbuf(context, src, "src"); uint8_t *base = get_wasm_memory_base(context); wasm_halide_buffer_t *dst = (wasm_halide_buffer_t *)(base + dst_ptr); internal_assert(src->device == 0); internal_assert(src->device_interface == nullptr); internal_assert(src->dimensions == dst->dimensions); internal_assert(src->type == dst->type); dump_wasmbuf(context, dst_ptr, "dst_pre"); if (src->dimensions) { memcpy(base + dst->dim, src->dim, sizeof(halide_dimension_t) * src->dimensions); } if (src->host) { size_t host_size_in_bytes = src->size_in_bytes(); memcpy(base + dst->host, src->host, host_size_in_bytes); } dst->device = 0; dst->device_interface = 0; dst->flags = src->flags; dump_wasmbuf(context, dst_ptr, "dst_post"); } // Some internal code can call halide_error(null, ...), so this needs to be resilient to that. // Callers must expect null and not crash. 
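// (In the V8 path, compiled wasm code only ever passes the
// kMagicJitUserContextValue sentinel; the real JITUserContext pointer is
// recovered from the Context's embedder-data slot kJitUserContext below.)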
JITUserContext *get_jit_user_context(const Local &context, const Local &arg) { int32_t ucon_magic = arg->Int32Value(context).ToChecked(); if (ucon_magic == 0) { return nullptr; } internal_assert(ucon_magic == kMagicJitUserContextValue); JITUserContext *jit_user_context = (JITUserContext *)context->GetAlignedPointerFromEmbedderData(kJitUserContext); internal_assert(jit_user_context); return jit_user_context; } void wasm_jit_halide_print_callback(const v8::FunctionCallbackInfo &args) { internal_assert(args.Length() == 2); Isolate *isolate = args.GetIsolate(); Local context = isolate->GetCurrentContext(); HandleScope scope(isolate); JITUserContext *jit_user_context = get_jit_user_context(context, args[0]); const int32_t str_address = args[1]->Int32Value(context).ToChecked(); uint8_t *p = get_wasm_memory_base(context); const char *str = (const char *)p + str_address; if (jit_user_context && jit_user_context->handlers.custom_print != nullptr) { (*jit_user_context->handlers.custom_print)(jit_user_context, str); debug(0) << str; } else { std::cout << str; } } void wasm_jit_halide_error_callback(const v8::FunctionCallbackInfo &args) { internal_assert(args.Length() == 2); Isolate *isolate = args.GetIsolate(); Local context = isolate->GetCurrentContext(); HandleScope scope(isolate); JITUserContext *jit_user_context = get_jit_user_context(context, args[0]); const int32_t str_address = args[1]->Int32Value(context).ToChecked(); uint8_t *p = get_wasm_memory_base(context); const char *str = (const char *)p + str_address; if (jit_user_context && jit_user_context->handlers.custom_error != nullptr) { (*jit_user_context->handlers.custom_error)(jit_user_context, str); } else { halide_runtime_error << str; } } void wasm_jit_halide_trace_helper_callback(const v8::FunctionCallbackInfo &args) { internal_assert(args.Length() == 12); Isolate *isolate = args.GetIsolate(); Local context = isolate->GetCurrentContext(); HandleScope scope(isolate); uint8_t *base = get_wasm_memory_base(context); JITUserContext *jit_user_context = get_jit_user_context(context, args[0]); const wasm32_ptr_t func_name_ptr = args[1]->Int32Value(context).ToChecked(); const wasm32_ptr_t value_ptr = args[2]->Int32Value(context).ToChecked(); const wasm32_ptr_t coordinates_ptr = args[3]->Int32Value(context).ToChecked(); const int type_code = args[4]->Int32Value(context).ToChecked(); const int type_bits = args[5]->Int32Value(context).ToChecked(); const int type_lanes = args[6]->Int32Value(context).ToChecked(); const int trace_code = args[7]->Int32Value(context).ToChecked(); const int parent_id = args[8]->Int32Value(context).ToChecked(); const int value_index = args[9]->Int32Value(context).ToChecked(); const int dimensions = args[10]->Int32Value(context).ToChecked(); const wasm32_ptr_t trace_tag_ptr = args[11]->Int32Value(context).ToChecked(); internal_assert(dimensions >= 0 && dimensions < 1024); // not a hard limit, just a sanity check halide_trace_event_t event; event.func = (const char *)(base + func_name_ptr); event.value = value_ptr ? ((void *)(base + value_ptr)) : nullptr; event.coordinates = coordinates_ptr ? 
((int32_t *)(base + coordinates_ptr)) : nullptr; event.trace_tag = (const char *)(base + trace_tag_ptr); event.type.code = (halide_type_code_t)type_code; event.type.bits = (uint8_t)type_bits; event.type.lanes = (uint16_t)type_lanes; event.event = (halide_trace_event_code_t)trace_code; event.parent_id = parent_id; event.value_index = value_index; event.dimensions = dimensions; int result = 0; if (jit_user_context && jit_user_context->handlers.custom_trace != nullptr) { result = (*jit_user_context->handlers.custom_trace)(jit_user_context, &event); } else { debug(0) << "Dropping trace event due to lack of trace handler.\n"; } args.GetReturnValue().Set(load_scalar(context, result)); } void wasm_jit_malloc_callback(const v8::FunctionCallbackInfo &args) { Isolate *isolate = args.GetIsolate(); HandleScope scope(isolate); Local context = isolate->GetCurrentContext(); size_t size = args[0]->Int32Value(context).ToChecked() + kExtraMallocSlop; wasm32_ptr_t p = v8_WasmMemoryObject_malloc(context, size); if (p) { p += kExtraMallocSlop; } args.GetReturnValue().Set(load_scalar(context, p)); } void wasm_jit_free_callback(const v8::FunctionCallbackInfo &args) { Isolate *isolate = args.GetIsolate(); HandleScope scope(isolate); Local context = isolate->GetCurrentContext(); wasm32_ptr_t p = args[0]->Int32Value(context).ToChecked(); if (p) { p -= kExtraMallocSlop; } v8_WasmMemoryObject_free(context, p); } void wasm_jit_abort_callback(const v8::FunctionCallbackInfo &args) { abort(); } void wasm_jit_strlen_callback(const v8::FunctionCallbackInfo &args) { Isolate *isolate = args.GetIsolate(); Local context = isolate->GetCurrentContext(); HandleScope scope(isolate); const int32_t s = args[0]->Int32Value(context).ToChecked(); uint8_t *base = get_wasm_memory_base(context); int32_t r = strlen((char *)base + s); args.GetReturnValue().Set(load_scalar(context, r)); } void wasm_jit_write_callback(const v8::FunctionCallbackInfo &args) { internal_error << "WebAssembly JIT does not yet support the write() call."; } void wasm_jit_getenv_callback(const v8::FunctionCallbackInfo &args) { Isolate *isolate = args.GetIsolate(); Local context = isolate->GetCurrentContext(); HandleScope scope(isolate); const int32_t s = args[0]->Int32Value(context).ToChecked(); uint8_t *base = get_wasm_memory_base(context); char *e = getenv((char *)base + s); // TODO: this string is leaked if (e) { wasm32_ptr_t r = v8_WasmMemoryObject_malloc(context, strlen(e) + 1); strcpy((char *)base + r, e); args.GetReturnValue().Set(load_scalar(context, r)); } else { args.GetReturnValue().Set(load_scalar(context, 0)); } } void wasm_jit_memcpy_callback(const v8::FunctionCallbackInfo &args) { Isolate *isolate = args.GetIsolate(); Local context = isolate->GetCurrentContext(); HandleScope scope(isolate); const int32_t dst = args[0]->Int32Value(context).ToChecked(); const int32_t src = args[1]->Int32Value(context).ToChecked(); const int32_t n = args[2]->Int32Value(context).ToChecked(); uint8_t *base = get_wasm_memory_base(context); memcpy(base + dst, base + src, n); args.GetReturnValue().Set(load_scalar(context, dst)); } void wasm_jit_memmove_callback(const v8::FunctionCallbackInfo &args) { Isolate *isolate = args.GetIsolate(); Local context = isolate->GetCurrentContext(); HandleScope scope(isolate); const int32_t dst = args[0]->Int32Value(context).ToChecked(); const int32_t src = args[1]->Int32Value(context).ToChecked(); const int32_t n = args[2]->Int32Value(context).ToChecked(); uint8_t *base = get_wasm_memory_base(context); memmove(base + dst, base + src, n); 
args.GetReturnValue().Set(load_scalar(context, dst)); } void wasm_jit_fopen_callback(const v8::FunctionCallbackInfo &args) { internal_error << "WebAssembly JIT does not yet support the fopen() call."; } void wasm_jit_fileno_callback(const v8::FunctionCallbackInfo &args) { internal_error << "WebAssembly JIT does not yet support the fileno() call."; } void wasm_jit_fclose_callback(const v8::FunctionCallbackInfo &args) { internal_error << "WebAssembly JIT does not yet support the fclose() call."; } void wasm_jit_fwrite_callback(const v8::FunctionCallbackInfo &args) { internal_error << "WebAssembly JIT does not yet support the fwrite() call."; } void wasm_jit_memset_callback(const v8::FunctionCallbackInfo &args) { Isolate *isolate = args.GetIsolate(); Local context = isolate->GetCurrentContext(); HandleScope scope(isolate); const int32_t s = args[0]->Int32Value(context).ToChecked(); const int32_t c = args[1]->Int32Value(context).ToChecked(); const int32_t n = args[2]->Int32Value(context).ToChecked(); uint8_t *base = get_wasm_memory_base(context); memset(base + s, c, n); args.GetReturnValue().Set(load_scalar(context, s)); } void wasm_jit_memcmp_callback(const v8::FunctionCallbackInfo &args) { Isolate *isolate = args.GetIsolate(); Local context = isolate->GetCurrentContext(); HandleScope scope(isolate); const int32_t s1 = args[0]->Int32Value(context).ToChecked(); const int32_t s2 = args[1]->Int32Value(context).ToChecked(); const int32_t n = args[2]->Int32Value(context).ToChecked(); uint8_t *base = get_wasm_memory_base(context); int r = memcmp(base + s1, base + s2, n); args.GetReturnValue().Set(load_scalar(context, r)); } void wasm_jit___cxa_atexit_callback(const v8::FunctionCallbackInfo &args) { // nothing } void wasm_jit___extendhfsf2_callback(const v8::FunctionCallbackInfo &args) { Isolate *isolate = args.GetIsolate(); Local context = isolate->GetCurrentContext(); HandleScope scope(isolate); const uint16_t in = args[0]->NumberValue(context).ToChecked(); const float out = (float)float16_t::make_from_bits(in); args.GetReturnValue().Set(load_scalar(context, out)); } void wasm_jit___truncsfhf2_callback(const v8::FunctionCallbackInfo &args) { Isolate *isolate = args.GetIsolate(); Local context = isolate->GetCurrentContext(); HandleScope scope(isolate); const float in = args[0]->NumberValue(context).ToChecked(); const uint16_t out = float16_t(in).to_bits(); args.GetReturnValue().Set(load_scalar(context, out)); } template void wasm_jit_posix_math_callback(const v8::FunctionCallbackInfo &args) { Isolate *isolate = args.GetIsolate(); Local context = isolate->GetCurrentContext(); HandleScope scope(isolate); const T in = args[0]->NumberValue(context).ToChecked(); const T out = some_func(in); args.GetReturnValue().Set(load_scalar(context, out)); } template void wasm_jit_posix_math2_callback(const v8::FunctionCallbackInfo &args) { Isolate *isolate = args.GetIsolate(); Local context = isolate->GetCurrentContext(); HandleScope scope(isolate); const T in1 = args[0]->NumberValue(context).ToChecked(); const T in2 = args[1]->NumberValue(context).ToChecked(); const T out = some_func(in1, in2); args.GetReturnValue().Set(load_scalar(context, out)); } enum ExternWrapperFieldSlots { kTrampolineWrap, kArgTypesWrap }; void v8_extern_wrapper(const v8::FunctionCallbackInfo &args) { Isolate *isolate = args.GetIsolate(); HandleScope scope(isolate); Local context = isolate->GetCurrentContext(); Local wrapper_data = args.Data()->ToObject(context).ToLocalChecked(); Local trampoline_wrap = 
Local::Cast(wrapper_data->GetInternalField(kTrampolineWrap)); Local arg_types_wrap = Local::Cast(wrapper_data->GetInternalField(kArgTypesWrap)); TrampolineFn trampoline = (TrampolineFn)trampoline_wrap->Value(); size_t arg_types_len = (arg_types_wrap->ByteLength() / sizeof(ExternArgType)) - 1; std::shared_ptr backing = arg_types_wrap->GetBackingStore(); ExternArgType *arg_types = (ExternArgType *)backing->Data(); /* const ExternArgType *arg_types = (const ExternArgType *)arg_types_wrap->GetContents().Data(); */ const ExternArgType ret_type = *arg_types++; // There's wasted space here, but that's ok. std::vector> buffers(arg_types_len); std::vector scalars(arg_types_len); std::vector trampoline_args(arg_types_len); for (size_t i = 0; i < arg_types_len; ++i) { if (arg_types[i].is_ucon) { // We have to special-case ucon because Halide considers it an int64 everywhere // (even for wasm, where pointers are int32), and trying to extract it as an // int64 from a Value that is int32 will assert-fail. In JIT mode the value // doesn't even matter (except for guarding that it is our predicted constant). wassert(args[i].Get() == 0 || args[i].Get() == kMagicJitUserContextValue); store_scalar(context, args[i], &scalars[i]); trampoline_args[i] = &scalars[i]; } else if (arg_types[i].is_buffer) { const wasm32_ptr_t buf_ptr = args[i]->Int32Value(context).ToChecked(); wasmbuf_to_hostbuf(context, buf_ptr, buffers[i]); trampoline_args[i] = buffers[i].raw_buffer(); } else { store_scalar(context, arg_types[i].type, args[i], &scalars[i]); trampoline_args[i] = &scalars[i]; } } // The return value (if any) is always scalar. uint64_t ret_val = 0; const bool has_retval = !ret_type.is_void; internal_assert(!ret_type.is_buffer); if (has_retval) { trampoline_args.push_back(&ret_val); } (*trampoline)(trampoline_args.data()); if (has_retval) { dynamic_type_dispatch(ret_type.type, context, (void *)&ret_val, args.GetReturnValue()); } // Progagate buffer data backwards. Note that for arbitrary extern functions, // we have no idea which buffers might be "input only", so we copy all data for all of them. 
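// The copy-back below goes through copy_hostbuf_to_existing_wasmbuf(), which
// asserts that the host- and wasm-side buffers still agree in type and
// dimensions, and writes back only the host data, the dim array, and the
// flags (the device fields are reset to zero).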
for (size_t i = 0; i < arg_types_len; ++i) { if (arg_types[i].is_buffer) { const wasm32_ptr_t buf_ptr = args[i]->Int32Value(context).ToChecked(); copy_hostbuf_to_existing_wasmbuf(context, buffers[i], buf_ptr); } } } bool should_skip_extern_symbol(const std::string &name) { static std::set symbols = { "halide_print", "halide_error"}; return symbols.count(name) > 0; } using JITExternMap = std::map; void add_extern_callbacks(const Local &context, const JITExternMap &jit_externs, const JITModule &trampolines, Local &imports_dict) { Isolate *isolate = context->GetIsolate(); Local extern_callback_template = ObjectTemplate::New(isolate); extern_callback_template->SetInternalFieldCount(4); for (const auto &it : jit_externs) { const auto &fn_name = it.first; if (should_skip_extern_symbol(fn_name)) { continue; } TrampolineFn trampoline_fn = nullptr; std::vector arg_types; if (!build_extern_arg_types(fn_name, jit_externs, trampolines, trampoline_fn, arg_types)) { internal_error << "Missing fn_name " << fn_name; } const size_t arg_types_bytes = sizeof(ExternArgType) * arg_types.size(); Local arg_types_wrap = ArrayBuffer::New(isolate, arg_types_bytes); std::shared_ptr backing = arg_types_wrap->GetBackingStore(); memcpy((ExternArgType *)backing->Data(), arg_types.data(), arg_types_bytes); Local wrapper_data = extern_callback_template->NewInstance(context).ToLocalChecked(); static_assert(sizeof(trampoline_fn) == sizeof(void *)); Local trampoline_wrap(External::New(isolate, (void *)trampoline_fn)); wrapper_data->SetInternalField(kTrampolineWrap, trampoline_wrap); wrapper_data->SetInternalField(kArgTypesWrap, arg_types_wrap); Local key = NewLocalString(isolate, fn_name.c_str()); Local value = FunctionTemplate::New(isolate, v8_extern_wrapper, wrapper_data) ->GetFunction(context) .ToLocalChecked(); (void)imports_dict->Set(context, key, value).ToChecked(); } } #endif // WITH_V8 } // namespace // clang-format off #if WITH_WABT using HostCallbackMap = std::unordered_map; #define DEFINE_CALLBACK(f) { #f, wabt_jit_##f##_callback }, #define DEFINE_POSIX_MATH_CALLBACK(t, f) { #f, wabt_posix_math_1 }, #define DEFINE_POSIX_MATH_CALLBACK2(t, f) { #f, wabt_posix_math_2 }, #endif #ifdef WITH_V8 using HostCallbackMap = std::unordered_map; #define DEFINE_CALLBACK(f) { #f, wasm_jit_##f##_callback }, #define DEFINE_POSIX_MATH_CALLBACK(t, f) { #f, wasm_jit_posix_math_callback }, #define DEFINE_POSIX_MATH_CALLBACK2(t, f) { #f, wasm_jit_posix_math2_callback }, #endif const HostCallbackMap &get_host_callback_map() { static HostCallbackMap m = { // General runtime functions. DEFINE_CALLBACK(__cxa_atexit) DEFINE_CALLBACK(__extendhfsf2) DEFINE_CALLBACK(__truncsfhf2) DEFINE_CALLBACK(abort) DEFINE_CALLBACK(fclose) DEFINE_CALLBACK(fileno) DEFINE_CALLBACK(fopen) DEFINE_CALLBACK(free) DEFINE_CALLBACK(fwrite) DEFINE_CALLBACK(getenv) DEFINE_CALLBACK(halide_error) DEFINE_CALLBACK(halide_print) DEFINE_CALLBACK(halide_trace_helper) DEFINE_CALLBACK(malloc) DEFINE_CALLBACK(memcmp) DEFINE_CALLBACK(memcpy) DEFINE_CALLBACK(memmove) DEFINE_CALLBACK(memset) DEFINE_CALLBACK(strlen) DEFINE_CALLBACK(write) // Posix math. 
DEFINE_POSIX_MATH_CALLBACK(double, acos) DEFINE_POSIX_MATH_CALLBACK(double, acosh) DEFINE_POSIX_MATH_CALLBACK(double, asin) DEFINE_POSIX_MATH_CALLBACK(double, asinh) DEFINE_POSIX_MATH_CALLBACK(double, atan) DEFINE_POSIX_MATH_CALLBACK(double, atanh) DEFINE_POSIX_MATH_CALLBACK(double, cos) DEFINE_POSIX_MATH_CALLBACK(double, cosh) DEFINE_POSIX_MATH_CALLBACK(double, exp) DEFINE_POSIX_MATH_CALLBACK(double, log) DEFINE_POSIX_MATH_CALLBACK(double, round) DEFINE_POSIX_MATH_CALLBACK(double, sin) DEFINE_POSIX_MATH_CALLBACK(double, sinh) DEFINE_POSIX_MATH_CALLBACK(double, tan) DEFINE_POSIX_MATH_CALLBACK(double, tanh) DEFINE_POSIX_MATH_CALLBACK(float, acosf) DEFINE_POSIX_MATH_CALLBACK(float, acoshf) DEFINE_POSIX_MATH_CALLBACK(float, asinf) DEFINE_POSIX_MATH_CALLBACK(float, asinhf) DEFINE_POSIX_MATH_CALLBACK(float, atanf) DEFINE_POSIX_MATH_CALLBACK(float, atanhf) DEFINE_POSIX_MATH_CALLBACK(float, cosf) DEFINE_POSIX_MATH_CALLBACK(float, coshf) DEFINE_POSIX_MATH_CALLBACK(float, expf) DEFINE_POSIX_MATH_CALLBACK(float, logf) DEFINE_POSIX_MATH_CALLBACK(float, roundf) DEFINE_POSIX_MATH_CALLBACK(float, sinf) DEFINE_POSIX_MATH_CALLBACK(float, sinhf) DEFINE_POSIX_MATH_CALLBACK(float, tanf) DEFINE_POSIX_MATH_CALLBACK(float, tanhf) DEFINE_POSIX_MATH_CALLBACK2(float, atan2f) DEFINE_POSIX_MATH_CALLBACK2(double, atan2) DEFINE_POSIX_MATH_CALLBACK2(float, fminf) DEFINE_POSIX_MATH_CALLBACK2(double, fmin) DEFINE_POSIX_MATH_CALLBACK2(float, fmaxf) DEFINE_POSIX_MATH_CALLBACK2(double, fmax) DEFINE_POSIX_MATH_CALLBACK2(float, powf) DEFINE_POSIX_MATH_CALLBACK2(double, pow) }; return m; } #undef DEFINE_CALLBACK #undef DEFINE_POSIX_MATH_CALLBACK #undef DEFINE_POSIX_MATH_CALLBACK2 // clang-format on #endif // WITH_WABT || WITH_V8 struct WasmModuleContents { mutable RefCount ref_count; const Target target; const std::vector arguments; std::map jit_externs; std::vector extern_deps; JITModule trampolines; #if WITH_WABT || WITH_V8 BDMalloc bdmalloc; #endif // WITH_WABT || WITH_V8 #if WITH_WABT wabt::interp::Store store; wabt::interp::Module::Ptr module; wabt::interp::Instance::Ptr instance; wabt::interp::Thread::Options thread_options; wabt::interp::Memory::Ptr memory; #endif #ifdef WITH_V8 v8::Isolate *isolate = nullptr; v8::ArrayBuffer::Allocator *array_buffer_allocator = nullptr; v8::Persistent v8_context; v8::Persistent v8_function; #endif WasmModuleContents( const Module &halide_module, const std::vector &arguments, const std::string &fn_name, const std::map &jit_externs, const std::vector &extern_deps); int run(const void *const *args); ~WasmModuleContents() = default; }; // clang-format off WasmModuleContents::WasmModuleContents( const Module &halide_module, const std::vector &arguments, const std::string &fn_name, const std::map &jit_externs, const std::vector &extern_deps) : target(halide_module.target()) , arguments(arguments) , jit_externs(jit_externs) , extern_deps(extern_deps) , trampolines(JITModule::make_trampolines_module(get_host_target(), jit_externs, kTrampolineSuffix, extern_deps)) #if WITH_WABT , store(wabt::interp::Store(calc_features(halide_module.target()))) #endif // clang-format on { #if WITH_WABT || WITH_V8 wdebug(1) << "Compiling wasm function " << fn_name << "\n"; #endif // WITH_WABT || WITH_V8 #if WITH_WABT user_assert(!target.has_feature(Target::WasmThreads)) << "wasm_threads requires Emscripten (or a similar compiler); it will never be supported under JIT."; user_assert(!target.has_feature(Target::WebGPU)) << "wasm_webgpu requires Emscripten (or a similar compiler); it will never be supported under 
JIT."; // Compile halide into wasm bytecode. std::vector final_wasm = compile_to_wasm(halide_module, fn_name); // Create a wabt Module for it. wabt::MemoryStream log_stream; constexpr bool kReadDebugNames = true; constexpr bool kStopOnFirstError = true; constexpr bool kFailOnCustomSectionError = true; wabt::ReadBinaryOptions options(store.features(), &log_stream, kReadDebugNames, kStopOnFirstError, kFailOnCustomSectionError); wabt::Errors errors; wabt::interp::ModuleDesc module_desc; wabt::Result r = wabt::interp::ReadBinaryInterp("", final_wasm.data(), final_wasm.size(), options, &errors, &module_desc); internal_assert(Succeeded(r)) << "ReadBinaryInterp failed:\n" << wabt::FormatErrorsToString(errors, wabt::Location::Type::Binary) << "\n" << " log: " << to_string(log_stream) << "\n"; if (WASM_DEBUG_LEVEL >= 2) { wabt::MemoryStream dis_stream; module_desc.istream.Disassemble(&dis_stream); wdebug(WASM_DEBUG_LEVEL) << "Disassembly:\n" << to_string(dis_stream) << "\n"; } module = wabt::interp::Module::New(store, module_desc); // Bind all imports to our callbacks. wabt::interp::RefVec imports; const HostCallbackMap &host_callback_map = get_host_callback_map(); for (const auto &import : module->desc().imports) { wdebug(1) << "import=" << import.type.module << "." << import.type.name << "\n"; if (import.type.type->kind == wabt::interp::ExternKind::Func && import.type.module == "env") { auto it = host_callback_map.find(import.type.name); if (it != host_callback_map.end()) { auto func_type = *wabt::cast(import.type.type.get()); auto host_func = wabt::interp::HostFunc::New(store, func_type, it->second); imports.push_back(host_func.ref()); continue; } // If it's not one of the standard host callbacks, assume it must be // a define_extern, and look for it in the jit_externs. auto host_func = make_extern_callback(store, jit_externs, trampolines, import); imports.push_back(host_func.ref()); continue; } // By default, just push a null reference. This won't resolve, and // instantiation will fail. imports.push_back(wabt::interp::Ref::Null); } wabt::interp::RefPtr trap; instance = wabt::interp::Instance::Instantiate(store, module.ref(), imports, &trap); internal_assert(instance) << "Error initializing module: " << trap->message() << "\n"; int32_t heap_base = -1; for (const auto &e : module_desc.exports) { if (e.type.name == "__heap_base") { internal_assert(e.type.type->kind == wabt::ExternalKind::Global); heap_base = store.UnsafeGet(instance->globals()[e.index])->Get().Get(); wdebug(1) << "__heap_base is " << heap_base << "\n"; continue; } if (e.type.name == "memory") { internal_assert(e.type.type->kind == wabt::ExternalKind::Memory); internal_assert(!memory.get()) << "Expected exactly one memory object but saw " << (void *)memory.get(); memory = store.UnsafeGet(instance->memories()[e.index]); wdebug(1) << "heap_size is " << memory->ByteSize() << "\n"; continue; } } internal_assert(heap_base >= 0) << "__heap_base not found"; internal_assert(memory->ByteSize() > 0) << "memory size is unlikely"; bdmalloc.init(memory->ByteSize(), heap_base); #endif // WITH_WABT #ifdef WITH_V8 static std::once_flag init_v8_once; std::call_once(init_v8_once, []() { // Initialize V8. V8::InitializeICU(); static std::unique_ptr platform = platform::NewDefaultPlatform(); V8::InitializePlatform(platform.get()); V8::Initialize(); std::vector flags = { // TODO: these need to match the flags we set in CodeGen_WebAssembly::mattrs(). 
// Note that we currently enable all features that *might* be used // (eg we enable simd even though we might not use it) as we may well end // using different Halide Targets across our lifespan. // Sometimes useful for debugging purposes: // "--print_all_exceptions=true", // "--abort_on_uncaught_exception", // "--trace-ignition-codegen", // "--trace_wasm_decoder", // "--no-liftoff", // "--wasm-interpret-all", // "--trace-wasm-memory", }; for (const auto &f : flags) { V8::SetFlagsFromString(f.c_str(), f.size()); } }); array_buffer_allocator = v8::ArrayBuffer::Allocator::NewDefaultAllocator(); Isolate::CreateParams isolate_params; isolate_params.snapshot_blob = nullptr; isolate_params.array_buffer_allocator = array_buffer_allocator; // Create a new Isolate and make it the current one. isolate = Isolate::New(isolate_params); Locker locker(isolate); Isolate::Scope isolate_scope(isolate); // Create a stack-allocated handle scope. HandleScope handle_scope(isolate); Local global = ObjectTemplate::New(isolate); Local context = Context::New(isolate, nullptr, global); v8_context.Reset(isolate, context); Context::Scope context_scope(context); TryCatch try_catch(isolate); try_catch.SetCaptureMessage(true); try_catch.SetVerbose(true); Local fn_name_str = NewLocalString(isolate, fn_name.c_str()); std::vector final_wasm = compile_to_wasm(halide_module, fn_name); MaybeLocal maybe_compiled = WasmModuleObject::Compile( isolate, /* wire_bytes */ {(const uint8_t *)final_wasm.data(), final_wasm.size()}); Local compiled; if (!maybe_compiled.ToLocal(&compiled)) { // Versions of V8 prior to 7.5 or so don't propagate the exception properly, // so don't attempt to print the exception info if it's not present. if (try_catch.HasCaught()) { String::Utf8Value error(isolate, try_catch.Exception()); internal_error << "Error compiling wasm: " << *error << "\n"; } else { internal_error << "Error compiling wasm: \n"; } } const HostCallbackMap &host_callback_map = get_host_callback_map(); Local imports_dict = Object::New(isolate); for (const auto &it : host_callback_map) { const std::string &name = it.first; FunctionCallback f = it.second; Local key = NewLocalString(isolate, name.c_str()); Local value = FunctionTemplate::New(isolate, f)->GetFunction(context).ToLocalChecked(); (void)imports_dict->Set(context, key, value).ToChecked(); }; add_extern_callbacks(context, jit_externs, trampolines, imports_dict); Local imports = Object::New(isolate); (void)imports->Set(context, NewLocalString(isolate, "env"), imports_dict).ToChecked(); Local instance_args[2] = {compiled, imports}; Local exports = context->Global() ->Get(context, NewLocalString(isolate, "WebAssembly")) .ToLocalChecked() .As() ->Get(context, NewLocalString(isolate, "Instance")) .ToLocalChecked() .As() ->CallAsConstructor(context, 2, instance_args) .ToLocalChecked() .As() ->Get(context, NewLocalString(isolate, "exports")) .ToLocalChecked() .As(); Local function_value = exports->Get(context, fn_name_str).ToLocalChecked(); Local function = Local::Cast(function_value); internal_assert(!function.IsEmpty()); internal_assert(!function->IsNullOrUndefined()); v8_function.Reset(isolate, function); context->SetEmbedderData(kWasmMemoryObject, exports->Get(context, NewLocalString(isolate, "memory")).ToLocalChecked().As()); context->SetAlignedPointerInEmbedderData(kBDMallocPtr, &bdmalloc); context->SetEmbedderData(kHeapBase, exports->Get(context, NewLocalString(isolate, "__heap_base")).ToLocalChecked().As()); context->SetEmbedderData(kString_buffer, NewLocalString(isolate, 
"buffer")); context->SetEmbedderData(kString_grow, NewLocalString(isolate, "grow")); internal_assert(!try_catch.HasCaught()); #endif } int WasmModuleContents::run(const void *const *args) { #if WITH_WABT const auto &module_desc = module->desc(); wabt::interp::FuncType *func_type = nullptr; wabt::interp::RefPtr func; std::string func_name; for (const auto &e : module_desc.exports) { if (e.type.type->kind == wabt::ExternalKind::Func) { wdebug(1) << "Selecting export '" << e.type.name << "'\n"; internal_assert(!func_type && !func) << "Multiple exported funcs found"; func_type = wabt::cast(e.type.type.get()); func = store.UnsafeGet(instance->funcs()[e.index]); func_name = e.type.name; continue; } } JITUserContext *jit_user_context = nullptr; for (size_t i = 0; i < arguments.size(); i++) { const Argument &arg = arguments[i]; const void *arg_ptr = args[i]; if (arg.name == "__user_context") { jit_user_context = *(JITUserContext **)const_cast(arg_ptr); } } WabtContext wabt_context(jit_user_context, *memory, bdmalloc); internal_assert(instance->host_info() == nullptr); instance->set_host_info(&wabt_context); wabt::interp::Values wabt_args; wabt::interp::Values wabt_results; wabt::interp::Trap::Ptr trap; std::vector wbufs(arguments.size(), 0); for (size_t i = 0; i < arguments.size(); i++) { const Argument &arg = arguments[i]; const void *arg_ptr = args[i]; if (arg.is_buffer()) { halide_buffer_t *buf = (halide_buffer_t *)const_cast(arg_ptr); // It's OK for this to be null (let Halide asserts handle it) wasm32_ptr_t wbuf = hostbuf_to_wasmbuf(wabt_context, buf); wbufs[i] = wbuf; wabt_args.push_back(load_value(wbuf)); } else { if (arg.name == "__user_context") { wabt_args.push_back(wabt::interp::Value::Make(kMagicJitUserContextValue)); } else { wabt_args.push_back(load_value(arg.type, arg_ptr)); } } } wabt::interp::Thread thread(store); auto r = func->Call(thread, wabt_args, wabt_results, &trap); if (WASM_DEBUG_LEVEL >= 2) { wabt::MemoryStream call_stream; WriteCall(&call_stream, func_name, *func_type, wabt_args, wabt_results, trap); wdebug(WASM_DEBUG_LEVEL) << to_string(call_stream) << "\n"; } internal_assert(Succeeded(r)) << "Func::Call failed: " << trap->message() << "\n"; internal_assert(wabt_results.size() == 1); int32_t result = wabt_results[0].Get(); wdebug(1) << "Result is " << result << "\n"; if (result == 0) { // Update any output buffers for (size_t i = 0; i < arguments.size(); i++) { const Argument &arg = arguments[i]; const void *arg_ptr = args[i]; if (arg.is_buffer()) { halide_buffer_t *buf = (halide_buffer_t *)const_cast(arg_ptr); copy_wasmbuf_to_existing_hostbuf(wabt_context, wbufs[i], buf); } } } for (wasm32_ptr_t p : wbufs) { wabt_free(wabt_context, p); } // Don't do this: things allocated by Halide runtime might need to persist // between multiple invocations of the same function. // bdmalloc.reset(); instance->set_host_info(nullptr); return result; #endif #if WITH_V8 Locker locker(isolate); Isolate::Scope isolate_scope(isolate); // Create a stack-allocated handle scope. HandleScope handle_scope(isolate); Local context = Local::New(isolate, v8_context); // Enter the context for compiling and running the hello world script. 
Context::Scope context_scope(context); TryCatch try_catch(isolate); try_catch.SetCaptureMessage(true); try_catch.SetVerbose(true); std::vector wbufs(arguments.size(), 0); std::vector> js_args; for (size_t i = 0; i < arguments.size(); i++) { const Argument &arg = arguments[i]; const void *arg_ptr = args[i]; if (arg.is_buffer()) { halide_buffer_t *buf = (halide_buffer_t *)const_cast(arg_ptr); // It's OK for this to be null (let Halide asserts handle it) wasm32_ptr_t wbuf = hostbuf_to_wasmbuf(context, buf); wbufs[i] = wbuf; js_args.push_back(load_scalar(context, wbuf)); } else { if (arg.name == "__user_context") { js_args.push_back(load_scalar(context, kMagicJitUserContextValue)); JITUserContext *jit_user_context = *(JITUserContext **)const_cast(arg_ptr); context->SetAlignedPointerInEmbedderData(kJitUserContext, jit_user_context); } else { js_args.push_back(load_scalar(context, arg.type, arg_ptr)); } } } Local function = Local::New(isolate, v8_function); MaybeLocal result = function->Call(context, context->Global(), js_args.size(), js_args.data()); if (result.IsEmpty()) { String::Utf8Value error(isolate, try_catch.Exception()); String::Utf8Value message(isolate, try_catch.Message()->GetSourceLine(context).ToLocalChecked()); internal_error << "Error running wasm: " << *error << " | Line: " << *message << "\n"; } int r = result.ToLocalChecked()->Int32Value(context).ToChecked(); if (r == 0) { // Update any output buffers for (size_t i = 0; i < arguments.size(); i++) { const Argument &arg = arguments[i]; const void *arg_ptr = args[i]; if (arg.is_buffer()) { halide_buffer_t *buf = (halide_buffer_t *)const_cast(arg_ptr); copy_wasmbuf_to_existing_hostbuf(context, wbufs[i], buf); } } } for (wasm32_ptr_t p : wbufs) { v8_WasmMemoryObject_free(context, p); } // Don't do this: things allocated by Halide runtime might need to persist // between multiple invocations of the same function. // bdmalloc.reset(); return r; #endif // WITH_V8 internal_error << "WasmExecutor is not configured correctly"; return -1; } template<> RefCount &ref_count(const WasmModuleContents *p) noexcept { return p->ref_count; } template<> void destroy(const WasmModuleContents *p) { delete p; } /*static*/ bool WasmModule::can_jit_target(const Target &target) { #if WITH_WABT || WITH_V8 if (target.arch == Target::WebAssembly) { return true; } #endif return false; } /*static*/ WasmModule WasmModule::compile( const Module &module, const std::vector &arguments, const std::string &fn_name, const std::map &jit_externs, const std::vector &extern_deps) { #if defined(WITH_WABT) || defined(WITH_V8) WasmModule wasm_module; wasm_module.contents = new WasmModuleContents(module, arguments, fn_name, jit_externs, extern_deps); return wasm_module; #else user_error << "Cannot run JITted WebAssembly without configuring a WebAssembly engine."; return WasmModule(); #endif } /** Run generated previously compiled wasm code with a set of arguments. */ int WasmModule::run(const void *const *args) { internal_assert(contents.defined()); return contents->run(args); } } // namespace Internal } // namespace Halide Halide-17.0.1/src/WasmExecutor.h000066400000000000000000000026351456515664200164000ustar00rootroot00000000000000#ifndef HALIDE_WASM_EXECUTOR_H #define HALIDE_WASM_EXECUTOR_H /** \file * * Support for running Halide-compiled Wasm code in-process. * Bindings for parameters, extern calls, etc. are established and the * Wasm code is executed. Allows calls to realize to work * exactly as if native code had been run, but via a JavaScript/Wasm VM. 
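 *
 * A rough usage sketch (the names `m`, `args_vec`, `fn_name`, `externs`,
 * `deps` and `arg_ptrs` below are illustrative placeholders, not part of the
 * API):
 *
 *     if (Internal::WasmModule::can_jit_target(target)) {
 *         Internal::WasmModule wasm = Internal::WasmModule::compile(
 *             m, args_vec, fn_name, externs, deps);
 *         int result = wasm.run(arg_ptrs);  // 0 indicates success
 *     }
 *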
* Currently, only the WABT interpreter is supported. */ #include "Argument.h" #include "Parameter.h" #include "Type.h" #include #include #include namespace Halide { struct JITExtern; struct Target; namespace Internal { struct JITModule; struct WasmModuleContents; /** Handle to compiled wasm code which can be called later. */ struct WasmModule { Internal::IntrusivePtr contents; /** If the given target can be executed via the wasm executor, return true. */ static bool can_jit_target(const Target &target); /** Compile generated wasm code with a set of externs. */ static WasmModule compile( const Module &module, const std::vector &arguments, const std::string &fn_name, const std::map &externs, const std::vector &extern_deps); /** Run generated previously compiled wasm code with a set of arguments. */ int run(const void *const *args); }; } // namespace Internal } // namespace Halide #endif // HALIDE_WASM_EXECUTOR_H Halide-17.0.1/src/WrapCalls.cpp000066400000000000000000000173171456515664200162000ustar00rootroot00000000000000#include "WrapCalls.h" #include "FindCalls.h" #include "Function.h" #include "FunctionPtr.h" #include namespace Halide { namespace Internal { using std::map; using std::set; using std::string; typedef map SubstitutionMap; namespace { void insert_func_wrapper_helper(map &func_wrappers_map, const FunctionPtr &in_func, const FunctionPtr &wrapped_func, const FunctionPtr &wrapper) { internal_assert(in_func.defined() && wrapped_func.defined() && wrapper.defined()); internal_assert(func_wrappers_map[in_func].count(wrapped_func) == 0) << "Should only have one wrapper for each function call in a Func\n"; SubstitutionMap &wrappers_map = func_wrappers_map[in_func]; for (auto iter = wrappers_map.begin(); iter != wrappers_map.end(); ++iter) { if (iter->second.same_as(wrapped_func)) { debug(4) << "Merging wrapper of " << Function(in_func).name() << " [" << Function(iter->first).name() << ", " << Function(iter->second).name() << "] with [" << Function(wrapped_func).name() << ", " << Function(wrapper).name() << "]\n"; iter->second = wrapper; return; } else if (wrapper.same_as(iter->first)) { debug(4) << "Merging wrapper of " << Function(in_func).name() << " [" << Function(wrapped_func).name() << ", " << Function(wrapper).name() << "] with [" << Function(iter->first).name() << ", " << Function(iter->second).name() << "]\n"; wrappers_map.emplace(wrapped_func, iter->second); wrappers_map.erase(iter); return; } } wrappers_map[wrapped_func] = wrapper; } void validate_custom_wrapper(const Function &in_func, const Function &wrapped, const Function &wrapper) { map callees = find_direct_calls(in_func); if (!callees.count(wrapper.name())) { std::ostringstream callees_text; for (const auto &it : callees) { callees_text << " " << it.second.name() << "\n"; } user_error << "Cannot wrap \"" << wrapped.name() << "\" in \"" << in_func.name() << "\" because \"" << in_func.name() << "\" does not call \"" << wrapped.name() << "\"\n" << "Direct callees of \"" << in_func.name() << "\" are:\n" << callees_text.str(); } } } // anonymous namespace map wrap_func_calls(const map &env) { map wrapped_env; map func_wrappers_map; // In Func -> [wrapped Func -> wrapper] set global_wrappers; for (const auto &iter : env) { wrapped_env.emplace(iter.first, iter.second); func_wrappers_map[iter.second.get_contents()]; } for (const auto &it : env) { string wrapped_fname = it.first; FunctionPtr wrapped_func = it.second.get_contents(); const auto &wrappers = it.second.schedule().wrappers(); // Put the names of all wrappers of this 
Function into the set for // faster comparison during the substitution. set all_func_wrappers; for (const auto &iter : wrappers) { all_func_wrappers.insert(Function(iter.second).name()); } for (const auto &iter : wrappers) { string in_func = iter.first; FunctionPtr wrapper = iter.second; if (in_func.empty()) { // Global wrapper global_wrappers.insert(Function(wrapper).name()); for (const auto &wrapped_env_iter : wrapped_env) { in_func = wrapped_env_iter.first; if ((wrapped_fname == in_func) || (all_func_wrappers.find(in_func) != all_func_wrappers.end())) { // The wrapper should still call the original function, // so we don't want to rewrite the calls done by the // wrapper. We also shouldn't rewrite the original // function itself. debug(4) << "Skip over replacing \"" << in_func << "\" with \"" << Function(wrapper).name() << "\"\n"; continue; } if (wrappers.count(in_func)) { // If the 'in_func' already has custom wrapper for // 'wrapped_func', don't substitute in the global wrapper. // Custom wrapper always takes precedence over global wrapper continue; } debug(4) << "Global wrapper: replacing reference of \"" << wrapped_fname << "\" in \"" << in_func << "\" with \"" << Function(wrapper).name() << "\"\n"; insert_func_wrapper_helper(func_wrappers_map, wrapped_env_iter.second.get_contents(), wrapped_func, wrapper); } } else { // Custom wrapper debug(4) << "Custom wrapper: replacing reference of \"" << wrapped_fname << "\" in \"" << in_func << "\" with \"" << Function(wrapper).name() << "\"\n"; const auto &in_func_iter = wrapped_env.find(in_func); if (in_func_iter == wrapped_env.end()) { // We find a wrapper definition of 'wrapped_func 'for 'in_func' // which is not in this pipeline. We don't need to perform // the substitution since no function in this pipeline will ever // refer to 'in_func'. // // This situation might arise in the following case below: // f(x) = x; // g(x) = f(x) + 1; // f.in(g); // f.realize(..); debug(4) << " skip custom wrapper for " << in_func << " [" << wrapped_fname << " -> " << Function(wrapper).name() << "] since it's not in the pipeline\n"; continue; } insert_func_wrapper_helper(func_wrappers_map, wrapped_env[in_func].get_contents(), wrapped_func, wrapper); } } } // Perform the substitution for (auto &iter : wrapped_env) { const auto &substitutions = func_wrappers_map[iter.second.get_contents()]; if (!substitutions.empty()) { iter.second.substitute_calls(substitutions); } } // Assert that the custom wrappers are actually used, i.e. if f.in(g) is // called, but 'f' is never called inside 'g', this will throw a user error. // Perform the check after the wrapper substitution to handle multi-fold // wrappers, e.g. f.in(g).in(g). for (const auto &iter : wrapped_env) { const auto &substitutions = func_wrappers_map[iter.second.get_contents()]; for (const auto &pair : substitutions) { if (global_wrappers.find(Function(pair.second).name()) == global_wrappers.end()) { validate_custom_wrapper(iter.second, Function(pair.first), Function(pair.second)); } } } return wrapped_env; } } // namespace Internal } // namespace Halide Halide-17.0.1/src/WrapCalls.h000066400000000000000000000007761456515664200156460ustar00rootroot00000000000000#ifndef HALIDE_WRAP_CALLS_H #define HALIDE_WRAP_CALLS_H /** \file * * Defines pass to replace calls to wrapped Functions with their wrappers. */ #include #include namespace Halide { namespace Internal { class Function; /** Replace every call to wrapped Functions in the Functions' definitions with * call to their wrapper functions. 
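 *
 * For example (mirroring the case described in WrapCalls.cpp): given
 *
 *     f(x) = x;
 *     g(x) = f(x) + 1;
 *     f.in(g);
 *
 * the call to f inside g's definition is substituted with a call to the
 * wrapper produced by f.in(g), while the wrapper itself still calls the
 * original f.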
*/ std::map wrap_func_calls(const std::map &env); } // namespace Internal } // namespace Halide #endif Halide-17.0.1/src/autoschedulers/000077500000000000000000000000001456515664200166255ustar00rootroot00000000000000Halide-17.0.1/src/autoschedulers/CMakeLists.txt000066400000000000000000000020741456515664200213700ustar00rootroot00000000000000# Ensure that plugins export only what is needed to load them. # Everything else should be omitted to keep binary size low. set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS OFF) set(CMAKE_CXX_VISIBILITY_PRESET hidden) set(CMAKE_VISIBILITY_INLINES_HIDDEN YES) function(add_autoscheduler) set(options) set(oneValueArgs NAME) set(multiValueArgs SOURCES) cmake_parse_arguments("arg" "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) add_library(Halide_${arg_NAME} MODULE ${arg_SOURCES}) add_library(Halide::${arg_NAME} ALIAS Halide_${arg_NAME}) target_compile_definitions(Halide_${arg_NAME} PRIVATE Halide_EXPORTS) target_link_libraries(Halide_${arg_NAME} PRIVATE Halide::Plugin) string(TOLOWER "${arg_NAME}" name_lower) set_target_properties(Halide_${arg_NAME} PROPERTIES EXPORT_NAME ${arg_NAME} OUTPUT_NAME autoschedule_${name_lower}) endfunction() add_subdirectory(common) add_subdirectory(adams2019) add_subdirectory(li2018) add_subdirectory(mullapudi2016) add_subdirectory(anderson2021) Halide-17.0.1/src/autoschedulers/adams2019/000077500000000000000000000000001456515664200202265ustar00rootroot00000000000000Halide-17.0.1/src/autoschedulers/adams2019/AutoSchedule.cpp000066400000000000000000000564021456515664200233260ustar00rootroot00000000000000/* This file is the core of the autoscheduler. Most of the code here is about navigating the search space and computing the featurization. This also contains the top-level interface into the autoscheduler. The most interesting classes to look at are: LoopNest Represents one node in our tree representation of loop nests. (Now in LoopNest.(h | cpp)). State A state in the beam search. Holds a root loop nest. (Now in State.(h | cpp)). Interesting functions below are: generate_schedule The top-level entrypoint, which computes and applies a schedule to a Halide pipeline optimal_schedule Runs the passes of the coarse-to-fine beam search optimal_schedule_pass Runs a single pass of beam search LoopNest::compute_features Recursively walks over a loop nest tree, computing our featurization using Halide's analysis tools. LoopNest::apply Actually apply a computed schedule to a Halide pipeline State::generate_children Generates successor states to a state in the beam search Environment variables used (directly or indirectly): HL_DEBUG_AUTOSCHEDULE If set, is used for the debug log level for auto-schedule generation (overriding the value of HL_DEBUG_CODEGEN, if any). HL_PERMIT_FAILED_UNROLL Set to 1 to tell Halide not to freak out if we try to unroll a loop that doesn't have a constant extent. Should generally not be necessary, but sometimes the autoscheduler's model for what will and will not turn into a constant during lowering is inaccurate, because Halide isn't perfect at constant-folding. #ifdef HALIDE_AUTOSCHEDULER_ALLOW_CYOS HL_CYOS "Choose-your-own-schedule". If set to 1, lets you navigate the search tree by hand in the terminal. Whee! This is for debugging the autoscheduler. Since it is generally only for use by developers/maintainers of this autoscheduler, it defaults to being omitted entirely unless you build Halide with HALIDE_AUTOSCHEDULER_ALLOW_CYOS defined. Even then, you must *also* set the env var to 1 to make use of it. 
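   For example (one possible setup, shown only as a sketch): compile Halide
   with -DHALIDE_AUTOSCHEDULER_ALLOW_CYOS added to the C++ flags, then run
   your schedule generation with HL_CYOS=1 in the environment to get the
   interactive prompt.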
#endif */ #include "HalidePlugin.h" #include #include #include #include #include #include #include #include #include #include #include "ASLog.h" #include "AutoSchedule.h" #include "Cache.h" #include "CostModel.h" #include "DefaultCostModel.h" #include "Errors.h" #include "Featurization.h" #include "FunctionDAG.h" #include "Halide.h" #include "LoopNest.h" #include "NetworkSize.h" #include "ParamParser.h" #include "PerfectHashMap.h" #include "State.h" #include "Timer.h" #ifdef _WIN32 #include #define _isatty isatty; #else #include #endif namespace Halide { namespace Internal { namespace Autoscheduler { using std::string; using std::vector; struct ProgressBar { void set(double progress) { if (!draw_progress_bar) { return; } auto &os = aslog(ProgressBarLogLevel).get_ostream(); counter++; const int bits = 11; if (counter & ((1 << bits) - 1)) { return; } const int pos = (int)(progress * 78); os << "["; for (int j = 0; j < 78; j++) { if (j < pos) { os << "."; } else if (j - 1 < pos) { os << "/-\\|"[(counter >> bits) % 4]; } else { os << " "; } } os << "]"; for (int j = 0; j < 80; j++) { os << "\b"; } } void clear() { if (counter) { auto &os = aslog(ProgressBarLogLevel).get_ostream(); for (int j = 0; j < 80; j++) { os << " "; } for (int j = 0; j < 80; j++) { os << "\b"; } } } private: uint32_t counter = 0; static constexpr int ProgressBarLogLevel = 1; const bool draw_progress_bar = isatty(2) && aslog::aslog_level() >= ProgressBarLogLevel; }; // Decide whether or not to drop a beam search state. Used for // randomly exploring the search tree for autotuning and to generate // training data. bool random_dropout(const Adams2019Params ¶ms, std::mt19937 &rng, size_t num_decisions) { if (params.random_dropout >= 100) { return false; } // The random dropout threshold is the chance that we operate // entirely greedily and never discard anything. double t = params.random_dropout; t /= 100; t = std::pow(t, 1.0f / num_decisions); t *= 100; uint32_t r = rng(); bool drop_it = (r % 100) >= t; return drop_it; } // A priority queue of states, sorted according to increasing // cost. Never shrinks, to avoid reallocations. // Can't use std::priority_queue because it doesn't support unique_ptr. class StateQueue { private: struct CompareStates { bool operator()(const IntrusivePtr &a, const IntrusivePtr &b) const { return a->cost > b->cost; } }; std::vector> storage; size_t sz = 0; public: void emplace(IntrusivePtr &&s) { if (sz >= storage.size()) { storage.resize(std::max(sz * 2, (size_t)64)); } internal_assert(sz < storage.size()) << sz << " " << storage.size() << "\n"; storage[sz] = std::move(s); sz++; std::push_heap(storage.begin(), storage.begin() + sz, CompareStates{}); } IntrusivePtr pop() { internal_assert(sz <= storage.size()) << sz << " " << storage.size() << "\n"; std::pop_heap(storage.begin(), storage.begin() + sz, CompareStates{}); sz--; return std::move(storage[sz]); } const IntrusivePtr &top() { return storage[0]; } bool empty() const { return sz == 0; } size_t size() const { return sz; } void swap(StateQueue &other) { storage.swap(other.storage); std::swap(sz, other.sz); } IntrusivePtr operator[](int idx) const { return storage[idx]; } void resort() { std::make_heap(storage.begin(), storage.begin() + sz, CompareStates{}); } void clear() { for (size_t i = 0; i < sz; i++) { storage[i] = IntrusivePtr{}; } sz = 0; } }; // Configure a cost model to process a specific pipeline. 
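// This is called at the top of each beam-search pass: it resets any state the
// cost model carried over from a previous pass and installs the pipeline-wide
// featurization before per-schedule features are evaluated.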
void configure_pipeline_features(const FunctionDAG &dag, const Adams2019Params ¶ms, CostModel *cost_model) { cost_model->reset(); cost_model->set_pipeline_features(dag, params); } // A single pass of coarse-to-fine beam search. IntrusivePtr optimal_schedule_pass(FunctionDAG &dag, const vector &outputs, const Adams2019Params ¶ms, CostModel *cost_model, std::mt19937 &rng, int pass_idx, int num_passes, ProgressBar &tick, std::unordered_set &permitted_hashes, Cache *cache) { if (cost_model) { configure_pipeline_features(dag, params, cost_model); } StateQueue q, pending; // The initial state, with no decisions made { IntrusivePtr initial{new State}; initial->root = new LoopNest; q.emplace(std::move(initial)); } int expanded = 0; std::function &&)> enqueue_new_children = [&](IntrusivePtr &&s) { // Each child should have one more decision made than its parent state. internal_assert(s->num_decisions_made == s->parent->num_decisions_made + 1); int progress = s->num_decisions_made * params.beam_size + expanded; size_t max_progress = dag.nodes.size() * params.beam_size * 2; // Update the progress bar tick.set(double(progress) / max_progress); s->penalized = false; // Add the state to the list of states to evaluate q.emplace(std::move(s)); }; #ifdef HALIDE_AUTOSCHEDULER_ALLOW_CYOS string cyos_str = get_env_variable("HL_CYOS"); #endif // This loop is beam search over the sequence of decisions to make. for (;;) { std::unordered_map hashes; q.swap(pending); if (pending.empty()) { if ((false) && params.beam_size < 1000) { // Intentional dead code. Extra parens to pacify clang-tidy. // Total mortality. Double the beam size and // restart. Disabled for now because total mortality // may indicate a bug. Adams2019Params params2 = params; params2.beam_size *= 2; return optimal_schedule_pass(dag, outputs, params2, cost_model, rng, pass_idx, num_passes, tick, permitted_hashes, cache); } else { internal_error << "Ran out of legal states with beam size " << params.beam_size << "\n"; } } if ((int)pending.size() > params.beam_size * 10000) { aslog(1) << "*** Warning: Huge number of states generated (" << pending.size() << ").\n"; } expanded = 0; while (expanded < params.beam_size && !pending.empty()) { IntrusivePtr state{pending.pop()}; if (params.beam_size > 1 && num_passes > 1) { // We are doing coarse-to-fine beam search using the // hashing strategy mentioned in the paper. // // We will lazily apply cost penalties to the queue // according to structural uniqueness. if (!state->penalized) { uint64_t h1 = state->structural_hash(pass_idx + 1); uint64_t h0 = state->structural_hash(pass_idx - 1); // We penalize the cost of a state proportionately // to how many states we've already seen with that // hash. int penalty = ++hashes[h1]; if (pass_idx > 0 && !permitted_hashes.count(h0)) { // It's possible to get yourself into a state // where the only things in the beam that match // the hash were quick-rejected due to details not // captured in the hash, so we apply a huge // penalty, but leave the impermissible state in // the beam. penalty += 10; } if (penalty > 1) { state->penalized = true; state->cost *= penalty; // After penalizing this state, if it's no // longer the best, defer it. We set the // 'penalized' flag so that we know not to // penalize and defer it again. 
if (!pending.empty() && state->cost > pending.top()->cost) { pending.emplace(std::move(state)); continue; } } } } // Random dropout if (pending.size() > 1 && random_dropout(params, rng, dag.nodes.size() * 2)) { continue; } if (state->num_decisions_made == 2 * (int)dag.nodes.size()) { // We've reached the end of the pass. The first state // must be the best, because we're pulling off a // priority queue. auto best = state; // Bless the reasonable stuff in the beam as // permissible states to visit again. We define // reasonable as having a cost no more than 20% higher // than the cost of the best thing. Only do this if // there are more coarse-to-fine passes yet to come. if (pass_idx + 1 < num_passes) { int blessed = 0; while (state->cost <= 1.2 * best->cost && blessed < params.beam_size) { const State *s = state.get(); while (s) { uint64_t h1 = s->structural_hash(pass_idx); permitted_hashes.insert(h1); s = s->parent.get(); } if (pending.empty()) { break; } state = pending.pop(); blessed++; } } return best; } state->generate_children(dag, params, cost_model, enqueue_new_children, cache); expanded++; } // Drop the other states unconsidered. pending.clear(); if (cost_model) { // Now evaluate all the costs and re-sort them in the priority queue cost_model->evaluate_costs(); q.resort(); } #ifdef HALIDE_AUTOSCHEDULER_ALLOW_CYOS if (cyos_str == "1") { // The user has set HL_CYOS, and wants to navigate the // search space manually. Discard everything in the queue // except for the user-chosen option. std::cout << "\n--------------------\n"; std::cout << "Select a schedule:\n"; for (int choice_label = (int)q.size() - 1; choice_label >= 0; choice_label--) { auto state = q[choice_label]; std::cout << "\n[" << choice_label << "]:\n"; state->dump(std::cout); constexpr int verbosity_level = 0; // always state->calculate_cost(dag, params, cost_model, cache->options, verbosity_level); } cost_model->evaluate_costs(); // Select next partial schedule to expand. int selection = -1; while (selection < 0 || selection >= (int)q.size()) { std::cout << "\nEnter selection: "; std::cin >> selection; } auto selected = q[selection]; selected->dump(std::cout); q.clear(); q.emplace(std::move(selected)); } #endif } } // Performance coarse-to-fine beam search and return the best state found. IntrusivePtr optimal_schedule(FunctionDAG &dag, const vector &outputs, const Adams2019Params ¶ms, CostModel *cost_model, std::mt19937 &rng, const CachingOptions &options) { IntrusivePtr best; std::unordered_set permitted_hashes; // Set up cache with options and size. Cache cache(options, dag.nodes.size()); // If the beam size is one, it's pointless doing multiple passes. int num_passes = (params.beam_size == 1) ? 1 : 5; #ifdef HALIDE_AUTOSCHEDULER_ALLOW_CYOS string cyos_str = get_env_variable("HL_CYOS"); if (cyos_str == "1") { // If the user is manually navigating the search space, don't // ask them to do more than one pass. num_passes = 1; } #endif string num_passes_str = get_env_variable("HL_NUM_PASSES"); if (!num_passes_str.empty()) { // The user has requested a non-standard number of passes. 
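// (This overrides the default chosen above: five coarse-to-fine passes
// normally, or a single pass when beam_size == 1 or when the interactive
// HL_CYOS mode is in use.)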
num_passes = std::atoi(num_passes_str.c_str()); } for (int i = 0; i < num_passes; i++) { ProgressBar tick; Timer timer; auto pass = optimal_schedule_pass(dag, outputs, params, cost_model, rng, i, num_passes, tick, permitted_hashes, &cache); std::chrono::duration total_time = timer.elapsed(); auto milli = std::chrono::duration_cast(total_time).count(); tick.clear(); switch (aslog::aslog_level()) { case 0: // Silence break; case 1: aslog(1) << "Pass " << i << " of " << num_passes << ", cost: " << pass->cost << ", time (ms): " << milli << "\n"; break; default: aslog(2) << "Pass " << i << " result: "; pass->dump(aslog(2).get_ostream()); } if (i == 0 || pass->cost < best->cost) { // Track which pass produced the lowest-cost state. It's // not necessarily the final one. best = pass; } } aslog(1) << "Best cost: " << best->cost << "\n"; if (options.cache_blocks) { aslog(1) << "Cache (block) hits: " << cache.cache_hits << "\n"; aslog(1) << "Cache (block) misses: " << cache.cache_misses << "\n"; } return best; } // Keep track of how many times we evaluated a state. int State::cost_calculations = 0; // The main entrypoint to generate a schedule for a pipeline. void generate_schedule(const std::vector &outputs, const Target &target, const Adams2019Params ¶ms, AutoSchedulerResults *auto_scheduler_results) { aslog(1) << "generate_schedule for target=" << target.to_string() << "\n"; aslog(1) << "Adams2019.parallelism:" << params.parallelism << "\n"; aslog(1) << "Adams2019.beam_size:" << params.beam_size << "\n"; aslog(1) << "Adams2019.random_dropout:" << params.random_dropout << "\n"; aslog(1) << "Adams2019.random_dropout_seed:" << params.random_dropout_seed << "\n"; aslog(1) << "Adams2019.weights_path:" << params.weights_path << "\n"; aslog(1) << "Adams2019.disable_subtiling:" << params.disable_subtiling << "\n"; aslog(1) << "Adams2019.disable_memoized_features:" << params.disable_memoized_features << "\n"; aslog(1) << "Adams2019.disable_memoized_blocks:" << params.disable_memoized_blocks << "\n"; aslog(1) << "Adams2019.memory_limit:" << params.memory_limit << "\n"; // Start a timer HALIDE_TIC; State::cost_calculations = 0; std::mt19937 rng((uint32_t)params.random_dropout_seed); string weights_in_path = params.weights_path; string weights_out_path; // deliberately empty string randomize_weights_str = get_env_variable("HL_RANDOMIZE_WEIGHTS"); bool randomize_weights = randomize_weights_str == "1"; // Analyse the Halide algorithm and construct our abstract representation of it FunctionDAG dag(outputs, target); if (aslog::aslog_level() >= 2) { dag.dump(aslog(2).get_ostream()); } // Construct a cost model to use to evaluate states. Currently we // just have the one, but it's an abstract interface, so others // can be slotted in for experimentation. std::unique_ptr cost_model = make_default_cost_model(weights_in_path, weights_out_path, randomize_weights); internal_assert(cost_model != nullptr); IntrusivePtr optimal; // Options generated from environment variables, decide whether or not to cache features and/or tilings. 
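// (More precisely, the options are built from the Adams2019Params (presumably
// the disable_memoized_features / disable_memoized_blocks fields) via
// CachingOptions::MakeOptionsFromParams below.)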
CachingOptions cache_options = CachingOptions::MakeOptionsFromParams(params); // Run beam search optimal = optimal_schedule(dag, outputs, params, cost_model.get(), rng, cache_options); HALIDE_TOC; aslog(1) << "Cost evaluated this many times: " << State::cost_calculations << "\n"; // Dump the schedule found aslog(1) << "** Optimal schedule:\n"; // Just to get the debugging prints to fire optimal->calculate_cost(dag, params, cost_model.get(), cache_options, /*verbosity_level*/ 1); // Apply the schedules to the pipeline optimal->apply_schedule(dag, params); // Print out the schedule if (aslog::aslog_level() >= 2) { optimal->dump(aslog(2).get_ostream()); } if (auto_scheduler_results) { auto_scheduler_results->schedule_source = optimal->schedule_source; { std::ostringstream out; optimal->save_featurization(dag, params, cache_options, out); auto_scheduler_results->featurization.resize(out.str().size()); memcpy(auto_scheduler_results->featurization.data(), out.str().data(), out.str().size()); } } } struct Adams2019 { void operator()(const Pipeline &p, const Target &target, const AutoschedulerParams ¶ms_in, AutoSchedulerResults *results) { internal_assert(params_in.name == "Adams2019"); std::vector outputs; for (const Func &f : p.outputs()) { outputs.push_back(f.function()); } Adams2019Params params; { ParamParser parser(params_in.extra); parser.parse("parallelism", ¶ms.parallelism); parser.parse("beam_size", ¶ms.beam_size); parser.parse("random_dropout", ¶ms.random_dropout); parser.parse("random_dropout_seed", ¶ms.random_dropout_seed); parser.parse("weights_path", ¶ms.weights_path); parser.parse("disable_subtiling", ¶ms.disable_subtiling); parser.parse("disable_memoized_features", ¶ms.disable_memoized_features); parser.parse("disable_memoized_blocks", ¶ms.disable_memoized_blocks); parser.parse("memory_limit", ¶ms.memory_limit); parser.finish(); } Autoscheduler::generate_schedule(outputs, target, params, results); results->autoscheduler_params = params_in; } }; REGISTER_AUTOSCHEDULER(Adams2019) // An alternative entrypoint for other uses void find_and_apply_schedule(FunctionDAG &dag, const std::vector &outputs, const Adams2019Params ¶ms, CostModel *cost_model, StageMap *schedule_features) { std::mt19937 rng(12345); CachingOptions cache_options = CachingOptions::MakeOptionsFromParams(params); IntrusivePtr optimal = optimal_schedule(dag, outputs, params, cost_model, rng, cache_options); // Apply the schedules optimal->apply_schedule(dag, params); if (schedule_features) { optimal->compute_featurization(dag, params, schedule_features, cache_options); } } } // namespace Autoscheduler // Intrusive shared ptr helpers. 
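// These ref_count/destroy specializations are what let IntrusivePtr manage the
// reference counts of the LoopNest and State objects built up during the
// search above.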
template<> RefCount &ref_count(const Autoscheduler::LoopNest *t) noexcept { return t->ref_count; } template<> void destroy(const Autoscheduler::LoopNest *t) { delete t; } template<> RefCount &ref_count(const Autoscheduler::State *t) noexcept { return t->ref_count; } template<> void destroy(const Autoscheduler::State *t) { delete t; } } // namespace Internal } // namespace Halide Halide-17.0.1/src/autoschedulers/adams2019/AutoSchedule.h000066400000000000000000000011471456515664200227670ustar00rootroot00000000000000#include "CostModel.h" #include "Featurization.h" #include "FunctionDAG.h" #include "Halide.h" #include "PerfectHashMap.h" #include namespace Halide { namespace Internal { namespace Autoscheduler { typedef PerfectHashMap StageMapOfScheduleFeatures; void find_and_apply_schedule(FunctionDAG &dag, const std::vector &outputs, const Adams2019Params ¶ms, CostModel *cost_model, int beam_size, StageMapOfScheduleFeatures *schedule_features); } // namespace Autoscheduler } // namespace Internal } // namespace Halide Halide-17.0.1/src/autoschedulers/adams2019/CMakeLists.txt000066400000000000000000000117621456515664200227750ustar00rootroot00000000000000## # Build rules for the Adams2019 autoscheduler library ## set(COMMON_DIR "${Halide_SOURCE_DIR}/src/autoschedulers/common") # ================================================================= # weights set(WF_CPP baseline.cpp) configure_file(baseline.weights baseline.weights COPYONLY) add_custom_command(OUTPUT ${WF_CPP} COMMAND binary2cpp baseline_weights < baseline.weights > ${WF_CPP} DEPENDS baseline.weights binary2cpp VERBATIM) add_library(adams2019_weights_obj OBJECT ${WF_CPP}) # cost_model, train_cost_model add_executable(adams2019_cost_model.generator cost_model_generator.cpp) target_link_libraries(adams2019_cost_model.generator PRIVATE Halide::Generator) add_halide_library(adams2019_cost_model FROM adams2019_cost_model.generator GENERATOR cost_model FUNCTION_NAME cost_model TARGETS cmake) add_halide_library(adams2019_train_cost_model FROM adams2019_cost_model.generator GENERATOR train_cost_model FUNCTION_NAME train_cost_model TARGETS cmake USE_RUNTIME adams2019_cost_model.runtime) # TODO: replace when we support "fat" objects in generators list(LENGTH CMAKE_OSX_ARCHITECTURES num_archs) if (APPLE AND num_archs GREATER 1) if (NOT "x86_64" IN_LIST CMAKE_OSX_ARCHITECTURES OR NOT "arm64" IN_LIST CMAKE_OSX_ARCHITECTURES) message(FATAL_ERROR "Don't know how to compile for ${CMAKE_OSX_ARCHITECTURES}") endif () if (Halide_CMAKE_TARGET MATCHES "x86") set(arch arm) else () set(arch x86) endif () add_halide_library(adams2019_cost_model-arch FROM adams2019_cost_model.generator GENERATOR cost_model FUNCTION_NAME cost_model TARGETS osx-${arch}-64) add_halide_library(adams2019_train_cost_model-arch FROM adams2019_cost_model.generator GENERATOR train_cost_model FUNCTION_NAME train_cost_model TARGETS osx-${arch}-64 USE_RUNTIME adams2019_cost_model-arch.runtime) add_custom_command( TARGET adams2019_cost_model POST_BUILD COMMAND lipo -create $ $ -output $ VERBATIM ) add_custom_command( TARGET adams2019_cost_model.runtime POST_BUILD COMMAND lipo -create $ $ -output $ VERBATIM ) add_custom_command( TARGET adams2019_train_cost_model POST_BUILD COMMAND lipo -create $ $ -output $ VERBATIM ) endif () # adams2019_retrain_cost_model if (WITH_UTILS) add_executable(adams2019_retrain_cost_model DefaultCostModel.cpp Weights.cpp retrain_cost_model.cpp $) target_include_directories(adams2019_retrain_cost_model PRIVATE "${Halide_SOURCE_DIR}/src/autoschedulers/adams2019") 
target_link_libraries(adams2019_retrain_cost_model PRIVATE ASLog adams2019_cost_model adams2019_train_cost_model Halide::Halide Halide::Plugin) endif () # ================================================================= ## # Main autoscheduler library ## add_autoscheduler( NAME Adams2019 SOURCES AutoSchedule.cpp Cache.cpp DefaultCostModel.cpp FunctionDAG.cpp LoopNest.cpp State.cpp Weights.cpp $ ) target_include_directories(Halide_Adams2019 PRIVATE "${Halide_SOURCE_DIR}/src/autoschedulers/adams2019") target_link_libraries(Halide_Adams2019 PRIVATE ASLog ParamParser adams2019_cost_model adams2019_train_cost_model) # ==================================================== # Auto-tuning support utilities. if (WITH_UTILS) add_executable(adams2019_weightsdir_to_weightsfile weightsdir_to_weightsfile.cpp Weights.cpp) target_include_directories(adams2019_weightsdir_to_weightsfile PRIVATE ${COMMON_DIR}) target_link_libraries(adams2019_weightsdir_to_weightsfile PRIVATE Halide::Runtime) endif () # ================================================================= # Tests for private/internal functionality of Adams2019 (vs for public functionality, # which is handled in tests/autoschedulers/Adams2019) if (WITH_TESTS) add_executable(adams2019_test_function_dag test_function_dag.cpp FunctionDAG.cpp) target_link_libraries(adams2019_test_function_dag PRIVATE ASLog Halide::Halide Halide::Tools Halide::Plugin) add_test(NAME adams2019_test_function_dag COMMAND adams2019_test_function_dag) set_tests_properties(adams2019_test_function_dag PROPERTIES LABELS "adams2019;autoschedulers_cpu") endif() Halide-17.0.1/src/autoschedulers/adams2019/Cache.cpp000066400000000000000000000065041456515664200217420ustar00rootroot00000000000000#include "Cache.h" #include "LoopNest.h" #include "State.h" namespace Halide { namespace Internal { namespace Autoscheduler { bool Cache::add_memoized_blocks(const State *state, std::function &&)> &accept_child, const FunctionDAG::Node *node, int &num_children, const FunctionDAG &dag, const Adams2019Params ¶ms, CostModel *cost_model) const { if (!options.cache_blocks || !memoized_compute_root_blocks.contains(node)) { // either memoization is turned off, or we haven't cached this node yet. return false; } // get correct vector dimension. int vector_dims = -1; for (const auto &child : state->root->children) { if (child->node == node && child->stage->index == 0) { vector_dims = child->vector_dim; break; } } const auto &vector_dim_map = memoized_compute_root_blocks.get(node); if (vector_dim_map.count(vector_dims) == 0) { // Never cached this vector dimension before. return false; } auto blocks = vector_dim_map.at(vector_dims); size_t num_stages = node->stages.size(); for (size_t i = 0; i < blocks.size(); i += num_stages) { // Construct child from memoization. IntrusivePtr child = state->make_child(); LoopNest *new_root = new LoopNest; new_root->copy_from(*(state->root)); child->root = new_root; child->num_decisions_made++; int block_index = 0; for (const auto &new_child : new_root->children) { if (new_child->node == node) { break; } block_index++; } // Copy all stages into new_root. for (size_t j = 0; j < num_stages; j++) { LoopNest *new_block = new LoopNest; new_block->copy_from_including_features(*blocks[i + j]); new_root->children[block_index++] = new_block; } if (child->calculate_cost(dag, params, cost_model, this->options)) { num_children++; accept_child(std::move(child)); cache_hits++; } } // succesfully added cached items! 
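        // Returning true tells the caller (State::generate_children, described in the
        // overview comment in Cache.h) that cached tilings were reused for this node,
        // so it can skip generating tilings for it again on this pass.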
return true; } void Cache::memoize_blocks(const FunctionDAG::Node *node, LoopNest *new_root) { if (!options.cache_blocks) { return; } int vector_dim = -1; bool loop_nest_found = false; for (auto &child : new_root->children) { if (child->node == node && child->stage->index == 0) { vector_dim = child->vector_dim; loop_nest_found = true; break; } } internal_assert(loop_nest_found) << "memoize_blocks did not find loop nest!\n"; auto &blocks = memoized_compute_root_blocks.get_or_create(node)[vector_dim]; for (auto &child : new_root->children) { if (child->node == node) { // Need const reference for copy. const LoopNest *child_ptr = child.get(); LoopNest *new_block = new LoopNest; new_block->copy_from_including_features(*child_ptr); blocks.emplace_back(new_block); cache_misses++; } } } } // namespace Autoscheduler } // namespace Internal } // namespace Halide Halide-17.0.1/src/autoschedulers/adams2019/Cache.h000066400000000000000000000123531456515664200214060ustar00rootroot00000000000000#ifndef BLOCK_CACHE_H #define BLOCK_CACHE_H #include "ASLog.h" #include "CostModel.h" #include "Featurization.h" #include "FunctionDAG.h" #include "Halide.h" #include "LoopNest.h" #include "PerfectHashMap.h" namespace Halide { namespace Internal { namespace Autoscheduler { /* The adams2019 autoscheduler has two caching implementations within its schedule search: 1) Block (or tile) caching: handled by this file and Cache.cpp. If block caching is enabled the below data structure (Cache) is used to save the tilings that have been generated at prior passes of beam search. This allows for faster children generation when tiling is a scheduling option. As noted below, this cache is a mapping of the form: Node -> vector_dim -> vector. 2) Featurization caching: handled within a LoopNest. The featurization of a LoopNest is used at multiple points in beam search (i.e. whenever the featurization of a child LoopNest is computed), so it is useful to not repeatedly calculate featurizations. As noted in LoopNest.h, this mapping is of the form: (structural hash of producers) -> (StageMap of schedule features). Note that not all features can be safely cached (i.e. inlined features), so some must be recomputed (see LoopNest::recompute_inlined_features). Important changes that caching impacts, outside of this file and Cache.cpp: - LoopNest::compute_features If cache_features is enabled (i.e. disable_memoized_features==0) then this function caches the featurizations of its children, and if called again, reuses those cached featurizations. The features are saved in a LoopNest's member, std::map<> features_cache. Some features do not persist, and the FeaturesIntermediates struct (see Featurization.h) is used to cache useful values that aid in recomputing such features. - LoopNest::compute_working_set_from_features Used to re-compute the working_set from cached features. - LoopNest::recompute_inlined_features Recursively recomputes the features of all inlined Funcs based on the cached FeaturesIntermediates struct. - LoopNest::compute_hash_of_producers_stored_at_root Computes a structural hash for use in feature caching in a LoopNest. - LoopNest::collect_producers Collects all producers for a LoopNest for use in calculating the structural hash in LoopNest::compute_hash_of_producers_stored_at_root. - LoopNest::collect_stages Collects all stages referenced by a LoopNest for use in LoopNest::collect_producers. - State::compute_featurization Calculates and stores hash_of_producers_stored_at_root for each child if feature caching is enabled. 
- State::generate_children If block caching is enabled, and tilings for this States have been cached in the Cache object, then tilings are not generated again, and the cached tilings are used instead. See Cache::add_memoized_blocks below (and in Cache.cpp). Additionally, if a tiling has not been cached, and it is not pruned, then the tiling will be cached using Cache::memoize_blocks (see below and in Cache.cpp). */ struct State; /* Object stores caching options for autoscheduling. cache_blocks: decides if tilings are cached for decisions related to parallelizing the loops of a Func. cache_features: decides if LoopNest::compute_features will cache / will use cached featurizations. */ struct CachingOptions { bool cache_blocks = false; bool cache_features = false; static CachingOptions MakeOptionsFromParams(const Adams2019Params ¶ms) { CachingOptions options; options.cache_blocks = params.disable_memoized_blocks == 0; options.cache_features = params.disable_memoized_features == 0; return options; } }; // Node -> (vector_dim -> vector) using BlockCache = NodeMap>>>; // Cache for memoizing possible tilings. // Tracks hit/miss statistics for both block caching // and for feature caching (self-contained by LoopNests). struct Cache { CachingOptions options; BlockCache memoized_compute_root_blocks; mutable size_t cache_hits = 0; mutable size_t cache_misses = 0; Cache() = delete; Cache(const CachingOptions &_options, size_t nodes_size) : options(_options) { if (options.cache_blocks) { memoized_compute_root_blocks.make_large(nodes_size); } } ~Cache() = default; // check if we generated tilings for the current func on a previous pass // if so, add them and return true. // otherwise, return false (also return false if memoization is turned off). bool add_memoized_blocks(const State *state, std::function &&)> &accept_child, const FunctionDAG::Node *node, int &num_children, const FunctionDAG &dag, const Adams2019Params ¶ms, CostModel *cost_model) const; // Generate tilings for a specific vector dimension and memoize them. void memoize_blocks(const FunctionDAG::Node *node, LoopNest *new_root); }; } // namespace Autoscheduler } // namespace Internal } // namespace Halide #endif // BLOCK_CACHE_H Halide-17.0.1/src/autoschedulers/adams2019/CostModel.h000066400000000000000000000057121456515664200222750ustar00rootroot00000000000000#ifndef COST_MODEL_H #define COST_MODEL_H #include #include "Featurization.h" #include "FunctionDAG.h" #include "HalideBuffer.h" #include "PerfectHashMap.h" // An abstract base class for a cost model. namespace Halide { namespace Internal { namespace Autoscheduler { typedef PerfectHashMap StageMapOfScheduleFeatures; struct Adams2019Params { /** Maximum level of parallelism available. */ int parallelism = 16; /** Beam size to use in the beam search. Defaults to 32. Use 1 to get a greedy search instead. * Formerly HL_BEAM_SIZE */ int beam_size = 32; /** percent chance of accepting each state in the beam. * Normalized by the number of decisions made, so 5 would be there's a 5 percent chance of never rejecting any states. * Formerly HL_RANDOM_DROPOUT */ int random_dropout = 100; /** Random seed used by the random dropout. If 0, use time(). * Formerly HL_SEED */ int random_dropout_seed = 0; /** When training or schedule, read weights from this directory or file. * (If path ends in `.weights` it is written as a single file, otherwise a directory of files.) * Formerly HL_WEIGHTS_DIR */ std::string weights_path; /** If set to nonzero value: limits the search space to that of Mullapudi et al. 
* Formerly HL_NO_SUBTILING */ int disable_subtiling = 0; /** If set to nonzero value: features of possible schedules are always recalculated, and are not cached across passes. * Formerly HL_DISABLE_MEMOIZED_FEATURES */ int disable_memoized_features = 0; /** If set to nonzero value: tiling sizes are not cached across passes. * Formerly HL_DISABLE_MEMOIZED_BLOCKS */ int disable_memoized_blocks = 0; /** If >= 0, only consider schedules that allocate at most this much memory (measured in bytes). * Formerly HL_AUTOSCHEDULE_MEMORY_LIMIT */ int64_t memory_limit = -1; }; } // namespace Autoscheduler } // namespace Internal class CostModel { public: virtual ~CostModel() = default; // Configure the cost model for the algorithm to be scheduled. virtual void set_pipeline_features(const Internal::Autoscheduler::FunctionDAG &dag, const Internal::Autoscheduler::Adams2019Params ¶ms) = 0; // Enqueue a schedule to be evaluated. Will annotate the value located at cost_ptr when the evaluation takes place. // Note that the dag argument should correspond to the dag specified previously when calling set_pipeline_features. virtual void enqueue(const Internal::Autoscheduler::FunctionDAG &dag, const Halide::Internal::Autoscheduler::StageMapOfScheduleFeatures &schedule_feats, double *cost_ptr) = 0; // Evaluate all schedules in the queue. virtual void evaluate_costs() = 0; // Discard all schedules in the queue. virtual void reset() = 0; }; } // namespace Halide #endif // COST_MODEL_H Halide-17.0.1/src/autoschedulers/adams2019/DefaultCostModel.cpp000066400000000000000000000366431456515664200241440ustar00rootroot00000000000000// This file is a wrapper around the cost model that loads and saves // weights, and maintains state of various kinds. For the actual cost // model, see cost_model_generator.cpp #include #include #include #include #include #include #include #include "ASLog.h" #include "DefaultCostModel.h" #include "HalideBuffer.h" #include "NetworkSize.h" #include "adams2019_cost_model.h" #include "adams2019_train_cost_model.h" // This is an embedded version of `baseline.weights`. // The embedding is done using binary2cpp. extern "C" unsigned char baseline_weights[]; extern "C" int baseline_weights_length; namespace Halide { namespace { using Halide::Internal::aslog; using Halide::Internal::PipelineFeatures; using Halide::Internal::ScheduleFeatures; using Halide::Runtime::Buffer; bool ends_with(const std::string &str, const std::string &suffix) { if (str.size() < suffix.size()) { return false; } size_t off = str.size() - suffix.size(); for (size_t i = 0; i < suffix.size(); i++) { if (str[off + i] != suffix[i]) { return false; } } return true; } } // namespace void DefaultCostModel::set_pipeline_features(const Internal::Autoscheduler::FunctionDAG &dag, const Internal::Autoscheduler::Adams2019Params ¶ms) { const int pipeline_feat_size = head1_w * head1_h; // We ignore the first seven pipeline features in the cost // model. It's just a mask of which types are in use. 
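    // (The seven skipped ints are PipelineFeatures::types_in_use -- one entry per
    // ScalarType, see Featurization.h -- which only exists to keep debug dumps
    // readable and is not fed to the network.)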
static_assert(sizeof(PipelineFeatures) - 7 * sizeof(int) == sizeof(int) * pipeline_feat_size, "Incorrect size for pipeline features"); int num_stages = 0; for (const auto &n : dag.nodes) { if (!n.is_input) num_stages += (int)n.stages.size(); } Runtime::Buffer pipeline_features(head1_w, head1_h, num_stages); int stage = 0; for (const auto &n : dag.nodes) { if (n.is_input) continue; for (auto it = n.stages.rbegin(); it != n.stages.rend(); it++) { const auto &s = *it; const int *pipeline_feats = (const int *)(&(s.features)) + 7; // skip the first 7 features for (int i = 0; i < pipeline_feat_size; i++) { int x = i / 7; int y = i % 7; pipeline_features(x, y, stage) = pipeline_feats[i]; } stage += 1; } } internal_assert(stage == num_stages); pipeline_feat_queue = pipeline_features; internal_assert(params.parallelism > 0); num_cores = params.parallelism; } void DefaultCostModel::set_pipeline_features(const Runtime::Buffer &pipeline_feats, int n) { pipeline_feat_queue = pipeline_feats; internal_assert(n > 0); num_cores = n; } void DefaultCostModel::enqueue(const Internal::Autoscheduler::FunctionDAG &dag, const Halide::Internal::Autoscheduler::StageMapOfScheduleFeatures &schedule_feats, double *cost_ptr) { num_stages = (int)schedule_feats.size(); Runtime::Buffer schedule_features; // Tell the cost model about this state. It won't actually // evaluate it until we call evaluate_costs (or if it runs out // of internal buffer space), so that the evaluations can be // batched. enqueue(num_stages, &schedule_features, cost_ptr); // index of current stage whose features we are reading int stage = 0; // load schedule features into input buffer for (const auto &n : dag.nodes) { // Inputs are computed outside of the pipeline and don't count. if (n.is_input) continue; // The remaining stages are not yet // scheduled. Optimistically assume their internal costs // will not depend on the decisions made already, so // there's no point adding it on to the total because it's // the same across all states. An underestimate of the // cost for loading from these unscheduled stages is // already baked into the scheduled stages that consume // them. if (stage >= num_stages) break; // Load up the schedule features for all stages of this Func. for (auto it = n.stages.rbegin(); it != n.stages.rend(); it++) { internal_assert(schedule_feats.contains(&*it)) << n.func.name() << "\n"; const auto &feat = schedule_feats.get(&*it); for (size_t i = 0; i < ScheduleFeatures::num_features(); i++) { schedule_features(i, stage) = feat[i]; } stage += 1; } } // Check we considered everything we were supposed to. 
internal_assert(stage == num_stages); } void DefaultCostModel::enqueue(int ns, Runtime::Buffer *schedule_feats, double *cost_ptr) { num_stages = ns; // We know the most stages that will ever be enqueued from the pipeline features internal_assert(pipeline_feat_queue.data() && "Call set_pipeline_features before calling enqueue\n"); const int max_num_stages = pipeline_feat_queue.dim(2).extent(); internal_assert(num_stages <= max_num_stages) << "schedule features has more stages (" << num_stages << ") than pipeline features (" << max_num_stages << ")\n"; const int batch_size = 1024; if (!schedule_feat_queue.data() || schedule_feat_queue.dim(2).extent() < max_num_stages) { internal_assert(cursor == 0); schedule_feat_queue = Runtime::Buffer(batch_size, head2_w, max_num_stages); if (!costs.data()) { internal_assert(!cost_ptrs.data()); costs = Runtime::Buffer(batch_size); cost_ptrs = Runtime::Buffer(batch_size); } } if (cursor == batch_size) { evaluate_costs(); } *schedule_feats = schedule_feat_queue.sliced(0, cursor); cost_ptrs(cursor) = cost_ptr; cursor++; } // namespace Halide // Backprop state. To run ADAM we need a running average of the // gradients and gradients squared. We add an outer dimension of // size 3 to the new weight outputs to track this state. So buf(_, // 0) is the new weight, buf(_, 1) is the ADAM running average of // the first moment, and buf(_, 2) is the ADAM running average of // the second moment. float DefaultCostModel::backprop(const Runtime::Buffer &true_runtimes, float learning_rate) { internal_assert(cursor != 0); internal_assert(pipeline_feat_queue.data()); internal_assert(schedule_feat_queue.data()); auto loss = Runtime::Buffer::make_scalar(); if (!head1_filter_update.data()) { auto weight_update_buffer = [](const Runtime::Buffer &w) { std::vector size; for (int i = 0; i < w.dimensions(); i++) { size.push_back(w.dim(i).extent()); } size.push_back(4); auto buf = Runtime::Buffer(size); buf.fill(0.0f); return buf; }; head1_filter_update = weight_update_buffer(weights.head1_filter); head1_bias_update = weight_update_buffer(weights.head1_bias); head2_filter_update = weight_update_buffer(weights.head2_filter); head2_bias_update = weight_update_buffer(weights.head2_bias); conv1_filter_update = weight_update_buffer(weights.conv1_filter); conv1_bias_update = weight_update_buffer(weights.conv1_bias); timestep = 0; } Runtime::Buffer dst = costs.cropped(0, 0, cursor); int fastest_idx = 0; for (int i = 0; i < cursor; i++) { if (true_runtimes(i) < true_runtimes(fastest_idx)) { fastest_idx = i; } } int result = train_cost_model(num_stages, cursor, num_cores, pipeline_feat_queue, schedule_feat_queue, weights.head1_filter, weights.head1_bias, weights.head2_filter, weights.head2_bias, weights.conv1_filter, weights.conv1_bias, learning_rate, timestep++, fastest_idx, true_runtimes.alias(), head1_filter_update, head1_bias_update, head2_filter_update, head2_bias_update, conv1_filter_update, conv1_bias_update, dst, loss); (void)result; internal_assert(result == 0); bool any_nans = false; for (int i = 0; i < cursor; i++) { internal_assert(cost_ptrs(i)); *(cost_ptrs(i)) = dst(i); if (std::isnan(dst(i))) { any_nans = true; aslog(1) << "Prediction " << i << " is NaN. 
True runtime is " << true_runtimes(i) << "\n"; aslog(1) << "Checking pipeline features for NaNs...\n"; pipeline_feat_queue.for_each_value([&](float f) { if (std::isnan(f)) abort(); }); aslog(1) << "None found\n"; aslog(1) << "Checking schedule features for NaNs...\n"; schedule_feat_queue.for_each_value([&](float f) { if (std::isnan(f)) abort(); }); aslog(1) << "None found\n"; aslog(1) << "Checking network weights for NaNs...\n"; weights.for_each_buffer([&](const Runtime::Buffer &buf) { buf.for_each_value([&](float f) { if (std::isnan(f)) abort(); }); }); aslog(1) << "None found\n"; } internal_assert(true_runtimes(i) > 0); } if (any_nans) { abort(); } // Update weights locally auto update_weight = [](const Runtime::Buffer &src, Runtime::Buffer &dst) { dst.copy_from(src.sliced(src.dimensions() - 1, 0)); }; update_weight(head1_filter_update, weights.head1_filter); update_weight(head1_bias_update, weights.head1_bias); update_weight(head2_filter_update, weights.head2_filter); update_weight(head2_bias_update, weights.head2_bias); update_weight(conv1_filter_update, weights.conv1_filter); update_weight(conv1_bias_update, weights.conv1_bias); internal_assert(cursor != 0); return loss(); } void DefaultCostModel::evaluate_costs() { if (cursor == 0 || !schedule_feat_queue.data()) { return; } internal_assert(pipeline_feat_queue.data()); internal_assert(schedule_feat_queue.data()); Runtime::Buffer dst = costs.cropped(0, 0, cursor); auto loss = Runtime::Buffer::make_scalar(); int result = cost_model(num_stages, cursor, num_cores, pipeline_feat_queue, schedule_feat_queue, weights.head1_filter, weights.head1_bias, weights.head2_filter, weights.head2_bias, weights.conv1_filter, weights.conv1_bias, 0.0f, 0, 0, nullptr, dst, loss); (void)result; internal_assert(result == 0); for (int i = 0; i < cursor; i++) { internal_assert(cost_ptrs(i)); *(cost_ptrs(i)) = dst(i); } cursor = 0; } void DefaultCostModel::load_weights() { bool need_randomize = randomize_weights; if (weights_in_path.empty()) { aslog(1) << "Loading weights from built-in data...\n"; // This copy shouldn't be necessary, but std::istream in C++ doesn't seem // to have a convenient wrap-around-constant-data variant... and since // this isn't much data, just copy it. const std::string baseline_weights_data((const char *)&baseline_weights[0], baseline_weights_length); std::istringstream i(baseline_weights_data); if (!weights.load(i)) { std::cerr << "The built-in baseline weights should never fail to load\n"; internal_assert(0); } } else if (ends_with(weights_in_path, ".weights")) { aslog(1) << "Loading weights from " << weights_in_path << " ...\n"; if (!weights.load_from_file(weights_in_path)) { // Emit to cout (rather than cerr) because the latter is hidden during the autotune loop, // and we want this to be seen. std::cout << "WARNING, error in reading weights from " << weights_in_path << ", randomizing...\n"; need_randomize = true; } } else { aslog(1) << "Loading weights from directory " << weights_in_path << " ...\n"; std::cerr << "Loading weights from a directory is deprecated; please convert to a .weights file\n"; if (!weights.load_from_dir(weights_in_path)) { std::cout << "WARNING, error in reading weights from " << weights_in_path << ", randomizing...\n"; need_randomize = true; } } if (!need_randomize && weights.pipeline_features_version != PipelineFeatures::version()) { // Emit to cout (rather than cerr) because the latter is hidden during the autotune loop, // and we want this to be seen. 
std::cout << "WARNING: loaded weights have pipeline_version = " << weights.pipeline_features_version << " but current pipeline_version is " << PipelineFeatures::version() << "; the weights may be invalid. Using anyway.\n"; } if (!need_randomize && weights.schedule_features_version != ScheduleFeatures::version()) { // Emit to cout (rather than cerr) because the latter is hidden during the autotune loop, // and we want this to be seen. std::cout << "WARNING: loaded weights have schedule_features_version = " << weights.schedule_features_version << " but current schedule_features_version is " << ScheduleFeatures::version() << "; the weights may be invalid. Using anyway.\n"; } if (need_randomize) { auto seed = time(nullptr); std::cout << "Randomizing weights using seed = " << seed << "\n"; weights.randomize((uint32_t)seed); } // Update so that any version of this we save will have the current version weights.pipeline_features_version = PipelineFeatures::version(); weights.schedule_features_version = ScheduleFeatures::version(); } void DefaultCostModel::save_weights() { internal_assert(!weights_out_path.empty()) << "Unable to save weights: no output path specified\n"; if (ends_with(weights_out_path, ".weights")) { internal_assert(weights.save_to_file(weights_out_path)) << "Unable to save weights to file: " << weights_out_path << "\n"; } else { std::cerr << "Saving weights to a directory is deprecated; please convert to a .weights file\n"; internal_assert(weights.save_to_dir(weights_out_path)) << "Unable to save weights to file: " << weights_out_path << "\n"; } } // Discard any enqueued but unevaluated schedules void DefaultCostModel::reset() { cursor = 0; } std::unique_ptr make_default_cost_model(const std::string &weights_in_path, const std::string &weights_out_path, bool randomize_weights) { return std::unique_ptr(new DefaultCostModel(weights_in_path, weights_out_path, randomize_weights)); } } // namespace Halide Halide-17.0.1/src/autoschedulers/adams2019/DefaultCostModel.h000066400000000000000000000052171456515664200236020ustar00rootroot00000000000000#ifndef DEFAULT_COST_MODEL_H #define DEFAULT_COST_MODEL_H #include "CostModel.h" #include "Weights.h" #include namespace Halide { namespace Internal { namespace Autoscheduler { struct Adams2019Params; } // namespace Autoscheduler } // namespace Internal class DefaultCostModel : public CostModel { private: Internal::Weights weights; Runtime::Buffer schedule_feat_queue, pipeline_feat_queue, costs; Runtime::Buffer cost_ptrs; int cursor, num_stages, num_cores; const std::string weights_in_path, weights_out_path; const bool randomize_weights; Runtime::Buffer head1_filter_update, head1_bias_update, head2_filter_update, head2_bias_update, conv1_filter_update, conv1_bias_update; int timestep = 0; public: DefaultCostModel(const std::string &weights_in_path, const std::string &weights_out_path, bool randomize_weights) : weights_in_path(weights_in_path), weights_out_path(weights_out_path), randomize_weights(randomize_weights) { load_weights(); } ~DefaultCostModel() override = default; // Configure the cost model for the algorithm to be scheduled. void set_pipeline_features(const Internal::Autoscheduler::FunctionDAG &dag, const Internal::Autoscheduler::Adams2019Params ¶ms) override; void set_pipeline_features(const Runtime::Buffer &, int n); // Enqueue a schedule to be evaluated. The second version of this method returns a buffer of // schedule_features that should be filled in by the caller. 
void enqueue(const Internal::Autoscheduler::FunctionDAG &dag, const Halide::Internal::Autoscheduler::StageMapOfScheduleFeatures &schedule_feats, double *cost_ptr) override; void enqueue(int ns, Runtime::Buffer *schedule_feats, double *cost_ptr); // Evaluate all schedules in the queue. void evaluate_costs() override; // Discard all schedules in the queue. void reset() override; // Update model weights using true measured runtimes. float backprop(const Runtime::Buffer &true_runtimes, float learning_rate); // Save/Load the model weights to/from disk. void save_weights(); void load_weights(); }; std::unique_ptr make_default_cost_model(const std::string &weights_in_dir = "", const std::string &weights_out_dir = "", bool randomize_weights = false); } // namespace Halide #endif // DEFAULT_COST_MODEL_H Halide-17.0.1/src/autoschedulers/adams2019/Featurization.h000066400000000000000000000405371456515664200232340ustar00rootroot00000000000000#ifndef FEATURIZATION_H #define FEATURIZATION_H #include #include #include namespace Halide { namespace Internal { // The algorithm-specific features. For legacy reasons these are // called PipelineFeatures in the code. struct PipelineFeatures { static constexpr size_t num_features() { return sizeof(PipelineFeatures) / sizeof(int); } static constexpr uint32_t version() { return 3; } // Access them by index. int &operator[](int idx) { return ((int *)(this))[idx]; } int operator[](int idx) const { return ((const int *)(this))[idx]; } enum class OpType { Const, Cast, Variable, Param, Add, Sub, Mod, Mul, Div, Min, Max, EQ, NE, LT, LE, And, Or, Not, Select, ImageCall, // Loads to an input buffer FuncCall, // Calls to another pipeline stage SelfCall, // Recursive calls from a Func to itself ExternCall, // Math intrinsics, typically Let, NumOpTypes }; enum class ScalarType { Bool, UInt8, // or Int8 UInt16, // or Int16 UInt32, // or Int32 UInt64, // or Int64 Float, Double, NumScalarTypes }; // Not fed into the network, but helps avoid printing huge numbers of zeros while debugging things int types_in_use[(int)ScalarType::NumScalarTypes] = {}; int op_histogram[(int)OpType::NumOpTypes][(int)ScalarType::NumScalarTypes] = {}; enum class AccessType { LoadFunc, LoadSelf, LoadImage, Store, NumAccessTypes }; // Finer granularity call/store node properties. These are a // function of the matrix of derivatives of each arg to a // call w.r.t the loop variables of the Stage. Each row of // the matrix corresponds to one of the call arguments. In // each case we illustrate such a call, assuming that the // variables of this Func are x, y, z, and that the // dimension vectorized over is the first (x). // Square identity matrix. f(x - 2, y + 8, z + param) int pointwise_accesses[(int)AccessType::NumAccessTypes][(int)ScalarType::NumScalarTypes] = {}; // Square permutation matrix. f(y + 1, z - 3, x) int transpose_accesses[(int)AccessType::NumAccessTypes][(int)ScalarType::NumScalarTypes] = {}; // Each row sums to 1. Each column sums to 1 or 0. f(y, x) int broadcast_accesses[(int)AccessType::NumAccessTypes][(int)ScalarType::NumScalarTypes] = {}; // Each row sums to 1 or 0. Each column sums to 1. 
f(z, y, x, 4) int slice_accesses[(int)AccessType::NumAccessTypes][(int)ScalarType::NumScalarTypes] = {}; void dump(std::ostream &os) const { for (int i = 0; i < (int)ScalarType::NumScalarTypes; i++) { const char *type_names[] = {"Bool", "UInt8", "UInt16", "UInt32", "UInt64", "Float", "Double"}; // Skip printing for types not used if (!types_in_use[i]) { continue; } os << " Featurization for type " << type_names[i] << "\n" << " Op histogram:\n" << " Constant: " << op_histogram[(int)OpType::Const][i] << "\n" << " Cast: " << op_histogram[(int)OpType::Cast][i] << "\n" << " Variable: " << op_histogram[(int)OpType::Variable][i] << "\n" << " Param: " << op_histogram[(int)OpType::Param][i] << "\n" << " Add: " << op_histogram[(int)OpType::Add][i] << "\n" << " Sub: " << op_histogram[(int)OpType::Sub][i] << "\n" << " Mod: " << op_histogram[(int)OpType::Mod][i] << "\n" << " Mul: " << op_histogram[(int)OpType::Mul][i] << "\n" << " Div: " << op_histogram[(int)OpType::Div][i] << "\n" << " Min: " << op_histogram[(int)OpType::Min][i] << "\n" << " Max: " << op_histogram[(int)OpType::Max][i] << "\n" << " EQ: " << op_histogram[(int)OpType::EQ][i] << "\n" << " NE: " << op_histogram[(int)OpType::NE][i] << "\n" << " LT: " << op_histogram[(int)OpType::LT][i] << "\n" << " LE: " << op_histogram[(int)OpType::LE][i] << "\n" << " And: " << op_histogram[(int)OpType::And][i] << "\n" << " Or: " << op_histogram[(int)OpType::Or][i] << "\n" << " Not: " << op_histogram[(int)OpType::Not][i] << "\n" << " Select: " << op_histogram[(int)OpType::Select][i] << "\n" << " ImageCall: " << op_histogram[(int)OpType::ImageCall][i] << "\n" << " FuncCall: " << op_histogram[(int)OpType::FuncCall][i] << "\n" << " SelfCall: " << op_histogram[(int)OpType::SelfCall][i] << "\n" << " ExternCall: " << op_histogram[(int)OpType::ExternCall][i] << "\n" << " Let: " << op_histogram[(int)OpType::Let][i] << "\n" << " Memory access patterns. Columns are calls to other Funcs, self-calls, input image access, and stores\n" << " Pointwise: " << pointwise_accesses[0][i] << " " << pointwise_accesses[1][i] << " " << pointwise_accesses[2][i] << " " << pointwise_accesses[3][i] << "\n" << " Transpose: " << transpose_accesses[0][i] << " " << transpose_accesses[1][i] << " " << transpose_accesses[2][i] << " " << transpose_accesses[3][i] << "\n" << " Broadcast: " << broadcast_accesses[0][i] << " " << broadcast_accesses[1][i] << " " << broadcast_accesses[2][i] << " " << broadcast_accesses[3][i] << "\n" << " Slice: " << slice_accesses[0][i] << " " << slice_accesses[1][i] << " " << slice_accesses[2][i] << " " << slice_accesses[3][i] << "\n"; } } }; // The schedule-dependent portion of the featurization of a stage struct ScheduleFeatures { static constexpr size_t num_features() { return sizeof(ScheduleFeatures) / sizeof(double); } static constexpr uint32_t version() { return 3; } double &operator[](int idx) { return ((double *)(this))[idx]; } double operator[](int idx) const { return ((const double *)(this))[idx]; } // The number of times storage for this stage is allocated. The // product of outer loops at store_at site double num_realizations = 0; // The number of times a tile of the stage is computed. The // product of outer loops at compute_at site. Always at least as // large as num_realizations. double num_productions = 0; // Number of times the innermost loop happens per allocation. double points_computed_per_realization = 0; // Number of times the innermost stmt happens per tile computed. 
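    // Worked example (illustrative numbers only): a stage computed inside a consumer
    // tile loop that runs 256 times, producing a 34x34 region per tile, would have
    // num_productions = 256 and points_computed_per_production = 34 * 34 = 1156.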
double points_computed_per_production = 0; // The total trip count of the innermost loop over the entire program. // == num_realizations * points_computed_per_realization // ~= num_productions * points_computed_per_production // Only approximately equal because of the simplifications made // regarding the modeling of sliding window double points_computed_total = 0; // The minimum number of points that are actually required to be // computed to produce a correct output. Not actually a function // of the schedule, but a useful reference point to see if a // schedule has gone off the rails. double points_computed_minimum = 0; // Trip count of innermost loop nest. double innermost_loop_extent = 0; // Trip count of just the pure loops in the innermost loop // (i.e. excludes loops representing reductions). double innermost_pure_loop_extent = 0; // If this is to be unrolled, what is the product of the unrolling // factors. double unrolled_loop_extent = 0; // The number of parallel jobs launched in the production of this // stage. Always 1 unless the Func is compute_root, because we // place all parallelism at the outermost level. double inner_parallelism = 0; // The number of times this Func could be realized in parallel. 1 // when the Func is compute_root. Product of the containing // parallel loops for other stages. double outer_parallelism = 0; // Size of the region computed at the store_at site, measured in // bytes. Does not take storage-folding optimizations into account. double bytes_at_realization = 0; // Size of the region computed per tile (at the compute_at site), // measured in bytes. This includes the effect of storage-folding, // so it's a better number to look at to estimate memory usage. double bytes_at_production = 0; // If the stage were hypothetically scheduled at root, how much // memory would it consumed. Doesn't vary w.r.t. the schedule, but // a useful reference. double bytes_at_root = 0; // Same as the above, but only measuring the extent along the // innermost dimension, so that we can reason about spatial // locality, cache lines, prefetchers, etc. double innermost_bytes_at_realization = 0; double innermost_bytes_at_production = 0; double innermost_bytes_at_root = 0; // For inlined Funcs, how many calls are made to this Func total. double inlined_calls = 0; // Number of unique bytes and unique continguous segments of // memory loaded from all inputs over a single trip of the loop // containing the allocation site. double unique_bytes_read_per_realization = 0; double unique_lines_read_per_realization = 0; // The sum of the sizes of the allocations accessed at this // site. Gives a hint as to the likely locality of it. double allocation_bytes_read_per_realization = 0; // The sum of the sizes of the temporary allocations while // computing one tile of this Func. Probably a good thing if it // fits in cache. double working_set = 0; // The vectorization factor (#simd lanes) to be used to compute // this stage. Wasted work if it's smaller than the stage's native // vector size. double vector_size = 0; // The native vector size for the narrowest type used. Does not // vary with the schedule, but a useful reference point. double native_vector_size = 0; // Number of SIMD vectors computed double num_vectors = 0; // Number of scalars computed (e.g. from tails of loops) double num_scalars = 0; // The number of loads done per vector or scalar computed. Vector // gathers count as a batch of scalar loads. 
These get amortized // across unrolled blocks if some loads can be reused across the // unrolled dimension. double scalar_loads_per_vector = 0; double vector_loads_per_vector = 0; double scalar_loads_per_scalar = 0; // The memory footprint written over one per parallel task. The // union of the regions if the stage is computed at finer // granularity that one parallel task of some consumer. double bytes_at_task = 0; double innermost_bytes_at_task = 0; // The memory footprint accessed while computing a single vector. double unique_bytes_read_per_vector = 0; double unique_lines_read_per_vector = 0; // The memory footprint accessed per parallel task. Only counts // loads from things computed outside of that parallel task (to // measure the amount of traffic coming from another core). double unique_bytes_read_per_task = 0; double unique_lines_read_per_task = 0; // The sum of the sizes of all live allocations at various sites. double working_set_at_task = 0; double working_set_at_production = 0; double working_set_at_realization = 0; double working_set_at_root = 0; void dump(std::ostream &os) const { os << " num_realizations: " << num_realizations << "\n" << " num_productions: " << num_productions << "\n" << " points_computed_per_realization: " << points_computed_per_realization << "\n" << " points_computed_per_production: " << points_computed_per_production << "\n" << " points_computed_total: " << points_computed_total << "\n" << " points_computed_minimum: " << points_computed_minimum << "\n" << " innermost_loop_extent: " << innermost_loop_extent << "\n" << " innermost_pure_loop_extent: " << innermost_pure_loop_extent << "\n" << " unrolled_loop_extent: " << unrolled_loop_extent << "\n" << " inner_parallelism: " << inner_parallelism << "\n" << " outer_parallelism: " << outer_parallelism << "\n" << " bytes_at_realization: " << bytes_at_realization << "\n" << " bytes_at_production: " << bytes_at_production << "\n" << " bytes_at_root: " << bytes_at_root << "\n" << " innermost_bytes_at_realization: " << innermost_bytes_at_realization << "\n" << " innermost_bytes_at_production: " << innermost_bytes_at_production << "\n" << " innermost_bytes_at_root: " << innermost_bytes_at_root << "\n" << " inlined_calls: " << inlined_calls << "\n" << " unique_bytes_read_per_realization: " << unique_bytes_read_per_realization << "\n" << " unique_lines_read_per_realization: " << unique_lines_read_per_realization << "\n" << " allocation_bytes_read_per_realization: " << allocation_bytes_read_per_realization << "\n" << " working_set: " << working_set << "\n" << " vector_size: " << vector_size << "\n" << " native_vector_size: " << native_vector_size << "\n" << " num_vectors: " << num_vectors << "\n" << " num_scalars: " << num_scalars << "\n" << " scalar_loads_per_vector: " << scalar_loads_per_vector << "\n" << " vector_loads_per_vector: " << vector_loads_per_vector << "\n" << " scalar_loads_per_scalar: " << scalar_loads_per_scalar << "\n" << " bytes_at_task: " << bytes_at_task << "\n" << " innermost_bytes_at_task: " << innermost_bytes_at_task << "\n" << " unique_bytes_read_per_vector: " << unique_bytes_read_per_vector << "\n" << " unique_lines_read_per_vector: " << unique_lines_read_per_vector << "\n" << " unique_bytes_read_per_task: " << unique_bytes_read_per_task << "\n" << " unique_lines_read_per_task: " << unique_lines_read_per_task << "\n" << " working_set_at_task: " << working_set_at_task << "\n" << " working_set_at_production: " << working_set_at_production << "\n" << " working_set_at_realization: " << 
working_set_at_realization << "\n" << " working_set_at_root: " << working_set_at_root << "\n"; } bool equal(const ScheduleFeatures &other) const { const size_t n_features = ScheduleFeatures::num_features(); for (size_t i = 0; i < n_features; i++) { if ((*this)[i] != other[i]) { return false; } } return true; } }; /* Some feature values cannot be cached, and need to be recomputed. These intermediates allow for faster recomputation of such features. */ struct FeatureIntermediates { double inlined_calls; double num_scalars; double innermost_pure_loop_extent; double outer_parallelism; }; } // namespace Internal } // namespace Halide #endif Halide-17.0.1/src/autoschedulers/adams2019/FunctionDAG.cpp000066400000000000000000001302041456515664200230330ustar00rootroot00000000000000#include "FunctionDAG.h" #include #include "ASLog.h" namespace Halide { namespace Internal { template<> RefCount &ref_count(const Autoscheduler::BoundContents *t) noexcept { return t->ref_count; } template<> void destroy(const Autoscheduler::BoundContents *t) { // Release it back into the memory pool to be reused t->layout->release(t); } namespace Autoscheduler { namespace { class Featurizer : public IRVisitor { using IRVisitor::visit; Function &func; FunctionDAG::Node::Stage &stage; int &op_bucket(PipelineFeatures::OpType op_type, Type scalar_type) { int type_bucket = (int)classify_type(scalar_type); stage.features.types_in_use[type_bucket] = true; return stage.features.op_histogram[(int)op_type][type_bucket]; } PipelineFeatures::ScalarType classify_type(Type t) { if (t.is_float() && t.bits() > 32) { return PipelineFeatures::ScalarType::Double; } else if (t.is_float()) { return PipelineFeatures::ScalarType::Float; } else if (t.bits() == 1) { return PipelineFeatures::ScalarType::Bool; } else if (t.bits() <= 8) { return PipelineFeatures::ScalarType::UInt8; } else if (t.bits() <= 16) { return PipelineFeatures::ScalarType::UInt16; } else if (t.bits() <= 32) { return PipelineFeatures::ScalarType::UInt32; } else { return PipelineFeatures::ScalarType::UInt64; } } void visit(const Variable *op) override { if (op->param.defined()) { op_bucket(PipelineFeatures::OpType::Param, op->type)++; } else { op_bucket(PipelineFeatures::OpType::Variable, op->type)++; } } void visit(const IntImm *op) override { op_bucket(PipelineFeatures::OpType::Const, op->type)++; } void visit(const UIntImm *op) override { op_bucket(PipelineFeatures::OpType::Const, op->type)++; } void visit(const FloatImm *op) override { op_bucket(PipelineFeatures::OpType::Const, op->type)++; } void visit(const Add *op) override { op_bucket(PipelineFeatures::OpType::Add, op->type)++; IRVisitor::visit(op); } void visit(const Sub *op) override { op_bucket(PipelineFeatures::OpType::Sub, op->type)++; IRVisitor::visit(op); } void visit(const Mul *op) override { op_bucket(PipelineFeatures::OpType::Mul, op->type)++; IRVisitor::visit(op); } void visit(const Mod *op) override { op_bucket(PipelineFeatures::OpType::Mod, op->type)++; IRVisitor::visit(op); } void visit(const Div *op) override { op_bucket(PipelineFeatures::OpType::Div, op->type)++; IRVisitor::visit(op); } void visit(const Min *op) override { op_bucket(PipelineFeatures::OpType::Min, op->type)++; IRVisitor::visit(op); } void visit(const Max *op) override { op_bucket(PipelineFeatures::OpType::Max, op->type)++; IRVisitor::visit(op); } void visit(const EQ *op) override { op_bucket(PipelineFeatures::OpType::EQ, op->type)++; IRVisitor::visit(op); } void visit(const NE *op) override { op_bucket(PipelineFeatures::OpType::NE, 
op->type)++; IRVisitor::visit(op); } void visit(const LT *op) override { op_bucket(PipelineFeatures::OpType::LT, op->type)++; IRVisitor::visit(op); } void visit(const LE *op) override { op_bucket(PipelineFeatures::OpType::LE, op->type)++; IRVisitor::visit(op); } void visit(const GT *op) override { // Treat as a flipped LT op_bucket(PipelineFeatures::OpType::LT, op->type)++; IRVisitor::visit(op); } void visit(const GE *op) override { op_bucket(PipelineFeatures::OpType::LE, op->type)++; IRVisitor::visit(op); } void visit(const And *op) override { op_bucket(PipelineFeatures::OpType::And, op->type)++; IRVisitor::visit(op); } void visit(const Or *op) override { op_bucket(PipelineFeatures::OpType::Or, op->type)++; IRVisitor::visit(op); } void visit(const Not *op) override { op_bucket(PipelineFeatures::OpType::Not, op->type)++; IRVisitor::visit(op); } void visit(const Select *op) override { op_bucket(PipelineFeatures::OpType::Select, op->type)++; IRVisitor::visit(op); } Scope lets; void visit(const Let *op) override { ScopedBinding bind(lets, op->name, op->value); op_bucket(PipelineFeatures::OpType::Let, op->type)++; IRVisitor::visit(op); } void visit(const Call *op) override { IRVisitor::visit(op); if (op->call_type == Call::Halide) { if (op->name == func.name()) { visit_memory_access(op->name, op->type, op->args, PipelineFeatures::AccessType::LoadSelf); op_bucket(PipelineFeatures::OpType::SelfCall, op->type)++; } else { visit_memory_access(op->name, op->type, op->args, PipelineFeatures::AccessType::LoadFunc); op_bucket(PipelineFeatures::OpType::FuncCall, op->type)++; } } else if (op->call_type == Call::Extern || op->call_type == Call::PureExtern || op->call_type == Call::Intrinsic || op->call_type == Call::PureIntrinsic) { op_bucket(PipelineFeatures::OpType::ExternCall, op->type)++; } else if (op->call_type == Call::Image) { visit_memory_access(op->name, op->type, op->args, PipelineFeatures::AccessType::LoadImage); op_bucket(PipelineFeatures::OpType::ImageCall, op->type)++; } // TODO: separate out different math calls a little better (sqrt vs sin vs lerp) } // Take the derivative of an integer index expression. If it's // a rational constant, return it, otherwise return a sentinel // value. // The derivative of each let w.r.t each var. The keys are // just the var names separated by a space. Scope dlets; OptionalRational differentiate(const Expr &e, const string &v) { if (!expr_uses_var(e, v, lets)) { return {true, 0, 1}; } else if (const Variable *var = e.as()) { if (var->name == v) { return {true, 1, 1}; } for (const auto &l : stage.loop) { if (var->name == l.var) { // Some other loop variable return {true, 0, 1}; } } if (var->param.defined()) { // An argument return {true, 0, 1}; } else if (lets.contains(var->name)) { string key = v + " " + var->name; if (dlets.contains(key)) { return dlets.get(key); } auto a = differentiate(lets.get(var->name), v); dlets.push(key, a); return a; } // Some mystery variable. Who knows what it depends on. 
internal_error << "Encountered unbound variable in call args: " << var->name << "\n"; return {false, 0, 0}; } else if (const Add *op = e.as()) { auto a = differentiate(op->a, v); a += differentiate(op->b, v); return a; } else if (const Sub *op = e.as()) { auto a = differentiate(op->a, v); auto b = differentiate(op->b, v); b.numerator = -b.numerator; a += b; return a; } else if (const Mul *op = e.as()) { auto a = differentiate(op->a, v); if (const int64_t *ib = as_const_int(op->b)) { a.numerator *= *ib; return a; } else { return {false, 0, 0}; } } else if (const Div *op = e.as
()) { auto a = differentiate(op->a, v); if (const int64_t *ib = as_const_int(op->b)) { if (a.numerator != 0) { a.denominator *= *ib; } return a; } else { return {false, 0, 0}; } } else if (const Call *op = e.as()) { if (op->is_intrinsic(Call::likely)) { // TODO: Should a likely on one side of a min/max dominate? return differentiate(op->args[0], v); } } return {false, 0, 0}; } void visit_memory_access(const std::string &name, Type t, const vector &args, PipelineFeatures::AccessType type) { // Compute matrix of partial derivatives of args w.r.t. loop params vector> matrix; vector ones_per_row(args.size(), 0), zeros_per_row(args.size(), 0), ones_per_col(stage.loop.size(), 0), zeros_per_col(stage.loop.size(), 0); matrix.resize(args.size()); bool is_pointwise = args.size() == stage.loop.size(); for (size_t i = 0; i < args.size(); i++) { matrix[i].resize(stage.loop.size()); for (size_t j = 0; j < stage.loop.size(); j++) { auto deriv = differentiate(args[i], stage.loop[j].var); zeros_per_row[i] += deriv == 0; ones_per_row[i] += deriv == 1; zeros_per_col[j] += deriv == 0; ones_per_col[j] += deriv == 1; is_pointwise &= (i == j ? deriv == 1 : deriv == 0); matrix[i][j] = deriv; } } bool is_transpose = (args.size() == stage.loop.size()); bool is_broadcast = true, is_slice = true; for (size_t i = 0; i < args.size(); i++) { bool single_one = (ones_per_row[i] == 1) && (zeros_per_row[i] == stage.loop.size() - 1); bool all_zero = (zeros_per_row[i] == stage.loop.size()); is_transpose &= single_one; is_broadcast &= single_one; is_slice &= single_one || all_zero; } for (size_t j = 0; j < stage.loop.size(); j++) { bool single_one = (ones_per_col[j] == 1) && (zeros_per_col[j] == args.size() - 1); bool all_zero = (zeros_per_col[j] == args.size()); is_transpose &= single_one || all_zero; is_broadcast &= single_one; is_slice &= single_one; } auto type_class = classify_type(t); stage.features.pointwise_accesses[(int)type][(int)type_class] += is_pointwise; stage.features.transpose_accesses[(int)type][(int)type_class] += is_transpose; stage.features.broadcast_accesses[(int)type][(int)type_class] += is_broadcast; stage.features.slice_accesses[(int)type][(int)type_class] += is_slice; for (auto *e : stage.incoming_edges) { if (e->producer->func.name() == name) { // The same name can be encountered multiple times // (e.g. 
a+a, where a is a trivial function), // so we can't use std::move(matrix) here without making a copy vector> copy = matrix; e->add_load_jacobian(LoadJacobian(std::move(copy))); } } } public: Featurizer(Function &func, FunctionDAG::Node::Stage &stage) : func(func), stage(stage) { } void visit_store_args(const std::string &name, Type t, vector args) { for (auto &e : args) { e = common_subexpression_elimination(simplify(e)); // Get things into canonical form } visit_memory_access(name, t, args, PipelineFeatures::AccessType::Store); } }; } // namespace void LoadJacobian::dump(std::ostream &os, const char *prefix) const { if (count() > 1) { os << prefix << count() << " x\n"; } for (size_t i = 0; i < producer_storage_dims(); i++) { os << prefix << " ["; for (size_t j = 0; j < consumer_loop_dims(); j++) { const auto &c = (*this)(i, j); if (!c.exists) { os << " _ "; } else if (c.denominator == 1) { os << " " << c.numerator << " "; } else { os << c.numerator << "/" << c.denominator << " "; } } os << "]\n"; } os << "\n"; } void BoundContents::validate() const { for (int i = 0; i < layout->total_size; i++) { auto p = data()[i]; if (p.max() < p.min()) { std::ostringstream err; err << "Bad bounds object:\n"; for (int j = 0; j < layout->total_size; j++) { if (i == j) { err << "=> "; } else { err << " "; } err << j << ": " << data()[j].min() << ", " << data()[j].max() << "\n"; } err << "Aborting"; internal_error << err.str(); } } } BoundContents::Layout::~Layout() { internal_assert(num_live == 0) << "Destroying a Layout without returning all the BoundContents. " << num_live << " are still live\n"; for (auto *b : pool) { b->~BoundContents(); } for (auto *b : blocks) { free(b); } } void BoundContents::Layout::allocate_some_more() const { size_t size_of_one = sizeof(BoundContents) + total_size * sizeof(Span); const size_t number_per_block = std::max((size_t)8, 4096 / size_of_one); // Make a page of them, or 8, whichever is larger. 
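        // Worked example (illustrative numbers only): if size_of_one came to 300 bytes,
        // number_per_block would be max(8, 4096 / 300) == 13, and bytes_to_allocate
        // below would be max(13 * 300, 4096) == 4096, i.e. one page.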
const size_t bytes_to_allocate = std::max(size_of_one * number_per_block, (size_t)4096); unsigned char *mem = (unsigned char *)malloc(bytes_to_allocate); blocks.push_back(mem); static_assert((sizeof(BoundContents) & 7) == 0, "BoundContents header is not aligned"); for (size_t i = 0; i < number_per_block; i++) { BoundContents *b = (BoundContents *)(mem + i * size_of_one); new (b) BoundContents; b->layout = this; pool.push_back(b); } internal_assert(((unsigned char *)(pool[0]) + size_of_one) == (unsigned char *)(pool[1])); } BoundContents *BoundContents::Layout::make() const { if (pool.empty()) { allocate_some_more(); } BoundContents *b = pool.back(); pool.pop_back(); num_live++; return b; } void BoundContents::Layout::release(const BoundContents *b) const { internal_assert(b->layout == this) << "Releasing BoundContents onto the wrong pool!"; b->~BoundContents(); pool.push_back(const_cast(b)); num_live--; } void FunctionDAG::Node::loop_nest_for_region(int stage_idx, const Span *computed, Span *loop) const { const auto &s = stages[stage_idx]; map computed_map; if (!s.loop_nest_all_common_cases) { for (int i = 0; i < func.dimensions(); i++) { computed_map[region_required[i].min.name()] = (int)computed[i].min(); computed_map[region_required[i].max.name()] = (int)computed[i].max(); } } for (size_t i = 0; i < s.loop.size(); i++) { const auto &l = s.loop[i]; if (l.equals_region_computed) { loop[i] = computed[l.region_computed_dim]; } else if (l.bounds_are_constant) { loop[i] = Span(l.c_min, l.c_max, true); } else { Expr min = simplify(substitute(computed_map, l.min)); Expr max = simplify(substitute(computed_map, l.max)); const int64_t *imin = as_const_int(min); const int64_t *imax = as_const_int(max); internal_assert(imin && imax) << min << ", " << max << "\n"; loop[i] = Span(*imin, *imax, false); } } } void FunctionDAG::Node::required_to_computed(const Span *required, Span *computed) const { map required_map; if (!region_computed_all_common_cases) { // Make a binding for the value of each symbolic variable for (int i = 0; i < func.dimensions(); i++) { required_map[region_required[i].min.name()] = (int)required[i].min(); required_map[region_required[i].max.name()] = (int)required[i].max(); } } for (int i = 0; i < func.dimensions(); i++) { const auto &comp = region_computed[i]; if (comp.equals_required) { computed[i] = required[i]; } else if (comp.equals_union_of_required_with_constants) { computed[i] = Span(std::min(required[i].min(), comp.c_min), std::max(required[i].max(), comp.c_max), false); } else { Expr min = simplify(substitute(required_map, comp.in.min)); Expr max = simplify(substitute(required_map, comp.in.max)); const int64_t *imin = as_const_int(min); const int64_t *imax = as_const_int(max); internal_assert(imin && imax) << min << ", " << max << "\n"; computed[i] = Span(*imin, *imax, false); } } } FunctionDAG::Edge::BoundInfo::BoundInfo(const Expr &e, const Node::Stage &consumer, bool dependent) : expr(e), depends_on_estimate(dependent) { // Do the analysis to detect if this is a simple case // that can be evaluated more cheaply. Currently this // acceleration recognises affine expressions. In the // future we may consider quasi-affine, or even // piecewise-quasi-affine. If the bounds are // non-affine, we use the symbolic expression. const Add *add = expr.as(); const Mul *mul = add ? add->a.as() : expr.as(); const IntImm *coeff_imm = mul ? mul->b.as() : nullptr; const IntImm *constant_imm = add ? add->b.as() : nullptr; // clang-format off Expr v = (mul ? mul->a : add ? 
add->a : expr); // clang-format on const Variable *var = v.as(); if (const IntImm *c = e.as()) { affine = true; coeff = 0; constant = c->value; } else if (var && (!mul || coeff_imm) && (!add || constant_imm)) { affine = true; coeff = mul ? coeff_imm->value : 1; constant = add ? constant_imm->value : 0; consumer_dim = -1; for (int i = 0; i < (int)consumer.loop.size(); i++) { const auto &in = consumer.loop[i]; if (var->name == consumer.node->func.name() + "." + in.var + ".min") { consumer_dim = i; uses_max = false; break; } else if (var->name == consumer.node->func.name() + "." + in.var + ".max") { consumer_dim = i; uses_max = true; break; } } internal_assert(consumer_dim >= 0) << "Could not find consumer loop variable: " << var->name << "\n"; aslog(2) << "Bound is affine: " << e << " == " << var->name << " * " << coeff << " + " << constant << "\n"; } else { affine = false; aslog(2) << "Bound is non-affine: " << e << "\n"; } } void FunctionDAG::Edge::add_load_jacobian(LoadJacobian j1) { for (auto &j2 : load_jacobians) { if (j2.merge(j1)) { return; } } load_jacobians.emplace_back(std::move(j1)); } void FunctionDAG::Edge::expand_footprint(const Span *consumer_loop, Span *producer_required) const { // Create a map from the symbolic loop variables to the actual loop size const auto &symbolic_loop = consumer->loop; map s; if (!all_bounds_affine) { for (size_t i = 0; i < symbolic_loop.size(); i++) { auto p = consumer_loop[i]; const string &var = symbolic_loop[i].var; s[consumer->node->func.name() + "." + var + ".min"] = (int)p.min(); s[consumer->node->func.name() + "." + var + ".max"] = (int)p.max(); } } // Apply that map to the bounds relationship encoded // in the edge to expand the bounds of the producer to // satisfy the consumer for (int i = 0; i < producer->func.dimensions(); i++) { // Get bounds required of this dimension of the // producer in terms of a symbolic region of the // consumer. bool bounds_are_constant = true; auto eval_bound = [&](const BoundInfo &b) { bounds_are_constant &= !b.depends_on_estimate; if (b.affine) { // Common-case performance optimization if (b.coeff == 0) { return b.constant; } else { const auto &src_pair = consumer_loop[b.consumer_dim]; int64_t src = b.uses_max ? src_pair.max() : src_pair.min(); bounds_are_constant &= src_pair.constant_extent(); return src * b.coeff + b.constant; } } else { Expr substituted = substitute(s, b.expr); Expr e = simplify(substituted); const int64_t *i = as_const_int(e); internal_assert(i) << "Should be constant: " << b.expr << " -> " << substituted << " -> " << e << "\n"; bounds_are_constant = false; return *i; } }; int64_t a = eval_bound(bounds[i].first); int64_t b = eval_bound(bounds[i].second); producer_required[i].union_with(Span(a, b, bounds_are_constant)); } } class DependsOnEstimate : public IRVisitor { public: bool found_estimate = false; private: using IRVisitor::visit; void visit(const Variable *op) override { found_estimate |= op->param.defined(); } }; bool depends_on_estimate(const Expr &expr) { DependsOnEstimate dependency_checker; expr.accept(&dependency_checker); return dependency_checker.found_estimate; } FunctionDAG::FunctionDAG(const vector &outputs, const Target &target) { map env = build_environment(outputs); // A mutator to apply parameter estimates to the expressions // we encounter while constructing the graph. 
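    // For example (illustrative only): given an input ImageParam whose estimates say
    // dimension 0 starts at 0 and has extent 1024, the symbolic variables
    // "<param>.min.0" and "<param>.extent.0" are replaced by 0 and 1024 here, and a
    // scalar Param is replaced by its estimate().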
class ApplyParamEstimates : public IRMutator { using IRMutator::visit; Expr visit(const Variable *op) override { Expr expr; if (op->param.defined()) { if (!op->param.is_buffer()) { expr = op->param.estimate(); } else { for (int i = 0; i < op->param.dimensions(); i++) { if (op->name == op->param.name() + ".min." + std::to_string(i)) { expr = op->param.min_constraint_estimate(i); } else if (op->name == op->param.name() + ".extent." + std::to_string(i)) { expr = op->param.extent_constraint_estimate(i); } } } internal_assert(expr.defined()) << "Missing estimate for " << op->name << "\n"; return expr; } else { return op; } } } apply_param_estimates; // Compute a realization order vector order = topological_order(outputs, env); // Construct the mapping from Funcs to Nodes nodes.resize(order.size()); map node_map; for (size_t i = 0; i < order.size(); i++) { Function f = env[order[order.size() - i - 1]]; nodes[i].func = f; nodes[i].id = (int)i; nodes[i].max_id = (int)order.size(); nodes[i].dag = this; node_map[f] = &nodes[i]; } int stage_count = 0; for (size_t i = order.size(); i > 0; i--) { Node &node = nodes[order.size() - i]; Function consumer = node.func; Scope scope; // Create a symbolic region for this Func. for (int j = 0; j < consumer.dimensions(); j++) { Halide::Var min_var(consumer.name() + "." + consumer.args()[j] + ".min"); Halide::Var max_var(consumer.name() + "." + consumer.args()[j] + ".max"); Interval interval(min_var, max_var); scope.push(consumer.args()[j], interval); node.region_required.emplace_back(SymbolicInterval{min_var, max_var}); } auto pure_args = node.func.args(); for (int s = 0; s <= (int)consumer.updates().size(); s++) { stage_count++; if (s == 0) { node.stages.emplace_back(Stage(consumer, consumer.definition(), 0)); } else { node.stages.emplace_back(Stage(consumer, consumer.update(s - 1), s)); } } for (int s = 0; s <= (int)consumer.updates().size(); s++) { auto &stage = node.stages[s]; stage.node = &node; stage.name = consumer.name(); if (s > 0) { stage.name += ".update(" + std::to_string(s - 1) + ")"; } const Definition &def = (s == 0) ? consumer.definition() : consumer.update(s - 1); const StageSchedule &sched = def.schedule(); Scope stage_scope_with_concrete_rvar_bounds, stage_scope_with_symbolic_rvar_bounds; stage_scope_with_concrete_rvar_bounds.set_containing_scope(&scope); stage_scope_with_symbolic_rvar_bounds.set_containing_scope(&scope); for (const auto &rv : sched.rvars()) { Expr min = simplify(apply_param_estimates.mutate(rv.min)); Expr max = simplify(apply_param_estimates.mutate(rv.min + rv.extent - 1)); stage_scope_with_concrete_rvar_bounds.push(rv.var, Interval(min, max)); min = Variable::make(Int(32), consumer.name() + "." + rv.var + ".min"); max = Variable::make(Int(32), consumer.name() + "." 
+ rv.var + ".max"); stage_scope_with_symbolic_rvar_bounds.push(rv.var, Interval(min, max)); } // Figure out the region computed of the stage by taking bounds of the LHS Exprs if (s == 0) { node.region_computed.resize(consumer.dimensions()); } FuncValueBounds func_value_bounds = compute_function_value_bounds(order, env); for (int j = 0; j < consumer.dimensions(); j++) { // The region computed always uses the full extent of the rvars Interval in = bounds_of_expr_in_scope(def.args()[j], stage_scope_with_concrete_rvar_bounds, func_value_bounds); internal_assert(in.is_bounded()) << "Region computed of " << consumer.name() << " is unbounded: [" << in.min << " " << in.max << "]\n"; if (s == 0) { node.region_computed[j].in = in; } else { node.region_computed[j].in.include(in); } } if (s == (int)consumer.updates().size()) { // Simplify region computed and perform additional // special-case analysis to make it faster to evaluate. node.region_computed_all_common_cases = true; for (int j = 0; j < consumer.dimensions(); j++) { const auto &req = node.region_required[j]; auto &comp = node.region_computed[j]; comp.depends_on_estimate = depends_on_estimate(comp.in.min) || depends_on_estimate(comp.in.max); comp.in.min = simplify(apply_param_estimates.mutate(comp.in.min)); comp.in.max = simplify(apply_param_estimates.mutate(comp.in.max)); if (equal(comp.in.min, req.min) && equal(comp.in.max, req.max)) { comp.equals_required = true; } else { const Min *min = comp.in.min.as(); const Max *max = comp.in.max.as(); const int64_t *min_b = min ? as_const_int(min->b) : nullptr; const int64_t *max_b = max ? as_const_int(max->b) : nullptr; if (min_b && max_b && equal(min->a, req.min) && equal(max->a, req.max)) { comp.equals_union_of_required_with_constants = true; comp.c_min = *min_b; comp.c_max = *max_b; } else { node.region_computed_all_common_cases = false; } } } } // We'll take any existing reordering, but won't handle existing splits user_assert(sched.splits().empty()) << "The Func \"" << consumer.name() << "\" has scheduling directive(s) " << "applied to it; you must remove these, or conditionalize them " << "using `if (!auto_schedule)`, to use the autoscheduler on this pipeline."; stage.loop_nest_all_common_cases = true; for (size_t i = 0; i < sched.dims().size(); i++) { const auto &d = sched.dims()[i]; // Skip synthetic loops like "__outermost" if (!stage_scope_with_symbolic_rvar_bounds.contains(d.var)) { continue; } Node::Loop l; l.var = d.var; l.accessor = stage.name + ".get_schedule().dims()[" + std::to_string(i) + "].var"; // We already have the right variable names in the stage scope Interval in = stage_scope_with_concrete_rvar_bounds.get(l.var); l.min = in.min; l.max = in.max; l.pure = d.is_pure(); l.rvar = d.is_rvar(); l.pure_dim = -1; // Additional analysis to speed up evaluation of // common cases. Loop bounds that are just one of // the dimensions of the symbolic region computed // are common, as are constant bounds. 
l.equals_region_computed = false; for (int j = 0; j < consumer.dimensions(); j++) { if (l.var == pure_args[j]) { l.pure_dim = j; } if (equal(l.min, node.region_computed[j].in.min) && equal(l.max, node.region_computed[j].in.max)) { l.equals_region_computed = true; l.region_computed_dim = j; break; } } if (!l.equals_region_computed) { const int64_t *c_min = as_const_int(l.min), *c_max = as_const_int(l.max); if (c_min && c_max) { l.bounds_are_constant = true; l.c_min = *c_min; l.c_max = *c_max; } else { l.bounds_are_constant = false; } } stage.loop_nest_all_common_cases &= (l.bounds_are_constant || l.equals_region_computed); stage.loop.emplace_back(std::move(l)); } // Bundle all expressions associated with the definition into a single dummy call node vector exprs_vector = def.args(); exprs_vector.insert(exprs_vector.end(), def.values().begin(), def.values().end()); if (def.predicate().defined()) { exprs_vector.push_back(def.predicate()); } Expr exprs = Call::make(Int(32), "dummy", exprs_vector, Call::Extern); // Walk over the expressions involved sniffing types class CheckTypes : public IRVisitor { using IRVisitor::visit; void visit(const IntImm *op) override { check_type(op->type); } void visit(const UIntImm *op) override { check_type(op->type); } void visit(const FloatImm *op) override { check_type(op->type); } void visit(const Variable *op) override { check_type(op->type); } void visit(const Call *op) override { calls[op->name]++; IRVisitor::visit(op); check_type(op->type); if (op->call_type == Call::Halide || op->call_type == Call::Image) { is_pointwise &= op->args.size() == func.args().size(); if (is_pointwise) { for (size_t i = 0; i < op->args.size(); i++) { const Variable *v = op->args[i].as(); is_pointwise &= (v != nullptr) && (v->name == func.args()[i]); } } } } void visit(const Cast *op) override { IRVisitor::visit(op); check_type(op->type); } void visit(const Reinterpret *op) override { IRVisitor::visit(op); check_type(op->type); } void check_type(Type t) { if (t.bits() > 1 && (!narrowest_type.bits() || t.bits() < narrowest_type.bits())) { narrowest_type = t; } } Function func; public: bool is_pointwise = true; int leaves = 0; Type narrowest_type; map calls; explicit CheckTypes(const Function &f) : func(f) { } }; CheckTypes checker(consumer); exprs.accept(&checker); Type widest_output_type = def.values()[0].type(); int bytes_per_point = 0; for (const auto &e : def.values()) { bytes_per_point += e.type().bytes(); if (e.type().bytes() > widest_output_type.bytes()) { widest_output_type = e.type(); } } if (s == 0) { node.bytes_per_point = bytes_per_point; } stage.vector_size = target.natural_vector_size(checker.narrowest_type); if (s == 0) { node.vector_size = stage.vector_size; } else { node.vector_size = std::max(node.vector_size, stage.vector_size); } node.is_output = false; for (const auto &o : outputs) { node.is_output |= o.same_as(node.func); } if (node.is_output) { // Get the bounds estimate map estimates; for (const auto &b : consumer.schedule().estimates()) { const int64_t *i_min = as_const_int(b.min); const int64_t *i_extent = as_const_int(b.extent); user_assert(i_min && i_extent) << "Min/extent of estimate or bound is not constant in \"" << consumer.name() << "\", var:" << b.var << ", min:" << b.min << ", extent:" << b.extent; if ((false)) { // Intentional dead code. Extra parens to pacify clang-tidy. // Some methods we compare to compile for // statically known input/output sizes. We // don't need to - we take estimates but // the compiled code doesn't enforce // them. 
If you want to make a comparison // fair and target a fixed size, use this // branch of the if. In practice we don't // see a runtime difference, so we left it // disabled. In theory, Sizes being // constant makes it possible to do things // like unroll across color channels, so // it affects the scheduling space. Func(node.func).bound(b.var, b.min, b.extent); estimates[b.var] = Span(*i_min, *i_min + *i_extent - 1, true); } else { estimates[b.var] = Span(*i_min, *i_min + *i_extent - 1, false); } } for (const auto &b : consumer.schedule().bounds()) { const int64_t *i_min = as_const_int(b.min); const int64_t *i_extent = as_const_int(b.extent); if (i_min && i_extent) { // It's a true bound, not just an estimate estimates[b.var] = Span(*i_min, *i_min + *i_extent - 1, true); } } // Set the bounds using the estimates for (int i = 0; i < consumer.dimensions(); i++) { auto it = estimates.find(consumer.args()[i]); user_assert(it != estimates.end()) << "Need an estimate on dimension " << i << " of \"" << consumer.name() << "\""; node.estimated_region_required.push_back(it->second); } } stage.index = s; exprs = apply_param_estimates.mutate(exprs); // For this stage scope we want symbolic bounds for the rvars // Now create the edges that lead to this func bool any_incoming_edges = false; node.is_pointwise = !node.func.has_update_definition(); // TODO: peephole the boundary condition call pattern instead of assuming the user used the builtin node.is_boundary_condition = node.is_pointwise && starts_with(node.func.name(), "repeat_edge"); auto boxes = boxes_required(exprs, stage_scope_with_symbolic_rvar_bounds, func_value_bounds); for (auto &p : boxes) { auto it = env.find(p.first); if (it != env.end() && p.first != consumer.name()) { // Discard loads from input images and self-loads Edge edge; edge.consumer = &stage; edge.producer = node_map.at(env[p.first]); edge.all_bounds_affine = true; for (Interval &in : p.second.bounds) { // Whenever a relationship is unbounded, we must inline internal_assert(in.is_bounded()) << "Unbounded producer->consumer relationship: " << edge.producer->func.name() << " -> " << edge.consumer->name << "\n"; bool min_dependent = depends_on_estimate(in.min); bool max_dependent = depends_on_estimate(in.max); Expr min_value = simplify(apply_param_estimates.mutate(in.min)); Expr max_value = simplify(apply_param_estimates.mutate(in.max)); Edge::BoundInfo min(min_value, *edge.consumer, min_dependent); Edge::BoundInfo max(max_value, *edge.consumer, max_dependent); edge.bounds.emplace_back(std::move(min), std::move(max)); edge.all_bounds_affine &= edge.bounds.back().first.affine; edge.all_bounds_affine &= edge.bounds.back().second.affine; } edge.calls = checker.calls[edge.producer->func.name()]; any_incoming_edges = true; node.is_pointwise &= checker.is_pointwise; edges.emplace_back(std::move(edge)); } } node.is_wrapper = node.func.is_wrapper(); node.is_input = !node.is_output && !node.func.has_update_definition() && node.is_wrapper && !any_incoming_edges; node.dimensions = node.func.dimensions(); } } // Initialize the memory layouts for the bounds structs for (auto &n : nodes) { n.bounds_memory_layout = std::make_unique(); auto &l = *(n.bounds_memory_layout); l.computed_offset = n.func.dimensions(); l.total_size = l.computed_offset + n.func.dimensions(); for (const auto &s : n.stages) { l.loop_offset.push_back(l.total_size); l.total_size += (int)s.loop.size(); } } // Give all the stages unique ids to support perfect hashing of them { int i = 0; for (auto &n : nodes) { for (auto &s : 
n.stages) { s.id = i; s.max_id = stage_count; i++; } } } for (auto &edge : edges) { edge.producer->outgoing_edges.push_back(&edge); edge.consumer->incoming_edges.push_back(&edge); } // Compute transitive dependencies for (size_t i = nodes.size(); i > 0; i--) { auto &n = nodes[i - 1]; for (auto &s : n.stages) { s.dependencies.resize(nodes.size(), false); for (auto *e : s.incoming_edges) { s.dependencies[e->producer->id] = true; for (auto &s2 : e->producer->stages) { for (size_t j = 0; j < nodes.size(); j++) { s.dependencies[j] = s.dependencies[j] || s2.dependencies[j]; } } } } } // Compute the algorithm-specific features for the neural net featurize(); } void FunctionDAG::featurize() { for (Node &node : nodes) { for (size_t stage_idx = 0; stage_idx < node.stages.size(); stage_idx++) { Node::Stage &stage = node.stages[stage_idx]; Featurizer featurizer(node.func, stage); if (node.func.extern_definition_proxy_expr().get()) { // Extern function call with a proxy implementation specified: generate the featurization from the proxy Expr v = simplify(node.func.extern_definition_proxy_expr()); v = common_subexpression_elimination(v); v.accept(&featurizer); } else { Definition def = node.func.definition(); if (stage_idx > 0) { def = node.func.updates()[stage_idx - 1]; } stage.features = PipelineFeatures(); for (auto v : def.values()) { featurizer.visit_store_args(node.func.name(), v.type(), def.args()); v = common_subexpression_elimination(simplify(v)); // Get things into canonical form v.accept(&featurizer); } for (auto v : def.args()) { v = common_subexpression_elimination(simplify(v)); // Get things into canonical form v.accept(&featurizer); } } } } } void FunctionDAG::dump(std::ostream &os) const { for (const Node &n : nodes) { os << "Node: " << n.func.name() << "\n" << " Symbolic region required: \n"; for (const SymbolicInterval &i : n.region_required) { os << " " << i.min << ", " << i.max << "\n"; } os << " Region computed: \n"; for (const auto &i : n.region_computed) { os << " " << i.in.min << ", " << i.in.max << "\n"; } for (size_t i = 0; i < n.stages.size(); i++) { os << " Stage " << i << ":\n"; for (const auto &l : n.stages[i].loop) { os << " " << l.var << " " << l.min << " " << l.max << "\n"; } n.stages[i].features.dump(os); } os << " pointwise: " << n.is_pointwise << " boundary condition: " << n.is_boundary_condition << " wrapper: " << n.is_wrapper << " input: " << n.is_input << " output: " << n.is_output << "\n"; } for (const Edge &e : edges) { os << "Edge: " << e.producer->func.name() << " -> " << e.consumer->name << "\n" << " Footprint: \n"; int j = 0; for (const auto &i : e.bounds) { os << " Min " << j << ": " << i.first.expr << "\n"; os << " Max " << j << ": " << i.second.expr << "\n"; j++; } os << " Load Jacobians:\n"; for (const auto &jac : e.load_jacobians) { jac.dump(os, " "); } } } } // namespace Autoscheduler } // namespace Internal } // namespace Halide Halide-17.0.1/src/autoschedulers/adams2019/FunctionDAG.h000066400000000000000000000435241456515664200225100ustar00rootroot00000000000000/** This file defines the class FunctionDAG, which is our * representation of a Halide pipeline, and contains methods to using * Halide's bounds tools to query properties of it. 
*/ #ifndef FUNCTION_DAG_H #define FUNCTION_DAG_H #include #include #include #include #include #include #include "Errors.h" #include "Featurization.h" #include "Halide.h" namespace Halide { namespace Internal { namespace Autoscheduler { using std::map; using std::pair; using std::string; using std::unique_ptr; using std::vector; struct Adams2019Params; // First we have various utility classes. // An optional rational type used when analyzing memory dependencies. struct OptionalRational { bool exists = false; int64_t numerator = 0, denominator = 0; OptionalRational() = default; OptionalRational(bool e, int64_t n, int64_t d) : exists(e), numerator(n), denominator(d) { } void operator+=(const OptionalRational &other) { if (!exists || !other.exists) { exists = false; return; } if (denominator == other.denominator) { numerator += other.numerator; return; } int64_t l = lcm(denominator, other.denominator); numerator *= l / denominator; denominator = l; numerator += other.numerator * (l / other.denominator); int64_t g = gcd(numerator, denominator); numerator /= g; denominator /= g; } OptionalRational operator*(const OptionalRational &other) const { if ((*this) == 0) { return *this; } if (other == 0) { return other; } int64_t num = numerator * other.numerator; int64_t den = denominator * other.denominator; bool e = exists && other.exists; return OptionalRational{e, num, den}; } // Because this type is optional (exists may be false), we don't // have a total ordering. These methods all return false when the // operators are not comparable, so a < b is not the same as !(a // >= b). bool operator<(int x) const { if (!exists) { return false; } if (denominator > 0) { return numerator < x * denominator; } else { return numerator > x * denominator; } } bool operator<=(int x) const { if (!exists) { return false; } if (denominator > 0) { return numerator <= x * denominator; } else { return numerator >= x * denominator; } } bool operator>(int x) const { if (!exists) { return false; } return !((*this) <= x); } bool operator>=(int x) const { if (!exists) { return false; } return !((*this) < x); } bool operator==(int x) const { return exists && (numerator == (x * denominator)); } bool operator==(const OptionalRational &other) const { return (exists == other.exists) && (numerator * other.denominator == denominator * other.numerator); } }; // A LoadJacobian records the derivative of the coordinate accessed in // some producer w.r.t the loops of the consumer. class LoadJacobian { vector> coeffs; int64_t c; public: explicit LoadJacobian(vector> &&matrix, int64_t c = 1) : coeffs(matrix), c(c) { } size_t producer_storage_dims() const { return coeffs.size(); } size_t consumer_loop_dims() const { if (coeffs.empty() || coeffs[0].empty()) { // The producer is scalar, and we don't know how // many consumer loops there are. return 0; } return coeffs[0].size(); } OptionalRational operator()(int producer_storage_dim, int consumer_loop_dim) const { if (coeffs.empty()) { // The producer is scalar, so all strides are zero. return {true, 0, 1}; } internal_assert(producer_storage_dim < (int)coeffs.size()); const auto &p = coeffs[producer_storage_dim]; if (p.empty()) { // The consumer is scalar, so all strides are zero. return {true, 0, 1}; } internal_assert(consumer_loop_dim < (int)p.size()); return p[consumer_loop_dim]; } // To avoid redundantly re-recording copies of the same // load Jacobian, we keep a count of how many times a // load with this Jacobian occurs. 
int64_t count() const { return c; } // Try to merge another LoadJacobian into this one, increasing the // count if the coefficients match. bool merge(const LoadJacobian &other) { if (other.coeffs.size() != coeffs.size()) { return false; } for (size_t i = 0; i < coeffs.size(); i++) { if (other.coeffs[i].size() != coeffs[i].size()) { return false; } for (size_t j = 0; j < coeffs[i].size(); j++) { if (!(other.coeffs[i][j] == coeffs[i][j])) { return false; } } } c += other.count(); return true; } // Multiply Jacobians, used to look at memory dependencies through // inlined functions. LoadJacobian operator*(const LoadJacobian &other) const { vector> matrix; internal_assert(consumer_loop_dims() == 0 || (consumer_loop_dims() == other.producer_storage_dims())); matrix.resize(producer_storage_dims()); for (size_t i = 0; i < producer_storage_dims(); i++) { matrix[i].resize(other.consumer_loop_dims()); for (size_t j = 0; j < other.consumer_loop_dims(); j++) { matrix[i][j] = OptionalRational{true, 0, 1}; for (size_t k = 0; k < consumer_loop_dims(); k++) { matrix[i][j] += (*this)(i, k) * other(k, j); } } } LoadJacobian result(std::move(matrix), count() * other.count()); return result; } void dump(std::ostream &os, const char *prefix) const; }; // Classes to represent a concrete set of bounds for a Func. A Span is // single-dimensional, and a Bound is a multi-dimensional box. For // each dimension we track the estimated size, and also whether or not // the size is known to be constant at compile-time. For each Func we // track three different types of bounds: // 1) The region required by consumers of the Func, which determines // 2) The region actually computed, which in turn determines // 3) The min and max of all loops in the loop next. // 3 in turn determines the region required of the inputs to a Func, // which determines their region computed, and hence their loop nest, // and so on back up the Function DAG from outputs back to inputs. class Span { int64_t min_, max_; bool constant_extent_; public: int64_t min() const { return min_; } int64_t max() const { return max_; } int64_t extent() const { return max_ - min_ + 1; } bool constant_extent() const { return constant_extent_; } void union_with(const Span &other) { min_ = std::min(min_, other.min()); max_ = std::max(max_, other.max()); constant_extent_ = constant_extent_ && other.constant_extent(); } void set_extent(int64_t e) { max_ = min_ + e - 1; } void translate(int64_t x) { min_ += x; max_ += x; } Span(int64_t a, int64_t b, bool c) : min_(a), max_(b), constant_extent_(c) { } Span() = default; Span(const Span &other) = default; static Span empty_span() { return Span(INT64_MAX, INT64_MIN, true); } }; // Bounds objects are created and destroyed very frequently while // exploring scheduling options, so we have a custom allocator and // memory pool. Much like IR nodes, we treat them as immutable once // created and wrapped in a Bound object so that they can be shared // safely across scheduling alternatives. 
struct BoundContents { mutable RefCount ref_count; class Layout; const Layout *layout = nullptr; Span *data() const { // This struct is a header return (Span *)(const_cast(this) + 1); } Span ®ion_required(int i) { return data()[i]; } Span ®ion_computed(int i) { return data()[i + layout->computed_offset]; } Span &loops(int i, int j) { return data()[j + layout->loop_offset[i]]; } const Span ®ion_required(int i) const { return data()[i]; } const Span ®ion_computed(int i) const { return data()[i + layout->computed_offset]; } const Span &loops(int i, int j) const { return data()[j + layout->loop_offset[i]]; } BoundContents *make_copy() const { auto *b = layout->make(); size_t bytes = sizeof(data()[0]) * layout->total_size; memcpy(b->data(), data(), bytes); return b; } void validate() const; // We're frequently going to need to make these concrete bounds // arrays. It makes things more efficient if we figure out the // memory layout of those data structures once ahead of time, and // make each individual instance just use that. Note that this is // not thread-safe. class Layout { // A memory pool of free BoundContent objects with this layout mutable std::vector pool; // All the blocks of memory allocated mutable std::vector blocks; mutable size_t num_live = 0; void allocate_some_more() const; public: // number of Span to allocate int total_size; // region_required has size func->dimensions() and comes first in the memory layout // region_computed comes next at the following index int computed_offset; // the loop for each stage starts at the following index std::vector loop_offset; Layout() = default; ~Layout(); Layout(const Layout &) = delete; void operator=(const Layout &) = delete; Layout(Layout &&) = delete; void operator=(Layout &&) = delete; // Make a BoundContents object with this layout BoundContents *make() const; // Release a BoundContents object with this layout back to the pool void release(const BoundContents *b) const; }; }; using Bound = IntrusivePtr; // A representation of the function DAG. The nodes and edges are both // in reverse realization order, so if you want to walk backwards up // the DAG, just iterate the nodes or edges in-order. struct FunctionDAG { // An edge is a producer-consumer relationship struct Edge; struct SymbolicInterval { Halide::Var min; Halide::Var max; }; // A Node represents a single Func struct Node { // A pointer back to the owning DAG FunctionDAG *dag; // The Halide Func this represents Function func; // The number of bytes per point stored. double bytes_per_point; // The min/max variables used to denote a symbolic region of // this Func. Used in the cost above, and in the Edges below. vector region_required; // A concrete region required from a bounds estimate. Only // defined for outputs. vector estimated_region_required; // The region computed of a Func, in terms of the region // required. For simple Funcs this is identical to the // region_required. However, in some Funcs computing one // output requires computing other outputs too. You can't // really ask for a single output pixel from something blurred // with an IIR without computing the others, for example. struct RegionComputedInfo { // The min and max in their full symbolic glory. We use // these in the general case. 
Interval in; bool depends_on_estimate = false; // Analysis used to accelerate common cases bool equals_required = false, equals_union_of_required_with_constants = false; int64_t c_min = 0, c_max = 0; }; vector region_computed; bool region_computed_all_common_cases = false; // Expand a region required into a region computed, using the // symbolic intervals above. void required_to_computed(const Span *required, Span *computed) const; // Metadata about one symbolic loop in a Func's default loop nest. struct Loop { string var; bool pure, rvar; Expr min, max; // Which pure dimension does this loop correspond to? Invalid if it's an rvar int pure_dim; // Precomputed metadata to accelerate common cases: // If true, the loop bounds are just the region computed in the given dimension bool equals_region_computed = false; int region_computed_dim = 0; // If true, the loop bounds are a constant with the given min and max bool bounds_are_constant = false; int64_t c_min = 0, c_max = 0; // A persistent fragment of source for getting this Var // from its owner Func. Used for printing source code // equivalent to a computed schedule. string accessor; }; // Get the loop nest shape as a function of the region computed void loop_nest_for_region(int stage_idx, const Span *computed, Span *loop) const; // One stage of a Func struct Stage { // The owning Node Node *node; // Which stage of the Func is this. 0 = pure. int index; // The loop nest that computes this stage, from innermost out. vector loop; bool loop_nest_all_common_cases = false; // The vectorization width that will be used for // compute. Corresponds to the natural width for the // narrowest type used. int vector_size; // The featurization of the compute done PipelineFeatures features; // The actual Halide front-end stage object Halide::Stage stage; // The name for scheduling (e.g. "foo.update(3)") string name; // Ids for perfect hashing on stages. int id, max_id; vector incoming_edges; vector dependencies; bool downstream_of(const Node &n) const { return dependencies[n.id]; }; explicit Stage(Halide::Stage s) : stage(std::move(s)) { } }; vector stages; vector outgoing_edges; // Max vector size across the stages int vector_size; // A unique ID for this node, allocated consecutively starting // at zero for each pipeline. int id, max_id; // Just func->dimensions(), but we ask for it so many times // that's it's worth avoiding the function call into // libHalide. int dimensions; // Is a single pointwise call to another Func bool is_wrapper; // We represent the input buffers as node, though we do not attempt to schedule them. bool is_input; // Is one of the pipeline outputs bool is_output; // Only uses pointwise calls bool is_pointwise; // Only uses pointwise calls + clamping on all indices bool is_boundary_condition; std::unique_ptr bounds_memory_layout; BoundContents *make_bound() const { return bounds_memory_layout->make(); } }; // A representation of a producer-consumer relationship struct Edge { struct BoundInfo { // The symbolic expression for the bound in this dimension Expr expr; // Fields below are the results of additional analysis // used to evaluate this bound more quickly. int64_t coeff, constant; int64_t consumer_dim; bool affine, uses_max, depends_on_estimate; BoundInfo(const Expr &e, const Node::Stage &consumer, bool dependent); }; // Memory footprint on producer required by consumer. 
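// One (min, max) pair of BoundInfo per dimension of the producer, with each
// bound expressed in terms of the consumer's symbolic loop variables. For a
// hypothetical consumer g(x) = f(x - 1) + f(x + 1), dimension 0 of f is
// bounded by [g.x.min - 1, g.x.max + 1]; the affine fast path then reduces
// evaluation to coeff * (consumer min or max) + constant.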
vector> bounds; FunctionDAG::Node *producer; FunctionDAG::Node::Stage *consumer; // The number of calls the consumer makes to the producer, per // point in the loop nest of the consumer. int calls; bool all_bounds_affine; vector load_jacobians; void add_load_jacobian(LoadJacobian j1); // Given a loop nest of the consumer stage, expand a region // required of the producer to be large enough to include all // points required. void expand_footprint(const Span *consumer_loop, Span *producer_required) const; }; vector nodes; vector edges; // Create the function DAG, and do all the dependency and cost // analysis. This is done once up-front before the tree search. FunctionDAG(const vector &outputs, const Target &target); void dump(std::ostream &os) const; private: // Compute the featurization for the entire DAG void featurize(); public: // This class uses a lot of internal pointers, so we'll make it uncopyable/unmovable. FunctionDAG(const FunctionDAG &other) = delete; FunctionDAG &operator=(const FunctionDAG &other) = delete; FunctionDAG(FunctionDAG &&other) = delete; FunctionDAG &operator=(FunctionDAG &&other) = delete; }; } // namespace Autoscheduler } // namespace Internal } // namespace Halide #endif // FUNCTION_DAG_H Halide-17.0.1/src/autoschedulers/adams2019/LoopNest.cpp000066400000000000000000002602531456515664200225050ustar00rootroot00000000000000#include "LoopNest.h" #include "Cache.h" using std::set; using std::vector; namespace Halide { namespace Internal { namespace Autoscheduler { // How small should an innermost loop cluster be before you just // entirely unroll the thing. Sized for an architecture with 16 vector // registers. const int kUnrollLimit = 12; // Given a multi-dimensional box of dimensionality d, generate a list // of candidate tile sizes for it, logarithmically spacing the sizes // using the given factor. If 'allow_splits' is false, every dimension // must either be one, or the full extent of the box. This function is // used to generate candidate tilings when tiling for // producer-consumer fusion, or tiling for parallelism. vector> generate_tilings(const vector &s, int d, int factor, bool allow_splits) { vector> result; if (d == -1) { result.emplace_back(); } else { vector> v = generate_tilings(s, d - 1, factor, allow_splits); // If we're already generated too many tiling configurations // for the inner loops, search the outer loops with coarser // granularity. while (v.size() > (size_t)factor * 100) { factor *= 2; } for (auto &t : v) { bool is_full = false, is_const_one = false; // Skip trivial tilings if ((size_t)d == s.size() - 1) { is_const_one = is_full = true; for (int i = 0; i < d; i++) { is_const_one &= (t[i] == 1); is_full &= (t[i] == s[i]); } } t.push_back(0); if (!allow_splits) { if (!is_const_one) { t.back() = 1; result.push_back(t); } if (s[d] != 1 && !is_full) { t.back() = s[d]; result.push_back(t); } } else { int max_inner = 0; for (int inner = 1; inner < s[d]; inner *= factor) { int outer = (s[d] + inner - 1) / inner; if (is_const_one && outer == 1) { continue; } if (is_full && outer == s[d]) { continue; } // Stop when we hit inner sizes that would do too much recompute if (inner > 1 && inner * outer * 7 > s[d] * 8) { break; } max_inner = inner; t.back() = outer; result.push_back(t); } for (int outer = 1; outer <= s[d]; outer *= factor) { int inner = (s[d] + outer - 1) / outer; if (is_const_one && outer == 1) { continue; } if (is_full && outer == s[d]) { continue; } // Stop when we get into the regime covered by the loop above. 
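// (The loop above walked inner sizes 1, factor, factor^2, ...; this loop
// walks outer sizes the same way and bails out once it reaches splits the
// first loop already covered. As a hypothetical worked example, an extent
// of 16 with factor 2, allow_splits set, and no trivial-tiling pruning
// yields (outer, inner) candidates (16, 1), (8, 2), (4, 4), (2, 8) and
// (1, 16), plus (6, 3) from the special case below.)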
if (outer > 1 && inner < max_inner * 2) { break; } // Or when the wasted compute gets too bad. if (inner * outer * 7 > s[d] * 8) { break; } t.back() = outer; result.push_back(t); } // The sequence above (in terms of the inner loop) // goes 1 2 4 8 16 ... but 3 is an important inner // tiling factor for matrix multiply/gemm-type loops // which try to use 12 vector registers. int inner3 = 3; int outer3 = (s[d] + inner3 - 1) / inner3; if (factor == 2 && inner3 < s[d] && outer3 < s[d] && outer3 > 1) { if (inner3 * outer3 * 7 <= s[d] * 8) { t.back() = outer3; result.push_back(t); } } } } } return result; } void LoopNest::copy_from(const LoopNest &n) { size = n.size; children = n.children; inlined = n.inlined; store_at = n.store_at; bounds = n.bounds; node = n.node; stage = n.stage; innermost = n.innermost; tileable = n.tileable; parallel = n.parallel; vector_dim = n.vector_dim; vectorized_loop_index = n.vectorized_loop_index; }; // Hash the loop structure and sizes up to a fixed depth. This is // used as the hash function for the coarse-to-fine beam search in // the paper. void LoopNest::structural_hash(uint64_t &h, int depth) const { if (depth < 0) { return; } // Which Funcs are store_at this level? for (const auto *n : store_at) { hash_combine(h, n->id); } hash_combine(h, -1); // Which Funcs are compute_at this level? for (const auto &c : children) { hash_combine(h, c->stage->id); } // Add a barrier to ensure that moving something from the last // compute_at to the first inlined doesn't result in the same // hash. hash_combine(h, -1); // Which Funcs are inlined at this level? for (auto it = inlined.begin(); it != inlined.end(); it++) { hash_combine(h, it.key()->id); } hash_combine(h, -1); if (depth > 0) { // What are the loop sizes of the children? for (const auto &c : children) { for (int64_t s : c->size) { if (depth == 1) { // Just take the most significant bit: is it one or not? s = (s > 1) ? 1 : 0; } hash_combine(h, s); } } // Which dimension are we vectorized over? hash_combine(h, vectorized_loop_index); } if (depth > 1) { // Descend into children for (const auto &c : children) { c->structural_hash(h, depth - 2); } } } // Compute all the sites of interest for each pipeline stage void LoopNest::get_sites(StageMap &sites, const LoopNest *task, const LoopNest *parent) const { if (!task && !is_root()) { task = this; } for (const auto &c : children) { c->get_sites(sites, task, this); } if (parent && node != parent->node) { auto &s = sites.get_or_create(stage); s.compute = parent; s.produce = this; s.task = task; } for (const auto *f : store_at) { for (const auto &s : f->stages) { sites.get_or_create(&s).store = this; } } for (auto it = inlined.begin(); it != inlined.end(); it++) { auto &s = sites.get_or_create(&(it.key()->stages[0])); s.inlined = true; s.compute = s.store = s.produce = s.innermost = this; s.task = task; } if (innermost) { sites.get_or_create(stage).innermost = this; } } // Do a recursive walk over the loop nest computing features to feed the cost model. 
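// Features are accumulated into 'features', keyed by stage. 'instances'
// counts how many times this loop body runs due to enclosing loops,
// 'parallelism' is the parallelism of the enclosing loops (it becomes
// outer_parallelism at production sites), and 'working_set' is threaded
// back up so enclosing realizations can record the memory live across
// their children. When 'use_cached_features' is set, per-subtree results
// are memoized in features_cache, keyed by a hash of the producers stored
// at the root.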
void LoopNest::compute_features(const FunctionDAG &dag, const Adams2019Params ¶ms, const StageMap &sites, int64_t instances, int64_t parallelism, const LoopNest *parent, const LoopNest *grandparent, const LoopNest &root, int64_t *working_set, StageMap *features, bool use_cached_features) const { int64_t working_set_here = 0; int64_t loop_instances = 1, parallel_tasks = 1; bool in_impure = false; for (int idx = (int)size.size() - 1; idx >= 0; idx--) { size_t i = size[idx]; loop_instances *= i; if (stage->loop[idx].pure && !in_impure) { if (params.parallelism > 1 && (parallel || (parent->is_root() && parallel_tasks < params.parallelism))) { // Either we've picked our parallel tiling, or // it's not yet determined. Assume we'll not split // any loops and just stop after we hit the // required number of cores parallel_tasks *= i; // If we haven't picked out parallel tiling yet, // assume that we'll target 8*cores when we do, // which is a common rule of thumb. if (!parallel && parallel_tasks > params.parallelism * 8) { // We would split this loop parallel_tasks = params.parallelism * 8; } } } else if (i != 1) { in_impure = true; } } int64_t subinstances = instances * loop_instances; for (const auto *node : store_at) { // Figure out the features at the store_at level const auto &bounds = get_bounds(node); for (size_t s = 0; s < node->stages.size(); s++) { // TODO: Lift invariants from this loop. Most of it's the same for every stage. internal_assert(!node->is_input); ScheduleFeatures &feat = features->get_or_create(&(node->stages[s])); feat.num_realizations = subinstances; feat.points_computed_per_realization = 1; feat.num_scalars = feat.num_vectors = subinstances; bool vectorized = false; for (int i = 0; i < (int)node->stages[s].loop.size(); i++) { const auto &p = bounds->loops(s, i); int64_t extent = p.extent(); feat.points_computed_per_realization *= extent; if (i == sites.get(&(node->stages[s])).produce->vectorized_loop_index) { // Assumes that we're not going to split // things such that non-native-width // vectorization is a problem, except for the // tail. feat.num_vectors *= extent / node->stages[s].vector_size; feat.num_scalars *= extent % node->stages[s].vector_size; vectorized = true; } else { feat.num_vectors *= extent; feat.num_scalars *= extent; } } if (!vectorized) { feat.num_vectors = 0; } feat.points_computed_total = feat.points_computed_per_realization * feat.num_realizations; feat.bytes_at_realization = node->bytes_per_point; for (int i = 0; i < node->dimensions; i++) { const auto &p = bounds->region_computed(i); feat.bytes_at_realization *= p.extent(); } int64_t innermost_storage_extent = 1; int v = sites.get(&(node->stages[s])).produce->vector_dim; if (v >= 0 && node->dimensions > 0) { innermost_storage_extent = bounds->region_computed(v).extent(); } feat.innermost_bytes_at_realization = node->bytes_per_point * innermost_storage_extent; if (!is_root()) { feat.bytes_at_task = feat.bytes_at_realization; feat.innermost_bytes_at_task = feat.innermost_bytes_at_realization; } } } if (is_root()) { // TODO: This block of code is repeated below. Refactor for (const auto &c : children) { const uint64_t hash_of_producers = sites.get(c->stage).hash_of_producers_stored_at_root; if (use_cached_features) { // Checks if the features cache has seen this state before, and use the cached features if so. 
if (c->features_cache.count(hash_of_producers) > 0) { const auto &entry = c->features_cache.at(hash_of_producers); for (auto it = entry.begin(); it != entry.end(); it++) { const auto *stage_ptr = it.key(); const auto &feat = it.value(); features->insert(stage_ptr, feat); } // 'working_set_here' is required below for computing the // root-level features so we compute the value that it // would have had if the current loop nest had not been // memoized int64_t working_set_c{0}; c->compute_working_set_from_features(&working_set_c, features); working_set_here += working_set_c; continue; // no need to recompute fetures } } c->compute_features(dag, params, sites, subinstances, parallelism, this, parent, root, &working_set_here, features, use_cached_features); if (use_cached_features) { // Cache these features for future reference. c->features_cache[hash_of_producers].make_large(dag.nodes[0].stages[0].max_id); c->memoize_features(c->features_cache[hash_of_producers], features); } } for (const auto *node : store_at) { auto &feat = features->get(&(node->stages[0])); working_set_here += feat.bytes_at_production; } for (const auto *node : store_at) { for (const auto &s : node->stages) { auto &feat = features->get(&s); feat.working_set_at_realization = working_set_here; } } for (const auto &c : children) { if (c->node != node) { auto &feat = features->get(c->stage); feat.working_set_at_production = working_set_here; } } // Figure out the root-level features for every Func for (auto it = features->begin(); it != features->end(); it++) { const auto *stage = it.key(); const auto *node = stage->node; auto &feat = it.value(); const auto &root_bounds = root.get_bounds(node); feat.bytes_at_root = node->bytes_per_point; for (int i = 0; i < node->dimensions; i++) { const auto &p = root_bounds->region_computed(i); feat.bytes_at_root *= p.extent(); } feat.working_set_at_root = working_set_here; const auto *p = sites.get(stage).produce; if (p) { // Extent of the innermost dimension in the storage layout int64_t innermost_storage_extent = 1; int v = p->vector_dim; if (v >= 0 && v < node->dimensions) { innermost_storage_extent = root_bounds->region_computed(v).extent(); } feat.innermost_bytes_at_root = node->bytes_per_point * innermost_storage_extent; } else { feat.innermost_bytes_at_root = 0; } feat.points_computed_minimum = 1; for (int i = 0; i < (int)stage->loop.size(); i++) { const auto &p = root_bounds->loops(stage->index, i); feat.points_computed_minimum *= p.extent(); } if (node->stages.size() == 1 && !node->is_output) { int64_t points_computed_minimum_if_inlined = 0; for (auto *e : node->outgoing_edges) { points_computed_minimum_if_inlined += features->get(e->consumer).points_computed_minimum * e->calls; } feat.points_computed_minimum = std::min(feat.points_computed_minimum, (double)points_computed_minimum_if_inlined); } // When memoizing, we need to recompute features for inlined Funcs // so we reset them here if (use_cached_features && sites.get(stage).inlined) { feat.inlined_calls = 0; feat.num_scalars = 0; feat.innermost_pure_loop_extent = 0; feat.outer_parallelism = 0; } } if (use_cached_features) { for (const auto &c : children) { uint64_t hash_of_producers = sites.get(c->stage).hash_of_producers_stored_at_root; // When computing feat.points_computed_minimum above, the order // of nodes considered is possibly different from the loop nest // traversal order so 'features->get(e->consumer).points_computed_minimum' // may not have been computed when it is accessed as a memoized // feature. 
We memoize 'points_computed_minimum' here to ensure // its value is always available if (c->features_cache.count(hash_of_producers) > 0) { c->memoize_points_computed_minimum(c->features_cache[hash_of_producers], features); } } recompute_inlined_features(sites, features); } return; } int64_t subparallelism = parallel_tasks * parallelism; // Figure out the features at the compute_at level internal_assert(!stage->node->is_input); ScheduleFeatures &feat = features->get_or_create(stage); if (innermost) { if (vectorized_loop_index >= 0 && vectorized_loop_index < (int)size.size()) { feat.vector_size = size[vectorized_loop_index]; } else { feat.vector_size = 1; } if (feat.vector_size == 1) { // They're all scalars feat.num_scalars += feat.num_vectors; feat.num_vectors = 0; } } else { // We want these features just outside the innermost loop, // so just set them at every level and let them get // progressively overwritten as we descend the loop nest // tree. size_t idx = 0; feat.innermost_loop_extent = 1; feat.innermost_pure_loop_extent = 1; for (const auto &l : stage->loop) { feat.innermost_loop_extent *= size[idx]; if (!l.rvar) { feat.innermost_pure_loop_extent *= size[idx]; } idx++; } } const bool at_task = parent->is_root(); const bool at_production = parent->node != node; const bool at_pure_production = at_production && stage->index == 0; if (at_task) { if (parallel) { const auto &bounds = get_bounds(node); feat.bytes_at_task = node->bytes_per_point; int64_t innermost_storage_extent = 1; for (int i = 0; i < node->dimensions; i++) { int64_t outer = 1; for (size_t l = 0; l < stage->loop.size(); l++) { if (stage->loop[l].var == node->func.args()[i]) { outer = size[l]; break; } } const auto &p = bounds->region_computed(i); int64_t extent = p.extent(); extent /= outer; feat.bytes_at_task *= extent; if (i == vector_dim) { innermost_storage_extent = extent; } } feat.innermost_bytes_at_task = node->bytes_per_point * innermost_storage_extent; } else { // How this loop will be parallelized is not yet // determined. Use optimistic values for the features. feat.bytes_at_task = (feat.bytes_at_realization + params.parallelism - 1) / params.parallelism; feat.innermost_bytes_at_task = std::min(feat.bytes_at_task, feat.innermost_bytes_at_realization); } feat.unique_bytes_read_per_task = 0; feat.unique_lines_read_per_task = 0; // We're at a parallel for loop. Check all the accesses // done by Funcs inside this loop to values computed // outside of it to figure out how much data we'll be // streaming onto the core. vector pending; set done; for (const auto *e : stage->incoming_edges) { pending.push_back(e); } while (!pending.empty()) { const auto *e = pending.back(); pending.pop_back(); if (done.count(e->producer)) { continue; } done.insert(e->producer); const auto &site = sites.get(&(e->producer->stages[0])); if (site.store->is_root()) { const auto &b = get_bounds(e->producer); int64_t bytes = e->producer->bytes_per_point, lines = 1; int64_t max_extent = 1; // clang-format off int vector_dim = (e->producer->is_input ? 0 : site.produce != nullptr ? site.produce->vector_dim : -1); // clang-format on for (int i = 0; i < e->producer->dimensions; i++) { int64_t extent = b->region_required(i).extent(); max_extent = std::max(extent, max_extent); bytes *= extent; if (i != vector_dim) { lines *= extent; } } if (!e->producer->is_input && site.produce == nullptr) { // We haven't scheduled the producer so we // don't know the memory layout yet. Assume // the best case. 
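// i.e. treat the dimension with the largest extent as if it were the
// contiguous storage dimension, which minimizes the number of distinct
// lines this read would touch.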
lines /= max_extent; } feat.unique_bytes_read_per_task += bytes; feat.unique_lines_read_per_task += lines; } else if (site.produce != nullptr) { // Computation must be nested inside this task or inlined into it. for (const auto &s : e->producer->stages) { for (const auto *e2 : s.incoming_edges) { pending.push_back(e2); } } } } } if (at_production) { feat.num_productions = instances; feat.inner_parallelism = parallel_tasks; feat.outer_parallelism = parallelism; feat.native_vector_size = stage->vector_size; const auto &bounds = parent->get_bounds(node); feat.bytes_at_production = node->bytes_per_point; for (int i = 0; i < node->dimensions; i++) { const auto &p = bounds->region_computed(i); feat.bytes_at_production *= p.extent(); } int64_t innermost_storage_extent = 1; if (vector_dim >= 0 && node->dimensions > 0) { innermost_storage_extent = bounds->region_computed(vector_dim).extent(); } feat.innermost_bytes_at_production = node->bytes_per_point * innermost_storage_extent; } // Recurse inwards for (const auto &c : children) { c->compute_features(dag, params, sites, subinstances, subparallelism, this, parent, root, &working_set_here, features, use_cached_features); } for (const auto *node : store_at) { auto &feat = features->get(&(node->stages[0])); working_set_here += feat.bytes_at_production; } for (const auto *node : store_at) { for (const auto &s : node->stages) { auto &feat = features->get(&s); feat.working_set_at_realization = working_set_here; } } for (const auto &c : children) { if (c->node != node) { auto &feat = features->get(c->stage); feat.working_set_at_production = working_set_here; } } if (at_task) { set_working_set_at_task_feature(working_set_here, features); } if (at_production) { feat.working_set = working_set_here; } if (innermost) { bool parent_unrolled = (feat.innermost_pure_loop_extent <= kUnrollLimit && parent->node == node); if (parent_unrolled) { const auto &grandparent_bounds = grandparent->get_bounds(node); for (size_t i = 0; i < parent->size.size(); i++) { if (!stage->loop[i].rvar) { const auto &l = grandparent_bounds->loops(parent->stage->index, i); parent_unrolled &= l.constant_extent(); } } } if (parent_unrolled) { feat.unrolled_loop_extent = feat.innermost_pure_loop_extent; } else { feat.unrolled_loop_extent = 1; } } *working_set += working_set_here; // Analyze all memory dependencies of this stage, looking // through any Funcs inlined into it. This is where we track // things like vector gathers. int64_t bytes_loaded = 0, lines_loaded = 0, allocation_bytes_loaded = 0; double num_dense_loads = 0, num_broadcasts = 0, num_gathers = 0, num_stride_2_loads = 0, num_stride_3_loads = 0, num_stride_4_loads = 0, num_loads = 0; if (innermost || at_production) { // These are the sites at which we compute load footprints // Pick the site at which we will compute the footprint relationship const auto &consumer_site = sites.get(stage); // The store_at location of the consumer const auto *consumer_store_site = innermost ? parent : consumer_site.store; // The parallel loop of the consumer const auto *consumer_task_site = consumer_site.task; int64_t consumer_instances = innermost ? 
instances : feat.num_realizations; internal_assert(consumer_instances != 0); vector pending; pending.emplace_back(stage); vector> jacobians; set done; while (!pending.empty()) { const auto *p = pending.back(); pending.pop_back(); const auto &next_edges = p->incoming_edges; for (const auto *e : next_edges) { internal_assert(sites.contains(&(e->producer->stages[0]))) << "No site found for " << e->producer->func.name() << "\n"; const auto &site = sites.get(&(e->producer->stages[0])); bool producer_has_been_scheduled = e->producer->is_input || (site.produce != nullptr); if (innermost) { if (e->consumer == stage) { for (const auto &j : e->load_jacobians) { jacobians.emplace_back(j, e->producer); } } else { // Consumer was inlined. Multiply the Jacobians to look through it. decltype(jacobians) new_jacobians; for (auto &j1 : jacobians) { if (e->consumer->node == j1.second) { for (const auto &j2 : e->load_jacobians) { LoadJacobian j = j2 * j1.first; new_jacobians.emplace_back(j, e->producer); } } else { new_jacobians.emplace_back(std::move(j1)); } } jacobians.swap(new_jacobians); } } if (site.inlined) { // Recursively examine the inputs pending.emplace_back(&(e->producer->stages[0])); continue; } // The producer's compute_at site const auto *producer_compute_site = site.compute; // The producer's store_at site const auto *producer_store_site = site.store; // The region required of the producer at various sites. const auto &bounds = consumer_store_site->get_bounds(e->producer); const auto &task_bounds = consumer_task_site->get_bounds(e->producer); const auto &producer_compute_bounds = producer_compute_site->get_bounds(e->producer); const auto &producer_store_bounds = producer_store_site->get_bounds(e->producer); // Compute memory footprints in terms of the // number of contiguous lines, and the number of // bytes. int64_t footprint = e->producer->bytes_per_point; int64_t compute_footprint = footprint; int64_t store_footprint = footprint; int64_t line_footprint = 1; int64_t compute_line_footprint = 1; int64_t store_line_footprint = 1; int64_t task_line_footprint = 1; if (e->producer->is_input) { // This node represents an input. Its sites // should be at the root level. internal_assert(producer_store_site->is_root()); internal_assert(producer_compute_site->is_root()); } if (innermost) { // Grab the Jacobians that describe the memory dependence for (const auto &jac : jacobians) { if (jac.second != e->producer) { continue; } double n = jac.first.count(); // Classify them to figure out what's going on in the vector dimension. bool vector_broadcast = true; bool dense_vector_load = true; bool stride_2_vector_load = true; bool stride_3_vector_load = true; bool stride_4_vector_load = true; int producer_innermost_dim = (e->producer->is_input ? 0 : // Assume default storage layout for inputs !producer_has_been_scheduled ? -1 : site.produce->vector_dim); if (vectorized_loop_index >= 0) { if (!producer_has_been_scheduled) { // Operate optimistically and just // see if *any* dimension of the // producer would make for a good // load. int count[5] = {0, 0, 0, 0, 0}; for (int i = 0; i < e->producer->dimensions; i++) { auto stride = jac.first(i, vectorized_loop_index); // stride is a rational. Check to see if it's a small integer. 
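// Note that OptionalRational compares exactly against these integers (the
// derivative must exist and numerator == k * denominator), so e.g. a
// hypothetical upsampling access with derivative 1/2 matches none of the
// buckets and ends up counted as a gather.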
if (stride == 0) { count[0]++; } else if (stride == 1) { count[1]++; } else if (stride == 2) { count[2]++; } else if (stride == 3) { count[3]++; } else if (stride == 4) { count[4]++; } } vector_broadcast = (count[0] == e->producer->dimensions); dense_vector_load = (count[0] == e->producer->dimensions - 1 && count[1] == 1); stride_2_vector_load = (count[0] == e->producer->dimensions - 1 && count[2] == 1); stride_3_vector_load = (count[0] == e->producer->dimensions - 1 && count[3] == 1); stride_4_vector_load = (count[0] == e->producer->dimensions - 1 && count[4] == 1); } else { for (int i = 0; i < e->producer->dimensions; i++) { auto stride = jac.first(i, vectorized_loop_index); vector_broadcast &= stride == 0; if (i == producer_innermost_dim) { dense_vector_load &= stride == 1; stride_2_vector_load &= stride == 2; stride_3_vector_load &= stride == 3; stride_4_vector_load &= stride == 4; } else { dense_vector_load &= stride == 0; stride_2_vector_load &= stride == 0; stride_3_vector_load &= stride == 0; stride_4_vector_load &= stride == 0; // TODO: Check for strided // loads across non-innermost // dims, and use to count the // number of pages, cache // lines, cache conflict misses, etc. } } } } // Is this load loop-invariant over an // unrolled block? If so, we amortize the // number of loads to account for // LICM. This is the key performance // optimization you get from unrolling the // inner loop of a gemm or conv, so it's // important to capture it in the // featurization. int64_t amortization = 1; if (feat.unrolled_loop_extent > 1) { for (size_t idx = 0; idx < stage->loop.size(); idx++) { if (!stage->loop[idx].rvar) { bool loop_invariant = true; for (int i = 0; i < e->producer->dimensions; i++) { if (!(jac.first(i, idx) == 0)) { loop_invariant = false; break; } } if (loop_invariant) { amortization *= parent->size[idx]; } } } } n /= amortization; num_loads += n; if (vector_broadcast) { num_broadcasts += n; } else if (dense_vector_load) { num_dense_loads += n; } else if (stride_2_vector_load) { num_stride_2_loads += n; } else if (stride_3_vector_load) { num_stride_3_loads += n; } else if (stride_4_vector_load) { num_stride_4_loads += n; } else { num_gathers += n; } } } // Already dealt with the footprints for this producer via some other path if (done.find(e->producer) != done.end()) { continue; } done.insert(e->producer); // Now look at the shapes of the regions read from // the producer at various sites. 
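// Four regions are measured per producer dimension: the region required at
// the consumer's store site, the region computed at the producer's compute
// site, the region required at the producer's store site, and the region
// required at the enclosing parallel task. Each is folded into a byte
// footprint and a contiguous-"line" footprint (the task region only into a
// line footprint); the line footprints skip the storage-innermost
// dimension, approximating the number of contiguous runs touched.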
int64_t max_extent = 1, max_compute_extent = 1, max_store_extent = 1, max_task_extent = 1; for (int i = 0; i < e->producer->dimensions; i++) { auto p = bounds->region_required(i); auto compute_p = producer_compute_bounds->region_computed(i); auto store_p = producer_store_bounds->region_required(i); auto task_p = task_bounds->region_required(i); // Check some invariants internal_assert(store_p.min() <= store_p.max()) << store_p.min() << " " << store_p.max() << "\n"; internal_assert(compute_p.min() <= compute_p.max()) << compute_p.min() << " " << compute_p.max() << "\n"; internal_assert(task_p.min() <= task_p.max()) << task_p.min() << " " << task_p.max() << "\n"; int64_t extent = p.extent(); int64_t compute_extent = compute_p.extent(); int64_t store_extent = store_p.extent(); int64_t task_extent = task_p.extent(); max_extent = std::max(extent, max_extent); max_compute_extent = std::max(compute_extent, max_compute_extent); max_store_extent = std::max(store_extent, max_store_extent); max_task_extent = std::max(task_extent, max_task_extent); footprint *= extent; compute_footprint *= compute_extent; store_footprint *= store_extent; bool dense = ((e->producer->is_input && i == 0) || (site.produce != nullptr && i == site.produce->vector_dim)); if (!dense) { line_footprint *= extent; compute_line_footprint *= compute_extent; store_line_footprint *= store_extent; task_line_footprint *= task_extent; } } if (!producer_has_been_scheduled) { // Optimistically assume it gets vectorized // along whatever dimension makes these // numbers the smallest. line_footprint /= max_extent; compute_line_footprint /= max_compute_extent; store_line_footprint /= max_store_extent; task_line_footprint /= max_task_extent; } int64_t store_instances_per_consumption = 1; if (producer_has_been_scheduled && !e->producer->is_input) { const auto &producer_feat = features->get_or_create(&(e->producer->stages[0])); if (producer_feat.num_realizations) { // The producer's realization is nested inside this Func's realization const int64_t producer_store_instances = producer_feat.num_realizations; if (producer_store_instances > consumer_instances) { store_instances_per_consumption = producer_store_instances / consumer_instances; } } } allocation_bytes_loaded += compute_footprint; if (store_instances_per_consumption > 1) { // The producer is nested inside the consumer bytes_loaded += store_footprint; // Due to folding, the actual buffer size is smaller than the bounds at the store level lines_loaded += store_line_footprint; } else { // The consumer is consuming some portion of a larger producer computed earlier bytes_loaded += footprint; lines_loaded += line_footprint; } // We compute (but never use) these; computing them is cheap, // so let's leave in for future reference, but mark as 'ignore me' // to avoid clang-tidy warnings. (void)compute_line_footprint; (void)task_line_footprint; } } } if (at_production) { // Properties of the realization, but the values are // computable at the production site because that's where // the consumers are. internal_assert(bytes_loaded >= 0) << "Negative bytes loaded: " << bytes_loaded << "\n"; feat.allocation_bytes_read_per_realization = allocation_bytes_loaded; feat.unique_bytes_read_per_realization = bytes_loaded; feat.unique_lines_read_per_realization = lines_loaded; if (!at_pure_production) { // Also pessimistically assume this update definition relies on the entirety of the produced region so far. // TODO: This overbills scatters, or writes to a sub-window. 
internal_assert(bytes_loaded >= 0) << "Negative bytes at production: " << feat.bytes_at_production << "\n"; feat.unique_bytes_read_per_realization += feat.bytes_at_production; feat.unique_lines_read_per_realization += feat.bytes_at_production / feat.innermost_bytes_at_production; feat.allocation_bytes_read_per_realization += feat.bytes_at_production; } } if (innermost) { feat.points_computed_per_production = subinstances / feat.num_productions; // Halide codegens strided loads for small strides as a // large dense vector load and a cheap swizzle. ARM even // has instructions that do this for free on load // (e.g. vld4). feat.vector_loads_per_vector = (num_dense_loads + 2 * num_stride_2_loads + 3 * num_stride_3_loads + 4 * num_stride_4_loads); feat.scalar_loads_per_vector = num_broadcasts + feat.vector_size * num_gathers; feat.scalar_loads_per_scalar = num_loads; if (stage->index > 0) { // Assume at update definitions we do a self-load feat.vector_loads_per_vector++; feat.scalar_loads_per_scalar++; } feat.unique_bytes_read_per_vector = bytes_loaded; feat.unique_lines_read_per_vector = lines_loaded; } // Track features for inlined Funcs for (auto it = inlined.begin(); it != inlined.end(); it++) { const auto *f = it.key(); internal_assert(f); auto &inlined_feat = features->get_or_create(&(f->stages[0])); inlined_feat.inlined_calls += it.value() * subinstances; inlined_feat.num_vectors += it.value() * feat.num_vectors; inlined_feat.num_scalars += it.value() * feat.num_scalars; inlined_feat.native_vector_size = stage->vector_size; if (inlined_feat.vector_size > 0) { inlined_feat.vector_size = std::min(inlined_feat.vector_size, (double)stage->vector_size); } else { inlined_feat.vector_size = feat.vector_size; } if (inlined_feat.innermost_pure_loop_extent > 0) { inlined_feat.innermost_pure_loop_extent = std::min(inlined_feat.innermost_pure_loop_extent, feat.innermost_pure_loop_extent); } else { inlined_feat.innermost_pure_loop_extent = feat.innermost_pure_loop_extent; } inlined_feat.inner_parallelism = 1; inlined_feat.outer_parallelism = parallelism; // Memoize intermediate features, based on stage. if (use_cached_features) { const auto &block = sites.get(stage).task; uint64_t hash_of_producers = sites.get(block->stage).hash_of_producers_stored_at_root; auto &intermediate_map = block->feature_intermediates_cache[hash_of_producers].get_or_create(&(f->stages[0])); auto &intermediate = intermediate_map.get_or_create(stage); intermediate.inlined_calls = it.value() * subinstances; intermediate.num_scalars = it.value() * feat.num_scalars; intermediate.innermost_pure_loop_extent = feat.innermost_pure_loop_extent; intermediate.outer_parallelism = parallelism; } } } // Get the region required of a Func at this site, from which we // know what region would be computed if it were scheduled here, // and what its loop nest would be. const Bound &LoopNest::get_bounds(const FunctionDAG::Node *f) const { if (bounds.contains(f)) { const Bound &b = bounds.get(f); // Expensive validation for debugging // b->validate(); return b; } auto *bound = f->make_bound(); // Compute the region required if (f->is_output && is_root()) { // It's an output. Use the bounds estimate. for (int i = 0; i < f->dimensions; i++) { bound->region_required(i) = f->estimated_region_required[i]; } } else { internal_assert(!f->outgoing_edges.empty()) << "No consumers of " << f->func.name() << " at loop over " << (is_root() ? 
"root" : node->func.name()) << "\n"; auto init = Span::empty_span(); for (int i = 0; i < f->dimensions; i++) { bound->region_required(i) = init; } for (const auto *e : f->outgoing_edges) { // Ignore consumers outside of this loop nest if (!is_root() && (stage != e->consumer) && !stage->downstream_of(*(e->consumer->node))) { continue; } const auto &c_bounds = get_bounds(e->consumer->node); // Get the concrete sizes of the consuming loop const auto *consumer_loop = &(c_bounds->loops(e->consumer->index, 0)); // Use the bounds relationship between the nodes to // map from the consumer's loop to the required region // of the producer. e->expand_footprint(consumer_loop, &(bound->region_required(0))); } } // Given a required region of this producer, use the bounds // analysis to figure out what region actually gets // computed. For most funcs, these are the same. Some things, // like histograms or scans, you can only really compute all // of at once. f->required_to_computed(&(bound->region_required(0)), &(bound->region_computed(0))); // Finally, figure out what loop nests will be used to compute // this region. for (int i = 0; i < (int)f->stages.size(); i++) { f->loop_nest_for_region(i, &(bound->region_computed(0)), &(bound->loops(i, 0))); } const Bound &b = set_bounds(f, bound); // Validation is expensive, turn if off by default. // b->validate(); return b; } // Recursively print a loop nest representation to stderr void LoopNest::dump(std::ostream &os, string prefix, const LoopNest *parent) const { if (!is_root()) { // Non-root nodes always have parents. internal_assert(parent != nullptr); os << prefix << node->func.name(); prefix += " "; for (size_t i = 0; i < size.size(); i++) { os << " " << size[i]; // The vectorized loop gets a 'v' suffix if (innermost && i == (size_t)vectorized_loop_index) { os << "v"; } // Loops that have a known constant size get a // 'c'. Useful for knowing what we can unroll. if (parent->get_bounds(node)->loops(stage->index, i).constant_extent()) { os << "c"; } } // Uncomment when debugging the representative loop bounds selected. /* const auto &bounds = get_bounds(node); for (size_t i = 0; i < size.size(); i++) { const auto &p = bounds->loops(stage->index, i); os << " [" << p.first << ", " << p.second << "]"; } */ os << " (" << vectorized_loop_index << ", " << vector_dim << ")"; } if (tileable) { os << " t"; } if (innermost) { os << " *\n"; } else if (parallel) { os << " p\n"; } else { os << "\n"; } for (const auto *p : store_at) { os << prefix << "realize: " << p->func.name() << "\n"; } for (size_t i = children.size(); i > 0; i--) { children[i - 1]->dump(os, prefix, this); } for (auto it = inlined.begin(); it != inlined.end(); it++) { os << prefix << "inlined: " << it.key()->func.name() << " " << it.value() << "\n"; } } // Does this loop nest access the given Func bool LoopNest::calls(const FunctionDAG::Node *f) const { for (const auto &c : children) { if (c->calls(f)) { return true; } } for (const auto *e : f->outgoing_edges) { if (e->consumer == stage) { return true; } if (inlined.contains(e->consumer->node)) { return true; } } return false; } // What is the maximum number of inlined calls to a Func that // occur within this loop. Used to prune states that would // generate too much code. 
int64_t LoopNest::max_inlined_calls() const { int64_t result = 0; for (auto it = inlined.begin(); it != inlined.end(); it++) { result = std::max(result, it.value()); } for (const auto &c : children) { result = std::max(result, c->max_inlined_calls()); } return result; } // Does this loop nest access an input buffer? Used to select // trail strategies when splitting loops. We don't want to read // out of bounds on inputs, even if we don't intend to use the // values read. It could create annoying assertion failures for // the user. It's OK to read out of range of the values computed // on internal Funcs though. Allocation bounds inference just pads // out the bounds so that it won't fault. bool LoopNest::accesses_input_buffer() const { for (const auto &c : children) { if (c->accesses_input_buffer()) { return true; } } if (is_root()) { return false; } auto check = [&](const FunctionDAG::Node::Stage *s) { for (const auto *e : s->incoming_edges) { if (e->producer->is_input) { return true; } } for (int t = 0; t < (int)PipelineFeatures::ScalarType::NumScalarTypes; t++) { if (s->features.op_histogram[(int)PipelineFeatures::OpType::ImageCall][t] > 0) { return true; } } return false; }; if (check(stage)) { return true; } for (auto it = inlined.begin(); it != inlined.end(); it++) { if (check(&(it.key()->stages[0]))) { return true; } } return false; } // Does this loop nest contain a computation of the given Func. bool LoopNest::computes(const FunctionDAG::Node *f) const { if (f == node) { return true; } if (inlined.contains(f)) { return true; } for (const auto &c : children) { if (c->computes(f)) { return true; } } return false; } // Above here most methods query the loop nest. Below we have // methods that mutate the loop nest. // Inline a Func into all consumers within this loop. void LoopNest::inline_func(const FunctionDAG::Node *f) { // Inline it into the children for (auto &child : children) { if (child->calls(f)) { auto new_child = std::make_unique(); new_child->copy_from(*child); new_child->inline_func(f); child = new_child.release(); } } // Inline it here if there are any direct calls if (innermost) { int64_t calls = 0; for (const auto *e : f->outgoing_edges) { if (inlined.contains(e->consumer->node)) { calls += inlined.get(e->consumer->node) * e->calls; } if (e->consumer == stage) { calls += e->calls; } } if (calls) { inlined.insert(f, calls); } } } // Compute a Func at this site. void LoopNest::compute_here(const FunctionDAG::Node *f, bool tileable, int v, const Adams2019Params ¶ms) { const auto &bounds = get_bounds(f); const bool may_subtile = (params.disable_subtiling == 0); if (!may_subtile) { // If we are restricting ourselves to the Mullapudi et al // scheduling space, then once something is computed here // we may not subtile this loop. this->tileable = false; } for (int s = (int)f->stages.size() - 1; s >= 0; s--) { LoopNest *node = new LoopNest; node->node = f; node->stage = &f->stages[s]; node->innermost = true; node->vectorized_loop_index = -1; node->tileable = tileable && (is_root() || may_subtile); // Set up a bound for the inside of the // loop. computed/required is still the full region, but // the loop nest will be a single representative point. auto *single_point = bounds->make_copy(); size_t loop_dim = f->stages[s].loop.size(); node->size.resize(loop_dim); int64_t vector_size = 1; for (size_t i = 0; i < loop_dim; i++) { const auto &l = bounds->loops(s, i); // Initialize the loop nest node->size[i] = l.extent(); // Use the first loop iteration to represent the inner // loop. 
We'll shift it to a later one once we decide // on vectorization. single_point->loops(s, i) = Span(l.min(), l.min(), true); internal_assert(l.max() >= l.min()) << i << " " << l.max() << " " << l.min() << "\n"; if (f->dimensions && node->size[i] >= 1 && f->stages[s].loop[i].var == f->func.args()[v]) { node->vectorized_loop_index = (int)i; vector_size = (int64_t)(node->stage->vector_size); single_point->loops(s, i).set_extent(vector_size); node->size[i] += vector_size - 1; node->size[i] /= vector_size; // Shift the loops along by some multiple of the // vector size, to pick a more representative vector // than the first. We use the middle-most. int64_t shift = vector_size * (node->size[i] / 2); single_point->loops(s, i).translate(shift); } else { int64_t shift = node->size[i] / 2; single_point->loops(s, i).translate(shift); } } // Leave region required blank inside the computation of a Func node->set_bounds(f, single_point); node->vector_dim = v; if (node->vectorized_loop_index >= 0) { // Split off the single vector as an inner loop nest. node->innermost = false; LoopNest *one_vector = new LoopNest; one_vector->node = node->node; one_vector->stage = node->stage; one_vector->tileable = false; one_vector->vectorized_loop_index = node->vectorized_loop_index; one_vector->vector_dim = v; one_vector->size.resize(loop_dim, 1); one_vector->innermost = true; auto *b = node->get_bounds(f)->make_copy(); // Set the region computed inside this node to be the first vector lane b->loops(s, node->vectorized_loop_index).set_extent(1); one_vector->set_bounds(f, b); one_vector->size[node->vectorized_loop_index] = vector_size; node->children.emplace_back(one_vector); } children.emplace_back(node); } } // Parallelize this loop according to the given tiling. IntrusivePtr LoopNest::parallelize_in_tiles(const Adams2019Params ¶ms, const vector &tiling, const LoopNest *parent) const { const bool may_subtile = (params.disable_subtiling == 0); // Split this loop and move factors to the inner loop LoopNest *inner = new LoopNest, *outer = new LoopNest; inner->node = outer->node = node; inner->stage = outer->stage = stage; inner->tileable = outer->tileable = tileable && may_subtile; inner->vector_dim = outer->vector_dim = vector_dim; inner->vectorized_loop_index = outer->vectorized_loop_index = vectorized_loop_index; outer->size = size; outer->innermost = false; outer->parallel = true; outer->tileable = may_subtile; // First make an inner loop representing a 1x1x1... 
tile inner->size.resize(size.size(), 1); inner->innermost = innermost; inner->children = children; inner->inlined = inlined; inner->bounds = bounds; inner->store_at = store_at; auto *b = inner->get_bounds(node)->make_copy(); // Then move factors from the outer loop to the inner loop const auto &parent_bounds = parent->get_bounds(node); for (size_t i = 0; i < stage->loop.size(); i++) { int l = stage->loop[i].pure_dim; int64_t outer_extent; if (l >= 0) { internal_assert(l < (int)tiling.size()) << l << " " << tiling.size() << "\n"; outer_extent = tiling[l]; } else { // RVars are moved inwards outer_extent = 1; } inner->size[i] = (outer->size[i] + outer_extent - 1) / outer_extent; // Recompute the outer size given the selected inner size outer_extent = (outer->size[i] + inner->size[i] - 1) / inner->size[i]; outer->size[i] = outer_extent; const auto &p = parent_bounds->loops(stage->index, i); int64_t min = p.min(); int64_t extent = p.extent(); extent = (extent + outer_extent - 1) / outer_extent; // Pick a better representative loop iteration for the // inner loops. min += (outer_extent / 2) * extent; bool compile_time_constant_bounds = p.constant_extent() || ((outer_extent > 1) && stage->loop[i].pure); b->loops(stage->index, i) = Span(min, min + extent - 1, compile_time_constant_bounds); } outer->set_bounds(node, b); outer->children.emplace_back(inner); return outer; } // Return all possible ways to compute f in tiles somewhere within // this loop nest. vector> LoopNest::compute_in_tiles(const FunctionDAG::Node *f, const LoopNest *parent, const Adams2019Params ¶ms, int v, bool in_realization) const { const bool may_subtile = (params.disable_subtiling == 0); internal_assert(f); vector> result; // Some pruning to not waste time on terrible states bool must_tile_to_vectorize = false; if (parent) { const auto &bounds_here = get_bounds(f); const auto &bounds_at_parent = parent->get_bounds(f); // Don't descend into loops that break our ability to // vectorize if we could have vectorized one level up. const auto &p = bounds_here->region_computed(v); const auto &p_parent = bounds_at_parent->region_computed(v); int64_t e = p.extent(); int64_t ep = p_parent.extent(); if (ep >= f->vector_size && e < f->vector_size) { must_tile_to_vectorize = true; } // Don't descend into loops if the bounds required don't // shrink. int64_t total_here = 1, total_at_parent = 1; for (int i = 0; i < f->dimensions; i++) { const auto &range_here = bounds_here->region_computed(i); const auto &range_at_parent = bounds_at_parent->region_computed(i); total_here *= range_here.extent(); total_at_parent *= range_at_parent.extent(); } if (total_here >= total_at_parent) { return result; } } // Figure out which child we can fuse this into int child = -1; bool called_by_multiple_children = false; for (int i = 0; i < (int)children.size(); i++) { if (children[i]->calls(f)) { if (child != -1) { called_by_multiple_children = true; } child = i; } } // Place the computation directly inside this loop (provided it's not a SIMD loop) if (!must_tile_to_vectorize && !innermost && (!in_realization || size.empty() || vector_dim == -1 || size[vector_dim] == 1)) { auto r = std::make_unique(); r->copy_from(*this); r->compute_here(f, true, v, params); if (!in_realization) { r->store_at.insert(f); } else { r->tileable = false; } result.emplace_back(r.release()); } if (f->is_output) { // Outputs must be compute_root, so we're done. return result; } if (tileable) { // The root node is not tileable, so all tileable nodes have parents. 
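// --- Illustrative aside, not part of Halide: the split arithmetic used above when a loop
// of 'extent' iterations is tiled by a requested outer tiling. Both the inner size and the
// re-derived outer size use rounding-up division, and a representative iteration near the
// middle is picked for the single-point bounds. Names are hypothetical.
#include <cstdint>

struct TileSplit {
    int64_t outer;           // number of tiles
    int64_t inner;           // iterations per tile
    int64_t representative;  // a middle-ish iteration used as the symbolic point
};

inline TileSplit split_loop(int64_t extent, int64_t requested_outer) {
    TileSplit t;
    t.inner = (extent + requested_outer - 1) / requested_outer;  // ceil(extent / outer)
    t.outer = (extent + t.inner - 1) / t.inner;                  // re-derive outer from inner
    // Shift by roughly half the tiles so the representative tile is not the first one.
    t.representative = (t.outer / 2) * t.inner;
    return t;
}
// --- end illustrative aside ---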
internal_assert(parent != nullptr); // Generate a list of tile sizes to try auto tilings = generate_tilings(size, (int)(size.size() - 1), 2, !in_realization); if (tilings.size() > 10000) { aslog(1) << "Warning: lots of tilings: " << tilings.size() << "\n"; } for (auto t : tilings) { if (parallel) { const auto &l = stage->loop; // More pruning. Skip root-level tilings that // would leave too many cores idle, and root-level // tilings that would force serialization of // dimensions we have decided to parallelize over // in an earlier pass. int total = 1; size_t idx = 0; for (auto s : t) { if (l[idx].pure) { total *= s; } idx++; } const double tasks_per_core = (double)total / params.parallelism; const double idle_cores = std::ceil(tasks_per_core) / tasks_per_core; if (idle_cores > 1.1) { continue; } } // Tile this loop and place the computation at some coarser granularity LoopNest *inner = new LoopNest, *outer = new LoopNest; inner->node = outer->node = node; inner->stage = outer->stage = stage; inner->tileable = outer->tileable = tileable && may_subtile; inner->vector_dim = outer->vector_dim = vector_dim; inner->vectorized_loop_index = outer->vectorized_loop_index = vectorized_loop_index; outer->size = size; outer->innermost = false; outer->parallel = parallel; inner->parallel = false; // First make an inner loop representing a 1x1x1... tile inner->size.resize(size.size(), 1); inner->innermost = innermost; inner->children = children; inner->inlined = inlined; inner->bounds = bounds; inner->store_at = store_at; { auto *b = inner->get_bounds(node)->make_copy(); // Then move factors from the outer loop to the inner loop const auto &parent_bounds = parent->get_bounds(node); for (size_t i = 0; i < t.size(); i++) { int64_t outer_extent = t[i]; inner->size[i] = (outer->size[i] + outer_extent - 1) / outer_extent; outer->size[i] = outer_extent; const auto &p = parent_bounds->loops(stage->index, i); int64_t min = p.min(); int64_t original_extent = p.extent(); int64_t inner_extent = (original_extent + outer_extent - 1) / outer_extent; // Pick a more representative loop iteration min += (outer_extent / 2) * inner_extent; bool compile_time_constant_extent = (p.constant_extent() || outer_extent > 1) && (inner_extent == 1 || outer_extent == 1 || stage->index == 0); b->loops(stage->index, i) = Span(min, min + inner_extent - 1, compile_time_constant_extent); } // Region_{computed/required} on outer is now // wrong, but it doesn't matter because consumers // only look at the loops in get_bounds. Still, // this is weird. outer->set_bounds(node, b); } if (!in_realization) { outer->store_at.insert(f); } outer->children.emplace_back(inner); // HACK // bool may_slide = false; bool may_slide = (!in_realization && f->stages.size() == 1); if (may_slide) { // Store here, but compute further in. Currently // don't have to worry about the constraints this // places on parallelism, as we forced all the // parallelism to the outer loop. 
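// --- Illustrative aside, not part of Halide: the idle-core pruning test used just above
// when filtering root-level tilings. If the number of parallel tasks does not divide the
// core count reasonably evenly, the tiling is skipped. Hypothetical helper.
#include <cmath>

inline bool wastes_too_many_cores(int total_parallel_tasks, int num_cores) {
    const double tasks_per_core = double(total_parallel_tasks) / num_cores;
    const double idle_core_factor = std::ceil(tasks_per_core) / tasks_per_core;
    // Example: 12 tasks on 8 cores gives 1.5 tasks per core, ceil(1.5)/1.5 = 1.33 > 1.1, pruned.
    return idle_core_factor > 1.1;
}
// --- end illustrative aside ---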
auto opts = inner->compute_in_tiles(f, outer, params, v, true); for (IntrusivePtr &n : opts) { LoopNest *store_at_outer_compute_further_in = new LoopNest; store_at_outer_compute_further_in->copy_from(*outer); store_at_outer_compute_further_in->children.pop_back(); store_at_outer_compute_further_in->children.emplace_back(std::move(n)); result.emplace_back(store_at_outer_compute_further_in); } } // Site the computation inside the outer loop outer->compute_here(f, true, v, params); outer->tileable &= !in_realization; result.emplace_back(outer); } } if (child >= 0 && !called_by_multiple_children && !in_realization && (may_subtile || is_root())) { // Push the Func further inwards in the loop nest // See if it's appropriate to slide over this loop Can't // slide at the root level if we intend to parallelize it. bool may_slide = (params.parallelism == 1) || !is_root(); const auto &c = children[child]; int num_ones = 0; for (int64_t s : c->size) { num_ones += (s == 1) ? 1 : 0; } // Some pruning: // Only slide over single-dimensional loops may_slide &= num_ones == ((int)c->size.size() - 1); // Don't slide funcs with update stages may_slide &= f->stages.size() == 1; // Don't slide over the vector dimension may_slide &= (c->vectorized_loop_index == -1 || c->size[c->vectorized_loop_index] == 1); for (int store_here = 0; store_here < 2; store_here++) { if (store_here && !may_slide) { // We place all our parallel loops at the root // level, so this would constrain parallelism. continue; } if (is_root() && num_ones == (int)c->size.size() && params.parallelism > 1) { // Don't fuse into serial loops, or we could never parallelize this Func. continue; } auto opts = children[child]->compute_in_tiles(f, this, params, v, store_here); for (IntrusivePtr &n : opts) { // (Only valid if one child calls f) Push the // computation into the child. Possibly leaving // the storage out here. LoopNest *r = new LoopNest; r->copy_from(*this); if (store_here) { r->store_at.insert(f); } r->children[child] = n; result.emplace_back(r); } } } return result; } // Apply the schedule represented by this loop nest to a Halide pipeline. void LoopNest::apply(LoopLevel here, StageMap> &state_map, double num_cores, int depth, const LoopNest *parent, const LoopNest *compute_site) const { if (is_root()) { for (const auto &c : children) { Func(c->node->func).compute_root(); c->apply(LoopLevel::root(), state_map, num_cores, 1, this, c.get()); if (c->stage->index == 0) { auto &state = state_map.get(c->stage); state->schedule_source << "\n .compute_root()"; // TODO: Omitting logic for printing store_root() assumes everything store_root is also compute root } } } else { // Non-root nodes always have parents. 
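// --- Illustrative aside, not part of Halide: a compact restatement of the
// "may we slide this producer over the chosen child loop?" pruning above.
// All parameter names are hypothetical.
inline bool may_slide_over_child(bool is_root_loop,
                                 int parallelism,
                                 int child_dims_of_size_one,
                                 int child_total_dims,
                                 int producer_stage_count,
                                 bool child_vector_dim_has_extent_one) {
    // Can't slide at the root if we intend to parallelize the root loop.
    bool ok = (parallelism == 1) || !is_root_loop;
    // Only slide over effectively one-dimensional loops.
    ok &= (child_dims_of_size_one == child_total_dims - 1);
    // Funcs with update stages can't be slid.
    ok &= (producer_stage_count == 1);
    // Don't slide over the vectorized dimension.
    ok &= child_vector_dim_has_extent_one;
    return ok;
}
// --- end illustrative aside ---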
internal_assert(parent != nullptr); if (parent->node != node) { compute_site = this; } const auto &symbolic_loop = stage->loop; const auto &parent_bounds = parent->get_bounds(node); if (!state_map.contains(stage)) { StageScheduleState *state = new StageScheduleState; state->num_cores = num_cores; state->vector_dim = vector_dim; state->vectorized_loop_index = vectorized_loop_index; for (size_t i = 0; i < symbolic_loop.size(); i++) { StageScheduleState::FuncVar fv; const auto &l = symbolic_loop[i]; fv.var = VarOrRVar(l.var, !l.pure); fv.orig = fv.var; fv.accessor = l.accessor; const auto &p = parent_bounds->loops(stage->index, i); fv.extent = p.extent(); fv.constant_extent = p.constant_extent(); fv.outermost = true; fv.parallel = l.pure && parallel; fv.exists = true; fv.pure = l.pure; fv.index = i; fv.innermost_pure_dim = (i == (size_t)vectorized_loop_index); state->vars.push_back(fv); } // Bubble the innermost pure dimension to the front of the pure dimensions for (int i = vectorized_loop_index - 1; i >= 0 && state->vars[i].pure; i--) { std::swap(state->vars[i], state->vars[i + 1]); } state_map.emplace(stage, std::unique_ptr(state)); } auto &state = *(state_map.get(stage)); // The getter for grabbing Func handles is reverse topological order Stage s = Func(node->func); if (stage->index > 0) { s = Func(node->func).update(stage->index - 1); } if (stage->index == 0 && parent->node != node) { // Pick a memory type double bytes = node->bytes_per_point; for (int i = 0; i < node->dimensions; i++) { const auto &p = parent_bounds->region_computed(i); bytes *= p.extent(); } if (bytes < 64000 && depth > 2) { // If it's probably a small allocation, and it's // made more than once, use stack-scoped // storage. Otherwise let the compiler pick heap // or stack as it likes. Func(node->func).store_in(MemoryType::Stack); state.schedule_source << "\n .store_in(MemoryType::Stack)"; } } // Pick a tail strategy for any splits of pure vars. RVars always use guardwithif auto pure_var_tail_strategy = TailStrategy::Auto; if (!compute_site->accesses_input_buffer() && !node->is_output) { // Roundup is lowest overhead, provided it doesn't // expand the bounds read on the input or written on // the output. However, you can only really use it on // pure stages that don't access the input anywhere in // their loop nest. 
pure_var_tail_strategy = TailStrategy::RoundUp; } else if (stage->index == 0) { // Pure stages that access the input use shiftinwards pure_var_tail_strategy = TailStrategy::ShiftInwards; } else { // For pure vars in update stages that access the // input, it's not safe to round up or redundantly // recompute pure_var_tail_strategy = TailStrategy::GuardWithIf; } if (!size.empty()) { if (innermost) { if (vectorized_loop_index >= 0) { size_t i = 0; while (!state.vars[i].innermost_pure_dim) { i++; } auto &v = state.vars[i]; internal_assert(v.innermost_pure_dim && v.exists) << v.var.name() << "\n"; // Is the result of a split state.schedule_source << "\n .vectorize(" << v.var.name() << ")"; s.vectorize(v.var); } } else { // Grab the innermost loop for this node const LoopNest *innermost_loop = this, *child = nullptr; while (!innermost_loop->innermost) { for (const auto &c : innermost_loop->children) { if (c->node == node) { if (!child) { child = c.get(); } innermost_loop = c.get(); break; } } } // Do the implied splits vector new_inner; for (size_t i = 0; i < symbolic_loop.size(); i++) { StageScheduleState::FuncVar v; StageScheduleState::FuncVar &parent = state.vars[i]; int64_t factor = (parent.extent + size[parent.index] - 1) / size[parent.index]; int64_t innermost_size = innermost_loop->size[parent.index]; if (child && parent.innermost_pure_dim) { // Ensure the split is a multiple of the // vector size. With all these rounded // divs going on it can drift. factor = ((factor + innermost_size - 1) / innermost_size) * innermost_size; } if (child && innermost_size > factor) { factor = innermost_size; } if (!parent.exists || factor == 1) { v.exists = false; v.extent = 1; } else if (size[parent.index] == 1 && !(child && child->innermost && parent.innermost_pure_dim && parent.var.name() == parent.orig.name())) { // Not split in this dimension v = parent; v.parallel = false; parent.exists = false; parent.extent = 1; } else { VarOrRVar inner(Var(parent.var.name() + "i")); if (parent.var.is_rvar) { inner = RVar(parent.var.name() + "i"); } auto tail_strategy = pure_var_tail_strategy; // If it's an RVar, or not the outermost split and we're in an update, we need a guard with if instead. if (parent.var.is_rvar || (stage->index != 0 && !parent.outermost)) { tail_strategy = TailStrategy::GuardWithIf; } if (factor > parent.extent && tail_strategy == TailStrategy::ShiftInwards) { // Don't shift all the way off the image. 
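// --- Illustrative aside, not part of Halide: the two scheduling heuristics applied above,
// restated as standalone functions with hypothetical names. The first mirrors the
// small-allocation test that requests MemoryType::Stack; the second mirrors the
// tail-strategy choice for splits of pure vars (RVars always get GuardWithIf).
inline bool prefer_stack_storage(double allocation_bytes, int loop_depth) {
    // Small allocations made more than once are asked to live on the stack;
    // otherwise the compiler is left to choose between heap and stack.
    return allocation_bytes < 64000 && loop_depth > 2;
}

enum class Tail { RoundUp, ShiftInwards, GuardWithIf };

inline Tail choose_pure_var_tail_strategy(bool accesses_input, bool is_output, bool is_pure_stage) {
    if (!accesses_input && !is_output) {
        return Tail::RoundUp;       // cheapest, safe only when it can't over-read or over-write
    } else if (is_pure_stage) {
        return Tail::ShiftInwards;  // pure stages that touch the input
    } else {
        return Tail::GuardWithIf;   // update stages: can't round up or redundantly recompute
    }
}
// --- end illustrative aside ---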
tail_strategy = TailStrategy::GuardWithIf; } s.split(parent.var, parent.var, inner, (int)factor, tail_strategy); state.schedule_source << "\n .split(" << parent.var.name() << ", " << parent.var.name() << ", " << inner.name() << ", " << factor << ", " << "TailStrategy::" << tail_strategy << ")"; v = parent; parent.extent = size[parent.index]; v.constant_extent = (tail_strategy != TailStrategy::GuardWithIf); v.var = inner; v.accessor.clear(); v.extent = factor; v.parallel = false; v.outermost = false; } new_inner.push_back(v); } if (child->innermost) { // Maybe do some unrolling int64_t product_of_pure_loops = 1; bool all_pure_loops_constant_size = true; for (size_t i = 0; i < symbolic_loop.size(); i++) { if (state.vars[i].pure) { product_of_pure_loops *= state.vars[i].extent; all_pure_loops_constant_size &= state.vars[i].constant_extent; } } if (product_of_pure_loops <= kUnrollLimit && all_pure_loops_constant_size) { // There's a hope we can fit anything compute-at this level into registers if we fully unroll // TODO: 16 should be the number of vector registers in the architecture std::stable_sort(state.vars.begin(), state.vars.begin() + symbolic_loop.size(), [](const StageScheduleState::FuncVar &a, const StageScheduleState::FuncVar &b) { return a.pure && !b.pure; }); for (size_t i = 0; i < symbolic_loop.size(); i++) { if (state.vars[i].pure && state.vars[i].exists && state.vars[i].extent > 1) { s.unroll(state.vars[i].var); state.schedule_source << "\n .unroll(" << state.vars[i].var.name() << ")"; } } } } bool found = false; for (const auto &v : state.vars) { if (!v.exists) { continue; } here = LoopLevel(node->func, v.var); found = true; break; } if (!found) { here = LoopLevel(node->func, Var::outermost()); } // internal_assert(found) << "Could not find appropriate compute_at location for children of " << node->func.name() << "\n"; state.vars.insert(state.vars.begin(), new_inner.begin(), new_inner.end()); } } if (innermost) { internal_assert(store_at.empty()); internal_assert(children.empty()); return; } for (const auto *f : store_at) { Func(f->func).store_at(here); } for (auto s : size) { num_cores /= s; } here.lock(); string loop_level; if (here.is_root()) { loop_level = "_root()"; } else { loop_level = "_at(" + here.func() + ", " + here.var().name() + ")"; } for (const auto &c : children) { if (c->node != node) { Func(c->node->func).compute_at(here); } c->apply(here, state_map, num_cores, depth + 1, this, compute_site); if (c->node != node && c->stage->index == 0) { auto &state = *(state_map.get(c->stage)); state.schedule_source << "\n .compute" << loop_level; } } for (const auto *f : store_at) { bool computed_here = false; for (const auto &c : children) { if (c->node == f) { computed_here = true; break; } } if (!computed_here) { auto &state = *(state_map.get(&(f->stages[0]))); state.schedule_source << "\n .store" << loop_level; } } } } void LoopNest::copy_from_including_features(const LoopNest &n) { size = n.size; children = n.children; inlined = n.inlined; store_at = n.store_at; bounds = n.bounds; node = n.node; stage = n.stage; innermost = n.innermost; tileable = n.tileable; parallel = n.parallel; vector_dim = n.vector_dim; vectorized_loop_index = n.vectorized_loop_index; features_cache = n.features_cache; feature_intermediates_cache = n.feature_intermediates_cache; } void LoopNest::memoize_points_computed_minimum(StageMap &memoized_features, const StageMap *features) const { for (auto it = inlined.begin(); it != inlined.end(); it++) { const auto *node = it.key(); const auto 
*stage_ptr = &(node->stages[0]); const auto &inlined_feat = features->get(stage_ptr); // Save pcm into memoized_features. memoized_features.get(stage_ptr).points_computed_minimum = inlined_feat.points_computed_minimum; } memoized_features.get(stage).points_computed_minimum = features->get(stage).points_computed_minimum; for (const auto &c : children) { c->memoize_points_computed_minimum(memoized_features, features); } } void LoopNest::memoize_features(StageMap &memoized_features, const StageMap *features_to_insert) const { for (auto it = inlined.begin(); it != inlined.end(); it++) { const auto *node = it.key(); const auto *stage_ptr = &(node->stages[0]); if (memoized_features.contains(stage_ptr)) { continue; } internal_assert(features_to_insert->contains(stage_ptr)) << "memoize_features attempted to save a stage_ptr that doesn't exist\n"; const auto &inlined_feat = features_to_insert->get(stage_ptr); memoized_features.insert(stage_ptr, inlined_feat); } if (!memoized_features.contains(stage)) { internal_assert(features_to_insert->contains(stage)) << "memoize_features attempted to save this->stage but that's not in features_to_insert\n"; memoized_features.insert(stage, features_to_insert->get(stage)); } for (const auto &c : children) { c->memoize_features(memoized_features, features_to_insert); } } void LoopNest::compute_working_set_from_features(int64_t *working_set, const StageMap *features) const { int64_t working_set_here = 0; for (const auto &c : children) { c->compute_working_set_from_features(&working_set_here, features); } for (const auto *node : store_at) { const auto &feat = features->get(&(node->stages[0])); working_set_here += feat.bytes_at_production; } *working_set += working_set_here; } void LoopNest::recompute_inlined_features(const StageMap &sites, StageMap *features) const { for (const auto &c : children) { c->recompute_inlined_features(sites, features); } // TODO(rootjalex): Figure out why hoisting the fetching of block / hash / cache_map breaks this loop. 
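// --- Illustrative aside, not part of Halide: the working-set recomputation above is a
// simple post-order accumulation: the bytes of every allocation stored at a loop level
// are added to the working set of everything that encloses it. Toy version with
// hypothetical types (valid C++17, which allows vectors of an incomplete element type).
#include <cstdint>
#include <vector>

struct ToyLoop {
    std::vector<ToyLoop> children;
    std::vector<int64_t> bytes_of_allocations_stored_here;
};

inline void accumulate_working_set(const ToyLoop &loop, int64_t *working_set) {
    int64_t working_set_here = 0;
    for (const ToyLoop &c : loop.children) {
        accumulate_working_set(c, &working_set_here);
    }
    for (int64_t b : loop.bytes_of_allocations_stored_here) {
        working_set_here += b;
    }
    *working_set += working_set_here;
}
// --- end illustrative aside ---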
for (auto it = inlined.begin(); it != inlined.end(); it++) { const auto *f = it.key(); internal_assert(f); const auto &block = sites.get(stage).task; internal_assert(sites.contains(block->stage)); uint64_t hash_of_producers = sites.get(block->stage).hash_of_producers_stored_at_root; internal_assert(block->feature_intermediates_cache.count(hash_of_producers) > 0); auto &intermediate_map = block->feature_intermediates_cache[hash_of_producers].get(&(f->stages[0])); auto &intermediate = intermediate_map.get(stage); auto &inlined_feat = features->get(&(f->stages[0])); inlined_feat.inlined_calls += intermediate.inlined_calls; inlined_feat.num_scalars += intermediate.num_scalars; if (inlined_feat.innermost_pure_loop_extent > 0) { inlined_feat.innermost_pure_loop_extent = std::min(inlined_feat.innermost_pure_loop_extent, intermediate.innermost_pure_loop_extent); } else { inlined_feat.innermost_pure_loop_extent = intermediate.innermost_pure_loop_extent; } inlined_feat.outer_parallelism = intermediate.outer_parallelism; } } uint64_t LoopNest::compute_hash_of_producers_stored_at_root(const StageMap &sites) const { vector> producers = collect_producers(sites); // Sort them according to node id std::sort(producers.begin(), producers.end(), [](const pair &a, const pair &b) { return a.first < b.first; }); uint64_t store_root_hash = 0; for (const auto &p : producers) { hash_combine(store_root_hash, p.first); hash_combine(store_root_hash, p.second); } return store_root_hash; } vector> LoopNest::collect_producers(const StageMap &sites) const { set stages; collect_stages(stages); vector pending; for (const auto *stage : stages) { for (const auto *e : stage->incoming_edges) { pending.push_back(e); } } set done; vector> producers; // Collect all producers of the funcs within this LoopNest while (!pending.empty()) { const auto *e = pending.back(); pending.pop_back(); if (done.count(e->producer)) { continue; } done.insert(e->producer); const auto &site = sites.get(&(e->producer->stages[0])); if (site.store->is_root()) { int vector_dim = (e->producer->is_input ? 0 : site.produce != nullptr ? site.produce->vector_dim : -1); producers.emplace_back(e->producer->id, vector_dim); } else if (site.produce != nullptr) { // Computation must be nested inside this task or inlined into it. 
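// --- Illustrative aside, not part of Halide: the producer hash built above sorts
// (node id, vector_dim) pairs by id and folds them with a boost-style hash_combine,
// matching the combiner declared in LoopNest.h further below. Names are hypothetical.
#include <algorithm>
#include <cstdint>
#include <utility>
#include <vector>

inline void toy_hash_combine(uint64_t &h, uint64_t next) {
    h ^= (next + 0x9e3779b9 + (h << 6) + (h >> 2));
}

inline uint64_t hash_of_producers(std::vector<std::pair<int, int>> producers) {
    std::sort(producers.begin(), producers.end(),
              [](const std::pair<int, int> &a, const std::pair<int, int> &b) {
                  return a.first < b.first;
              });
    uint64_t h = 0;
    for (const auto &p : producers) {
        toy_hash_combine(h, uint64_t(p.first));
        toy_hash_combine(h, uint64_t(p.second));
    }
    return h;
}
// --- end illustrative aside ---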
for (const auto &s : e->producer->stages) { for (const auto *e2 : s.incoming_edges) { pending.push_back(e2); } } } } return producers; } void LoopNest::collect_stages(std::set &stages) const { stages.insert(stage); for (const auto &c : children) { c->collect_stages(stages); } } const LoopNest *deepest_common_ancestor(const map> &parents, const LoopNest *a, const LoopNest *b) { if (a->is_root()) { return a; } if (b->is_root()) { return b; } if (a == b) { return a; } // Walk the deeper one up until they're at the same depth auto it_a = parents.find(a); auto it_b = parents.find(b); internal_assert(it_a != parents.end() && it_b != parents.end()); while (it_a->second.second > it_b->second.second) { a = it_a->second.first; it_a = parents.find(a); } while (it_b->second.second > it_a->second.second) { b = it_b->second.first; it_b = parents.find(b); } while (true) { // Walk each up one a = it_a->second.first; b = it_b->second.first; if (a == b) { return a; } it_a = parents.find(a); it_b = parents.find(b); internal_assert(it_a != parents.end() && it_b != parents.end()); } // unreachable return nullptr; } // Compute the parent and depth of every loop nest node void compute_loop_nest_parents(map> &parents, const LoopNest *here, int depth) { for (const auto &c : here->children) { parents.emplace(c.get(), pair{here, depth}); compute_loop_nest_parents(parents, c.get(), depth + 1); } } } // namespace Autoscheduler } // namespace Internal } // namespace Halide Halide-17.0.1/src/autoschedulers/adams2019/LoopNest.h000066400000000000000000000313731456515664200221510ustar00rootroot00000000000000/** This file defines the LoopNest, which is our * representation of a Halide schedule, and contains methods to * generate candidates for scheduling as well as extract a * featurization that can be used to cost each candidate. */ #ifndef LOOP_NEST_H #define LOOP_NEST_H #include "FunctionDAG.h" #include "PerfectHashMap.h" #include #include #include #include namespace Halide { namespace Internal { namespace Autoscheduler { template using NodeMap = PerfectHashMap; template using StageMap = PerfectHashMap; // Given a multi-dimensional box of dimensionality d, generate a list // of candidate tile sizes for it, logarithmically spacing the sizes // using the given factor. If 'allow_splits' is false, every dimension // must either be one, or the full extent of the box. This function is // used to generate candidate tilings when tiling for // producer-consumer fusion, or tiling for parallelism. std::vector> generate_tilings(const vector &s, int d, int factor, bool allow_splits); struct LoopNest { mutable RefCount ref_count; // The extents of this loop. Put another way, the number of tiles, // not the size of each tile. std::vector size; // The nodes inside the loop body std::vector> children; // Funcs inlined into this inner loop, and the number of times // each is called. Only valid if children is empty. NodeMap inlined; // Funcs stored inside this loop std::set store_at; // The total bounds required of any given Func over all iterations // of this loop. In the paper, this is represented using the // little boxes to the left of the loop nest tree figures. mutable NodeMap bounds; // The Func this loop nest belongs to const FunctionDAG::Node *node = nullptr; // The stage of the Func const FunctionDAG::Node::Stage *stage = nullptr; // Is this the innermost loop of this func (the SIMD loop)? bool innermost = false; // Are we permitted to tile this loop? bool tileable = false; // Is this the parallel outer loop? 
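// --- Illustrative aside, not part of Halide: deepest_common_ancestor above is the classic
// "walk the deeper node up, then walk both up in lock-step" lowest-common-ancestor scheme
// over the (parent, depth) map built by compute_loop_nest_parents. Toy version with
// hypothetical types; unlike the real code it simply returns nullptr if a root is reached.
#include <map>
#include <utility>

template<typename Node>
const Node *toy_lca(const std::map<const Node *, std::pair<const Node *, int>> &parents,
                    const Node *a, const Node *b) {
    auto it_a = parents.find(a);
    auto it_b = parents.find(b);
    // Bring both to the same depth.
    while (it_a != parents.end() && it_b != parents.end() &&
           it_a->second.second > it_b->second.second) {
        a = it_a->second.first;
        it_a = parents.find(a);
    }
    while (it_a != parents.end() && it_b != parents.end() &&
           it_b->second.second > it_a->second.second) {
        b = it_b->second.first;
        it_b = parents.find(b);
    }
    // Then step both upwards until they meet.
    while (a != b && it_a != parents.end() && it_b != parents.end()) {
        a = it_a->second.first;
        b = it_b->second.first;
        it_a = parents.find(a);
        it_b = parents.find(b);
    }
    return (a == b) ? a : nullptr;
}
// --- end illustrative aside ---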
bool parallel = false; // What dimension is this Func vectorized over, in terms of the pure args of the Func? int vector_dim = -1; // Which loop corresponds to the innermost storage dimension and will be vectorized. -1 means none of them. int vectorized_loop_index = -1; void copy_from(const LoopNest &n); static void hash_combine(uint64_t &h, uint64_t next) { // From boost h ^= (next + 0x9e3779b9 + (h << 6) + (h >> 2)); } // Hash the loop structure and sizes up to a fixed depth. This is // used as the hash function for the coarse-to-fine beam search in // the paper. void structural_hash(uint64_t &h, int depth) const; // How many funcs are scheduled inside this loop level. Used in // the structural hash. size_t funcs_realized_or_inlined() const { size_t count = inlined.size() + store_at.size(); for (const auto &c : children) { count += c->funcs_realized_or_inlined(); } return count; } // All of a stage's interesting locations in the loop nest. Used to help compute the featurization of a stage. struct Sites { const LoopNest *compute = nullptr; // Its containing compute_at site const LoopNest *store = nullptr; // Its containing store_at site const LoopNest *produce = nullptr; // Its own outermost node const LoopNest *innermost = nullptr; // Its innermost node - usually a SIMD loop const LoopNest *task = nullptr; // The parallel for loop it belongs to bool inlined = false; // Is the Func inlined? // Used for caching features/feature intermediates. uint64_t hash_of_producers_stored_at_root; }; // Compute all the sites of interest for each pipeline stage void get_sites(StageMap &sites, const LoopNest *task = nullptr, const LoopNest *parent = nullptr) const; // A helper for the working_set_at_task feature. Most features are // computed in the recursive pass 'compute_features' below, but // this one must be done in a second separate recursive pass. void set_working_set_at_task_feature(int64_t working_set, StageMap *features) const { for (const auto &c : children) { c->set_working_set_at_task_feature(working_set, features); features->get(c->stage).working_set_at_task = working_set; } } // Do a recursive walk over the loop nest computing features to feed the cost model. void compute_features(const FunctionDAG &dag, const Adams2019Params ¶ms, const StageMap &sites, int64_t instances, int64_t parallelism, const LoopNest *parent, const LoopNest *grandparent, const LoopNest &root, int64_t *working_set, StageMap *features, bool use_cached_features) const; bool is_root() const { // The root is the sole node without a Func associated with // it. return node == nullptr; } // Set the region required of a Func at this site. const Bound &set_bounds(const FunctionDAG::Node *f, BoundContents *b) const { return bounds.emplace(f, b); } // Get the region required of a Func at this site, from which we // know what region would be computed if it were scheduled here, // and what its loop nest would be. const Bound &get_bounds(const FunctionDAG::Node *f) const; // Recursively print a loop nest representation to stderr void dump(std::ostream &os, string prefix, const LoopNest *parent) const; // Does this loop nest access the given Func bool calls(const FunctionDAG::Node *f) const; // What is the maximum number of inlined calls to a Func that // occur within this loop. Used to prune states that would // generate too much code. int64_t max_inlined_calls() const; // Does this loop nest access an input buffer? Used to select // trail strategies when splitting loops. 
We don't want to read // out of bounds on inputs, even if we don't intend to use the // values read. It could create annoying assertion failures for // the user. It's OK to read out of range of the values computed // on internal Funcs though. Allocation bounds inference just pads // out the bounds so that it won't fault. bool accesses_input_buffer() const; // Does this loop nest contain a computation of the given Func. bool computes(const FunctionDAG::Node *f) const; // Above here most methods query the loop nest. Below we have // methods that mutate the loop nest. // Inline a Func into all consumers within this loop. void inline_func(const FunctionDAG::Node *f); // Compute a Func at this site. void compute_here(const FunctionDAG::Node *f, bool tileable, int v, const Adams2019Params ¶ms); // Parallelize this loop according to the given tiling. IntrusivePtr parallelize_in_tiles(const Adams2019Params ¶ms, const vector &tiling, const LoopNest *parent) const; // Return all possible ways to compute f in tiles somewhere within // this loop nest. std::vector> compute_in_tiles(const FunctionDAG::Node *f, const LoopNest *parent, const Adams2019Params ¶ms, int v, bool in_realization) const; // Below here we have methods that apply a schedule to a Halide pipeline. // A model of the state of the loop nest of a Func while applying // Halide's scheduling directives. // Note that StageScheduleState is movable-but-not-copyable thanks // to its ostringstream member. struct StageScheduleState { // How much parallelism do we need to exploit with this Func? double num_cores = 0; // Which storage dimension is vectorized? We need to reorder it innermost int vector_dim = -1; int vectorized_loop_index = -1; // The various Vars and RVars used for scheduling a Func. struct FuncVar { // The top-level var or rvar this was split off from VarOrRVar orig; // This var. VarOrRVar var; // Source code to access this Var/RVar. Used for printing // valid Halide source for this schedule. string accessor; // Our estimate of the extent of this var. This is exact // when constant_extent flag is true. int64_t extent = 0; // Which index in the symbolic loop nest does this var // belong to. size_t index = 0; // Some flags. bool innermost_pure_dim = false, outermost = false, parallel = false, exists = false, pure = false, constant_extent = false; FuncVar() : orig(Var()), var(Var()) { } }; // In order from innermost to outermost. Each group of d is one tiling level. std::vector vars; std::ostringstream schedule_source; }; // Apply the schedule represented by this loop nest to a Halide pipeline. void apply(LoopLevel here, StageMap> &state_map, double num_cores, int depth, const LoopNest *parent, const LoopNest *compute_site) const; // The below are two feature caches. // hash of producers -> StageMap mutable std::map>> feature_intermediates_cache; // hash of producers -> StageMap mutable std::map> features_cache; // Same as copy_from (above) but also copies the two caches. void copy_from_including_features(const LoopNest &n); // Loops through inlined funcs and caches the pcm found in features, into memoized_features. void memoize_points_computed_minimum(StageMap &memoized_features, const StageMap *features) const; // Merges features_to_insert into memoized_features if it does not already exist there. 
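// --- Illustrative aside, not part of Halide: the depth-limited structural_hash declared
// above (together with the boost-style hash_combine) supports the coarse-to-fine beam
// search: hashing to a small depth groups states that agree on the outer loop structure,
// and larger depths split those groups further. A toy depth-limited tree hash, with
// hypothetical types that stand in for the real LoopNest:
#include <cstdint>
#include <vector>

struct ToyNode {
    uint64_t label = 0;
    std::vector<ToyNode> children;
};

inline void toy_combine(uint64_t &h, uint64_t next) {
    h ^= (next + 0x9e3779b9 + (h << 6) + (h >> 2));  // boost-style combiner, as above
}

inline void toy_structural_hash(const ToyNode &n, int depth, uint64_t &h) {
    toy_combine(h, n.label);
    if (depth <= 0) {
        // Below the cut-off only the number of children matters, not their structure.
        toy_combine(h, n.children.size());
        return;
    }
    for (const ToyNode &c : n.children) {
        toy_structural_hash(c, depth - 1, h);
    }
}
// --- end illustrative aside ---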
void memoize_features(StageMap &memoized_features, const StageMap *features_to_insert) const; // Recalculates working_set from cached features void compute_working_set_from_features(int64_t *working_set, const StageMap *features) const; // Features need to be recomputed for inlined Funcs void recompute_inlined_features(const StageMap &sites, StageMap *features) const; // Create a (hopefully) unique hash of the producers. uint64_t compute_hash_of_producers_stored_at_root(const StageMap &sites) const; // Gather all stages that are producers for any Func in this LoopNest. std::vector> collect_producers(const StageMap &sites) const; // Collect all stages referenced in this LoopNest. void collect_stages(std::set &stages) const; }; // Find the deepest common ancestor of `a` and `b`. // `parents` is a map from loop nest to (parent, depth) tuples. // Assumes that `a` and `b` are found in `parents`, otherwise errors. const LoopNest *deepest_common_ancestor(const std::map> &parents, const LoopNest *a, const LoopNest *b); // Compute the parent and depth of every loop nest node. // Stores in `parents` the children of `here` (keys) to tuples of (here, depth). // Recurses on all children of `here`. void compute_loop_nest_parents(std::map> &parents, const LoopNest *here, int depth); } // namespace Autoscheduler } // namespace Internal } // namespace Halide #endif // LOOP_NEST_H Halide-17.0.1/src/autoschedulers/adams2019/Makefile000066400000000000000000000077601456515664200217000ustar00rootroot00000000000000THIS_MAKEFILE = $(realpath $(filter %Makefile, $(MAKEFILE_LIST))) SRC = $(strip $(shell dirname $(THIS_MAKEFILE))) HALIDE_SRC_ROOT = $(realpath $(SRC)/../../../) COMMON_DIR ?= $(realpath $(SRC)/../common/) HALIDE_DISTRIB_PATH ?= $(HALIDE_SRC_ROOT)/distrib $(info Looking for Halide distro at $(HALIDE_DISTRIB_PATH). 
If this is incorrect, set the make variable HALIDE_DISTRIB_PATH) # Don't include an autoscheduler in the generator deps AUTOSCHEDULER= include $(HALIDE_SRC_ROOT)/apps/support/Makefile.inc # Add the relative location of libHalide.so in the rpath in a distro ifeq ($(UNAME), Darwin) HALIDE_RPATH_FOR_BIN = '-Wl,-rpath,@executable_path/../lib' HALIDE_RPATH_FOR_LIB = '-Wl,-rpath,@loader_path' else HALIDE_RPATH_FOR_BIN = '-Wl,-rpath,$$ORIGIN/../lib' HALIDE_RPATH_FOR_LIB = '-Wl,-rpath,$$ORIGIN' endif CXXFLAGS += -I$(COMMON_DIR) AUTOSCHED_WEIGHT_OBJECTS=$(BIN)/baseline_weights.o $(BIN)/binary2cpp: $(HALIDE_SRC_ROOT)/tools/binary2cpp.cpp @mkdir -p $(@D) $(CXX) $< -o $@ $(BIN)/baseline_weights.cpp: $(BIN)/binary2cpp $(SRC)/baseline.weights @mkdir -p $(@D) $(BIN)/binary2cpp baseline_weights < $(SRC)/baseline.weights > $@ $(BIN)/baseline_weights.o: $(BIN)/baseline_weights.cpp $(CXX) -c $< -o $@ AUTOSCHED_COST_MODEL_LIBS=\ $(BIN)/cost_model/adams2019_cost_model.a \ $(BIN)/cost_model/adams2019_train_cost_model.a \ $(BIN)/cost_model.generator: $(SRC)/cost_model_generator.cpp \ $(SRC)/cost_model_schedule.h \ $(SRC)/NetworkSize.h \ $(GENERATOR_DEPS) @mkdir -p $(@D) $(CXX) $(CXXFLAGS) $(filter %.cpp,$^) -o $@ $(USE_EXPORT_DYNAMIC) $(LIBHALIDE_LDFLAGS) $(BIN)/auto_schedule_runtime.a: $(BIN)/cost_model.generator @mkdir -p $(@D) $^ -r auto_schedule_runtime -o $(BIN) target=$(HL_TARGET) $(BIN)/cost_model/adams2019_%.a: $(BIN)/cost_model.generator @mkdir -p $(@D) $^ -g $* -o $(BIN)/cost_model -f $* -n adams2019_$* target=$(HL_TARGET)-no_runtime -e stmt,static_library,h,assembly # It's important to use dynamic lookups for undefined symbols here: all of libHalide # is expected to be present (in the loading binary), so we explicitly make the symbols # undefined rather than dependent on libHalide.so. # # Also, be sure *not* to include libHalide in the link steps here; that can cause misbehavior # on OSX systems in certain situations -- note that $(LIB_HALIDE) is an order-only dep, # to ensure that (eg) Halide.h is built before this. 
$(BIN)/libautoschedule_adams2019.$(PLUGIN_EXT): \ $(COMMON_DIR)/ASLog.cpp \ $(SRC)/AutoSchedule.cpp \ $(SRC)/Cache.h \ $(SRC)/Cache.cpp \ $(SRC)/DefaultCostModel.h \ $(SRC)/DefaultCostModel.cpp \ $(SRC)/Weights.h \ $(SRC)/Weights.cpp \ $(SRC)/FunctionDAG.h \ $(SRC)/FunctionDAG.cpp \ $(SRC)/LoopNest.h \ $(SRC)/LoopNest.cpp \ $(SRC)/Featurization.h \ $(SRC)/CostModel.h \ $(SRC)/State.h \ $(SRC)/State.cpp \ $(SRC)/Timer.h \ $(COMMON_DIR)/PerfectHashMap.h \ $(AUTOSCHED_WEIGHT_OBJECTS) \ $(AUTOSCHED_COST_MODEL_LIBS) \ $(BIN)/auto_schedule_runtime.a \ | $(LIB_HALIDE) @mkdir -p $(@D) $(CXX) -shared $(USE_EXPORT_DYNAMIC) -fPIC -fvisibility=hidden -fvisibility-inlines-hidden $(CXXFLAGS) $(OPTIMIZE) -I $(BIN)/cost_model $(filter-out %.h $(LIBHALIDE_LDFLAGS),$^) -o $@ $(HALIDE_SYSTEM_LIBS) $(HALIDE_RPATH_FOR_LIB) -I $(SRC) $(BIN)/adams2019_retrain_cost_model: $(SRC)/retrain_cost_model.cpp \ $(COMMON_DIR)/ASLog.cpp \ $(SRC)/DefaultCostModel.h \ $(SRC)/DefaultCostModel.cpp \ $(SRC)/Weights.h \ $(SRC)/Weights.cpp \ $(SRC)/CostModel.h \ $(SRC)/NetworkSize.h \ $(AUTOSCHED_COST_MODEL_LIBS) \ $(AUTOSCHED_WEIGHT_OBJECTS) \ $(BIN)/auto_schedule_runtime.a @mkdir -p $(@D) $(CXX) $(CXXFLAGS) -frtti -Wall -I ../support -I $(BIN)/cost_model $(OPTIMIZE) $(filter-out %.h,$^) -o $@ $(LIBHALIDE_LDFLAGS) $(USE_OPEN_MP) $(HALIDE_RPATH_FOR_BIN) -I $(SRC) $(BIN)/adams2019_weightsdir_to_weightsfile: $(SRC)/weightsdir_to_weightsfile.cpp $(SRC)/Weights.cpp @mkdir -p $(@D) $(CXX) $(CXXFLAGS) $^ $(OPTIMIZE) -o $@ -I $(SRC) .PHONY: clean clean: rm -rf $(BIN) Halide-17.0.1/src/autoschedulers/adams2019/NetworkSize.h000066400000000000000000000006051456515664200226640ustar00rootroot00000000000000#ifndef HALIDE_NETWORK_SIZE_H #define HALIDE_NETWORK_SIZE_H namespace Halide { // The size of the best cost model network found. Needed by the cost // model and also the cost model training script. const int head1_channels = 8, head1_w = 40, head1_h = 7; const int head2_channels = 24, head2_w = 39; const int conv1_channels = 32; } // namespace Halide #endif // HALIDE_NETWORK_SIZE_H Halide-17.0.1/src/autoschedulers/adams2019/State.cpp000066400000000000000000000644511456515664200220240ustar00rootroot00000000000000#include "State.h" namespace Halide { namespace Internal { namespace Autoscheduler { using std::map; using std::pair; uint64_t State::structural_hash(int depth) const { uint64_t h = num_decisions_made; internal_assert(root.defined()); root->structural_hash(h, depth); return h; } void State::compute_featurization(const FunctionDAG &dag, const Adams2019Params ¶ms, StageMap *features, const CachingOptions &cache_options) { StageMap sites; sites.make_large(dag.nodes[0].stages[0].max_id); features->make_large(dag.nodes[0].stages[0].max_id); internal_assert(root.defined()); root->get_sites(sites); // For the input nodes and unscheduled outputs, the compute // and store sites are root, and the produce and innermost // sites are unset (nullptr) for (const auto &n : dag.nodes) { if (n.is_input || n.is_output) { for (const auto &stage : n.stages) { auto &s = sites.get_or_create(&stage); if (s.compute == nullptr) { s.compute = root.get(); s.store = root.get(); } } } } // For the unscheduled nodes, give them sites as deep as they // could possibly be. We'll ignore the possibility of inlining // them for now. 
map> parent; compute_loop_nest_parents(parent, root.get(), 0); for (const auto &n : dag.nodes) { if (sites.contains(&(n.stages[0]))) { continue; } const LoopNest *loop = nullptr; for (const auto *e : n.outgoing_edges) { const auto &consumer_site = sites.get(e->consumer); const LoopNest *l = consumer_site.innermost; if (!l) { l = consumer_site.compute; } if (!l) { std::ostringstream err; dump(err); err << e->producer->func.name() << " -> " << e->consumer->name << "\n"; internal_error << err.str(); } if (loop) { loop = deepest_common_ancestor(parent, l, loop); } else { loop = l; } } internal_assert(loop) << "Could not compute plausible site for unscheduled Func: " << n.func.name() << "\n"; for (const auto &stage : n.stages) { auto &site = sites.get_or_create(&stage); site.compute = loop; site.store = loop; } } if (cache_options.cache_features) { // Store unique hashes for each Site, to be used as keys into cache for (const auto &c : root->children) { sites.get(c->stage).hash_of_producers_stored_at_root = c->compute_hash_of_producers_stored_at_root(sites); } } root->compute_features(dag, params, sites, 1, 1, nullptr, nullptr, *root, nullptr, features, cache_options.cache_features); for (const auto &n : dag.nodes) { if (sites.get(&(n.stages[0])).produce == nullptr) { internal_assert(!features->contains(&(n.stages[0]))) << "Somehow an input or unscheduled node ended up in the featurization: " << n.func.name() << "\n"; } } } void State::save_featurization(const FunctionDAG &dag, const Adams2019Params ¶ms, const CachingOptions &cache_options, std::ostream &out) { StageMap features; compute_featurization(dag, params, &features, cache_options); for (const auto &n : dag.nodes) { if (n.is_input) { continue; } for (size_t stage_idx = n.stages.size(); stage_idx > 0; stage_idx--) { const auto &s = n.stages[stage_idx - 1]; const size_t num_schedule_features = ScheduleFeatures::num_features(); const size_t num_pipeline_features = PipelineFeatures::num_features(); const auto &sched_feat = features.get(&s); float buf[num_schedule_features + num_pipeline_features]; // Save them as floats for (size_t i = 0; i < num_schedule_features; i++) { buf[i] = sched_feat[i]; } for (size_t i = 0; i < num_pipeline_features; i++) { buf[i + num_schedule_features] = s.features[i]; } out.write((const char *)buf, sizeof(buf)); } } } bool State::calculate_cost(const FunctionDAG &dag, const Adams2019Params ¶ms, CostModel *cost_model, const CachingOptions &cache_options, int verbosity) { StageMap features; compute_featurization(dag, params, &features, cache_options); cost = 0.0f; if (verbosity <= aslog::aslog_level()) { for (auto it = features.begin(); it != features.end(); it++) { const auto &stage = *(it.key()); const auto &feat = it.value(); aslog(verbosity) << "Schedule features for " << stage.stage.name() << "\n"; feat.dump(aslog(verbosity).get_ostream()); } } internal_assert(cost_model) << "calculate_cost received nullptr for cost_model\n"; // Perform some addition pruning before burdening the cost model with silly states for (auto it = features.begin(); it != features.end(); it++) { if (!it.key()->node->is_wrapper) { // It's OK to repeatedly stage data auto &feat = it.value(); if (feat.points_computed_total + feat.inlined_calls > 8 * feat.points_computed_minimum) { cost = 1e50; return false; } } } // Avoid code size explosion from recursive inlining. 
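// --- Illustrative aside, not part of Halide: save_featurization above writes, for each
// non-input stage, one flat record of floats consisting of the schedule features followed
// by the pipeline features. A hedged sketch of that record layout (names hypothetical):
#include <ostream>
#include <vector>

inline void write_stage_record(std::ostream &out,
                               const std::vector<float> &schedule_features,
                               const std::vector<float> &pipeline_features) {
    std::vector<float> buf;
    buf.reserve(schedule_features.size() + pipeline_features.size());
    buf.insert(buf.end(), schedule_features.begin(), schedule_features.end());
    buf.insert(buf.end(), pipeline_features.begin(), pipeline_features.end());
    out.write(reinterpret_cast<const char *>(buf.data()),
              std::streamsize(buf.size() * sizeof(float)));
}
// --- end illustrative aside ---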
if (root->max_inlined_calls() >= 256) { cost = 1e50; return false; } // Apply the hard limit on memory use if (params.memory_limit >= 0) { int64_t mem_used = (int64_t)features.begin().value().working_set_at_root; for (auto it = features.begin(); it != features.end(); it++) { if (it.key()->node->is_output || it.key()->node->is_input) { // Not allocated by this pipeline mem_used -= it.value().bytes_at_production; } } if (mem_used > params.memory_limit) { cost = 1e50; return false; } } // Tell the cost model about this state. It won't actually // evaluate it until we call evaluate_costs (or if it runs out // of internal buffer space), so that the evaluations can be // batched. cost_model->enqueue(dag, features, &cost); cost_calculations++; return true; } // Make a child copy of this state. The loop nest is const (we // make mutated copies of it, rather than mutating it), so we can // continue to point to the same one and so this is a cheap // operation. IntrusivePtr State::make_child() const { State *s = new State; s->parent = this; s->root = root; s->cost = cost; s->num_decisions_made = num_decisions_made; return s; } // Generate the successor states to this state void State::generate_children(const FunctionDAG &dag, const Adams2019Params ¶ms, CostModel *cost_model, std::function &&)> &accept_child, Cache *cache) const { internal_assert(root.defined() && root->is_root()) << "generate_children needs defined root\n"; if (num_decisions_made == 2 * (int)dag.nodes.size()) { return; } int next_node = num_decisions_made / 2; int phase = num_decisions_made % 2; if (params.disable_subtiling) { // When emulating the older search space, we do all // parallelizing last, so that it is independent of the // tiling decisions. next_node = num_decisions_made % dag.nodes.size(); phase = num_decisions_made / dag.nodes.size(); } // Enumerate all legal ways to schedule the next Func const FunctionDAG::Node *node = &dag.nodes[next_node]; for (const auto *e : node->outgoing_edges) { internal_assert(root->computes(e->consumer->node)) << "Partially scheduled code doesn't compute " << e->consumer->name << ", which is one of the consumers of " << node->func.name(); } if (node->is_input) { // We don't need to schedule nodes that represent inputs, // and there are no other decisions to be made about them // at this time. // aslog(1) << "Skipping over scheduling input node: " << node->func.name() << "\n"; auto child = make_child(); child->num_decisions_made++; accept_child(std::move(child)); return; } if (!node->outgoing_edges.empty() && !root->calls(node)) { std::ostringstream err; err << "In state:\n"; dump(err); err << node->func.name() << " is consumed by:\n"; for (const auto *e : node->outgoing_edges) { err << e->consumer->name << "\n"; err << "Which in turn consumes:\n"; for (const auto *e2 : e->consumer->incoming_edges) { err << " " << e2->producer->func.name() << "\n"; } } err << "Pipeline so far doesn't use next Func: " << node->func.name() << "\n"; internal_error << err.str(); } int num_children = 0; if (phase == 0) { // Injecting realizations { // 1) Inline it if (node->stages.size() == 1 && !node->is_output) { auto child = make_child(); LoopNest *new_root = new LoopNest; new_root->copy_from(*root); new_root->inline_func(node); child->root = new_root; child->num_decisions_made++; if (child->calculate_cost(dag, params, cost_model, cache->options)) { num_children++; accept_child(std::move(child)); } } } // Some search-space pruning. 
        // Some search-space pruning. If a node is pointwise, and
        // so are all its inputs and so is its sole output, and
        // inlining it is legal, just inline it. This saves time
        // on long chains of pointwise things.
        bool must_inline = (node->is_pointwise &&
                            (num_children > 0) &&
                            (node->outgoing_edges.size() == 1));
        if (must_inline) {
            for (const auto *e : node->stages[0].incoming_edges) {
                must_inline &= e->producer->is_pointwise;
            }
            for (const auto *e : node->outgoing_edges) {
                must_inline &= (e->consumer->node->is_pointwise ||
                                e->consumer->node->is_boundary_condition);
            }
            if (must_inline) {
                return;
            }
        }

        // Construct a list of plausible dimensions to vectorize
        // over. Currently all of them. TODO: Pre-prune the list
        // of sane dimensions to vectorize a Func over to reduce
        // branching factor.
        vector<int> vector_dims;
        if (!node->is_input && !node->is_output) {
            for (int v = 0; v < node->dimensions; v++) {
                const auto &p = root->get_bounds(node)->region_computed(v);
                if (p.extent() >= node->vector_size) {
                    vector_dims.push_back(v);
                }
            }
        }

        // Outputs must be vectorized over their innermost
        // dimension, because we don't have control of the
        // storage. Infer which dimension(s) is(are) the innermost one(s) by
        // looking at the stride. Note that there can be more than one in
        // case some dimensions have an extent of 1.
        if (node->is_output && !node->func.output_buffers().empty()) {
            const Parameter &output = node->func.output_buffers()[0];
            int num_dims = output.dimensions();
            for (int i = 0; i < num_dims; ++i) {
                const Expr stride = output.stride_constraint(i);
                const int64_t *s = as_const_int(stride);
                if (s && *s == 1) {
                    vector_dims.push_back(i);
                }
            }
        }

        if (vector_dims.empty()) {
            // This can happen if the output strides aren't known, or if all
            // the dimensions are smaller than the vector size.
            // TBD: consider extending compute_in_tiles to support -1 as a
            // vector dim to indicate no vectorization.
            for (int v = 0; v < node->dimensions; v++) {
                vector_dims.push_back(v);
            }
            // Handle the case of full reductions that generate a scalar.
            // We need at least one vector dimension to call compute_in_tiles
            // below.
            // TBD: figure out a better fallback strategy.
            if (vector_dims.empty()) {
                vector_dims.push_back(0);
            }
        }

        // 2) Realize it somewhere
        for (int vector_dim : vector_dims) {
            auto tile_options = root->compute_in_tiles(node, nullptr, params, vector_dim, false);
            for (IntrusivePtr<const LoopNest> &n : tile_options) {
                auto child = make_child();
                child->root = std::move(n);
                child->num_decisions_made++;
                if (child->calculate_cost(dag, params, cost_model, cache->options)) {
                    num_children++;
                    accept_child(std::move(child));
                }
            }
        }
    } else {
        // We are parallelizing the loops of the func we just injected a realization for.

        bool should_parallelize = false;
        const vector<int64_t> *pure_size = nullptr;
        if (params.parallelism > 1) {
            for (const auto &c : root->children) {
                if (c->node == node && node->dimensions > 0) {
                    if (c->stage->index == 0) {
                        pure_size = &(c->size);
                    }
                    should_parallelize = true;
                }
            }
        }

        if (!should_parallelize) {
            // The Func must be scalar, or not compute_root, or
            // we're not asking to use multiple cores. Just
            // return a copy of the parent state
            num_children++;
            auto child = make_child();
            child->num_decisions_made++;
            accept_child(std::move(child));
        } else {
            internal_assert(pure_size);

            if (cache->add_memoized_blocks(this, accept_child, node, num_children, dag, params, cost_model)) {
                return;  // successfully added cached states.
            }

            // Generate some candidate parallel task shapes.
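            // Each candidate tiling describes one way of carving the pure loop
            // (whose per-dimension sizes are in *pure_size) into parallel tasks;
            // the all-ones entry appended below covers the option of parallelizing
            // the outer loop entirely, and the resulting options are ranked by how
            // much they would leave cores idle (idle_core_wastage).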
            auto tilings = generate_tilings(*pure_size, node->dimensions - 1, 2, true);

            // We could also just parallelize the outer loop entirely
            std::vector<int64_t> ones;
            ones.resize(pure_size->size(), 1);
            tilings.emplace_back(std::move(ones));

            // Sort / filter the options
            struct Option {
                vector<int64_t> tiling;
                double idle_core_wastage;
                bool entire;
                bool operator<(const Option &other) const {
                    return idle_core_wastage < other.idle_core_wastage;
                }

                // Ensure we don't accidentally copy this type
                Option() = default;
                Option(Option &&) = default;
                Option &operator=(Option &&) = default;
                Option(const Option &) = delete;
                Option &operator=(const Option &) = delete;
            };

            vector<Option>