av-scenechange-0.14.1/.cargo_vcs_info.json0000644000000001360000000000100137660ustar { "git": { "sha1": "ae3d7e5f8f7d9fac3ba3813b8ffe57f7b3420a20" }, "path_in_vcs": "" }av-scenechange-0.14.1/.github/workflows/av-scenechange.yml000064400000000000000000000045631046102023000215550ustar 00000000000000name: av-scenechange on: pull_request: branches: - master push: branches: - master jobs: rustfmt: runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - name: Install nightly uses: dtolnay/rust-toolchain@nightly with: components: rustfmt - name: Run rustfmt run: | cargo fmt -- --check build: needs: [rustfmt] strategy: matrix: platform: [ubuntu-latest, windows-latest, ubuntu-24.04-arm] runs-on: ${{ matrix.platform }} steps: - uses: actions/checkout@v4 - uses: ilammy/setup-nasm@v1 - name: Install stable uses: dtolnay/rust-toolchain@stable with: components: clippy - uses: Swatinem/rust-cache@v2 - name: Set MSVC x86_64 linker path if: matrix.platform == 'windows-latest' run: | $LinkGlob = "VC\Tools\MSVC\*\bin\Hostx64\x64" $env:PATH = "$env:PATH;${env:ProgramFiles(x86)}\Microsoft Visual Studio\Installer" $LinkPath = vswhere -latest -products * -find "$LinkGlob" | Select-Object -Last 1 echo "$LinkPath" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append - name: Clippy run: cargo clippy --features binary,devel,tracing,serialize --tests --benches -- -D warnings - name: Build run: cargo build --features binary,devel,tracing,serialize --tests --benches - name: Run tests run: cargo test --features binary,devel,tracing,serialize - name: Generate docs run: cargo doc --features binary,devel,tracing,serialize --no-deps code-coverage: needs: [build] runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - uses: ilammy/setup-nasm@v1 - name: Install stable uses: dtolnay/rust-toolchain@stable - uses: Swatinem/rust-cache@v2 - name: Install cargo-llvm-cov uses: taiki-e/install-action@v2 with: tool: cargo-llvm-cov - name: Generate code coverage run: cargo llvm-cov --features binary,tracing,serialize --lcov --output-path lcov.log --ignore-filename-regex tests\.rs - name: Upload coverage to Codecov uses: codecov/codecov-action@v5 with: token: ${{ secrets.CODECOV_TOKEN }} files: lcov.log fail_ci_if_error: false av-scenechange-0.14.1/.github/workflows/dependabot.yml000064400000000000000000000007521046102023000210070ustar 00000000000000version: 2 updates: - package-ecosystem: "cargo" directory: "/" schedule: interval: "weekly" open-pull-requests-limit: 5 commit-message: prefix: "chore" include: "scope" labels: - "dependencies" - "rust" groups: rust-dependencies: patterns: - "*" update-types: - "minor" - "patch" - package-ecosystem: "github-actions" directory: "/" schedule: interval: "weekly" av-scenechange-0.14.1/.github/workflows/deploy.yml000064400000000000000000000040761046102023000202010ustar 00000000000000name: deploy on: push: tags: - "v*.*.*" jobs: create-binaries: runs-on: windows-latest steps: - uses: actions/checkout@v2 - name: Install nasm run: | $NASM_VERSION="2.15.05" $LINK="https://www.nasm.us/pub/nasm/releasebuilds/$NASM_VERSION/win64" curl -LO "$LINK/nasm-$NASM_VERSION-win64.zip" 7z e -y "nasm-$NASM_VERSION-win64.zip" -o"C:\nasm" echo "C:\nasm" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append - name: Install Rust uses: actions-rs/toolchain@v1 with: profile: minimal toolchain: stable-x86_64-pc-windows-gnu override: true - name: Build run: | cargo build --release - name: Create zip run: | $METRICS_PATH="$Env:GITHUB_WORKSPACE\target\release" 7z a av-scenechange.zip ` 
"$METRICS_PATH\av-scenechange.exe" - name: Upload binaries uses: actions/upload-artifact@v2 with: name: av-scenechange-bins path: av-scenechange.zip deploy: needs: create-binaries runs-on: ubuntu-latest steps: - uses: actions/checkout@v2 - name: Download the zip uses: actions/download-artifact@v2 - name: Unzip av-scenechange Windows binaries run: | unzip av-scenechange-bins/av-scenechange.zip -d av-scenechange-bins - name: Handle release data and files id: data run: | VERSION=$(head -n 1 CHANGELOG.md | tr -d "## Version ") echo "::set-output name=version::$VERSION" tail -n +2 CHANGELOG.md | sed -e '/^$/,$d' > CHANGELOG.txt cd av-scenechange-bins strip av-scenechange.exe mv av-scenechange.exe .. - name: Create a release uses: softprops/action-gh-release@v1 with: name: Version ${{ steps.data.outputs.version }} body_path: CHANGELOG.txt files: | av-scenechange.exe env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} av-scenechange-0.14.1/.gitignore000064400000000000000000000000431046102023000145430ustar 00000000000000/target **/*.rs.bk /.idea /.vscode av-scenechange-0.14.1/CHANGELOG.md000064400000000000000000000075651046102023000144040ustar 00000000000000## Version 0.14.1 - Readd support for caching intra costs - Expose structs for advanced API users ## Version 0.14.0 - [Breaking/Feature] Add `SceneDetectionSpeed::None`, which will only place keyframes at fixed intervals without running dynamic detection - Migrate detection code from rav1e into this crate ## Version 0.13.0 - [Breaking] Update ffmpeg-the-third to 3.x - Update all other dependencies to latest version ## Version 0.12.2 - Enable threading for ffmpeg decoder, should greatly improve speed ## Version 0.12.0 - [Breaking] Move `VideoDetails` struct from `y4m` module to `decoder` module, since it is not specific to y4m - Add support for Ffmpeg decoder (requires Cargo `ffmpeg` feature, disabled by default) ## Version 0.11.0 - Add support for Vapoursynth decoder (requires Cargo `vapoursynth` feature, disabled by default) - Breaking change required to add a wrapper enum defining which decoder is being used ## Version 0.10.0 - Bump `rav1e` dependency to `0.7` ## Version 0.9.0 - Bump `y4m` dependency to `0.8` ## Version 0.8.1 - Finally release a new version because we can depend on rav1e 0.6.1 ## Version 0.8.0 - Upgrade clap to 4.0 - Add frame limit arg to API - [Breaking] Change `progress_callback` to take a &dyn Fn - Misc improvements including some speedups from rav1e - Update to Rust edition 2021 ## Version 0.7.2 - Bump to the final release of rav1e 0.5 - Bump other dependencies to latest versions - Fix another inconsistency with rav1e's scene detection - Improve precision of FPS calculation ## Version 0.7.1 - Fix an inconsistency with how rav1e's scene detection works - Fix some CLI help text ## Version 0.7.0 - Bump rav1e dependency to 0.5-beta.2, which brings a new, improved scenechange algorithm. Medium is equivalent to the old slow level, but with improvements. The fast level also has improvements. The new slow level is a new algorithm with a higher accuracy than the previous two algorithms. - The `--fast-mode` CLI argument is removed in favor of a `--speed` or `-s` argument, which takes a 0, 1, or 2 (for slow, medium, or fast). The default is 0 for slow. ## Version 0.6.0 - Bump rav1e dependency to 0.5. This should bring significant performance improvements, but may cause breaking changes. 
## Version 0.5.0 - Bump rav1e dependency to 0.4 - Expose `new_detector` and `detect_scene_changes` since these may be useful in some situations to use directly ## Version 0.4.2 - Fix compilation on non-x86 targets - Bump various dependencies ## Version 0.4.1 - Improve performance and memory usage ## Version 0.4.0 - [Breaking, New Feature] `detect_scene_changes` returns a `DetectionOptions` struct, which includes the list of scenecut frames, and the total count of frames in the video. The CLI output will reflect this as well. - [Breaking] Replace the default algorithm with an 8x8-block cost-based algorithm. This is more accurate in many cases. - [Breaking] As a result of the above change, now requires nasm for compilation. No action is needed if you use a prebuilt binary. - [Breaking] Replace the `use_chroma` option with a `fast_analysis` option. The new name is more accurate, as the updated algorithm will always analyze only the luma plane. - [Breaking] Move the `progress_callback` parameter from `DetectionOptions` to `detect_scene_changes`, since it only applies to that interface. - [New Feature] Expose the `SceneChangeDetector` struct, which allows going frame-by-frame to analyze a clip. Needed for some use cases. `detect_scene_changes` is the simpler, preferred interface. - The library for inputting frame data has been replaced with one that matches rav1e. - Simplify/optimize some internal code. ## Version 0.3.0 - [Breaking, New Feature] Add the ability to pass a `progress_callback` function to the `DetectionOptions`. ## Version 0.2.0 - [Breaking] Update `y4m` dependency to 0.5 ## Version 0.1.0 - Initial release av-scenechange-0.14.1/Cargo.lock0000644000000732230000000000100117500ustar # This file is automatically @generated by Cargo. # It is not intended for manual editing. 
version = 3 [[package]] name = "aho-corasick" version = "1.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916" dependencies = [ "memchr", ] [[package]] name = "aligned" version = "0.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "377e4c0ba83e4431b10df45c1d4666f178ea9c552cac93e60c3a88bf32785923" dependencies = [ "as-slice", ] [[package]] name = "aligned-vec" version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4aa90d7ce82d4be67b64039a3d588d38dbcc6736577de4a847025ce5b0c468d1" [[package]] name = "anstream" version = "0.6.18" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8acc5369981196006228e28809f761875c0327210a891e941f4c683b3a99529b" dependencies = [ "anstyle", "anstyle-parse", "anstyle-query", "anstyle-wincon", "colorchoice", "is_terminal_polyfill", "utf8parse", ] [[package]] name = "anstyle" version = "1.0.10" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "55cc3b69f167a1ef2e161439aa98aed94e6028e5f9a59be9a6ffb47aef1651f9" [[package]] name = "anstyle-parse" version = "0.2.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3b2d16507662817a6a20a9ea92df6652ee4f94f914589377d69f3b21bc5798a9" dependencies = [ "utf8parse", ] [[package]] name = "anstyle-query" version = "1.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "79947af37f4177cfead1110013d678905c37501914fba0efea834c3fe9a8d60c" dependencies = [ "windows-sys", ] [[package]] name = "anstyle-wincon" version = "3.0.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ca3534e77181a9cc07539ad51f2141fe32f6c3ffd4df76db8ad92346b003ae4e" dependencies = [ "anstyle", "once_cell", "windows-sys", ] [[package]] name = "anyhow" version = "1.0.98" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e16d2d3311acee920a9eb8d33b8cbc1787ce4a264e85f964c2404b969bdcd487" [[package]] name = "arg_enum_proc_macro" version = "0.3.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0ae92a5119aa49cdbcf6b9f893fe4e1d98b04ccbf82ee0584ad948a44a734dea" dependencies = [ "proc-macro2", "quote", "syn", ] [[package]] name = "arrayvec" version = "0.7.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" [[package]] name = "as-slice" version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "516b6b4f0e40d50dcda9365d53964ec74560ad4284da2e7fc97122cd83174516" dependencies = [ "stable_deref_trait", ] [[package]] name = "autocfg" version = "1.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ace50bade8e6234aa140d9a2f552bbee1db4d353f69b8217bc503490fc1a9f26" [[package]] name = "av-scenechange" version = "0.14.1" dependencies = [ "aligned", "anyhow", "arg_enum_proc_macro", "arrayvec", "cc", "clap", "console", "fern", "ffmpeg-the-third", "libc", "log", "nasm-rs", "num-rational", "num-traits", "pastey", "rayon", "serde", "serde_json", "thiserror 2.0.12", "tracing", "tracing-chrome", "tracing-subscriber", "v_frame", "vapoursynth", "y4m", ] [[package]] name = "bindgen" version = "0.69.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "271383c67ccabffb7381723dea0672a673f292304fcb45c01cc648c7a8d58088" dependencies = [ "bitflags 2.9.1", 
"cexpr", "clang-sys", "itertools", "lazy_static", "lazycell", "proc-macro2", "quote", "regex", "rustc-hash", "shlex", "syn", ] [[package]] name = "bitflags" version = "1.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" [[package]] name = "bitflags" version = "2.9.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1b8e56985ec62d17e9c1001dc89c88ecd7dc08e47eba5ec7c29c7b5eeecde967" [[package]] name = "bumpalo" version = "3.17.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1628fb46dfa0b37568d12e5edd512553eccf6a22a78e8bde00bb4aed84d5bdbf" [[package]] name = "cc" version = "1.2.23" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5f4ac86a9e5bc1e2b3449ab9d7d3a6a405e3d1bb28d7b9be8614f55846ae3766" dependencies = [ "jobserver", "libc", "shlex", ] [[package]] name = "cexpr" version = "0.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6fac387a98bb7c37292057cffc56d62ecb629900026402633ae9160df93a8766" dependencies = [ "nom", ] [[package]] name = "cfg-if" version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" [[package]] name = "clang" version = "2.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "84c044c781163c001b913cd018fc95a628c50d0d2dfea8bca77dad71edb16e37" dependencies = [ "clang-sys", "libc", ] [[package]] name = "clang-sys" version = "1.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0b023947811758c97c59bf9d1c188fd619ad4718dcaa767947df1cadb14f39f4" dependencies = [ "glob", "libc", "libloading", ] [[package]] name = "clap" version = "4.5.38" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ed93b9805f8ba930df42c2590f05453d5ec36cbb85d018868a5b24d31f6ac000" dependencies = [ "clap_builder", "clap_derive", ] [[package]] name = "clap_builder" version = "4.5.38" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "379026ff283facf611b0ea629334361c4211d1b12ee01024eec1591133b04120" dependencies = [ "anstream", "anstyle", "clap_lex", "strsim", ] [[package]] name = "clap_derive" version = "4.5.32" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "09176aae279615badda0765c0c0b3f6ed53f4709118af73cf4655d85d1530cd7" dependencies = [ "heck", "proc-macro2", "quote", "syn", ] [[package]] name = "clap_lex" version = "0.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f46ad14479a25103f283c0f10005961cf086d8dc42205bb44c46ac563475dca6" [[package]] name = "colorchoice" version = "1.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5b63caa9aa9397e2d9480a9b13673856c78d8ac123288526c37d7839f2a86990" [[package]] name = "console" version = "0.15.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "054ccb5b10f9f2cbf51eb355ca1d05c2d279ce1804688d0db74b4733a5aeafd8" dependencies = [ "encode_unicode", "libc", "once_cell", "unicode-width", "windows-sys", ] [[package]] name = "crossbeam-deque" version = "0.8.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51" dependencies = [ "crossbeam-epoch", "crossbeam-utils", ] [[package]] name = "crossbeam-epoch" version = "0.9.18" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" dependencies = [ "crossbeam-utils", ] [[package]] name = "crossbeam-utils" version = "0.8.21" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" [[package]] name = "either" version = "1.15.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719" [[package]] name = "encode_unicode" version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "34aa73646ffb006b8f5147f3dc182bd4bcb190227ce861fc4a4844bf8e3cb2c0" [[package]] name = "fern" version = "0.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4316185f709b23713e41e3195f90edef7fb00c3ed4adc79769cf09cc762a3b29" dependencies = [ "log", ] [[package]] name = "ffmpeg-sys-the-third" version = "3.0.0+ffmpeg-7.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b8ac7541c97a452897b3b85648e2e6fcd862c272ea0085ee3cbd7c7e2cfed95b" dependencies = [ "bindgen", "cc", "clang", "libc", "pkg-config", "vcpkg", ] [[package]] name = "ffmpeg-the-third" version = "3.0.1+ffmpeg-7.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "79adca4fd8d80989bd449a19afdf31235849384ea5a6ef64489f155794e3be7f" dependencies = [ "bitflags 2.9.1", "ffmpeg-sys-the-third", "libc", ] [[package]] name = "getrandom" version = "0.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "26145e563e54f2cadc477553f1ec5ee650b00862f0a58bcd12cbdc5f0ea2d2f4" dependencies = [ "cfg-if", "libc", "r-efi", "wasi", ] [[package]] name = "glob" version = "0.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a8d1add55171497b4705a648c6b583acafb01d58050a51727785f0b2c8e0a2b2" [[package]] name = "heck" version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" [[package]] name = "is_terminal_polyfill" version = "1.70.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf" [[package]] name = "itertools" version = "0.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ba291022dbbd398a455acf126c1e341954079855bc60dfdda641363bd6922569" dependencies = [ "either", ] [[package]] name = "itoa" version = "1.0.15" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c" [[package]] name = "jobserver" version = "0.1.33" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "38f262f097c174adebe41eb73d66ae9c06b2844fb0da69969647bbddd9b0538a" dependencies = [ "getrandom", "libc", ] [[package]] name = "lazy_static" version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" [[package]] name = "lazycell" version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55" [[package]] name = "libc" version = "0.2.172" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"d750af042f7ef4f724306de029d18836c26c1765a54a6a3f094cbd23a7267ffa" [[package]] name = "libloading" version = "0.8.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6a793df0d7afeac54f95b471d3af7f0d4fb975699f972341a4b76988d49cdf0c" dependencies = [ "cfg-if", "windows-targets 0.53.0", ] [[package]] name = "log" version = "0.4.27" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "13dc2df351e3202783a1fe0d44375f7295ffb4049267b0f3018346dc122a1d94" [[package]] name = "memchr" version = "2.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" [[package]] name = "minimal-lexical" version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a" [[package]] name = "nasm-rs" version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "12fcfa1bd49e0342ec1d07ed2be83b59963e7acbeb9310e1bb2c07b69dadd959" dependencies = [ "jobserver", ] [[package]] name = "nom" version = "7.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d273983c5a657a70a3e8f2a01329822f3b8c8172b73826411a55751e404a0a4a" dependencies = [ "memchr", "minimal-lexical", ] [[package]] name = "nu-ansi-term" version = "0.46.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "77a8165726e8236064dbb45459242600304b42a5ea24ee2948e18e023bf7ba84" dependencies = [ "overload", "winapi", ] [[package]] name = "num-integer" version = "0.1.46" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7969661fd2958a5cb096e56c8e1ad0444ac2bbcd0061bd28660485a44879858f" dependencies = [ "num-traits", ] [[package]] name = "num-rational" version = "0.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f83d14da390562dca69fc84082e73e548e1ad308d24accdedd2720017cb37824" dependencies = [ "num-integer", "num-traits", ] [[package]] name = "num-traits" version = "0.2.19" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" dependencies = [ "autocfg", ] [[package]] name = "once_cell" version = "1.21.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d" [[package]] name = "overload" version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39" [[package]] name = "pastey" version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b3a8cb46bdc156b1c90460339ae6bfd45ba0394e5effbaa640badb4987fdc261" [[package]] name = "pin-project-lite" version = "0.2.16" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3b3cff922bd51709b605d9ead9aa71031d81447142d828eb4a6eba76fe619f9b" [[package]] name = "pkg-config" version = "0.3.32" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c" [[package]] name = "proc-macro2" version = "1.0.95" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "02b3e5e68a3a1a02aad3ec490a98007cbc13c37cbe84a3cd7b8e406d76e7f778" dependencies = [ "unicode-ident", ] [[package]] name = "quote" version = "1.0.40" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "1885c039570dc00dcb4ff087a89e185fd56bae234ddc7f056a945bf36467248d" dependencies = [ "proc-macro2", ] [[package]] name = "r-efi" version = "5.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "74765f6d916ee2faa39bc8e68e4f3ed8949b48cccdac59983d287a7cb71ce9c5" [[package]] name = "rayon" version = "1.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b418a60154510ca1a002a752ca9714984e21e4241e804d32555251faf8b78ffa" dependencies = [ "either", "rayon-core", ] [[package]] name = "rayon-core" version = "1.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1465873a3dfdaa8ae7cb14b4383657caab0b3e8a0aa9ae8e04b044854c8dfce2" dependencies = [ "crossbeam-deque", "crossbeam-utils", ] [[package]] name = "regex" version = "1.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b544ef1b4eac5dc2db33ea63606ae9ffcfac26c1416a2806ae0bf5f56b201191" dependencies = [ "aho-corasick", "memchr", "regex-automata", "regex-syntax", ] [[package]] name = "regex-automata" version = "0.4.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "809e8dc61f6de73b46c85f4c96486310fe304c434cfa43669d7b40f711150908" dependencies = [ "aho-corasick", "memchr", "regex-syntax", ] [[package]] name = "regex-syntax" version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c" [[package]] name = "rustc-hash" version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" [[package]] name = "rustversion" version = "1.0.20" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "eded382c5f5f786b989652c49544c4877d9f015cc22e145a5ea8ea66c2921cd2" [[package]] name = "ryu" version = "1.0.20" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "28d3b2b1366ec20994f1fd18c3c594f05c5dd4bc44d8bb0c1c632c8d6829481f" [[package]] name = "serde" version = "1.0.219" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5f0e2c6ed6606019b4e29e69dbaba95b11854410e5347d525002456dbbb786b6" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" version = "1.0.219" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5b0276cf7f2c73365f7157c8123c21cd9a50fbbd844757af28ca1f5925fc2a00" dependencies = [ "proc-macro2", "quote", "syn", ] [[package]] name = "serde_json" version = "1.0.140" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "20068b6e96dc6c9bd23e01df8827e6c7e1f2fddd43c21810382803c136b99373" dependencies = [ "itoa", "memchr", "ryu", "serde", ] [[package]] name = "sharded-slab" version = "0.1.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f40ca3c46823713e0d4209592e8d6e826aa57e928f09752619fc696c499637f6" dependencies = [ "lazy_static", ] [[package]] name = "shlex" version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" [[package]] name = "smallvec" version = "1.15.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8917285742e9f3e1683f0a9c4e6b57960b7314d0b08d30d1ecd426713ee2eee9" [[package]] name = "stable_deref_trait" version = "1.2.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3" [[package]] name = "strsim" version = "0.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" [[package]] name = "syn" version = "2.0.101" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8ce2b7fc941b3a24138a0a7cf8e858bfc6a992e7978a068a5c760deb0ed43caf" dependencies = [ "proc-macro2", "quote", "unicode-ident", ] [[package]] name = "thiserror" version = "1.0.69" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b6aaf5339b578ea85b50e080feb250a3e8ae8cfcdff9a461c9ec2904bc923f52" dependencies = [ "thiserror-impl 1.0.69", ] [[package]] name = "thiserror" version = "2.0.12" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "567b8a2dae586314f7be2a752ec7474332959c6460e02bde30d702a66d488708" dependencies = [ "thiserror-impl 2.0.12", ] [[package]] name = "thiserror-impl" version = "1.0.69" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1" dependencies = [ "proc-macro2", "quote", "syn", ] [[package]] name = "thiserror-impl" version = "2.0.12" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7f7cf42b4507d8ea322120659672cf1b9dbb93f8f2d4ecfd6e51350ff5b17a1d" dependencies = [ "proc-macro2", "quote", "syn", ] [[package]] name = "thread_local" version = "1.1.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8b9ef9bad013ada3808854ceac7b46812a6465ba368859a37e2100283d2d719c" dependencies = [ "cfg-if", "once_cell", ] [[package]] name = "tracing" version = "0.1.41" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "784e0ac535deb450455cbfa28a6f0df145ea1bb7ae51b821cf5e7927fdcfbdd0" dependencies = [ "pin-project-lite", "tracing-attributes", "tracing-core", ] [[package]] name = "tracing-attributes" version = "0.1.28" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "395ae124c09f9e6918a2310af6038fba074bcf474ac352496d5910dd59a2226d" dependencies = [ "proc-macro2", "quote", "syn", ] [[package]] name = "tracing-chrome" version = "0.7.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bf0a738ed5d6450a9fb96e86a23ad808de2b727fd1394585da5cdd6788ffe724" dependencies = [ "serde_json", "tracing-core", "tracing-subscriber", ] [[package]] name = "tracing-core" version = "0.1.33" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e672c95779cf947c5311f83787af4fa8fffd12fb27e4993211a84bdfd9610f9c" dependencies = [ "once_cell", "valuable", ] [[package]] name = "tracing-log" version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ee855f1f400bd0e5c02d150ae5de3840039a3f54b025156404e34c23c03f47c3" dependencies = [ "log", "once_cell", "tracing-core", ] [[package]] name = "tracing-subscriber" version = "0.3.19" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e8189decb5ac0fa7bc8b96b7cb9b2701d60d48805aca84a238004d665fcc4008" dependencies = [ "nu-ansi-term", "sharded-slab", "smallvec", "thread_local", "tracing-core", "tracing-log", ] [[package]] name = "unicode-ident" version = "1.0.18" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"5a5f39404a5da50712a4c1eecf25e90dd62b613502b7e925fd4e4d19b5c96512" [[package]] name = "unicode-width" version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1fc81956842c57dac11422a97c3b8195a1ff727f06e85c84ed2e8aa277c9a0fd" [[package]] name = "utf8parse" version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" [[package]] name = "v_frame" version = "0.3.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d6f32aaa24bacd11e488aa9ba66369c7cd514885742c9fe08cfe85884db3e92b" dependencies = [ "aligned-vec", "num-traits", "wasm-bindgen", ] [[package]] name = "valuable" version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ba73ea9cf16a25df0c8caa16c51acb937d5712a8429db78a3ee29d5dcacd3a65" [[package]] name = "vapoursynth" version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0c7df702c65dec1cfa3b93f824a1e58d5b0fdb82ac8a722596f43d7214282f56" dependencies = [ "anyhow", "bitflags 1.3.2", "lazy_static", "thiserror 1.0.69", "vapoursynth-sys", ] [[package]] name = "vapoursynth-sys" version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2b35092be61a799005aabfd2e9e95d074125984013142d87a5d3edecc039b9b5" dependencies = [ "cfg-if", ] [[package]] name = "vcpkg" version = "0.2.15" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" [[package]] name = "wasi" version = "0.14.2+wasi-0.2.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9683f9a5a998d873c0d21fcbe3c083009670149a8fab228644b8bd36b2c48cb3" dependencies = [ "wit-bindgen-rt", ] [[package]] name = "wasm-bindgen" version = "0.2.100" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1edc8929d7499fc4e8f0be2262a241556cfc54a0bea223790e71446f2aab1ef5" dependencies = [ "cfg-if", "once_cell", "rustversion", "wasm-bindgen-macro", ] [[package]] name = "wasm-bindgen-backend" version = "0.2.100" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2f0a0651a5c2bc21487bde11ee802ccaf4c51935d0d3d42a6101f98161700bc6" dependencies = [ "bumpalo", "log", "proc-macro2", "quote", "syn", "wasm-bindgen-shared", ] [[package]] name = "wasm-bindgen-macro" version = "0.2.100" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7fe63fc6d09ed3792bd0897b314f53de8e16568c2b3f7982f468c0bf9bd0b407" dependencies = [ "quote", "wasm-bindgen-macro-support", ] [[package]] name = "wasm-bindgen-macro-support" version = "0.2.100" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8ae87ea40c9f689fc23f209965b6fb8a99ad69aeeb0231408be24920604395de" dependencies = [ "proc-macro2", "quote", "syn", "wasm-bindgen-backend", "wasm-bindgen-shared", ] [[package]] name = "wasm-bindgen-shared" version = "0.2.100" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1a05d73b933a847d6cccdda8f838a22ff101ad9bf93e33684f39c1f5f0eece3d" dependencies = [ "unicode-ident", ] [[package]] name = "winapi" version = "0.3.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" dependencies = [ "winapi-i686-pc-windows-gnu", "winapi-x86_64-pc-windows-gnu", ] [[package]] name = "winapi-i686-pc-windows-gnu" version 
= "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" [[package]] name = "winapi-x86_64-pc-windows-gnu" version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" [[package]] name = "windows-sys" version = "0.59.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" dependencies = [ "windows-targets 0.52.6", ] [[package]] name = "windows-targets" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" dependencies = [ "windows_aarch64_gnullvm 0.52.6", "windows_aarch64_msvc 0.52.6", "windows_i686_gnu 0.52.6", "windows_i686_gnullvm 0.52.6", "windows_i686_msvc 0.52.6", "windows_x86_64_gnu 0.52.6", "windows_x86_64_gnullvm 0.52.6", "windows_x86_64_msvc 0.52.6", ] [[package]] name = "windows-targets" version = "0.53.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b1e4c7e8ceaaf9cb7d7507c974735728ab453b67ef8f18febdd7c11fe59dca8b" dependencies = [ "windows_aarch64_gnullvm 0.53.0", "windows_aarch64_msvc 0.53.0", "windows_i686_gnu 0.53.0", "windows_i686_gnullvm 0.53.0", "windows_i686_msvc 0.53.0", "windows_x86_64_gnu 0.53.0", "windows_x86_64_gnullvm 0.53.0", "windows_x86_64_msvc 0.53.0", ] [[package]] name = "windows_aarch64_gnullvm" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" [[package]] name = "windows_aarch64_gnullvm" version = "0.53.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "86b8d5f90ddd19cb4a147a5fa63ca848db3df085e25fee3cc10b39b6eebae764" [[package]] name = "windows_aarch64_msvc" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" [[package]] name = "windows_aarch64_msvc" version = "0.53.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c7651a1f62a11b8cbd5e0d42526e55f2c99886c77e007179efff86c2b137e66c" [[package]] name = "windows_i686_gnu" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" [[package]] name = "windows_i686_gnu" version = "0.53.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c1dc67659d35f387f5f6c479dc4e28f1d4bb90ddd1a5d3da2e5d97b42d6272c3" [[package]] name = "windows_i686_gnullvm" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" [[package]] name = "windows_i686_gnullvm" version = "0.53.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9ce6ccbdedbf6d6354471319e781c0dfef054c81fbc7cf83f338a4296c0cae11" [[package]] name = "windows_i686_msvc" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" [[package]] name = "windows_i686_msvc" version = "0.53.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"581fee95406bb13382d2f65cd4a908ca7b1e4c2f1917f143ba16efe98a589b5d" [[package]] name = "windows_x86_64_gnu" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" [[package]] name = "windows_x86_64_gnu" version = "0.53.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2e55b5ac9ea33f2fc1716d1742db15574fd6fc8dadc51caab1c16a3d3b4190ba" [[package]] name = "windows_x86_64_gnullvm" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" [[package]] name = "windows_x86_64_gnullvm" version = "0.53.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0a6e035dd0599267ce1ee132e51c27dd29437f63325753051e71dd9e42406c57" [[package]] name = "windows_x86_64_msvc" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" [[package]] name = "windows_x86_64_msvc" version = "0.53.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "271414315aff87387382ec3d271b52d7ae78726f5d44ac98b4f4030c91880486" [[package]] name = "wit-bindgen-rt" version = "0.39.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6f42320e61fe2cfd34354ecb597f86f413484a798ba44a8ca1165c58d42da6c1" dependencies = [ "bitflags 2.9.1", ] [[package]] name = "y4m" version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7a5a4b21e1a62b67a2970e6831bc091d7b87e119e7f9791aef9702e3bef04448" av-scenechange-0.14.1/Cargo.toml0000644000000061700000000000100117700ustar # THIS FILE IS AUTOMATICALLY GENERATED BY CARGO # # When uploading crates to the registry Cargo will automatically # "normalize" Cargo.toml files for maximal compatibility # with all versions of Cargo and also rewrite `path` dependencies # to registry (e.g., crates.io) dependencies. # # If you are reading this file be aware that the original Cargo.toml # will likely look very different (and much more reasonable). # See Cargo.toml.orig for the original contents. 
[package] edition = "2021" rust-version = "1.74.1" name = "av-scenechange" version = "0.14.1" authors = ["Josh Holmer "] build = "build.rs" autolib = false autobins = false autoexamples = false autotests = false autobenches = false description = "Estimates frames in a video where a scenecut would be ideal" readme = "README.md" license = "MIT" repository = "https://github.com/rust-av/av-scenechange" [features] asm = [ "nasm-rs", "cc", "libc", ] binary = [ "clap", "serialize", ] default = [ "binary", "asm", ] devel = [ "console", "fern", ] ffmpeg = ["ffmpeg-the-third"] libc = ["dep:libc"] serialize = [ "serde", "serde_json", ] tracing = [ "tracing-subscriber", "tracing-chrome", "dep:tracing", ] [lib] name = "av_scenechange" path = "src/lib.rs" [[bin]] name = "av-scenechange" path = "src/main.rs" required-features = ["binary"] [dependencies.aligned] version = "0.4.2" [dependencies.anyhow] version = "1.0.56" [dependencies.arg_enum_proc_macro] version = "0.3.4" [dependencies.arrayvec] version = "0.7.6" [dependencies.clap] version = "4.0.22" features = ["derive"] optional = true [dependencies.console] version = "0.15" optional = true [dependencies.fern] version = "0.7" optional = true [dependencies.ffmpeg-the-third] version = "3.0.0" features = [ "codec", "format", ] optional = true default-features = false [dependencies.libc] version = "0.2.172" optional = true [dependencies.log] version = "0.4.14" [dependencies.num-rational] version = "0.4.2" default-features = false [dependencies.num-traits] version = "0.2.19" [dependencies.pastey] version = "0.1.0" [dependencies.rayon] version = "1.10.0" [dependencies.serde] version = "1.0.123" features = ["derive"] optional = true [dependencies.serde_json] version = "1.0.62" optional = true [dependencies.thiserror] version = "2.0.12" [dependencies.tracing] version = "0.1.40" optional = true [dependencies.tracing-chrome] version = "0.7.1" optional = true [dependencies.tracing-subscriber] version = "0.3.18" optional = true [dependencies.v_frame] version = "0.3.8" [dependencies.vapoursynth] version = "0.4.0" features = [ "vsscript-functions", "vapoursynth-functions", "vapoursynth-api-32", "vsscript-api-31", ] optional = true [dependencies.y4m] version = "0.8.0" [build-dependencies.cc] version = "1.2.23" features = ["parallel"] optional = true [build-dependencies.nasm-rs] version = "0.3" features = ["parallel"] optional = true [lints.clippy] inline_always = "warn" missing_inline_in_public_items = "warn" [lints.rust.unexpected_cfgs] level = "warn" priority = 0 check-cfg = [ "cfg(asm_x86_64)", "cfg(asm_neon)", ] av-scenechange-0.14.1/Cargo.toml.orig000064400000000000000000000042311046102023000154450ustar 00000000000000[package] name = "av-scenechange" version = "0.14.1" authors = ["Josh Holmer "] edition = "2021" rust-version = "1.74.1" description = "Estimates frames in a video where a scenecut would be ideal" license = "MIT" repository = "https://github.com/rust-av/av-scenechange" build = "build.rs" [dependencies] aligned = "0.4.2" anyhow = "1.0.56" arg_enum_proc_macro = "0.3.4" arrayvec = "0.7.6" clap = { version = "4.0.22", optional = true, features = ["derive"] } console = { version = "0.15", optional = true } fern = { version = "0.7", optional = true } ffmpeg-the-third = { version = "3.0.0", optional = true, default-features = false, features = [ "codec", "format", ] } libc = { version = "0.2.172", optional = true } log = { version = "0.4.14" } num-rational = { version = "0.4.2", default-features = false } num-traits = "0.2.19" pastey = "0.1.0" rayon = 
"1.10.0" serde = { version = "1.0.123", optional = true, features = ["derive"] } serde_json = { version = "1.0.62", optional = true } thiserror = "2.0.12" tracing = { version = "0.1.40", optional = true } tracing-chrome = { version = "0.7.1", optional = true } tracing-subscriber = { version = "0.3.18", optional = true } v_frame = "0.3.8" vapoursynth = { version = "0.4.0", features = [ "vsscript-functions", "vapoursynth-functions", "vapoursynth-api-32", "vsscript-api-31", ], optional = true } y4m = "0.8.0" [build-dependencies] cc = { version = "1.2.23", optional = true, features = ["parallel"] } nasm-rs = { version = "0.3", optional = true, features = ["parallel"] } [features] default = ["binary", "asm"] binary = ["clap", "serialize"] serialize = ["serde", "serde_json"] devel = ["console", "fern"] tracing = ["tracing-subscriber", "tracing-chrome", "dep:tracing"] ffmpeg = ["ffmpeg-the-third"] asm = ["nasm-rs", "cc", "libc"] libc = ["dep:libc"] [[bin]] name = "av-scenechange" path = "src/main.rs" required-features = ["binary"] [lints.clippy] inline_always = "warn" missing_inline_in_public_items = "warn" [lints.rust.unexpected_cfgs] level = "warn" # These custom cfgs are expected, so tell rustc not to output warnings for them check-cfg = ['cfg(asm_x86_64)', 'cfg(asm_neon)'] av-scenechange-0.14.1/LICENSE000064400000000000000000000020641046102023000135650ustar 00000000000000MIT License Copyright (c) 2019 Multimedia and Rust Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
av-scenechange-0.14.1/README.md000064400000000000000000000007661046102023000140460ustar 00000000000000# av-scenechange [![Actions Status](https://github.com/rust-av/av-scenechange/workflows/av-scenechange/badge.svg)](https://github.com/rust-av/av-scenechange/actions) [![docs.rs](https://img.shields.io/docsrs/av-scenechange)](https://docs.rs/av-scenechange/latest/av-scenechange/) [![Crates.io Version](https://img.shields.io/crates/v/av-scenechange)](https://crates.io/crates/av-scenechange) [![Crates.io License](https://img.shields.io/crates/l/av-scenechange)](LICENSE) Scenechange detection tool av-scenechange-0.14.1/build.rs000064400000000000000000000167731046102023000142410ustar 00000000000000use std::{ env, fs, path::{Path, PathBuf}, }; fn rerun_dir>(dir: P) { for entry in fs::read_dir(dir).unwrap() { let entry = entry.unwrap(); let path = entry.path(); println!("cargo:rerun-if-changed={}", path.to_string_lossy()); if path.is_dir() { rerun_dir(path); } } } fn hash_changed(files: &[&str], out_dir: &str, config: &Path) -> Option<([u8; 8], PathBuf)> { use std::{collections::hash_map::DefaultHasher, hash::Hasher}; let mut hasher = DefaultHasher::new(); let paths = files .iter() .map(Path::new) .chain(std::iter::once(config)) .chain(std::iter::once(Path::new("build.rs"))); for path in paths { if let Ok(buf) = std::fs::read(path) { hasher.write(&buf); } else { panic!("Cannot open {}", path.display()); } } if let Some(cmd) = strip_command() { hasher.write(cmd.as_bytes()); } let hash = hasher.finish().to_be_bytes(); let hash_path = Path::new(&out_dir).join("asm.hash"); if let Ok(old_hash) = std::fs::read(&hash_path) { if old_hash == hash { return None; } } Some((hash, hash_path)) } #[cfg(feature = "asm")] fn build_nasm_files() { let mut config = " %pragma preproc sane_empty_expansion true %define private_prefix avsc %define ARCH_X86_32 0 %define ARCH_X86_64 1 %define PIC 1 %define STACK_ALIGNMENT 16 %define HAVE_AVX512ICL 1 " .to_owned(); if env::var("CARGO_CFG_TARGET_VENDOR").unwrap() == "apple" { config += "%define PREFIX 1\n"; } let out_dir = env::var("OUT_DIR").unwrap(); let dest_path = Path::new(&out_dir).join("config.asm"); std::fs::write(&dest_path, config).expect("can write config.asm"); let asm_files = &[ // "src/asm/x86/cdef_avx2.asm", // "src/asm/x86/cdef_avx512.asm", // "src/asm/x86/cdef_dist.asm", // "src/asm/x86/cdef_rav1e.asm", // "src/asm/x86/cdef_sse.asm", // "src/asm/x86/cdef16_avx2.asm", // "src/asm/x86/cdef16_avx512.asm", // "src/asm/x86/cdef16_sse.asm", "src/asm/x86/ipred_avx2.asm", "src/asm/x86/ipred_avx512.asm", "src/asm/x86/ipred_sse.asm", "src/asm/x86/ipred16_avx2.asm", "src/asm/x86/ipred16_avx512.asm", "src/asm/x86/ipred16_sse.asm", // "src/asm/x86/itx_avx2.asm", // "src/asm/x86/itx_avx512.asm", // "src/asm/x86/itx_sse.asm", // "src/asm/x86/itx16_avx2.asm", // "src/asm/x86/itx16_avx512.asm", // "src/asm/x86/itx16_sse.asm", // "src/asm/x86/looprestoration_avx2.asm", // "src/asm/x86/looprestoration_avx512.asm", // "src/asm/x86/looprestoration_sse.asm", // "src/asm/x86/looprestoration16_avx2.asm", // "src/asm/x86/looprestoration16_avx512.asm", // "src/asm/x86/looprestoration16_sse.asm", "src/asm/x86/mc_avx2.asm", "src/asm/x86/mc_avx512.asm", "src/asm/x86/mc_sse.asm", "src/asm/x86/mc16_avx2.asm", "src/asm/x86/mc16_avx512.asm", "src/asm/x86/mc16_sse.asm", // "src/asm/x86/me.asm", "src/asm/x86/sad_avx.asm", "src/asm/x86/sad_plane.asm", "src/asm/x86/sad_sse2.asm", "src/asm/x86/satd.asm", "src/asm/x86/satd16_avx2.asm", // "src/asm/x86/sse.asm", "src/asm/x86/tables.asm", ]; if 
let Some((hash, hash_path)) = hash_changed(asm_files, &out_dir, &dest_path) { let obj = nasm_rs::Build::new() .min_version(2, 15, 0) .include(&out_dir) .include("src") .files(asm_files) .compile_objects() .unwrap_or_else(|e| { panic!("NASM build failed. Make sure you have nasm installed or disable the \"asm\" feature.\n\ You can get NASM from https://nasm.us or your system's package manager.\n\ \n\ error: {e}"); }); // cc is better at finding the correct archiver let mut cc = cc::Build::new(); for o in obj { cc.object(o); } cc.compile("avscasm"); // Strip local symbols from the asm library since they // confuse the debugger. if let Some(strip) = strip_command() { let _ = std::process::Command::new(strip) .arg("-x") .arg(Path::new(&out_dir).join("libavscasm.a")) .status(); } std::fs::write(hash_path, &hash[..]).unwrap(); } else { println!("cargo:rustc-link-search={out_dir}"); } println!("cargo:rustc-link-lib=static=avscasm"); rerun_dir("src/asm/x86"); } fn strip_command() -> Option { let target = env::var("TARGET").expect("TARGET"); // follows Cargo's naming convention for the linker setting let normalized_target = target.replace('-', "_").to_uppercase(); let explicit_strip = env::var(format!("CARGO_TARGET_{normalized_target}_STRIP")) .ok() .or_else(|| env::var("STRIP").ok()); if explicit_strip.is_some() { return explicit_strip; } // strip command is target-specific, e.g. macOS's strip breaks MUSL's archives let host = env::var("HOST").expect("HOST"); if host != target { return None; } Some("strip".into()) } #[cfg(feature = "asm")] fn build_neon_asm_files() { let mut config = " #define PRIVATE_PREFIX avsc_ #define ARCH_AARCH64 1 #define ARCH_ARM 0 #define CONFIG_LOG 1 #define HAVE_ASM 1 " .to_owned(); if env::var("CARGO_CFG_TARGET_VENDOR").unwrap() == "apple" { config += "#define PREFIX 1\n"; } let out_dir = env::var("OUT_DIR").unwrap(); let dest_path = Path::new(&out_dir).join("config.h"); std::fs::write(&dest_path, config).expect("can write config.h"); let asm_files = &[ // "src/asm/arm/64/cdef.S", // "src/asm/arm/64/cdef16.S", // "src/asm/arm/64/cdef_dist.S", "src/asm/arm/64/mc.S", "src/asm/arm/64/mc16.S", // "src/asm/arm/64/itx.S", // "src/asm/arm/64/itx16.S", "src/asm/arm/64/ipred.S", "src/asm/arm/64/ipred16.S", // "src/asm/arm/64/sad.S", "src/asm/arm/64/satd.S", // "src/asm/arm/64/sse.S", "src/asm/arm/tables.S", ]; if let Some((hash, hash_path)) = hash_changed(asm_files, &out_dir, &dest_path) { cc::Build::new() .files(asm_files) .include(".") .include(&out_dir) .compile("avsc-aarch64"); std::fs::write(hash_path, &hash[..]).unwrap(); } else { println!("cargo:rustc-link-search={out_dir}"); println!("cargo:rustc-link-lib=static=avsc-aarch64"); } rerun_dir("src/asm/arm"); } #[allow(unused_variables)] fn main() { let arch = env::var("CARGO_CFG_TARGET_ARCH").unwrap(); let os = env::var("CARGO_CFG_TARGET_OS").unwrap(); // let env = env::var("CARGO_CFG_TARGET_ENV").unwrap(); #[cfg(feature = "asm")] { if arch == "x86_64" { println!("cargo:rustc-cfg=asm_x86_64"); build_nasm_files() } if arch == "aarch64" { println!("cargo:rustc-cfg=asm_neon"); build_neon_asm_files() } } println!("cargo:rustc-env=PROFILE={}", env::var("PROFILE").unwrap()); if let Ok(value) = env::var("CARGO_CFG_TARGET_FEATURE") { println!("cargo:rustc-env=CARGO_CFG_TARGET_FEATURE={value}"); } println!( "cargo:rustc-env=CARGO_ENCODED_RUSTFLAGS={}", env::var("CARGO_ENCODED_RUSTFLAGS").unwrap() ); } av-scenechange-0.14.1/rustfmt.toml000064400000000000000000000010021046102023000151500ustar 00000000000000use_field_init_shorthand = 
true use_try_shorthand = true # Unstable features--for future stabilization imports_layout = "HorizontalVertical" imports_granularity = "Crate" group_imports = "StdExternalCrate" format_strings = true format_macro_matchers = true format_macro_bodies = true hex_literal_case = "Lower" normalize_comments = true normalize_doc_attributes = true overflow_delimited_expr = true reorder_impl_items = true wrap_comments = true format_code_in_doc_comments = true empty_item_single_line = false av-scenechange-0.14.1/src/analyze/fast.rs000064400000000000000000000067551046102023000163300ustar 00000000000000use std::{cmp, sync::Arc}; use log::debug; use v_frame::{frame::Frame, pixel::Pixel, plane::Plane}; use super::{fast_idiv, ScaleFunction, SceneChangeDetector, ScenecutResult}; use crate::{data::sad::sad_plane, SceneDetectionSpeed}; /// Experiments have determined this to be an optimal threshold pub(super) const FAST_THRESHOLD: f64 = 18.0; impl SceneChangeDetector { /// The fast algorithm detects fast cuts using a raw difference /// in pixel values between the scaled frames. pub(super) fn fast_scenecut( &mut self, frame1: Arc>, frame2: Arc>, ) -> ScenecutResult { if let Some(scale_func) = &self.scale_func { // downscale both frames for faster comparison if let Some(frame_buffer) = &mut self.downscaled_frame_buffer { frame_buffer.swap(0, 1); (scale_func.downscale_in_place)(&frame2.planes[0], &mut frame_buffer[1]); } else { self.downscaled_frame_buffer = Some([ (scale_func.downscale)(&frame1.planes[0]), (scale_func.downscale)(&frame2.planes[0]), ]); } if let Some(frame_buffer) = &self.downscaled_frame_buffer { let &[first, second] = &frame_buffer; let delta = self.delta_in_planes(first, second); ScenecutResult { threshold: self.threshold, inter_cost: delta, imp_block_cost: delta, forward_adjusted_cost: delta, backward_adjusted_cost: delta, } } else { unreachable!() } } else { let delta = self.delta_in_planes(&frame1.planes[0], &frame2.planes[0]); ScenecutResult { threshold: self.threshold, inter_cost: delta, imp_block_cost: delta, backward_adjusted_cost: delta, forward_adjusted_cost: delta, } } } /// Calculates the average sum of absolute difference (SAD) per pixel /// between 2 planes fn delta_in_planes(&self, plane1: &Plane, plane2: &Plane) -> f64 { let delta = sad_plane(plane1, plane2, self.cpu_feature_level); delta as f64 / self.scaled_pixels as f64 } } /// Scaling factor for frame in scene detection pub(super) fn detect_scale_factor( resolution: (usize, usize), speed_mode: SceneDetectionSpeed, ) -> Option> { let small_edge = cmp::min(resolution.0, resolution.1); let scale_func = if speed_mode == SceneDetectionSpeed::Fast { match small_edge { 0..=240 => None, 241..=480 => Some(ScaleFunction::from_scale::<2>()), 481..=720 => Some(ScaleFunction::from_scale::<4>()), 721..=1080 => Some(ScaleFunction::from_scale::<8>()), 1081..=1600 => Some(ScaleFunction::from_scale::<16>()), 1601..=usize::MAX => Some(ScaleFunction::from_scale::<32>()), _ => None, } } else { None }; if let Some(scale_factor) = scale_func.as_ref().map(|x| x.factor) { debug!( "Scene detection scale factor {}, [{},{}] -> [{},{}]", scale_factor, resolution.0, resolution.1, fast_idiv(resolution.0, scale_factor), fast_idiv(resolution.1, scale_factor) ); } scale_func } av-scenechange-0.14.1/src/analyze/importance.rs000064400000000000000000000052461046102023000175260ustar 00000000000000use std::sync::Arc; use v_frame::{ frame::Frame, pixel::{CastFromPrimitive, Pixel}, }; use super::intra::BLOCK_TO_PLANE_SHIFT; use crate::data::plane::{Area, 
AsRegion, PlaneRegion, Rect}; /// Size of blocks for the importance computation, in pixels. pub const IMPORTANCE_BLOCK_SIZE: usize = 1 << (IMPORTANCE_BLOCK_TO_BLOCK_SHIFT + BLOCK_TO_PLANE_SHIFT); pub const IMPORTANCE_BLOCK_TO_BLOCK_SHIFT: usize = 1; pub const IMP_BLOCK_MV_UNITS_PER_PIXEL: i64 = 8; pub const IMP_BLOCK_SIZE_IN_MV_UNITS: i64 = IMPORTANCE_BLOCK_SIZE as i64 * IMP_BLOCK_MV_UNITS_PER_PIXEL; pub(crate) fn estimate_importance_block_difference( frame: Arc>, ref_frame: Arc>, ) -> f64 { let plane_org = &frame.planes[0]; let plane_ref = &ref_frame.planes[0]; let h_in_imp_b = plane_org.cfg.height / IMPORTANCE_BLOCK_SIZE; let w_in_imp_b = plane_org.cfg.width / IMPORTANCE_BLOCK_SIZE; let mut imp_block_costs = 0; (0..h_in_imp_b).for_each(|y| { (0..w_in_imp_b).for_each(|x| { // Coordinates of the top-left corner of the reference block, in MV // units. let region_org = plane_org.region(Area::Rect(Rect { x: (x * IMPORTANCE_BLOCK_SIZE) as isize, y: (y * IMPORTANCE_BLOCK_SIZE) as isize, width: IMPORTANCE_BLOCK_SIZE, height: IMPORTANCE_BLOCK_SIZE, })); let region_ref = plane_ref.region(Area::Rect(Rect { x: (x * IMPORTANCE_BLOCK_SIZE) as isize, y: (y * IMPORTANCE_BLOCK_SIZE) as isize, width: IMPORTANCE_BLOCK_SIZE, height: IMPORTANCE_BLOCK_SIZE, })); let sum_8x8_block = |region: &PlaneRegion| { region .rows_iter() .map(|row| { // 16-bit precision is sufficient for an 8 px row, // as `IMPORTANCE_BLOCK_SIZE * (2^12 - 1) < 2^16 - 1`, // so overflow is not possible row.iter().map(|pixel| u16::cast_from(*pixel)).sum::() as i64 }) .sum::() }; let histogram_org_sum = sum_8x8_block(®ion_org); let histogram_ref_sum = sum_8x8_block(®ion_ref); let count = (IMPORTANCE_BLOCK_SIZE * IMPORTANCE_BLOCK_SIZE) as i64; let mean = (((histogram_org_sum + count / 2) / count) - ((histogram_ref_sum + count / 2) / count)) .abs(); imp_block_costs += mean as u64; }); }); imp_block_costs as f64 / (w_in_imp_b * h_in_imp_b) as f64 } av-scenechange-0.14.1/src/analyze/inter.rs000064400000000000000000001530761046102023000165130ustar 00000000000000use std::sync::Arc; use aligned::{Aligned, A64}; use arrayvec::ArrayVec; use num_rational::Rational32; use rayon::iter::{IntoParallelIterator, ParallelIterator}; use v_frame::{ frame::Frame, math::{clamp, ILog}, pixel::{ChromaSampling, Pixel}, plane::{Plane, PlaneConfig, PlaneOffset}, }; use super::importance::{ IMPORTANCE_BLOCK_SIZE, IMP_BLOCK_MV_UNITS_PER_PIXEL, IMP_BLOCK_SIZE_IN_MV_UNITS, }; use crate::{ cpu::CpuFeatureLevel, data::{ block::{BlockOffset, BlockSize, MIB_SIZE_LOG2}, frame::{FrameInvariants, FrameState, RefType, ALLOWED_REF_FRAMES}, motion::{ MEStats, MVSamplingMode, MotionEstimationSubsets, MotionVector, ReadGuardMEStats, RefMEStats, TileMEStats, MV_LOW, MV_UPP, }, plane::{Area, AsRegion, PlaneBlockOffset, PlaneRegion, PlaneRegionMut, Rect}, prediction::PredictionMode, sad::get_sad, satd::get_satd, superblock::{ SuperBlockOffset, TileSuperBlockOffset, MAX_SB_SIZE_LOG2, MI_SIZE, MI_SIZE_LOG2, SB_SIZE, }, tile::{TileBlockOffset, TileRect, TileStateMut, TilingInfo}, }, }; /// Declares an array of motion vectors in structure of arrays syntax. macro_rules! search_pattern_subpel { ($field_a:ident: [$($ll_a:expr),*], $field_b:ident: [$($ll_b:expr),*]) => { [ $(MotionVector { $field_a: $ll_a, $field_b: $ll_b } ),*] }; } /// Declares an array of motion vectors in structure of arrays syntax. /// Compared to [`search_pattern_subpel`], this version creates motion vectors /// in fullpel resolution (x8). macro_rules! 
search_pattern { ($field_a:ident: [$($ll_a:expr),*], $field_b:ident: [$($ll_b:expr),*]) => { [ $(MotionVector { $field_a: $ll_a << 3, $field_b: $ll_b << 3 } ),*] }; } /// Diamond pattern of radius 1 as shown below. For fullpel search, use /// `DIAMOND_R1_PATTERN_FULLPEL` since it has been scaled for fullpel search. /// ```text /// X /// XoX /// X /// ``` /// 'X's are motion candidates and the 'o' is the center. const DIAMOND_R1_PATTERN_SUBPEL: [MotionVector; 4] = search_pattern_subpel!( col: [ 0, 1, 0, -1], row: [ 1, 0, -1, 0] ); /// Diamond pattern of radius 1 as shown below. Unlike `DIAMOND_R1_PATTERN`, the /// vectors have been shifted fullpel scale. /// ```text /// X /// XoX /// X /// ``` /// 'X's are motion candidates and the 'o' is the center. const DIAMOND_R1_PATTERN: [MotionVector; 4] = search_pattern!( col: [ 0, 1, 0, -1], row: [ 1, 0, -1, 0] ); /// Uneven multi-hexagon search pattern around a center point. Used for locating /// irregular movement. /// ```text /// X /// X X /// X X /// X X /// X o X /// X X /// X X /// X X /// X /// ``` /// 'X's are motion candidates and the 'o' is the center. const UMH_PATTERN: [MotionVector; 16] = search_pattern!( col: [ -2, -1, 0, 1, 2, 3, 4, 3, 2, 1, 0, -1, -2, 3, -4, -3], row: [ 4, 4, 4, 4, 4, 2, 0, -2, -4, -4, -4, -4, -4, -2, 0, 2] ); /// A hexagon pattern around a center point. The pattern is ordered so that the /// offsets circle around the center. This is done to allow pruning locations /// covered by the last iteration. /// ```text /// 21012 /// 2 X X /// 1 /// 0 X o X /// 1 /// 2 X X /// ``` /// 'X's are motion candidates and the 'o' is the center. /// /// The illustration below shows the process of a hexagon search. /// ```text /// Step 1 Step 2 /// 1 1 1 1 2 /// /// 1(0)1 => 1 0(1)2 /// /// 1 1 1 1 2 /// ``` /// The search above has gone through the following steps. /// 1. Search '1' elements for better candidates than the center '0'. /// 2. Recenter around the best candidate ('(1)') and hexagon candidates that /// don't overlap with the previous search step (labeled '2'). const HEXAGON_PATTERN: [MotionVector; 6] = search_pattern!( col: [ 0, 2, 2, 0, -2, -2], row: [ -2, -1, 1, 2, 1, -1] ); /// A small square pattern around a center point. /// ```text /// 101 /// 1 XXX /// 0 XoX /// 1 XXX /// ``` /// 'X's are motion candidates and the 'o' is the center. const SQUARE_REFINE_PATTERN: [MotionVector; 8] = search_pattern!( col: [ -1, 0, 1, -1, 1, -1, 0, 1], row: [ 1, 1, 1, 0, 0, -1, -1, -1] ); pub(crate) fn estimate_inter_costs( frame: Arc>, ref_frame: Arc>, bit_depth: usize, frame_rate: Rational32, chroma_sampling: ChromaSampling, buffer: RefMEStats, cpu_feature_level: CpuFeatureLevel, ) -> f64 { let last_fi = FrameInvariants::new_key_frame(frame.planes[0].cfg.width, frame.planes[0].cfg.height); let fi = FrameInvariants::new_inter_frame(&last_fi, 1).unwrap(); // Compute the motion vectors. 
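    // A FrameState is built around the shared ME stats `buffer` so the
    // allocation can be reused across frame pairs by the detector. Motion
    // estimation then runs per tile in parallel (see `compute_motion_vectors`
    // below), and the resulting vectors are used afterwards to SATD each
    // importance block against its motion-compensated position in the
    // reference frame.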
let mut fs = FrameState::new_with_frame_and_me_stats_and_rec(Arc::clone(&frame), buffer); let mut tiling = TilingInfo::from_target_tiles( frame.planes[0].cfg.width, frame.planes[0].cfg.height, *frame_rate.numer() as f64 / *frame_rate.denom() as f64, TilingInfo::tile_log2(1, 0).unwrap(), TilingInfo::tile_log2(1, 0).unwrap(), chroma_sampling == ChromaSampling::Cs422, ); compute_motion_vectors(&fi, &mut fs, &mut tiling, bit_depth, cpu_feature_level); // Estimate inter costs let plane_org = &frame.planes[0]; let plane_ref = &ref_frame.planes[0]; let h_in_imp_b = plane_org.cfg.height / IMPORTANCE_BLOCK_SIZE; let w_in_imp_b = plane_org.cfg.width / IMPORTANCE_BLOCK_SIZE; let stats = &fs.frame_me_stats.read().expect("poisoned lock")[0]; let bsize = BlockSize::from_width_and_height(IMPORTANCE_BLOCK_SIZE, IMPORTANCE_BLOCK_SIZE); let mut inter_costs = 0; (0..h_in_imp_b).for_each(|y| { (0..w_in_imp_b).for_each(|x| { let mv = stats[y * 2][x * 2].mv; // Coordinates of the top-left corner of the reference block, in MV // units. let reference_x = x as i64 * IMP_BLOCK_SIZE_IN_MV_UNITS + mv.col as i64; let reference_y = y as i64 * IMP_BLOCK_SIZE_IN_MV_UNITS + mv.row as i64; let region_org = plane_org.region(Area::Rect(Rect { x: (x * IMPORTANCE_BLOCK_SIZE) as isize, y: (y * IMPORTANCE_BLOCK_SIZE) as isize, width: IMPORTANCE_BLOCK_SIZE, height: IMPORTANCE_BLOCK_SIZE, })); let region_ref = plane_ref.region(Area::Rect(Rect { x: reference_x as isize / IMP_BLOCK_MV_UNITS_PER_PIXEL as isize, y: reference_y as isize / IMP_BLOCK_MV_UNITS_PER_PIXEL as isize, width: IMPORTANCE_BLOCK_SIZE, height: IMPORTANCE_BLOCK_SIZE, })); inter_costs += get_satd( ®ion_org, ®ion_ref, bsize.width(), bsize.height(), bit_depth, cpu_feature_level, ) as u64; }); }); inter_costs as f64 / (w_in_imp_b * h_in_imp_b) as f64 } fn compute_motion_vectors( fi: &FrameInvariants, fs: &mut FrameState, tiling_info: &mut TilingInfo, bit_depth: usize, cpu_feature_level: CpuFeatureLevel, ) { tiling_info .tile_iter_mut(fs) .collect::>() .into_par_iter() .for_each(|mut ctx| { let ts = &mut ctx.ts; estimate_tile_motion(fi, ts, bit_depth, cpu_feature_level); }); } fn estimate_tile_motion( fi: &FrameInvariants, ts: &mut TileStateMut<'_, T>, bit_depth: usize, cpu_feature_level: CpuFeatureLevel, ) { let init_size = MIB_SIZE_LOG2; let mut prev_ssdec: Option = None; for mv_size_in_b_log2 in (2..=init_size).rev() { let init = mv_size_in_b_log2 == init_size; // Choose subsampling. Pass one is quarter res and pass two is at half res. 
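        // `ssdec` is the log2 downsampling factor for this pass: the first
        // (largest-block) pass uses the quarter-resolution planes (ssdec == 2),
        // the second uses half resolution (ssdec == 1), and every remaining
        // pass runs at full resolution (ssdec == 0).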
let ssdec = match init_size - mv_size_in_b_log2 { 0 => 2, 1 => 1, _ => 0, }; let new_subsampling = if let Some(prev) = prev_ssdec { prev != ssdec } else { false }; prev_ssdec = Some(ssdec); // 0.5 and 0.125 are a fudge factors let lambda = 0; for sby in 0..ts.sb_height { for sbx in 0..ts.sb_width { let mut tested_frames_flags = 0; for &ref_frame in ALLOWED_REF_FRAMES { let frame_flag = 1 << fi.ref_frames[ref_frame.to_index()]; if tested_frames_flags & frame_flag == frame_flag { continue; } tested_frames_flags |= frame_flag; let tile_bo = TileSuperBlockOffset(SuperBlockOffset { x: sbx, y: sby }) .block_offset(0, 0); if new_subsampling { refine_subsampled_sb_motion( fi, ts, ref_frame, mv_size_in_b_log2 + 1, tile_bo, ssdec, lambda, bit_depth, cpu_feature_level, ); } estimate_sb_motion( fi, ts, ref_frame, mv_size_in_b_log2, tile_bo, init, ssdec, lambda, bit_depth, cpu_feature_level, ); } } } } } #[allow(clippy::too_many_arguments)] fn refine_subsampled_sb_motion( fi: &FrameInvariants, ts: &mut TileStateMut<'_, T>, ref_frame: RefType, mv_size_in_b_log2: usize, tile_bo: TileBlockOffset, ssdec: u8, lambda: u32, bit_depth: usize, cpu_feature_level: CpuFeatureLevel, ) { let pix_offset = tile_bo.to_luma_plane_offset(); let sb_h: usize = SB_SIZE.min(ts.height - pix_offset.y as usize); let sb_w: usize = SB_SIZE.min(ts.width - pix_offset.x as usize); let mv_size = MI_SIZE << mv_size_in_b_log2; // Process in blocks, cropping at edges. for y in (0..sb_h).step_by(mv_size) { for x in (0..sb_w).step_by(mv_size) { let sub_bo = tile_bo.with_offset(x as isize >> MI_SIZE_LOG2, y as isize >> MI_SIZE_LOG2); // Clamp to frame edge, rounding up in the case of subsampling. // The rounding makes some assumptions about how subsampling is done. let w = mv_size.min(sb_w - x + (1 << ssdec) - 1) >> ssdec; let h = mv_size.min(sb_h - y + (1 << ssdec) - 1) >> ssdec; // Refine the existing motion estimate if let Some(results) = refine_subsampled_motion_estimate( fi, ts, w, h, sub_bo, ref_frame, ssdec, lambda, bit_depth, cpu_feature_level, ) { // normalize sad to 128x128 block let sad = (((results.rd.sad as u64) << (MAX_SB_SIZE_LOG2 * 2)) / (w * h) as u64) as u32; save_me_stats(ts, mv_size_in_b_log2, sub_bo, ref_frame, MEStats { mv: results.mv, normalized_sad: sad, }); } } } } /// Refine motion estimation that was computed one level of subsampling up. 
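/// The vector from the previous, more subsampled pass is rescaled to the
/// current resolution, a small fullpel full search is run in a window around
/// it (see the diagram in the body), and the winning vector is scaled back to
/// full-resolution units before being stored.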
#[allow(clippy::too_many_arguments)] fn refine_subsampled_motion_estimate( fi: &FrameInvariants, ts: &TileStateMut<'_, T>, w: usize, h: usize, tile_bo: TileBlockOffset, ref_frame: RefType, ssdec: u8, lambda: u32, bit_depth: usize, cpu_feature_level: CpuFeatureLevel, ) -> Option { if let Some(ref rec) = fi.rec_buffer.frames[fi.ref_frames[ref_frame.to_index()] as usize] { let frame_bo = ts.to_frame_block_offset(tile_bo); let (mvx_min, mvx_max, mvy_min, mvy_max) = get_mv_range(fi.w_in_b, fi.h_in_b, frame_bo, w << ssdec, h << ssdec); let pmv = [MotionVector { row: 0, col: 0 }; 2]; let po = frame_bo.to_luma_plane_offset(); let (mvx_min, mvx_max, mvy_min, mvy_max) = ( mvx_min >> ssdec, mvx_max >> ssdec, mvy_min >> ssdec, mvy_max >> ssdec, ); let po = PlaneOffset { x: po.x >> ssdec, y: po.y >> ssdec, }; let p_ref = match ssdec { 0 => &rec.frame.planes[0], 1 => &rec.input_hres, 2 => &rec.input_qres, _ => unimplemented!(), }; let org_region = &match ssdec { 0 => ts.input_tile.planes[0].subregion(Area::BlockStartingAt { bo: tile_bo.0 }), 1 => ts.input_hres.region(Area::StartingAt { x: po.x, y: po.y }), 2 => ts.input_qres.region(Area::StartingAt { x: po.x, y: po.y }), _ => unimplemented!(), }; let mv = ts.me_stats[ref_frame.to_index()][tile_bo.0.y][tile_bo.0.x].mv >> ssdec; // Given a motion vector at 0 at higher subsampling: // | -1 | 0 | 1 | // then the vectors at -1 to 2 should be tested at the current subsampling. // |-------------| // | -2 -1 | 0 1 | 2 3 | // This corresponds to a 4x4 full search. let x_lo = po.x + (mv.col as isize / 8 - 1).max(mvx_min / 8); let x_hi = po.x + (mv.col as isize / 8 + 2).min(mvx_max / 8); let y_lo = po.y + (mv.row as isize / 8 - 1).max(mvy_min / 8); let y_hi = po.y + (mv.row as isize / 8 + 2).min(mvy_max / 8); let mut results = full_search( x_lo, x_hi, y_lo, y_hi, w, h, org_region, p_ref, po, 1, lambda, pmv, bit_depth, cpu_feature_level, ); // Scale motion vectors to full res size results.mv = results.mv << ssdec; Some(results) } else { None } } fn get_mv_range( w_in_b: usize, h_in_b: usize, bo: PlaneBlockOffset, blk_w: usize, blk_h: usize, ) -> (isize, isize, isize, isize) { let border_w = 128 + blk_w as isize * 8; let border_h = 128 + blk_h as isize * 8; let mvx_min = -(bo.0.x as isize) * (8 * MI_SIZE) as isize - border_w; let mvx_max = ((w_in_b - bo.0.x) as isize - (blk_w / MI_SIZE) as isize) * (8 * MI_SIZE) as isize + border_w; let mvy_min = -(bo.0.y as isize) * (8 * MI_SIZE) as isize - border_h; let mvy_max = ((h_in_b - bo.0.y) as isize - (blk_h / MI_SIZE) as isize) * (8 * MI_SIZE) as isize + border_h; // ( mvx_min.max(MV_LOW as isize + 1), mvx_max.min(MV_UPP as isize - 1), mvy_min.max(MV_LOW as isize + 1), mvy_max.min(MV_UPP as isize - 1), ) } #[allow(clippy::too_many_arguments)] fn full_search( x_lo: isize, x_hi: isize, y_lo: isize, y_hi: isize, w: usize, h: usize, org_region: &PlaneRegion, p_ref: &Plane, po: PlaneOffset, step: usize, lambda: u32, pmv: [MotionVector; 2], bit_depth: usize, cpu_feature_level: CpuFeatureLevel, ) -> MotionSearchResult { let search_region = p_ref.region(Area::Rect(Rect { x: x_lo, y: y_lo, width: (x_hi - x_lo) as usize + w, height: (y_hi - y_lo) as usize + h, })); let mut best: MotionSearchResult = MotionSearchResult::empty(); // Select rectangular regions within search region with vert+horz windows for vert_window in search_region.vert_windows(h).step_by(step) { for ref_window in vert_window.horz_windows(w).step_by(step) { let &Rect { x, y, .. 
} = ref_window.rect(); let mv = MotionVector { row: 8 * (y as i16 - po.y as i16), col: 8 * (x as i16 - po.x as i16), }; let rd = compute_mv_rd( pmv, lambda, false, bit_depth, w, h, mv, org_region, &ref_window, cpu_feature_level, ); if rd.cost < best.rd.cost { best.rd = rd; best.mv = mv; } } } best } /// Compute the rate distortion stats for a motion vector. #[allow(clippy::too_many_arguments)] fn compute_mv_rd( pmv: [MotionVector; 2], lambda: u32, use_satd: bool, bit_depth: usize, w: usize, h: usize, cand_mv: MotionVector, plane_org: &PlaneRegion<'_, T>, plane_ref: &PlaneRegion<'_, T>, cpu_feature_level: CpuFeatureLevel, ) -> MVCandidateRD { let sad = if use_satd { get_satd(plane_org, plane_ref, w, h, bit_depth, cpu_feature_level) } else { get_sad(plane_org, plane_ref, w, h, bit_depth, cpu_feature_level) }; let rate1 = get_mv_rate(cand_mv, pmv[0]); let rate2 = get_mv_rate(cand_mv, pmv[1]); let rate = rate1.min(rate2 + 1); MVCandidateRD { cost: 256 * sad as u64 + rate as u64 * lambda as u64, sad, } } fn diff_to_rate(diff: i16) -> u32 { let d = diff >> 1; 2 * ILog::ilog(d.abs()) as u32 } fn get_mv_rate(a: MotionVector, b: MotionVector) -> u32 { diff_to_rate(a.row - b.row) + diff_to_rate(a.col - b.col) } /// Result of motion search. #[derive(Debug, Copy, Clone)] pub struct MotionSearchResult { /// Motion vector chosen by the motion search. pub mv: MotionVector, /// Rate distortion data associated with `mv`. pub rd: MVCandidateRD, } impl MotionSearchResult { /// Creates an 'empty' value. /// /// To be considered empty, cost is set higher than any naturally occurring /// cost value. The idea is that comparing to any valid rd output, the /// search result will always be replaced. pub fn empty() -> MotionSearchResult { MotionSearchResult { mv: MotionVector::default(), rd: MVCandidateRD::empty(), } } /// Check if the value should be considered to be empty. const fn is_empty(&self) -> bool { self.rd.cost == u64::MAX } } /// Holds data from computing rate distortion of a motion vector. #[derive(Debug, Copy, Clone)] pub struct MVCandidateRD { /// Rate distortion cost of the motion vector. pub cost: u64, /// Distortion metric value for the motion vector. pub sad: u32, } impl MVCandidateRD { /// Creates an 'empty' value. /// /// To be considered empty, cost is set higher than any naturally occurring /// cost value. The idea is that comparing to any valid rd output, the /// search result will always be replaced. 
const fn empty() -> MVCandidateRD { MVCandidateRD { sad: u32::MAX, cost: u64::MAX, } } } fn save_me_stats( ts: &mut TileStateMut<'_, T>, mv_size_in_b_log2: usize, tile_bo: TileBlockOffset, ref_frame: RefType, stats: MEStats, ) { let size_in_b = 1 << mv_size_in_b_log2; let tile_me_stats = &mut ts.me_stats[ref_frame.to_index()]; let tile_bo_x_end = (tile_bo.0.x + size_in_b).min(ts.mi_width); let tile_bo_y_end = (tile_bo.0.y + size_in_b).min(ts.mi_height); for mi_y in tile_bo.0.y..tile_bo_y_end { for a in tile_me_stats[mi_y][tile_bo.0.x..tile_bo_x_end].iter_mut() { *a = stats; } } } #[allow(clippy::too_many_arguments)] fn estimate_sb_motion( fi: &FrameInvariants, ts: &mut TileStateMut<'_, T>, ref_frame: RefType, mv_size_in_b_log2: usize, tile_bo: TileBlockOffset, init: bool, ssdec: u8, lambda: u32, bit_depth: usize, cpu_feature_level: CpuFeatureLevel, ) { let pix_offset = tile_bo.to_luma_plane_offset(); let sb_h: usize = SB_SIZE.min(ts.height - pix_offset.y as usize); let sb_w: usize = SB_SIZE.min(ts.width - pix_offset.x as usize); let mv_size = MI_SIZE << mv_size_in_b_log2; // Process in blocks, cropping at edges. for y in (0..sb_h).step_by(mv_size) { for x in (0..sb_w).step_by(mv_size) { let corner: MVSamplingMode = if init { MVSamplingMode::INIT } else { // Processing the block a size up produces data that can be used by // the right and bottom corners. MVSamplingMode::CORNER { right: x & mv_size == mv_size, bottom: y & mv_size == mv_size, } }; let sub_bo = tile_bo.with_offset(x as isize >> MI_SIZE_LOG2, y as isize >> MI_SIZE_LOG2); // Clamp to frame edge, rounding up in the case of subsampling. // The rounding makes some assumptions about how subsampling is done. let w = mv_size.min(sb_w - x + (1 << ssdec) - 1) >> ssdec; let h = mv_size.min(sb_h - y + (1 << ssdec) - 1) >> ssdec; // Run motion estimation. // Note that the initial search (init) instructs the called function to // perform a more extensive search. 
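            // The SAD returned for each block is normalized to a 128x128 area
            // before being stored, so costs from blocks of different sizes
            // (including cropped edge blocks) stay comparable when they are
            // later read back as predictors.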
if let Some(results) = estimate_motion( fi, ts, w, h, sub_bo, ref_frame, None, corner, init, ssdec, Some(lambda), bit_depth, cpu_feature_level, ) { // normalize sad to 128x128 block let sad = (((results.rd.sad as u64) << (MAX_SB_SIZE_LOG2 * 2)) / (w * h) as u64) as u32; save_me_stats(ts, mv_size_in_b_log2, sub_bo, ref_frame, MEStats { mv: results.mv, normalized_sad: sad, }); } } } } #[allow(clippy::too_many_arguments)] fn estimate_motion( fi: &FrameInvariants, ts: &TileStateMut<'_, T>, w: usize, h: usize, tile_bo: TileBlockOffset, ref_frame: RefType, pmv: Option<[MotionVector; 2]>, corner: MVSamplingMode, extensive_search: bool, ssdec: u8, lambda: Option, bit_depth: usize, cpu_feature_level: CpuFeatureLevel, ) -> Option { if let Some(ref rec) = fi.rec_buffer.frames[fi.ref_frames[ref_frame.to_index()] as usize] { let frame_bo = ts.to_frame_block_offset(tile_bo); let (mvx_min, mvx_max, mvy_min, mvy_max) = get_mv_range(fi.w_in_b, fi.h_in_b, frame_bo, w << ssdec, h << ssdec); let lambda = lambda.unwrap_or(0); let global_mv = [MotionVector { row: 0, col: 0 }; 2]; let po = frame_bo.to_luma_plane_offset(); let (mvx_min, mvx_max, mvy_min, mvy_max) = ( mvx_min >> ssdec, mvx_max >> ssdec, mvy_min >> ssdec, mvy_max >> ssdec, ); let po = PlaneOffset { x: po.x >> ssdec, y: po.y >> ssdec, }; let p_ref = match ssdec { 0 => &rec.frame.planes[0], 1 => &rec.input_hres, 2 => &rec.input_qres, _ => unimplemented!(), }; let org_region = &match ssdec { 0 => ts.input_tile.planes[0].subregion(Area::BlockStartingAt { bo: tile_bo.0 }), 1 => ts.input_hres.region(Area::StartingAt { x: po.x, y: po.y }), 2 => ts.input_qres.region(Area::StartingAt { x: po.x, y: po.y }), _ => unimplemented!(), }; let mut best: MotionSearchResult = full_pixel_me( fi, ts, org_region, p_ref, tile_bo, po, lambda, pmv.unwrap_or(global_mv), w, h, mvx_min, mvx_max, mvy_min, mvy_max, ref_frame, corner, extensive_search, ssdec, bit_depth, cpu_feature_level, ); if let Some(pmv) = pmv { best.rd = get_fullpel_mv_rd( po, org_region, p_ref, bit_depth, pmv, lambda, true, mvx_min, mvx_max, mvy_min, mvy_max, w, h, best.mv, cpu_feature_level, ); sub_pixel_me( fi, po, org_region, p_ref, lambda, pmv, mvx_min, mvx_max, mvy_min, mvy_max, w, h, true, &mut best, ref_frame, bit_depth, cpu_feature_level, ); } // Scale motion vectors to full res size best.mv = best.mv << ssdec; Some(best) } else { None } } #[allow(clippy::too_many_arguments)] fn full_pixel_me( fi: &FrameInvariants, ts: &TileStateMut<'_, T>, org_region: &PlaneRegion, p_ref: &Plane, tile_bo: TileBlockOffset, po: PlaneOffset, lambda: u32, pmv: [MotionVector; 2], w: usize, h: usize, mvx_min: isize, mvx_max: isize, mvy_min: isize, mvy_max: isize, ref_frame: RefType, corner: MVSamplingMode, extensive_search: bool, ssdec: u8, bit_depth: usize, cpu_feature_level: CpuFeatureLevel, ) -> MotionSearchResult { let ref_frame_id = ref_frame.to_index(); let tile_me_stats = &ts.me_stats[ref_frame_id].as_const(); let frame_ref = fi.rec_buffer.frames[fi.ref_frames[0] as usize] .as_ref() .map(|frame_ref| frame_ref.frame_me_stats.read().expect("poisoned lock")); let subsets = get_subset_predictors( tile_bo, tile_me_stats, frame_ref, ref_frame_id, w, h, mvx_min, mvx_max, mvy_min, mvy_max, corner, ssdec, ); let try_cands = |predictors: &[MotionVector], best: &mut MotionSearchResult| { let mut results = get_best_predictor( po, org_region, p_ref, predictors, bit_depth, pmv, lambda, mvx_min, mvx_max, mvy_min, mvy_max, w, h, cpu_feature_level, ); fullpel_diamond_search( po, org_region, p_ref, &mut results, bit_depth, pmv, 
lambda, mvx_min, mvx_max, mvy_min, mvy_max, w, h, cpu_feature_level, ); if results.rd.cost < best.rd.cost { *best = results; } }; let mut best: MotionSearchResult = MotionSearchResult::empty(); if !extensive_search { try_cands(&subsets.all_mvs(), &mut best); best } else { // Perform a more thorough search before resorting to full search. // Search the median, the best mvs of neighboring blocks, and motion vectors // from the previous frame. Stop once a candidate with a sad less than a // threshold is found. let thresh = (subsets.min_sad as f32 * 1.2) as u32 + (((w * h) as u32) << (bit_depth - 8)); if let Some(median) = subsets.median { try_cands(&[median], &mut best); if best.rd.sad < thresh { return best; } } try_cands(&subsets.subset_b, &mut best); if best.rd.sad < thresh { return best; } try_cands(&subsets.subset_c, &mut best); if best.rd.sad < thresh { return best; } // Preform UMH search, either as the last possible search when full search // is disabled, or as the last search before resorting to full search. // Use 24 merange, since it is the largest range that x264 uses. uneven_multi_hex_search( po, org_region, p_ref, &mut best, bit_depth, pmv, lambda, mvx_min, mvx_max, mvy_min, mvy_max, w, h, 24, cpu_feature_level, ); best } } #[allow(clippy::too_many_arguments)] fn sub_pixel_me( fi: &FrameInvariants, po: PlaneOffset, org_region: &PlaneRegion, p_ref: &Plane, lambda: u32, pmv: [MotionVector; 2], mvx_min: isize, mvx_max: isize, mvy_min: isize, mvy_max: isize, w: usize, h: usize, use_satd: bool, best: &mut MotionSearchResult, ref_frame: RefType, bit_depth: usize, cpu_feature_level: CpuFeatureLevel, ) { subpel_diamond_search( fi, po, org_region, p_ref, bit_depth, pmv, lambda, mvx_min, mvx_max, mvy_min, mvy_max, w, h, use_satd, best, ref_frame, cpu_feature_level, ); } /// Run a subpixel diamond search. The search is run on multiple step sizes. /// /// For each step size, candidate motion vectors are examined for improvement /// to the current search location. The search location is moved to the best /// candidate (if any). This is repeated until the search location stops moving. #[allow(clippy::too_many_arguments)] fn subpel_diamond_search( fi: &FrameInvariants, po: PlaneOffset, org_region: &PlaneRegion, _p_ref: &Plane, bit_depth: usize, pmv: [MotionVector; 2], lambda: u32, mvx_min: isize, mvx_max: isize, mvy_min: isize, mvy_max: isize, w: usize, h: usize, use_satd: bool, current: &mut MotionSearchResult, ref_frame: RefType, cpu_feature_level: CpuFeatureLevel, ) { // Motion compensation assembly has special requirements for edges let mc_w = w.next_power_of_two(); let mc_h = (h + 1) & !1; // Metadata for subpel scratch pad. let cfg = PlaneConfig::new(mc_w, mc_h, 0, 0, 0, 0, std::mem::size_of::()); // Stack allocation for subpel scratch pad. // SAFETY: We write to the array below before reading from it. let mut buf: Aligned = Aligned([T::cast_from(0); 128 * 128]); let mut tmp_region = PlaneRegionMut::from_slice(buf.as_mut(), &cfg, Rect { x: 0, y: 0, width: cfg.width, height: cfg.height, }); // start at 1/2 pel and end at 1/4 or 1/8 pel let (mut diamond_radius_log2, diamond_radius_end_log2) = (2u8, 1u8); loop { // Find the best candidate from the diamond pattern. 
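        // Motion vectors are in 1/8-pel units, so shifting the unit diamond
        // offsets by `diamond_radius_log2` gives 1/2-pel steps at the initial
        // radius (log2 == 2) and 1/4-pel steps at the final radius (log2 == 1).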
let mut best_cand: MotionSearchResult = MotionSearchResult::empty(); for &offset in &DIAMOND_R1_PATTERN_SUBPEL { let cand_mv = current.mv + (offset << diamond_radius_log2); let rd = get_subpel_mv_rd( fi, po, org_region, bit_depth, pmv, lambda, use_satd, mvx_min, mvx_max, mvy_min, mvy_max, w, h, cand_mv, &mut tmp_region, ref_frame, cpu_feature_level, ); if rd.cost < best_cand.rd.cost { best_cand.mv = cand_mv; best_cand.rd = rd; } } // Continue the search at this scale until a better candidate isn't found. if current.rd.cost <= best_cand.rd.cost { if diamond_radius_log2 == diamond_radius_end_log2 { break; } else { diamond_radius_log2 -= 1; } } else { *current = best_cand; } } assert!(!current.is_empty()); } #[allow(clippy::too_many_arguments)] fn get_subpel_mv_rd( fi: &FrameInvariants, po: PlaneOffset, org_region: &PlaneRegion, bit_depth: usize, pmv: [MotionVector; 2], lambda: u32, use_satd: bool, mvx_min: isize, mvx_max: isize, mvy_min: isize, mvy_max: isize, w: usize, h: usize, cand_mv: MotionVector, tmp_region: &mut PlaneRegionMut, ref_frame: RefType, cpu_feature_level: CpuFeatureLevel, ) -> MVCandidateRD { if (cand_mv.col as isize) < mvx_min || (cand_mv.col as isize) > mvx_max || (cand_mv.row as isize) < mvy_min || (cand_mv.row as isize) > mvy_max { return MVCandidateRD::empty(); } let tmp_width = tmp_region.rect().width; let tmp_height = tmp_region.rect().height; let tile_rect = TileRect { x: 0, y: 0, width: tmp_width, height: tmp_height, }; PredictionMode::NEWMV.predict_inter_single( fi, tile_rect, 0, po, tmp_region, // motion comp's w & h on edges can be different from distortion's tmp_width, tmp_height, ref_frame, cand_mv, bit_depth, cpu_feature_level, ); let plane_ref = tmp_region.as_const(); compute_mv_rd( pmv, lambda, use_satd, bit_depth, w, h, cand_mv, org_region, &plane_ref, cpu_feature_level, ) } /// Perform an uneven multi-hexagon search. There are 4 stages: /// 1. Unsymmetrical-cross search: Search the horizontal and vertical directions /// for the general direction of the motion. /// 2. A 5x5 full search is done to refine the current candidate. /// 3. Uneven multi-hexagon search. See [`UMH_PATTERN`]. /// 4. Refinement using standard hexagon search. /// /// `current` provides the initial search location and serves as /// the output for the final search results. /// /// `me_range` parameter determines how far these stages can search. #[allow(clippy::too_many_arguments)] fn uneven_multi_hex_search( po: PlaneOffset, org_region: &PlaneRegion, p_ref: &Plane, current: &mut MotionSearchResult, bit_depth: usize, pmv: [MotionVector; 2], lambda: u32, mvx_min: isize, mvx_max: isize, mvy_min: isize, mvy_max: isize, w: usize, h: usize, me_range: i16, cpu_feature_level: CpuFeatureLevel, ) { assert!(!current.is_empty()); // Search in a cross pattern to obtain a rough approximate of motion. // The cross is split into a horizontal and vertical component. Video content // tends to have more horizontal motion, so the horizontal part of the cross // is twice as large as the vertical half. // X - // | <- me_range/2 // X | // X X X XoX X X X - // X // // X // |------| // \ // me_range let center = current.mv; // The larger, horizontal, part of the cross search. 
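    // Only every other offset along each arm of the cross is tested
    // (`step_by(2)`), which halves the number of SAD evaluations while still
    // covering the full `me_range`.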
for i in (1..=me_range).step_by(2) { const HORIZONTAL_LINE: [MotionVector; 2] = search_pattern!( col: [ 0, 0], row: [-1, 1] ); for &offset in &HORIZONTAL_LINE { let cand_mv = center + offset * i; let rd = get_fullpel_mv_rd( po, org_region, p_ref, bit_depth, pmv, lambda, false, mvx_min, mvx_max, mvy_min, mvy_max, w, h, cand_mv, cpu_feature_level, ); if rd.cost < current.rd.cost { current.mv = cand_mv; current.rd = rd; } } } // The smaller, vertical, part of the cross search for i in (1..=me_range >> 1).step_by(2) { const VERTICAL_LINE: [MotionVector; 2] = search_pattern!( col: [-1, 1], row: [ 0, 0] ); for &offset in &VERTICAL_LINE { let cand_mv = center + offset * i; let rd = get_fullpel_mv_rd( po, org_region, p_ref, bit_depth, pmv, lambda, false, mvx_min, mvx_max, mvy_min, mvy_max, w, h, cand_mv, cpu_feature_level, ); if rd.cost < current.rd.cost { current.mv = cand_mv; current.rd = rd; } } } // 5x5 full search. Search a 5x5 square region around the current best mv. let center = current.mv; for row in -2..=2 { for col in -2..=2 { if row == 0 && col == 0 { continue; } let cand_mv = center + MotionVector { row, col }; let rd = get_fullpel_mv_rd( po, org_region, p_ref, bit_depth, pmv, lambda, false, mvx_min, mvx_max, mvy_min, mvy_max, w, h, cand_mv, cpu_feature_level, ); if rd.cost < current.rd.cost { current.mv = cand_mv; current.rd = rd; } } } // Run the hexagons in uneven multi-hexagon. The hexagonal pattern is tested // around the best vector at multiple scales. // Example of the UMH pattern run on a scale of 1 and 2: // 2 - // | <- me_range // 2 2 | // | // 2 1 2 | // 1 1 | // 2 1 1 2 | // 1 1 | // 2 1 o 1 2 | // 1 1 | // 2 1 1 2 | // 1 1 | // 2 1 2 | // | // 2 2 | // | // 2 - // |---------------| // \ // me_range let center = current.mv; // Divide by 4, the radius of the UMH's hexagon. let iterations = me_range >> 2; for i in 1..=iterations { for &offset in &UMH_PATTERN { let cand_mv = center + offset * i; let rd = get_fullpel_mv_rd( po, org_region, p_ref, bit_depth, pmv, lambda, false, mvx_min, mvx_max, mvy_min, mvy_max, w, h, cand_mv, cpu_feature_level, ); if rd.cost < current.rd.cost { current.mv = cand_mv; current.rd = rd; } } } // Refine the search results using a 'normal' hexagon search. hexagon_search( po, org_region, p_ref, current, bit_depth, pmv, lambda, mvx_min, mvx_max, mvy_min, mvy_max, w, h, cpu_feature_level, ); } #[allow(clippy::too_many_arguments)] fn get_subset_predictors( tile_bo: TileBlockOffset, tile_me_stats: &TileMEStats<'_>, frame_ref_opt: Option>, ref_frame_id: usize, pix_w: usize, pix_h: usize, mvx_min: isize, mvx_max: isize, mvy_min: isize, mvy_max: isize, corner: MVSamplingMode, ssdec: u8, ) -> MotionEstimationSubsets { let mut min_sad: u32 = u32::MAX; let mut subset_b = ArrayVec::::new(); let mut subset_c = ArrayVec::::new(); // rounded up width in blocks let w = ((pix_w << ssdec) + MI_SIZE - 1) >> MI_SIZE_LOG2; let h = ((pix_h << ssdec) + MI_SIZE - 1) >> MI_SIZE_LOG2; // Get predictors from the same frame. let clipped_half_w = (w >> 1).min(tile_me_stats.cols() - 1 - tile_bo.0.x); let clipped_half_h = (h >> 1).min(tile_me_stats.rows() - 1 - tile_bo.0.y); let mut process_cand = |stats: MEStats| -> MotionVector { min_sad = min_sad.min(stats.normalized_sad); let mv = stats.mv.quantize_to_fullpel(); MotionVector { col: clamp(mv.col as isize, mvx_min, mvx_max) as i16, row: clamp(mv.row as isize, mvy_min, mvy_max) as i16, } }; // Sample the middle of all block edges bordering this one. 
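    // `subset_b` collects spatial predictors from already-processed blocks in
    // the current frame, while `subset_c` (filled further below) holds the
    // EPZS-style temporal predictors taken from the previous frame's ME stats.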
// Note: If motion vectors haven't been precomputed to a given blocksize, then // the right and bottom edges will be duplicates of the center predictor when // processing in raster order. // left if tile_bo.0.x > 0 { subset_b.push(process_cand( tile_me_stats[tile_bo.0.y + clipped_half_h][tile_bo.0.x - 1], )); } // top if tile_bo.0.y > 0 { subset_b.push(process_cand( tile_me_stats[tile_bo.0.y - 1][tile_bo.0.x + clipped_half_w], )); } // Sampling far right and far bottom edges was tested, but had worse results // without an extensive threshold test (with threshold being applied after // checking median and the best of each subset). // right if let MVSamplingMode::CORNER { right: true, bottom: _, } = corner { if tile_bo.0.x + w < tile_me_stats.cols() { subset_b.push(process_cand( tile_me_stats[tile_bo.0.y + clipped_half_h][tile_bo.0.x + w], )); } } // bottom if let MVSamplingMode::CORNER { right: _, bottom: true, } = corner { if tile_bo.0.y + h < tile_me_stats.rows() { subset_b.push(process_cand( tile_me_stats[tile_bo.0.y + h][tile_bo.0.x + clipped_half_w], )); } } let median = if corner != MVSamplingMode::INIT { // Sample the center of the current block. Some(process_cand( tile_me_stats[tile_bo.0.y + clipped_half_h][tile_bo.0.x + clipped_half_w], )) } else if subset_b.len() != 3 { None } else { let mut rows: ArrayVec = subset_b.iter().map(|&a| a.row).collect(); let mut cols: ArrayVec = subset_b.iter().map(|&a| a.col).collect(); rows.as_mut_slice().sort_unstable(); cols.as_mut_slice().sort_unstable(); Some(MotionVector { row: rows[1], col: cols[1], }) }; // Zero motion vector, don't use add_cand since it skips zero vectors. subset_b.push(MotionVector::default()); // EPZS subset C predictors. // Sample the middle of bordering side of the left, right, top and bottom // blocks of the previous frame. // Sample the middle of this block in the previous frame. 
if let Some(frame_me_stats) = frame_ref_opt { let prev_frame = &frame_me_stats[ref_frame_id]; let frame_bo = PlaneBlockOffset(BlockOffset { x: tile_me_stats.x() + tile_bo.0.x, y: tile_me_stats.y() + tile_bo.0.y, }); let clipped_half_w = (w >> 1).min(prev_frame.cols - 1 - frame_bo.0.x); let clipped_half_h = (h >> 1).min(prev_frame.rows - 1 - frame_bo.0.y); // left if frame_bo.0.x > 0 { subset_c.push(process_cand( prev_frame[frame_bo.0.y + clipped_half_h][frame_bo.0.x - 1], )); } // top if frame_bo.0.y > 0 { subset_c.push(process_cand( prev_frame[frame_bo.0.y - 1][frame_bo.0.x + clipped_half_w], )); } // right if frame_bo.0.x + w < prev_frame.cols { subset_c.push(process_cand( prev_frame[frame_bo.0.y + clipped_half_h][frame_bo.0.x + w], )); } // bottom if frame_bo.0.y + h < prev_frame.rows { subset_c.push(process_cand( prev_frame[frame_bo.0.y + h][frame_bo.0.x + clipped_half_w], )); } subset_c.push(process_cand( prev_frame[frame_bo.0.y + clipped_half_h][frame_bo.0.x + clipped_half_w], )); } // Undo normalization to 128x128 block size let min_sad = ((min_sad as u64 * (pix_w * pix_h) as u64) >> (MAX_SB_SIZE_LOG2 * 2)) as u32; let dec_mv = |mv: MotionVector| MotionVector { col: mv.col >> ssdec, row: mv.row >> ssdec, }; let median = median.map(dec_mv); for mv in subset_b.iter_mut() { *mv = dec_mv(*mv); } for mv in subset_c.iter_mut() { *mv = dec_mv(*mv); } MotionEstimationSubsets { min_sad, median, subset_b, subset_c, } } #[allow(clippy::too_many_arguments)] fn get_best_predictor( po: PlaneOffset, org_region: &PlaneRegion, p_ref: &Plane, predictors: &[MotionVector], bit_depth: usize, pmv: [MotionVector; 2], lambda: u32, mvx_min: isize, mvx_max: isize, mvy_min: isize, mvy_max: isize, w: usize, h: usize, cpu_feature_level: CpuFeatureLevel, ) -> MotionSearchResult { let mut best: MotionSearchResult = MotionSearchResult::empty(); for &init_mv in predictors.iter() { let rd = get_fullpel_mv_rd( po, org_region, p_ref, bit_depth, pmv, lambda, false, mvx_min, mvx_max, mvy_min, mvy_max, w, h, init_mv, cpu_feature_level, ); if rd.cost < best.rd.cost { best.mv = init_mv; best.rd = rd; } } best } #[allow(clippy::too_many_arguments)] fn get_fullpel_mv_rd( po: PlaneOffset, org_region: &PlaneRegion, p_ref: &Plane, bit_depth: usize, pmv: [MotionVector; 2], lambda: u32, use_satd: bool, mvx_min: isize, mvx_max: isize, mvy_min: isize, mvy_max: isize, w: usize, h: usize, cand_mv: MotionVector, cpu_feature_level: CpuFeatureLevel, ) -> MVCandidateRD { if (cand_mv.col as isize) < mvx_min || (cand_mv.col as isize) > mvx_max || (cand_mv.row as isize) < mvy_min || (cand_mv.row as isize) > mvy_max { return MVCandidateRD::empty(); } // Convert the motion vector into an full pixel offset. let plane_ref = p_ref.region(Area::StartingAt { x: po.x + (cand_mv.col / 8) as isize, y: po.y + (cand_mv.row / 8) as isize, }); compute_mv_rd( pmv, lambda, use_satd, bit_depth, w, h, cand_mv, org_region, &plane_ref, cpu_feature_level, ) } /// Perform hexagon search and refine afterwards. /// /// In the hexagon search stage, candidate motion vectors are examined for /// improvement to the current search location. The search location is moved to /// the best candidate (if any). This is repeated until the search location /// stops moving. /// /// Refinement uses a square pattern that fits between the hexagon candidates. /// /// The hexagon pattern is defined by [`HEXAGON_PATTERN`] and the refinement /// is defined by [`SQUARE_REFINE_PATTERN`]. 
/// /// `current` provides the initial search location and serves as /// the output for the final search results. #[allow(clippy::too_many_arguments)] fn hexagon_search( po: PlaneOffset, org_region: &PlaneRegion, p_ref: &Plane, current: &mut MotionSearchResult, bit_depth: usize, pmv: [MotionVector; 2], lambda: u32, mvx_min: isize, mvx_max: isize, mvy_min: isize, mvy_max: isize, w: usize, h: usize, cpu_feature_level: CpuFeatureLevel, ) { // The first iteration of hexagon search is implemented separate from // subsequent iterations, which overlap with previous iterations. // Holds what candidate is used (if any). This is used to determine which // candidates have already been tested in a previous iteration and can be // skipped. let mut best_cand_idx: usize = 0; let mut best_cand: MotionSearchResult = MotionSearchResult::empty(); // First iteration of hexagon search. There are six candidates to consider. for (i, &pattern_mv) in HEXAGON_PATTERN.iter().enumerate() { let cand_mv = current.mv + pattern_mv; let rd = get_fullpel_mv_rd( po, org_region, p_ref, bit_depth, pmv, lambda, false, mvx_min, mvx_max, mvy_min, mvy_max, w, h, cand_mv, cpu_feature_level, ); if rd.cost < best_cand.rd.cost { best_cand_idx = i; best_cand.mv = cand_mv; best_cand.rd = rd; } } // Run additional iterations of hexagon search until the search location // doesn't update. while best_cand.rd.cost < current.rd.cost { // Update the search location. *current = best_cand; best_cand = MotionSearchResult::empty(); // Save the index/direction taken in the previous iteration to the current // search location. let center_cand_idx = best_cand_idx; // Look only at candidates that don't overlap with previous iterations. This // corresponds with the three offsets (2D) with the closest direction to // that traveled by the previous iteration. HEXAGON_PATTERN has clockwise // order, so the last direction -1, +0, and +1 (mod 6) give the indices for // these offsets. for idx_offset_mod6 in 5..=7 { let i = (center_cand_idx + idx_offset_mod6) % 6; let cand_mv = current.mv + HEXAGON_PATTERN[i]; let rd = get_fullpel_mv_rd( po, org_region, p_ref, bit_depth, pmv, lambda, false, mvx_min, mvx_max, mvy_min, mvy_max, w, h, cand_mv, cpu_feature_level, ); if rd.cost < best_cand.rd.cost { best_cand_idx = i; best_cand.mv = cand_mv; best_cand.rd = rd; } } } // Refine the motion after completing hexagon search. let mut best_cand: MotionSearchResult = MotionSearchResult::empty(); for &offset in &SQUARE_REFINE_PATTERN { let cand_mv = current.mv + offset; let rd = get_fullpel_mv_rd( po, org_region, p_ref, bit_depth, pmv, lambda, false, mvx_min, mvx_max, mvy_min, mvy_max, w, h, cand_mv, cpu_feature_level, ); if rd.cost < best_cand.rd.cost { best_cand.mv = cand_mv; best_cand.rd = rd; } } if best_cand.rd.cost < current.rd.cost { *current = best_cand; } assert!(!current.is_empty()); } /// Run a full pixel diamond search. The search is run on multiple step sizes. /// /// For each step size, candidate motion vectors are examined for improvement /// to the current search location. The search location is moved to the best /// candidate (if any). This is repeated until the search location stops moving. 
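/// The diamond radius here is in full-pixel units: it starts at two pixels
/// (radius log2 of 1) and finishes at one pixel (log2 of 0).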
#[allow(clippy::too_many_arguments)] fn fullpel_diamond_search( po: PlaneOffset, org_region: &PlaneRegion, p_ref: &Plane, current: &mut MotionSearchResult, bit_depth: usize, pmv: [MotionVector; 2], lambda: u32, mvx_min: isize, mvx_max: isize, mvy_min: isize, mvy_max: isize, w: usize, h: usize, cpu_feature_level: CpuFeatureLevel, ) { // Define the initial and the final scale (log2) of the diamond. let (mut diamond_radius_log2, diamond_radius_end_log2) = (1u8, 0u8); loop { // Find the best candidate from the diamond pattern. let mut best_cand: MotionSearchResult = MotionSearchResult::empty(); for &offset in &DIAMOND_R1_PATTERN { let cand_mv = current.mv + (offset << diamond_radius_log2); let rd = get_fullpel_mv_rd( po, org_region, p_ref, bit_depth, pmv, lambda, false, mvx_min, mvx_max, mvy_min, mvy_max, w, h, cand_mv, cpu_feature_level, ); if rd.cost < best_cand.rd.cost { best_cand.mv = cand_mv; best_cand.rd = rd; } } // Continue the search at this scale until the can't find a better candidate // to use. if current.rd.cost <= best_cand.rd.cost { if diamond_radius_log2 == diamond_radius_end_log2 { break; } else { diamond_radius_log2 -= 1; } } else { *current = best_cand; } } assert!(!current.is_empty()); } av-scenechange-0.14.1/src/analyze/intra/simd_x86.rs000064400000000000000000000123531046102023000201400ustar 00000000000000use v_frame::pixel::{Pixel, PixelType}; use super::IntraEdge; use crate::{ cpu::CpuFeatureLevel, data::{block::TxSize, plane::PlaneRegionMut, prediction::PredictionVariant}, }; macro_rules! decl_angular_ipred_fn { ($($f:ident),+) => { extern "C" { $( fn $f( dst: *mut u8, stride: libc::ptrdiff_t, topleft: *const u8, width: libc::c_int, height: libc::c_int, angle: libc::c_int, ); )* } }; } decl_angular_ipred_fn! { avsc_ipred_dc_8bpc_ssse3, avsc_ipred_dc_8bpc_avx2, avsc_ipred_dc_8bpc_avx512icl, avsc_ipred_dc_left_8bpc_ssse3, avsc_ipred_dc_left_8bpc_avx2, avsc_ipred_dc_left_8bpc_avx512icl, avsc_ipred_dc_128_8bpc_ssse3, avsc_ipred_dc_128_8bpc_avx2, avsc_ipred_dc_128_8bpc_avx512icl, avsc_ipred_dc_top_8bpc_ssse3, avsc_ipred_dc_top_8bpc_avx2, avsc_ipred_dc_top_8bpc_avx512icl } macro_rules! decl_angular_ipred_hbd_fn { ($($f:ident),+) => { extern "C" { $( fn $f( dst: *mut u16, stride: libc::ptrdiff_t, topleft: *const u16, width: libc::c_int, height: libc::c_int, angle: libc::c_int, max_width: libc::c_int, max_height: libc::c_int, bit_depth_max: libc::c_int, ); )* } }; } decl_angular_ipred_hbd_fn! { avsc_ipred_dc_16bpc_ssse3, avsc_ipred_dc_16bpc_avx2, avsc_ipred_dc_left_16bpc_ssse3, avsc_ipred_dc_left_16bpc_avx2, avsc_ipred_dc_128_16bpc_ssse3, avsc_ipred_dc_128_16bpc_avx2, avsc_ipred_dc_top_16bpc_ssse3, avsc_ipred_dc_top_16bpc_avx2 } pub(super) fn dispatch_predict_dc_intra( variant: PredictionVariant, dst: &mut PlaneRegionMut<'_, T>, tx_size: TxSize, bit_depth: usize, edge_buf: &IntraEdge, cpu: CpuFeatureLevel, ) { let call_rust = |dst: &mut PlaneRegionMut<'_, T>| { super::rust::dispatch_predict_dc_intra(variant, dst, tx_size, bit_depth, edge_buf, cpu); }; // SAFETY: Calls Assembly code. 
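    // Dispatch picks the widest SIMD level the CPU reports: AVX-512ICL, AVX2,
    // then SSSE3 for 8-bit pixels (AVX2 then SSSE3 for 16-bit), and falls back
    // to the Rust implementation in `call_rust` when none are available.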
unsafe { let stride = T::to_asm_stride(dst.plane_cfg.stride) as libc::ptrdiff_t; let w = tx_size.width() as libc::c_int; let h = tx_size.height() as libc::c_int; match T::type_enum() { PixelType::U8 => { let dst_ptr = dst.data_ptr_mut() as *mut _; let edge_ptr = edge_buf.top_left_ptr() as *const _; if cpu >= CpuFeatureLevel::AVX512ICL { (match variant { PredictionVariant::NONE => avsc_ipred_dc_128_8bpc_avx512icl, PredictionVariant::LEFT => avsc_ipred_dc_left_8bpc_avx512icl, PredictionVariant::TOP => avsc_ipred_dc_top_8bpc_avx512icl, PredictionVariant::BOTH => avsc_ipred_dc_8bpc_avx512icl, })(dst_ptr, stride, edge_ptr, w, h, 0); } else if cpu >= CpuFeatureLevel::AVX2 { (match variant { PredictionVariant::NONE => avsc_ipred_dc_128_8bpc_avx2, PredictionVariant::LEFT => avsc_ipred_dc_left_8bpc_avx2, PredictionVariant::TOP => avsc_ipred_dc_top_8bpc_avx2, PredictionVariant::BOTH => avsc_ipred_dc_8bpc_avx2, })(dst_ptr, stride, edge_ptr, w, h, 0); } else if cpu >= CpuFeatureLevel::SSSE3 { (match variant { PredictionVariant::NONE => avsc_ipred_dc_128_8bpc_ssse3, PredictionVariant::LEFT => avsc_ipred_dc_left_8bpc_ssse3, PredictionVariant::TOP => avsc_ipred_dc_top_8bpc_ssse3, PredictionVariant::BOTH => avsc_ipred_dc_8bpc_ssse3, })(dst_ptr, stride, edge_ptr, w, h, 0); } else { call_rust(dst) } } PixelType::U16 => { let dst_ptr = dst.data_ptr_mut() as *mut _; let edge_ptr = edge_buf.top_left_ptr() as *const _; let bd_max = (1 << bit_depth) - 1; if cpu >= CpuFeatureLevel::AVX2 { (match variant { PredictionVariant::NONE => avsc_ipred_dc_128_16bpc_avx2, PredictionVariant::LEFT => avsc_ipred_dc_left_16bpc_avx2, PredictionVariant::TOP => avsc_ipred_dc_top_16bpc_avx2, PredictionVariant::BOTH => avsc_ipred_dc_16bpc_avx2, })(dst_ptr, stride, edge_ptr, w, h, 0, 0, 0, bd_max); } else if cpu >= CpuFeatureLevel::SSSE3 { (match variant { PredictionVariant::NONE => avsc_ipred_dc_128_16bpc_ssse3, PredictionVariant::LEFT => avsc_ipred_dc_left_16bpc_ssse3, PredictionVariant::TOP => avsc_ipred_dc_top_16bpc_ssse3, PredictionVariant::BOTH => avsc_ipred_dc_16bpc_ssse3, })(dst_ptr, stride, edge_ptr, w, h, 0, 0, 0, bd_max); } else { call_rust(dst) } } } } } av-scenechange-0.14.1/src/analyze/intra.rs000064400000000000000000000263741046102023000165070ustar 00000000000000#[cfg(asm_x86_64)] mod simd_x86; use std::mem::{transmute, MaybeUninit}; use aligned::{Aligned, A64}; #[cfg(not(asm_x86_64))] use rust::*; #[cfg(asm_x86_64)] use simd_x86::*; use v_frame::{ frame::Frame, pixel::Pixel, plane::{Plane, PlaneOffset}, }; use super::importance::IMPORTANCE_BLOCK_SIZE; use crate::{ cpu::CpuFeatureLevel, data::{ block::{BlockSize, TxSize, MAX_TX_SIZE}, plane::{Area, AsRegion, PlaneRegion, PlaneRegionMut, Rect}, prediction::PredictionVariant, satd::get_satd, slice_assume_init_mut, superblock::MI_SIZE_LOG2, tile::TileRect, }, }; pub const BLOCK_TO_PLANE_SHIFT: usize = MI_SIZE_LOG2; mod rust { use v_frame::pixel::Pixel; use super::IntraEdge; use crate::{ cpu::CpuFeatureLevel, data::{block::TxSize, plane::PlaneRegionMut, prediction::PredictionVariant}, }; #[cfg_attr( all(asm_x86_64, any(target_feature = "ssse3", target_feature = "avx2")), cold )] pub(super) fn dispatch_predict_dc_intra( variant: PredictionVariant, dst: &mut PlaneRegionMut<'_, T>, tx_size: TxSize, bit_depth: usize, edge_buf: &IntraEdge, _cpu: CpuFeatureLevel, ) { let width = tx_size.width(); let height = tx_size.height(); // left pixels are ordered from bottom to top and right-aligned let (left, _top_left, above) = edge_buf.as_slices(); let above_slice = above; let 
left_slice = &left[left.len().saturating_sub(height)..]; (match variant { PredictionVariant::NONE => pred_dc_128, PredictionVariant::LEFT => pred_dc_left, PredictionVariant::TOP => pred_dc_top, PredictionVariant::BOTH => pred_dc, })(dst, above_slice, left_slice, width, height, bit_depth) } fn pred_dc( output: &mut PlaneRegionMut<'_, T>, above: &[T], left: &[T], width: usize, height: usize, _bit_depth: usize, ) { let edges = left[..height].iter().chain(above[..width].iter()); let len = (width + height) as u32; let avg = (edges.fold(0u32, |acc, &v| { let v: u32 = v.into(); v + acc }) + (len >> 1)) / len; let avg = T::cast_from(avg); for line in output.rows_iter_mut().take(height) { line[..width].fill(avg); } } fn pred_dc_128( output: &mut PlaneRegionMut<'_, T>, _above: &[T], _left: &[T], width: usize, height: usize, bit_depth: usize, ) { let v = T::cast_from(128u32 << (bit_depth - 8)); for line in output.rows_iter_mut().take(height) { line[..width].fill(v); } } fn pred_dc_left( output: &mut PlaneRegionMut<'_, T>, _above: &[T], left: &[T], width: usize, height: usize, _bit_depth: usize, ) { let sum = left[..].iter().fold(0u32, |acc, &v| { let v: u32 = v.into(); v + acc }); let avg = T::cast_from((sum + (height >> 1) as u32) / height as u32); for line in output.rows_iter_mut().take(height) { line[..width].fill(avg); } } fn pred_dc_top( output: &mut PlaneRegionMut<'_, T>, above: &[T], _left: &[T], width: usize, height: usize, _bit_depth: usize, ) { let sum = above[..width].iter().fold(0u32, |acc, &v| { let v: u32 = v.into(); v + acc }); let avg = T::cast_from((sum + (width >> 1) as u32) / width as u32); for line in output.rows_iter_mut().take(height) { line[..width].fill(avg); } } } pub(crate) fn estimate_intra_costs( temp_plane: &mut Plane, frame: &Frame, bit_depth: usize, cpu_feature_level: CpuFeatureLevel, ) -> Box<[u32]> { let plane = &frame.planes[0]; let plane_after_prediction = temp_plane; let bsize = BlockSize::from_width_and_height(IMPORTANCE_BLOCK_SIZE, IMPORTANCE_BLOCK_SIZE); let tx_size = bsize.tx_size(); let h_in_imp_b = plane.cfg.height / IMPORTANCE_BLOCK_SIZE; let w_in_imp_b = plane.cfg.width / IMPORTANCE_BLOCK_SIZE; let mut intra_costs = Vec::with_capacity(h_in_imp_b * w_in_imp_b); for y in 0..h_in_imp_b { for x in 0..w_in_imp_b { let plane_org = plane.region(Area::Rect(Rect { x: (x * IMPORTANCE_BLOCK_SIZE) as isize, y: (y * IMPORTANCE_BLOCK_SIZE) as isize, width: IMPORTANCE_BLOCK_SIZE, height: IMPORTANCE_BLOCK_SIZE, })); // For scene detection, we are only going to support DC_PRED // for simplicity and speed purposes. 
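            // Each 8x8 importance block is scored by building its top/left
            // intra edge, running a DC prediction into the scratch plane, and
            // taking the SATD between the source block and that prediction.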
let mut edge_buf = Aligned([MaybeUninit::uninit(); 4 * MAX_TX_SIZE + 1]); let edge_buf = get_intra_edges( &mut edge_buf, &plane.as_region(), PlaneOffset { x: (x * IMPORTANCE_BLOCK_SIZE) as isize, y: (y * IMPORTANCE_BLOCK_SIZE) as isize, }, bit_depth, ); let mut plane_after_prediction_region = plane_after_prediction.region_mut(Area::Rect(Rect { x: (x * IMPORTANCE_BLOCK_SIZE) as isize, y: (y * IMPORTANCE_BLOCK_SIZE) as isize, width: IMPORTANCE_BLOCK_SIZE, height: IMPORTANCE_BLOCK_SIZE, })); predict_dc_intra( TileRect { x: x * IMPORTANCE_BLOCK_SIZE, y: y * IMPORTANCE_BLOCK_SIZE, width: IMPORTANCE_BLOCK_SIZE, height: IMPORTANCE_BLOCK_SIZE, }, &mut plane_after_prediction_region, tx_size, bit_depth, &edge_buf, cpu_feature_level, ); let plane_after_prediction_region = plane_after_prediction.region(Area::Rect(Rect { x: (x * IMPORTANCE_BLOCK_SIZE) as isize, y: (y * IMPORTANCE_BLOCK_SIZE) as isize, width: IMPORTANCE_BLOCK_SIZE, height: IMPORTANCE_BLOCK_SIZE, })); let intra_cost = get_satd( &plane_org, &plane_after_prediction_region, bsize.width(), bsize.height(), bit_depth, cpu_feature_level, ); intra_costs.push(intra_cost); } } intra_costs.into_boxed_slice() } pub fn get_intra_edges<'a, T: Pixel>( edge_buf: &'a mut IntraEdgeBuffer, dst: &PlaneRegion<'_, T>, po: PlaneOffset, bit_depth: usize, ) -> IntraEdge<'a, T> { let tx_size = TxSize::TX_8X8; let mut init_left: usize = 0; let mut init_above: usize = 0; let base = 128u16 << (bit_depth - 8); { // left pixels are ordered from bottom to top and right-aligned let (left, not_left) = edge_buf.split_at_mut(2 * MAX_TX_SIZE); let (top_left, above) = not_left.split_at_mut(1); let x = po.x as usize; let y = po.y as usize; let needs_left = x != 0; let needs_top = y != 0; let rect_w = dst .rect() .width .min(dst.plane_cfg.width - dst.rect().x as usize); let rect_h = dst .rect() .height .min(dst.plane_cfg.height - dst.rect().y as usize); // Needs left if needs_left { let txh = if y + tx_size.height() > rect_h { rect_h - y } else { tx_size.height() }; if x != 0 { for i in 0..txh { debug_assert!(y + i < rect_h); left[2 * MAX_TX_SIZE - 1 - i].write(dst[y + i][x - 1]); } if txh < tx_size.height() { let val = dst[y + txh - 1][x - 1]; for i in txh..tx_size.height() { left[2 * MAX_TX_SIZE - 1 - i].write(val); } } } else { let val = if y != 0 { dst[y - 1][0] } else { T::cast_from(base + 1) }; for v in left[2 * MAX_TX_SIZE - tx_size.height()..].iter_mut() { v.write(val); } } init_left += tx_size.height(); } // Needs top if needs_top { let txw = if x + tx_size.width() > rect_w { rect_w - x } else { tx_size.width() }; if y != 0 { above[..txw].copy_from_slice( // SAFETY: &[T] and &[MaybeUninit] have the same layout unsafe { transmute::<&[T], &[MaybeUninit]>(&dst[y - 1][x..x + txw]) }, ); if txw < tx_size.width() { let val = dst[y - 1][x + txw - 1]; for v in &mut above[txw..tx_size.width()] { v.write(val); } } } else { let val = if x != 0 { dst[0][x - 1] } else { T::cast_from(base - 1) }; for v in &mut above[..tx_size.width()] { v.write(val); } } init_above += tx_size.width(); } top_left[0].write(T::cast_from(base)); } IntraEdge::new(edge_buf, init_left, init_above) } pub fn predict_dc_intra( tile_rect: TileRect, dst: &mut PlaneRegionMut<'_, T>, tx_size: TxSize, bit_depth: usize, edge_buf: &IntraEdge, cpu: CpuFeatureLevel, ) { let &Rect { x: frame_x, y: frame_y, .. 
} = dst.rect(); debug_assert!(frame_x >= 0 && frame_y >= 0); // x and y are expressed relative to the tile let x = frame_x as usize - tile_rect.x; let y = frame_y as usize - tile_rect.y; let variant = PredictionVariant::new(x, y); dispatch_predict_dc_intra::(variant, dst, tx_size, bit_depth, edge_buf, cpu); } type IntraEdgeBuffer = Aligned; 4 * MAX_TX_SIZE + 1]>; pub struct IntraEdge<'a, T: Pixel>(&'a [T], &'a [T], &'a [T]); impl<'a, T: Pixel> IntraEdge<'a, T> { fn new(edge_buf: &'a mut IntraEdgeBuffer, init_left: usize, init_above: usize) -> Self { // SAFETY: Initialized in `get_intra_edges`. let left = unsafe { let begin_left = 2 * MAX_TX_SIZE - init_left; let end_above = 2 * MAX_TX_SIZE + 1 + init_above; slice_assume_init_mut(&mut edge_buf[begin_left..end_above]) }; let (left, top_left) = left.split_at(init_left); let (top_left, above) = top_left.split_at(1); Self(left, top_left, above) } pub const fn as_slices(&self) -> (&'a [T], &'a [T], &'a [T]) { (self.0, self.1, self.2) } #[allow(dead_code)] pub const fn top_left_ptr(&self) -> *const T { self.1.as_ptr() } } av-scenechange-0.14.1/src/analyze/mod.rs000064400000000000000000000357311046102023000161460ustar 00000000000000use std::{cmp, collections::BTreeMap, num::NonZeroUsize, sync::Arc}; use log::debug; use num_rational::Rational32; use v_frame::{ frame::Frame, pixel::{ChromaSampling, Pixel}, plane::Plane, }; use self::fast::{detect_scale_factor, FAST_THRESHOLD}; use crate::{data::motion::RefMEStats, CpuFeatureLevel, SceneDetectionSpeed}; mod fast; mod importance; mod inter; mod intra; mod standard; /// Experiments have determined this to be an optimal threshold const IMP_BLOCK_DIFF_THRESHOLD: f64 = 7.0; /// Fast integer division where divisor is a nonzero power of 2 pub(crate) fn fast_idiv(n: usize, d: NonZeroUsize) -> usize { debug_assert!(d.is_power_of_two()); n >> d.trailing_zeros() } struct ScaleFunction { downscale_in_place: fn(/* &self: */ &Plane, /* in_plane: */ &mut Plane), downscale: fn(/* &self: */ &Plane) -> Plane, factor: NonZeroUsize, } impl ScaleFunction { fn from_scale() -> Self { assert!( SCALE.is_power_of_two(), "Scaling factor needs to be a nonzero power of two" ); Self { downscale: Plane::downscale::, downscale_in_place: Plane::downscale_in_place::, factor: NonZeroUsize::new(SCALE).unwrap(), } } } /// Runs keyframe detection on frames from the lookahead queue. /// /// This struct is intended for advanced users who need the ability to analyze /// a small subset of frames at a time, for example in a streaming fashion. /// Most users will prefer to use `new_detector` and `detect_scene_changes` /// at the top level of this crate. pub struct SceneChangeDetector { // User configuration options /// Scenecut detection mode scene_detection_mode: SceneDetectionSpeed, /// Deque offset for current lookahead_offset: usize, /// Minimum number of frames between two scenecuts min_key_frame_interval: usize, /// Maximum number of frames between two scenecuts max_key_frame_interval: usize, /// The CPU feature level to be used. cpu_feature_level: CpuFeatureLevel, // Internal configuration options /// Minimum average difference between YUV deltas that will trigger a scene /// change. threshold: f64, /// Width and height of the unscaled frame resolution: (usize, usize), /// The bit depth of the video. bit_depth: usize, /// The frame rate of the video. frame_rate: Rational32, /// The chroma subsampling of the video. 
chroma_sampling: ChromaSampling, /// Number of pixels in scaled frame for fast mode scaled_pixels: usize, /// Downscaling function for fast scene detection scale_func: Option>, // Internal data structures /// Start deque offset based on lookahead deque_offset: usize, /// Frame buffer for scaled frames downscaled_frame_buffer: Option<[Plane; 2]>, /// Scenechange results for adaptive threshold score_deque: Vec, /// Temporary buffer used by `estimate_intra_costs`. /// We store it on the struct so we only need to allocate it once. temp_plane: Option>, /// Buffer for `FrameMEStats` for cost scenecut frame_me_stats_buffer: Option, /// Calculated intra costs for each input frame. /// These can be cached for reuse by advanced API users. /// Caching will occur if this is not `None`. pub intra_costs: Option>>, } impl SceneChangeDetector { /// Creates a new instance of the `SceneChangeDetector`. #[allow(clippy::too_many_arguments)] #[allow(clippy::missing_panics_doc)] #[inline] pub fn new( resolution: (usize, usize), bit_depth: usize, frame_rate: Rational32, chroma_sampling: ChromaSampling, lookahead_distance: usize, scene_detection_mode: SceneDetectionSpeed, min_key_frame_interval: usize, max_key_frame_interval: usize, cpu_feature_level: CpuFeatureLevel, ) -> Self { // Downscaling function for fast scene detection let scale_func = detect_scale_factor(resolution, scene_detection_mode); // Set lookahead offset to 5 if normal lookahead available let lookahead_offset = if lookahead_distance >= 5 { 5 } else { 0 }; let deque_offset = lookahead_offset; let score_deque = Vec::with_capacity(5 + lookahead_distance); // Downscaling factor for fast scenedetect (is currently always a power of 2) let factor = scale_func.as_ref().map_or( NonZeroUsize::new(1).expect("constant should not panic"), |x| x.factor, ); let pixels = if scene_detection_mode == SceneDetectionSpeed::Fast { fast_idiv(resolution.1, factor) * fast_idiv(resolution.0, factor) } else { 1 }; let threshold = FAST_THRESHOLD * (bit_depth as f64) / 8.0; Self { threshold, scene_detection_mode, scale_func, lookahead_offset, deque_offset, score_deque, scaled_pixels: pixels, bit_depth, frame_rate, chroma_sampling, min_key_frame_interval, max_key_frame_interval, cpu_feature_level, downscaled_frame_buffer: None, resolution, temp_plane: None, frame_me_stats_buffer: None, intra_costs: None, } } /// Enables caching of intra costs. For advanced API users. #[inline] pub fn enable_cache(&mut self) { if self.intra_costs.is_none() { self.intra_costs = Some(BTreeMap::new()); } } /// Runs keyframe detection on the next frame in the lookahead queue. /// /// This function requires that a subset of input frames /// is passed to it in order, and that `keyframes` is only /// updated from this method. `input_frameno` should correspond /// to the second frame in `frame_set`. /// /// This will gracefully handle the first frame in the video as well. #[inline] pub fn analyze_next_frame( &mut self, frame_set: &[&Arc>], input_frameno: usize, previous_keyframe: usize, ) -> bool { // Use score deque for adaptive threshold for scene cut // Declare score_deque offset based on lookahead for scene change scores // Find the distance to the previous keyframe. 
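        // Frames closer to the previous keyframe than `min_key_frame_interval`
        // are never flagged, and frames at or beyond `max_key_frame_interval`
        // always are; `handle_min_max_intervals` applies both limits.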
let distance = input_frameno - previous_keyframe; if frame_set.len() <= self.lookahead_offset { // Don't insert keyframes in the last few frames of the video // This is basically a scene flash and a waste of bits return false; } if self.scene_detection_mode == SceneDetectionSpeed::None { if let Some(true) = self.handle_min_max_intervals(distance) { return true; }; return false; } // Initialization of score deque // based on frame set length if self.deque_offset > 0 && frame_set.len() > self.deque_offset + 1 && self.score_deque.is_empty() { self.initialize_score_deque(frame_set, input_frameno, self.deque_offset); } else if self.score_deque.is_empty() { self.initialize_score_deque(frame_set, input_frameno, frame_set.len() - 1); self.deque_offset = frame_set.len() - 2; } // Running single frame comparison and adding it to deque // Decrease deque offset if there is no new frames if frame_set.len() > self.deque_offset + 1 { self.run_comparison( frame_set[self.deque_offset].clone(), frame_set[self.deque_offset + 1].clone(), input_frameno + self.deque_offset, ); } else { self.deque_offset -= 1; } // Adaptive scenecut check let (scenecut, score) = self.adaptive_scenecut(); let scenecut = self.handle_min_max_intervals(distance).unwrap_or(scenecut); debug!( "[SC-Detect] Frame {}: Raw={:5.1} ImpBl={:5.1} Bwd={:5.1} Fwd={:5.1} Th={:.1} {}", input_frameno, score.inter_cost, score.imp_block_cost, score.backward_adjusted_cost, score.forward_adjusted_cost, score.threshold, if scenecut { "Scenecut" } else { "No cut" } ); // Keep score deque of 5 backward frames // and forward frames of length of lookahead offset if self.score_deque.len() > 5 + self.lookahead_offset { self.score_deque.pop(); } scenecut } fn handle_min_max_intervals(&mut self, distance: usize) -> Option { // Handle minimum and maximum keyframe intervals. if distance < self.min_key_frame_interval { return Some(false); } if distance >= self.max_key_frame_interval { return Some(true); } None } // Initially fill score deque with frame scores fn initialize_score_deque( &mut self, frame_set: &[&Arc>], input_frameno: usize, init_len: usize, ) { for x in 0..init_len { self.run_comparison( frame_set[x].clone(), frame_set[x + 1].clone(), input_frameno + x, ); } } /// Runs scene change comparison between 2 given frames /// Insert result to start of score deque fn run_comparison( &mut self, frame1: Arc>, frame2: Arc>, input_frameno: usize, ) { let mut result = match self.scene_detection_mode { SceneDetectionSpeed::Fast => self.fast_scenecut(frame1, frame2), SceneDetectionSpeed::Standard => self.cost_scenecut(frame1, frame2, input_frameno), _ => unreachable!(), }; // Subtract the highest metric value of surrounding frames from the current one. // It makes the peaks in the metric more distinct. if self.scene_detection_mode == SceneDetectionSpeed::Standard && self.deque_offset > 0 { if input_frameno == 1 { // Accounts for the second frame not having a score to adjust against. // It should always be 0 because the first frame of the video is always a // keyframe. 
result.backward_adjusted_cost = 0.0; } else { let mut adjusted_cost = f64::MAX; for other_cost in self .score_deque .iter() .take(self.deque_offset) .map(|i| i.inter_cost) { let this_cost = result.inter_cost - other_cost; if this_cost < adjusted_cost { adjusted_cost = this_cost; } if adjusted_cost < 0.0 { adjusted_cost = 0.0; break; } } result.backward_adjusted_cost = adjusted_cost; } if !self.score_deque.is_empty() { for i in 0..cmp::min(self.deque_offset, self.score_deque.len()) { let adjusted_cost = self.score_deque[i].inter_cost - result.inter_cost; if i == 0 || adjusted_cost < self.score_deque[i].forward_adjusted_cost { self.score_deque[i].forward_adjusted_cost = adjusted_cost; } if self.score_deque[i].forward_adjusted_cost < 0.0 { self.score_deque[i].forward_adjusted_cost = 0.0; } } } } self.score_deque.insert(0, result); } /// Compares current scene score to adapted threshold based on previous /// scores /// /// Value of current frame is offset by lookahead, if lookahead >=5 /// /// Returns true if current scene score is higher than adapted threshold fn adaptive_scenecut(&mut self) -> (bool, ScenecutResult) { let score = self.score_deque[self.deque_offset]; // We use the importance block algorithm's cost metrics as a secondary algorithm // because, although it struggles in certain scenarios such as // finding the end of a pan, it is very good at detecting hard scenecuts // or detecting if a pan exists. // // Because of this, we only consider a frame for a scenechange if // the importance block algorithm is over the threshold either on this frame // (hard scenecut) or within the past few frames (pan). This helps // filter out a few false positives produced by the cost-based // algorithm. let imp_block_threshold = IMP_BLOCK_DIFF_THRESHOLD * (self.bit_depth as f64) / 8.0; if !&self.score_deque[self.deque_offset..] .iter() .any(|result| result.imp_block_cost >= imp_block_threshold) { return (false, score); } let cost = score.forward_adjusted_cost; if cost >= score.threshold { let back_deque = &self.score_deque[self.deque_offset + 1..]; let forward_deque = &self.score_deque[..self.deque_offset]; let back_over_tr_count = back_deque .iter() .filter(|result| result.backward_adjusted_cost >= result.threshold) .count(); let forward_over_tr_count = forward_deque .iter() .filter(|result| result.forward_adjusted_cost >= result.threshold) .count(); // Check for scenecut after the flashes // No frames over threshold forward // and some frames over threshold backward let back_count_req = if self.scene_detection_mode == SceneDetectionSpeed::Fast { // Fast scenecut is more sensitive to false flash detection, // so we want more "evidence" of there being a flash before creating a keyframe. 
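            // Concretely: fast mode requires two prior frames over the backward threshold,
            // while standard mode requires only one (the `2` / `1` just below).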
2 } else { 1 }; if forward_over_tr_count == 0 && back_over_tr_count >= back_count_req { return (true, score); } // Check for scenecut before flash // If distance longer than max flash length if back_over_tr_count == 0 && forward_over_tr_count == 1 && forward_deque[0].forward_adjusted_cost >= forward_deque[0].threshold { return (true, score); } if back_over_tr_count != 0 || forward_over_tr_count != 0 { return (false, score); } } (cost >= score.threshold, score) } } #[derive(Debug, Clone, Copy)] struct ScenecutResult { inter_cost: f64, imp_block_cost: f64, backward_adjusted_cost: f64, forward_adjusted_cost: f64, threshold: f64, } av-scenechange-0.14.1/src/analyze/standard.rs000064400000000000000000000065021046102023000171610ustar 00000000000000use std::sync::Arc; use v_frame::{frame::Frame, math::Fixed, pixel::Pixel}; use super::{SceneChangeDetector, ScenecutResult}; use crate::{ analyze::{ importance::estimate_importance_block_difference, inter::estimate_inter_costs, intra::estimate_intra_costs, }, data::motion::FrameMEStats, }; impl SceneChangeDetector { /// Run a comparison between two frames to determine if they qualify for a /// scenecut. /// /// We gather both intra and inter costs for the frames, /// as well as an importance-block-based difference, /// and use all three metrics. pub(super) fn cost_scenecut( &mut self, frame1: Arc>, frame2: Arc>, input_frameno: usize, ) -> ScenecutResult { let frame2_inter_ref = Arc::clone(&frame2); let frame1_imp_ref = Arc::clone(&frame1); let frame2_imp_ref = Arc::clone(&frame2); let mut intra_cost = 0.0; let mut mv_inter_cost = 0.0; let mut imp_block_cost = 0.0; let cols = 2 * self.resolution.0.align_power_of_two_and_shift(3); let rows = 2 * self.resolution.1.align_power_of_two_and_shift(3); let buffer = if let Some(buffer) = &self.frame_me_stats_buffer { Arc::clone(buffer) } else { let frame_me_stats = FrameMEStats::new_arc_array(cols, rows); let clone = Arc::clone(&frame_me_stats); self.frame_me_stats_buffer = Some(frame_me_stats); clone }; rayon::scope(|s| { s.spawn(|_| { let temp_plane = self .temp_plane .get_or_insert_with(|| frame2.planes[0].clone()); let intra_costs = estimate_intra_costs( temp_plane, &*frame2, self.bit_depth, self.cpu_feature_level, ); if let Some(ref mut intra_cache) = self.intra_costs { intra_cache.insert(input_frameno, intra_costs.clone()); } intra_cost = intra_costs.iter().map(|&cost| cost as u64).sum::() as f64 / intra_costs.len() as f64; }); s.spawn(|_| { mv_inter_cost = estimate_inter_costs( frame2_inter_ref, frame1, self.bit_depth, self.frame_rate, self.chroma_sampling, buffer, self.cpu_feature_level, ); }); s.spawn(|_| { imp_block_cost = estimate_importance_block_difference(frame2_imp_ref, frame1_imp_ref); }); }); // `BIAS` determines how likely we are // to choose a keyframe, between 0.0-1.0. // Higher values mean we are more likely to choose a keyframe. // This value was chosen based on trials using the new // adaptive scenecut code. const BIAS: f64 = 0.7; let threshold = intra_cost * (1.0 - BIAS); ScenecutResult { inter_cost: mv_inter_cost, imp_block_cost, threshold, backward_adjusted_cost: 0.0, forward_adjusted_cost: 0.0, } } } av-scenechange-0.14.1/src/asm/arm/64/ipred.S000064400000000000000000006375711046102023000164070ustar 00000000000000/* * Copyright © 2018, VideoLAN and dav1d authors * Copyright © 2019, Martin Storsjo * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. 
Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "src/asm/arm/asm.S" #include "util.S" // void ipred_dc_128_8bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, const int a, // const int max_width, const int max_height); function ipred_dc_128_8bpc_neon, export=1 clz w3, w3 adr x5, L(ipred_dc_128_tbl) sub w3, w3, #25 ldrh w3, [x5, w3, uxtw #1] movi v0.16b, #128 sub x5, x5, w3, uxtw add x6, x0, x1 lsl x1, x1, #1 br x5 4: AARCH64_VALID_JUMP_TARGET st1 {v0.s}[0], [x0], x1 st1 {v0.s}[0], [x6], x1 subs w4, w4, #4 st1 {v0.s}[0], [x0], x1 st1 {v0.s}[0], [x6], x1 b.gt 4b ret 8: AARCH64_VALID_JUMP_TARGET st1 {v0.8b}, [x0], x1 st1 {v0.8b}, [x6], x1 subs w4, w4, #4 st1 {v0.8b}, [x0], x1 st1 {v0.8b}, [x6], x1 b.gt 8b ret 16: AARCH64_VALID_JUMP_TARGET st1 {v0.16b}, [x0], x1 st1 {v0.16b}, [x6], x1 subs w4, w4, #4 st1 {v0.16b}, [x0], x1 st1 {v0.16b}, [x6], x1 b.gt 16b ret 320: AARCH64_VALID_JUMP_TARGET movi v1.16b, #128 32: st1 {v0.16b, v1.16b}, [x0], x1 st1 {v0.16b, v1.16b}, [x6], x1 subs w4, w4, #4 st1 {v0.16b, v1.16b}, [x0], x1 st1 {v0.16b, v1.16b}, [x6], x1 b.gt 32b ret 640: AARCH64_VALID_JUMP_TARGET movi v1.16b, #128 movi v2.16b, #128 movi v3.16b, #128 64: st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1 st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1 subs w4, w4, #4 st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1 st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1 b.gt 64b ret L(ipred_dc_128_tbl): .hword L(ipred_dc_128_tbl) - 640b .hword L(ipred_dc_128_tbl) - 320b .hword L(ipred_dc_128_tbl) - 16b .hword L(ipred_dc_128_tbl) - 8b .hword L(ipred_dc_128_tbl) - 4b endfunc // void ipred_v_8bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, const int a, // const int max_width, const int max_height); function ipred_v_8bpc_neon, export=1 clz w3, w3 adr x5, L(ipred_v_tbl) sub w3, w3, #25 ldrh w3, [x5, w3, uxtw #1] add x2, x2, #1 sub x5, x5, w3, uxtw add x6, x0, x1 lsl x1, x1, #1 br x5 40: AARCH64_VALID_JUMP_TARGET ld1 {v0.s}[0], [x2] 4: st1 {v0.s}[0], [x0], x1 st1 {v0.s}[0], [x6], x1 subs w4, w4, #4 st1 {v0.s}[0], [x0], x1 st1 {v0.s}[0], [x6], x1 b.gt 4b ret 80: AARCH64_VALID_JUMP_TARGET ld1 {v0.8b}, [x2] 8: st1 {v0.8b}, [x0], x1 st1 {v0.8b}, [x6], x1 subs w4, w4, #4 st1 {v0.8b}, [x0], x1 st1 {v0.8b}, [x6], x1 b.gt 8b ret 160: AARCH64_VALID_JUMP_TARGET ld1 {v0.16b}, [x2] 16: st1 {v0.16b}, [x0], x1 st1 {v0.16b}, [x6], x1 subs w4, w4, #4 
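        // write the same cached top row to the remaining two rows of this iteration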
st1 {v0.16b}, [x0], x1 st1 {v0.16b}, [x6], x1 b.gt 16b ret 320: AARCH64_VALID_JUMP_TARGET ld1 {v0.16b, v1.16b}, [x2] 32: st1 {v0.16b, v1.16b}, [x0], x1 st1 {v0.16b, v1.16b}, [x6], x1 subs w4, w4, #4 st1 {v0.16b, v1.16b}, [x0], x1 st1 {v0.16b, v1.16b}, [x6], x1 b.gt 32b ret 640: AARCH64_VALID_JUMP_TARGET ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x2] 64: st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1 st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1 subs w4, w4, #4 st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1 st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1 b.gt 64b ret L(ipred_v_tbl): .hword L(ipred_v_tbl) - 640b .hword L(ipred_v_tbl) - 320b .hword L(ipred_v_tbl) - 160b .hword L(ipred_v_tbl) - 80b .hword L(ipred_v_tbl) - 40b endfunc // void ipred_h_8bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, const int a, // const int max_width, const int max_height); function ipred_h_8bpc_neon, export=1 clz w3, w3 adr x5, L(ipred_h_tbl) sub w3, w3, #25 ldrh w3, [x5, w3, uxtw #1] sub x2, x2, #4 sub x5, x5, w3, uxtw mov x7, #-4 add x6, x0, x1 lsl x1, x1, #1 br x5 4: AARCH64_VALID_JUMP_TARGET ld4r {v0.8b, v1.8b, v2.8b, v3.8b}, [x2], x7 st1 {v3.s}[0], [x0], x1 st1 {v2.s}[0], [x6], x1 subs w4, w4, #4 st1 {v1.s}[0], [x0], x1 st1 {v0.s}[0], [x6], x1 b.gt 4b ret 8: AARCH64_VALID_JUMP_TARGET ld4r {v0.8b, v1.8b, v2.8b, v3.8b}, [x2], x7 st1 {v3.8b}, [x0], x1 st1 {v2.8b}, [x6], x1 subs w4, w4, #4 st1 {v1.8b}, [x0], x1 st1 {v0.8b}, [x6], x1 b.gt 8b ret 16: AARCH64_VALID_JUMP_TARGET ld4r {v0.16b, v1.16b, v2.16b, v3.16b}, [x2], x7 st1 {v3.16b}, [x0], x1 st1 {v2.16b}, [x6], x1 subs w4, w4, #4 st1 {v1.16b}, [x0], x1 st1 {v0.16b}, [x6], x1 b.gt 16b ret 32: AARCH64_VALID_JUMP_TARGET ld4r {v0.16b, v1.16b, v2.16b, v3.16b}, [x2], x7 str q3, [x0, #16] str q2, [x6, #16] st1 {v3.16b}, [x0], x1 st1 {v2.16b}, [x6], x1 subs w4, w4, #4 str q1, [x0, #16] str q0, [x6, #16] st1 {v1.16b}, [x0], x1 st1 {v0.16b}, [x6], x1 b.gt 32b ret 64: AARCH64_VALID_JUMP_TARGET ld4r {v0.16b, v1.16b, v2.16b, v3.16b}, [x2], x7 str q3, [x0, #16] str q2, [x6, #16] stp q3, q3, [x0, #32] stp q2, q2, [x6, #32] st1 {v3.16b}, [x0], x1 st1 {v2.16b}, [x6], x1 subs w4, w4, #4 str q1, [x0, #16] str q0, [x6, #16] stp q1, q1, [x0, #32] stp q0, q0, [x6, #32] st1 {v1.16b}, [x0], x1 st1 {v0.16b}, [x6], x1 b.gt 64b ret L(ipred_h_tbl): .hword L(ipred_h_tbl) - 64b .hword L(ipred_h_tbl) - 32b .hword L(ipred_h_tbl) - 16b .hword L(ipred_h_tbl) - 8b .hword L(ipred_h_tbl) - 4b endfunc // void ipred_dc_top_8bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, const int a, // const int max_width, const int max_height); function ipred_dc_top_8bpc_neon, export=1 clz w3, w3 adr x5, L(ipred_dc_top_tbl) sub w3, w3, #25 ldrh w3, [x5, w3, uxtw #1] add x2, x2, #1 sub x5, x5, w3, uxtw add x6, x0, x1 lsl x1, x1, #1 br x5 40: AARCH64_VALID_JUMP_TARGET ld1r {v0.2s}, [x2] uaddlv h0, v0.8b rshrn v0.8b, v0.8h, #3 dup v0.8b, v0.b[0] 4: st1 {v0.s}[0], [x0], x1 st1 {v0.s}[0], [x6], x1 subs w4, w4, #4 st1 {v0.s}[0], [x0], x1 st1 {v0.s}[0], [x6], x1 b.gt 4b ret 80: AARCH64_VALID_JUMP_TARGET ld1 {v0.8b}, [x2] uaddlv h0, v0.8b rshrn v0.8b, v0.8h, #3 dup v0.8b, v0.b[0] 8: st1 {v0.8b}, [x0], x1 st1 {v0.8b}, [x6], x1 subs w4, w4, #4 st1 {v0.8b}, [x0], x1 st1 {v0.8b}, [x6], x1 b.gt 8b ret 160: AARCH64_VALID_JUMP_TARGET ld1 {v0.16b}, [x2] uaddlv h0, v0.16b rshrn v0.8b, v0.8h, #4 dup v0.16b, v0.b[0] 16: st1 {v0.16b}, [x0], x1 st1 {v0.16b}, [x6], x1 subs w4, w4, #4 st1 {v0.16b}, [x0], x1 
st1 {v0.16b}, [x6], x1 b.gt 16b ret 320: AARCH64_VALID_JUMP_TARGET ld1 {v0.16b, v1.16b}, [x2] uaddlv h0, v0.16b uaddlv h1, v1.16b add v2.4h, v0.4h, v1.4h rshrn v2.8b, v2.8h, #5 dup v0.16b, v2.b[0] dup v1.16b, v2.b[0] 32: st1 {v0.16b, v1.16b}, [x0], x1 st1 {v0.16b, v1.16b}, [x6], x1 subs w4, w4, #4 st1 {v0.16b, v1.16b}, [x0], x1 st1 {v0.16b, v1.16b}, [x6], x1 b.gt 32b ret 640: AARCH64_VALID_JUMP_TARGET ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x2] uaddlv h0, v0.16b uaddlv h1, v1.16b uaddlv h2, v2.16b uaddlv h3, v3.16b add v4.4h, v0.4h, v1.4h add v5.4h, v2.4h, v3.4h add v4.4h, v4.4h, v5.4h rshrn v4.8b, v4.8h, #6 dup v0.16b, v4.b[0] dup v1.16b, v4.b[0] dup v2.16b, v4.b[0] dup v3.16b, v4.b[0] 64: st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1 st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1 subs w4, w4, #4 st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1 st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1 b.gt 64b ret L(ipred_dc_top_tbl): .hword L(ipred_dc_top_tbl) - 640b .hword L(ipred_dc_top_tbl) - 320b .hword L(ipred_dc_top_tbl) - 160b .hword L(ipred_dc_top_tbl) - 80b .hword L(ipred_dc_top_tbl) - 40b endfunc // void ipred_dc_left_8bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, const int a, // const int max_width, const int max_height); function ipred_dc_left_8bpc_neon, export=1 sub x2, x2, w4, uxtw clz w3, w3 clz w7, w4 adr x5, L(ipred_dc_left_tbl) sub w3, w3, #20 // 25 leading bits, minus table offset 5 sub w7, w7, #25 ldrh w3, [x5, w3, uxtw #1] ldrh w7, [x5, w7, uxtw #1] sub x3, x5, w3, uxtw sub x5, x5, w7, uxtw add x6, x0, x1 lsl x1, x1, #1 br x5 L(ipred_dc_left_h4): AARCH64_VALID_JUMP_TARGET ld1r {v0.2s}, [x2] uaddlv h0, v0.8b rshrn v0.8b, v0.8h, #3 dup v0.16b, v0.b[0] br x3 L(ipred_dc_left_w4): AARCH64_VALID_JUMP_TARGET st1 {v0.s}[0], [x0], x1 st1 {v0.s}[0], [x6], x1 subs w4, w4, #4 st1 {v0.s}[0], [x0], x1 st1 {v0.s}[0], [x6], x1 b.gt L(ipred_dc_left_w4) ret L(ipred_dc_left_h8): AARCH64_VALID_JUMP_TARGET ld1 {v0.8b}, [x2] uaddlv h0, v0.8b rshrn v0.8b, v0.8h, #3 dup v0.16b, v0.b[0] br x3 L(ipred_dc_left_w8): AARCH64_VALID_JUMP_TARGET st1 {v0.8b}, [x0], x1 st1 {v0.8b}, [x6], x1 subs w4, w4, #4 st1 {v0.8b}, [x0], x1 st1 {v0.8b}, [x6], x1 b.gt L(ipred_dc_left_w8) ret L(ipred_dc_left_h16): AARCH64_VALID_JUMP_TARGET ld1 {v0.16b}, [x2] uaddlv h0, v0.16b rshrn v0.8b, v0.8h, #4 dup v0.16b, v0.b[0] br x3 L(ipred_dc_left_w16): AARCH64_VALID_JUMP_TARGET st1 {v0.16b}, [x0], x1 st1 {v0.16b}, [x6], x1 subs w4, w4, #4 st1 {v0.16b}, [x0], x1 st1 {v0.16b}, [x6], x1 b.gt L(ipred_dc_left_w16) ret L(ipred_dc_left_h32): AARCH64_VALID_JUMP_TARGET ld1 {v0.16b, v1.16b}, [x2] uaddlv h0, v0.16b uaddlv h1, v1.16b add v0.4h, v0.4h, v1.4h rshrn v0.8b, v0.8h, #5 dup v0.16b, v0.b[0] br x3 L(ipred_dc_left_w32): AARCH64_VALID_JUMP_TARGET mov v1.16b, v0.16b 1: st1 {v0.16b, v1.16b}, [x0], x1 st1 {v0.16b, v1.16b}, [x6], x1 subs w4, w4, #4 st1 {v0.16b, v1.16b}, [x0], x1 st1 {v0.16b, v1.16b}, [x6], x1 b.gt 1b ret L(ipred_dc_left_h64): AARCH64_VALID_JUMP_TARGET ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x2] uaddlv h0, v0.16b uaddlv h1, v1.16b uaddlv h2, v2.16b uaddlv h3, v3.16b add v0.4h, v0.4h, v1.4h add v2.4h, v2.4h, v3.4h add v0.4h, v0.4h, v2.4h rshrn v0.8b, v0.8h, #6 dup v0.16b, v0.b[0] br x3 L(ipred_dc_left_w64): AARCH64_VALID_JUMP_TARGET mov v1.16b, v0.16b mov v2.16b, v0.16b mov v3.16b, v0.16b 1: st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1 st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1 subs w4, w4, #4 st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1 st1 {v0.16b, 
v1.16b, v2.16b, v3.16b}, [x6], x1 b.gt 1b ret L(ipred_dc_left_tbl): .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h64) .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h32) .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h16) .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h8) .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h4) .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w64) .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w32) .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w16) .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w8) .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w4) endfunc // void ipred_dc_8bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, const int a, // const int max_width, const int max_height); function ipred_dc_8bpc_neon, export=1 sub x2, x2, w4, uxtw add w7, w3, w4 // width + height clz w3, w3 clz w6, w4 dup v16.8h, w7 // width + height adr x5, L(ipred_dc_tbl) rbit w7, w7 // rbit(width + height) sub w3, w3, #20 // 25 leading bits, minus table offset 5 sub w6, w6, #25 clz w7, w7 // ctz(width + height) ldrh w3, [x5, w3, uxtw #1] ldrh w6, [x5, w6, uxtw #1] neg w7, w7 // -ctz(width + height) sub x3, x5, w3, uxtw sub x5, x5, w6, uxtw ushr v16.8h, v16.8h, #1 // (width + height) >> 1 dup v17.8h, w7 // -ctz(width + height) add x6, x0, x1 lsl x1, x1, #1 br x5 L(ipred_dc_h4): AARCH64_VALID_JUMP_TARGET ld1 {v0.s}[0], [x2], #4 ins v0.s[1], wzr uaddlv h0, v0.8b add x2, x2, #1 br x3 L(ipred_dc_w4): AARCH64_VALID_JUMP_TARGET ld1 {v1.s}[0], [x2] ins v1.s[1], wzr add v0.4h, v0.4h, v16.4h uaddlv h1, v1.8b cmp w4, #4 add v0.4h, v0.4h, v1.4h ushl v0.4h, v0.4h, v17.4h b.eq 1f // h = 8/16 mov w16, #(0x3334/2) movk w16, #(0x5556/2), lsl #16 add w17, w4, w4 // w17 = 2*h = 16 or 32 lsr w16, w16, w17 dup v16.4h, w16 sqdmulh v0.4h, v0.4h, v16.4h 1: dup v0.8b, v0.b[0] 2: st1 {v0.s}[0], [x0], x1 st1 {v0.s}[0], [x6], x1 subs w4, w4, #4 st1 {v0.s}[0], [x0], x1 st1 {v0.s}[0], [x6], x1 b.gt 2b ret L(ipred_dc_h8): AARCH64_VALID_JUMP_TARGET ld1 {v0.8b}, [x2], #8 uaddlv h0, v0.8b add x2, x2, #1 br x3 L(ipred_dc_w8): AARCH64_VALID_JUMP_TARGET ld1 {v1.8b}, [x2] add v0.4h, v0.4h, v16.4h uaddlv h1, v1.8b cmp w4, #8 add v0.4h, v0.4h, v1.4h ushl v0.4h, v0.4h, v17.4h b.eq 1f // h = 4/16/32 cmp w4, #32 mov w16, #(0x3334/2) mov w17, #(0x5556/2) csel w16, w16, w17, eq dup v16.4h, w16 sqdmulh v0.4h, v0.4h, v16.4h 1: dup v0.8b, v0.b[0] 2: st1 {v0.8b}, [x0], x1 st1 {v0.8b}, [x6], x1 subs w4, w4, #4 st1 {v0.8b}, [x0], x1 st1 {v0.8b}, [x6], x1 b.gt 2b ret L(ipred_dc_h16): AARCH64_VALID_JUMP_TARGET ld1 {v0.16b}, [x2], #16 uaddlv h0, v0.16b add x2, x2, #1 br x3 L(ipred_dc_w16): AARCH64_VALID_JUMP_TARGET ld1 {v1.16b}, [x2] add v0.4h, v0.4h, v16.4h uaddlv h1, v1.16b cmp w4, #16 add v0.4h, v0.4h, v1.4h ushl v0.4h, v0.4h, v17.4h b.eq 1f // h = 4/8/32/64 tst w4, #(32+16+8) // 16 added to make a consecutive bitmask mov w16, #(0x3334/2) mov w17, #(0x5556/2) csel w16, w16, w17, eq dup v16.4h, w16 sqdmulh v0.4h, v0.4h, v16.4h 1: dup v0.16b, v0.b[0] 2: st1 {v0.16b}, [x0], x1 st1 {v0.16b}, [x6], x1 subs w4, w4, #4 st1 {v0.16b}, [x0], x1 st1 {v0.16b}, [x6], x1 b.gt 2b ret L(ipred_dc_h32): AARCH64_VALID_JUMP_TARGET ld1 {v0.16b, v1.16b}, [x2], #32 uaddlv h0, v0.16b uaddlv h1, v1.16b add x2, x2, #1 add v0.4h, v0.4h, v1.4h br x3 L(ipred_dc_w32): AARCH64_VALID_JUMP_TARGET ld1 {v1.16b, v2.16b}, [x2] add v0.4h, v0.4h, v16.4h uaddlv h1, v1.16b uaddlv h2, v2.16b cmp w4, #32 add v0.4h, v0.4h, v1.4h add v0.4h, v0.4h, v2.4h ushl v4.4h, v0.4h, v17.4h b.eq 1f // h = 8/16/64 cmp w4, #8 mov w16, 
#(0x3334/2) mov w17, #(0x5556/2) csel w16, w16, w17, eq dup v16.4h, w16 sqdmulh v4.4h, v4.4h, v16.4h 1: dup v0.16b, v4.b[0] dup v1.16b, v4.b[0] 2: st1 {v0.16b, v1.16b}, [x0], x1 st1 {v0.16b, v1.16b}, [x6], x1 subs w4, w4, #4 st1 {v0.16b, v1.16b}, [x0], x1 st1 {v0.16b, v1.16b}, [x6], x1 b.gt 2b ret L(ipred_dc_h64): AARCH64_VALID_JUMP_TARGET ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x2], #64 uaddlv h0, v0.16b uaddlv h1, v1.16b uaddlv h2, v2.16b uaddlv h3, v3.16b add v0.4h, v0.4h, v1.4h add v2.4h, v2.4h, v3.4h add x2, x2, #1 add v0.4h, v0.4h, v2.4h br x3 L(ipred_dc_w64): AARCH64_VALID_JUMP_TARGET ld1 {v1.16b, v2.16b, v3.16b, v4.16b}, [x2] add v0.4h, v0.4h, v16.4h uaddlv h1, v1.16b uaddlv h2, v2.16b uaddlv h3, v3.16b uaddlv h4, v4.16b add v1.4h, v1.4h, v2.4h add v3.4h, v3.4h, v4.4h cmp w4, #64 add v0.4h, v0.4h, v1.4h add v0.4h, v0.4h, v3.4h ushl v4.4h, v0.4h, v17.4h b.eq 1f // h = 16/32 mov w16, #(0x5556/2) movk w16, #(0x3334/2), lsl #16 lsr w16, w16, w4 dup v16.4h, w16 sqdmulh v4.4h, v4.4h, v16.4h 1: dup v0.16b, v4.b[0] dup v1.16b, v4.b[0] dup v2.16b, v4.b[0] dup v3.16b, v4.b[0] 2: st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1 st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1 subs w4, w4, #4 st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1 st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1 b.gt 2b ret L(ipred_dc_tbl): .hword L(ipred_dc_tbl) - L(ipred_dc_h64) .hword L(ipred_dc_tbl) - L(ipred_dc_h32) .hword L(ipred_dc_tbl) - L(ipred_dc_h16) .hword L(ipred_dc_tbl) - L(ipred_dc_h8) .hword L(ipred_dc_tbl) - L(ipred_dc_h4) .hword L(ipred_dc_tbl) - L(ipred_dc_w64) .hword L(ipred_dc_tbl) - L(ipred_dc_w32) .hword L(ipred_dc_tbl) - L(ipred_dc_w16) .hword L(ipred_dc_tbl) - L(ipred_dc_w8) .hword L(ipred_dc_tbl) - L(ipred_dc_w4) endfunc // void ipred_paeth_8bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, const int a, // const int max_width, const int max_height); function ipred_paeth_8bpc_neon, export=1 clz w9, w3 adr x5, L(ipred_paeth_tbl) sub w9, w9, #25 ldrh w9, [x5, w9, uxtw #1] ld1r {v4.16b}, [x2] add x8, x2, #1 sub x2, x2, #4 sub x5, x5, w9, uxtw mov x7, #-4 add x6, x0, x1 lsl x1, x1, #1 br x5 40: AARCH64_VALID_JUMP_TARGET ld1r {v5.4s}, [x8] usubl v6.8h, v5.8b, v4.8b // top - topleft 4: ld4r {v0.8b, v1.8b, v2.8b, v3.8b}, [x2], x7 zip1 v0.2s, v0.2s, v1.2s zip1 v2.2s, v2.2s, v3.2s uaddw v16.8h, v6.8h, v0.8b uaddw v17.8h, v6.8h, v2.8b sqxtun v16.8b, v16.8h // base sqxtun2 v16.16b, v17.8h zip1 v0.2d, v0.2d, v2.2d uabd v20.16b, v5.16b, v16.16b // tdiff uabd v22.16b, v4.16b, v16.16b // tldiff uabd v16.16b, v0.16b, v16.16b // ldiff umin v18.16b, v20.16b, v22.16b // min(tdiff, tldiff) cmhs v20.16b, v22.16b, v20.16b // tldiff >= tdiff cmhs v16.16b, v18.16b, v16.16b // min(tdiff, tldiff) >= ldiff bsl v20.16b, v5.16b, v4.16b // tdiff <= tldiff ? top : topleft bit v20.16b, v0.16b, v16.16b // ldiff <= min ? left : ... 
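        // i.e. the Paeth predictor: pick whichever of left/top/topleft is closest
        // to base = left + top - topleft, preferring left, then top, then topleft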
st1 {v20.s}[3], [x0], x1 st1 {v20.s}[2], [x6], x1 subs w4, w4, #4 st1 {v20.s}[1], [x0], x1 st1 {v20.s}[0], [x6], x1 b.gt 4b ret 80: AARCH64_VALID_JUMP_TARGET ld1r {v5.2d}, [x8] usubl v6.8h, v5.8b, v4.8b // top - topleft 8: ld4r {v0.8b, v1.8b, v2.8b, v3.8b}, [x2], x7 uaddw v16.8h, v6.8h, v0.8b uaddw v17.8h, v6.8h, v1.8b uaddw v18.8h, v6.8h, v2.8b uaddw v19.8h, v6.8h, v3.8b sqxtun v16.8b, v16.8h // base sqxtun2 v16.16b, v17.8h sqxtun v18.8b, v18.8h sqxtun2 v18.16b, v19.8h zip1 v2.2d, v2.2d, v3.2d zip1 v0.2d, v0.2d, v1.2d uabd v21.16b, v5.16b, v18.16b // tdiff uabd v20.16b, v5.16b, v16.16b uabd v23.16b, v4.16b, v18.16b // tldiff uabd v22.16b, v4.16b, v16.16b uabd v17.16b, v2.16b, v18.16b // ldiff uabd v16.16b, v0.16b, v16.16b umin v19.16b, v21.16b, v23.16b // min(tdiff, tldiff) umin v18.16b, v20.16b, v22.16b cmhs v21.16b, v23.16b, v21.16b // tldiff >= tdiff cmhs v20.16b, v22.16b, v20.16b cmhs v17.16b, v19.16b, v17.16b // min(tdiff, tldiff) >= ldiff cmhs v16.16b, v18.16b, v16.16b bsl v21.16b, v5.16b, v4.16b // tdiff <= tldiff ? top : topleft bsl v20.16b, v5.16b, v4.16b bit v21.16b, v2.16b, v17.16b // ldiff <= min ? left : ... bit v20.16b, v0.16b, v16.16b st1 {v21.d}[1], [x0], x1 st1 {v21.d}[0], [x6], x1 subs w4, w4, #4 st1 {v20.d}[1], [x0], x1 st1 {v20.d}[0], [x6], x1 b.gt 8b ret 160: 320: 640: AARCH64_VALID_JUMP_TARGET ld1 {v5.16b}, [x8], #16 mov w9, w3 // Set up pointers for four rows in parallel; x0, x6, x5, x10 add x5, x0, x1 add x10, x6, x1 lsl x1, x1, #1 sub x1, x1, w3, uxtw 1: ld4r {v0.16b, v1.16b, v2.16b, v3.16b}, [x2], x7 2: usubl v6.8h, v5.8b, v4.8b // top - topleft usubl2 v7.8h, v5.16b, v4.16b uaddw v24.8h, v6.8h, v0.8b uaddw v25.8h, v7.8h, v0.8b uaddw v26.8h, v6.8h, v1.8b uaddw v27.8h, v7.8h, v1.8b uaddw v28.8h, v6.8h, v2.8b uaddw v29.8h, v7.8h, v2.8b uaddw v30.8h, v6.8h, v3.8b uaddw v31.8h, v7.8h, v3.8b sqxtun v17.8b, v26.8h // base sqxtun2 v17.16b, v27.8h sqxtun v16.8b, v24.8h sqxtun2 v16.16b, v25.8h sqxtun v19.8b, v30.8h sqxtun2 v19.16b, v31.8h sqxtun v18.8b, v28.8h sqxtun2 v18.16b, v29.8h uabd v23.16b, v5.16b, v19.16b // tdiff uabd v22.16b, v5.16b, v18.16b uabd v21.16b, v5.16b, v17.16b uabd v20.16b, v5.16b, v16.16b uabd v27.16b, v4.16b, v19.16b // tldiff uabd v26.16b, v4.16b, v18.16b uabd v25.16b, v4.16b, v17.16b uabd v24.16b, v4.16b, v16.16b uabd v19.16b, v3.16b, v19.16b // ldiff uabd v18.16b, v2.16b, v18.16b uabd v17.16b, v1.16b, v17.16b uabd v16.16b, v0.16b, v16.16b umin v31.16b, v23.16b, v27.16b // min(tdiff, tldiff) umin v30.16b, v22.16b, v26.16b umin v29.16b, v21.16b, v25.16b umin v28.16b, v20.16b, v24.16b cmhs v23.16b, v27.16b, v23.16b // tldiff >= tdiff cmhs v22.16b, v26.16b, v22.16b cmhs v21.16b, v25.16b, v21.16b cmhs v20.16b, v24.16b, v20.16b cmhs v19.16b, v31.16b, v19.16b // min(tdiff, tldiff) >= ldiff cmhs v18.16b, v30.16b, v18.16b cmhs v17.16b, v29.16b, v17.16b cmhs v16.16b, v28.16b, v16.16b bsl v23.16b, v5.16b, v4.16b // tdiff <= tldiff ? top : topleft bsl v22.16b, v5.16b, v4.16b bsl v21.16b, v5.16b, v4.16b bsl v20.16b, v5.16b, v4.16b bit v23.16b, v3.16b, v19.16b // ldiff <= min ? left : ... 
bit v22.16b, v2.16b, v18.16b bit v21.16b, v1.16b, v17.16b bit v20.16b, v0.16b, v16.16b subs w3, w3, #16 st1 {v23.16b}, [x0], #16 st1 {v22.16b}, [x6], #16 st1 {v21.16b}, [x5], #16 st1 {v20.16b}, [x10], #16 b.le 8f ld1 {v5.16b}, [x8], #16 b 2b 8: subs w4, w4, #4 b.le 9f // End of horizontal loop, move pointers to next four rows sub x8, x8, w9, uxtw add x0, x0, x1 add x6, x6, x1 // Load the top row as early as possible ld1 {v5.16b}, [x8], #16 add x5, x5, x1 add x10, x10, x1 mov w3, w9 b 1b 9: ret L(ipred_paeth_tbl): .hword L(ipred_paeth_tbl) - 640b .hword L(ipred_paeth_tbl) - 320b .hword L(ipred_paeth_tbl) - 160b .hword L(ipred_paeth_tbl) - 80b .hword L(ipred_paeth_tbl) - 40b endfunc // void ipred_smooth_8bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, const int a, // const int max_width, const int max_height); function ipred_smooth_8bpc_neon, export=1 movrel x10, X(sm_weights) add x11, x10, w4, uxtw add x10, x10, w3, uxtw clz w9, w3 adr x5, L(ipred_smooth_tbl) sub x12, x2, w4, uxtw sub w9, w9, #25 ldrh w9, [x5, w9, uxtw #1] ld1r {v4.16b}, [x12] // bottom add x8, x2, #1 sub x5, x5, w9, uxtw add x6, x0, x1 lsl x1, x1, #1 br x5 40: AARCH64_VALID_JUMP_TARGET ld1r {v6.2s}, [x8] // top ld1r {v7.2s}, [x10] // weights_hor sub x2, x2, #4 mov x7, #-4 dup v5.16b, v6.b[3] // right usubl v6.8h, v6.8b, v4.8b // top-bottom uxtl v7.8h, v7.8b // weights_hor 4: ld4r {v0.8b, v1.8b, v2.8b, v3.8b}, [x2], x7 // left ld4r {v16.8b, v17.8b, v18.8b, v19.8b}, [x11], #4 // weights_ver shll v20.8h, v5.8b, #8 // right*256 shll v21.8h, v5.8b, #8 zip1 v1.2s, v1.2s, v0.2s // left, flipped zip1 v0.2s, v3.2s, v2.2s zip1 v16.2s, v16.2s, v17.2s // weights_ver zip1 v18.2s, v18.2s, v19.2s shll v22.8h, v4.8b, #8 // bottom*256 shll v23.8h, v4.8b, #8 usubl v0.8h, v0.8b, v5.8b // left-right usubl v1.8h, v1.8b, v5.8b uxtl v16.8h, v16.8b // weights_ver uxtl v18.8h, v18.8b mla v20.8h, v0.8h, v7.8h // right*256 + (left-right)*weights_hor mla v21.8h, v1.8h, v7.8h mla v22.8h, v6.8h, v16.8h // bottom*256 + (top-bottom)*weights_ver mla v23.8h, v6.8h, v18.8h uhadd v20.8h, v20.8h, v22.8h uhadd v21.8h, v21.8h, v23.8h rshrn v20.8b, v20.8h, #8 rshrn v21.8b, v21.8h, #8 st1 {v20.s}[0], [x0], x1 st1 {v20.s}[1], [x6], x1 subs w4, w4, #4 st1 {v21.s}[0], [x0], x1 st1 {v21.s}[1], [x6], x1 b.gt 4b ret 80: AARCH64_VALID_JUMP_TARGET ld1 {v6.8b}, [x8] // top ld1 {v7.8b}, [x10] // weights_hor sub x2, x2, #4 mov x7, #-4 dup v5.16b, v6.b[7] // right usubl v6.8h, v6.8b, v4.8b // top-bottom uxtl v7.8h, v7.8b // weights_hor 8: ld4r {v0.8b, v1.8b, v2.8b, v3.8b}, [x2], x7 // left ld4r {v16.8b, v17.8b, v18.8b, v19.8b}, [x11], #4 // weights_ver shll v20.8h, v5.8b, #8 // right*256 shll v21.8h, v5.8b, #8 shll v22.8h, v5.8b, #8 shll v23.8h, v5.8b, #8 usubl v0.8h, v0.8b, v5.8b // left-right usubl v1.8h, v1.8b, v5.8b usubl v2.8h, v2.8b, v5.8b usubl v3.8h, v3.8b, v5.8b shll v24.8h, v4.8b, #8 // bottom*256 shll v25.8h, v4.8b, #8 shll v26.8h, v4.8b, #8 shll v27.8h, v4.8b, #8 uxtl v16.8h, v16.8b // weights_ver uxtl v17.8h, v17.8b uxtl v18.8h, v18.8b uxtl v19.8h, v19.8b mla v20.8h, v3.8h, v7.8h // right*256 + (left-right)*weights_hor mla v21.8h, v2.8h, v7.8h // (left flipped) mla v22.8h, v1.8h, v7.8h mla v23.8h, v0.8h, v7.8h mla v24.8h, v6.8h, v16.8h // bottom*256 + (top-bottom)*weights_ver mla v25.8h, v6.8h, v17.8h mla v26.8h, v6.8h, v18.8h mla v27.8h, v6.8h, v19.8h uhadd v20.8h, v20.8h, v24.8h uhadd v21.8h, v21.8h, v25.8h uhadd v22.8h, v22.8h, v26.8h uhadd v23.8h, v23.8h, v27.8h rshrn v20.8b, v20.8h, #8 rshrn 
v21.8b, v21.8h, #8 rshrn v22.8b, v22.8h, #8 rshrn v23.8b, v23.8h, #8 st1 {v20.8b}, [x0], x1 st1 {v21.8b}, [x6], x1 subs w4, w4, #4 st1 {v22.8b}, [x0], x1 st1 {v23.8b}, [x6], x1 b.gt 8b ret 160: 320: 640: AARCH64_VALID_JUMP_TARGET add x12, x2, w3, uxtw sub x2, x2, #2 mov x7, #-2 ld1r {v5.16b}, [x12] // right sub x1, x1, w3, uxtw mov w9, w3 1: ld2r {v0.8b, v1.8b}, [x2], x7 // left ld2r {v16.8b, v17.8b}, [x11], #2 // weights_ver usubl v0.8h, v0.8b, v5.8b // left-right usubl v1.8h, v1.8b, v5.8b uxtl v16.8h, v16.8b // weights_ver uxtl v17.8h, v17.8b 2: ld1 {v7.16b}, [x10], #16 // weights_hor ld1 {v3.16b}, [x8], #16 // top shll v20.8h, v5.8b, #8 // right*256 shll v21.8h, v5.8b, #8 shll v22.8h, v5.8b, #8 shll v23.8h, v5.8b, #8 uxtl v6.8h, v7.8b // weights_hor uxtl2 v7.8h, v7.16b usubl v2.8h, v3.8b, v4.8b // top-bottom usubl2 v3.8h, v3.16b, v4.16b mla v20.8h, v1.8h, v6.8h // right*256 + (left-right)*weights_hor mla v21.8h, v1.8h, v7.8h // (left flipped) mla v22.8h, v0.8h, v6.8h mla v23.8h, v0.8h, v7.8h shll v24.8h, v4.8b, #8 // bottom*256 shll v25.8h, v4.8b, #8 shll v26.8h, v4.8b, #8 shll v27.8h, v4.8b, #8 mla v24.8h, v2.8h, v16.8h // bottom*256 + (top-bottom)*weights_ver mla v25.8h, v3.8h, v16.8h mla v26.8h, v2.8h, v17.8h mla v27.8h, v3.8h, v17.8h uhadd v20.8h, v20.8h, v24.8h uhadd v21.8h, v21.8h, v25.8h uhadd v22.8h, v22.8h, v26.8h uhadd v23.8h, v23.8h, v27.8h rshrn v20.8b, v20.8h, #8 rshrn2 v20.16b, v21.8h, #8 rshrn v22.8b, v22.8h, #8 rshrn2 v22.16b, v23.8h, #8 subs w3, w3, #16 st1 {v20.16b}, [x0], #16 st1 {v22.16b}, [x6], #16 b.gt 2b subs w4, w4, #2 b.le 9f sub x8, x8, w9, uxtw sub x10, x10, w9, uxtw add x0, x0, x1 add x6, x6, x1 mov w3, w9 b 1b 9: ret L(ipred_smooth_tbl): .hword L(ipred_smooth_tbl) - 640b .hword L(ipred_smooth_tbl) - 320b .hword L(ipred_smooth_tbl) - 160b .hword L(ipred_smooth_tbl) - 80b .hword L(ipred_smooth_tbl) - 40b endfunc // void ipred_smooth_v_8bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, const int a, // const int max_width, const int max_height); function ipred_smooth_v_8bpc_neon, export=1 movrel x7, X(sm_weights) add x7, x7, w4, uxtw clz w9, w3 adr x5, L(ipred_smooth_v_tbl) sub x8, x2, w4, uxtw sub w9, w9, #25 ldrh w9, [x5, w9, uxtw #1] ld1r {v4.16b}, [x8] // bottom add x2, x2, #1 sub x5, x5, w9, uxtw add x6, x0, x1 lsl x1, x1, #1 br x5 40: AARCH64_VALID_JUMP_TARGET ld1r {v6.2s}, [x2] // top usubl v6.8h, v6.8b, v4.8b // top-bottom 4: ld4r {v16.8b, v17.8b, v18.8b, v19.8b}, [x7], #4 // weights_ver shll v22.8h, v4.8b, #8 // bottom*256 shll v23.8h, v4.8b, #8 zip1 v16.2s, v16.2s, v17.2s // weights_ver zip1 v18.2s, v18.2s, v19.2s uxtl v16.8h, v16.8b // weights_ver uxtl v18.8h, v18.8b mla v22.8h, v6.8h, v16.8h // bottom*256 + (top-bottom)*weights_ver mla v23.8h, v6.8h, v18.8h rshrn v22.8b, v22.8h, #8 rshrn v23.8b, v23.8h, #8 st1 {v22.s}[0], [x0], x1 st1 {v22.s}[1], [x6], x1 subs w4, w4, #4 st1 {v23.s}[0], [x0], x1 st1 {v23.s}[1], [x6], x1 b.gt 4b ret 80: AARCH64_VALID_JUMP_TARGET ld1 {v6.8b}, [x2] // top usubl v6.8h, v6.8b, v4.8b // top-bottom 8: ld4r {v16.8b, v17.8b, v18.8b, v19.8b}, [x7], #4 // weights_ver shll v24.8h, v4.8b, #8 // bottom*256 shll v25.8h, v4.8b, #8 shll v26.8h, v4.8b, #8 shll v27.8h, v4.8b, #8 uxtl v16.8h, v16.8b // weights_ver uxtl v17.8h, v17.8b uxtl v18.8h, v18.8b uxtl v19.8h, v19.8b mla v24.8h, v6.8h, v16.8h // bottom*256 + (top-bottom)*weights_ver mla v25.8h, v6.8h, v17.8h mla v26.8h, v6.8h, v18.8h mla v27.8h, v6.8h, v19.8h rshrn v24.8b, v24.8h, #8 rshrn v25.8b, v25.8h, #8 rshrn 
v26.8b, v26.8h, #8 rshrn v27.8b, v27.8h, #8 st1 {v24.8b}, [x0], x1 st1 {v25.8b}, [x6], x1 subs w4, w4, #4 st1 {v26.8b}, [x0], x1 st1 {v27.8b}, [x6], x1 b.gt 8b ret 160: 320: 640: AARCH64_VALID_JUMP_TARGET // Set up pointers for four rows in parallel; x0, x6, x5, x8 add x5, x0, x1 add x8, x6, x1 lsl x1, x1, #1 sub x1, x1, w3, uxtw mov w9, w3 1: ld4r {v16.8b, v17.8b, v18.8b, v19.8b}, [x7], #4 // weights_ver uxtl v16.8h, v16.8b // weights_ver uxtl v17.8h, v17.8b uxtl v18.8h, v18.8b uxtl v19.8h, v19.8b 2: ld1 {v3.16b}, [x2], #16 // top shll v20.8h, v4.8b, #8 // bottom*256 shll v21.8h, v4.8b, #8 shll v22.8h, v4.8b, #8 shll v23.8h, v4.8b, #8 shll v24.8h, v4.8b, #8 shll v25.8h, v4.8b, #8 shll v26.8h, v4.8b, #8 shll v27.8h, v4.8b, #8 usubl v2.8h, v3.8b, v4.8b // top-bottom usubl2 v3.8h, v3.16b, v4.16b mla v20.8h, v2.8h, v16.8h // bottom*256 + (top-bottom)*weights_ver mla v21.8h, v3.8h, v16.8h mla v22.8h, v2.8h, v17.8h mla v23.8h, v3.8h, v17.8h mla v24.8h, v2.8h, v18.8h mla v25.8h, v3.8h, v18.8h mla v26.8h, v2.8h, v19.8h mla v27.8h, v3.8h, v19.8h rshrn v20.8b, v20.8h, #8 rshrn2 v20.16b, v21.8h, #8 rshrn v22.8b, v22.8h, #8 rshrn2 v22.16b, v23.8h, #8 rshrn v24.8b, v24.8h, #8 rshrn2 v24.16b, v25.8h, #8 rshrn v26.8b, v26.8h, #8 rshrn2 v26.16b, v27.8h, #8 subs w3, w3, #16 st1 {v20.16b}, [x0], #16 st1 {v22.16b}, [x6], #16 st1 {v24.16b}, [x5], #16 st1 {v26.16b}, [x8], #16 b.gt 2b subs w4, w4, #4 b.le 9f sub x2, x2, w9, uxtw add x0, x0, x1 add x6, x6, x1 add x5, x5, x1 add x8, x8, x1 mov w3, w9 b 1b 9: ret L(ipred_smooth_v_tbl): .hword L(ipred_smooth_v_tbl) - 640b .hword L(ipred_smooth_v_tbl) - 320b .hword L(ipred_smooth_v_tbl) - 160b .hword L(ipred_smooth_v_tbl) - 80b .hword L(ipred_smooth_v_tbl) - 40b endfunc // void ipred_smooth_h_8bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, const int a, // const int max_width, const int max_height); function ipred_smooth_h_8bpc_neon, export=1 movrel x8, X(sm_weights) add x8, x8, w3, uxtw clz w9, w3 adr x5, L(ipred_smooth_h_tbl) add x12, x2, w3, uxtw sub w9, w9, #25 ldrh w9, [x5, w9, uxtw #1] ld1r {v5.16b}, [x12] // right sub x5, x5, w9, uxtw add x6, x0, x1 lsl x1, x1, #1 br x5 40: AARCH64_VALID_JUMP_TARGET ld1r {v7.2s}, [x8] // weights_hor sub x2, x2, #4 mov x7, #-4 uxtl v7.8h, v7.8b // weights_hor 4: ld4r {v0.8b, v1.8b, v2.8b, v3.8b}, [x2], x7 // left shll v20.8h, v5.8b, #8 // right*256 shll v21.8h, v5.8b, #8 zip1 v1.2s, v1.2s, v0.2s // left, flipped zip1 v0.2s, v3.2s, v2.2s usubl v0.8h, v0.8b, v5.8b // left-right usubl v1.8h, v1.8b, v5.8b mla v20.8h, v0.8h, v7.8h // right*256 + (left-right)*weights_hor mla v21.8h, v1.8h, v7.8h rshrn v20.8b, v20.8h, #8 rshrn v21.8b, v21.8h, #8 st1 {v20.s}[0], [x0], x1 st1 {v20.s}[1], [x6], x1 subs w4, w4, #4 st1 {v21.s}[0], [x0], x1 st1 {v21.s}[1], [x6], x1 b.gt 4b ret 80: AARCH64_VALID_JUMP_TARGET ld1 {v7.8b}, [x8] // weights_hor sub x2, x2, #4 mov x7, #-4 uxtl v7.8h, v7.8b // weights_hor 8: ld4r {v0.8b, v1.8b, v2.8b, v3.8b}, [x2], x7 // left shll v20.8h, v5.8b, #8 // right*256 shll v21.8h, v5.8b, #8 shll v22.8h, v5.8b, #8 shll v23.8h, v5.8b, #8 usubl v3.8h, v3.8b, v5.8b // left-right usubl v2.8h, v2.8b, v5.8b usubl v1.8h, v1.8b, v5.8b usubl v0.8h, v0.8b, v5.8b mla v20.8h, v3.8h, v7.8h // right*256 + (left-right)*weights_hor mla v21.8h, v2.8h, v7.8h // (left flipped) mla v22.8h, v1.8h, v7.8h mla v23.8h, v0.8h, v7.8h rshrn v20.8b, v20.8h, #8 rshrn v21.8b, v21.8h, #8 rshrn v22.8b, v22.8h, #8 rshrn v23.8b, v23.8h, #8 st1 {v20.8b}, [x0], x1 st1 {v21.8b}, [x6], x1 subs 
w4, w4, #4 st1 {v22.8b}, [x0], x1 st1 {v23.8b}, [x6], x1 b.gt 8b ret 160: 320: 640: AARCH64_VALID_JUMP_TARGET sub x2, x2, #4 mov x7, #-4 // Set up pointers for four rows in parallel; x0, x6, x5, x10 add x5, x0, x1 add x10, x6, x1 lsl x1, x1, #1 sub x1, x1, w3, uxtw mov w9, w3 1: ld4r {v0.8b, v1.8b, v2.8b, v3.8b}, [x2], x7 // left usubl v0.8h, v0.8b, v5.8b // left-right usubl v1.8h, v1.8b, v5.8b usubl v2.8h, v2.8b, v5.8b usubl v3.8h, v3.8b, v5.8b 2: ld1 {v7.16b}, [x8], #16 // weights_hor shll v20.8h, v5.8b, #8 // right*256 shll v21.8h, v5.8b, #8 shll v22.8h, v5.8b, #8 shll v23.8h, v5.8b, #8 shll v24.8h, v5.8b, #8 shll v25.8h, v5.8b, #8 shll v26.8h, v5.8b, #8 shll v27.8h, v5.8b, #8 uxtl v6.8h, v7.8b // weights_hor uxtl2 v7.8h, v7.16b mla v20.8h, v3.8h, v6.8h // right*256 + (left-right)*weights_hor mla v21.8h, v3.8h, v7.8h // (left flipped) mla v22.8h, v2.8h, v6.8h mla v23.8h, v2.8h, v7.8h mla v24.8h, v1.8h, v6.8h mla v25.8h, v1.8h, v7.8h mla v26.8h, v0.8h, v6.8h mla v27.8h, v0.8h, v7.8h rshrn v20.8b, v20.8h, #8 rshrn2 v20.16b, v21.8h, #8 rshrn v22.8b, v22.8h, #8 rshrn2 v22.16b, v23.8h, #8 rshrn v24.8b, v24.8h, #8 rshrn2 v24.16b, v25.8h, #8 rshrn v26.8b, v26.8h, #8 rshrn2 v26.16b, v27.8h, #8 subs w3, w3, #16 st1 {v20.16b}, [x0], #16 st1 {v22.16b}, [x6], #16 st1 {v24.16b}, [x5], #16 st1 {v26.16b}, [x10], #16 b.gt 2b subs w4, w4, #4 b.le 9f sub x8, x8, w9, uxtw add x0, x0, x1 add x6, x6, x1 add x5, x5, x1 add x10, x10, x1 mov w3, w9 b 1b 9: ret L(ipred_smooth_h_tbl): .hword L(ipred_smooth_h_tbl) - 640b .hword L(ipred_smooth_h_tbl) - 320b .hword L(ipred_smooth_h_tbl) - 160b .hword L(ipred_smooth_h_tbl) - 80b .hword L(ipred_smooth_h_tbl) - 40b endfunc const padding_mask_buf .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 padding_mask: .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff endconst // void ipred_z1_upsample_edge_8bpc_neon(pixel *out, const int hsz, // const pixel *const in, const int end); function ipred_z1_upsample_edge_8bpc_neon, export=1 movrel x4, padding_mask ld1 {v0.16b}, [x2] // in[] add x5, x2, w3, uxtw // in[end] sub x4, x4, w3, uxtw ld1r {v1.16b}, [x5] // padding ld1 {v3.16b}, [x4] // padding_mask movi v31.8h, #9 bit v0.16b, v1.16b, v3.16b // padded in[] ext v4.16b, v0.16b, v1.16b, #1 ext v5.16b, v0.16b, v1.16b, #2 ext v6.16b, v0.16b, v1.16b, #3 uaddl v16.8h, v4.8b, v5.8b // in[i+1] + in[i+2] uaddl2 v17.8h, v4.16b, v5.16b uaddl v18.8h, v0.8b, v6.8b // in[i+0] + in[i+3] uaddl2 v19.8h, v0.16b, v6.16b mul v16.8h, v16.8h, v31.8h // 9*(in[i+1] + in[i+2]) mul v17.8h, v17.8h, v31.8h sub v16.8h, v16.8h, v18.8h sub v17.8h, v17.8h, v19.8h sqrshrun v16.8b, v16.8h, #4 sqrshrun2 v16.16b, v17.8h, #4 zip1 v0.16b, v4.16b, v16.16b zip2 v1.16b, v4.16b, v16.16b st1 {v0.16b, v1.16b}, [x0] ret endfunc // void ipred_z2_upsample_edge_8bpc_neon(pixel *out, const int sz, // const pixel *const in); function ipred_z2_upsample_edge_8bpc_neon, export=1 // Here, sz is 4 or 8, and we produce 2*sz+1 output elements. 
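        // Even outputs are the input pixels themselves; odd outputs are interpolated
        // with the same (-1, 9, 9, -1)/16 rounding kernel used by
        // ipred_z1_upsample_edge above.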
movrel x4, padding_mask ld1 {v0.16b}, [x2] // in[] add x5, x2, w1, uxtw // in[sz] sub x4, x4, w1, uxtw ld1r {v2.16b}, [x2] // in[0] for padding ld1r {v1.16b}, [x5] // padding ld1 {v3.16b}, [x4] // padding_mask movi v31.8h, #9 bit v0.16b, v1.16b, v3.16b // padded in[] ext v4.16b, v2.16b, v0.16b, #15 ext v5.16b, v0.16b, v1.16b, #1 ext v6.16b, v0.16b, v1.16b, #2 uaddl v16.8h, v0.8b, v5.8b // in[i+0] + in[i+1] uaddl v18.8h, v4.8b, v6.8b // in[i-1] + in[i+2] mul v16.8h, v16.8h, v31.8h // 9*(in[i+1] + in[i+2]) sub v16.8h, v16.8h, v18.8h sqrshrun v16.8b, v16.8h, #4 add x5, x0, #16 zip1 v2.16b, v0.16b, v16.16b st1 {v1.b}[0], [x5] // In case sz=8, output one single pixel in out[16]. st1 {v2.16b}, [x0] ret endfunc const edge_filter .byte 0, 4, 8, 0 .byte 0, 5, 6, 0 // Leaving out the coeffs for strength=3 // .byte 2, 4, 4, 0 endconst // void ipred_z1_filter_edge_8bpc_neon(pixel *out, const int sz, // const pixel *const in, const int end, // const int strength); function ipred_z1_filter_edge_8bpc_neon, export=1 cmp w4, #3 b.eq L(fivetap) // if (strength == 3) goto fivetap movrel x5, edge_filter, -3 add x5, x5, w4, uxtw #2 // edge_filter + (strength - 1)*4 + 1 ld1 {v31.h}[0], [x5] // kernel[1-2] ld1 {v0.16b}, [x2], #16 dup v30.16b, v31.b[0] dup v31.16b, v31.b[1] 1: // in[end], is the last valid pixel. We produce 16 pixels out by // using 18 pixels in - the last pixel used is [17] of the ones // read/buffered. cmp w3, #17 ld1 {v1.16b}, [x2], #16 b.lt 2f ext v2.16b, v0.16b, v1.16b, #1 ext v3.16b, v0.16b, v1.16b, #2 umull v4.8h, v0.8b, v30.8b umlal v4.8h, v2.8b, v31.8b umlal v4.8h, v3.8b, v30.8b umull2 v5.8h, v0.16b, v30.16b umlal2 v5.8h, v2.16b, v31.16b umlal2 v5.8h, v3.16b, v30.16b subs w1, w1, #16 mov v0.16b, v1.16b rshrn v4.8b, v4.8h, #4 rshrn2 v4.16b, v5.8h, #4 sub w3, w3, #16 st1 {v4.16b}, [x0], #16 b.gt 1b ret 2: // Right padding // x2[w3-32] is the padding pixel (x2 points 32 bytes ahead) movrel x5, padding_mask sub w6, w3, #32 sub x5, x5, w3, uxtw add x6, x2, w6, sxtw ld1 {v2.16b}, [x5] // padding_mask ld1r {v1.16b}, [x6] bit v0.16b, v1.16b, v2.16b // Pad v0-v1 // Filter one block ext v2.16b, v0.16b, v1.16b, #1 ext v3.16b, v0.16b, v1.16b, #2 umull v4.8h, v0.8b, v30.8b umlal v4.8h, v2.8b, v31.8b umlal v4.8h, v3.8b, v30.8b umull2 v5.8h, v0.16b, v30.16b umlal2 v5.8h, v2.16b, v31.16b umlal2 v5.8h, v3.16b, v30.16b subs w1, w1, #16 rshrn v4.8b, v4.8h, #4 rshrn2 v4.16b, v5.8h, #4 st1 {v4.16b}, [x0], #16 b.le 9f 5: // After one block, any remaining output would only be filtering // padding - thus just store the padding. subs w1, w1, #16 st1 {v1.16b}, [x0], #16 b.gt 5b 9: ret L(fivetap): sub x2, x2, #1 // topleft -= 1 movi v29.16b, #2 ld1 {v0.16b}, [x2], #16 movi v30.16b, #4 movi v31.16b, #4 ins v0.b[0], v0.b[1] 1: // in[end+1], is the last valid pixel. We produce 16 pixels out by // using 20 pixels in - the last pixel used is [19] of the ones // read/buffered. 
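        // strength == 3 path: 5-tap smoothing with the (2, 4, 4, 4, 2)/16 kernel
        // (the 3-tap strengths 1 and 2 go through the edge_filter table instead)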
cmp w3, #18 ld1 {v1.16b}, [x2], #16 b.lt 2f // if (end + 1 < 19) ext v2.16b, v0.16b, v1.16b, #1 ext v3.16b, v0.16b, v1.16b, #2 ext v4.16b, v0.16b, v1.16b, #3 ext v5.16b, v0.16b, v1.16b, #4 umull v6.8h, v0.8b, v29.8b umlal v6.8h, v2.8b, v30.8b umlal v6.8h, v3.8b, v31.8b umlal v6.8h, v4.8b, v30.8b umlal v6.8h, v5.8b, v29.8b umull2 v7.8h, v0.16b, v29.16b umlal2 v7.8h, v2.16b, v30.16b umlal2 v7.8h, v3.16b, v31.16b umlal2 v7.8h, v4.16b, v30.16b umlal2 v7.8h, v5.16b, v29.16b subs w1, w1, #16 mov v0.16b, v1.16b rshrn v6.8b, v6.8h, #4 rshrn2 v6.16b, v7.8h, #4 sub w3, w3, #16 st1 {v6.16b}, [x0], #16 b.gt 1b ret 2: // Right padding // x2[w3+1-32] is the padding pixel (x2 points 32 bytes ahead) movrel x5, padding_mask, -1 sub w6, w3, #31 sub x5, x5, w3, uxtw add x6, x2, w6, sxtw ld1 {v2.16b, v3.16b}, [x5] // padding_mask ld1r {v28.16b}, [x6] bit v0.16b, v28.16b, v2.16b // Pad v0-v1 bit v1.16b, v28.16b, v3.16b 4: // Filter one block ext v2.16b, v0.16b, v1.16b, #1 ext v3.16b, v0.16b, v1.16b, #2 ext v4.16b, v0.16b, v1.16b, #3 ext v5.16b, v0.16b, v1.16b, #4 umull v6.8h, v0.8b, v29.8b umlal v6.8h, v2.8b, v30.8b umlal v6.8h, v3.8b, v31.8b umlal v6.8h, v4.8b, v30.8b umlal v6.8h, v5.8b, v29.8b umull2 v7.8h, v0.16b, v29.16b umlal2 v7.8h, v2.16b, v30.16b umlal2 v7.8h, v3.16b, v31.16b umlal2 v7.8h, v4.16b, v30.16b umlal2 v7.8h, v5.16b, v29.16b subs w1, w1, #16 mov v0.16b, v1.16b mov v1.16b, v28.16b rshrn v6.8b, v6.8h, #4 rshrn2 v6.16b, v7.8h, #4 sub w3, w3, #16 st1 {v6.16b}, [x0], #16 b.le 9f // v0-v1[w3+1] is the last valid pixel; if (w3 + 1 > 0) we need to // filter properly once more - aka (w3 >= 0). cmp w3, #0 b.ge 4b 5: // When w3 <= 0, all remaining pixels in v0-v1 are equal to the // last valid pixel - thus just output that without filtering. subs w1, w1, #16 st1 {v1.16b}, [x0], #16 b.gt 5b 9: ret endfunc // void ipred_pixel_set_8bpc_neon(pixel *out, const pixel px, // const int n); function ipred_pixel_set_8bpc_neon, export=1 dup v0.16b, w1 1: subs w2, w2, #16 st1 {v0.16b}, [x0], #16 b.gt 1b ret endfunc // void ipred_z1_fill1_8bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const top, // const int width, const int height, // const int dx, const int max_base_x); function ipred_z1_fill1_8bpc_neon, export=1 clz w9, w3 adr x8, L(ipred_z1_fill1_tbl) sub w9, w9, #25 ldrh w9, [x8, w9, uxtw #1] add x10, x2, w6, uxtw // top[max_base_x] sub x8, x8, w9, uxtw ld1r {v31.16b}, [x10] // padding mov w7, w5 mov w15, #64 br x8 40: AARCH64_VALID_JUMP_TARGET 4: lsr w8, w7, #6 // base and w9, w7, #0x3e // frac add w7, w7, w5 // xpos += dx cmp w8, w6 // base >= max_base_x lsr w10, w7, #6 // base and w11, w7, #0x3e // frac b.ge 49f ldr d0, [x2, w8, uxtw] // top[base] ldr d2, [x2, w10, uxtw] dup v4.4h, w9 // frac dup v5.4h, w11 ext v1.8b, v0.8b, v0.8b, #1 // top[base+1] ext v3.8b, v2.8b, v2.8b, #1 usubl v6.8h, v1.8b, v0.8b // top[base+1]-top[base] usubl v7.8h, v3.8b, v2.8b ushll v16.8h, v0.8b, #6 // top[base]*64 ushll v17.8h, v2.8b, #6 mla v16.4h, v6.4h, v4.4h // + top[base+1]*frac mla v17.4h, v7.4h, v5.4h rshrn v16.8b, v16.8h, #6 rshrn v17.8b, v17.8h, #6 st1 {v16.s}[0], [x0], x1 add w7, w7, w5 // xpos += dx subs w4, w4, #2 st1 {v17.s}[0], [x0], x1 b.gt 4b ret 49: st1 {v31.s}[0], [x0], x1 subs w4, w4, #2 st1 {v31.s}[0], [x0], x1 b.gt 49b ret 80: AARCH64_VALID_JUMP_TARGET 8: lsr w8, w7, #6 // base and w9, w7, #0x3e // frac add w7, w7, w5 // xpos += dx cmp w8, w6 // base >= max_base_x lsr w10, w7, #6 // base and w11, w7, #0x3e // frac b.ge 89f ldr q0, [x2, w8, uxtw] // top[base] ldr q2, [x2, w10, uxtw] dup v4.8b, w9 
// frac dup v5.8b, w11 sub w9, w15, w9 // 64 - frac sub w11, w15, w11 dup v6.8b, w9 // 64 - frac dup v7.8b, w11 ext v1.16b, v0.16b, v0.16b, #1 // top[base+1] ext v3.16b, v2.16b, v2.16b, #1 umull v16.8h, v0.8b, v6.8b // top[base]*(64-frac) umlal v16.8h, v1.8b, v4.8b // + top[base+1]*frac umull v17.8h, v2.8b, v7.8b umlal v17.8h, v3.8b, v5.8b rshrn v16.8b, v16.8h, #6 rshrn v17.8b, v17.8h, #6 st1 {v16.8b}, [x0], x1 add w7, w7, w5 // xpos += dx subs w4, w4, #2 st1 {v17.8b}, [x0], x1 b.gt 8b ret 89: st1 {v31.8b}, [x0], x1 subs w4, w4, #2 st1 {v31.8b}, [x0], x1 b.gt 89b ret 160: 320: 640: AARCH64_VALID_JUMP_TARGET mov w12, w3 add x13, x0, x1 lsl x1, x1, #1 sub x1, x1, w3, uxtw 1: lsr w8, w7, #6 // base and w9, w7, #0x3e // frac add w7, w7, w5 // xpos += dx cmp w8, w6 // base >= max_base_x lsr w10, w7, #6 // base and w11, w7, #0x3e // frac b.ge 169f add x8, x2, w8, uxtw add x10, x2, w10, uxtw dup v4.16b, w9 // frac dup v5.16b, w11 ld1 {v0.16b, v1.16b}, [x8], #32 // top[base] ld1 {v2.16b, v3.16b}, [x10], #32 sub w9, w15, w9 // 64 - frac sub w11, w15, w11 dup v6.16b, w9 // 64 - frac dup v7.16b, w11 add w7, w7, w5 // xpos += dx 2: ext v16.16b, v0.16b, v1.16b, #1 // top[base+1] ext v17.16b, v2.16b, v3.16b, #1 subs w3, w3, #16 umull v18.8h, v0.8b, v6.8b // top[base]*(64-frac) umlal v18.8h, v16.8b, v4.8b // + top[base+1]*frac umull2 v19.8h, v0.16b, v6.16b umlal2 v19.8h, v16.16b, v4.16b umull v20.8h, v2.8b, v7.8b umlal v20.8h, v17.8b, v5.8b umull2 v21.8h, v2.16b, v7.16b umlal2 v21.8h, v17.16b, v5.16b rshrn v16.8b, v18.8h, #6 rshrn2 v16.16b, v19.8h, #6 rshrn v17.8b, v20.8h, #6 rshrn2 v17.16b, v21.8h, #6 st1 {v16.16b}, [x0], #16 st1 {v17.16b}, [x13], #16 b.le 3f mov v0.16b, v1.16b ld1 {v1.16b}, [x8], #16 // top[base] mov v2.16b, v3.16b ld1 {v3.16b}, [x10], #16 b 2b 3: subs w4, w4, #2 b.le 9f add x0, x0, x1 add x13, x13, x1 mov w3, w12 b 1b 9: ret 169: st1 {v31.16b}, [x0], #16 subs w3, w3, #16 st1 {v31.16b}, [x13], #16 b.gt 169b subs w4, w4, #2 b.le 9b add x0, x0, x1 add x13, x13, x1 mov w3, w12 b 169b L(ipred_z1_fill1_tbl): .hword L(ipred_z1_fill1_tbl) - 640b .hword L(ipred_z1_fill1_tbl) - 320b .hword L(ipred_z1_fill1_tbl) - 160b .hword L(ipred_z1_fill1_tbl) - 80b .hword L(ipred_z1_fill1_tbl) - 40b endfunc function ipred_z1_fill2_8bpc_neon, export=1 cmp w3, #8 add x10, x2, w6, uxtw // top[max_base_x] ld1r {v31.16b}, [x10] // padding mov w7, w5 mov w15, #64 b.eq 8f 4: // w == 4 lsr w8, w7, #6 // base and w9, w7, #0x3e // frac add w7, w7, w5 // xpos += dx cmp w8, w6 // base >= max_base_x lsr w10, w7, #6 // base and w11, w7, #0x3e // frac b.ge 49f ldr d0, [x2, w8, uxtw] // top[base] ldr d2, [x2, w10, uxtw] dup v4.4h, w9 // frac dup v5.4h, w11 uzp2 v1.8b, v0.8b, v0.8b // top[base+1] uzp1 v0.8b, v0.8b, v0.8b // top[base] uzp2 v3.8b, v2.8b, v2.8b uzp1 v2.8b, v2.8b, v2.8b usubl v6.8h, v1.8b, v0.8b // top[base+1]-top[base] usubl v7.8h, v3.8b, v2.8b ushll v16.8h, v0.8b, #6 // top[base]*64 ushll v17.8h, v2.8b, #6 mla v16.4h, v6.4h, v4.4h // + top[base+1]*frac mla v17.4h, v7.4h, v5.4h rshrn v16.8b, v16.8h, #6 rshrn v17.8b, v17.8h, #6 st1 {v16.s}[0], [x0], x1 add w7, w7, w5 // xpos += dx subs w4, w4, #2 st1 {v17.s}[0], [x0], x1 b.gt 4b ret 49: st1 {v31.s}[0], [x0], x1 subs w4, w4, #2 st1 {v31.s}[0], [x0], x1 b.gt 49b ret 8: // w == 8 lsr w8, w7, #6 // base and w9, w7, #0x3e // frac add w7, w7, w5 // xpos += dx cmp w8, w6 // base >= max_base_x lsr w10, w7, #6 // base and w11, w7, #0x3e // frac b.ge 89f ldr q0, [x2, w8, uxtw] // top[base] ldr q2, [x2, w10, uxtw] dup v4.8b, w9 // frac dup v5.8b, w11 sub w9, w15, w9 // 64 
- frac sub w11, w15, w11 dup v6.8b, w9 // 64 - frac dup v7.8b, w11 uzp2 v1.16b, v0.16b, v0.16b // top[base+1] uzp1 v0.16b, v0.16b, v0.16b // top[base] uzp2 v3.16b, v2.16b, v2.16b uzp1 v2.16b, v2.16b, v2.16b umull v16.8h, v1.8b, v4.8b // top[base+1]*frac umlal v16.8h, v0.8b, v6.8b // + top[base]*(64-frac) umull v17.8h, v3.8b, v5.8b umlal v17.8h, v2.8b, v7.8b rshrn v16.8b, v16.8h, #6 rshrn v17.8b, v17.8h, #6 st1 {v16.8b}, [x0], x1 add w7, w7, w5 // xpos += dx subs w4, w4, #2 st1 {v17.8b}, [x0], x1 b.gt 8b ret 89: st1 {v31.8b}, [x0], x1 subs w4, w4, #2 st1 {v31.8b}, [x0], x1 b.gt 89b ret endfunc // void ipred_reverse_8bpc_neon(pixel *dst, const pixel *const src, // const int n); function ipred_reverse_8bpc_neon, export=1 sub x1, x1, #16 add x3, x0, #8 mov x4, #16 1: ld1 {v0.16b}, [x1] subs w2, w2, #16 rev64 v0.16b, v0.16b sub x1, x1, #16 st1 {v0.d}[1], [x0], x4 st1 {v0.d}[0], [x3], x4 b.gt 1b ret endfunc const increments .short 0, 1, 2, 3, 4, 5, 6, 7 .short 8, 9, 10, 11, 12, 13, 14, 15 endconst // void ipred_z2_fill1_8bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const top, // const pixel *const left, // const int width, const int height, // const int dx, const int dy); function ipred_z2_fill1_8bpc_neon, export=1 clz w10, w4 adr x9, L(ipred_z2_fill1_tbl) sub w10, w10, #25 ldrh w10, [x9, w10, uxtw #1] mov w8, #(1 << 6) // xpos = 1 << 6 sub x9, x9, w10, uxtw sub w8, w8, w6 // xpos -= dx movrel x11, increments ld1 {v31.8h}, [x11] // increments neg w7, w7 // -dy br x9 40: AARCH64_VALID_JUMP_TARGET dup v30.4h, w7 // -dy movi v17.8b, #1 mul v16.4h, v31.4h, v30.4h // {0,1,2,3}* -dy movi v25.16b, #0x3e add v30.4h, v16.4h, v30.4h // -= dy xtn v31.8b, v31.8h // {0,1,2,3} // Worst case height for w=4 is 16, but we need at least h+1 elements ld1 {v0.16b, v1.16b}, [x3] // left[] movi v26.16b, #64 movi v19.16b, #2 xtn v27.8b, v30.8h // (uint8_t)ypos shrn v29.8b, v30.8h, #6 // ypos >> 6 and v27.8b, v27.8b, v25.8b // frac_y add v29.8b, v29.8b, v17.8b // base_y = (ypos >> 6) + 1 add v30.8b, v29.8b, v17.8b // base_y + 1 add v28.8b, v29.8b, v19.8b // base_y + 2 tbl v16.8b, {v0.16b}, v29.8b // left[base_y] trn1 v30.2s, v30.2s, v28.2s // base_y + 1, base_y + 2 sub v28.8b, v26.8b, v27.8b // 64 - frac_y trn1 v31.2s, v31.2s, v31.2s // {0,1,2,3,0,1,2,3} trn1 v27.2s, v27.2s, v27.2s // frac_y trn1 v28.2s, v28.2s, v28.2s // 64 - frac_y movi v29.8b, #2 4: asr w9, w8, #6 // base_x dup v6.4h, w8 // xpos sub w8, w8, w6 // xpos -= dx cmp w9, #-4 // base_x <= -4 asr w11, w8, #6 // base_x b.le 49f dup v7.4h, w8 // xpos ldr d2, [x2, w9, sxtw] // top[base_x] ldr d4, [x2, w11, sxtw] trn1 v6.2d, v6.2d, v7.2d // xpos // Cut corners here; only doing tbl over v0 here; we only // seem to need the last pixel, from v1, after skipping to the // left-only codepath below. 
tbl v17.8b, {v0.16b}, v30.8b // left[base_y+1], left[base_y+2] shrn v20.8b, v6.8h, #6 // first base_x for each row xtn v6.8b, v6.8h // (uint8_t)xpos ext v3.8b, v2.8b, v2.8b, #1 // top[base_x+1] ext v5.8b, v4.8b, v4.8b, #1 and v6.8b, v6.8b, v25.8b // frac_x trn1 v16.2s, v16.2s, v17.2s // left[base_y], left[base_y+1] trn1 v2.2s, v2.2s, v4.2s // top[base_x] trn1 v3.2s, v3.2s, v5.2s // top[base_x+1] sub v7.8b, v26.8b, v6.8b // 64 - frac_x add v20.8b, v20.8b, v31.8b // actual base_x umull v16.8h, v16.8b, v28.8b // left[base_y]*(64-frac_y) umlal v16.8h, v17.8b, v27.8b // + left[base_y+1]*frac_y umull v22.8h, v2.8b, v7.8b // top[base_x]-*(64-frac_x) umlal v22.8h, v3.8b, v6.8b // + top[base_x+1]*frac_x cmge v20.8b, v20.8b, #0 rshrn v16.8b, v16.8h, #6 rshrn v22.8b, v22.8h, #6 bit v16.8b, v22.8b, v20.8b st1 {v16.s}[0], [x0], x1 sub w8, w8, w6 // xpos -= dx subs w5, w5, #2 st1 {v16.s}[1], [x0], x1 b.le 9f ext v16.8b, v17.8b, v17.8b, #4 add v30.8b, v30.8b, v29.8b // base_y += 2 b 4b 49: tbl v17.8b, {v0.16b, v1.16b}, v30.8b // left[base_y+1], left[base_y+2] trn1 v16.2s, v16.2s, v17.2s // left[base_y], left[base_y+1] umull v18.8h, v16.8b, v28.8b // left[base_y]*(64-frac_t) umlal v18.8h, v17.8b, v27.8b // + left[base_y+1]*frac_y rshrn v18.8b, v18.8h, #6 st1 {v18.s}[0], [x0], x1 subs w5, w5, #2 st1 {v18.s}[1], [x0], x1 b.le 9f ext v16.8b, v17.8b, v17.8b, #4 add v30.8b, v30.8b, v29.8b // base_y += 2 b 49b 9: ret 80: AARCH64_VALID_JUMP_TARGET dup v30.8h, w7 // -dy movi v17.8b, #1 mul v16.8h, v31.8h, v30.8h // {0,1,2,3,4,5,6,7}* -dy movi v25.16b, #0x3e add v30.8h, v16.8h, v30.8h // -= dy xtn v31.8b, v31.8h // {0,1,2,3,4,5,6,7} // Worst case height for w=8 is 32, but we need at least h+1 elements ld1 {v0.16b, v1.16b, v2.16b}, [x3] // left[] movi v26.16b, #64 movi v19.16b, #2 xtn v27.8b, v30.8h // (uint8_t)ypos shrn v29.8b, v30.8h, #6 // ypos >> 6 and v27.8b, v27.8b, v25.8b // frac_y add v29.8b, v29.8b, v17.8b // base_y = (ypos >> 6) + 1 // Cut corners here; for the first row we don't expect to need to // read outside of v0. tbl v18.8b, {v0.16b}, v29.8b // left[base_y] add v30.8b, v29.8b, v19.8b // base_y + 2 add v29.8b, v29.8b, v17.8b // base_y + 1 sub v28.8b, v26.8b, v27.8b // 64 - frac_y trn1 v31.2d, v31.2d, v31.2d // {0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7} movi v24.8b, #2 // 2 8: asr w9, w8, #6 // base_x dup v16.8h, w8 // xpos sub w8, w8, w6 // xpos -= dx cmp w9, #-8 // base_x <= -8 asr w11, w8, #6 // base_x b.le 89f dup v17.8h, w8 // xpos ldr q4, [x2, w9, sxtw] // top[base_x] ldr q6, [x2, w11, sxtw] // Cut corners here; only doing tbl over v0-v1 here; we only // seem to need the last pixel, from v2, after skipping to the // left-only codepath below. 
tbl v19.8b, {v0.16b, v1.16b}, v29.8b // left[base_y+1] shrn v21.8b, v16.8h, #6 // first base_x shrn2 v21.16b, v17.8h, #6 xtn v16.8b, v16.8h // (uint8_t)xpos xtn2 v16.16b, v17.8h tbl v20.8b, {v0.16b, v1.16b}, v30.8b // left[base_y+2] ext v5.16b, v4.16b, v4.16b, #1 // top[base_x+1] ext v7.16b, v6.16b, v6.16b, #1 and v16.16b, v16.16b, v25.16b // frac_x trn1 v4.2d, v4.2d, v6.2d // top[base_x] trn1 v5.2d, v5.2d, v7.2d // top[base_x+1] sub v7.16b, v26.16b, v16.16b // 64 - frac_x add v21.16b, v21.16b, v31.16b // actual base_x umull v6.8h, v18.8b, v28.8b // left[base_y]*(64-frac_y) umlal v6.8h, v19.8b, v27.8b // + left[base_y+1]*frac_y umull v17.8h, v19.8b, v28.8b umlal v17.8h, v20.8b, v27.8b umull v22.8h, v4.8b, v7.8b // top[base_x]-*(64-frac_x) umlal v22.8h, v5.8b, v16.8b // + top[base_x+1]*frac_x umull2 v23.8h, v4.16b, v7.16b umlal2 v23.8h, v5.16b, v16.16b cmge v21.16b, v21.16b, #0 rshrn v6.8b, v6.8h, #6 rshrn2 v6.16b, v17.8h, #6 rshrn v22.8b, v22.8h, #6 rshrn2 v22.16b, v23.8h, #6 bit v6.16b, v22.16b, v21.16b st1 {v6.d}[0], [x0], x1 sub w8, w8, w6 // xpos -= dx subs w5, w5, #2 st1 {v6.d}[1], [x0], x1 b.le 9f mov v18.8b, v20.8b add v29.8b, v29.8b, v24.8b // base_y += 2 add v30.8b, v30.8b, v24.8b // base_y += 2 b 8b 89: tbl v19.8b, {v0.16b, v1.16b, v2.16b}, v29.8b // left[base_y+1] tbl v20.8b, {v0.16b, v1.16b, v2.16b}, v30.8b // left[base_y+2] umull v6.8h, v18.8b, v28.8b // left[base_y]*(64-frac_y) umlal v6.8h, v19.8b, v27.8b // + left[base_y+1]*frac_y umull v17.8h, v19.8b, v28.8b umlal v17.8h, v20.8b, v27.8b rshrn v6.8b, v6.8h, #6 rshrn2 v6.16b, v17.8h, #6 st1 {v6.d}[0], [x0], x1 subs w5, w5, #2 st1 {v6.d}[1], [x0], x1 b.le 9f mov v18.8b, v20.8b add v29.8b, v29.8b, v24.8b // base_y += 2 add v30.8b, v30.8b, v24.8b // base_y += 2 b 89b 9: ret 160: AARCH64_VALID_JUMP_TARGET stp d8, d9, [sp, #-0x40]! stp d10, d11, [sp, #0x10] stp d12, d13, [sp, #0x20] stp d14, d15, [sp, #0x30] add x11, x11, #16 // increments dup v18.8h, w7 // -dy movi v17.16b, #1 add x3, x3, #1 // Skip past left[0] ld1 {v14.8h}, [x11] // {8,9,10,11,12,13,14,15} mul v16.8h, v31.8h, v18.8h // {0,1,2,3,4,5,6,7}* -dy mul v19.8h, v14.8h, v18.8h // {8,9,10,11,12,13,14,15}* -dy movi v25.16b, #0x3e add v16.8h, v16.8h, v18.8h // -= dy add v18.8h, v19.8h, v18.8h xtn v31.8b, v31.8h // {0,1,2,3,4,5,6,7} xtn2 v31.16b, v14.8h // {8,9,10,11,12,13,14,15} // Worst case height is 64. ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x3] // left[] ld1r {v15.16b}, [x2] // left[0] == top[0] movi v26.16b, #64 movi v19.16b, #2 xtn v27.8b, v16.8h // (uint8_t)ypos xtn2 v27.16b, v18.8h shrn v29.8b, v16.8h, #6 // ypos >> 6 shrn2 v29.16b, v18.8h, #6 mov v18.16b, v15.16b // left[0] and v27.16b, v27.16b, v25.16b // frac_y // Cut corners here; for the first row we don't expect to need to // read outside of v0. 
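        // tbx (rather than tbl) leaves the pre-loaded left[0]/top[0] value in any lane
        // whose index falls outside the table, instead of zeroing it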
tbx v18.16b, {v0.16b}, v29.16b // left[base_y] add v30.16b, v29.16b, v19.16b // base_y + 2 add v29.16b, v29.16b, v17.16b // base_y + 1 sub v28.16b, v26.16b, v27.16b // 64 - frac_y movi v24.16b, #2 // 2 16: asr w9, w8, #6 // base_x dup v16.8h, w8 // xpos sub w8, w8, w6 // xpos -= dx cmp w9, #-16 // base_x <= -16 asr w11, w8, #6 // base_x b.le 169f dup v17.8h, w8 // xpos add x9, x2, w9, sxtw add x11, x2, w11, sxtw ld1 {v4.16b, v5.16b}, [x9] // top[base_x] mov v19.16b, v15.16b // left[0] ld1 {v6.16b, v7.16b}, [x11] tbx v19.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+1] mov v20.16b, v15.16b // left[0] shrn v21.8b, v16.8h, #6 // first base_x shrn v22.8b, v17.8h, #6 xtn v16.8b, v16.8h // (uint8_t)xpos xtn v17.8b, v17.8h tbx v20.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v30.16b // left[base_y+2] trn1 v21.2d, v21.2d, v21.2d // first base_x trn1 v22.2d, v22.2d, v22.2d trn1 v16.2d, v16.2d, v16.2d // (uint8_t)xpos trn1 v17.2d, v17.2d, v17.2d ext v5.16b, v4.16b, v5.16b, #1 // top[base_x+1] ext v7.16b, v6.16b, v7.16b, #1 and v16.16b, v16.16b, v25.16b // frac_x and v17.16b, v17.16b, v25.16b umull v10.8h, v18.8b, v28.8b // left[base_y]*(64-frac_y) umlal v10.8h, v19.8b, v27.8b // + left[base_y+1]*frac_y sub v8.16b, v26.16b, v16.16b // 64 - frac_x sub v9.16b, v26.16b, v17.16b umull2 v11.8h, v18.16b, v28.16b umlal2 v11.8h, v19.16b, v27.16b add v21.16b, v21.16b, v31.16b // actual base_x add v22.16b, v22.16b, v31.16b umull v12.8h, v19.8b, v28.8b umlal v12.8h, v20.8b, v27.8b umull2 v13.8h, v19.16b, v28.16b umlal2 v13.8h, v20.16b, v27.16b rshrn v10.8b, v10.8h, #6 rshrn2 v10.16b, v11.8h, #6 rshrn v11.8b, v12.8h, #6 rshrn2 v11.16b, v13.8h, #6 umull v12.8h, v4.8b, v8.8b // top[base_x]-*(64-frac_x) umlal v12.8h, v5.8b, v16.8b // + top[base_x+1]*frac_x umull2 v13.8h, v4.16b, v8.16b umlal2 v13.8h, v5.16b, v16.16b umull v14.8h, v6.8b, v9.8b umlal v14.8h, v7.8b, v17.8b umull2 v18.8h, v6.16b, v9.16b umlal2 v18.8h, v7.16b, v17.16b cmge v21.16b, v21.16b, #0 cmge v22.16b, v22.16b, #0 rshrn v12.8b, v12.8h, #6 rshrn2 v12.16b, v13.8h, #6 rshrn v13.8b, v14.8h, #6 rshrn2 v13.16b, v18.8h, #6 bit v10.16b, v12.16b, v21.16b bit v11.16b, v13.16b, v22.16b st1 {v10.16b}, [x0], x1 subs w5, w5, #2 sub w8, w8, w6 // xpos -= dx st1 {v11.16b}, [x0], x1 b.le 9f mov v18.16b, v20.16b add v29.16b, v29.16b, v24.16b // base_y += 2 add v30.16b, v30.16b, v24.16b // base_y += 2 b 16b 169: mov v19.16b, v15.16b mov v20.16b, v15.16b tbx v19.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+1] tbx v20.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v30.16b // left[base_y+2] umull v4.8h, v18.8b, v28.8b // left[base_y]*(64-frac_y) umlal v4.8h, v19.8b, v27.8b // + left[base_y+1]*frac_y umull2 v5.8h, v18.16b, v28.16b umlal2 v5.8h, v19.16b, v27.16b umull v6.8h, v19.8b, v28.8b umlal v6.8h, v20.8b, v27.8b umull2 v7.8h, v19.16b, v28.16b umlal2 v7.8h, v20.16b, v27.16b rshrn v4.8b, v4.8h, #6 rshrn2 v4.16b, v5.8h, #6 rshrn v5.8b, v6.8h, #6 rshrn2 v5.16b, v7.8h, #6 st1 {v4.16b}, [x0], x1 subs w5, w5, #2 st1 {v5.16b}, [x0], x1 b.le 9f mov v18.16b, v20.16b add v29.16b, v29.16b, v24.16b // base_y += 2 add v30.16b, v30.16b, v24.16b // base_y += 2 b 169b 9: ldp d14, d15, [sp, #0x30] ldp d12, d13, [sp, #0x20] ldp d10, d11, [sp, #0x10] ldp d8, d9, [sp], 0x40 ret 320: 640: AARCH64_VALID_JUMP_TARGET stp d8, d9, [sp, #-0x40]! 
stp d10, d11, [sp, #0x10] stp d12, d13, [sp, #0x20] stp d14, d15, [sp, #0x30] add x11, x11, #16 // increments dup v25.8h, w7 // -dy add x3, x3, #1 // Skip past left[0] ld1 {v14.8h}, [x11] // {8,9,10,11,12,13,14,15} add x13, x0, x1 // alternating row lsl x1, x1, #1 // stride *= 2 sub x1, x1, w4, uxtw // stride -= width movi v11.8h, #8 mul v26.8h, v31.8h, v25.8h // {0,1,2,3,4,5,6,7}* -dy add v26.8h, v26.8h, v25.8h // -= dy mul v25.8h, v25.8h, v11.8h // -8*dy xtn v31.8b, v31.8h // {0,1,2,3,4,5,6,7} xtn2 v31.16b, v14.8h // {8,9,10,11,12,13,14,15} // Worst case height is 64. ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x3] // left[] ld1r {v15.16b}, [x2] // left[0] == top[0] mov w12, w4 // orig w neg w14, w4 // -w 1: mov v23.16b, v26.16b // reset ypos asr w9, w8, #6 // base_x dup v16.8h, w8 // xpos sub w8, w8, w6 // xpos -= dx cmp w9, w14 // base_x <= -w asr w11, w8, #6 // base_x b.le 329f dup v17.8h, w8 // xpos sub w8, w8, w6 // xpos -= dx add x9, x2, w9, sxtw add x11, x2, w11, sxtw sqshrn v21.8b, v16.8h, #6 // first base_x sqshrn v22.8b, v17.8h, #6 xtn v16.8b, v16.8h // (uint8_t)xpos xtn v17.8b, v17.8h ld1 {v4.16b}, [x9], #16 // top[base_x] ld1 {v6.16b}, [x11], #16 trn1 v21.2d, v21.2d, v21.2d // first base_x trn1 v22.2d, v22.2d, v22.2d trn1 v16.2d, v16.2d, v16.2d // (uint8_t)xpos trn1 v17.2d, v17.2d, v17.2d movi v10.16b, #0x3e movi v11.16b, #64 and v16.16b, v16.16b, v10.16b // frac_x and v17.16b, v17.16b, v10.16b sub v8.16b, v11.16b, v16.16b // 64 - frac_x sub v9.16b, v11.16b, v17.16b add v21.16b, v21.16b, v31.16b // actual base_x add v22.16b, v22.16b, v31.16b 2: add v13.8h, v23.8h, v25.8h // ypos -= 8*dy movi v12.16b, #64 movi v20.16b, #2 movi v10.16b, #0x3e smov w10, v22.b[0] xtn v27.8b, v23.8h // (uint8_t)ypos xtn2 v27.16b, v13.8h shrn v29.8b, v23.8h, #6 // ypos >> 6 shrn2 v29.16b, v13.8h, #6 cmp w10, #0 // base_x (bottom left) >= 0 and v27.16b, v27.16b, v10.16b // frac_y mov v18.16b, v15.16b // left[0] b.ge 4f add v23.8h, v13.8h, v25.8h // ypos -= 8*dy movi v13.16b, #1 tbx v18.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y] add v29.16b, v29.16b, v13.16b // base_y + 1 mov v19.16b, v15.16b // left[0] sub v28.16b, v12.16b, v27.16b // 64 - frac_y ld1 {v5.16b}, [x9], #16 // top[base_x] ld1 {v7.16b}, [x11], #16 tbx v19.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+1] add v29.16b, v29.16b, v13.16b // base_y + 2 mov v20.16b, v15.16b // left[0] tbx v20.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+2] umull v10.8h, v18.8b, v28.8b // left[base_y]*(64-frac_y) umlal v10.8h, v19.8b, v27.8b // + left[base_y+1]*frac_y umull2 v11.8h, v18.16b, v28.16b umlal2 v11.8h, v19.16b, v27.16b umull v12.8h, v19.8b, v28.8b umlal v12.8h, v20.8b, v27.8b umull2 v13.8h, v19.16b, v28.16b umlal2 v13.8h, v20.16b, v27.16b ext v18.16b, v4.16b, v5.16b, #1 // top[base_x+1] ext v19.16b, v6.16b, v7.16b, #1 rshrn v10.8b, v10.8h, #6 rshrn2 v10.16b, v11.8h, #6 rshrn v11.8b, v12.8h, #6 rshrn2 v11.16b, v13.8h, #6 umull v12.8h, v4.8b, v8.8b // top[base_x]-*(64-frac_x) umlal v12.8h, v18.8b, v16.8b // + top[base_x+1]*frac_x umull2 v13.8h, v4.16b, v8.16b umlal2 v13.8h, v18.16b, v16.16b umull v14.8h, v6.8b, v9.8b umlal v14.8h, v19.8b, v17.8b umull2 v20.8h, v6.16b, v9.16b umlal2 v20.8h, v19.16b, v17.16b cmge v18.16b, v21.16b, #0 cmge v19.16b, v22.16b, #0 rshrn v12.8b, v12.8h, #6 rshrn2 v12.16b, v13.8h, #6 rshrn v13.8b, v14.8h, #6 rshrn2 v13.16b, v20.8h, #6 bit v10.16b, v12.16b, v18.16b bit v11.16b, v13.16b, v19.16b st1 {v10.16b}, [x0], #16 subs w4, w4, #16 st1 {v11.16b}, [x13], #16 b.le 3f movi 
v10.16b, #16 mov v4.16b, v5.16b mov v6.16b, v7.16b add v21.16b, v21.16b, v10.16b // base_x += 16 add v22.16b, v22.16b, v10.16b b 2b 3: subs w5, w5, #2 b.le 9f movi v10.8h, #128 add x0, x0, x1 add x13, x13, x1 mov w4, w12 // reset w add v26.8h, v26.8h, v10.8h // ypos += 2*(1<<6) b 1b 4: // The rest of the row only predicted from top[] ld1 {v5.16b}, [x9], #16 // top[base_x] ld1 {v7.16b}, [x11], #16 ext v18.16b, v4.16b, v5.16b, #1 // top[base_x+1] ext v19.16b, v6.16b, v7.16b, #1 umull v12.8h, v4.8b, v8.8b // top[base_x]-*(64-frac_x) umlal v12.8h, v18.8b, v16.8b // + top[base_x+1]*frac_x umull2 v13.8h, v4.16b, v8.16b umlal2 v13.8h, v18.16b, v16.16b umull v14.8h, v6.8b, v9.8b umlal v14.8h, v19.8b, v17.8b umull2 v20.8h, v6.16b, v9.16b umlal2 v20.8h, v19.16b, v17.16b rshrn v12.8b, v12.8h, #6 rshrn2 v12.16b, v13.8h, #6 rshrn v13.8b, v14.8h, #6 rshrn2 v13.16b, v20.8h, #6 st1 {v12.16b}, [x0], #16 subs w4, w4, #16 st1 {v13.16b}, [x13], #16 b.le 3b mov v4.16b, v5.16b mov v6.16b, v7.16b b 4b 329: // The rest of the block only predicted from left[] add x1, x1, w4, uxtw // restore stride mov w12, w5 // orig remaining h 1: add v13.8h, v23.8h, v25.8h // ypos -= 8*dy movi v12.16b, #64 movi v10.16b, #0x3e xtn v27.8b, v23.8h // (uint8_t)ypos xtn2 v27.16b, v13.8h shrn v29.8b, v23.8h, #6 // ypos >> 6 shrn2 v29.16b, v13.8h, #6 and v27.16b, v27.16b, v10.16b // frac_y mov v18.16b, v15.16b // left[0] add v23.8h, v13.8h, v25.8h // ypos -= 8*dy movi v21.16b, #1 tbx v18.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y] add v29.16b, v29.16b, v21.16b // base_y + 1 sub v28.16b, v12.16b, v27.16b // 64 - frac_y 2: mov v19.16b, v15.16b // left[0] tbx v19.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+1] add v29.16b, v29.16b, v21.16b // base_y + 2 mov v20.16b, v15.16b // left[0] tbx v20.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+2] add v29.16b, v29.16b, v21.16b // next base_y umull v10.8h, v18.8b, v28.8b // left[base_y]*(64-frac_y) umlal v10.8h, v19.8b, v27.8b // + left[base_y+1]*frac_y umull2 v11.8h, v18.16b, v28.16b umlal2 v11.8h, v19.16b, v27.16b umull v12.8h, v19.8b, v28.8b umlal v12.8h, v20.8b, v27.8b umull2 v13.8h, v19.16b, v28.16b umlal2 v13.8h, v20.16b, v27.16b rshrn v10.8b, v10.8h, #6 rshrn2 v10.16b, v11.8h, #6 rshrn v11.8b, v12.8h, #6 rshrn2 v11.16b, v13.8h, #6 st1 {v10.16b}, [x0], x1 subs w5, w5, #2 st1 {v11.16b}, [x13], x1 b.le 3f mov v18.16b, v20.16b b 2b 3: subs w4, w4, #16 b.le 9f lsr x1, x1, #1 msub x0, x1, x12, x0 // ptr -= h * stride msub x13, x1, x12, x13 lsl x1, x1, #1 add x0, x0, #16 add x13, x13, #16 mov w5, w12 // reset h b 1b 9: ldp d14, d15, [sp, #0x30] ldp d12, d13, [sp, #0x20] ldp d10, d11, [sp, #0x10] ldp d8, d9, [sp], 0x40 ret L(ipred_z2_fill1_tbl): .hword L(ipred_z2_fill1_tbl) - 640b .hword L(ipred_z2_fill1_tbl) - 320b .hword L(ipred_z2_fill1_tbl) - 160b .hword L(ipred_z2_fill1_tbl) - 80b .hword L(ipred_z2_fill1_tbl) - 40b endfunc function ipred_z2_fill2_8bpc_neon, export=1 cmp w4, #8 mov w8, #(2 << 6) // xpos = 2 << 6 sub w8, w8, w6 // xpos -= dx movrel x11, increments ld1 {v31.8h}, [x11] // increments neg w7, w7 // -dy b.eq 80f 40: dup v30.4h, w7 // -dy movi v17.8b, #1 mul v16.4h, v31.4h, v30.4h // {0,1,2,3}* -dy movi v25.16b, #0x3e add v30.4h, v16.4h, v30.4h // -= dy xtn v31.8b, v31.8h // {0,1,2,3} // For upsample_top, w <= 8 and h <= 8; we may need up to h+1 elements // from left. 
ld1 {v0.16b}, [x3] // left[] movi v26.16b, #64 movi v19.16b, #2 xtn v27.8b, v30.8h // (uint8_t)ypos shrn v29.8b, v30.8h, #6 // ypos >> 6 and v27.8b, v27.8b, v25.8b // frac_y add v29.8b, v29.8b, v17.8b // base_y = (ypos >> 6) + 1 add v30.8b, v29.8b, v17.8b // base_y + 1 add v28.8b, v29.8b, v19.8b // base_y + 2 tbl v16.8b, {v0.16b}, v29.8b // left[base_y] trn1 v30.2s, v30.2s, v28.2s // base_y + 1, base_y + 2 sub v28.8b, v26.8b, v27.8b // 64 - frac_y trn1 v31.2s, v31.2s, v31.2s // {0,1,2,3,0,1,2,3} trn1 v27.2s, v27.2s, v27.2s // frac_y trn1 v28.2s, v28.2s, v28.2s // 64 - frac_y movi v29.8b, #2 add v31.8b, v31.8b, v31.8b // {0,2,4,6,0,2,4,6} 4: asr w9, w8, #6 // base_x dup v6.4h, w8 // xpos sub w8, w8, w6 // xpos -= dx cmp w9, #-8 // base_x <= -8 asr w11, w8, #6 // base_x b.le 49f dup v7.4h, w8 // xpos ldr d2, [x2, w9, sxtw] // top[base_x] ldr d4, [x2, w11, sxtw] trn1 v6.2d, v6.2d, v7.2d // xpos tbl v17.8b, {v0.16b}, v30.8b // left[base_y+1], left[base_y+2] shrn v20.8b, v6.8h, #6 // first base_x for each row xtn v6.8b, v6.8h // (uint8_t)xpos uzp2 v3.8b, v2.8b, v4.8b // top[base_x+1] uzp1 v2.8b, v2.8b, v4.8b // top[base_x] and v6.8b, v6.8b, v25.8b // frac_x trn1 v16.2s, v16.2s, v17.2s // left[base_y], left[base_y+1] sub v7.8b, v26.8b, v6.8b // 64 - frac_x add v20.8b, v20.8b, v31.8b // actual base_x umull v16.8h, v16.8b, v28.8b // left[base_y]*(64-frac_y) umlal v16.8h, v17.8b, v27.8b // + left[base_y+1]*frac_y umull v22.8h, v2.8b, v7.8b // top[base_x]-*(64-frac_x) umlal v22.8h, v3.8b, v6.8b // + top[base_x+1]*frac_x cmge v20.8b, v20.8b, #0 rshrn v16.8b, v16.8h, #6 rshrn v22.8b, v22.8h, #6 bit v16.8b, v22.8b, v20.8b st1 {v16.s}[0], [x0], x1 sub w8, w8, w6 // xpos -= dx subs w5, w5, #2 st1 {v16.s}[1], [x0], x1 b.le 9f ext v16.8b, v17.8b, v17.8b, #4 add v30.8b, v30.8b, v29.8b // base_y += 2 b 4b 49: tbl v17.8b, {v0.16b}, v30.8b // left[base_y+1], left[base_y+2] trn1 v16.2s, v16.2s, v17.2s // left[base_y], left[base_y+1] umull v18.8h, v16.8b, v28.8b // left[base_y]*(64-frac_t) umlal v18.8h, v17.8b, v27.8b // + left[base_y+1]*frac_y rshrn v18.8b, v18.8h, #6 st1 {v18.s}[0], [x0], x1 subs w5, w5, #2 st1 {v18.s}[1], [x0], x1 b.le 9f ext v16.8b, v17.8b, v17.8b, #4 add v30.8b, v30.8b, v29.8b // base_y += 2 b 49b 9: ret 80: dup v30.8h, w7 // -dy movi v17.8b, #1 mul v16.8h, v31.8h, v30.8h // {0,1,2,3,4,5,6,7}* -dy movi v25.16b, #0x3e add v30.8h, v16.8h, v30.8h // -= dy xtn v31.8b, v31.8h // {0,1,2,3,4,5,6,7} // For upsample_top, w <= 8 and h <= 8; we may need up to h+1 elements // from left. 
ld1 {v0.16b}, [x3] // left[] movi v26.16b, #64 movi v19.16b, #2 xtn v27.8b, v30.8h // (uint8_t)ypos shrn v29.8b, v30.8h, #6 // ypos >> 6 and v27.8b, v27.8b, v25.8b // frac_y add v29.8b, v29.8b, v17.8b // base_y = (ypos >> 6) + 1 tbl v18.8b, {v0.16b}, v29.8b // left[base_y] add v30.8b, v29.8b, v19.8b // base_y + 2 add v29.8b, v29.8b, v17.8b // base_y + 1 sub v28.8b, v26.8b, v27.8b // 64 - frac_y trn1 v31.2d, v31.2d, v31.2d // {0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7} movi v24.8b, #2 // 2 add v31.16b, v31.16b, v31.16b // {0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14} 8: asr w9, w8, #6 // base_x dup v16.8h, w8 // xpos sub w8, w8, w6 // xpos -= dx cmp w9, #-16 // base_x <= -16 asr w11, w8, #6 // base_x b.le 89f dup v17.8h, w8 // xpos ldr q4, [x2, w9, sxtw] // top[base_x] ldr q6, [x2, w11, sxtw] tbl v19.8b, {v0.16b}, v29.8b // left[base_y+1] shrn v21.8b, v16.8h, #6 // first base_x shrn2 v21.16b, v17.8h, #6 xtn v16.8b, v16.8h // (uint8_t)xpos xtn2 v16.16b, v17.8h tbl v20.8b, {v0.16b}, v30.8b // left[base_y+2] uzp2 v5.16b, v4.16b, v6.16b // top[base_x+1] uzp1 v4.16b, v4.16b, v6.16b // top[base_x] and v16.16b, v16.16b, v25.16b // frac_x sub v7.16b, v26.16b, v16.16b // 64 - frac_x add v21.16b, v21.16b, v31.16b // actual base_x umull v6.8h, v18.8b, v28.8b // left[base_y]*(64-frac_y) umlal v6.8h, v19.8b, v27.8b // + left[base_y+1]*frac_y umull v17.8h, v19.8b, v28.8b umlal v17.8h, v20.8b, v27.8b umull v22.8h, v4.8b, v7.8b // top[base_x]-*(64-frac_x) umlal v22.8h, v5.8b, v16.8b // + top[base_x+1]*frac_x umull2 v23.8h, v4.16b, v7.16b umlal2 v23.8h, v5.16b, v16.16b cmge v21.16b, v21.16b, #0 rshrn v6.8b, v6.8h, #6 rshrn2 v6.16b, v17.8h, #6 rshrn v22.8b, v22.8h, #6 rshrn2 v22.16b, v23.8h, #6 bit v6.16b, v22.16b, v21.16b st1 {v6.d}[0], [x0], x1 sub w8, w8, w6 // xpos -= dx subs w5, w5, #2 st1 {v6.d}[1], [x0], x1 b.le 9f mov v18.8b, v20.8b add v29.8b, v29.8b, v24.8b // base_y += 2 add v30.8b, v30.8b, v24.8b // base_y += 2 b 8b 89: tbl v19.8b, {v0.16b}, v29.8b // left[base_y+1] tbl v20.8b, {v0.16b}, v30.8b // left[base_y+2] umull v6.8h, v18.8b, v28.8b // left[base_y]*(64-frac_y) umlal v6.8h, v19.8b, v27.8b // + left[base_y+1]*frac_y umull v17.8h, v19.8b, v28.8b umlal v17.8h, v20.8b, v27.8b rshrn v6.8b, v6.8h, #6 rshrn2 v6.16b, v17.8h, #6 st1 {v6.d}[0], [x0], x1 subs w5, w5, #2 st1 {v6.d}[1], [x0], x1 b.le 9f mov v18.8b, v20.8b add v29.8b, v29.8b, v24.8b // base_y += 2 add v30.8b, v30.8b, v24.8b // base_y += 2 b 89b 9: ret endfunc function ipred_z2_fill3_8bpc_neon, export=1 cmp w4, #8 mov w8, #(1 << 6) // xpos = 1 << 6 sub w8, w8, w6 // xpos -= dx movrel x11, increments ld1 {v31.8h}, [x11] // increments neg w7, w7 // -dy b.eq 80f 40: dup v30.4h, w7 // -dy movi v17.8b, #1 mul v16.4h, v31.4h, v30.4h // {0,1,2,3}* -dy movi v25.16b, #0x3e add v30.4h, v16.4h, v30.4h // -= dy xtn v31.8b, v31.8h // {0,1,2,3} // For upsample_left, w <= 8 and h <= 8; we may need up to 2*h+1 elements. 
ld1 {v0.16b, v1.16b}, [x3] // left[] movi v26.16b, #64 movi v19.16b, #2 xtn v27.8b, v30.8h // (uint8_t)ypos shrn v29.8b, v30.8h, #6 // ypos >> 6 and v27.8b, v27.8b, v25.8b // frac_y add v29.8b, v29.8b, v19.8b // base_y = (ypos >> 6) + 2 add v30.8b, v29.8b, v17.8b // base_y + 1 add v28.8b, v29.8b, v19.8b // base_y + 2 trn1 v31.2s, v31.2s, v31.2s // {0,1,2,3,0,1,2,3} add v24.8b, v30.8b, v19.8b // base_y + 3 trn1 v29.2s, v29.2s, v28.2s // base_y + 0, base_y + 2 trn1 v30.2s, v30.2s, v24.2s // base_y + 1, base_y + 3 sub v28.8b, v26.8b, v27.8b // 64 - frac_y trn1 v27.2s, v27.2s, v27.2s // frac_y trn1 v28.2s, v28.2s, v28.2s // 64 - frac_y movi v24.8b, #4 4: asr w9, w8, #6 // base_x dup v6.4h, w8 // xpos sub w8, w8, w6 // xpos -= dx cmp w9, #-4 // base_x <= -4 asr w11, w8, #6 // base_x b.le 49f dup v7.4h, w8 // xpos ldr d2, [x2, w9, sxtw] // top[base_x] ldr d4, [x2, w11, sxtw] trn1 v6.2d, v6.2d, v7.2d // xpos tbl v16.8b, {v0.16b, v1.16b}, v29.8b // left[base_y+0], left[base_y+2] tbl v17.8b, {v0.16b, v1.16b}, v30.8b // left[base_y+1], left[base_y+3] shrn v20.8b, v6.8h, #6 // first base_x for each row xtn v6.8b, v6.8h // (uint8_t)xpos ext v3.8b, v2.8b, v2.8b, #1 // top[base_x+1] ext v5.8b, v4.8b, v4.8b, #1 and v6.8b, v6.8b, v25.8b // frac_x trn1 v2.2s, v2.2s, v4.2s // top[base_x] trn1 v3.2s, v3.2s, v5.2s // top[base_x+1] sub v7.8b, v26.8b, v6.8b // 64 - frac_x add v20.8b, v20.8b, v31.8b // actual base_x umull v16.8h, v16.8b, v28.8b // left[base_y]*(64-frac_y) umlal v16.8h, v17.8b, v27.8b // + left[base_y+1]*frac_y umull v22.8h, v2.8b, v7.8b // top[base_x]-*(64-frac_x) umlal v22.8h, v3.8b, v6.8b // + top[base_x+1]*frac_x cmge v20.8b, v20.8b, #0 rshrn v16.8b, v16.8h, #6 rshrn v22.8b, v22.8h, #6 bit v16.8b, v22.8b, v20.8b st1 {v16.s}[0], [x0], x1 sub w8, w8, w6 // xpos -= dx subs w5, w5, #2 st1 {v16.s}[1], [x0], x1 b.le 9f add v29.8b, v29.8b, v24.8b // base_y += 4 add v30.8b, v30.8b, v24.8b // base_y += 4 b 4b 49: tbl v16.8b, {v0.16b, v1.16b}, v29.8b // left[base_y+0], left[base_y+2] tbl v17.8b, {v0.16b, v1.16b}, v30.8b // left[base_y+1], left[base_y+3] umull v18.8h, v16.8b, v28.8b // left[base_y]*(64-frac_t) umlal v18.8h, v17.8b, v27.8b // + left[base_y+1]*frac_y rshrn v18.8b, v18.8h, #6 st1 {v18.s}[0], [x0], x1 subs w5, w5, #2 st1 {v18.s}[1], [x0], x1 b.le 9f add v29.8b, v29.8b, v24.8b // base_y += 4 add v30.8b, v30.8b, v24.8b // base_y += 4 b 49b 9: ret 80: dup v30.8h, w7 // -dy movi v17.8b, #1 mul v16.8h, v31.8h, v30.8h // {0,1,2,3,4,5,6,7}* -dy movi v25.16b, #0x3e add v30.8h, v16.8h, v30.8h // -= dy xtn v31.8b, v31.8h // {0,1,2,3,4,5,6,7} // For upsample_left, w <= 8 and h <= 8; we may need up to 2*h+1 elements. 
ld1 {v0.16b, v1.16b, v2.16b}, [x3] // left[] movi v26.16b, #64 movi v19.16b, #2 xtn v27.8b, v30.8h // (uint8_t)ypos shrn v29.8b, v30.8h, #6 // ypos >> 6 and v27.8b, v27.8b, v25.8b // frac_y add v29.8b, v29.8b, v19.8b // base_y = (ypos >> 6) + 2 add v28.8b, v29.8b, v17.8b // base_y + 1 add v30.8b, v29.8b, v19.8b // base_y + 2 trn1 v31.2d, v31.2d, v31.2d // {0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7} add v24.8b, v28.8b, v19.8b // base_y + 3 trn1 v29.2d, v29.2d, v30.2d // base_y + 0, base_y + 2 trn1 v30.2d, v28.2d, v24.2d // base_y + 1, base_y + 3 sub v28.8b, v26.8b, v27.8b // 64 - frac_y movi v24.16b, #4 trn1 v27.2d, v27.2d, v27.2d // frac_y trn1 v28.2d, v28.2d, v28.2d // 64 - frac_y 8: asr w9, w8, #6 // base_x dup v16.8h, w8 // xpos sub w8, w8, w6 // xpos -= dx cmp w9, #-8 // base_x <= -8 asr w11, w8, #6 // base_x b.le 89f dup v17.8h, w8 // xpos ldr q4, [x2, w9, sxtw] // top[base_x] ldr q6, [x2, w11, sxtw] tbl v18.16b, {v0.16b, v1.16b, v2.16b}, v29.16b // left[base_y+0], left[base_y+2] tbl v19.16b, {v0.16b, v1.16b, v2.16b}, v30.16b // left[base_y+1], left[base_y+3] shrn v21.8b, v16.8h, #6 // first base_x shrn2 v21.16b, v17.8h, #6 xtn v16.8b, v16.8h // (uint8_t)xpos xtn2 v16.16b, v17.8h ext v5.16b, v4.16b, v4.16b, #1 // top[base_x+1] ext v7.16b, v6.16b, v6.16b, #1 and v16.16b, v16.16b, v25.16b // frac_x trn1 v4.2d, v4.2d, v6.2d // top[base_x] trn1 v5.2d, v5.2d, v7.2d // top[base_x+1] sub v7.16b, v26.16b, v16.16b // 64 - frac_x add v21.16b, v21.16b, v31.16b // actual base_x umull v6.8h, v18.8b, v28.8b // left[base_y]*(64-frac_y) umlal v6.8h, v19.8b, v27.8b // + left[base_y+1]*frac_y umull2 v17.8h, v18.16b, v28.16b umlal2 v17.8h, v19.16b, v27.16b umull v22.8h, v4.8b, v7.8b // top[base_x]-*(64-frac_x) umlal v22.8h, v5.8b, v16.8b // + top[base_x+1]*frac_x umull2 v23.8h, v4.16b, v7.16b umlal2 v23.8h, v5.16b, v16.16b cmge v21.16b, v21.16b, #0 rshrn v6.8b, v6.8h, #6 rshrn2 v6.16b, v17.8h, #6 rshrn v22.8b, v22.8h, #6 rshrn2 v22.16b, v23.8h, #6 bit v6.16b, v22.16b, v21.16b st1 {v6.d}[0], [x0], x1 sub w8, w8, w6 // xpos -= dx subs w5, w5, #2 st1 {v6.d}[1], [x0], x1 b.le 9f add v29.16b, v29.16b, v24.16b // base_y += 4 add v30.16b, v30.16b, v24.16b // base_y += 4 b 8b 89: tbl v18.16b, {v0.16b, v1.16b, v2.16b}, v29.16b // left[base_y+0], left[base_y+2] tbl v19.16b, {v0.16b, v1.16b, v2.16b}, v30.16b // left[base_y+1], left[base_y+3] umull v6.8h, v18.8b, v28.8b // left[base_y]*(64-frac_y) umlal v6.8h, v19.8b, v27.8b // + left[base_y+1]*frac_y umull2 v17.8h, v18.16b, v28.16b umlal2 v17.8h, v19.16b, v27.16b rshrn v6.8b, v6.8h, #6 rshrn2 v6.16b, v17.8h, #6 st1 {v6.d}[0], [x0], x1 subs w5, w5, #2 st1 {v6.d}[1], [x0], x1 b.le 9f add v29.16b, v29.16b, v24.16b // base_y += 4 add v30.16b, v30.16b, v24.16b // base_y += 4 b 89b 9: ret endfunc // void ipred_z3_fill1_8bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const left, // const int width, const int height, // const int dy, const int max_base_y); function ipred_z3_fill1_8bpc_neon, export=1 cmp w6, #64 clz w9, w3 adr x8, L(ipred_z3_fill1_tbl) sub w9, w9, #25 ldrh w9, [x8, w9, uxtw #1] add x10, x2, w6, uxtw // left[max_base_y] sub x8, x8, w9, uxtw movrel x11, increments ld1r {v31.16b}, [x10] // padding ld1 {v30.8h}, [x11] // increments mov w7, w5 b.gt L(ipred_z3_fill1_large_h16) br x8 40: AARCH64_VALID_JUMP_TARGET dup v29.4h, w5 // dy mul v30.4h, v30.4h, v29.4h // {0,1,2,3,4,5,6,7}*dy movi v23.16b, #0x3e // Worst case max_base_y is width+height-1, for w=4, h=16, <= 32 ld1 {v0.16b, v1.16b}, [x2] // left[] add v30.4h, v29.4h, v30.4h // ypos movi v22.16b, #64 
movi v20.16b, #1 movi v21.16b, #2 xtn v24.8b, v30.8h // (uint8_t)ypos uqshrn v26.8b, v30.8h, #6 // base and v24.8b, v24.8b, v23.8b // frac mov v4.8b, v31.8b uqadd v27.8b, v26.8b, v20.8b // base + 1 uqadd v28.8b, v26.8b, v21.8b // base + 2 sub v25.8b, v22.8b, v24.8b // 64 - frac tbx v4.8b, {v0.16b, v1.16b}, v26.8b // left[base] trn1 v27.2s, v27.2s, v28.2s // base + 1, base + 2 trn1 v24.2s, v24.2s, v24.2s // frac trn1 v25.2s, v25.2s, v25.2s // 64 - frac 1: mov v5.8b, v31.8b tbx v5.8b, {v0.16b, v1.16b}, v27.8b // left[base+1], left[base+2] trn1 v4.2s, v4.2s, v5.2s // left[base], left[base+1] umull v16.8h, v4.8b, v25.8b // left[base]*(64-frac) umlal v16.8h, v5.8b, v24.8b // + left[base+1]*frac rshrn v16.8b, v16.8h, #6 st1 {v16.s}[0], [x0], x1 subs w4, w4, #2 st1 {v16.s}[1], [x0], x1 b.le 9f ext v4.8b, v5.8b, v5.8b, #4 uqadd v27.8b, v27.8b, v21.8b // base += 2 b 1b 9: ret 80: AARCH64_VALID_JUMP_TARGET dup v29.8h, w5 // dy mul v30.8h, v30.8h, v29.8h // {0,1,2,3,4,5,6,7}*dy movi v23.16b, #0x3e // Worst case max_base_y is width+height-1, for w=8, h=32, <= 48 ld1 {v0.16b, v1.16b, v2.16b}, [x2] // left[] add v30.8h, v29.8h, v30.8h // ypos movi v22.16b, #64 movi v20.16b, #1 movi v21.16b, #2 xtn v24.8b, v30.8h // (uint8_t)ypos uqshrn v26.8b, v30.8h, #6 // base and v24.8b, v24.8b, v23.8b // frac mov v4.8b, v31.8b uqadd v27.8b, v26.8b, v20.8b // base + 1 uqadd v28.8b, v26.8b, v21.8b // base + 2 sub v25.8b, v22.8b, v24.8b // 64 - frac tbx v4.8b, {v0.16b, v1.16b, v2.16b}, v26.8b // left[base] 1: mov v5.8b, v31.8b mov v6.8b, v31.8b tbx v5.8b, {v0.16b, v1.16b, v2.16b}, v27.8b // left[base+1] tbx v6.8b, {v0.16b, v1.16b, v2.16b}, v28.8b // left[base+2] umull v16.8h, v4.8b, v25.8b // left[base]*(64-frac) umlal v16.8h, v5.8b, v24.8b // + left[base+1]*frac umull v17.8h, v5.8b, v25.8b umlal v17.8h, v6.8b, v24.8b rshrn v16.8b, v16.8h, #6 rshrn v17.8b, v17.8h, #6 st1 {v16.8b}, [x0], x1 subs w4, w4, #2 st1 {v17.8b}, [x0], x1 b.le 9f mov v4.8b, v6.8b uqadd v27.8b, v27.8b, v21.8b // base += 2 uqadd v28.8b, v28.8b, v21.8b // base += 2 b 1b 9: ret 160: AARCH64_VALID_JUMP_TARGET dup v28.8h, w5 // dy shl v29.8h, v28.8h, #3 // 8*dy mul v30.8h, v30.8h, v28.8h // {0,1,2,3,4,5,6,7}*dy movi v23.16b, #0x3e // This is only executed if we've checked that max_base_y <= 64. 
ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x2] // left[] add v28.8h, v28.8h, v30.8h // ypos movi v22.16b, #64 movi v20.16b, #1 movi v21.16b, #2 add v29.8h, v28.8h, v29.8h // ypos + 8*dy xtn v24.8b, v28.8h // (uint8_t)ypos xtn2 v24.16b, v29.8h uqshrn v26.8b, v28.8h, #6 // base uqshrn2 v26.16b, v29.8h, #6 and v24.16b, v24.16b, v23.16b // frac mov v4.16b, v31.16b uqadd v27.16b, v26.16b, v20.16b // base + 1 uqadd v28.16b, v26.16b, v21.16b // base + 2 sub v25.16b, v22.16b, v24.16b // 64 - frac tbx v4.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v26.16b // left[base] 1: mov v5.16b, v31.16b mov v6.16b, v31.16b tbx v5.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v27.16b // left[base+1] tbx v6.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v28.16b // left[base+2] umull v16.8h, v4.8b, v25.8b // left[base]*(64-frac) umlal v16.8h, v5.8b, v24.8b // + left[base+1]*frac umull2 v17.8h, v4.16b, v25.16b umlal2 v17.8h, v5.16b, v24.16b umull v18.8h, v5.8b, v25.8b umlal v18.8h, v6.8b, v24.8b umull2 v19.8h, v5.16b, v25.16b umlal2 v19.8h, v6.16b, v24.16b rshrn v16.8b, v16.8h, #6 rshrn2 v16.16b, v17.8h, #6 rshrn v17.8b, v18.8h, #6 rshrn2 v17.16b, v19.8h, #6 st1 {v16.16b}, [x0], x1 subs w4, w4, #2 st1 {v17.16b}, [x0], x1 b.le 9f mov v4.16b, v6.16b uqadd v27.16b, v27.16b, v21.16b // base += 2 uqadd v28.16b, v28.16b, v21.16b // base += 2 b 1b 9: ret 320: 640: AARCH64_VALID_JUMP_TARGET dup v28.8h, w5 // dy mov w12, w3 add x13, x0, x1 shl v29.8h, v28.8h, #3 // 8*dy mul v30.8h, v30.8h, v28.8h // {0,1,2,3,4,5,6,7}*dy movi v23.16b, #0x3e lsl x1, x1, #1 sub x1, x1, w3, uxtw add v30.8h, v28.8h, v30.8h // ypos // This is only executed if we've checked that max_base_y <= 64. ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x2] // left[] movi v22.16b, #64 movi v20.16b, #1 movi v21.16b, #2 1: mov v26.16b, v30.16b // reset ypos 2: add v27.8h, v26.8h, v29.8h // ypos + 8*dy uqshrn v16.8b, v26.8h, #6 // base uqshrn2 v16.16b, v27.8h, #6 xtn v24.8b, v26.8h // (uint8_t)ypos xtn2 v24.16b, v27.8h umov w14, v16.b[0] and v24.16b, v24.16b, v23.16b // frac uqadd v17.16b, v16.16b, v20.16b // base + 1 cmp w14, w6 // base >= max_base_y uqadd v18.16b, v16.16b, v21.16b // base + 2 sub v25.16b, v22.16b, v24.16b // 64 - frac b.ge 4f mov v4.16b, v31.16b mov v5.16b, v31.16b mov v6.16b, v31.16b tbx v4.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v16.16b // left[base] tbx v5.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v17.16b // left[base+1] tbx v6.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v18.16b // left[base+2] subs w3, w3, #16 umull v16.8h, v4.8b, v25.8b // left[base]*(64-frac) umlal v16.8h, v5.8b, v24.8b // + left[base+1]*frac umull2 v17.8h, v4.16b, v25.16b umlal2 v17.8h, v5.16b, v24.16b umull v18.8h, v5.8b, v25.8b umlal v18.8h, v6.8b, v24.8b umull2 v19.8h, v5.16b, v25.16b umlal2 v19.8h, v6.16b, v24.16b rshrn v16.8b, v16.8h, #6 rshrn2 v16.16b, v17.8h, #6 rshrn v17.8b, v18.8h, #6 rshrn2 v17.16b, v19.8h, #6 st1 {v16.16b}, [x0], #16 st1 {v17.16b}, [x13], #16 b.le 3f add v26.8h, v27.8h, v29.8h // ypos += 16*dy b 2b 3: subs w4, w4, #2 b.le 9f movi v16.8h, #128 add x0, x0, x1 add x13, x13, x1 add v30.8h, v30.8h, v16.8h // ypos = dy + y*(1<<6)*2 mov w3, w12 b 1b 4: subs w3, w3, #16 st1 {v31.16b}, [x0], #16 st1 {v31.16b}, [x13], #16 b.gt 4b b 3b 9: ret L(ipred_z3_fill1_large_h16): // Fallback case for max_base_y > 64; similar to the z1 // implementation. This does the filtering vertically, filling out // a 2x pixel column at a time. 
mov w15, #64 add x13, x0, x1 lsl x1, x1, #1 mov w12, w4 1: lsr w8, w7, #6 // base and w9, w7, #0x3e // frac add w7, w7, w5 // ypos += dy cmp w8, w6 // base >= max_base_y lsr w10, w7, #6 // base and w11, w7, #0x3e // frac b.ge ipred_z3_fill_padding_neon add x8, x2, w8, uxtw add x10, x2, w10, uxtw dup v4.16b, w9 // frac dup v5.16b, w11 ld1 {v0.16b, v1.16b}, [x8], #32 // left[base] ld1 {v2.16b, v3.16b}, [x10], #32 sub w9, w15, w9 // 64 - frac sub w11, w15, w11 dup v6.16b, w9 // 64 - frac dup v7.16b, w11 add w7, w7, w5 // ypos += dy 2: ext v16.16b, v0.16b, v1.16b, #1 // left[base+1] ext v17.16b, v2.16b, v3.16b, #1 subs w4, w4, #16 umull v18.8h, v16.8b, v4.8b // left[base+1]*frac umlal v18.8h, v0.8b, v6.8b // + left[base]*(64-frac) umull2 v19.8h, v16.16b, v4.16b umlal2 v19.8h, v0.16b, v6.16b umull v20.8h, v17.8b, v5.8b umlal v20.8h, v2.8b, v7.8b umull2 v21.8h, v17.16b, v5.16b umlal2 v21.8h, v2.16b, v7.16b rshrn v16.8b, v18.8h, #6 rshrn2 v16.16b, v19.8h, #6 rshrn v17.8b, v20.8h, #6 rshrn2 v17.16b, v21.8h, #6 zip1 v18.16b, v16.16b, v17.16b zip2 v19.16b, v16.16b, v17.16b st1 {v18.h}[0], [x0], x1 st1 {v18.h}[1], [x13], x1 st1 {v18.h}[2], [x0], x1 st1 {v18.h}[3], [x13], x1 st1 {v18.h}[4], [x0], x1 st1 {v18.h}[5], [x13], x1 st1 {v18.h}[6], [x0], x1 st1 {v18.h}[7], [x13], x1 st1 {v19.h}[0], [x0], x1 st1 {v19.h}[1], [x13], x1 st1 {v19.h}[2], [x0], x1 st1 {v19.h}[3], [x13], x1 st1 {v19.h}[4], [x0], x1 st1 {v19.h}[5], [x13], x1 st1 {v19.h}[6], [x0], x1 st1 {v19.h}[7], [x13], x1 b.le 3f mov v0.16b, v1.16b ld1 {v1.16b}, [x8], #16 // left[base] mov v2.16b, v3.16b ld1 {v3.16b}, [x10], #16 b 2b 3: subs w3, w3, #2 b.le 9f lsr x1, x1, #1 msub x0, x1, x12, x0 // ptr -= h * stride msub x13, x1, x12, x13 lsl x1, x1, #1 add x0, x0, #2 add x13, x13, #2 mov w4, w12 b 1b 9: ret L(ipred_z3_fill1_tbl): .hword L(ipred_z3_fill1_tbl) - 640b .hword L(ipred_z3_fill1_tbl) - 320b .hword L(ipred_z3_fill1_tbl) - 160b .hword L(ipred_z3_fill1_tbl) - 80b .hword L(ipred_z3_fill1_tbl) - 40b endfunc function ipred_z3_fill_padding_neon, export=0 cmp w3, #16 adr x8, L(ipred_z3_fill_padding_tbl) b.gt L(ipred_z3_fill_padding_wide) // w3 = remaining width, w4 = constant height mov w12, w4 1: // Fill a WxH rectangle with padding. W can be any number; // this fills the exact width by filling in the largest // power of two in the remaining width, and repeating. 
clz w9, w3 sub w9, w9, #25 ldrh w9, [x8, w9, uxtw #1] sub x9, x8, w9, uxtw br x9 2: st1 {v31.h}[0], [x0], x1 subs w4, w4, #4 st1 {v31.h}[0], [x13], x1 st1 {v31.h}[0], [x0], x1 st1 {v31.h}[0], [x13], x1 b.gt 2b subs w3, w3, #2 lsr x1, x1, #1 msub x0, x1, x12, x0 // ptr -= h * stride msub x13, x1, x12, x13 b.le 9f lsl x1, x1, #1 add x0, x0, #2 add x13, x13, #2 mov w4, w12 b 1b 4: st1 {v31.s}[0], [x0], x1 subs w4, w4, #4 st1 {v31.s}[0], [x13], x1 st1 {v31.s}[0], [x0], x1 st1 {v31.s}[0], [x13], x1 b.gt 4b subs w3, w3, #4 lsr x1, x1, #1 msub x0, x1, x12, x0 // ptr -= h * stride msub x13, x1, x12, x13 b.le 9f lsl x1, x1, #1 add x0, x0, #4 add x13, x13, #4 mov w4, w12 b 1b 8: st1 {v31.8b}, [x0], x1 subs w4, w4, #4 st1 {v31.8b}, [x13], x1 st1 {v31.8b}, [x0], x1 st1 {v31.8b}, [x13], x1 b.gt 4b subs w3, w3, #8 lsr x1, x1, #1 msub x0, x1, x12, x0 // ptr -= h * stride msub x13, x1, x12, x13 b.le 9f lsl x1, x1, #1 add x0, x0, #8 add x13, x13, #8 mov w4, w12 b 1b 16: 32: 64: st1 {v31.16b}, [x0], x1 subs w4, w4, #4 st1 {v31.16b}, [x13], x1 st1 {v31.16b}, [x0], x1 st1 {v31.16b}, [x13], x1 b.gt 4b subs w3, w3, #16 lsr x1, x1, #1 msub x0, x1, x12, x0 // ptr -= h * stride msub x13, x1, x12, x13 b.le 9f lsl x1, x1, #1 add x0, x0, #16 add x13, x13, #16 mov w4, w12 b 1b 9: ret L(ipred_z3_fill_padding_tbl): .hword L(ipred_z3_fill_padding_tbl) - 64b .hword L(ipred_z3_fill_padding_tbl) - 32b .hword L(ipred_z3_fill_padding_tbl) - 16b .hword L(ipred_z3_fill_padding_tbl) - 8b .hword L(ipred_z3_fill_padding_tbl) - 4b .hword L(ipred_z3_fill_padding_tbl) - 2b L(ipred_z3_fill_padding_wide): // Fill a WxH rectangle with padding, with W > 16. lsr x1, x1, #1 mov w12, w3 sub x1, x1, w3, uxtw 1: ands w5, w3, #15 b.eq 2f // If the width isn't aligned to 16, first do one 16 byte write // and align the start pointer. sub w3, w3, w5 st1 {v31.16b}, [x0] add x0, x0, w5, uxtw 2: // Fill the rest of the line with aligned 16 byte writes. subs w3, w3, #16 st1 {v31.16b}, [x0], #16 b.gt 2b subs w4, w4, #1 add x0, x0, x1 b.le 9f mov w3, w12 b 1b 9: ret endfunc function ipred_z3_fill2_8bpc_neon, export=1 cmp w3, #8 add x10, x2, w6, uxtw // left[max_base_y] movrel x11, increments ld1r {v31.16b}, [x10] // padding ld1 {v30.8h}, [x11] // increments b.eq 80f 40: // w == 4 dup v29.4h, w5 // dy mul v30.4h, v30.4h, v29.4h // {0,1,2,3,4,5,6,7}*dy movi v23.16b, #0x3e // Worst case max_base_y is 2*(width+height)-2, but width+height <= 16, // so max_base_y <= 32. 
ld1 {v0.16b, v1.16b}, [x2] // left[] add v30.4h, v29.4h, v30.4h // ypos movi v22.16b, #64 movi v20.16b, #1 movi v21.16b, #2 xtn v24.8b, v30.8h // (uint8_t)ypos uqshrn v26.8b, v30.8h, #6 // base and v24.8b, v24.8b, v23.8b // frac uqadd v27.8b, v26.8b, v20.8b // base + 1 uqadd v28.8b, v26.8b, v21.8b // base + 2 sub v25.8b, v22.8b, v24.8b // 64 - frac uqadd v29.8b, v27.8b, v21.8b // base + 3 trn1 v24.2s, v24.2s, v24.2s // frac trn1 v26.2s, v26.2s, v28.2s // base + 0, base + 2 trn1 v27.2s, v27.2s, v29.2s // base + 1, base + 3 trn1 v25.2s, v25.2s, v25.2s // 64 - frac movi v21.16b, #4 1: mov v4.8b, v31.8b mov v5.8b, v31.8b tbx v4.8b, {v0.16b, v1.16b}, v26.8b // left[base], left[base+2] tbx v5.8b, {v0.16b, v1.16b}, v27.8b // left[base+1], left[base+3] umull v16.8h, v4.8b, v25.8b // left[base]*(64-frac) umlal v16.8h, v5.8b, v24.8b // + left[base+1]*frac rshrn v16.8b, v16.8h, #6 st1 {v16.s}[0], [x0], x1 subs w4, w4, #2 st1 {v16.s}[1], [x0], x1 b.le 9f uqadd v26.8b, v26.8b, v21.8b // base += 4 uqadd v27.8b, v27.8b, v21.8b // base += 4 b 1b 9: ret 80: // w == 8 dup v29.8h, w5 // dy mul v30.8h, v30.8h, v29.8h // {0,1,2,3,4,5,6,7}*dy movi v23.16b, #0x3e // Worst case max_base_y is 2*(width+height)-2, but width+height <= 16, // so max_base_y <= 32. ld1 {v0.16b, v1.16b}, [x2] // left[] add v30.8h, v29.8h, v30.8h // ypos movi v22.16b, #64 movi v20.16b, #1 movi v21.16b, #2 xtn v24.8b, v30.8h // (uint8_t)ypos uqshrn v26.8b, v30.8h, #6 // base and v24.8b, v24.8b, v23.8b // frac uqadd v27.8b, v26.8b, v20.8b // base + 1 uqadd v28.8b, v26.8b, v21.8b // base + 2 sub v25.8b, v22.8b, v24.8b // 64 - frac uqadd v29.8b, v27.8b, v21.8b // base + 3 trn1 v24.2d, v24.2d, v24.2d // frac trn1 v26.2d, v26.2d, v28.2d // base + 0, base + 2 trn1 v27.2d, v27.2d, v29.2d // base + 1, base + 3 trn1 v25.2d, v25.2d, v25.2d // 64 - frac movi v21.16b, #4 1: mov v4.16b, v31.16b mov v5.16b, v31.16b tbx v4.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v26.16b // left[base], left[base+2] tbx v5.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v27.16b // left[base+1], left[base+3] umull v16.8h, v4.8b, v25.8b // left[base]*(64-frac) umlal v16.8h, v5.8b, v24.8b // + left[base+1]*frac umull2 v17.8h, v4.16b, v25.16b umlal2 v17.8h, v5.16b, v24.16b rshrn v16.8b, v16.8h, #6 rshrn v17.8b, v17.8h, #6 st1 {v16.8b}, [x0], x1 subs w4, w4, #2 st1 {v17.8b}, [x0], x1 b.le 9f uqadd v26.16b, v26.16b, v21.16b // base += 4 uqadd v27.16b, v27.16b, v21.16b // base += 4 b 1b 9: ret endfunc // void ipred_filter_8bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, const int filt_idx, // const int max_width, const int max_height); function ipred_filter_8bpc_neon, export=1 and w5, w5, #511 movrel x6, X(filter_intra_taps) lsl w5, w5, #6 add x6, x6, w5, uxtw ld1 {v16.8b, v17.8b, v18.8b, v19.8b}, [x6], #32 clz w9, w3 adr x5, L(ipred_filter_tbl) ld1 {v20.8b, v21.8b, v22.8b}, [x6] sub w9, w9, #26 ldrh w9, [x5, w9, uxtw #1] sxtl v16.8h, v16.8b sxtl v17.8h, v17.8b sub x5, x5, w9, uxtw sxtl v18.8h, v18.8b sxtl v19.8h, v19.8b add x6, x0, x1 lsl x1, x1, #1 sxtl v20.8h, v20.8b sxtl v21.8h, v21.8b sxtl v22.8h, v22.8b br x5 40: AARCH64_VALID_JUMP_TARGET ldur s0, [x2, #1] // top (0-3) sub x2, x2, #2 mov x7, #-2 uxtl v0.8h, v0.8b // top (0-3) 4: ld1 {v1.s}[0], [x2], x7 // left (0-1) + topleft (2) mul v2.8h, v17.8h, v0.h[0] // p1(top[0]) * filter(1) mla v2.8h, v18.8h, v0.h[1] // p2(top[1]) * filter(2) mla v2.8h, v19.8h, v0.h[2] // p3(top[2]) * filter(3) uxtl v1.8h, v1.8b // left (0-1) + topleft (2) mla v2.8h, v20.8h, v0.h[3] // 
p4(top[3]) * filter(4) mla v2.8h, v16.8h, v1.h[2] // p0(topleft) * filter(0) mla v2.8h, v21.8h, v1.h[1] // p5(left[0]) * filter(5) mla v2.8h, v22.8h, v1.h[0] // p6(left[1]) * filter(6) sqrshrun v2.8b, v2.8h, #4 subs w4, w4, #2 st1 {v2.s}[0], [x0], x1 uxtl v0.8h, v2.8b st1 {v2.s}[1], [x6], x1 ext v0.16b, v0.16b, v0.16b, #8 // move top from [4-7] to [0-3] b.gt 4b ret 80: AARCH64_VALID_JUMP_TARGET ldur d0, [x2, #1] // top (0-7) sub x2, x2, #2 mov x7, #-2 uxtl v0.8h, v0.8b // top (0-7) 8: ld1 {v1.s}[0], [x2], x7 // left (0-1) + topleft (2) mul v2.8h, v17.8h, v0.h[0] // p1(top[0]) * filter(1) mla v2.8h, v18.8h, v0.h[1] // p2(top[1]) * filter(2) mla v2.8h, v19.8h, v0.h[2] // p3(top[2]) * filter(3) uxtl v1.8h, v1.8b // left (0-1) + topleft (2) mla v2.8h, v20.8h, v0.h[3] // p4(top[3]) * filter(4) mla v2.8h, v16.8h, v1.h[2] // p0(topleft) * filter(0) mla v2.8h, v21.8h, v1.h[1] // p5(left[0]) * filter(5) mla v2.8h, v22.8h, v1.h[0] // p6(left[1]) * filter(6) mul v3.8h, v17.8h, v0.h[4] // p1(top[0]) * filter(1) mla v3.8h, v18.8h, v0.h[5] // p2(top[1]) * filter(2) mla v3.8h, v19.8h, v0.h[6] // p3(top[2]) * filter(3) sqrshrun v2.8b, v2.8h, #4 uxtl v1.8h, v2.8b // first block, in 16 bit mla v3.8h, v20.8h, v0.h[7] // p4(top[3]) * filter(4) mla v3.8h, v16.8h, v0.h[3] // p0(topleft) * filter(0) mla v3.8h, v21.8h, v1.h[3] // p5(left[0]) * filter(5) mla v3.8h, v22.8h, v1.h[7] // p6(left[1]) * filter(6) sqrshrun v3.8b, v3.8h, #4 subs w4, w4, #2 st2 {v2.s, v3.s}[0], [x0], x1 zip2 v0.2s, v2.2s, v3.2s st2 {v2.s, v3.s}[1], [x6], x1 uxtl v0.8h, v0.8b b.gt 8b ret 160: 320: AARCH64_VALID_JUMP_TARGET add x8, x2, #1 sub x2, x2, #2 mov x7, #-2 sub x1, x1, w3, uxtw mov w9, w3 1: ld1 {v0.s}[0], [x2], x7 // left (0-1) + topleft (2) uxtl v0.8h, v0.8b // left (0-1) + topleft (2) 2: ld1 {v2.16b}, [x8], #16 // top(0-15) mul v3.8h, v16.8h, v0.h[2] // p0(topleft) * filter(0) mla v3.8h, v21.8h, v0.h[1] // p5(left[0]) * filter(5) uxtl v1.8h, v2.8b // top(0-7) uxtl2 v2.8h, v2.16b // top(8-15) mla v3.8h, v22.8h, v0.h[0] // p6(left[1]) * filter(6) mla v3.8h, v17.8h, v1.h[0] // p1(top[0]) * filter(1) mla v3.8h, v18.8h, v1.h[1] // p2(top[1]) * filter(2) mla v3.8h, v19.8h, v1.h[2] // p3(top[2]) * filter(3) mla v3.8h, v20.8h, v1.h[3] // p4(top[3]) * filter(4) mul v4.8h, v17.8h, v1.h[4] // p1(top[0]) * filter(1) mla v4.8h, v18.8h, v1.h[5] // p2(top[1]) * filter(2) mla v4.8h, v19.8h, v1.h[6] // p3(top[2]) * filter(3) sqrshrun v3.8b, v3.8h, #4 uxtl v0.8h, v3.8b // first block, in 16 bit mla v4.8h, v20.8h, v1.h[7] // p4(top[3]) * filter(4) mla v4.8h, v16.8h, v1.h[3] // p0(topleft) * filter(0) mla v4.8h, v21.8h, v0.h[3] // p5(left[0]) * filter(5) mla v4.8h, v22.8h, v0.h[7] // p6(left[1]) * filter(6) mul v5.8h, v17.8h, v2.h[0] // p1(top[0]) * filter(1) mla v5.8h, v18.8h, v2.h[1] // p2(top[1]) * filter(2) mla v5.8h, v19.8h, v2.h[2] // p3(top[2]) * filter(3) sqrshrun v4.8b, v4.8h, #4 uxtl v0.8h, v4.8b // second block, in 16 bit mla v5.8h, v20.8h, v2.h[3] // p4(top[3]) * filter(4) mla v5.8h, v16.8h, v1.h[7] // p0(topleft) * filter(0) mla v5.8h, v21.8h, v0.h[3] // p5(left[0]) * filter(5) mla v5.8h, v22.8h, v0.h[7] // p6(left[1]) * filter(6) mul v6.8h, v17.8h, v2.h[4] // p1(top[0]) * filter(1) mla v6.8h, v18.8h, v2.h[5] // p2(top[1]) * filter(2) mla v6.8h, v19.8h, v2.h[6] // p3(top[2]) * filter(3) sqrshrun v5.8b, v5.8h, #4 uxtl v0.8h, v5.8b // third block, in 16 bit mla v6.8h, v20.8h, v2.h[7] // p4(top[3]) * filter(4) mla v6.8h, v16.8h, v2.h[3] // p0(topleft) * filter(0) mla v6.8h, v21.8h, v0.h[3] // p5(left[0]) * filter(5) mla v6.8h, v22.8h, 
v0.h[7] // p6(left[1]) * filter(6) subs w3, w3, #16 sqrshrun v6.8b, v6.8h, #4 st4 {v3.s, v4.s, v5.s, v6.s}[0], [x0], #16 st4 {v3.s, v4.s, v5.s, v6.s}[1], [x6], #16 b.le 8f ins v0.h[2], v2.h[7] ins v0.b[0], v6.b[7] ins v0.b[2], v6.b[3] b 2b 8: subs w4, w4, #2 b.le 9f sub x8, x6, w9, uxtw add x0, x0, x1 add x6, x6, x1 mov w3, w9 b 1b 9: ret L(ipred_filter_tbl): .hword L(ipred_filter_tbl) - 320b .hword L(ipred_filter_tbl) - 160b .hword L(ipred_filter_tbl) - 80b .hword L(ipred_filter_tbl) - 40b endfunc // void pal_pred_8bpc_neon(pixel *dst, const ptrdiff_t stride, // const uint16_t *const pal, const uint8_t *idx, // const int w, const int h); function pal_pred_8bpc_neon, export=1 ld1 {v0.8h}, [x2] clz w9, w4 adr x6, L(pal_pred_tbl) sub w9, w9, #25 ldrh w9, [x6, w9, uxtw #1] xtn v0.8b, v0.8h sub x6, x6, w9, uxtw add x2, x0, x1 lsl x1, x1, #1 br x6 4: AARCH64_VALID_JUMP_TARGET ld1 {v1.16b}, [x3], #16 subs w5, w5, #4 tbl v1.16b, {v0.16b}, v1.16b st1 {v1.s}[0], [x0], x1 st1 {v1.s}[1], [x2], x1 st1 {v1.s}[2], [x0], x1 st1 {v1.s}[3], [x2], x1 b.gt 4b ret 8: AARCH64_VALID_JUMP_TARGET ld1 {v1.16b, v2.16b}, [x3], #32 subs w5, w5, #4 tbl v1.16b, {v0.16b}, v1.16b st1 {v1.d}[0], [x0], x1 tbl v2.16b, {v0.16b}, v2.16b st1 {v1.d}[1], [x2], x1 st1 {v2.d}[0], [x0], x1 st1 {v2.d}[1], [x2], x1 b.gt 8b ret 16: AARCH64_VALID_JUMP_TARGET ld1 {v1.16b, v2.16b, v3.16b, v4.16b}, [x3], #64 subs w5, w5, #4 tbl v1.16b, {v0.16b}, v1.16b tbl v2.16b, {v0.16b}, v2.16b st1 {v1.16b}, [x0], x1 tbl v3.16b, {v0.16b}, v3.16b st1 {v2.16b}, [x2], x1 tbl v4.16b, {v0.16b}, v4.16b st1 {v3.16b}, [x0], x1 st1 {v4.16b}, [x2], x1 b.gt 16b ret 32: AARCH64_VALID_JUMP_TARGET ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x3], #64 ld1 {v20.16b, v21.16b, v22.16b, v23.16b}, [x3], #64 subs w5, w5, #4 tbl v16.16b, {v0.16b}, v16.16b tbl v17.16b, {v0.16b}, v17.16b tbl v18.16b, {v0.16b}, v18.16b tbl v19.16b, {v0.16b}, v19.16b tbl v20.16b, {v0.16b}, v20.16b st1 {v16.16b, v17.16b}, [x0], x1 tbl v21.16b, {v0.16b}, v21.16b st1 {v18.16b, v19.16b}, [x2], x1 tbl v22.16b, {v0.16b}, v22.16b st1 {v20.16b, v21.16b}, [x0], x1 tbl v23.16b, {v0.16b}, v23.16b st1 {v22.16b, v23.16b}, [x2], x1 b.gt 32b ret 64: AARCH64_VALID_JUMP_TARGET ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x3], #64 ld1 {v20.16b, v21.16b, v22.16b, v23.16b}, [x3], #64 subs w5, w5, #2 tbl v16.16b, {v0.16b}, v16.16b tbl v17.16b, {v0.16b}, v17.16b tbl v18.16b, {v0.16b}, v18.16b tbl v19.16b, {v0.16b}, v19.16b st1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x0], x1 tbl v20.16b, {v0.16b}, v20.16b tbl v21.16b, {v0.16b}, v21.16b tbl v22.16b, {v0.16b}, v22.16b tbl v23.16b, {v0.16b}, v23.16b st1 {v20.16b, v21.16b, v22.16b, v23.16b}, [x2], x1 b.gt 64b ret L(pal_pred_tbl): .hword L(pal_pred_tbl) - 64b .hword L(pal_pred_tbl) - 32b .hword L(pal_pred_tbl) - 16b .hword L(pal_pred_tbl) - 8b .hword L(pal_pred_tbl) - 4b endfunc // void ipred_cfl_128_8bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, // const int16_t *ac, const int alpha); function ipred_cfl_128_8bpc_neon, export=1 clz w9, w3 adr x7, L(ipred_cfl_128_tbl) sub w9, w9, #26 ldrh w9, [x7, w9, uxtw #1] movi v0.8h, #128 // dc dup v1.8h, w6 // alpha sub x7, x7, w9, uxtw add x6, x0, x1 lsl x1, x1, #1 br x7 L(ipred_cfl_splat_w4): AARCH64_VALID_JUMP_TARGET ld1 {v2.8h, v3.8h}, [x5], #32 mul v2.8h, v2.8h, v1.8h // diff = ac * alpha mul v3.8h, v3.8h, v1.8h cmlt v4.8h, v2.8h, #0 // sign cmlt v5.8h, v3.8h, #0 add v2.8h, v2.8h, v4.8h // diff + sign add v3.8h, v3.8h, v5.8h srshr v2.8h, v2.8h, #6 // (diff + sign + 
32) >> 6 = apply_sign() srshr v3.8h, v3.8h, #6 add v2.8h, v2.8h, v0.8h // dc + apply_sign() add v3.8h, v3.8h, v0.8h sqxtun v2.8b, v2.8h // iclip_pixel(dc + apply_sign()) sqxtun v3.8b, v3.8h st1 {v2.s}[0], [x0], x1 st1 {v2.s}[1], [x6], x1 subs w4, w4, #4 st1 {v3.s}[0], [x0], x1 st1 {v3.s}[1], [x6], x1 b.gt L(ipred_cfl_splat_w4) ret L(ipred_cfl_splat_w8): AARCH64_VALID_JUMP_TARGET ld1 {v2.8h, v3.8h, v4.8h, v5.8h}, [x5], #64 mul v2.8h, v2.8h, v1.8h // diff = ac * alpha mul v3.8h, v3.8h, v1.8h mul v4.8h, v4.8h, v1.8h mul v5.8h, v5.8h, v1.8h cmlt v16.8h, v2.8h, #0 // sign cmlt v17.8h, v3.8h, #0 cmlt v18.8h, v4.8h, #0 cmlt v19.8h, v5.8h, #0 add v2.8h, v2.8h, v16.8h // diff + sign add v3.8h, v3.8h, v17.8h add v4.8h, v4.8h, v18.8h add v5.8h, v5.8h, v19.8h srshr v2.8h, v2.8h, #6 // (diff + sign + 32) >> 6 = apply_sign() srshr v3.8h, v3.8h, #6 srshr v4.8h, v4.8h, #6 srshr v5.8h, v5.8h, #6 add v2.8h, v2.8h, v0.8h // dc + apply_sign() add v3.8h, v3.8h, v0.8h add v4.8h, v4.8h, v0.8h add v5.8h, v5.8h, v0.8h sqxtun v2.8b, v2.8h // iclip_pixel(dc + apply_sign()) sqxtun v3.8b, v3.8h sqxtun v4.8b, v4.8h sqxtun v5.8b, v5.8h st1 {v2.8b}, [x0], x1 st1 {v3.8b}, [x6], x1 subs w4, w4, #4 st1 {v4.8b}, [x0], x1 st1 {v5.8b}, [x6], x1 b.gt L(ipred_cfl_splat_w8) ret L(ipred_cfl_splat_w16): AARCH64_VALID_JUMP_TARGET add x7, x5, w3, uxtw #1 sub x1, x1, w3, uxtw mov w9, w3 1: ld1 {v2.8h, v3.8h}, [x5], #32 ld1 {v4.8h, v5.8h}, [x7], #32 mul v2.8h, v2.8h, v1.8h // diff = ac * alpha mul v3.8h, v3.8h, v1.8h mul v4.8h, v4.8h, v1.8h mul v5.8h, v5.8h, v1.8h cmlt v16.8h, v2.8h, #0 // sign cmlt v17.8h, v3.8h, #0 cmlt v18.8h, v4.8h, #0 cmlt v19.8h, v5.8h, #0 add v2.8h, v2.8h, v16.8h // diff + sign add v3.8h, v3.8h, v17.8h add v4.8h, v4.8h, v18.8h add v5.8h, v5.8h, v19.8h srshr v2.8h, v2.8h, #6 // (diff + sign + 32) >> 6 = apply_sign() srshr v3.8h, v3.8h, #6 srshr v4.8h, v4.8h, #6 srshr v5.8h, v5.8h, #6 add v2.8h, v2.8h, v0.8h // dc + apply_sign() add v3.8h, v3.8h, v0.8h add v4.8h, v4.8h, v0.8h add v5.8h, v5.8h, v0.8h sqxtun v2.8b, v2.8h // iclip_pixel(dc + apply_sign()) sqxtun v3.8b, v3.8h sqxtun v4.8b, v4.8h sqxtun v5.8b, v5.8h subs w3, w3, #16 st1 {v2.8b, v3.8b}, [x0], #16 st1 {v4.8b, v5.8b}, [x6], #16 b.gt 1b subs w4, w4, #2 add x5, x5, w9, uxtw #1 add x7, x7, w9, uxtw #1 add x0, x0, x1 add x6, x6, x1 mov w3, w9 b.gt 1b ret L(ipred_cfl_128_tbl): L(ipred_cfl_splat_tbl): .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w16) .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w16) .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w8) .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w4) endfunc // void ipred_cfl_top_8bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, // const int16_t *ac, const int alpha); function ipred_cfl_top_8bpc_neon, export=1 clz w9, w3 adr x7, L(ipred_cfl_top_tbl) sub w9, w9, #26 ldrh w9, [x7, w9, uxtw #1] dup v1.8h, w6 // alpha add x2, x2, #1 sub x7, x7, w9, uxtw add x6, x0, x1 lsl x1, x1, #1 br x7 4: AARCH64_VALID_JUMP_TARGET ld1r {v0.2s}, [x2] uaddlv h0, v0.8b urshr v0.4h, v0.4h, #3 dup v0.8h, v0.h[0] b L(ipred_cfl_splat_w4) 8: AARCH64_VALID_JUMP_TARGET ld1 {v0.8b}, [x2] uaddlv h0, v0.8b urshr v0.4h, v0.4h, #3 dup v0.8h, v0.h[0] b L(ipred_cfl_splat_w8) 16: AARCH64_VALID_JUMP_TARGET ld1 {v0.16b}, [x2] uaddlv h0, v0.16b urshr v0.4h, v0.4h, #4 dup v0.8h, v0.h[0] b L(ipred_cfl_splat_w16) 32: AARCH64_VALID_JUMP_TARGET ld1 {v2.16b, v3.16b}, [x2] uaddlv h2, v2.16b uaddlv h3, v3.16b add v2.4h, v2.4h, v3.4h urshr v2.4h, v2.4h, #5 dup v0.8h, v2.h[0] b 
L(ipred_cfl_splat_w16) L(ipred_cfl_top_tbl): .hword L(ipred_cfl_top_tbl) - 32b .hword L(ipred_cfl_top_tbl) - 16b .hword L(ipred_cfl_top_tbl) - 8b .hword L(ipred_cfl_top_tbl) - 4b endfunc // void ipred_cfl_left_8bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, // const int16_t *ac, const int alpha); function ipred_cfl_left_8bpc_neon, export=1 sub x2, x2, w4, uxtw clz w9, w3 clz w8, w4 adr x10, L(ipred_cfl_splat_tbl) adr x7, L(ipred_cfl_left_tbl) sub w9, w9, #26 sub w8, w8, #26 ldrh w9, [x10, w9, uxtw #1] ldrh w8, [x7, w8, uxtw #1] dup v1.8h, w6 // alpha sub x9, x10, w9, uxtw sub x7, x7, w8, uxtw add x6, x0, x1 lsl x1, x1, #1 br x7 L(ipred_cfl_left_h4): AARCH64_VALID_JUMP_TARGET ld1r {v0.2s}, [x2] uaddlv h0, v0.8b urshr v0.4h, v0.4h, #3 dup v0.8h, v0.h[0] br x9 L(ipred_cfl_left_h8): AARCH64_VALID_JUMP_TARGET ld1 {v0.8b}, [x2] uaddlv h0, v0.8b urshr v0.4h, v0.4h, #3 dup v0.8h, v0.h[0] br x9 L(ipred_cfl_left_h16): AARCH64_VALID_JUMP_TARGET ld1 {v0.16b}, [x2] uaddlv h0, v0.16b urshr v0.4h, v0.4h, #4 dup v0.8h, v0.h[0] br x9 L(ipred_cfl_left_h32): AARCH64_VALID_JUMP_TARGET ld1 {v2.16b, v3.16b}, [x2] uaddlv h2, v2.16b uaddlv h3, v3.16b add v2.4h, v2.4h, v3.4h urshr v2.4h, v2.4h, #5 dup v0.8h, v2.h[0] br x9 L(ipred_cfl_left_tbl): .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h32) .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h16) .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h8) .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h4) endfunc // void ipred_cfl_8bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, // const int16_t *ac, const int alpha); function ipred_cfl_8bpc_neon, export=1 sub x2, x2, w4, uxtw add w8, w3, w4 // width + height dup v1.8h, w6 // alpha clz w9, w3 clz w6, w4 dup v16.8h, w8 // width + height adr x7, L(ipred_cfl_tbl) rbit w8, w8 // rbit(width + height) sub w9, w9, #22 // 26 leading bits, minus table offset 4 sub w6, w6, #26 clz w8, w8 // ctz(width + height) ldrh w9, [x7, w9, uxtw #1] ldrh w6, [x7, w6, uxtw #1] neg w8, w8 // -ctz(width + height) sub x9, x7, w9, uxtw sub x7, x7, w6, uxtw ushr v16.8h, v16.8h, #1 // (width + height) >> 1 dup v17.8h, w8 // -ctz(width + height) add x6, x0, x1 lsl x1, x1, #1 br x7 L(ipred_cfl_h4): AARCH64_VALID_JUMP_TARGET ld1 {v0.s}[0], [x2], #4 ins v0.s[1], wzr add x2, x2, #1 uaddlv h0, v0.8b br x9 L(ipred_cfl_w4): AARCH64_VALID_JUMP_TARGET ld1 {v2.s}[0], [x2] ins v2.s[1], wzr add v0.4h, v0.4h, v16.4h uaddlv h2, v2.8b cmp w4, #4 add v0.4h, v0.4h, v2.4h ushl v0.4h, v0.4h, v17.4h b.eq 1f // h = 8/16 mov w16, #(0x3334/2) movk w16, #(0x5556/2), lsl #16 add w17, w4, w4 // w17 = 2*h = 16 or 32 lsr w16, w16, w17 dup v16.4h, w16 sqdmulh v0.4h, v0.4h, v16.4h 1: dup v0.8h, v0.h[0] b L(ipred_cfl_splat_w4) L(ipred_cfl_h8): AARCH64_VALID_JUMP_TARGET ld1 {v0.8b}, [x2], #8 uaddlv h0, v0.8b add x2, x2, #1 br x9 L(ipred_cfl_w8): AARCH64_VALID_JUMP_TARGET ld1 {v2.8b}, [x2] add v0.4h, v0.4h, v16.4h uaddlv h2, v2.8b cmp w4, #8 add v0.4h, v0.4h, v2.4h ushl v0.4h, v0.4h, v17.4h b.eq 1f // h = 4/16/32 cmp w4, #32 mov w16, #(0x3334/2) mov w17, #(0x5556/2) csel w16, w16, w17, eq dup v16.4h, w16 sqdmulh v0.4h, v0.4h, v16.4h 1: dup v0.8h, v0.h[0] b L(ipred_cfl_splat_w8) L(ipred_cfl_h16): AARCH64_VALID_JUMP_TARGET ld1 {v0.16b}, [x2], #16 uaddlv h0, v0.16b add x2, x2, #1 br x9 L(ipred_cfl_w16): AARCH64_VALID_JUMP_TARGET ld1 {v2.16b}, [x2] add v0.4h, v0.4h, v16.4h uaddlv h2, v2.16b cmp w4, #16 add v0.4h, v0.4h, v2.4h ushl v0.4h, v0.4h, v17.4h 
b.eq 1f // h = 4/8/32 cmp w4, #4 mov w16, #(0x3334/2) mov w17, #(0x5556/2) csel w16, w16, w17, eq dup v16.4h, w16 sqdmulh v0.4h, v0.4h, v16.4h 1: dup v0.8h, v0.h[0] b L(ipred_cfl_splat_w16) L(ipred_cfl_h32): AARCH64_VALID_JUMP_TARGET ld1 {v2.16b, v3.16b}, [x2], #32 uaddlv h2, v2.16b uaddlv h3, v3.16b add x2, x2, #1 add v0.4h, v2.4h, v3.4h br x9 L(ipred_cfl_w32): AARCH64_VALID_JUMP_TARGET ld1 {v2.16b, v3.16b}, [x2] add v0.4h, v0.4h, v16.4h uaddlv h2, v2.16b uaddlv h3, v3.16b cmp w4, #32 add v0.4h, v0.4h, v2.4h add v0.4h, v0.4h, v3.4h ushl v0.4h, v0.4h, v17.4h b.eq 1f // h = 8/16 mov w16, #(0x5556/2) movk w16, #(0x3334/2), lsl #16 add w17, w4, w4 // w17 = 2*h = 16 or 32 lsr w16, w16, w17 dup v16.4h, w16 sqdmulh v0.4h, v0.4h, v16.4h 1: dup v0.8h, v0.h[0] b L(ipred_cfl_splat_w16) L(ipred_cfl_tbl): .hword L(ipred_cfl_tbl) - L(ipred_cfl_h32) .hword L(ipred_cfl_tbl) - L(ipred_cfl_h16) .hword L(ipred_cfl_tbl) - L(ipred_cfl_h8) .hword L(ipred_cfl_tbl) - L(ipred_cfl_h4) .hword L(ipred_cfl_tbl) - L(ipred_cfl_w32) .hword L(ipred_cfl_tbl) - L(ipred_cfl_w16) .hword L(ipred_cfl_tbl) - L(ipred_cfl_w8) .hword L(ipred_cfl_tbl) - L(ipred_cfl_w4) endfunc // void cfl_ac_420_8bpc_neon(int16_t *const ac, const pixel *const ypx, // const ptrdiff_t stride, const int w_pad, // const int h_pad, const int cw, const int ch); function ipred_cfl_ac_420_8bpc_neon, export=1 clz w8, w5 lsl w4, w4, #2 adr x7, L(ipred_cfl_ac_420_tbl) sub w8, w8, #27 ldrh w8, [x7, w8, uxtw #1] movi v16.8h, #0 movi v17.8h, #0 movi v18.8h, #0 movi v19.8h, #0 sub x7, x7, w8, uxtw sub w8, w6, w4 // height - h_pad rbit w9, w5 // rbit(width) rbit w10, w6 // rbit(height) clz w9, w9 // ctz(width) clz w10, w10 // ctz(height) add w9, w9, w10 // log2sz add x10, x1, x2 dup v31.4s, w9 lsl x2, x2, #1 neg v31.4s, v31.4s // -log2sz br x7 L(ipred_cfl_ac_420_w4): AARCH64_VALID_JUMP_TARGET 1: // Copy and subsample input ld1 {v0.8b}, [x1], x2 ld1 {v1.8b}, [x10], x2 ld1 {v0.d}[1], [x1], x2 ld1 {v1.d}[1], [x10], x2 uaddlp v0.8h, v0.16b uaddlp v1.8h, v1.16b add v0.8h, v0.8h, v1.8h shl v0.8h, v0.8h, #1 subs w8, w8, #2 st1 {v0.8h}, [x0], #16 add v16.8h, v16.8h, v0.8h b.gt 1b trn2 v1.2d, v0.2d, v0.2d trn2 v0.2d, v0.2d, v0.2d L(ipred_cfl_ac_420_w4_hpad): cbz w4, 3f 2: // Vertical padding (h_pad > 0) subs w4, w4, #4 st1 {v0.8h, v1.8h}, [x0], #32 add v16.8h, v16.8h, v0.8h add v17.8h, v17.8h, v1.8h b.gt 2b 3: // Aggregate the sums add v0.8h, v16.8h, v17.8h uaddlv s0, v0.8h // sum sub x0, x0, w6, uxtw #3 urshl v4.2s, v0.2s, v31.2s // (sum + (1 << (log2sz - 1))) >>= log2sz dup v4.8h, v4.h[0] 6: // Subtract dc from ac ld1 {v0.8h, v1.8h}, [x0] subs w6, w6, #4 sub v0.8h, v0.8h, v4.8h sub v1.8h, v1.8h, v4.8h st1 {v0.8h, v1.8h}, [x0], #32 b.gt 6b ret L(ipred_cfl_ac_420_w8): AARCH64_VALID_JUMP_TARGET cbnz w3, L(ipred_cfl_ac_420_w8_wpad) 1: // Copy and subsample input, without padding ld1 {v0.16b}, [x1], x2 ld1 {v1.16b}, [x10], x2 ld1 {v2.16b}, [x1], x2 uaddlp v0.8h, v0.16b ld1 {v3.16b}, [x10], x2 uaddlp v1.8h, v1.16b uaddlp v2.8h, v2.16b uaddlp v3.8h, v3.16b add v0.8h, v0.8h, v1.8h add v2.8h, v2.8h, v3.8h shl v0.8h, v0.8h, #1 shl v1.8h, v2.8h, #1 subs w8, w8, #2 st1 {v0.8h, v1.8h}, [x0], #32 add v16.8h, v16.8h, v0.8h add v17.8h, v17.8h, v1.8h b.gt 1b mov v0.16b, v1.16b b L(ipred_cfl_ac_420_w8_hpad) L(ipred_cfl_ac_420_w8_wpad): 1: // Copy and subsample input, padding 4 ld1 {v0.8b}, [x1], x2 ld1 {v1.8b}, [x10], x2 ld1 {v0.d}[1], [x1], x2 ld1 {v1.d}[1], [x10], x2 uaddlp v0.8h, v0.16b uaddlp v1.8h, v1.16b add v0.8h, v0.8h, v1.8h shl v0.8h, v0.8h, #1 dup v1.4h, v0.h[3] dup v3.4h, 
v0.h[7] trn2 v2.2d, v0.2d, v0.2d subs w8, w8, #2 st1 {v0.4h, v1.4h, v2.4h, v3.4h}, [x0], #32 add v16.4h, v16.4h, v0.4h add v17.4h, v17.4h, v1.4h add v18.4h, v18.4h, v2.4h add v19.4h, v19.4h, v3.4h b.gt 1b trn1 v0.2d, v2.2d, v3.2d trn1 v1.2d, v2.2d, v3.2d L(ipred_cfl_ac_420_w8_hpad): cbz w4, 3f 2: // Vertical padding (h_pad > 0) subs w4, w4, #4 st1 {v0.8h, v1.8h}, [x0], #32 add v16.8h, v16.8h, v0.8h add v17.8h, v17.8h, v1.8h st1 {v0.8h, v1.8h}, [x0], #32 add v18.8h, v18.8h, v0.8h add v19.8h, v19.8h, v1.8h b.gt 2b 3: L(ipred_cfl_ac_420_w8_calc_subtract_dc): // Aggregate the sums add v0.8h, v16.8h, v17.8h add v2.8h, v18.8h, v19.8h uaddlp v0.4s, v0.8h uaddlp v2.4s, v2.8h add v0.4s, v0.4s, v2.4s addv s0, v0.4s // sum sub x0, x0, w6, uxtw #4 urshl v4.2s, v0.2s, v31.2s // (sum + (1 << (log2sz - 1))) >>= log2sz dup v4.8h, v4.h[0] L(ipred_cfl_ac_420_w8_subtract_dc): 6: // Subtract dc from ac ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0] subs w6, w6, #4 sub v0.8h, v0.8h, v4.8h sub v1.8h, v1.8h, v4.8h sub v2.8h, v2.8h, v4.8h sub v3.8h, v3.8h, v4.8h st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 b.gt 6b ret L(ipred_cfl_ac_420_w16): AARCH64_VALID_JUMP_TARGET adr x7, L(ipred_cfl_ac_420_w16_tbl) ldrh w3, [x7, w3, uxtw #1] sub x7, x7, w3, uxtw br x7 L(ipred_cfl_ac_420_w16_wpad0): AARCH64_VALID_JUMP_TARGET 1: // Copy and subsample input, without padding ld1 {v0.16b, v1.16b}, [x1], x2 ld1 {v2.16b, v3.16b}, [x10], x2 uaddlp v0.8h, v0.16b ld1 {v4.16b, v5.16b}, [x1], x2 uaddlp v1.8h, v1.16b ld1 {v6.16b, v7.16b}, [x10], x2 uaddlp v2.8h, v2.16b uaddlp v3.8h, v3.16b uaddlp v4.8h, v4.16b uaddlp v5.8h, v5.16b uaddlp v6.8h, v6.16b uaddlp v7.8h, v7.16b add v0.8h, v0.8h, v2.8h add v1.8h, v1.8h, v3.8h add v4.8h, v4.8h, v6.8h add v5.8h, v5.8h, v7.8h shl v0.8h, v0.8h, #1 shl v1.8h, v1.8h, #1 shl v2.8h, v4.8h, #1 shl v3.8h, v5.8h, #1 subs w8, w8, #2 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 add v16.8h, v16.8h, v0.8h add v17.8h, v17.8h, v1.8h add v18.8h, v18.8h, v2.8h add v19.8h, v19.8h, v3.8h b.gt 1b mov v0.16b, v2.16b mov v1.16b, v3.16b b L(ipred_cfl_ac_420_w16_hpad) L(ipred_cfl_ac_420_w16_wpad1): AARCH64_VALID_JUMP_TARGET 1: // Copy and subsample input, padding 4 ldr d1, [x1, #16] ld1 {v0.16b}, [x1], x2 ldr d3, [x10, #16] ld1 {v2.16b}, [x10], x2 uaddlp v1.4h, v1.8b ldr d5, [x1, #16] uaddlp v0.8h, v0.16b ld1 {v4.16b}, [x1], x2 uaddlp v3.4h, v3.8b ldr d7, [x10, #16] uaddlp v2.8h, v2.16b ld1 {v6.16b}, [x10], x2 uaddlp v5.4h, v5.8b uaddlp v4.8h, v4.16b uaddlp v7.4h, v7.8b uaddlp v6.8h, v6.16b add v1.4h, v1.4h, v3.4h add v0.8h, v0.8h, v2.8h add v5.4h, v5.4h, v7.4h add v4.8h, v4.8h, v6.8h shl v1.4h, v1.4h, #1 shl v0.8h, v0.8h, #1 shl v3.4h, v5.4h, #1 shl v2.8h, v4.8h, #1 dup v4.4h, v1.h[3] dup v5.4h, v3.h[3] trn1 v1.2d, v1.2d, v4.2d trn1 v3.2d, v3.2d, v5.2d subs w8, w8, #2 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 add v16.8h, v16.8h, v0.8h add v17.8h, v17.8h, v1.8h add v18.8h, v18.8h, v2.8h add v19.8h, v19.8h, v3.8h b.gt 1b mov v0.16b, v2.16b mov v1.16b, v3.16b b L(ipred_cfl_ac_420_w16_hpad) L(ipred_cfl_ac_420_w16_wpad2): AARCH64_VALID_JUMP_TARGET 1: // Copy and subsample input, padding 8 ld1 {v0.16b}, [x1], x2 ld1 {v2.16b}, [x10], x2 ld1 {v4.16b}, [x1], x2 uaddlp v0.8h, v0.16b ld1 {v6.16b}, [x10], x2 uaddlp v2.8h, v2.16b uaddlp v4.8h, v4.16b uaddlp v6.8h, v6.16b add v0.8h, v0.8h, v2.8h add v4.8h, v4.8h, v6.8h shl v0.8h, v0.8h, #1 shl v2.8h, v4.8h, #1 dup v1.8h, v0.h[7] dup v3.8h, v2.h[7] subs w8, w8, #2 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 add v16.8h, v16.8h, v0.8h add v17.8h, v17.8h, v1.8h add v18.8h, v18.8h, v2.8h add 
v19.8h, v19.8h, v3.8h b.gt 1b mov v0.16b, v2.16b mov v1.16b, v3.16b b L(ipred_cfl_ac_420_w16_hpad) L(ipred_cfl_ac_420_w16_wpad3): AARCH64_VALID_JUMP_TARGET 1: // Copy and subsample input, padding 12 ld1 {v0.8b}, [x1], x2 ld1 {v2.8b}, [x10], x2 ld1 {v4.8b}, [x1], x2 uaddlp v0.4h, v0.8b ld1 {v6.8b}, [x10], x2 uaddlp v2.4h, v2.8b uaddlp v4.4h, v4.8b uaddlp v6.4h, v6.8b add v0.4h, v0.4h, v2.4h add v4.4h, v4.4h, v6.4h shl v0.4h, v0.4h, #1 shl v2.4h, v4.4h, #1 dup v1.8h, v0.h[3] dup v3.8h, v2.h[3] trn1 v0.2d, v0.2d, v1.2d trn1 v2.2d, v2.2d, v3.2d subs w8, w8, #2 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 add v16.8h, v16.8h, v0.8h add v17.8h, v17.8h, v1.8h add v18.8h, v18.8h, v2.8h add v19.8h, v19.8h, v3.8h b.gt 1b mov v0.16b, v2.16b mov v1.16b, v3.16b L(ipred_cfl_ac_420_w16_hpad): cbz w4, 3f 2: // Vertical padding (h_pad > 0) subs w4, w4, #4 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 add v16.8h, v16.8h, v0.8h add v17.8h, v17.8h, v1.8h add v18.8h, v18.8h, v2.8h add v19.8h, v19.8h, v3.8h st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 add v16.8h, v16.8h, v0.8h add v17.8h, v17.8h, v1.8h add v18.8h, v18.8h, v2.8h add v19.8h, v19.8h, v3.8h b.gt 2b 3: // Double the height and reuse the w8 summing/subtracting lsl w6, w6, #1 b L(ipred_cfl_ac_420_w8_calc_subtract_dc) L(ipred_cfl_ac_420_tbl): .hword L(ipred_cfl_ac_420_tbl) - L(ipred_cfl_ac_420_w16) .hword L(ipred_cfl_ac_420_tbl) - L(ipred_cfl_ac_420_w8) .hword L(ipred_cfl_ac_420_tbl) - L(ipred_cfl_ac_420_w4) .hword 0 L(ipred_cfl_ac_420_w16_tbl): .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad0) .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad1) .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad2) .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad3) endfunc // void cfl_ac_422_8bpc_neon(int16_t *const ac, const pixel *const ypx, // const ptrdiff_t stride, const int w_pad, // const int h_pad, const int cw, const int ch); function ipred_cfl_ac_422_8bpc_neon, export=1 clz w8, w5 lsl w4, w4, #2 adr x7, L(ipred_cfl_ac_422_tbl) sub w8, w8, #27 ldrh w8, [x7, w8, uxtw #1] movi v16.8h, #0 movi v17.8h, #0 movi v18.8h, #0 movi v19.8h, #0 sub x7, x7, w8, uxtw sub w8, w6, w4 // height - h_pad rbit w9, w5 // rbit(width) rbit w10, w6 // rbit(height) clz w9, w9 // ctz(width) clz w10, w10 // ctz(height) add w9, w9, w10 // log2sz add x10, x1, x2 dup v31.4s, w9 lsl x2, x2, #1 neg v31.4s, v31.4s // -log2sz br x7 L(ipred_cfl_ac_422_w4): AARCH64_VALID_JUMP_TARGET 1: // Copy and subsample input ld1 {v0.8b}, [x1], x2 ld1 {v0.d}[1], [x10], x2 ld1 {v1.8b}, [x1], x2 ld1 {v1.d}[1], [x10], x2 uaddlp v0.8h, v0.16b uaddlp v1.8h, v1.16b shl v0.8h, v0.8h, #2 shl v1.8h, v1.8h, #2 subs w8, w8, #4 add v16.8h, v16.8h, v0.8h add v17.8h, v17.8h, v1.8h st1 {v0.8h, v1.8h}, [x0], #32 b.gt 1b trn2 v0.2d, v1.2d, v1.2d trn2 v1.2d, v1.2d, v1.2d b L(ipred_cfl_ac_420_w4_hpad) L(ipred_cfl_ac_422_w8): AARCH64_VALID_JUMP_TARGET cbnz w3, L(ipred_cfl_ac_422_w8_wpad) 1: // Copy and subsample input, without padding ld1 {v0.16b}, [x1], x2 ld1 {v1.16b}, [x10], x2 ld1 {v2.16b}, [x1], x2 uaddlp v0.8h, v0.16b ld1 {v3.16b}, [x10], x2 uaddlp v1.8h, v1.16b uaddlp v2.8h, v2.16b uaddlp v3.8h, v3.16b shl v0.8h, v0.8h, #2 shl v1.8h, v1.8h, #2 shl v2.8h, v2.8h, #2 shl v3.8h, v3.8h, #2 subs w8, w8, #4 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 add v16.8h, v16.8h, v0.8h add v17.8h, v17.8h, v1.8h add v18.8h, v18.8h, v2.8h add v19.8h, v19.8h, v3.8h b.gt 1b mov v0.16b, v3.16b mov v1.16b, v3.16b b L(ipred_cfl_ac_420_w8_hpad) L(ipred_cfl_ac_422_w8_wpad): 1: // Copy and 
subsample input, padding 4 ld1 {v0.8b}, [x1], x2 ld1 {v0.d}[1], [x10], x2 ld1 {v2.8b}, [x1], x2 ld1 {v2.d}[1], [x10], x2 uaddlp v0.8h, v0.16b uaddlp v2.8h, v2.16b shl v0.8h, v0.8h, #2 shl v2.8h, v2.8h, #2 dup v4.4h, v0.h[3] dup v5.8h, v0.h[7] dup v6.4h, v2.h[3] dup v7.8h, v2.h[7] trn2 v1.2d, v0.2d, v5.2d trn1 v0.2d, v0.2d, v4.2d trn2 v3.2d, v2.2d, v7.2d trn1 v2.2d, v2.2d, v6.2d subs w8, w8, #4 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 add v16.8h, v16.8h, v0.8h add v17.8h, v17.8h, v1.8h add v18.8h, v18.8h, v2.8h add v19.8h, v19.8h, v3.8h b.gt 1b mov v0.16b, v3.16b mov v1.16b, v3.16b b L(ipred_cfl_ac_420_w8_hpad) L(ipred_cfl_ac_422_w16): AARCH64_VALID_JUMP_TARGET adr x7, L(ipred_cfl_ac_422_w16_tbl) ldrh w3, [x7, w3, uxtw #1] sub x7, x7, w3, uxtw br x7 L(ipred_cfl_ac_422_w16_wpad0): AARCH64_VALID_JUMP_TARGET 1: // Copy and subsample input, without padding ld1 {v0.16b, v1.16b}, [x1], x2 ld1 {v2.16b, v3.16b}, [x10], x2 uaddlp v0.8h, v0.16b uaddlp v1.8h, v1.16b uaddlp v2.8h, v2.16b uaddlp v3.8h, v3.16b shl v0.8h, v0.8h, #2 shl v1.8h, v1.8h, #2 shl v2.8h, v2.8h, #2 shl v3.8h, v3.8h, #2 subs w8, w8, #2 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 add v16.8h, v16.8h, v0.8h add v17.8h, v17.8h, v1.8h add v18.8h, v18.8h, v2.8h add v19.8h, v19.8h, v3.8h b.gt 1b mov v0.16b, v2.16b mov v1.16b, v3.16b b L(ipred_cfl_ac_420_w16_hpad) L(ipred_cfl_ac_422_w16_wpad1): AARCH64_VALID_JUMP_TARGET 1: // Copy and subsample input, padding 4 ldr d1, [x1, #16] ld1 {v0.16b}, [x1], x2 ldr d3, [x10, #16] ld1 {v2.16b}, [x10], x2 uaddlp v1.4h, v1.8b uaddlp v0.8h, v0.16b uaddlp v3.4h, v3.8b uaddlp v2.8h, v2.16b shl v1.4h, v1.4h, #2 shl v0.8h, v0.8h, #2 shl v3.4h, v3.4h, #2 shl v2.8h, v2.8h, #2 dup v4.4h, v1.h[3] dup v5.4h, v3.h[3] trn1 v1.2d, v1.2d, v4.2d trn1 v3.2d, v3.2d, v5.2d subs w8, w8, #2 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 add v16.8h, v16.8h, v0.8h add v17.8h, v17.8h, v1.8h add v18.8h, v18.8h, v2.8h add v19.8h, v19.8h, v3.8h b.gt 1b mov v0.16b, v2.16b mov v1.16b, v3.16b b L(ipred_cfl_ac_420_w16_hpad) L(ipred_cfl_ac_422_w16_wpad2): AARCH64_VALID_JUMP_TARGET 1: // Copy and subsample input, padding 8 ld1 {v0.16b}, [x1], x2 ld1 {v2.16b}, [x10], x2 uaddlp v0.8h, v0.16b uaddlp v2.8h, v2.16b shl v0.8h, v0.8h, #2 shl v2.8h, v2.8h, #2 dup v1.8h, v0.h[7] dup v3.8h, v2.h[7] subs w8, w8, #2 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 add v16.8h, v16.8h, v0.8h add v17.8h, v17.8h, v1.8h add v18.8h, v18.8h, v2.8h add v19.8h, v19.8h, v3.8h b.gt 1b mov v0.16b, v2.16b mov v1.16b, v3.16b b L(ipred_cfl_ac_420_w16_hpad) L(ipred_cfl_ac_422_w16_wpad3): AARCH64_VALID_JUMP_TARGET 1: // Copy and subsample input, padding 12 ld1 {v0.8b}, [x1], x2 ld1 {v2.8b}, [x10], x2 uaddlp v0.4h, v0.8b uaddlp v2.4h, v2.8b shl v0.4h, v0.4h, #2 shl v2.4h, v2.4h, #2 dup v1.8h, v0.h[3] dup v3.8h, v2.h[3] trn1 v0.2d, v0.2d, v1.2d trn1 v2.2d, v2.2d, v3.2d subs w8, w8, #2 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 add v16.8h, v16.8h, v0.8h add v17.8h, v17.8h, v1.8h add v18.8h, v18.8h, v2.8h add v19.8h, v19.8h, v3.8h b.gt 1b mov v0.16b, v2.16b mov v1.16b, v3.16b b L(ipred_cfl_ac_420_w16_hpad) L(ipred_cfl_ac_422_tbl): .hword L(ipred_cfl_ac_422_tbl) - L(ipred_cfl_ac_422_w16) .hword L(ipred_cfl_ac_422_tbl) - L(ipred_cfl_ac_422_w8) .hword L(ipred_cfl_ac_422_tbl) - L(ipred_cfl_ac_422_w4) .hword 0 L(ipred_cfl_ac_422_w16_tbl): .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad0) .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad1) .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad2) .hword L(ipred_cfl_ac_422_w16_tbl) - 
L(ipred_cfl_ac_422_w16_wpad3) endfunc // void cfl_ac_444_8bpc_neon(int16_t *const ac, const pixel *const ypx, // const ptrdiff_t stride, const int w_pad, // const int h_pad, const int cw, const int ch); function ipred_cfl_ac_444_8bpc_neon, export=1 clz w8, w5 lsl w4, w4, #2 adr x7, L(ipred_cfl_ac_444_tbl) sub w8, w8, #26 ldrh w8, [x7, w8, uxtw #1] movi v16.8h, #0 movi v17.8h, #0 movi v18.8h, #0 movi v19.8h, #0 sub x7, x7, w8, uxtw sub w8, w6, w4 // height - h_pad rbit w9, w5 // rbit(width) rbit w10, w6 // rbit(height) clz w9, w9 // ctz(width) clz w10, w10 // ctz(height) add w9, w9, w10 // log2sz add x10, x1, x2 dup v31.4s, w9 lsl x2, x2, #1 neg v31.4s, v31.4s // -log2sz br x7 L(ipred_cfl_ac_444_w4): AARCH64_VALID_JUMP_TARGET 1: // Copy and expand input ld1 {v0.s}[0], [x1], x2 ld1 {v0.s}[1], [x10], x2 ld1 {v1.s}[0], [x1], x2 ld1 {v1.s}[1], [x10], x2 ushll v0.8h, v0.8b, #3 ushll v1.8h, v1.8b, #3 subs w8, w8, #4 add v16.8h, v16.8h, v0.8h add v17.8h, v17.8h, v1.8h st1 {v0.8h, v1.8h}, [x0], #32 b.gt 1b trn2 v0.2d, v1.2d, v1.2d trn2 v1.2d, v1.2d, v1.2d b L(ipred_cfl_ac_420_w4_hpad) L(ipred_cfl_ac_444_w8): AARCH64_VALID_JUMP_TARGET 1: // Copy and expand input ld1 {v0.8b}, [x1], x2 ld1 {v1.8b}, [x10], x2 ld1 {v2.8b}, [x1], x2 ushll v0.8h, v0.8b, #3 ld1 {v3.8b}, [x10], x2 ushll v1.8h, v1.8b, #3 ushll v2.8h, v2.8b, #3 ushll v3.8h, v3.8b, #3 subs w8, w8, #4 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 add v16.8h, v16.8h, v0.8h add v17.8h, v17.8h, v1.8h add v18.8h, v18.8h, v2.8h add v19.8h, v19.8h, v3.8h b.gt 1b mov v0.16b, v3.16b mov v1.16b, v3.16b b L(ipred_cfl_ac_420_w8_hpad) L(ipred_cfl_ac_444_w16): AARCH64_VALID_JUMP_TARGET cbnz w3, L(ipred_cfl_ac_444_w16_wpad) 1: // Copy and expand input, without padding ld1 {v0.16b}, [x1], x2 ld1 {v2.16b}, [x10], x2 ld1 {v4.16b}, [x1], x2 ushll2 v1.8h, v0.16b, #3 ushll v0.8h, v0.8b, #3 ld1 {v6.16b}, [x10], x2 ushll2 v3.8h, v2.16b, #3 ushll v2.8h, v2.8b, #3 ushll2 v5.8h, v4.16b, #3 ushll v4.8h, v4.8b, #3 ushll2 v7.8h, v6.16b, #3 ushll v6.8h, v6.8b, #3 subs w8, w8, #4 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 add v16.8h, v16.8h, v0.8h add v17.8h, v17.8h, v1.8h add v18.8h, v18.8h, v2.8h add v19.8h, v19.8h, v3.8h st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64 add v16.8h, v16.8h, v4.8h add v17.8h, v17.8h, v5.8h add v18.8h, v18.8h, v6.8h add v19.8h, v19.8h, v7.8h b.gt 1b mov v0.16b, v6.16b mov v1.16b, v7.16b mov v2.16b, v6.16b mov v3.16b, v7.16b b L(ipred_cfl_ac_420_w16_hpad) L(ipred_cfl_ac_444_w16_wpad): 1: // Copy and expand input, padding 8 ld1 {v0.8b}, [x1], x2 ld1 {v2.8b}, [x10], x2 ld1 {v4.8b}, [x1], x2 ld1 {v6.8b}, [x10], x2 ushll v0.8h, v0.8b, #3 ushll v2.8h, v2.8b, #3 ushll v4.8h, v4.8b, #3 ushll v6.8h, v6.8b, #3 dup v1.8h, v0.h[7] dup v3.8h, v2.h[7] dup v5.8h, v4.h[7] dup v7.8h, v6.h[7] subs w8, w8, #4 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 add v16.8h, v16.8h, v0.8h add v17.8h, v17.8h, v1.8h add v18.8h, v18.8h, v2.8h add v19.8h, v19.8h, v3.8h st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64 add v16.8h, v16.8h, v4.8h add v17.8h, v17.8h, v5.8h add v18.8h, v18.8h, v6.8h add v19.8h, v19.8h, v7.8h b.gt 1b mov v0.16b, v6.16b mov v1.16b, v7.16b mov v2.16b, v6.16b mov v3.16b, v7.16b b L(ipred_cfl_ac_420_w16_hpad) L(ipred_cfl_ac_444_w32): AARCH64_VALID_JUMP_TARGET adr x7, L(ipred_cfl_ac_444_w32_tbl) ldrh w3, [x7, w3, uxtw] // (w3>>1) << 1 sub x7, x7, w3, uxtw br x7 L(ipred_cfl_ac_444_w32_wpad0): AARCH64_VALID_JUMP_TARGET 1: // Copy and expand input, without padding ld1 {v2.16b, v3.16b}, [x1], x2 ld1 {v6.16b, v7.16b}, [x10], x2 ushll v0.8h, v2.8b, #3 ushll2 v1.8h, 
v2.16b, #3 ushll v2.8h, v3.8b, #3 ushll2 v3.8h, v3.16b, #3 ushll v4.8h, v6.8b, #3 ushll2 v5.8h, v6.16b, #3 ushll v6.8h, v7.8b, #3 ushll2 v7.8h, v7.16b, #3 subs w8, w8, #2 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 add v16.8h, v16.8h, v0.8h add v17.8h, v17.8h, v1.8h add v18.8h, v18.8h, v2.8h add v19.8h, v19.8h, v3.8h st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64 add v16.8h, v16.8h, v4.8h add v17.8h, v17.8h, v5.8h add v18.8h, v18.8h, v6.8h add v19.8h, v19.8h, v7.8h b.gt 1b b L(ipred_cfl_ac_444_w32_hpad) L(ipred_cfl_ac_444_w32_wpad2): AARCH64_VALID_JUMP_TARGET 1: // Copy and expand input, padding 8 ldr d2, [x1, #16] ld1 {v1.16b}, [x1], x2 ldr d6, [x10, #16] ld1 {v5.16b}, [x10], x2 ushll v2.8h, v2.8b, #3 ushll v0.8h, v1.8b, #3 ushll2 v1.8h, v1.16b, #3 ushll v6.8h, v6.8b, #3 ushll v4.8h, v5.8b, #3 ushll2 v5.8h, v5.16b, #3 dup v3.8h, v2.h[7] dup v7.8h, v6.h[7] subs w8, w8, #2 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 add v16.8h, v16.8h, v0.8h add v17.8h, v17.8h, v1.8h add v18.8h, v18.8h, v2.8h add v19.8h, v19.8h, v3.8h st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64 add v16.8h, v16.8h, v4.8h add v17.8h, v17.8h, v5.8h add v18.8h, v18.8h, v6.8h add v19.8h, v19.8h, v7.8h b.gt 1b b L(ipred_cfl_ac_444_w32_hpad) L(ipred_cfl_ac_444_w32_wpad4): AARCH64_VALID_JUMP_TARGET 1: // Copy and expand input, padding 16 ld1 {v1.16b}, [x1], x2 ld1 {v5.16b}, [x10], x2 ushll v0.8h, v1.8b, #3 ushll2 v1.8h, v1.16b, #3 ushll v4.8h, v5.8b, #3 ushll2 v5.8h, v5.16b, #3 dup v2.8h, v1.h[7] dup v3.8h, v1.h[7] dup v6.8h, v5.h[7] dup v7.8h, v5.h[7] subs w8, w8, #2 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 add v16.8h, v16.8h, v0.8h add v17.8h, v17.8h, v1.8h add v18.8h, v18.8h, v2.8h add v19.8h, v19.8h, v3.8h st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64 add v16.8h, v16.8h, v4.8h add v17.8h, v17.8h, v5.8h add v18.8h, v18.8h, v6.8h add v19.8h, v19.8h, v7.8h b.gt 1b b L(ipred_cfl_ac_444_w32_hpad) L(ipred_cfl_ac_444_w32_wpad6): AARCH64_VALID_JUMP_TARGET 1: // Copy and expand input, padding 24 ld1 {v0.8b}, [x1], x2 ld1 {v4.8b}, [x10], x2 ushll v0.8h, v0.8b, #3 ushll v4.8h, v4.8b, #3 dup v1.8h, v0.h[7] dup v2.8h, v0.h[7] dup v3.8h, v0.h[7] dup v5.8h, v4.h[7] dup v6.8h, v4.h[7] dup v7.8h, v4.h[7] subs w8, w8, #2 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 add v16.8h, v16.8h, v0.8h add v17.8h, v17.8h, v1.8h add v18.8h, v18.8h, v2.8h add v19.8h, v19.8h, v3.8h st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64 add v16.8h, v16.8h, v4.8h add v17.8h, v17.8h, v5.8h add v18.8h, v18.8h, v6.8h add v19.8h, v19.8h, v7.8h b.gt 1b L(ipred_cfl_ac_444_w32_hpad): cbz w4, 3f 2: // Vertical padding (h_pad > 0) subs w4, w4, #2 st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64 add v16.8h, v16.8h, v4.8h add v17.8h, v17.8h, v5.8h add v18.8h, v18.8h, v6.8h add v19.8h, v19.8h, v7.8h st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64 add v16.8h, v16.8h, v4.8h add v17.8h, v17.8h, v5.8h add v18.8h, v18.8h, v6.8h add v19.8h, v19.8h, v7.8h b.gt 2b 3: // Quadruple the height and reuse the w8 subtracting lsl w6, w6, #2 // Aggregate the sums, with wider intermediates earlier than in // ipred_cfl_ac_420_w8_calc_subtract_dc. 
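// Editorial sketch (not part of the upstream dav1d source; names such as
// "dc" below are illustrative): the aggregation that follows computes
//   sum = total of every int16 sample already stored to ac[];
//   dc  = (sum + (1 << (log2sz - 1))) >> log2sz;   // rounded average
//   for (i = 0; i < width * height; i++)
//       ac[i] -= dc;                               // zero-mean AC block
// v31 holds -log2sz, so the urshl below (a rounding shift by a negative
// count) performs the rounded right shift in a single instruction before
// branching to the shared subtraction loop.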
uaddlp v0.4s, v16.8h uaddlp v1.4s, v17.8h uaddlp v2.4s, v18.8h uaddlp v3.4s, v19.8h add v0.4s, v0.4s, v1.4s add v2.4s, v2.4s, v3.4s add v0.4s, v0.4s, v2.4s addv s0, v0.4s // sum sub x0, x0, w6, uxtw #4 urshl v4.2s, v0.2s, v31.2s // (sum + (1 << (log2sz - 1))) >>= log2sz dup v4.8h, v4.h[0] b L(ipred_cfl_ac_420_w8_subtract_dc) L(ipred_cfl_ac_444_tbl): .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w32) .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w16) .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w8) .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w4) L(ipred_cfl_ac_444_w32_tbl): .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad0) .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad2) .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad4) .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad6) endfunc av-scenechange-0.14.1/src/asm/arm/64/ipred16.S000064400000000000000000007170661046102023000165540ustar 00000000000000/* * Copyright © 2018, VideoLAN and dav1d authors * Copyright © 2019, Martin Storsjo * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "src/asm/arm/asm.S" #include "util.S" // void ipred_dc_128_16bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, const int a, // const int max_width, const int max_height, // const int bitdepth_max); function ipred_dc_128_16bpc_neon, export=1 ldr w8, [sp] clz w3, w3 adr x5, L(ipred_dc_128_tbl) sub w3, w3, #25 ldrh w3, [x5, w3, uxtw #1] dup v0.8h, w8 sub x5, x5, w3, uxtw add x6, x0, x1 lsl x1, x1, #1 urshr v0.8h, v0.8h, #1 br x5 4: AARCH64_VALID_JUMP_TARGET st1 {v0.4h}, [x0], x1 st1 {v0.4h}, [x6], x1 subs w4, w4, #4 st1 {v0.4h}, [x0], x1 st1 {v0.4h}, [x6], x1 b.gt 4b ret 8: AARCH64_VALID_JUMP_TARGET st1 {v0.8h}, [x0], x1 st1 {v0.8h}, [x6], x1 subs w4, w4, #4 st1 {v0.8h}, [x0], x1 st1 {v0.8h}, [x6], x1 b.gt 8b ret 160: AARCH64_VALID_JUMP_TARGET mov v1.16b, v0.16b 16: st1 {v0.8h, v1.8h}, [x0], x1 st1 {v0.8h, v1.8h}, [x6], x1 subs w4, w4, #4 st1 {v0.8h, v1.8h}, [x0], x1 st1 {v0.8h, v1.8h}, [x6], x1 b.gt 16b ret 320: AARCH64_VALID_JUMP_TARGET mov v1.16b, v0.16b mov v2.16b, v0.16b mov v3.16b, v0.16b 32: st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 subs w4, w4, #4 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 b.gt 32b ret 640: AARCH64_VALID_JUMP_TARGET mov v1.16b, v0.16b mov v2.16b, v0.16b mov v3.16b, v0.16b sub x1, x1, #64 64: st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 subs w4, w4, #4 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 b.gt 64b ret L(ipred_dc_128_tbl): .hword L(ipred_dc_128_tbl) - 640b .hword L(ipred_dc_128_tbl) - 320b .hword L(ipred_dc_128_tbl) - 160b .hword L(ipred_dc_128_tbl) - 8b .hword L(ipred_dc_128_tbl) - 4b endfunc // void ipred_v_16bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, const int a, // const int max_width, const int max_height); function ipred_v_16bpc_neon, export=1 clz w3, w3 adr x5, L(ipred_v_tbl) sub w3, w3, #25 ldrh w3, [x5, w3, uxtw #1] add x2, x2, #2 sub x5, x5, w3, uxtw add x6, x0, x1 lsl x1, x1, #1 br x5 40: AARCH64_VALID_JUMP_TARGET ld1 {v0.4h}, [x2] 4: st1 {v0.4h}, [x0], x1 st1 {v0.4h}, [x6], x1 subs w4, w4, #4 st1 {v0.4h}, [x0], x1 st1 {v0.4h}, [x6], x1 b.gt 4b ret 80: AARCH64_VALID_JUMP_TARGET ld1 {v0.8h}, [x2] 8: st1 {v0.8h}, [x0], x1 st1 {v0.8h}, [x6], x1 subs w4, w4, #4 st1 {v0.8h}, [x0], x1 st1 {v0.8h}, [x6], x1 b.gt 8b ret 160: AARCH64_VALID_JUMP_TARGET ld1 {v0.8h, v1.8h}, [x2] 16: st1 {v0.8h, v1.8h}, [x0], x1 st1 {v0.8h, v1.8h}, [x6], x1 subs w4, w4, #4 st1 {v0.8h, v1.8h}, [x0], x1 st1 {v0.8h, v1.8h}, [x6], x1 b.gt 16b ret 320: AARCH64_VALID_JUMP_TARGET ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x2] 32: st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 subs w4, w4, #4 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 b.gt 32b ret 640: AARCH64_VALID_JUMP_TARGET ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64 sub x1, x1, #64 ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2] 64: st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64 st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x1 st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], x1 subs w4, w4, #4 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 st1 {v0.8h, v1.8h, v2.8h, 
v3.8h}, [x6], #64 st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x1 st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], x1 b.gt 64b ret L(ipred_v_tbl): .hword L(ipred_v_tbl) - 640b .hword L(ipred_v_tbl) - 320b .hword L(ipred_v_tbl) - 160b .hword L(ipred_v_tbl) - 80b .hword L(ipred_v_tbl) - 40b endfunc // void ipred_h_16bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, const int a, // const int max_width, const int max_height); function ipred_h_16bpc_neon, export=1 clz w3, w3 adr x5, L(ipred_h_tbl) sub w3, w3, #25 ldrh w3, [x5, w3, uxtw #1] sub x2, x2, #8 sub x5, x5, w3, uxtw mov x7, #-8 add x6, x0, x1 lsl x1, x1, #1 br x5 4: AARCH64_VALID_JUMP_TARGET ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7 st1 {v3.4h}, [x0], x1 st1 {v2.4h}, [x6], x1 subs w4, w4, #4 st1 {v1.4h}, [x0], x1 st1 {v0.4h}, [x6], x1 b.gt 4b ret 8: AARCH64_VALID_JUMP_TARGET ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7 st1 {v3.8h}, [x0], x1 st1 {v2.8h}, [x6], x1 subs w4, w4, #4 st1 {v1.8h}, [x0], x1 st1 {v0.8h}, [x6], x1 b.gt 8b ret 16: AARCH64_VALID_JUMP_TARGET ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7 str q3, [x0, #16] str q2, [x6, #16] st1 {v3.8h}, [x0], x1 st1 {v2.8h}, [x6], x1 subs w4, w4, #4 str q1, [x0, #16] str q0, [x6, #16] st1 {v1.8h}, [x0], x1 st1 {v0.8h}, [x6], x1 b.gt 16b ret 32: AARCH64_VALID_JUMP_TARGET ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7 str q3, [x0, #16] str q2, [x6, #16] stp q3, q3, [x0, #32] stp q2, q2, [x6, #32] st1 {v3.8h}, [x0], x1 st1 {v2.8h}, [x6], x1 subs w4, w4, #4 str q1, [x0, #16] str q0, [x6, #16] stp q1, q1, [x0, #32] stp q0, q0, [x6, #32] st1 {v1.8h}, [x0], x1 st1 {v0.8h}, [x6], x1 b.gt 32b ret 64: AARCH64_VALID_JUMP_TARGET ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7 str q3, [x0, #16] str q2, [x6, #16] stp q3, q3, [x0, #32] stp q2, q2, [x6, #32] stp q3, q3, [x0, #64] stp q2, q2, [x6, #64] stp q3, q3, [x0, #96] stp q2, q2, [x6, #96] st1 {v3.8h}, [x0], x1 st1 {v2.8h}, [x6], x1 subs w4, w4, #4 str q1, [x0, #16] str q0, [x6, #16] stp q1, q1, [x0, #32] stp q0, q0, [x6, #32] stp q1, q1, [x0, #64] stp q0, q0, [x6, #64] stp q1, q1, [x0, #96] stp q0, q0, [x6, #96] st1 {v1.8h}, [x0], x1 st1 {v0.8h}, [x6], x1 b.gt 64b ret L(ipred_h_tbl): .hword L(ipred_h_tbl) - 64b .hword L(ipred_h_tbl) - 32b .hword L(ipred_h_tbl) - 16b .hword L(ipred_h_tbl) - 8b .hword L(ipred_h_tbl) - 4b endfunc // void ipred_dc_top_16bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, const int a, // const int max_width, const int max_height); function ipred_dc_top_16bpc_neon, export=1 clz w3, w3 adr x5, L(ipred_dc_top_tbl) sub w3, w3, #25 ldrh w3, [x5, w3, uxtw #1] add x2, x2, #2 sub x5, x5, w3, uxtw add x6, x0, x1 lsl x1, x1, #1 br x5 40: AARCH64_VALID_JUMP_TARGET ld1 {v0.4h}, [x2] addv h0, v0.4h urshr v0.4h, v0.4h, #2 dup v0.4h, v0.h[0] 4: st1 {v0.4h}, [x0], x1 st1 {v0.4h}, [x6], x1 subs w4, w4, #4 st1 {v0.4h}, [x0], x1 st1 {v0.4h}, [x6], x1 b.gt 4b ret 80: AARCH64_VALID_JUMP_TARGET ld1 {v0.8h}, [x2] addv h0, v0.8h urshr v0.4h, v0.4h, #3 dup v0.8h, v0.h[0] 8: st1 {v0.8h}, [x0], x1 st1 {v0.8h}, [x6], x1 subs w4, w4, #4 st1 {v0.8h}, [x0], x1 st1 {v0.8h}, [x6], x1 b.gt 8b ret 160: AARCH64_VALID_JUMP_TARGET ld1 {v0.8h, v1.8h}, [x2] addp v0.8h, v0.8h, v1.8h addv h0, v0.8h urshr v2.4h, v0.4h, #4 dup v0.8h, v2.h[0] dup v1.8h, v2.h[0] 16: st1 {v0.8h, v1.8h}, [x0], x1 st1 {v0.8h, v1.8h}, [x6], x1 subs w4, w4, #4 st1 {v0.8h, v1.8h}, [x0], x1 st1 {v0.8h, v1.8h}, [x6], x1 b.gt 16b ret 320: AARCH64_VALID_JUMP_TARGET ld1 {v0.8h, v1.8h, 
v2.8h, v3.8h}, [x2] addp v0.8h, v0.8h, v1.8h addp v2.8h, v2.8h, v3.8h addp v0.8h, v0.8h, v2.8h uaddlv s0, v0.8h rshrn v4.4h, v0.4s, #5 dup v0.8h, v4.h[0] dup v1.8h, v4.h[0] dup v2.8h, v4.h[0] dup v3.8h, v4.h[0] 32: st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 subs w4, w4, #4 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 b.gt 32b ret 640: AARCH64_VALID_JUMP_TARGET ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64 addp v0.8h, v0.8h, v1.8h ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2] addp v2.8h, v2.8h, v3.8h addp v4.8h, v4.8h, v5.8h addp v6.8h, v6.8h, v7.8h addp v0.8h, v0.8h, v2.8h addp v4.8h, v4.8h, v6.8h addp v0.8h, v0.8h, v4.8h uaddlv s0, v0.8h rshrn v4.4h, v0.4s, #6 sub x1, x1, #64 dup v0.8h, v4.h[0] dup v1.8h, v4.h[0] dup v2.8h, v4.h[0] dup v3.8h, v4.h[0] 64: st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 subs w4, w4, #4 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 b.gt 64b ret L(ipred_dc_top_tbl): .hword L(ipred_dc_top_tbl) - 640b .hword L(ipred_dc_top_tbl) - 320b .hword L(ipred_dc_top_tbl) - 160b .hword L(ipred_dc_top_tbl) - 80b .hword L(ipred_dc_top_tbl) - 40b endfunc // void ipred_dc_left_16bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, const int a, // const int max_width, const int max_height); function ipred_dc_left_16bpc_neon, export=1 sub x2, x2, w4, uxtw #1 clz w3, w3 clz w7, w4 adr x5, L(ipred_dc_left_tbl) sub w3, w3, #20 // 25 leading bits, minus table offset 5 sub w7, w7, #25 ldrh w3, [x5, w3, uxtw #1] ldrh w7, [x5, w7, uxtw #1] sub x3, x5, w3, uxtw sub x5, x5, w7, uxtw add x6, x0, x1 lsl x1, x1, #1 br x5 L(ipred_dc_left_h4): AARCH64_VALID_JUMP_TARGET ld1 {v0.4h}, [x2] addv h0, v0.4h urshr v0.4h, v0.4h, #2 dup v0.8h, v0.h[0] br x3 L(ipred_dc_left_w4): AARCH64_VALID_JUMP_TARGET st1 {v0.4h}, [x0], x1 st1 {v0.4h}, [x6], x1 subs w4, w4, #4 st1 {v0.4h}, [x0], x1 st1 {v0.4h}, [x6], x1 b.gt L(ipred_dc_left_w4) ret L(ipred_dc_left_h8): AARCH64_VALID_JUMP_TARGET ld1 {v0.8h}, [x2] addv h0, v0.8h urshr v0.4h, v0.4h, #3 dup v0.8h, v0.h[0] br x3 L(ipred_dc_left_w8): AARCH64_VALID_JUMP_TARGET st1 {v0.8h}, [x0], x1 st1 {v0.8h}, [x6], x1 subs w4, w4, #4 st1 {v0.8h}, [x0], x1 st1 {v0.8h}, [x6], x1 b.gt L(ipred_dc_left_w8) ret L(ipred_dc_left_h16): AARCH64_VALID_JUMP_TARGET ld1 {v0.8h, v1.8h}, [x2] addp v0.8h, v0.8h, v1.8h addv h0, v0.8h urshr v2.4h, v0.4h, #4 dup v0.8h, v2.h[0] dup v1.8h, v2.h[0] br x3 L(ipred_dc_left_w16): AARCH64_VALID_JUMP_TARGET mov v1.16b, v0.16b 1: st1 {v0.8h, v1.8h}, [x0], x1 st1 {v0.8h, v1.8h}, [x6], x1 subs w4, w4, #4 st1 {v0.8h, v1.8h}, [x0], x1 st1 {v0.8h, v1.8h}, [x6], x1 b.gt 1b ret L(ipred_dc_left_h32): AARCH64_VALID_JUMP_TARGET ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x2] addp v0.8h, v0.8h, v1.8h addp v2.8h, v2.8h, v3.8h addp v0.8h, v0.8h, v2.8h uaddlp v0.4s, v0.8h addv s0, v0.4s rshrn v4.4h, v0.4s, #5 dup v0.8h, v4.h[0] br x3 L(ipred_dc_left_w32): AARCH64_VALID_JUMP_TARGET mov v1.16b, v0.16b mov v2.16b, v0.16b mov v3.16b, v0.16b 1: st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 subs w4, w4, #4 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 b.gt 1b ret L(ipred_dc_left_h64): AARCH64_VALID_JUMP_TARGET ld1 
{v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64 addp v0.8h, v0.8h, v1.8h ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2] addp v2.8h, v2.8h, v3.8h addp v4.8h, v4.8h, v5.8h addp v6.8h, v6.8h, v7.8h addp v0.8h, v0.8h, v2.8h addp v4.8h, v4.8h, v6.8h addp v0.8h, v0.8h, v4.8h uaddlv s0, v0.8h rshrn v4.4h, v0.4s, #6 dup v0.8h, v4.h[0] br x3 L(ipred_dc_left_w64): AARCH64_VALID_JUMP_TARGET mov v1.16b, v0.16b mov v2.16b, v0.16b mov v3.16b, v0.16b sub x1, x1, #64 1: st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 subs w4, w4, #4 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 b.gt 1b ret L(ipred_dc_left_tbl): .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h64) .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h32) .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h16) .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h8) .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h4) .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w64) .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w32) .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w16) .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w8) .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w4) endfunc // void ipred_dc_16bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, const int a, // const int max_width, const int max_height); function ipred_dc_16bpc_neon, export=1 sub x2, x2, w4, uxtw #1 add w7, w3, w4 // width + height clz w3, w3 clz w6, w4 dup v16.4s, w7 // width + height adr x5, L(ipred_dc_tbl) rbit w7, w7 // rbit(width + height) sub w3, w3, #20 // 25 leading bits, minus table offset 5 sub w6, w6, #25 clz w7, w7 // ctz(width + height) ldrh w3, [x5, w3, uxtw #1] ldrh w6, [x5, w6, uxtw #1] neg w7, w7 // -ctz(width + height) sub x3, x5, w3, uxtw sub x5, x5, w6, uxtw ushr v16.4s, v16.4s, #1 // (width + height) >> 1 dup v17.4s, w7 // -ctz(width + height) add x6, x0, x1 lsl x1, x1, #1 br x5 L(ipred_dc_h4): AARCH64_VALID_JUMP_TARGET ld1 {v0.4h}, [x2], #8 uaddlv s0, v0.4h add x2, x2, #2 br x3 L(ipred_dc_w4): AARCH64_VALID_JUMP_TARGET ld1 {v1.4h}, [x2] add v0.2s, v0.2s, v16.2s uaddlv s1, v1.4h cmp w4, #4 add v0.2s, v0.2s, v1.2s ushl v0.2s, v0.2s, v17.2s b.eq 1f // h = 8/16 cmp w4, #16 mov w16, #0x6667 mov w17, #0xAAAB csel w16, w16, w17, eq dup v16.2s, w16 mul v0.2s, v0.2s, v16.2s ushr v0.2s, v0.2s, #17 1: dup v0.4h, v0.h[0] 2: st1 {v0.4h}, [x0], x1 st1 {v0.4h}, [x6], x1 subs w4, w4, #4 st1 {v0.4h}, [x0], x1 st1 {v0.4h}, [x6], x1 b.gt 2b ret L(ipred_dc_h8): AARCH64_VALID_JUMP_TARGET ld1 {v0.8h}, [x2], #16 uaddlv s0, v0.8h add x2, x2, #2 br x3 L(ipred_dc_w8): AARCH64_VALID_JUMP_TARGET ld1 {v1.8h}, [x2] add v0.2s, v0.2s, v16.2s uaddlv s1, v1.8h cmp w4, #8 add v0.2s, v0.2s, v1.2s ushl v0.2s, v0.2s, v17.2s b.eq 1f // h = 4/16/32 cmp w4, #32 mov w16, #0x6667 mov w17, #0xAAAB csel w16, w16, w17, eq dup v16.2s, w16 mul v0.2s, v0.2s, v16.2s ushr v0.2s, v0.2s, #17 1: dup v0.8h, v0.h[0] 2: st1 {v0.8h}, [x0], x1 st1 {v0.8h}, [x6], x1 subs w4, w4, #4 st1 {v0.8h}, [x0], x1 st1 {v0.8h}, [x6], x1 b.gt 2b ret L(ipred_dc_h16): AARCH64_VALID_JUMP_TARGET ld1 {v0.8h, v1.8h}, [x2], #32 addp v0.8h, v0.8h, v1.8h add x2, x2, #2 uaddlv s0, v0.8h br x3 L(ipred_dc_w16): AARCH64_VALID_JUMP_TARGET ld1 {v1.8h, v2.8h}, [x2] add v0.2s, v0.2s, v16.2s addp v1.8h, v1.8h, v2.8h uaddlv s1, v1.8h cmp w4, #16 add v0.2s, v0.2s, v1.2s ushl 
v4.2s, v0.2s, v17.2s b.eq 1f // h = 4/8/32/64 tst w4, #(32+16+8) // 16 added to make a consecutive bitmask mov w16, #0x6667 mov w17, #0xAAAB csel w16, w16, w17, eq dup v16.2s, w16 mul v4.2s, v4.2s, v16.2s ushr v4.2s, v4.2s, #17 1: dup v0.8h, v4.h[0] dup v1.8h, v4.h[0] 2: st1 {v0.8h, v1.8h}, [x0], x1 st1 {v0.8h, v1.8h}, [x6], x1 subs w4, w4, #4 st1 {v0.8h, v1.8h}, [x0], x1 st1 {v0.8h, v1.8h}, [x6], x1 b.gt 2b ret L(ipred_dc_h32): AARCH64_VALID_JUMP_TARGET ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64 addp v0.8h, v0.8h, v1.8h addp v2.8h, v2.8h, v3.8h addp v0.8h, v0.8h, v2.8h add x2, x2, #2 uaddlv s0, v0.8h br x3 L(ipred_dc_w32): AARCH64_VALID_JUMP_TARGET ld1 {v1.8h, v2.8h, v3.8h, v4.8h}, [x2] add v0.2s, v0.2s, v16.2s addp v1.8h, v1.8h, v2.8h addp v3.8h, v3.8h, v4.8h addp v1.8h, v1.8h, v3.8h uaddlv s1, v1.8h cmp w4, #32 add v0.2s, v0.2s, v1.2s ushl v4.2s, v0.2s, v17.2s b.eq 1f // h = 8/16/64 cmp w4, #8 mov w16, #0x6667 mov w17, #0xAAAB csel w16, w16, w17, eq dup v16.2s, w16 mul v4.2s, v4.2s, v16.2s ushr v4.2s, v4.2s, #17 1: dup v0.8h, v4.h[0] dup v1.8h, v4.h[0] dup v2.8h, v4.h[0] dup v3.8h, v4.h[0] 2: st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 subs w4, w4, #4 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 b.gt 2b ret L(ipred_dc_h64): AARCH64_VALID_JUMP_TARGET ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64 addp v0.8h, v0.8h, v1.8h ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64 addp v2.8h, v2.8h, v3.8h addp v4.8h, v4.8h, v5.8h addp v6.8h, v6.8h, v7.8h addp v0.8h, v0.8h, v2.8h addp v4.8h, v4.8h, v6.8h addp v0.8h, v0.8h, v4.8h add x2, x2, #2 uaddlv s0, v0.8h br x3 L(ipred_dc_w64): AARCH64_VALID_JUMP_TARGET ld1 {v1.8h, v2.8h, v3.8h, v4.8h}, [x2], #64 add v0.2s, v0.2s, v16.2s addp v1.8h, v1.8h, v2.8h ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x2] addp v3.8h, v3.8h, v4.8h addp v20.8h, v20.8h, v21.8h addp v22.8h, v22.8h, v23.8h addp v1.8h, v1.8h, v3.8h addp v20.8h, v20.8h, v22.8h addp v1.8h, v1.8h, v20.8h uaddlv s1, v1.8h cmp w4, #64 add v0.2s, v0.2s, v1.2s ushl v4.2s, v0.2s, v17.2s b.eq 1f // h = 16/32 cmp w4, #16 mov w16, #0x6667 mov w17, #0xAAAB csel w16, w16, w17, eq dup v16.2s, w16 mul v4.2s, v4.2s, v16.2s ushr v4.2s, v4.2s, #17 1: sub x1, x1, #64 dup v0.8h, v4.h[0] dup v1.8h, v4.h[0] dup v2.8h, v4.h[0] dup v3.8h, v4.h[0] 2: st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 subs w4, w4, #4 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 b.gt 2b ret L(ipred_dc_tbl): .hword L(ipred_dc_tbl) - L(ipred_dc_h64) .hword L(ipred_dc_tbl) - L(ipred_dc_h32) .hword L(ipred_dc_tbl) - L(ipred_dc_h16) .hword L(ipred_dc_tbl) - L(ipred_dc_h8) .hword L(ipred_dc_tbl) - L(ipred_dc_h4) .hword L(ipred_dc_tbl) - L(ipred_dc_w64) .hword L(ipred_dc_tbl) - L(ipred_dc_w32) .hword L(ipred_dc_tbl) - L(ipred_dc_w16) .hword L(ipred_dc_tbl) - L(ipred_dc_w8) .hword L(ipred_dc_tbl) - L(ipred_dc_w4) endfunc // void ipred_paeth_16bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, const int a, // const int max_width, const int max_height); function ipred_paeth_16bpc_neon, export=1 clz w9, w3 adr x5, L(ipred_paeth_tbl) sub w9, w9, #25 ldrh w9, [x5, w9, uxtw #1] ld1r {v4.8h}, [x2] add x8, x2, #2 sub x2, x2, #8 sub x5, x5, w9, uxtw mov x7, #-8 add x6, x0, x1 
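// Editorial note (a sketch, not from the upstream source): like the other
// predictors in this file, ipred_paeth dispatches on the block width via the
// table looked up just above, roughly
//   idx    = clz(width) - 25;              // widths 64,32,16,8,4 -> 0..4
//   target = table_address - table[idx];   // .hword backward offsets
// and the br below jumps to that width-specialised loop. Per pixel the
// predictor picks whichever of {left, top, topleft} lies closest to
// base = left + top - topleft.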
lsl x1, x1, #1 br x5 40: AARCH64_VALID_JUMP_TARGET ld1r {v5.2d}, [x8] sub v6.8h, v5.8h, v4.8h // top - topleft 4: ld4r {v0.4h, v1.4h, v2.4h, v3.4h}, [x2], x7 zip1 v0.2d, v0.2d, v1.2d zip1 v2.2d, v2.2d, v3.2d add v16.8h, v6.8h, v0.8h // base add v17.8h, v6.8h, v2.8h sabd v20.8h, v5.8h, v16.8h // tdiff sabd v21.8h, v5.8h, v17.8h sabd v22.8h, v4.8h, v16.8h // tldiff sabd v23.8h, v4.8h, v17.8h sabd v16.8h, v0.8h, v16.8h // ldiff sabd v17.8h, v2.8h, v17.8h umin v18.8h, v20.8h, v22.8h // min(tdiff, tldiff) umin v19.8h, v21.8h, v23.8h cmge v20.8h, v22.8h, v20.8h // tldiff >= tdiff cmge v21.8h, v23.8h, v21.8h cmge v16.8h, v18.8h, v16.8h // min(tdiff, tldiff) >= ldiff cmge v17.8h, v19.8h, v17.8h bsl v21.16b, v5.16b, v4.16b // tdiff <= tldiff ? top : topleft bsl v20.16b, v5.16b, v4.16b bit v21.16b, v2.16b, v17.16b // ldiff <= min ? left : ... bit v20.16b, v0.16b, v16.16b st1 {v21.d}[1], [x0], x1 st1 {v21.d}[0], [x6], x1 subs w4, w4, #4 st1 {v20.d}[1], [x0], x1 st1 {v20.d}[0], [x6], x1 b.gt 4b ret 80: 160: 320: 640: AARCH64_VALID_JUMP_TARGET ld1 {v5.8h}, [x8], #16 mov w9, w3 // Set up pointers for four rows in parallel; x0, x6, x5, x10 add x5, x0, x1 add x10, x6, x1 lsl x1, x1, #1 sub x1, x1, w3, uxtw #1 1: ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7 2: sub v6.8h, v5.8h, v4.8h // top - topleft add v16.8h, v6.8h, v0.8h // base add v17.8h, v6.8h, v1.8h add v18.8h, v6.8h, v2.8h add v19.8h, v6.8h, v3.8h sabd v20.8h, v5.8h, v16.8h // tdiff sabd v21.8h, v5.8h, v17.8h sabd v22.8h, v5.8h, v18.8h sabd v23.8h, v5.8h, v19.8h sabd v24.8h, v4.8h, v16.8h // tldiff sabd v25.8h, v4.8h, v17.8h sabd v26.8h, v4.8h, v18.8h sabd v27.8h, v4.8h, v19.8h sabd v16.8h, v0.8h, v16.8h // ldiff sabd v17.8h, v1.8h, v17.8h sabd v18.8h, v2.8h, v18.8h sabd v19.8h, v3.8h, v19.8h umin v28.8h, v20.8h, v24.8h // min(tdiff, tldiff) umin v29.8h, v21.8h, v25.8h umin v30.8h, v22.8h, v26.8h umin v31.8h, v23.8h, v27.8h cmge v20.8h, v24.8h, v20.8h // tldiff >= tdiff cmge v21.8h, v25.8h, v21.8h cmge v22.8h, v26.8h, v22.8h cmge v23.8h, v27.8h, v23.8h cmge v16.8h, v28.8h, v16.8h // min(tdiff, tldiff) >= ldiff cmge v17.8h, v29.8h, v17.8h cmge v18.8h, v30.8h, v18.8h cmge v19.8h, v31.8h, v19.8h bsl v23.16b, v5.16b, v4.16b // tdiff <= tldiff ? top : topleft bsl v22.16b, v5.16b, v4.16b bsl v21.16b, v5.16b, v4.16b bsl v20.16b, v5.16b, v4.16b bit v23.16b, v3.16b, v19.16b // ldiff <= min ? left : ... 
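// Editorial sketch (not upstream commentary): the cmge/bsl/bit sequence
// around this point realises the Paeth choice per lane, roughly
//   if      (ldiff <= tdiff && ldiff <= tldiff) pred = left;
//   else if (tdiff <= tldiff)                   pred = top;
//   else                                        pred = topleft;
// where ldiff/tdiff/tldiff are the absolute differences from
// base = left + top - topleft computed above.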
bit v22.16b, v2.16b, v18.16b bit v21.16b, v1.16b, v17.16b bit v20.16b, v0.16b, v16.16b st1 {v23.8h}, [x0], #16 st1 {v22.8h}, [x6], #16 subs w3, w3, #8 st1 {v21.8h}, [x5], #16 st1 {v20.8h}, [x10], #16 b.le 8f ld1 {v5.8h}, [x8], #16 b 2b 8: subs w4, w4, #4 b.le 9f // End of horizontal loop, move pointers to next four rows sub x8, x8, w9, uxtw #1 add x0, x0, x1 add x6, x6, x1 // Load the top row as early as possible ld1 {v5.8h}, [x8], #16 add x5, x5, x1 add x10, x10, x1 mov w3, w9 b 1b 9: ret L(ipred_paeth_tbl): .hword L(ipred_paeth_tbl) - 640b .hword L(ipred_paeth_tbl) - 320b .hword L(ipred_paeth_tbl) - 160b .hword L(ipred_paeth_tbl) - 80b .hword L(ipred_paeth_tbl) - 40b endfunc // void ipred_smooth_16bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, const int a, // const int max_width, const int max_height); function ipred_smooth_16bpc_neon, export=1 movrel x10, X(sm_weights) add x11, x10, w4, uxtw add x10, x10, w3, uxtw clz w9, w3 adr x5, L(ipred_smooth_tbl) sub x12, x2, w4, uxtw #1 sub w9, w9, #25 ldrh w9, [x5, w9, uxtw #1] ld1r {v4.8h}, [x12] // bottom add x8, x2, #2 sub x5, x5, w9, uxtw add x6, x0, x1 lsl x1, x1, #1 br x5 40: AARCH64_VALID_JUMP_TARGET ld1r {v6.2d}, [x8] // top ld1r {v7.2s}, [x10] // weights_hor sub x2, x2, #8 mov x7, #-8 dup v5.8h, v6.h[3] // right sub v6.8h, v6.8h, v4.8h // top-bottom uxtl v7.8h, v7.8b // weights_hor add v31.4h, v4.4h, v5.4h // bottom+right 4: ld4r {v0.4h, v1.4h, v2.4h, v3.4h}, [x2], x7 // left ld4r {v16.8b, v17.8b, v18.8b, v19.8b}, [x11], #4 // weights_ver ushll v20.4s, v31.4h, #8 // (bottom+right)*256 ushll v21.4s, v31.4h, #8 ushll v22.4s, v31.4h, #8 ushll v23.4s, v31.4h, #8 zip1 v1.2d, v1.2d, v0.2d // left, flipped zip1 v0.2d, v3.2d, v2.2d zip1 v16.2s, v16.2s, v17.2s // weights_ver zip1 v18.2s, v18.2s, v19.2s sub v0.8h, v0.8h, v5.8h // left-right sub v1.8h, v1.8h, v5.8h uxtl v16.8h, v16.8b // weights_ver uxtl v18.8h, v18.8b smlal v20.4s, v0.4h, v7.4h // += (left-right)*weights_hor smlal2 v21.4s, v0.8h, v7.8h smlal v22.4s, v1.4h, v7.4h smlal2 v23.4s, v1.8h, v7.8h smlal v20.4s, v6.4h, v16.4h // += (top-bottom)*weights_ver smlal2 v21.4s, v6.8h, v16.8h smlal v22.4s, v6.4h, v18.4h smlal2 v23.4s, v6.8h, v18.8h rshrn v20.4h, v20.4s, #9 rshrn v21.4h, v21.4s, #9 rshrn v22.4h, v22.4s, #9 rshrn v23.4h, v23.4s, #9 st1 {v20.4h}, [x0], x1 st1 {v21.4h}, [x6], x1 subs w4, w4, #4 st1 {v22.4h}, [x0], x1 st1 {v23.4h}, [x6], x1 b.gt 4b ret 80: AARCH64_VALID_JUMP_TARGET ld1 {v6.8h}, [x8] // top ld1 {v7.8b}, [x10] // weights_hor sub x2, x2, #8 mov x7, #-8 dup v5.8h, v6.h[7] // right sub v6.8h, v6.8h, v4.8h // top-bottom uxtl v7.8h, v7.8b // weights_hor add v31.4h, v4.4h, v5.4h // bottom+right 8: ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7 // left ld4r {v16.8b, v17.8b, v18.8b, v19.8b}, [x11], #4 // weights_ver ushll v20.4s, v31.4h, #8 // (bottom+right)*256 ushll v21.4s, v31.4h, #8 ushll v22.4s, v31.4h, #8 ushll v23.4s, v31.4h, #8 ushll v24.4s, v31.4h, #8 ushll v25.4s, v31.4h, #8 ushll v26.4s, v31.4h, #8 ushll v27.4s, v31.4h, #8 sub v0.8h, v0.8h, v5.8h // left-right sub v1.8h, v1.8h, v5.8h sub v2.8h, v2.8h, v5.8h sub v3.8h, v3.8h, v5.8h uxtl v16.8h, v16.8b // weights_ver uxtl v17.8h, v17.8b uxtl v18.8h, v18.8b uxtl v19.8h, v19.8b smlal v20.4s, v3.4h, v7.4h // += (left-right)*weights_hor smlal2 v21.4s, v3.8h, v7.8h // (left flipped) smlal v22.4s, v2.4h, v7.4h smlal2 v23.4s, v2.8h, v7.8h smlal v24.4s, v1.4h, v7.4h smlal2 v25.4s, v1.8h, v7.8h smlal v26.4s, v0.4h, v7.4h smlal2 v27.4s, v0.8h, v7.8h smlal v20.4s, 
v6.4h, v16.4h // += (top-bottom)*weights_ver smlal2 v21.4s, v6.8h, v16.8h smlal v22.4s, v6.4h, v17.4h smlal2 v23.4s, v6.8h, v17.8h smlal v24.4s, v6.4h, v18.4h smlal2 v25.4s, v6.8h, v18.8h smlal v26.4s, v6.4h, v19.4h smlal2 v27.4s, v6.8h, v19.8h rshrn v20.4h, v20.4s, #9 rshrn2 v20.8h, v21.4s, #9 rshrn v21.4h, v22.4s, #9 rshrn2 v21.8h, v23.4s, #9 rshrn v22.4h, v24.4s, #9 rshrn2 v22.8h, v25.4s, #9 rshrn v23.4h, v26.4s, #9 rshrn2 v23.8h, v27.4s, #9 st1 {v20.8h}, [x0], x1 st1 {v21.8h}, [x6], x1 subs w4, w4, #4 st1 {v22.8h}, [x0], x1 st1 {v23.8h}, [x6], x1 b.gt 8b ret 160: 320: 640: AARCH64_VALID_JUMP_TARGET add x12, x2, w3, uxtw #1 sub x1, x1, w3, uxtw #1 ld1r {v5.8h}, [x12] // right sub x2, x2, #4 mov x7, #-4 mov w9, w3 add v31.4h, v4.4h, v5.4h // bottom+right 1: ld2r {v0.8h, v1.8h}, [x2], x7 // left ld2r {v16.8b, v17.8b}, [x11], #2 // weights_ver sub v0.8h, v0.8h, v5.8h // left-right sub v1.8h, v1.8h, v5.8h uxtl v16.8h, v16.8b // weights_ver uxtl v17.8h, v17.8b 2: ld1 {v7.16b}, [x10], #16 // weights_hor ld1 {v2.8h, v3.8h}, [x8], #32 // top ushll v20.4s, v31.4h, #8 // (bottom+right)*256 ushll v21.4s, v31.4h, #8 ushll v22.4s, v31.4h, #8 ushll v23.4s, v31.4h, #8 ushll v24.4s, v31.4h, #8 ushll v25.4s, v31.4h, #8 ushll v26.4s, v31.4h, #8 ushll v27.4s, v31.4h, #8 uxtl v6.8h, v7.8b // weights_hor uxtl2 v7.8h, v7.16b sub v2.8h, v2.8h, v4.8h // top-bottom sub v3.8h, v3.8h, v4.8h smlal v20.4s, v1.4h, v6.4h // += (left-right)*weights_hor smlal2 v21.4s, v1.8h, v6.8h // (left flipped) smlal v22.4s, v1.4h, v7.4h smlal2 v23.4s, v1.8h, v7.8h smlal v24.4s, v0.4h, v6.4h smlal2 v25.4s, v0.8h, v6.8h smlal v26.4s, v0.4h, v7.4h smlal2 v27.4s, v0.8h, v7.8h smlal v20.4s, v2.4h, v16.4h // += (top-bottom)*weights_ver smlal2 v21.4s, v2.8h, v16.8h smlal v22.4s, v3.4h, v16.4h smlal2 v23.4s, v3.8h, v16.8h smlal v24.4s, v2.4h, v17.4h smlal2 v25.4s, v2.8h, v17.8h smlal v26.4s, v3.4h, v17.4h smlal2 v27.4s, v3.8h, v17.8h rshrn v20.4h, v20.4s, #9 rshrn2 v20.8h, v21.4s, #9 rshrn v21.4h, v22.4s, #9 rshrn2 v21.8h, v23.4s, #9 rshrn v22.4h, v24.4s, #9 rshrn2 v22.8h, v25.4s, #9 rshrn v23.4h, v26.4s, #9 rshrn2 v23.8h, v27.4s, #9 subs w3, w3, #16 st1 {v20.8h, v21.8h}, [x0], #32 st1 {v22.8h, v23.8h}, [x6], #32 b.gt 2b subs w4, w4, #2 b.le 9f sub x8, x8, w9, uxtw #1 sub x10, x10, w9, uxtw add x0, x0, x1 add x6, x6, x1 mov w3, w9 b 1b 9: ret L(ipred_smooth_tbl): .hword L(ipred_smooth_tbl) - 640b .hword L(ipred_smooth_tbl) - 320b .hword L(ipred_smooth_tbl) - 160b .hword L(ipred_smooth_tbl) - 80b .hword L(ipred_smooth_tbl) - 40b endfunc // void ipred_smooth_v_16bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, const int a, // const int max_width, const int max_height); function ipred_smooth_v_16bpc_neon, export=1 movrel x7, X(sm_weights) add x7, x7, w4, uxtw clz w9, w3 adr x5, L(ipred_smooth_v_tbl) sub x8, x2, w4, uxtw #1 sub w9, w9, #25 ldrh w9, [x5, w9, uxtw #1] ld1r {v4.8h}, [x8] // bottom add x2, x2, #2 sub x5, x5, w9, uxtw add x6, x0, x1 lsl x1, x1, #1 br x5 40: AARCH64_VALID_JUMP_TARGET ld1r {v6.2d}, [x2] // top sub v6.8h, v6.8h, v4.8h // top-bottom 4: ld4r {v16.8b, v17.8b, v18.8b, v19.8b}, [x7], #4 // weights_ver zip1 v16.2s, v16.2s, v17.2s // weights_ver zip1 v18.2s, v18.2s, v19.2s ushll v16.8h, v16.8b, #7 // weights_ver << 7 ushll v18.8h, v18.8b, #7 sqrdmulh v20.8h, v6.8h, v16.8h // ((top-bottom)*weights_ver + 128) >> 8 sqrdmulh v21.8h, v6.8h, v18.8h add v20.8h, v20.8h, v4.8h add v21.8h, v21.8h, v4.8h st1 {v20.d}[0], [x0], x1 st1 {v20.d}[1], [x6], x1 subs w4, w4, #4 
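// Editorial sketch (not from the upstream source): each smooth_v output
// above is built per pixel as roughly
//   dst = bottom + (((top - bottom) * w_ver + 128) >> 8)
// with w_ver taken from the sm_weights table. Pre-shifting the 8-bit weight
// left by 7 lets sqrdmulh do the rounded scaling in one step, since
//   sqrdmulh(x, w << 7) == (2*x*(w << 7) + (1 << 15)) >> 16
//                       == (x*w + 128) >> 8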
st1 {v21.d}[0], [x0], x1 st1 {v21.d}[1], [x6], x1 b.gt 4b ret 80: AARCH64_VALID_JUMP_TARGET ld1 {v6.8h}, [x2] // top sub v6.8h, v6.8h, v4.8h // top-bottom 8: ld4r {v16.8b, v17.8b, v18.8b, v19.8b}, [x7], #4 // weights_ver ushll v16.8h, v16.8b, #7 // weights_ver << 7 ushll v17.8h, v17.8b, #7 ushll v18.8h, v18.8b, #7 ushll v19.8h, v19.8b, #7 sqrdmulh v20.8h, v6.8h, v16.8h // ((top-bottom)*weights_ver + 128) >> 8 sqrdmulh v21.8h, v6.8h, v17.8h sqrdmulh v22.8h, v6.8h, v18.8h sqrdmulh v23.8h, v6.8h, v19.8h add v20.8h, v20.8h, v4.8h add v21.8h, v21.8h, v4.8h add v22.8h, v22.8h, v4.8h add v23.8h, v23.8h, v4.8h st1 {v20.8h}, [x0], x1 st1 {v21.8h}, [x6], x1 subs w4, w4, #4 st1 {v22.8h}, [x0], x1 st1 {v23.8h}, [x6], x1 b.gt 8b ret 160: 320: 640: AARCH64_VALID_JUMP_TARGET // Set up pointers for four rows in parallel; x0, x6, x5, x8 add x5, x0, x1 add x8, x6, x1 lsl x1, x1, #1 sub x1, x1, w3, uxtw #1 mov w9, w3 1: ld4r {v16.8b, v17.8b, v18.8b, v19.8b}, [x7], #4 // weights_ver ushll v16.8h, v16.8b, #7 // weights_ver << 7 ushll v17.8h, v17.8b, #7 ushll v18.8h, v18.8b, #7 ushll v19.8h, v19.8b, #7 2: ld1 {v2.8h, v3.8h}, [x2], #32 // top sub v2.8h, v2.8h, v4.8h // top-bottom sub v3.8h, v3.8h, v4.8h sqrdmulh v20.8h, v2.8h, v16.8h // ((top-bottom)*weights_ver + 128) >> 8 sqrdmulh v21.8h, v3.8h, v16.8h sqrdmulh v22.8h, v2.8h, v17.8h sqrdmulh v23.8h, v3.8h, v17.8h sqrdmulh v24.8h, v2.8h, v18.8h sqrdmulh v25.8h, v3.8h, v18.8h sqrdmulh v26.8h, v2.8h, v19.8h sqrdmulh v27.8h, v3.8h, v19.8h add v20.8h, v20.8h, v4.8h add v21.8h, v21.8h, v4.8h add v22.8h, v22.8h, v4.8h add v23.8h, v23.8h, v4.8h add v24.8h, v24.8h, v4.8h add v25.8h, v25.8h, v4.8h add v26.8h, v26.8h, v4.8h add v27.8h, v27.8h, v4.8h subs w3, w3, #16 st1 {v20.8h, v21.8h}, [x0], #32 st1 {v22.8h, v23.8h}, [x6], #32 st1 {v24.8h, v25.8h}, [x5], #32 st1 {v26.8h, v27.8h}, [x8], #32 b.gt 2b subs w4, w4, #4 b.le 9f sub x2, x2, w9, uxtw #1 add x0, x0, x1 add x6, x6, x1 add x5, x5, x1 add x8, x8, x1 mov w3, w9 b 1b 9: ret L(ipred_smooth_v_tbl): .hword L(ipred_smooth_v_tbl) - 640b .hword L(ipred_smooth_v_tbl) - 320b .hword L(ipred_smooth_v_tbl) - 160b .hword L(ipred_smooth_v_tbl) - 80b .hword L(ipred_smooth_v_tbl) - 40b endfunc // void ipred_smooth_h_16bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, const int a, // const int max_width, const int max_height); function ipred_smooth_h_16bpc_neon, export=1 movrel x8, X(sm_weights) add x8, x8, w3, uxtw clz w9, w3 adr x5, L(ipred_smooth_h_tbl) add x12, x2, w3, uxtw #1 sub w9, w9, #25 ldrh w9, [x5, w9, uxtw #1] ld1r {v5.8h}, [x12] // right sub x5, x5, w9, uxtw add x6, x0, x1 lsl x1, x1, #1 br x5 40: AARCH64_VALID_JUMP_TARGET ld1r {v7.2s}, [x8] // weights_hor sub x2, x2, #8 mov x7, #-8 ushll v7.8h, v7.8b, #7 // weights_hor << 7 4: ld4r {v0.4h, v1.4h, v2.4h, v3.4h}, [x2], x7 // left zip1 v1.2d, v1.2d, v0.2d // left, flipped zip1 v0.2d, v3.2d, v2.2d sub v0.8h, v0.8h, v5.8h // left-right sub v1.8h, v1.8h, v5.8h sqrdmulh v20.8h, v0.8h, v7.8h // ((left-right)*weights_hor + 128) >> 8 sqrdmulh v21.8h, v1.8h, v7.8h add v20.8h, v20.8h, v5.8h add v21.8h, v21.8h, v5.8h st1 {v20.d}[0], [x0], x1 st1 {v20.d}[1], [x6], x1 subs w4, w4, #4 st1 {v21.d}[0], [x0], x1 st1 {v21.d}[1], [x6], x1 b.gt 4b ret 80: AARCH64_VALID_JUMP_TARGET ld1 {v7.8b}, [x8] // weights_hor sub x2, x2, #8 mov x7, #-8 ushll v7.8h, v7.8b, #7 // weights_hor << 7 8: ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7 // left sub v3.8h, v3.8h, v5.8h // left-right sub v2.8h, v2.8h, v5.8h sub v1.8h, v1.8h, v5.8h sub 
v0.8h, v0.8h, v5.8h sqrdmulh v20.8h, v3.8h, v7.8h // ((left-right)*weights_hor + 128) >> 8 sqrdmulh v21.8h, v2.8h, v7.8h // (left flipped) sqrdmulh v22.8h, v1.8h, v7.8h sqrdmulh v23.8h, v0.8h, v7.8h add v20.8h, v20.8h, v5.8h add v21.8h, v21.8h, v5.8h add v22.8h, v22.8h, v5.8h add v23.8h, v23.8h, v5.8h st1 {v20.8h}, [x0], x1 st1 {v21.8h}, [x6], x1 subs w4, w4, #4 st1 {v22.8h}, [x0], x1 st1 {v23.8h}, [x6], x1 b.gt 8b ret 160: 320: 640: AARCH64_VALID_JUMP_TARGET sub x2, x2, #8 mov x7, #-8 // Set up pointers for four rows in parallel; x0, x6, x5, x10 add x5, x0, x1 add x10, x6, x1 lsl x1, x1, #1 sub x1, x1, w3, uxtw #1 mov w9, w3 1: ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7 // left sub v0.8h, v0.8h, v5.8h // left-right sub v1.8h, v1.8h, v5.8h sub v2.8h, v2.8h, v5.8h sub v3.8h, v3.8h, v5.8h 2: ld1 {v7.16b}, [x8], #16 // weights_hor ushll v6.8h, v7.8b, #7 // weights_hor << 7 ushll2 v7.8h, v7.16b, #7 sqrdmulh v20.8h, v3.8h, v6.8h // ((left-right)*weights_hor + 128) >> 8 sqrdmulh v21.8h, v3.8h, v7.8h // (left flipped) sqrdmulh v22.8h, v2.8h, v6.8h sqrdmulh v23.8h, v2.8h, v7.8h sqrdmulh v24.8h, v1.8h, v6.8h sqrdmulh v25.8h, v1.8h, v7.8h sqrdmulh v26.8h, v0.8h, v6.8h sqrdmulh v27.8h, v0.8h, v7.8h add v20.8h, v20.8h, v5.8h add v21.8h, v21.8h, v5.8h add v22.8h, v22.8h, v5.8h add v23.8h, v23.8h, v5.8h add v24.8h, v24.8h, v5.8h add v25.8h, v25.8h, v5.8h add v26.8h, v26.8h, v5.8h add v27.8h, v27.8h, v5.8h subs w3, w3, #16 st1 {v20.8h, v21.8h}, [x0], #32 st1 {v22.8h, v23.8h}, [x6], #32 st1 {v24.8h, v25.8h}, [x5], #32 st1 {v26.8h, v27.8h}, [x10], #32 b.gt 2b subs w4, w4, #4 b.le 9f sub x8, x8, w9, uxtw add x0, x0, x1 add x6, x6, x1 add x5, x5, x1 add x10, x10, x1 mov w3, w9 b 1b 9: ret L(ipred_smooth_h_tbl): .hword L(ipred_smooth_h_tbl) - 640b .hword L(ipred_smooth_h_tbl) - 320b .hword L(ipred_smooth_h_tbl) - 160b .hword L(ipred_smooth_h_tbl) - 80b .hword L(ipred_smooth_h_tbl) - 40b endfunc const padding_mask_buf .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 padding_mask: .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff endconst // void ipred_z1_upsample_edge_16bpc_neon(pixel *out, const int hsz, // const pixel *const in, const int end, // const int bitdepth_max); function ipred_z1_upsample_edge_16bpc_neon, export=1 dup v30.8h, w4 // bitdepth_max movrel x4, padding_mask ld1 {v0.8h, v1.8h}, [x2] // in[] add x5, x2, w3, uxtw #1 // in[end] sub x4, x4, w3, uxtw #1 ld1r {v2.8h}, [x5] // padding ld1 {v3.8h, v4.8h}, [x4] // padding_mask movi v31.8h, #9 bit v0.16b, v2.16b, v3.16b // padded in[] bit v1.16b, v2.16b, v4.16b ext v4.16b, v0.16b, v1.16b, #2 ext v5.16b, v1.16b, v2.16b, #2 ext v6.16b, v0.16b, v1.16b, #4 ext v7.16b, v1.16b, v2.16b, #4 ext v16.16b, v0.16b, v1.16b, #6 ext v17.16b, v1.16b, v2.16b, #6 add v18.8h, v4.8h, v6.8h // in[i+1] + in[i+2] add v19.8h, v5.8h, v7.8h add v20.8h, v0.8h, v16.8h add v21.8h, v1.8h, v17.8h umull v22.4s, v18.4h, v31.4h // 9*(in[i+1] + in[i+2]) umull2 v23.4s, v18.8h, v31.8h umull v24.4s, v19.4h, v31.4h umull2 v25.4s, v19.8h, v31.8h usubw v22.4s, 
v22.4s, v20.4h usubw2 v23.4s, v23.4s, v20.8h usubw v24.4s, v24.4s, v21.4h usubw2 v25.4s, v25.4s, v21.8h sqrshrun v16.4h, v22.4s, #4 sqrshrun2 v16.8h, v23.4s, #4 sqrshrun v17.4h, v24.4s, #4 sqrshrun2 v17.8h, v25.4s, #4 smin v16.8h, v16.8h, v30.8h smin v17.8h, v17.8h, v30.8h zip1 v0.8h, v4.8h, v16.8h zip2 v1.8h, v4.8h, v16.8h zip1 v2.8h, v5.8h, v17.8h zip2 v3.8h, v5.8h, v17.8h st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0] ret endfunc // void ipred_z2_upsample_edge_16bpc_neon(pixel *out, const int sz, // const pixel *const in, // const int bitdepth_max); function ipred_z2_upsample_edge_16bpc_neon, export=1 dup v30.8h, w3 // bitdepth_max // Here, sz is 4 or 8, and we produce 2*sz+1 output elements. movrel x4, padding_mask ld1 {v0.8h, v1.8h}, [x2] // in[] add x5, x2, w1, uxtw #1 // in[sz] sub x4, x4, w1, uxtw #1 ld1r {v3.8h}, [x2] // in[0] for padding ld1r {v2.8h}, [x5] // padding ld1 {v4.8h, v5.8h}, [x4] // padding_mask movi v31.8h, #9 bit v0.16b, v2.16b, v4.16b // padded in[] bit v1.16b, v2.16b, v5.16b ext v4.16b, v3.16b, v0.16b, #14 ext v5.16b, v0.16b, v1.16b, #2 ext v6.16b, v0.16b, v1.16b, #4 add v16.8h, v0.8h, v5.8h // in[i+0] + in[i+1] add v17.8h, v4.8h, v6.8h // in[i-1] + in[i+2] umull v18.4s, v16.4h, v31.4h // 9*(in[i+1] + in[i+2]) umull2 v19.4s, v16.8h, v31.8h usubw v18.4s, v18.4s, v17.4h usubw2 v19.4s, v19.4s, v17.8h sqrshrun v16.4h, v18.4s, #4 sqrshrun2 v16.8h, v19.4s, #4 add x5, x0, #2*16 smin v16.8h, v16.8h, v30.8h zip1 v4.8h, v0.8h, v16.8h zip2 v5.8h, v0.8h, v16.8h st1 {v2.h}[0], [x5] // In case sz=8, output one single pixel in out[16]. st1 {v4.8h, v5.8h}, [x0] ret endfunc const edge_filter .short 0, 4, 8, 0 .short 0, 5, 6, 0 // Leaving out the coeffs for strength=3 // .byte 2, 4, 4, 0 endconst // void ipred_z1_filter_edge_16bpc_neon(pixel *out, const int sz, // const pixel *const in, const int end, // const int strength); function ipred_z1_filter_edge_16bpc_neon, export=1 cmp w4, #3 b.eq L(fivetap) // if (strength == 3) goto fivetap movrel x5, edge_filter, -6 add x5, x5, w4, uxtw #3 // edge_filter + 2*((strength - 1)*4 + 1) ld1 {v31.s}[0], [x5] // kernel[1-2] ld1 {v0.8h}, [x2], #16 dup v30.8h, v31.h[0] dup v31.8h, v31.h[1] 1: // in[end], is the last valid pixel. We produce 16 pixels out by // using 18 pixels in - the last pixel used is [17] of the ones // read/buffered. 
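// Editorial sketch (not from the upstream source): for strength 1 and 2 the
// loop below applies a symmetric 3-tap smoothing along the edge, roughly
//   out[i] = (k1*a + k2*b + k1*c + 8) >> 4
// where a, b, c are three consecutive (padded) input pixels and
// (k1, k2) = (4, 8) for strength 1 or (5, 6) for strength 2, taken from the
// edge_filter table above; the taps always sum to 16. Strength 3 instead
// takes the L(fivetap) path further down, using taps 2/4/4/4/2 over 16.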
cmp w3, #17 ld1 {v1.8h, v2.8h}, [x2], #32 b.lt 2f ext v3.16b, v0.16b, v1.16b, #2 ext v4.16b, v1.16b, v2.16b, #2 ext v5.16b, v0.16b, v1.16b, #4 ext v6.16b, v1.16b, v2.16b, #4 mul v16.8h, v0.8h, v30.8h mla v16.8h, v3.8h, v31.8h mla v16.8h, v5.8h, v30.8h mul v17.8h, v1.8h, v30.8h mla v17.8h, v4.8h, v31.8h mla v17.8h, v6.8h, v30.8h subs w1, w1, #16 mov v0.16b, v2.16b urshr v16.8h, v16.8h, #4 urshr v17.8h, v17.8h, #4 sub w3, w3, #16 st1 {v16.8h, v17.8h}, [x0], #32 b.gt 1b ret 2: // Right padding // x2[w3-24] is the padding pixel (x2 points 24 pixels ahead) movrel x5, padding_mask sub w6, w3, #24 sub x5, x5, w3, uxtw #1 add x6, x2, w6, sxtw #1 ld1 {v3.8h, v4.8h}, [x5] // padding_mask ld1r {v2.8h}, [x6] bit v0.16b, v2.16b, v3.16b // Pad v0-v1 bit v1.16b, v2.16b, v4.16b // Filter one block ext v3.16b, v0.16b, v1.16b, #2 ext v4.16b, v1.16b, v2.16b, #2 ext v5.16b, v0.16b, v1.16b, #4 ext v6.16b, v1.16b, v2.16b, #4 mul v16.8h, v0.8h, v30.8h mla v16.8h, v3.8h, v31.8h mla v16.8h, v5.8h, v30.8h mul v17.8h, v1.8h, v30.8h mla v17.8h, v4.8h, v31.8h mla v17.8h, v6.8h, v30.8h subs w1, w1, #16 urshr v16.8h, v16.8h, #4 urshr v17.8h, v17.8h, #4 st1 {v16.8h, v17.8h}, [x0], #32 b.le 9f 5: // After one block, any remaining output would only be filtering // padding - thus just store the padding. subs w1, w1, #16 st1 {v2.16b}, [x0], #16 b.gt 5b 9: ret L(fivetap): sub x2, x2, #2 // topleft -= 1 pixel movi v29.8h, #2 ld1 {v0.8h}, [x2], #16 movi v30.8h, #4 movi v31.8h, #4 ins v0.h[0], v0.h[1] 1: // in[end+1], is the last valid pixel. We produce 16 pixels out by // using 20 pixels in - the last pixel used is [19] of the ones // read/buffered. cmp w3, #18 ld1 {v1.8h, v2.8h}, [x2], #32 b.lt 2f // if (end + 1 < 19) ext v3.16b, v0.16b, v1.16b, #2 ext v4.16b, v1.16b, v2.16b, #2 ext v5.16b, v0.16b, v1.16b, #4 ext v6.16b, v1.16b, v2.16b, #4 ext v16.16b, v0.16b, v1.16b, #6 ext v17.16b, v1.16b, v2.16b, #6 ext v18.16b, v0.16b, v1.16b, #8 ext v19.16b, v1.16b, v2.16b, #8 mul v20.8h, v0.8h, v29.8h mla v20.8h, v3.8h, v30.8h mla v20.8h, v5.8h, v31.8h mla v20.8h, v16.8h, v30.8h mla v20.8h, v18.8h, v29.8h mul v21.8h, v1.8h, v29.8h mla v21.8h, v4.8h, v30.8h mla v21.8h, v6.8h, v31.8h mla v21.8h, v17.8h, v30.8h mla v21.8h, v19.8h, v29.8h subs w1, w1, #16 mov v0.16b, v2.16b urshr v20.8h, v20.8h, #4 urshr v21.8h, v21.8h, #4 sub w3, w3, #16 st1 {v20.8h, v21.8h}, [x0], #32 b.gt 1b ret 2: // Right padding // x2[w3+1-24] is the padding pixel (x2 points 24 pixels ahead) movrel x5, padding_mask, -2 sub w6, w3, #23 sub x5, x5, w3, uxtw #1 add x6, x2, w6, sxtw #1 ld1 {v3.8h, v4.8h, v5.8h}, [x5] // padding_mask ld1r {v28.8h}, [x6] bit v0.16b, v28.16b, v3.16b // Pad v0-v2 bit v1.16b, v28.16b, v4.16b bit v2.16b, v28.16b, v5.16b 4: // Filter one block ext v3.16b, v0.16b, v1.16b, #2 ext v4.16b, v1.16b, v2.16b, #2 ext v5.16b, v0.16b, v1.16b, #4 ext v6.16b, v1.16b, v2.16b, #4 ext v16.16b, v0.16b, v1.16b, #6 ext v17.16b, v1.16b, v2.16b, #6 ext v18.16b, v0.16b, v1.16b, #8 ext v19.16b, v1.16b, v2.16b, #8 mul v20.8h, v0.8h, v29.8h mla v20.8h, v3.8h, v30.8h mla v20.8h, v5.8h, v31.8h mla v20.8h, v16.8h, v30.8h mla v20.8h, v18.8h, v29.8h mul v21.8h, v1.8h, v29.8h mla v21.8h, v4.8h, v30.8h mla v21.8h, v6.8h, v31.8h mla v21.8h, v17.8h, v30.8h mla v21.8h, v19.8h, v29.8h subs w1, w1, #16 mov v0.16b, v2.16b mov v1.16b, v28.16b mov v2.16b, v28.16b urshr v20.8h, v20.8h, #4 urshr v21.8h, v21.8h, #4 sub w3, w3, #16 st1 {v20.8h, v21.8h}, [x0], #32 b.le 9f // v0-v1[w3+1] is the last valid pixel; if (w3 + 1 > 0) we need to // filter properly once more - aka (w3 >= 0). 
cmp w3, #0 b.ge 4b 5: // When w3 <= 0, all remaining pixels in v0-v1 are equal to the // last valid pixel - thus just output that without filtering. subs w1, w1, #8 st1 {v28.8h}, [x0], #16 b.gt 5b 9: ret endfunc // void ipred_pixel_set_16bpc_neon(pixel *out, const pixel px, // const int n); function ipred_pixel_set_16bpc_neon, export=1 dup v0.8h, w1 1: subs w2, w2, #8 st1 {v0.8h}, [x0], #16 b.gt 1b ret endfunc // void ipred_z1_fill1_16bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const top, // const int width, const int height, // const int dx, const int max_base_x); function ipred_z1_fill1_16bpc_neon, export=1 clz w9, w3 adr x8, L(ipred_z1_fill1_tbl) sub w9, w9, #25 ldrh w9, [x8, w9, uxtw #1] add x10, x2, w6, uxtw #1 // top[max_base_x] sub x8, x8, w9, uxtw ld1r {v31.8h}, [x10] // padding mov w7, w5 mov w15, #64 br x8 40: AARCH64_VALID_JUMP_TARGET 4: lsr w8, w7, #6 // base and w9, w7, #0x3e // frac add w7, w7, w5 // xpos += dx cmp w8, w6 // base >= max_base_x lsr w10, w7, #6 // base and w11, w7, #0x3e // frac b.ge 49f lsl w8, w8, #1 lsl w10, w10, #1 ldr q0, [x2, w8, uxtw] // top[base] ldr q2, [x2, w10, uxtw] dup v4.4h, w9 // frac dup v5.4h, w11 ext v1.16b, v0.16b, v0.16b, #2 // top[base+1] ext v3.16b, v2.16b, v2.16b, #2 sub v6.4h, v1.4h, v0.4h // top[base+1]-top[base] sub v7.4h, v3.4h, v2.4h ushll v16.4s, v0.4h, #6 // top[base]*64 ushll v17.4s, v2.4h, #6 smlal v16.4s, v6.4h, v4.4h // + top[base+1]*frac smlal v17.4s, v7.4h, v5.4h rshrn v16.4h, v16.4s, #6 rshrn v17.4h, v17.4s, #6 st1 {v16.4h}, [x0], x1 add w7, w7, w5 // xpos += dx subs w4, w4, #2 st1 {v17.4h}, [x0], x1 b.gt 4b ret 49: st1 {v31.4h}, [x0], x1 subs w4, w4, #2 st1 {v31.4h}, [x0], x1 b.gt 49b ret 80: AARCH64_VALID_JUMP_TARGET 8: lsr w8, w7, #6 // base and w9, w7, #0x3e // frac add w7, w7, w5 // xpos += dx cmp w8, w6 // base >= max_base_x lsr w10, w7, #6 // base and w11, w7, #0x3e // frac b.ge 89f add x8, x2, w8, uxtw #1 add x10, x2, w10, uxtw #1 dup v4.8h, w9 // frac dup v5.8h, w11 ld1 {v0.8h}, [x8] // top[base] ld1 {v2.8h}, [x10] sub w9, w15, w9 // 64 - frac sub w11, w15, w11 ldr h1, [x8, #16] ldr h3, [x10, #16] dup v6.8h, w9 // 64 - frac dup v7.8h, w11 ext v1.16b, v0.16b, v1.16b, #2 // top[base+1] ext v3.16b, v2.16b, v3.16b, #2 umull v16.4s, v0.4h, v6.4h // top[base]*(64-frac) umlal v16.4s, v1.4h, v4.4h // + top[base+1]*frac umull2 v17.4s, v0.8h, v6.8h umlal2 v17.4s, v1.8h, v4.8h umull v18.4s, v2.4h, v7.4h umlal v18.4s, v3.4h, v5.4h umull2 v19.4s, v2.8h, v7.8h umlal2 v19.4s, v3.8h, v5.8h rshrn v16.4h, v16.4s, #6 rshrn2 v16.8h, v17.4s, #6 rshrn v17.4h, v18.4s, #6 rshrn2 v17.8h, v19.4s, #6 st1 {v16.8h}, [x0], x1 add w7, w7, w5 // xpos += dx subs w4, w4, #2 st1 {v17.8h}, [x0], x1 b.gt 8b ret 89: st1 {v31.8h}, [x0], x1 subs w4, w4, #2 st1 {v31.8h}, [x0], x1 b.gt 89b ret 160: 320: 640: AARCH64_VALID_JUMP_TARGET mov w12, w3 add x13, x0, x1 lsl x1, x1, #1 sub x1, x1, w3, uxtw #1 1: lsr w8, w7, #6 // base and w9, w7, #0x3e // frac add w7, w7, w5 // xpos += dx cmp w8, w6 // base >= max_base_x lsr w10, w7, #6 // base and w11, w7, #0x3e // frac b.ge 169f add x8, x2, w8, uxtw #1 add x10, x2, w10, uxtw #1 dup v6.8h, w9 // frac dup v7.8h, w11 ld1 {v0.8h, v1.8h, v2.8h}, [x8], #48 // top[base] ld1 {v3.8h, v4.8h, v5.8h}, [x10], #48 sub w9, w15, w9 // 64 - frac sub w11, w15, w11 dup v16.8h, w9 // 64 - frac dup v17.8h, w11 add w7, w7, w5 // xpos += dx 2: ext v18.16b, v0.16b, v1.16b, #2 // top[base+1] ext v19.16b, v1.16b, v2.16b, #2 ext v20.16b, v3.16b, v4.16b, #2 ext v21.16b, v4.16b, v5.16b, #2 subs w3, w3, #16 umull v22.4s, v0.4h, 
v16.4h // top[base]*(64-frac) umlal v22.4s, v18.4h, v6.4h // + top[base+1]*frac umull2 v23.4s, v0.8h, v16.8h umlal2 v23.4s, v18.8h, v6.8h umull v24.4s, v1.4h, v16.4h umlal v24.4s, v19.4h, v6.4h umull2 v25.4s, v1.8h, v16.8h umlal2 v25.4s, v19.8h, v6.8h umull v26.4s, v3.4h, v17.4h umlal v26.4s, v20.4h, v7.4h umull2 v27.4s, v3.8h, v17.8h umlal2 v27.4s, v20.8h, v7.8h umull v28.4s, v4.4h, v17.4h umlal v28.4s, v21.4h, v7.4h umull2 v29.4s, v4.8h, v17.8h umlal2 v29.4s, v21.8h, v7.8h rshrn v22.4h, v22.4s, #6 rshrn2 v22.8h, v23.4s, #6 rshrn v23.4h, v24.4s, #6 rshrn2 v23.8h, v25.4s, #6 rshrn v24.4h, v26.4s, #6 rshrn2 v24.8h, v27.4s, #6 rshrn v25.4h, v28.4s, #6 rshrn2 v25.8h, v29.4s, #6 st1 {v22.8h, v23.8h}, [x0], #32 st1 {v24.8h, v25.8h}, [x13], #32 b.le 3f mov v0.16b, v2.16b ld1 {v1.8h, v2.8h}, [x8], #32 // top[base] mov v3.16b, v5.16b ld1 {v4.8h, v5.8h}, [x10], #32 b 2b 3: subs w4, w4, #2 b.le 9f add x0, x0, x1 add x13, x13, x1 mov w3, w12 b 1b 9: ret 169: st1 {v31.8h}, [x0], #16 subs w3, w3, #8 st1 {v31.8h}, [x13], #16 b.gt 169b subs w4, w4, #2 b.le 9b add x0, x0, x1 add x13, x13, x1 mov w3, w12 b 169b L(ipred_z1_fill1_tbl): .hword L(ipred_z1_fill1_tbl) - 640b .hword L(ipred_z1_fill1_tbl) - 320b .hword L(ipred_z1_fill1_tbl) - 160b .hword L(ipred_z1_fill1_tbl) - 80b .hword L(ipred_z1_fill1_tbl) - 40b endfunc function ipred_z1_fill2_16bpc_neon, export=1 cmp w3, #8 add x10, x2, w6, uxtw // top[max_base_x] ld1r {v31.16b}, [x10] // padding mov w7, w5 mov w15, #64 b.eq 8f 4: // w == 4 lsr w8, w7, #6 // base and w9, w7, #0x3e // frac add w7, w7, w5 // xpos += dx cmp w8, w6 // base >= max_base_x lsr w10, w7, #6 // base and w11, w7, #0x3e // frac b.ge 49f lsl w8, w8, #1 lsl w10, w10, #1 ldr q0, [x2, w8, uxtw] // top[base] ldr q2, [x2, w10, uxtw] dup v4.4h, w9 // frac dup v5.4h, w11 uzp2 v1.8h, v0.8h, v0.8h // top[base+1] uzp1 v0.8h, v0.8h, v0.8h // top[base] uzp2 v3.8h, v2.8h, v2.8h uzp1 v2.8h, v2.8h, v2.8h sub v6.4h, v1.4h, v0.4h // top[base+1]-top[base] sub v7.4h, v3.4h, v2.4h ushll v16.4s, v0.4h, #6 // top[base]*64 ushll v17.4s, v2.4h, #6 smlal v16.4s, v6.4h, v4.4h // + top[base+1]*frac smlal v17.4s, v7.4h, v5.4h rshrn v16.4h, v16.4s, #6 rshrn v17.4h, v17.4s, #6 st1 {v16.4h}, [x0], x1 add w7, w7, w5 // xpos += dx subs w4, w4, #2 st1 {v17.4h}, [x0], x1 b.gt 4b ret 49: st1 {v31.4h}, [x0], x1 subs w4, w4, #2 st1 {v31.4h}, [x0], x1 b.gt 49b ret 8: // w == 8 lsr w8, w7, #6 // base and w9, w7, #0x3e // frac add w7, w7, w5 // xpos += dx cmp w8, w6 // base >= max_base_x lsr w10, w7, #6 // base and w11, w7, #0x3e // frac b.ge 89f add x8, x2, w8, uxtw #1 add x10, x2, w10, uxtw #1 dup v4.8h, w9 // frac dup v5.8h, w11 ld1 {v0.8h, v1.8h}, [x8] // top[base] ld1 {v2.8h, v3.8h}, [x10] sub w9, w15, w9 // 64 - frac sub w11, w15, w11 dup v6.8h, w9 // 64 - frac dup v7.8h, w11 uzp2 v20.8h, v0.8h, v1.8h // top[base+1] uzp1 v0.8h, v0.8h, v1.8h // top[base] uzp2 v21.8h, v2.8h, v3.8h uzp1 v2.8h, v2.8h, v3.8h umull v16.4s, v0.4h, v6.4h // top[base]*(64-frac) umlal v16.4s, v20.4h, v4.4h // + top[base+1]*frac umull2 v17.4s, v0.8h, v6.8h umlal2 v17.4s, v20.8h, v4.8h umull v18.4s, v2.4h, v7.4h umlal v18.4s, v21.4h, v5.4h umull2 v19.4s, v2.8h, v7.8h umlal2 v19.4s, v21.8h, v5.8h rshrn v16.4h, v16.4s, #6 rshrn2 v16.8h, v17.4s, #6 rshrn v17.4h, v18.4s, #6 rshrn2 v17.8h, v19.4s, #6 st1 {v16.8h}, [x0], x1 add w7, w7, w5 // xpos += dx subs w4, w4, #2 st1 {v17.8h}, [x0], x1 b.gt 8b ret 89: st1 {v31.8h}, [x0], x1 subs w4, w4, #2 st1 {v31.8h}, [x0], x1 b.gt 89b ret endfunc // void ipred_reverse_16bpc_neon(pixel *dst, const pixel *const src, 
// const int n); function ipred_reverse_16bpc_neon, export=1 sub x1, x1, #16 add x3, x0, #8 mov x4, #16 1: ld1 {v0.8h}, [x1] subs w2, w2, #8 rev64 v0.8h, v0.8h sub x1, x1, #16 st1 {v0.d}[1], [x0], x4 st1 {v0.d}[0], [x3], x4 b.gt 1b ret endfunc const increments .short 0, 1, 2, 3, 4, 5, 6, 7 endconst // void ipred_z2_fill1_16bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const top, // const pixel *const left, // const int width, const int height, // const int dx, const int dy); function ipred_z2_fill1_16bpc_neon, export=1 clz w10, w4 adr x9, L(ipred_z2_fill1_tbl) sub w10, w10, #25 ldrh w10, [x9, w10, uxtw #1] mov w8, #(1 << 6) // xpos = 1 << 6 sub x9, x9, w10, uxtw sub w8, w8, w6 // xpos -= dx movrel x11, increments ld1 {v31.8h}, [x11] // increments neg w7, w7 // -dy br x9 40: AARCH64_VALID_JUMP_TARGET dup v30.4h, w7 // -dy movi v17.8b, #1 mul v16.4h, v31.4h, v30.4h // {0,1,2,3}* -dy movi v25.8h, #0x3e add v30.4h, v16.4h, v30.4h // -= dy // Worst case height for w=4 is 16, but we need at least h+1 elements ld1 {v0.8h, v1.8h, v2.8h}, [x3] // left[] movi v26.8h, #64 movi v19.16b, #4 shrn v29.8b, v30.8h, #6 // ypos >> 6 and v27.8b, v30.8b, v25.8b // frac_y add v29.8b, v29.8b, v17.8b // base_y = (ypos >> 6) + 1 movi v23.4h, #1, lsl #8 shl v29.8b, v29.8b, #1 // 2*base_y zip1 v29.8b, v29.8b, v29.8b // duplicate elements movi v17.8b, #2 add v29.8b, v29.8b, v23.8b // 2*base, 2*base+1, ... add v30.8b, v29.8b, v17.8b // base_y + 1 (*2) add v28.8b, v29.8b, v19.8b // base_y + 2 (*2) tbl v18.8b, {v0.16b}, v29.8b // left[base_y] trn1 v30.2d, v30.2d, v28.2d // base_y + 1, base_y + 2 sub v28.4h, v26.4h, v27.4h // 64 - frac_y trn1 v31.2d, v31.2d, v31.2d // {0,1,2,3,0,1,2,3} trn1 v27.2d, v27.2d, v27.2d // frac_y trn1 v28.2d, v28.2d, v28.2d // 64 - frac_y movi v29.16b, #4 4: asr w9, w8, #6 // base_x dup v16.4h, w8 // xpos sub w8, w8, w6 // xpos -= dx cmp w9, #-4 // base_x <= -4 asr w11, w8, #6 // base_x b.le 49f lsl w9, w9, #1 lsl w11, w11, #1 dup v17.4h, w8 // xpos ldr q4, [x2, w9, sxtw] // top[base_x] ldr q6, [x2, w11, sxtw] trn1 v16.2d, v16.2d, v17.2d // xpos // Cut corners here; only doing tbl over v0-v1 here; we only // seem to need the last pixel, from v2, after skipping to the // left-only codepath below. 
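// Two rows are produced per iteration: both the left[]-based
// interpolation (left[base_y]*(64-frac_y) + left[base_y+1]*frac_y)
// and the top[]-based one are computed for every lane, then the
// cmge/bit pair below keeps the top[] result wherever that lane's
// actual base_x is still >= 0, and the left[] result elsewhere.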
tbl v19.16b, {v0.16b, v1.16b}, v30.16b // left[base_y+1], left[base_y+2] sshr v20.8h, v16.8h, #6 // first base_x for each row ext v5.16b, v4.16b, v4.16b, #2 // top[base_x+1] ext v7.16b, v6.16b, v6.16b, #2 and v16.16b, v16.16b, v25.16b // frac_x trn1 v18.2d, v18.2d, v19.2d // left[base_y], left[base_y+1] trn1 v4.2d, v4.2d, v6.2d // top[base_x] trn1 v5.2d, v5.2d, v7.2d // top[base_x+1] sub v17.8h, v26.8h, v16.8h // 64 - frac_x add v20.8h, v20.8h, v31.8h // actual base_x umull v21.4s, v18.4h, v28.4h // left[base_y]*(64-frac_y) umlal v21.4s, v19.4h, v27.4h // + left[base_y+1]*frac_y umull2 v22.4s, v18.8h, v28.8h umlal2 v22.4s, v19.8h, v27.8h umull v23.4s, v4.4h, v17.4h // top[base_x]-*(64-frac_x) umlal v23.4s, v5.4h, v16.4h // + top[base_x+1]*frac_x umull2 v24.4s, v4.8h, v17.8h umlal2 v24.4s, v5.8h, v16.8h cmge v20.8h, v20.8h, #0 rshrn v21.4h, v21.4s, #6 rshrn2 v21.8h, v22.4s, #6 rshrn v22.4h, v23.4s, #6 rshrn2 v22.8h, v24.4s, #6 bit v21.16b, v22.16b, v20.16b st1 {v21.d}[0], [x0], x1 sub w8, w8, w6 // xpos -= dx subs w5, w5, #2 st1 {v21.d}[1], [x0], x1 b.le 9f ext v18.16b, v19.16b, v19.16b, #8 add v30.16b, v30.16b, v29.16b // base_y += 2 (*2) b 4b 49: tbl v19.16b, {v0.16b, v1.16b, v2.16b}, v30.16b // left[base_y+1], left[base_y+2] trn1 v18.2d, v18.2d, v19.2d // left[base_y], left[base_y+1] umull v20.4s, v18.4h, v28.4h // left[base_y]*(64-frac_y) umlal v20.4s, v19.4h, v27.4h // + left[base_y+1]*frac_y umull2 v21.4s, v18.8h, v28.8h umlal2 v21.4s, v19.8h, v27.8h rshrn v20.4h, v20.4s, #6 rshrn2 v20.8h, v21.4s, #6 st1 {v20.d}[0], [x0], x1 subs w5, w5, #2 st1 {v20.d}[1], [x0], x1 b.le 9f ext v18.16b, v19.16b, v19.16b, #8 add v30.16b, v30.16b, v29.16b // base_y += 2 (*2) b 49b 9: ret 80: AARCH64_VALID_JUMP_TARGET stp d8, d9, [sp, #-0x40]! stp d10, d11, [sp, #0x10] stp d12, d13, [sp, #0x20] stp d14, d15, [sp, #0x30] dup v18.8h, w7 // -dy add x3, x3, #2 // Skip past left[0] mul v16.8h, v31.8h, v18.8h // {0,1,2,3,4,5,6,7}* -dy movi v25.8h, #0x3e add v16.8h, v16.8h, v18.8h // -= dy // Worst case height for w=8 is 32. ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x3] // left[] ld1r {v15.8h}, [x2] // left[0] == top[0] movi v26.8h, #64 movi v19.16b, #4 shrn v29.8b, v16.8h, #6 // ypos >> 6 and v27.16b, v16.16b, v25.16b // frac_y movi v23.8h, #1, lsl #8 shl v29.8b, v29.8b, #1 // 2*base_y mov v18.16b, v15.16b // left[0] zip1 v29.16b, v29.16b, v29.16b // duplicate elements movi v17.16b, #2 add v29.16b, v29.16b, v23.16b // 2*base, 2*base+1, ... // Cut corners here; for the first row we don't expect to need to // read outside of v0. 
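// tbx (rather than tbl) is used here so that lanes whose byte index
// falls outside v0 keep the value preloaded into v18, i.e. left[0]
// (== top[0]).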
tbx v18.16b, {v0.16b}, v29.16b // left[base_y] add v30.16b, v29.16b, v19.16b // base_y + 2 (*2) add v29.16b, v29.16b, v17.16b // base_y + 1 (*2) sub v28.8h, v26.8h, v27.8h // 64 - frac_y movi v24.16b, #4 8: asr w9, w8, #6 // base_x dup v16.8h, w8 // xpos sub w8, w8, w6 // xpos -= dx cmp w9, #-16 // base_x <= -16 asr w11, w8, #6 // base_x b.le 89f dup v17.8h, w8 // xpos add x9, x2, w9, sxtw #1 add x11, x2, w11, sxtw #1 ld1 {v4.8h, v5.8h}, [x9] // top[base_x] mov v19.16b, v15.16b // left[0] ld1 {v6.8h, v7.8h}, [x11] tbx v19.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+1] mov v20.16b, v15.16b // left[0] sshr v21.8h, v16.8h, #6 // first base_x sshr v22.8h, v17.8h, #6 tbx v20.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v30.16b // left[base_y+2] ext v5.16b, v4.16b, v5.16b, #2 // top[base_x+1] ext v7.16b, v6.16b, v7.16b, #2 and v16.16b, v16.16b, v25.16b // frac_x and v17.16b, v17.16b, v25.16b umull v10.4s, v18.4h, v28.4h // left[base_y]*(64-frac_y) umlal v10.4s, v19.4h, v27.4h // + left[base_y+1]*frac_y sub v8.8h, v26.8h, v16.8h // 64 - frac_x sub v9.8h, v26.8h, v17.8h umull2 v11.4s, v18.8h, v28.8h umlal2 v11.4s, v19.8h, v27.8h add v21.8h, v21.8h, v31.8h // actual base_x add v22.8h, v22.8h, v31.8h umull v12.4s, v19.4h, v28.4h umlal v12.4s, v20.4h, v27.4h umull2 v13.4s, v19.8h, v28.8h umlal2 v13.4s, v20.8h, v27.8h rshrn v10.4h, v10.4s, #6 rshrn2 v10.8h, v11.4s, #6 rshrn v11.4h, v12.4s, #6 rshrn2 v11.8h, v13.4s, #6 umull v12.4s, v4.4h, v8.4h // top[base_x]-*(64-frac_x) umlal v12.4s, v5.4h, v16.4h // + top[base_x+1]*frac_x umull2 v13.4s, v4.8h, v8.8h umlal2 v13.4s, v5.8h, v16.8h umull v14.4s, v6.4h, v9.4h umlal v14.4s, v7.4h, v17.4h umull2 v18.4s, v6.8h, v9.8h umlal2 v18.4s, v7.8h, v17.8h cmge v21.8h, v21.8h, #0 cmge v22.8h, v22.8h, #0 rshrn v12.4h, v12.4s, #6 rshrn2 v12.8h, v13.4s, #6 rshrn v13.4h, v14.4s, #6 rshrn2 v13.8h, v18.4s, #6 bit v10.16b, v12.16b, v21.16b bit v11.16b, v13.16b, v22.16b st1 {v10.8h}, [x0], x1 subs w5, w5, #2 sub w8, w8, w6 // xpos -= dx st1 {v11.8h}, [x0], x1 b.le 9f mov v18.16b, v20.16b add v29.16b, v29.16b, v24.16b // base_y += 2 (*2) add v30.16b, v30.16b, v24.16b // base_y += 2 (*2) b 8b 89: mov v19.16b, v15.16b mov v20.16b, v15.16b tbx v19.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+1] tbx v20.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v30.16b // left[base_y+2] umull v4.4s, v18.4h, v28.4h // left[base_y]*(64-frac_y) umlal v4.4s, v19.4h, v27.4h // + left[base_y+1]*frac_y umull2 v5.4s, v18.8h, v28.8h umlal2 v5.4s, v19.8h, v27.8h umull v6.4s, v19.4h, v28.4h umlal v6.4s, v20.4h, v27.4h umull2 v7.4s, v19.8h, v28.8h umlal2 v7.4s, v20.8h, v27.8h rshrn v4.4h, v4.4s, #6 rshrn2 v4.8h, v5.4s, #6 rshrn v5.4h, v6.4s, #6 rshrn2 v5.8h, v7.4s, #6 st1 {v4.8h}, [x0], x1 subs w5, w5, #2 st1 {v5.8h}, [x0], x1 b.le 9f mov v18.16b, v20.16b add v29.16b, v29.16b, v24.16b // base_y += 2 (*2) add v30.16b, v30.16b, v24.16b // base_y += 2 (*2) b 89b 9: ldp d14, d15, [sp, #0x30] ldp d12, d13, [sp, #0x20] ldp d10, d11, [sp, #0x10] ldp d8, d9, [sp], 0x40 ret 160: 320: 640: AARCH64_VALID_JUMP_TARGET stp d8, d9, [sp, #-0x40]! 
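// The wide path below uses v8-v15 as scratch; their low halves
// (d8-d15) are callee-saved in the AArch64 ABI, so spill them here.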
stp d10, d11, [sp, #0x10] stp d12, d13, [sp, #0x20] stp d14, d15, [sp, #0x30] dup v25.8h, w7 // -dy add x3, x3, #2 // Skip past left[0] add x13, x0, x1 // alternating row lsl x1, x1, #1 // stride *= 2 sub x1, x1, w4, uxtw #1 // stride -= width movi v11.8h, #8 mul v26.8h, v31.8h, v25.8h // {0,1,2,3,4,5,6,7}* -dy add v26.8h, v26.8h, v25.8h // -= dy mul v25.8h, v25.8h, v11.8h // -8*dy // Worst case height is 64, but we can only fit 32 pixels into // v0-v3 usable within one tbx instruction. As long as base_y is // up to 32, we use tbx. ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x3] // left[] ld1r {v15.8h}, [x2] // left[0] == top[0] mov w12, w4 // orig w neg w14, w4 // -w 1: mov v23.16b, v26.16b // reset ypos asr w9, w8, #6 // base_x dup v16.8h, w8 // xpos sub w8, w8, w6 // xpos -= dx cmp w9, w14 // base_x <= -2*w asr w11, w8, #6 // base_x b.le 169f dup v17.8h, w8 // xpos sub w8, w8, w6 // xpos -= dx add x9, x2, w9, sxtw #1 add x11, x2, w11, sxtw #1 sshr v21.8h, v16.8h, #6 // first base_x sshr v22.8h, v17.8h, #6 ld1 {v4.8h}, [x9], #16 // top[base_x] ld1 {v6.8h}, [x11], #16 movi v10.8h, #0x3e movi v11.8h, #64 and v16.16b, v16.16b, v10.16b // frac_x and v17.16b, v17.16b, v10.16b sub v8.8h, v11.8h, v16.8h // 64 - frac_x sub v9.8h, v11.8h, v17.8h add v21.8h, v21.8h, v31.8h // actual base_x add v22.8h, v22.8h, v31.8h 2: smov w10, v22.h[0] shrn v29.8b, v23.8h, #6 // ypos >> 6 movi v12.8h, #64 cmp w10, #0 // base_x (bottom left) >= 0 smov w10, v29.b[0] // base_y[0] movi v10.8h, #0x3e b.ge 4f and v27.16b, v23.16b, v10.16b // frac_y cmp w10, #(32-3) mov v18.16b, v15.16b // left[0] sub v28.8h, v12.8h, v27.8h // 64 - frac_y b.gt 22f 21: // base_y < 32, using tbx shl v29.8b, v29.8b, #1 // 2*base_y movi v11.8h, #1, lsl #8 zip1 v29.16b, v29.16b, v29.16b // duplicate elements add v29.16b, v29.16b, v11.16b // 2*base, 2*base+1, ... movi v13.16b, #2 tbx v18.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y] add v29.16b, v29.16b, v13.16b // base_y + 1 (*2) mov v19.16b, v15.16b // left[0] tbx v19.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+1] add v29.16b, v29.16b, v13.16b // base_y + 2 (*2) mov v20.16b, v15.16b // left[0] tbx v20.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+2] b 23f 22: // base_y >= 32, using separate loads. 
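// tbx can only index the 64 bytes (32 pixels) held in v0-v3, so for
// larger base_y the three consecutive left[] pixels needed per lane
// (base_y, base_y+1, base_y+2) are gathered with per-lane ld3 loads.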
smov w15, v29.b[1] smov w16, v29.b[2] add x10, x3, w10, sxtw #1 smov w17, v29.b[3] add x15, x3, w15, sxtw #1 ld3 {v18.h, v19.h, v20.h}[0], [x10] smov w10, v29.b[4] add x16, x3, w16, sxtw #1 ld3 {v18.h, v19.h, v20.h}[1], [x15] smov w15, v29.b[5] add x17, x3, w17, sxtw #1 ld3 {v18.h, v19.h, v20.h}[2], [x16] smov w16, v29.b[6] add x10, x3, w10, sxtw #1 ld3 {v18.h, v19.h, v20.h}[3], [x17] smov w17, v29.b[7] add x15, x3, w15, sxtw #1 add x16, x3, w16, sxtw #1 ld3 {v18.h, v19.h, v20.h}[4], [x10] add x17, x3, w17, sxtw #1 ld3 {v18.h, v19.h, v20.h}[5], [x15] ld3 {v18.h, v19.h, v20.h}[6], [x16] ld3 {v18.h, v19.h, v20.h}[7], [x17] 23: ld1 {v5.8h}, [x9], #16 // top[base_x] ld1 {v7.8h}, [x11], #16 add v23.8h, v23.8h, v25.8h // ypos -= 8*dy umull v10.4s, v18.4h, v28.4h // left[base_y]*(64-frac_y) umlal v10.4s, v19.4h, v27.4h // + left[base_y+1]*frac_y umull2 v11.4s, v18.8h, v28.8h umlal2 v11.4s, v19.8h, v27.8h umull v12.4s, v19.4h, v28.4h umlal v12.4s, v20.4h, v27.4h umull2 v13.4s, v19.8h, v28.8h umlal2 v13.4s, v20.8h, v27.8h ext v18.16b, v4.16b, v5.16b, #2 // top[base_x+1] ext v19.16b, v6.16b, v7.16b, #2 rshrn v10.4h, v10.4s, #6 rshrn2 v10.8h, v11.4s, #6 rshrn v11.4h, v12.4s, #6 rshrn2 v11.8h, v13.4s, #6 umull v12.4s, v4.4h, v8.4h // top[base_x]-*(64-frac_x) umlal v12.4s, v18.4h, v16.4h // + top[base_x+1]*frac_x umull2 v13.4s, v4.8h, v8.8h umlal2 v13.4s, v18.8h, v16.8h umull v14.4s, v6.4h, v9.4h umlal v14.4s, v19.4h, v17.4h umull2 v20.4s, v6.8h, v9.8h umlal2 v20.4s, v19.8h, v17.8h cmge v18.8h, v21.8h, #0 cmge v19.8h, v22.8h, #0 rshrn v12.4h, v12.4s, #6 rshrn2 v12.8h, v13.4s, #6 rshrn v13.4h, v14.4s, #6 rshrn2 v13.8h, v20.4s, #6 bit v10.16b, v12.16b, v18.16b bit v11.16b, v13.16b, v19.16b st1 {v10.8h}, [x0], #16 subs w4, w4, #8 st1 {v11.8h}, [x13], #16 b.le 3f movi v10.8h, #8 mov v4.16b, v5.16b mov v6.16b, v7.16b add v21.8h, v21.8h, v10.8h // base_x += 8 add v22.8h, v22.8h, v10.8h b 2b 3: subs w5, w5, #2 b.le 9f movi v10.8h, #128 add x0, x0, x1 add x13, x13, x1 mov w4, w12 // reset w add v26.8h, v26.8h, v10.8h // ypos += 2*(1<<6) b 1b 4: // The rest of the row only predicted from top[] ld1 {v5.8h}, [x9], #16 // top[base_x] ld1 {v7.8h}, [x11], #16 ext v18.16b, v4.16b, v5.16b, #2 // top[base_x+1] ext v19.16b, v6.16b, v7.16b, #2 umull v12.4s, v4.4h, v8.4h // top[base_x]-*(64-frac_x) umlal v12.4s, v18.4h, v16.4h // + top[base_x+1]*frac_x umull2 v13.4s, v4.8h, v8.8h umlal2 v13.4s, v18.8h, v16.8h umull v14.4s, v6.4h, v9.4h umlal v14.4s, v19.4h, v17.4h umull2 v20.4s, v6.8h, v9.8h umlal2 v20.4s, v19.8h, v17.8h rshrn v12.4h, v12.4s, #6 rshrn2 v12.8h, v13.4s, #6 rshrn v13.4h, v14.4s, #6 rshrn2 v13.8h, v20.4s, #6 st1 {v12.8h}, [x0], #16 subs w4, w4, #8 st1 {v13.8h}, [x13], #16 b.le 3b mov v4.16b, v5.16b mov v6.16b, v7.16b b 4b 169: // The rest of the block only predicted from left[] add x1, x1, w4, uxtw #1 // restore stride mov w12, w5 // orig remaining h 1: movi v12.8h, #64 movi v10.8h, #0x3e shrn v29.8b, v23.8h, #6 // ypos >> 6 and v27.16b, v23.16b, v10.16b // frac_y smov w10, v29.b[0] // base_y[0] shl v29.8b, v29.8b, #1 // 2*base_y movi v11.8h, #1, lsl #8 zip1 v29.16b, v29.16b, v29.16b // duplicate elements add v23.8h, v23.8h, v25.8h // ypos -= 8*dy add v29.16b, v29.16b, v11.16b // 2*base, 2*base+1, ... cmp w10, #(32-1) mov v18.16b, v15.16b // left[0] movi v21.16b, #2 sub v28.8h, v12.8h, v27.8h // 64 - frac_y b.gt 31f tbx v18.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y] add v29.16b, v29.16b, v21.16b // base_y + 1 (*2) 2: // base_y < 32, using tbx. 
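// Re-check the leading byte index every pass: once base_y+2 would
// reach past the end of the 64-byte v0-v3 table (the 64-4 compare
// below), branch to the separate-load path at 32f.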
smov w10, v29.b[0] // base_y[0] mov v19.16b, v15.16b // left[0] cmp w10, #(64-4) b.gt 32f tbx v19.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+1] add v29.16b, v29.16b, v21.16b // base_y + 2 (*2) mov v20.16b, v15.16b // left[0] tbx v20.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+2] add v29.16b, v29.16b, v21.16b // next base_y umull v10.4s, v18.4h, v28.4h // left[base_y]*(64-frac_y) umlal v10.4s, v19.4h, v27.4h // + left[base_y+1]*frac_y umull2 v11.4s, v18.8h, v28.8h umlal2 v11.4s, v19.8h, v27.8h umull v12.4s, v19.4h, v28.4h umlal v12.4s, v20.4h, v27.4h umull2 v13.4s, v19.8h, v28.8h umlal2 v13.4s, v20.8h, v27.8h rshrn v10.4h, v10.4s, #6 rshrn2 v10.8h, v11.4s, #6 rshrn v11.4h, v12.4s, #6 rshrn2 v11.8h, v13.4s, #6 st1 {v10.8h}, [x0], x1 subs w5, w5, #2 st1 {v11.8h}, [x13], x1 b.le 4f mov v18.16b, v20.16b b 2b 31: // base_y >= 32, using separate loads, loading v18 if we had to bail // in the prologue. smov w10, v29.b[0] smov w15, v29.b[2] movi v21.16b, #2 smov w16, v29.b[4] add x10, x3, w10, sxtw smov w17, v29.b[6] add x15, x3, w15, sxtw ld1 {v18.h}[0], [x10] smov w10, v29.b[8] add x16, x3, w16, sxtw ld1 {v18.h}[1], [x15] smov w15, v29.b[10] add x17, x3, w17, sxtw ld1 {v18.h}[2], [x16] smov w16, v29.b[12] add x10, x3, w10, sxtw ld1 {v18.h}[3], [x17] smov w17, v29.b[14] add x15, x3, w15, sxtw add x16, x3, w16, sxtw ld1 {v18.h}[4], [x10] add x17, x3, w17, sxtw ld1 {v18.h}[5], [x15] add v29.16b, v29.16b, v21.16b // next base_y ld1 {v18.h}[6], [x16] ld1 {v18.h}[7], [x17] 32: // base_y >= 32, using separate loads. cmp w5, #4 b.lt 34f 33: // h >= 4, preserving v18 from the previous round, loading v19-v22. smov w10, v29.b[0] subs w5, w5, #4 smov w15, v29.b[2] movi v10.16b, #8 smov w16, v29.b[4] add x10, x3, w10, sxtw smov w17, v29.b[6] add x15, x3, w15, sxtw ld4 {v19.h, v20.h, v21.h, v22.h}[0], [x10] smov w10, v29.b[8] add x16, x3, w16, sxtw ld4 {v19.h, v20.h, v21.h, v22.h}[1], [x15] smov w15, v29.b[10] add x17, x3, w17, sxtw ld4 {v19.h, v20.h, v21.h, v22.h}[2], [x16] smov w16, v29.b[12] add x10, x3, w10, sxtw ld4 {v19.h, v20.h, v21.h, v22.h}[3], [x17] smov w17, v29.b[14] add x15, x3, w15, sxtw add x16, x3, w16, sxtw ld4 {v19.h, v20.h, v21.h, v22.h}[4], [x10] add x17, x3, w17, sxtw ld4 {v19.h, v20.h, v21.h, v22.h}[5], [x15] ld4 {v19.h, v20.h, v21.h, v22.h}[6], [x16] add v29.16b, v29.16b, v10.16b // next base_y ld4 {v19.h, v20.h, v21.h, v22.h}[7], [x17] umull v10.4s, v18.4h, v28.4h // left[base_y]*(64-frac_y) umlal v10.4s, v19.4h, v27.4h // + left[base_y+1]*frac_y umull2 v11.4s, v18.8h, v28.8h umlal2 v11.4s, v19.8h, v27.8h umull v12.4s, v19.4h, v28.4h umlal v12.4s, v20.4h, v27.4h umull2 v13.4s, v19.8h, v28.8h umlal2 v13.4s, v20.8h, v27.8h rshrn v10.4h, v10.4s, #6 rshrn2 v10.8h, v11.4s, #6 rshrn v11.4h, v12.4s, #6 rshrn2 v11.8h, v13.4s, #6 umull v12.4s, v20.4h, v28.4h // left[base_y]*(64-frac_y) umlal v12.4s, v21.4h, v27.4h // + left[base_y+1]*frac_y umull2 v13.4s, v20.8h, v28.8h umlal2 v13.4s, v21.8h, v27.8h umull v14.4s, v21.4h, v28.4h umlal v14.4s, v22.4h, v27.4h umull2 v18.4s, v21.8h, v28.8h umlal2 v18.4s, v22.8h, v27.8h rshrn v12.4h, v12.4s, #6 rshrn2 v12.8h, v13.4s, #6 rshrn v13.4h, v14.4s, #6 rshrn2 v13.8h, v18.4s, #6 st1 {v10.8h}, [x0], x1 cmp w5, #2 st1 {v11.8h}, [x13], x1 st1 {v12.8h}, [x0], x1 st1 {v13.8h}, [x13], x1 b.lt 4f mov v18.16b, v22.16b b.gt 33b 34: // h == 2, preserving v18 from the previous round, loading v19-v20. 
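// Only two output rows remain, so ld2 gathers just left[base_y+1]
// and left[base_y+2] per lane, paired with the v18 column carried
// over from the previous iteration.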
smov w10, v29.b[0] smov w15, v29.b[2] movi v21.16b, #4 smov w16, v29.b[4] add x10, x3, w10, sxtw smov w17, v29.b[6] add x15, x3, w15, sxtw ld2 {v19.h, v20.h}[0], [x10] smov w10, v29.b[8] add x16, x3, w16, sxtw ld2 {v19.h, v20.h}[1], [x15] smov w15, v29.b[10] add x17, x3, w17, sxtw ld2 {v19.h, v20.h}[2], [x16] smov w16, v29.b[12] add x10, x3, w10, sxtw ld2 {v19.h, v20.h}[3], [x17] smov w17, v29.b[14] add x15, x3, w15, sxtw add x16, x3, w16, sxtw ld2 {v19.h, v20.h}[4], [x10] add x17, x3, w17, sxtw ld2 {v19.h, v20.h}[5], [x15] ld2 {v19.h, v20.h}[6], [x16] add v29.16b, v29.16b, v21.16b // next base_y ld2 {v19.h, v20.h}[7], [x17] umull v10.4s, v18.4h, v28.4h // left[base_y]*(64-frac_y) umlal v10.4s, v19.4h, v27.4h // + left[base_y+1]*frac_y umull2 v11.4s, v18.8h, v28.8h umlal2 v11.4s, v19.8h, v27.8h umull v12.4s, v19.4h, v28.4h umlal v12.4s, v20.4h, v27.4h umull2 v13.4s, v19.8h, v28.8h umlal2 v13.4s, v20.8h, v27.8h rshrn v10.4h, v10.4s, #6 rshrn2 v10.8h, v11.4s, #6 rshrn v11.4h, v12.4s, #6 rshrn2 v11.8h, v13.4s, #6 st1 {v10.8h}, [x0], x1 st1 {v11.8h}, [x13], x1 // The h==2 case only happens once at the end, if at all. 4: subs w4, w4, #8 b.le 9f lsr x1, x1, #1 msub x0, x1, x12, x0 // ptr -= h * stride msub x13, x1, x12, x13 lsl x1, x1, #1 add x0, x0, #16 add x13, x13, #16 mov w5, w12 // reset h b 1b 9: ldp d14, d15, [sp, #0x30] ldp d12, d13, [sp, #0x20] ldp d10, d11, [sp, #0x10] ldp d8, d9, [sp], 0x40 ret L(ipred_z2_fill1_tbl): .hword L(ipred_z2_fill1_tbl) - 640b .hword L(ipred_z2_fill1_tbl) - 320b .hword L(ipred_z2_fill1_tbl) - 160b .hword L(ipred_z2_fill1_tbl) - 80b .hword L(ipred_z2_fill1_tbl) - 40b endfunc function ipred_z2_fill2_16bpc_neon, export=1 cmp w4, #8 mov w8, #(2 << 6) // xpos = 2 << 6 sub w8, w8, w6 // xpos -= dx movrel x11, increments ld1 {v31.8h}, [x11] // increments neg w7, w7 // -dy b.eq 80f 40: dup v30.4h, w7 // -dy movi v17.8b, #1 mul v16.4h, v31.4h, v30.4h // {0,1,2,3}* -dy movi v25.8h, #0x3e add v30.4h, v16.4h, v30.4h // -= dy // For upsample_top, w <= 8 and h <= 8; we may need up to h+1 elements // from left. ld1 {v0.8h, v1.8h}, [x3] // left[] movi v26.8h, #64 movi v19.16b, #4 shrn v29.8b, v30.8h, #6 // ypos >> 6 and v27.8b, v30.8b, v25.8b // frac_y add v29.8b, v29.8b, v17.8b // base_y = (ypos >> 6) + 1 movi v23.4h, #1, lsl #8 shl v29.8b, v29.8b, #1 // 2*base_y zip1 v29.8b, v29.8b, v29.8b // duplicate elements movi v17.8b, #2 add v29.8b, v29.8b, v23.8b // 2*base, 2*base+1, ... 
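// v29 now holds byte indices {2*base_y, 2*base_y+1, ...}: left[]
// pixels are 16 bits wide, so both byte offsets of each pixel are
// needed for the tbl lookups below.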
add v30.8b, v29.8b, v17.8b // base_y + 1 (*2) add v28.8b, v29.8b, v19.8b // base_y + 2 (*2) tbl v18.8b, {v0.16b}, v29.8b // left[base_y] trn1 v30.2d, v30.2d, v28.2d // base_y + 1, base_y + 2 sub v28.4h, v26.4h, v27.4h // 64 - frac_y trn1 v31.2d, v31.2d, v31.2d // {0,1,2,3,0,1,2,3} trn1 v27.2d, v27.2d, v27.2d // frac_y trn1 v28.2d, v28.2d, v28.2d // 64 - frac_y movi v29.16b, #4 add v31.8h, v31.8h, v31.8h // {0,2,4,6,0,2,4,6} 4: asr w9, w8, #6 // base_x dup v16.4h, w8 // xpos sub w8, w8, w6 // xpos -= dx cmp w9, #-8 // base_x <= -8 asr w11, w8, #6 // base_x b.le 49f lsl w9, w9, #1 lsl w11, w11, #1 dup v17.4h, w8 // xpos ldr q4, [x2, w9, sxtw] // top[base_x] ldr q6, [x2, w11, sxtw] trn1 v16.2d, v16.2d, v17.2d // xpos tbl v19.16b, {v0.16b, v1.16b}, v30.16b // left[base_y+1], left[base_y+2] sshr v20.8h, v16.8h, #6 // first base_x for each row uzp2 v5.8h, v4.8h, v6.8h // top[base_x+1] uzp1 v4.8h, v4.8h, v6.8h // top[base_x] and v16.16b, v16.16b, v25.16b // frac_x trn1 v18.2d, v18.2d, v19.2d // left[base_y], left[base_y+1] sub v17.8h, v26.8h, v16.8h // 64 - frac_x add v20.8h, v20.8h, v31.8h // actual base_x umull v21.4s, v18.4h, v28.4h // left[base_y]*(64-frac_y) umlal v21.4s, v19.4h, v27.4h // + left[base_y+1]*frac_y umull2 v22.4s, v18.8h, v28.8h umlal2 v22.4s, v19.8h, v27.8h umull v23.4s, v4.4h, v17.4h // top[base_x]-*(64-frac_x) umlal v23.4s, v5.4h, v16.4h // + top[base_x+1]*frac_x umull2 v24.4s, v4.8h, v17.8h umlal2 v24.4s, v5.8h, v16.8h cmge v20.8h, v20.8h, #0 rshrn v21.4h, v21.4s, #6 rshrn2 v21.8h, v22.4s, #6 rshrn v22.4h, v23.4s, #6 rshrn2 v22.8h, v24.4s, #6 bit v21.16b, v22.16b, v20.16b st1 {v21.d}[0], [x0], x1 sub w8, w8, w6 // xpos -= dx subs w5, w5, #2 st1 {v21.d}[1], [x0], x1 b.le 9f ext v18.16b, v19.16b, v19.16b, #8 add v30.16b, v30.16b, v29.16b // base_y += 2 (*2) b 4b 49: tbl v19.16b, {v0.16b, v1.16b}, v30.16b // left[base_y+1], left[base_y+2] trn1 v18.2d, v18.2d, v19.2d // left[base_y], left[base_y+1] umull v20.4s, v18.4h, v28.4h // left[base_y]*(64-frac_y) umlal v20.4s, v19.4h, v27.4h // + left[base_y+1]*frac_y umull2 v21.4s, v18.8h, v28.8h umlal2 v21.4s, v19.8h, v27.8h rshrn v20.4h, v20.4s, #6 rshrn2 v20.8h, v21.4s, #6 st1 {v20.d}[0], [x0], x1 subs w5, w5, #2 st1 {v20.d}[1], [x0], x1 b.le 9f ext v18.16b, v19.16b, v19.16b, #8 add v30.16b, v30.16b, v29.16b // base_y += 2 (*2) b 49b 9: ret 80: stp d8, d9, [sp, #-0x40]! stp d10, d11, [sp, #0x10] stp d12, d13, [sp, #0x20] stp d14, d15, [sp, #0x30] dup v18.8h, w7 // -dy movi v17.8b, #1 mul v16.8h, v31.8h, v18.8h // {0,1,2,3,4,5,6,7}* -dy movi v25.8h, #0x3e add v16.8h, v16.8h, v18.8h // -= dy // For upsample_top, w <= 8 and h <= 8; we may need up to h+1 elements // from left. ld1 {v0.8h, v1.8h}, [x3] // left[] movi v26.8h, #64 movi v19.16b, #4 shrn v29.8b, v16.8h, #6 // ypos >> 6 and v27.16b, v16.16b, v25.16b // frac_y add v29.8b, v29.8b, v17.8b // base_y = (ypos >> 6) + 1 movi v23.8h, #1, lsl #8 shl v29.8b, v29.8b, #1 // 2*base_y zip1 v29.16b, v29.16b, v29.16b // duplicate elements movi v17.16b, #2 add v29.16b, v29.16b, v23.16b // 2*base, 2*base+1, ... // Cut corners here; for the first row we don't expect to need to // read outside of v0. 
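// This is the upsample_top variant (see the w <= 8, h <= 8 note
// above): top[] samples are interleaved, so top[base_x] and
// top[base_x+1] are split apart with uzp1/uzp2 and the per-lane
// base_x increments are doubled below.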
tbl v18.16b, {v0.16b}, v29.16b // left[base_y] add v30.16b, v29.16b, v19.16b // base_y + 2 (*2) add v29.16b, v29.16b, v17.16b // base_y + 1 (*2) sub v28.8h, v26.8h, v27.8h // 64 - frac_y movi v24.16b, #4 add v31.16b, v31.16b, v31.16b // {0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14} 8: asr w9, w8, #6 // base_x dup v16.8h, w8 // xpos sub w8, w8, w6 // xpos -= dx cmp w9, #-16 // base_x <= -16 asr w11, w8, #6 // base_x b.le 89f dup v17.8h, w8 // xpos add x9, x2, w9, sxtw #1 add x11, x2, w11, sxtw #1 ld1 {v4.8h, v5.8h}, [x9] // top[base_x] ld1 {v6.8h, v7.8h}, [x11] tbl v19.16b, {v0.16b, v1.16b}, v29.16b // left[base_y+1] sshr v21.8h, v16.8h, #6 // first base_x sshr v22.8h, v17.8h, #6 tbl v20.16b, {v0.16b, v1.16b}, v30.16b // left[base_y+2] uzp2 v2.8h, v4.8h, v5.8h // top[base_x+1] uzp1 v4.8h, v4.8h, v5.8h // top[base_x] uzp2 v3.8h, v6.8h, v7.8h uzp1 v6.8h, v6.8h, v7.8h mov v5.16b, v2.16b mov v7.16b, v3.16b and v16.16b, v16.16b, v25.16b // frac_x and v17.16b, v17.16b, v25.16b umull v10.4s, v18.4h, v28.4h // left[base_y]*(64-frac_y) umlal v10.4s, v19.4h, v27.4h // + left[base_y+1]*frac_y sub v8.8h, v26.8h, v16.8h // 64 - frac_x sub v9.8h, v26.8h, v17.8h umull2 v11.4s, v18.8h, v28.8h umlal2 v11.4s, v19.8h, v27.8h add v21.8h, v21.8h, v31.8h // actual base_x add v22.8h, v22.8h, v31.8h umull v12.4s, v19.4h, v28.4h umlal v12.4s, v20.4h, v27.4h umull2 v13.4s, v19.8h, v28.8h umlal2 v13.4s, v20.8h, v27.8h rshrn v10.4h, v10.4s, #6 rshrn2 v10.8h, v11.4s, #6 rshrn v11.4h, v12.4s, #6 rshrn2 v11.8h, v13.4s, #6 umull v12.4s, v4.4h, v8.4h // top[base_x]-*(64-frac_x) umlal v12.4s, v5.4h, v16.4h // + top[base_x+1]*frac_x umull2 v13.4s, v4.8h, v8.8h umlal2 v13.4s, v5.8h, v16.8h umull v14.4s, v6.4h, v9.4h umlal v14.4s, v7.4h, v17.4h umull2 v18.4s, v6.8h, v9.8h umlal2 v18.4s, v7.8h, v17.8h cmge v21.8h, v21.8h, #0 cmge v22.8h, v22.8h, #0 rshrn v12.4h, v12.4s, #6 rshrn2 v12.8h, v13.4s, #6 rshrn v13.4h, v14.4s, #6 rshrn2 v13.8h, v18.4s, #6 bit v10.16b, v12.16b, v21.16b bit v11.16b, v13.16b, v22.16b st1 {v10.8h}, [x0], x1 subs w5, w5, #2 sub w8, w8, w6 // xpos -= dx st1 {v11.8h}, [x0], x1 b.le 9f mov v18.16b, v20.16b add v29.16b, v29.16b, v24.16b // base_y += 2 (*2) add v30.16b, v30.16b, v24.16b // base_y += 2 (*2) b 8b 89: tbl v19.16b, {v0.16b, v1.16b}, v29.16b // left[base_y+1] tbl v20.16b, {v0.16b, v1.16b}, v30.16b // left[base_y+2] umull v4.4s, v18.4h, v28.4h // left[base_y]*(64-frac_y) umlal v4.4s, v19.4h, v27.4h // + left[base_y+1]*frac_y umull2 v5.4s, v18.8h, v28.8h umlal2 v5.4s, v19.8h, v27.8h umull v6.4s, v19.4h, v28.4h umlal v6.4s, v20.4h, v27.4h umull2 v7.4s, v19.8h, v28.8h umlal2 v7.4s, v20.8h, v27.8h rshrn v4.4h, v4.4s, #6 rshrn2 v4.8h, v5.4s, #6 rshrn v5.4h, v6.4s, #6 rshrn2 v5.8h, v7.4s, #6 st1 {v4.8h}, [x0], x1 subs w5, w5, #2 st1 {v5.8h}, [x0], x1 b.le 9f mov v18.16b, v20.16b add v29.16b, v29.16b, v24.16b // base_y += 2 (*2) add v30.16b, v30.16b, v24.16b // base_y += 2 (*2) b 89b 9: ldp d14, d15, [sp, #0x30] ldp d12, d13, [sp, #0x20] ldp d10, d11, [sp, #0x10] ldp d8, d9, [sp], 0x40 ret endfunc function ipred_z2_fill3_16bpc_neon, export=1 cmp w4, #8 mov w8, #(1 << 6) // xpos = 1 << 6 sub w8, w8, w6 // xpos -= dx movrel x11, increments ld1 {v31.8h}, [x11] // increments neg w7, w7 // -dy b.eq 80f 40: dup v30.4h, w7 // -dy movi v17.8b, #1 mul v16.4h, v31.4h, v30.4h // {0,1,2,3}* -dy movi v25.8h, #0x3e add v30.4h, v16.4h, v30.4h // -= dy // For upsample_left, w <= 8 and h <= 8; we may need up to 2*h+1 elements. 
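// Upsampled-left counterpart: base_y advances two left[] entries per
// output row, hence the base_y+0/+2 and base_y+1/+3 index pairs set
// up below and the possible 2*h+1 reads from left[].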
ld1 {v0.8h, v1.8h, v2.8h}, [x3] // left[] movi v26.8h, #64 movi v19.16b, #2 shrn v29.8b, v30.8h, #6 // ypos >> 6 and v27.8b, v30.8b, v25.8b // frac_y add v29.8b, v29.8b, v19.8b // base_y = (ypos >> 6) + 2 movi v23.4h, #1, lsl #8 shl v29.8b, v29.8b, #1 // 2*base_y movi v19.16b, #4 zip1 v29.8b, v29.8b, v29.8b // duplicate elements movi v17.8b, #2 add v29.8b, v29.8b, v23.8b // 2*base, 2*base+1, ... add v30.8b, v29.8b, v17.8b // base_y + 1 (*2) add v28.8b, v29.8b, v19.8b // base_y + 2 (*2) trn1 v31.2d, v31.2d, v31.2d // {0,1,2,3,0,1,2,3} add v24.8b, v30.8b, v19.8b // base_y + 3 (*2) trn1 v29.2d, v29.2d, v28.2d // base_y + 0, base_y + 2 trn1 v30.2d, v30.2d, v24.2d // base_y + 1, base_y + 3 sub v28.4h, v26.4h, v27.4h // 64 - frac_y trn1 v27.2d, v27.2d, v27.2d // frac_y trn1 v28.2d, v28.2d, v28.2d // 64 - frac_y movi v24.16b, #8 4: asr w9, w8, #6 // base_x dup v16.4h, w8 // xpos sub w8, w8, w6 // xpos -= dx cmp w9, #-4 // base_x <= -4 asr w11, w8, #6 // base_x b.le 49f lsl w9, w9, #1 lsl w11, w11, #1 dup v17.4h, w8 // xpos ldr q4, [x2, w9, sxtw] // top[base_x] ldr q6, [x2, w11, sxtw] trn1 v16.2d, v16.2d, v17.2d // xpos tbl v18.16b, {v0.16b, v1.16b, v2.16b}, v29.16b // left[base_y+0], left[base_y+2] tbl v19.16b, {v0.16b, v1.16b, v2.16b}, v30.16b // left[base_y+1], left[base_y+3] sshr v20.8h, v16.8h, #6 // first base_x for each row ext v5.16b, v4.16b, v4.16b, #2 // top[base_x+1] ext v7.16b, v6.16b, v6.16b, #2 and v16.16b, v16.16b, v25.16b // frac_x trn1 v4.2d, v4.2d, v6.2d // top[base_x] trn1 v5.2d, v5.2d, v7.2d // top[base_x+1] sub v17.8h, v26.8h, v16.8h // 64 - frac_x add v20.8h, v20.8h, v31.8h // actual base_x umull v21.4s, v18.4h, v28.4h // left[base_y]*(64-frac_y) umlal v21.4s, v19.4h, v27.4h // + left[base_y+1]*frac_y umull2 v22.4s, v18.8h, v28.8h umlal2 v22.4s, v19.8h, v27.8h umull v23.4s, v4.4h, v17.4h // top[base_x]-*(64-frac_x) umlal v23.4s, v5.4h, v16.4h // + top[base_x+1]*frac_x umull2 v24.4s, v4.8h, v17.8h umlal2 v24.4s, v5.8h, v16.8h cmge v20.8h, v20.8h, #0 rshrn v21.4h, v21.4s, #6 rshrn2 v21.8h, v22.4s, #6 rshrn v22.4h, v23.4s, #6 rshrn2 v22.8h, v24.4s, #6 movi v24.16b, #8 bit v21.16b, v22.16b, v20.16b st1 {v21.d}[0], [x0], x1 sub w8, w8, w6 // xpos -= dx subs w5, w5, #2 st1 {v21.d}[1], [x0], x1 b.le 9f add v29.16b, v29.16b, v24.16b // base_y += 4 (*2) add v30.16b, v30.16b, v24.16b // base_y += 4 (*2) b 4b 49: tbl v18.16b, {v0.16b, v1.16b, v2.16b}, v29.16b // left[base_y+0], left[base_y+2] tbl v19.16b, {v0.16b, v1.16b, v2.16b}, v30.16b // left[base_y+1], left[base_y+3] umull v20.4s, v18.4h, v28.4h // left[base_y]*(64-frac_y) umlal v20.4s, v19.4h, v27.4h // + left[base_y+1]*frac_y umull2 v21.4s, v18.8h, v28.8h umlal2 v21.4s, v19.8h, v27.8h rshrn v20.4h, v20.4s, #6 rshrn2 v20.8h, v21.4s, #6 st1 {v20.d}[0], [x0], x1 subs w5, w5, #2 st1 {v20.d}[1], [x0], x1 b.le 9f add v29.16b, v29.16b, v24.16b // base_y += 4 (*2) add v30.16b, v30.16b, v24.16b // base_y += 4 (*2) b 49b 9: ret 80: stp d8, d9, [sp, #-0x40]! stp d10, d11, [sp, #0x10] stp d12, d13, [sp, #0x20] stp d14, d15, [sp, #0x30] dup v18.8h, w7 // -dy movi v17.16b, #2 mul v16.8h, v31.8h, v18.8h // {0,1,2,3,4,5,6,7}* -dy movi v25.8h, #0x3e add v16.8h, v16.8h, v18.8h // -= dy // For upsample_left, w <= 8 and h <= 8; we may need up to 2*h+1 elements. 
ld1 {v0.8h, v1.8h, v2.8h}, [x3] // left[] movi v26.8h, #64 movi v19.16b, #4 shrn v29.8b, v16.8h, #6 // ypos >> 6 and v27.16b, v16.16b, v25.16b // frac_y add v29.8b, v29.8b, v17.8b // base_y = (ypos >> 6) + 2 movi v23.8h, #1, lsl #8 shl v29.8b, v29.8b, #1 // 2*base_y mov v18.16b, v15.16b // left[0] zip1 v29.16b, v29.16b, v29.16b // duplicate elements add v29.16b, v29.16b, v23.16b // 2*base, 2*base+1, ... add v30.16b, v29.16b, v17.16b // base_y + 1 (*2) sub v28.8h, v26.8h, v27.8h // 64 - frac_y movi v24.16b, #4 8: asr w9, w8, #6 // base_x dup v16.8h, w8 // xpos sub w8, w8, w6 // xpos -= dx cmp w9, #-16 // base_x <= -16 asr w11, w8, #6 // base_x b.le 89f dup v17.8h, w8 // xpos add x9, x2, w9, sxtw #1 add x11, x2, w11, sxtw #1 ld1 {v4.8h, v5.8h}, [x9] // top[base_x] ld1 {v6.8h, v7.8h}, [x11] tbl v18.16b, {v0.16b, v1.16b, v2.16b}, v29.16b // left[base_y+0] add v29.16b, v29.16b, v24.16b // base_y += 2 (*2) tbl v19.16b, {v0.16b, v1.16b, v2.16b}, v30.16b // left[base_y+1] add v30.16b, v30.16b, v24.16b sshr v22.8h, v16.8h, #6 // first base_x tbl v20.16b, {v0.16b, v1.16b, v2.16b}, v29.16b // left[base_y+2] sshr v23.8h, v17.8h, #6 tbl v21.16b, {v0.16b, v1.16b, v2.16b}, v30.16b // left[base_y+3] ext v5.16b, v4.16b, v5.16b, #2 // top[base_x+1] ext v7.16b, v6.16b, v7.16b, #2 and v16.16b, v16.16b, v25.16b // frac_x and v17.16b, v17.16b, v25.16b umull v10.4s, v18.4h, v28.4h // left[base_y]*(64-frac_y) umlal v10.4s, v19.4h, v27.4h // + left[base_y+1]*frac_y sub v8.8h, v26.8h, v16.8h // 64 - frac_x sub v9.8h, v26.8h, v17.8h umull2 v11.4s, v18.8h, v28.8h umlal2 v11.4s, v19.8h, v27.8h add v22.8h, v22.8h, v31.8h // actual base_x add v23.8h, v23.8h, v31.8h umull v12.4s, v20.4h, v28.4h umlal v12.4s, v21.4h, v27.4h umull2 v13.4s, v20.8h, v28.8h umlal2 v13.4s, v21.8h, v27.8h rshrn v10.4h, v10.4s, #6 rshrn2 v10.8h, v11.4s, #6 rshrn v11.4h, v12.4s, #6 rshrn2 v11.8h, v13.4s, #6 umull v12.4s, v4.4h, v8.4h // top[base_x]-*(64-frac_x) umlal v12.4s, v5.4h, v16.4h // + top[base_x+1]*frac_x umull2 v13.4s, v4.8h, v8.8h umlal2 v13.4s, v5.8h, v16.8h umull v14.4s, v6.4h, v9.4h umlal v14.4s, v7.4h, v17.4h umull2 v18.4s, v6.8h, v9.8h umlal2 v18.4s, v7.8h, v17.8h cmge v22.8h, v22.8h, #0 cmge v23.8h, v23.8h, #0 rshrn v12.4h, v12.4s, #6 rshrn2 v12.8h, v13.4s, #6 rshrn v13.4h, v14.4s, #6 rshrn2 v13.8h, v18.4s, #6 bit v10.16b, v12.16b, v22.16b bit v11.16b, v13.16b, v23.16b st1 {v10.8h}, [x0], x1 subs w5, w5, #2 sub w8, w8, w6 // xpos -= dx st1 {v11.8h}, [x0], x1 b.le 9f add v29.16b, v29.16b, v24.16b // base_y += 2 (*2) add v30.16b, v30.16b, v24.16b b 8b 89: tbl v18.16b, {v0.16b, v1.16b, v2.16b}, v29.16b // left[base_y+0] add v29.16b, v29.16b, v24.16b // base_y += 2 (*2) tbl v19.16b, {v0.16b, v1.16b, v2.16b}, v30.16b // left[base_y+1] add v30.16b, v30.16b, v24.16b tbl v20.16b, {v0.16b, v1.16b, v2.16b}, v29.16b // left[base_y+2] tbl v21.16b, {v0.16b, v1.16b, v2.16b}, v30.16b // left[base_y+3] umull v4.4s, v18.4h, v28.4h // left[base_y]*(64-frac_y) umlal v4.4s, v19.4h, v27.4h // + left[base_y+1]*frac_y umull2 v5.4s, v18.8h, v28.8h umlal2 v5.4s, v19.8h, v27.8h umull v6.4s, v20.4h, v28.4h umlal v6.4s, v21.4h, v27.4h umull2 v7.4s, v20.8h, v28.8h umlal2 v7.4s, v21.8h, v27.8h rshrn v4.4h, v4.4s, #6 rshrn2 v4.8h, v5.4s, #6 rshrn v5.4h, v6.4s, #6 rshrn2 v5.8h, v7.4s, #6 st1 {v4.8h}, [x0], x1 subs w5, w5, #2 st1 {v5.8h}, [x0], x1 b.le 9f add v29.16b, v29.16b, v24.16b // base_y += 2 (*2) add v30.16b, v30.16b, v24.16b b 89b 9: ldp d14, d15, [sp, #0x30] ldp d12, d13, [sp, #0x20] ldp d10, d11, [sp, #0x10] ldp d8, d9, [sp], 0x40 ret endfunc // void 
ipred_z3_fill1_16bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const left, // const int width, const int height, // const int dy, const int max_base_y); function ipred_z3_fill1_16bpc_neon, export=1 clz w9, w4 adr x8, L(ipred_z3_fill1_tbl) sub w9, w9, #25 ldrh w9, [x8, w9, uxtw #1] add x10, x2, w6, uxtw #1 // left[max_base_y] sub x8, x8, w9, uxtw ld1r {v31.8h}, [x10] // padding mov w7, w5 mov w15, #64 add x13, x0, x1 lsl x1, x1, #1 br x8 40: AARCH64_VALID_JUMP_TARGET 4: lsr w8, w7, #6 // base and w9, w7, #0x3e // frac add w7, w7, w5 // xpos += dx cmp w8, w6 // base >= max_base_x lsr w10, w7, #6 // base and w11, w7, #0x3e // frac b.ge ipred_z3_fill_padding_neon lsl w8, w8, #1 lsl w10, w10, #1 ldr q0, [x2, w8, uxtw] // left[base] ldr q2, [x2, w10, uxtw] dup v4.8h, w9 // frac dup v5.8h, w11 ext v1.16b, v0.16b, v0.16b, #2 // left[base+1] ext v3.16b, v2.16b, v2.16b, #2 sub v6.4h, v1.4h, v0.4h // top[base+1]-top[base] sub v7.4h, v3.4h, v2.4h ushll v16.4s, v0.4h, #6 // top[base]*64 ushll v17.4s, v2.4h, #6 smlal v16.4s, v6.4h, v4.4h // + top[base+1]*frac smlal v17.4s, v7.4h, v5.4h rshrn v16.4h, v16.4s, #6 rshrn v17.4h, v17.4s, #6 subs w3, w3, #2 zip1 v18.8h, v16.8h, v17.8h st1 {v18.s}[0], [x0], x1 st1 {v18.s}[1], [x13], x1 add w7, w7, w5 // xpos += dx st1 {v18.s}[2], [x0] st1 {v18.s}[3], [x13] b.le 9f sub x0, x0, x1 // ptr -= 4 * (2*stride) sub x13, x13, x1 add x0, x0, #4 add x13, x13, #4 b 4b 9: ret 80: AARCH64_VALID_JUMP_TARGET 8: lsr w8, w7, #6 // base and w9, w7, #0x3e // frac add w7, w7, w5 // xpos += dx cmp w8, w6 // base >= max_base_x lsr w10, w7, #6 // base and w11, w7, #0x3e // frac b.ge ipred_z3_fill_padding_neon add x8, x2, w8, uxtw #1 add x10, x2, w10, uxtw #1 dup v4.8h, w9 // frac dup v5.8h, w11 ld1 {v0.8h}, [x8] // left[base] ld1 {v2.8h}, [x10] sub w9, w15, w9 // 64 - frac sub w11, w15, w11 ldr h1, [x8, #16] ldr h3, [x10, #16] dup v6.8h, w9 // 64 - frac dup v7.8h, w11 ext v1.16b, v0.16b, v1.16b, #2 // left[base+1] ext v3.16b, v2.16b, v3.16b, #2 umull v16.4s, v0.4h, v6.4h // left[base]*(64-frac) umlal v16.4s, v1.4h, v4.4h // + left[base+1]*frac umull2 v17.4s, v0.8h, v6.8h umlal2 v17.4s, v1.8h, v4.8h umull v18.4s, v2.4h, v7.4h umlal v18.4s, v3.4h, v5.4h umull2 v19.4s, v2.8h, v7.8h umlal2 v19.4s, v3.8h, v5.8h rshrn v16.4h, v16.4s, #6 rshrn2 v16.8h, v17.4s, #6 rshrn v17.4h, v18.4s, #6 rshrn2 v17.8h, v19.4s, #6 subs w3, w3, #2 zip1 v18.8h, v16.8h, v17.8h zip2 v19.8h, v16.8h, v17.8h add w7, w7, w5 // xpos += dx st1 {v18.s}[0], [x0], x1 st1 {v18.s}[1], [x13], x1 st1 {v18.s}[2], [x0], x1 st1 {v18.s}[3], [x13], x1 st1 {v19.s}[0], [x0], x1 st1 {v19.s}[1], [x13], x1 st1 {v19.s}[2], [x0], x1 st1 {v19.s}[3], [x13], x1 b.le 9f sub x0, x0, x1, lsl #2 // ptr -= 4 * (2*stride) sub x13, x13, x1, lsl #2 add x0, x0, #4 add x13, x13, #4 b 8b 9: ret 160: 320: 640: AARCH64_VALID_JUMP_TARGET mov w12, w4 1: lsr w8, w7, #6 // base and w9, w7, #0x3e // frac add w7, w7, w5 // ypos += dy cmp w8, w6 // base >= max_base_y lsr w10, w7, #6 // base and w11, w7, #0x3e // frac b.ge ipred_z3_fill_padding_neon add x8, x2, w8, uxtw #1 add x10, x2, w10, uxtw #1 dup v6.8h, w9 // frac dup v7.8h, w11 ld1 {v0.8h, v1.8h, v2.8h}, [x8], #48 // left[base] ld1 {v3.8h, v4.8h, v5.8h}, [x10], #48 sub w9, w15, w9 // 64 - frac sub w11, w15, w11 dup v16.8h, w9 // 64 - frac dup v17.8h, w11 add w7, w7, w5 // ypos += dy 2: ext v18.16b, v0.16b, v1.16b, #2 // left[base+1] ext v19.16b, v1.16b, v2.16b, #2 ext v20.16b, v3.16b, v4.16b, #2 ext v21.16b, v4.16b, v5.16b, #2 subs w4, w4, #16 umull v22.4s, v0.4h, v16.4h // 
left[base]*(64-frac) umlal v22.4s, v18.4h, v6.4h // + left[base+1]*frac umull2 v23.4s, v0.8h, v16.8h umlal2 v23.4s, v18.8h, v6.8h umull v24.4s, v1.4h, v16.4h umlal v24.4s, v19.4h, v6.4h umull2 v25.4s, v1.8h, v16.8h umlal2 v25.4s, v19.8h, v6.8h umull v26.4s, v3.4h, v17.4h umlal v26.4s, v20.4h, v7.4h umull2 v27.4s, v3.8h, v17.8h umlal2 v27.4s, v20.8h, v7.8h umull v28.4s, v4.4h, v17.4h umlal v28.4s, v21.4h, v7.4h umull2 v29.4s, v4.8h, v17.8h umlal2 v29.4s, v21.8h, v7.8h rshrn v22.4h, v22.4s, #6 rshrn2 v22.8h, v23.4s, #6 rshrn v23.4h, v24.4s, #6 rshrn2 v23.8h, v25.4s, #6 rshrn v24.4h, v26.4s, #6 rshrn2 v24.8h, v27.4s, #6 rshrn v25.4h, v28.4s, #6 rshrn2 v25.8h, v29.4s, #6 zip1 v18.8h, v22.8h, v24.8h zip2 v19.8h, v22.8h, v24.8h zip1 v20.8h, v23.8h, v25.8h zip2 v21.8h, v23.8h, v25.8h st1 {v18.s}[0], [x0], x1 st1 {v18.s}[1], [x13], x1 st1 {v18.s}[2], [x0], x1 st1 {v18.s}[3], [x13], x1 st1 {v19.s}[0], [x0], x1 st1 {v19.s}[1], [x13], x1 st1 {v19.s}[2], [x0], x1 st1 {v19.s}[3], [x13], x1 st1 {v20.s}[0], [x0], x1 st1 {v20.s}[1], [x13], x1 st1 {v20.s}[2], [x0], x1 st1 {v20.s}[3], [x13], x1 st1 {v21.s}[0], [x0], x1 st1 {v21.s}[1], [x13], x1 st1 {v21.s}[2], [x0], x1 st1 {v21.s}[3], [x13], x1 b.le 3f mov v0.16b, v2.16b ld1 {v1.8h, v2.8h}, [x8], #32 // left[base] mov v3.16b, v5.16b ld1 {v4.8h, v5.8h}, [x10], #32 b 2b 3: subs w3, w3, #2 b.le 9f lsr x1, x1, #1 msub x0, x1, x12, x0 // ptr -= h * stride msub x13, x1, x12, x13 lsl x1, x1, #1 add x0, x0, #4 add x13, x13, #4 mov w4, w12 b 1b 9: ret L(ipred_z3_fill1_tbl): .hword L(ipred_z3_fill1_tbl) - 640b .hword L(ipred_z3_fill1_tbl) - 320b .hword L(ipred_z3_fill1_tbl) - 160b .hword L(ipred_z3_fill1_tbl) - 80b .hword L(ipred_z3_fill1_tbl) - 40b endfunc function ipred_z3_fill_padding_neon, export=0 cmp w3, #8 adr x8, L(ipred_z3_fill_padding_tbl) b.gt L(ipred_z3_fill_padding_wide) // w3 = remaining width, w4 = constant height mov w12, w4 1: // Fill a WxH rectangle with padding. W can be any number; // this fills the exact width by filling in the largest // power of two in the remaining width, and repeating. clz w9, w3 sub w9, w9, #25 ldrh w9, [x8, w9, uxtw #1] sub x9, x8, w9, uxtw br x9 2: st1 {v31.s}[0], [x0], x1 subs w4, w4, #4 st1 {v31.s}[0], [x13], x1 st1 {v31.s}[0], [x0], x1 st1 {v31.s}[0], [x13], x1 b.gt 2b subs w3, w3, #2 lsr x1, x1, #1 msub x0, x1, x12, x0 // ptr -= h * stride msub x13, x1, x12, x13 b.le 9f lsl x1, x1, #1 add x0, x0, #4 add x13, x13, #4 mov w4, w12 b 1b 4: st1 {v31.4h}, [x0], x1 subs w4, w4, #4 st1 {v31.4h}, [x13], x1 st1 {v31.4h}, [x0], x1 st1 {v31.4h}, [x13], x1 b.gt 4b subs w3, w3, #4 lsr x1, x1, #1 msub x0, x1, x12, x0 // ptr -= h * stride msub x13, x1, x12, x13 b.le 9f lsl x1, x1, #1 add x0, x0, #8 add x13, x13, #8 mov w4, w12 b 1b 8: 16: 32: 64: st1 {v31.8h}, [x0], x1 subs w4, w4, #4 st1 {v31.8h}, [x13], x1 st1 {v31.8h}, [x0], x1 st1 {v31.8h}, [x13], x1 b.gt 4b subs w3, w3, #8 lsr x1, x1, #1 msub x0, x1, x12, x0 // ptr -= h * stride msub x13, x1, x12, x13 b.le 9f lsl x1, x1, #1 add x0, x0, #16 add x13, x13, #16 mov w4, w12 b 1b 9: ret L(ipred_z3_fill_padding_tbl): .hword L(ipred_z3_fill_padding_tbl) - 64b .hword L(ipred_z3_fill_padding_tbl) - 32b .hword L(ipred_z3_fill_padding_tbl) - 16b .hword L(ipred_z3_fill_padding_tbl) - 8b .hword L(ipred_z3_fill_padding_tbl) - 4b .hword L(ipred_z3_fill_padding_tbl) - 2b L(ipred_z3_fill_padding_wide): // Fill a WxH rectangle with padding, with W > 8. 
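// Wide padding is written row by row: a width % 8 remainder is
// handled with one extra 8-pixel store (which may overlap the
// following stores, harmless since every lane holds the same padding
// value in v31), then the rest of the row uses 8-pixel stores.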
lsr x1, x1, #1 mov w12, w3 sub x1, x1, w3, uxtw #1 1: ands w5, w3, #7 b.eq 2f // If the width isn't aligned to 8, first do one 8 pixel write // and align the start pointer. sub w3, w3, w5 st1 {v31.8h}, [x0] add x0, x0, w5, uxtw #1 2: // Fill the rest of the line with aligned 8 pixel writes. subs w3, w3, #8 st1 {v31.8h}, [x0], #16 b.gt 2b subs w4, w4, #1 add x0, x0, x1 b.le 9f mov w3, w12 b 1b 9: ret endfunc function ipred_z3_fill2_16bpc_neon, export=1 cmp w4, #8 add x10, x2, w6, uxtw // left[max_base_y] ld1r {v31.16b}, [x10] // padding mov w7, w5 mov w15, #64 add x13, x0, x1 lsl x1, x1, #1 b.eq 8f 4: // h == 4 lsr w8, w7, #6 // base and w9, w7, #0x3e // frac add w7, w7, w5 // xpos += dx cmp w8, w6 // base >= max_base_x lsr w10, w7, #6 // base and w11, w7, #0x3e // frac b.ge ipred_z3_fill_padding_neon lsl w8, w8, #1 lsl w10, w10, #1 ldr q0, [x2, w8, uxtw] // top[base] ldr q2, [x2, w10, uxtw] dup v4.4h, w9 // frac dup v5.4h, w11 uzp2 v1.8h, v0.8h, v0.8h // top[base+1] uzp1 v0.8h, v0.8h, v0.8h // top[base] uzp2 v3.8h, v2.8h, v2.8h uzp1 v2.8h, v2.8h, v2.8h sub v6.4h, v1.4h, v0.4h // top[base+1]-top[base] sub v7.4h, v3.4h, v2.4h ushll v16.4s, v0.4h, #6 // top[base]*64 ushll v17.4s, v2.4h, #6 smlal v16.4s, v6.4h, v4.4h // + top[base+1]*frac smlal v17.4s, v7.4h, v5.4h rshrn v16.4h, v16.4s, #6 rshrn v17.4h, v17.4s, #6 subs w3, w3, #2 zip1 v18.8h, v16.8h, v17.8h st1 {v18.s}[0], [x0], x1 st1 {v18.s}[1], [x13], x1 add w7, w7, w5 // xpos += dx st1 {v18.s}[2], [x0] st1 {v18.s}[3], [x13] b.le 9f sub x0, x0, x1 // ptr -= 4 * (2*stride) sub x13, x13, x1 add x0, x0, #4 add x13, x13, #4 b 4b 9: ret 8: // h == 8 lsr w8, w7, #6 // base and w9, w7, #0x3e // frac add w7, w7, w5 // xpos += dx cmp w8, w6 // base >= max_base_x lsr w10, w7, #6 // base and w11, w7, #0x3e // frac b.ge ipred_z3_fill_padding_neon add x8, x2, w8, uxtw #1 add x10, x2, w10, uxtw #1 dup v4.8h, w9 // frac dup v5.8h, w11 ld1 {v0.8h, v1.8h}, [x8] // top[base] ld1 {v2.8h, v3.8h}, [x10] sub w9, w15, w9 // 64 - frac sub w11, w15, w11 dup v6.8h, w9 // 64 - frac dup v7.8h, w11 uzp2 v20.8h, v0.8h, v1.8h // top[base+1] uzp1 v0.8h, v0.8h, v1.8h // top[base] uzp2 v21.8h, v2.8h, v3.8h uzp1 v2.8h, v2.8h, v3.8h umull v16.4s, v0.4h, v6.4h // top[base]*(64-frac) umlal v16.4s, v20.4h, v4.4h // + top[base+1]*frac umull2 v17.4s, v0.8h, v6.8h umlal2 v17.4s, v20.8h, v4.8h umull v18.4s, v2.4h, v7.4h umlal v18.4s, v21.4h, v5.4h umull2 v19.4s, v2.8h, v7.8h umlal2 v19.4s, v21.8h, v5.8h rshrn v16.4h, v16.4s, #6 rshrn2 v16.8h, v17.4s, #6 rshrn v17.4h, v18.4s, #6 rshrn2 v17.8h, v19.4s, #6 subs w3, w3, #2 zip1 v18.8h, v16.8h, v17.8h zip2 v19.8h, v16.8h, v17.8h add w7, w7, w5 // xpos += dx st1 {v18.s}[0], [x0], x1 st1 {v18.s}[1], [x13], x1 st1 {v18.s}[2], [x0], x1 st1 {v18.s}[3], [x13], x1 st1 {v19.s}[0], [x0], x1 st1 {v19.s}[1], [x13], x1 st1 {v19.s}[2], [x0], x1 st1 {v19.s}[3], [x13], x1 b.le 9f sub x0, x0, x1, lsl #2 // ptr -= 4 * (2*stride) sub x13, x13, x1, lsl #2 add x0, x0, #4 add x13, x13, #4 b 8b 9: ret endfunc // void ipred_filter_16bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, const int filt_idx, // const int max_width, const int max_height, // const int bitdepth_max); .macro filter_fn bpc function ipred_filter_\bpc\()bpc_neon and w5, w5, #511 movrel x6, X(filter_intra_taps) lsl w5, w5, #6 add x6, x6, w5, uxtw ld1 {v16.8b, v17.8b, v18.8b, v19.8b}, [x6], #32 clz w9, w3 adr x5, L(ipred_filter\bpc\()_tbl) ld1 {v20.8b, v21.8b, v22.8b}, [x6] sub w9, w9, #26 ldrh w9, [x5, w9, uxtw #1] sxtl v16.8h, 
v16.8b sxtl v17.8h, v17.8b sub x5, x5, w9, uxtw sxtl v18.8h, v18.8b sxtl v19.8h, v19.8b add x6, x0, x1 lsl x1, x1, #1 sxtl v20.8h, v20.8b sxtl v21.8h, v21.8b sxtl v22.8h, v22.8b dup v31.8h, w8 .if \bpc == 10 movi v30.8h, #0 .endif br x5 40: AARCH64_VALID_JUMP_TARGET ldur d0, [x2, #2] // top (0-3) sub x2, x2, #4 mov x7, #-4 4: ld1 {v1.4h}, [x2], x7 // left (0-1) + topleft (2) .if \bpc == 10 mul v2.8h, v17.8h, v0.h[0] // p1(top[0]) * filter(1) mla v2.8h, v18.8h, v0.h[1] // p2(top[1]) * filter(2) mla v2.8h, v19.8h, v0.h[2] // p3(top[2]) * filter(3) mla v2.8h, v20.8h, v0.h[3] // p4(top[3]) * filter(4) mla v2.8h, v16.8h, v1.h[2] // p0(topleft) * filter(0) mla v2.8h, v21.8h, v1.h[1] // p5(left[0]) * filter(5) mla v2.8h, v22.8h, v1.h[0] // p6(left[1]) * filter(6) srshr v2.8h, v2.8h, #4 smax v2.8h, v2.8h, v30.8h .else smull v2.4s, v17.4h, v0.h[0] // p1(top[0]) * filter(1) smlal v2.4s, v18.4h, v0.h[1] // p2(top[1]) * filter(2) smlal v2.4s, v19.4h, v0.h[2] // p3(top[2]) * filter(3) smlal v2.4s, v20.4h, v0.h[3] // p4(top[3]) * filter(4) smlal v2.4s, v16.4h, v1.h[2] // p0(topleft) * filter(0) smlal v2.4s, v21.4h, v1.h[1] // p5(left[0]) * filter(5) smlal v2.4s, v22.4h, v1.h[0] // p6(left[1]) * filter(6) smull2 v3.4s, v17.8h, v0.h[0] // p1(top[0]) * filter(1) smlal2 v3.4s, v18.8h, v0.h[1] // p2(top[1]) * filter(2) smlal2 v3.4s, v19.8h, v0.h[2] // p3(top[2]) * filter(3) smlal2 v3.4s, v20.8h, v0.h[3] // p4(top[3]) * filter(4) smlal2 v3.4s, v16.8h, v1.h[2] // p0(topleft) * filter(0) smlal2 v3.4s, v21.8h, v1.h[1] // p5(left[0]) * filter(5) smlal2 v3.4s, v22.8h, v1.h[0] // p6(left[1]) * filter(6) sqrshrun v2.4h, v2.4s, #4 sqrshrun2 v2.8h, v3.4s, #4 .endif smin v2.8h, v2.8h, v31.8h subs w4, w4, #2 st1 {v2.d}[0], [x0], x1 ext v0.16b, v2.16b, v2.16b, #8 // move top from [4-7] to [0-3] st1 {v2.d}[1], [x6], x1 b.gt 4b ret 80: AARCH64_VALID_JUMP_TARGET ldur q0, [x2, #2] // top (0-7) sub x2, x2, #4 mov x7, #-4 8: ld1 {v1.4h}, [x2], x7 // left (0-1) + topleft (2) .if \bpc == 10 mul v2.8h, v17.8h, v0.h[0] // p1(top[0]) * filter(1) mla v2.8h, v18.8h, v0.h[1] // p2(top[1]) * filter(2) mla v2.8h, v19.8h, v0.h[2] // p3(top[2]) * filter(3) mla v2.8h, v20.8h, v0.h[3] // p4(top[3]) * filter(4) mla v2.8h, v16.8h, v1.h[2] // p0(topleft) * filter(0) mla v2.8h, v21.8h, v1.h[1] // p5(left[0]) * filter(5) mla v2.8h, v22.8h, v1.h[0] // p6(left[1]) * filter(6) mul v3.8h, v17.8h, v0.h[4] // p1(top[0]) * filter(1) mla v3.8h, v18.8h, v0.h[5] // p2(top[1]) * filter(2) mla v3.8h, v19.8h, v0.h[6] // p3(top[2]) * filter(3) srshr v2.8h, v2.8h, #4 smax v2.8h, v2.8h, v30.8h smin v2.8h, v2.8h, v31.8h mla v3.8h, v20.8h, v0.h[7] // p4(top[3]) * filter(4) mla v3.8h, v16.8h, v0.h[3] // p0(topleft) * filter(0) mla v3.8h, v21.8h, v2.h[3] // p5(left[0]) * filter(5) mla v3.8h, v22.8h, v2.h[7] // p6(left[1]) * filter(6) srshr v3.8h, v3.8h, #4 smax v3.8h, v3.8h, v30.8h .else smull v2.4s, v17.4h, v0.h[0] // p1(top[0]) * filter(1) smlal v2.4s, v18.4h, v0.h[1] // p2(top[1]) * filter(2) smlal v2.4s, v19.4h, v0.h[2] // p3(top[2]) * filter(3) smlal v2.4s, v20.4h, v0.h[3] // p4(top[3]) * filter(4) smlal v2.4s, v16.4h, v1.h[2] // p0(topleft) * filter(0) smlal v2.4s, v21.4h, v1.h[1] // p5(left[0]) * filter(5) smlal v2.4s, v22.4h, v1.h[0] // p6(left[1]) * filter(6) smull2 v3.4s, v17.8h, v0.h[0] // p1(top[0]) * filter(1) smlal2 v3.4s, v18.8h, v0.h[1] // p2(top[1]) * filter(2) smlal2 v3.4s, v19.8h, v0.h[2] // p3(top[2]) * filter(3) smlal2 v3.4s, v20.8h, v0.h[3] // p4(top[3]) * filter(4) smlal2 v3.4s, v16.8h, v1.h[2] // p0(topleft) * filter(0) smlal2 v3.4s, v21.8h, 
v1.h[1] // p5(left[0]) * filter(5) smlal2 v3.4s, v22.8h, v1.h[0] // p6(left[1]) * filter(6) smull v4.4s, v17.4h, v0.h[4] // p1(top[0]) * filter(1) smlal v4.4s, v18.4h, v0.h[5] // p2(top[1]) * filter(2) smlal v4.4s, v19.4h, v0.h[6] // p3(top[2]) * filter(3) sqrshrun v2.4h, v2.4s, #4 sqrshrun2 v2.8h, v3.4s, #4 smin v2.8h, v2.8h, v31.8h smlal v4.4s, v20.4h, v0.h[7] // p4(top[3]) * filter(4) smlal v4.4s, v16.4h, v0.h[3] // p0(topleft) * filter(0) smlal v4.4s, v21.4h, v2.h[3] // p5(left[0]) * filter(5) smlal v4.4s, v22.4h, v2.h[7] // p6(left[1]) * filter(6) smull2 v5.4s, v17.8h, v0.h[4] // p1(top[0]) * filter(1) smlal2 v5.4s, v18.8h, v0.h[5] // p2(top[1]) * filter(2) smlal2 v5.4s, v19.8h, v0.h[6] // p3(top[2]) * filter(3) smlal2 v5.4s, v20.8h, v0.h[7] // p4(top[3]) * filter(4) smlal2 v5.4s, v16.8h, v0.h[3] // p0(topleft) * filter(0) smlal2 v5.4s, v21.8h, v2.h[3] // p5(left[0]) * filter(5) smlal2 v5.4s, v22.8h, v2.h[7] // p6(left[1]) * filter(6) sqrshrun v3.4h, v4.4s, #4 sqrshrun2 v3.8h, v5.4s, #4 .endif smin v3.8h, v3.8h, v31.8h subs w4, w4, #2 st2 {v2.d, v3.d}[0], [x0], x1 zip2 v0.2d, v2.2d, v3.2d st2 {v2.d, v3.d}[1], [x6], x1 b.gt 8b ret 160: 320: AARCH64_VALID_JUMP_TARGET add x8, x2, #2 sub x2, x2, #4 mov x7, #-4 sub x1, x1, w3, uxtw #1 mov w9, w3 1: ld1 {v0.4h}, [x2], x7 // left (0-1) + topleft (2) 2: ld1 {v1.8h, v2.8h}, [x8], #32 // top(0-15) .if \bpc == 10 mul v3.8h, v16.8h, v0.h[2] // p0(topleft) * filter(0) mla v3.8h, v21.8h, v0.h[1] // p5(left[0]) * filter(5) mla v3.8h, v22.8h, v0.h[0] // p6(left[1]) * filter(6) mla v3.8h, v17.8h, v1.h[0] // p1(top[0]) * filter(1) mla v3.8h, v18.8h, v1.h[1] // p2(top[1]) * filter(2) mla v3.8h, v19.8h, v1.h[2] // p3(top[2]) * filter(3) mla v3.8h, v20.8h, v1.h[3] // p4(top[3]) * filter(4) mul v4.8h, v17.8h, v1.h[4] // p1(top[0]) * filter(1) mla v4.8h, v18.8h, v1.h[5] // p2(top[1]) * filter(2) mla v4.8h, v19.8h, v1.h[6] // p3(top[2]) * filter(3) srshr v3.8h, v3.8h, #4 smax v3.8h, v3.8h, v30.8h smin v3.8h, v3.8h, v31.8h mla v4.8h, v20.8h, v1.h[7] // p4(top[3]) * filter(4) mla v4.8h, v16.8h, v1.h[3] // p0(topleft) * filter(0) mla v4.8h, v21.8h, v3.h[3] // p5(left[0]) * filter(5) mla v4.8h, v22.8h, v3.h[7] // p6(left[1]) * filter(6) mul v5.8h, v17.8h, v2.h[0] // p1(top[0]) * filter(1) mla v5.8h, v18.8h, v2.h[1] // p2(top[1]) * filter(2) mla v5.8h, v19.8h, v2.h[2] // p3(top[2]) * filter(3) srshr v4.8h, v4.8h, #4 smax v4.8h, v4.8h, v30.8h smin v4.8h, v4.8h, v31.8h mla v5.8h, v20.8h, v2.h[3] // p4(top[3]) * filter(4) mla v5.8h, v16.8h, v1.h[7] // p0(topleft) * filter(0) mla v5.8h, v21.8h, v4.h[3] // p5(left[0]) * filter(5) mla v5.8h, v22.8h, v4.h[7] // p6(left[1]) * filter(6) mul v6.8h, v17.8h, v2.h[4] // p1(top[0]) * filter(1) mla v6.8h, v18.8h, v2.h[5] // p2(top[1]) * filter(2) mla v6.8h, v19.8h, v2.h[6] // p3(top[2]) * filter(3) srshr v5.8h, v5.8h, #4 smax v5.8h, v5.8h, v30.8h smin v5.8h, v5.8h, v31.8h mla v6.8h, v20.8h, v2.h[7] // p4(top[3]) * filter(4) mla v6.8h, v16.8h, v2.h[3] // p0(topleft) * filter(0) mla v6.8h, v21.8h, v5.h[3] // p5(left[0]) * filter(5) mla v6.8h, v22.8h, v5.h[7] // p6(left[1]) * filter(6) subs w3, w3, #16 srshr v6.8h, v6.8h, #4 smax v6.8h, v6.8h, v30.8h .else smull v3.4s, v16.4h, v0.h[2] // p0(topleft) * filter(0) smlal v3.4s, v21.4h, v0.h[1] // p5(left[0]) * filter(5) smlal v3.4s, v22.4h, v0.h[0] // p6(left[1]) * filter(6) smlal v3.4s, v17.4h, v1.h[0] // p1(top[0]) * filter(1) smlal v3.4s, v18.4h, v1.h[1] // p2(top[1]) * filter(2) smlal v3.4s, v19.4h, v1.h[2] // p3(top[2]) * filter(3) smlal v3.4s, v20.4h, v1.h[3] // p4(top[3]) * 
filter(4) smull2 v4.4s, v16.8h, v0.h[2] // p0(topleft) * filter(0) smlal2 v4.4s, v21.8h, v0.h[1] // p5(left[0]) * filter(5) smlal2 v4.4s, v22.8h, v0.h[0] // p6(left[1]) * filter(6) smlal2 v4.4s, v17.8h, v1.h[0] // p1(top[0]) * filter(1) smlal2 v4.4s, v18.8h, v1.h[1] // p2(top[1]) * filter(2) smlal2 v4.4s, v19.8h, v1.h[2] // p3(top[2]) * filter(3) smlal2 v4.4s, v20.8h, v1.h[3] // p4(top[3]) * filter(4) smull v5.4s, v17.4h, v1.h[4] // p1(top[0]) * filter(1) smlal v5.4s, v18.4h, v1.h[5] // p2(top[1]) * filter(2) smlal v5.4s, v19.4h, v1.h[6] // p3(top[2]) * filter(3) sqrshrun v3.4h, v3.4s, #4 sqrshrun2 v3.8h, v4.4s, #4 smin v3.8h, v3.8h, v31.8h smlal v5.4s, v20.4h, v1.h[7] // p4(top[3]) * filter(4) smlal v5.4s, v16.4h, v1.h[3] // p0(topleft) * filter(0) smlal v5.4s, v21.4h, v3.h[3] // p5(left[0]) * filter(5) smlal v5.4s, v22.4h, v3.h[7] // p6(left[1]) * filter(6) smull2 v6.4s, v17.8h, v1.h[4] // p1(top[0]) * filter(1) smlal2 v6.4s, v18.8h, v1.h[5] // p2(top[1]) * filter(2) smlal2 v6.4s, v19.8h, v1.h[6] // p3(top[2]) * filter(3) smlal2 v6.4s, v20.8h, v1.h[7] // p4(top[3]) * filter(4) smlal2 v6.4s, v16.8h, v1.h[3] // p0(topleft) * filter(0) smlal2 v6.4s, v21.8h, v3.h[3] // p5(left[0]) * filter(5) smlal2 v6.4s, v22.8h, v3.h[7] // p6(left[1]) * filter(6) smull v24.4s, v17.4h, v2.h[0] // p1(top[0]) * filter(1) smlal v24.4s, v18.4h, v2.h[1] // p2(top[1]) * filter(2) smlal v24.4s, v19.4h, v2.h[2] // p3(top[2]) * filter(3) sqrshrun v4.4h, v5.4s, #4 sqrshrun2 v4.8h, v6.4s, #4 smin v4.8h, v4.8h, v31.8h smlal v24.4s, v20.4h, v2.h[3] // p4(top[3]) * filter(4) smlal v24.4s, v16.4h, v1.h[7] // p0(topleft) * filter(0) smlal v24.4s, v21.4h, v4.h[3] // p5(left[0]) * filter(5) smlal v24.4s, v22.4h, v4.h[7] // p6(left[1]) * filter(6) smull2 v25.4s, v17.8h, v2.h[0] // p1(top[0]) * filter(1) smlal2 v25.4s, v18.8h, v2.h[1] // p2(top[1]) * filter(2) smlal2 v25.4s, v19.8h, v2.h[2] // p3(top[2]) * filter(3) smlal2 v25.4s, v20.8h, v2.h[3] // p4(top[3]) * filter(4) smlal2 v25.4s, v16.8h, v1.h[7] // p0(topleft) * filter(0) smlal2 v25.4s, v21.8h, v4.h[3] // p5(left[0]) * filter(5) smlal2 v25.4s, v22.8h, v4.h[7] // p6(left[1]) * filter(6) smull v26.4s, v17.4h, v2.h[4] // p1(top[0]) * filter(1) smlal v26.4s, v18.4h, v2.h[5] // p2(top[1]) * filter(2) smlal v26.4s, v19.4h, v2.h[6] // p3(top[2]) * filter(3) sqrshrun v5.4h, v24.4s, #4 sqrshrun2 v5.8h, v25.4s, #4 smin v5.8h, v5.8h, v31.8h smlal v26.4s, v20.4h, v2.h[7] // p4(top[3]) * filter(4) smlal v26.4s, v16.4h, v2.h[3] // p0(topleft) * filter(0) smlal v26.4s, v21.4h, v5.h[3] // p5(left[0]) * filter(5) smlal v26.4s, v22.4h, v5.h[7] // p6(left[1]) * filter(6) smull2 v27.4s, v17.8h, v2.h[4] // p1(top[0]) * filter(1) smlal2 v27.4s, v18.8h, v2.h[5] // p2(top[1]) * filter(2) smlal2 v27.4s, v19.8h, v2.h[6] // p3(top[2]) * filter(3) smlal2 v27.4s, v20.8h, v2.h[7] // p4(top[3]) * filter(4) smlal2 v27.4s, v16.8h, v2.h[3] // p0(topleft) * filter(0) smlal2 v27.4s, v21.8h, v5.h[3] // p5(left[0]) * filter(5) smlal2 v27.4s, v22.8h, v5.h[7] // p6(left[1]) * filter(6) subs w3, w3, #16 sqrshrun v6.4h, v26.4s, #4 sqrshrun2 v6.8h, v27.4s, #4 .endif smin v6.8h, v6.8h, v31.8h ins v0.h[2], v2.h[7] st4 {v3.d, v4.d, v5.d, v6.d}[0], [x0], #32 ins v0.h[0], v6.h[7] st4 {v3.d, v4.d, v5.d, v6.d}[1], [x6], #32 ins v0.h[1], v6.h[3] b.gt 2b subs w4, w4, #2 b.le 9f sub x8, x6, w9, uxtw #1 add x0, x0, x1 add x6, x6, x1 mov w3, w9 b 1b 9: ret L(ipred_filter\bpc\()_tbl): .hword L(ipred_filter\bpc\()_tbl) - 320b .hword L(ipred_filter\bpc\()_tbl) - 160b .hword L(ipred_filter\bpc\()_tbl) - 80b .hword 
L(ipred_filter\bpc\()_tbl) - 40b endfunc .endm filter_fn 10 filter_fn 12 function ipred_filter_16bpc_neon, export=1 ldr w8, [sp] cmp w8, 0x3ff b.le ipred_filter_10bpc_neon b ipred_filter_12bpc_neon endfunc // void pal_pred_16bpc_neon(pixel *dst, const ptrdiff_t stride, // const uint16_t *const pal, const uint8_t *idx, // const int w, const int h); function pal_pred_16bpc_neon, export=1 ld1 {v30.8h}, [x2] clz w9, w4 adr x6, L(pal_pred_tbl) sub w9, w9, #25 ldrh w9, [x6, w9, uxtw #1] movi v31.8h, #1, lsl #8 sub x6, x6, w9, uxtw br x6 40: AARCH64_VALID_JUMP_TARGET add x2, x0, x1 lsl x1, x1, #1 4: ld1 {v1.16b}, [x3], #16 subs w5, w5, #4 // Restructure v1 from a, b, c, ... into 2*a, 2*a+1, 2*b, 2*b+1, 2*c, 2*c+1, ... add v1.16b, v1.16b, v1.16b zip1 v0.16b, v1.16b, v1.16b zip2 v1.16b, v1.16b, v1.16b add v0.8h, v0.8h, v31.8h add v1.8h, v1.8h, v31.8h tbl v0.16b, {v30.16b}, v0.16b st1 {v0.d}[0], [x0], x1 tbl v1.16b, {v30.16b}, v1.16b st1 {v0.d}[1], [x2], x1 st1 {v1.d}[0], [x0], x1 st1 {v1.d}[1], [x2], x1 b.gt 4b ret 80: AARCH64_VALID_JUMP_TARGET add x2, x0, x1 lsl x1, x1, #1 8: ld1 {v2.16b, v3.16b}, [x3], #32 subs w5, w5, #4 add v2.16b, v2.16b, v2.16b add v3.16b, v3.16b, v3.16b zip1 v0.16b, v2.16b, v2.16b zip2 v1.16b, v2.16b, v2.16b zip1 v2.16b, v3.16b, v3.16b zip2 v3.16b, v3.16b, v3.16b add v0.8h, v0.8h, v31.8h add v1.8h, v1.8h, v31.8h add v2.8h, v2.8h, v31.8h add v3.8h, v3.8h, v31.8h tbl v0.16b, {v30.16b}, v0.16b tbl v1.16b, {v30.16b}, v1.16b st1 {v0.8h}, [x0], x1 tbl v2.16b, {v30.16b}, v2.16b st1 {v1.8h}, [x2], x1 tbl v3.16b, {v30.16b}, v3.16b st1 {v2.8h}, [x0], x1 st1 {v3.8h}, [x2], x1 b.gt 8b ret 160: AARCH64_VALID_JUMP_TARGET add x2, x0, x1 lsl x1, x1, #1 16: ld1 {v4.16b, v5.16b, v6.16b, v7.16b}, [x3], #64 subs w5, w5, #4 add v4.16b, v4.16b, v4.16b add v5.16b, v5.16b, v5.16b add v6.16b, v6.16b, v6.16b add v7.16b, v7.16b, v7.16b zip1 v0.16b, v4.16b, v4.16b zip2 v1.16b, v4.16b, v4.16b zip1 v2.16b, v5.16b, v5.16b zip2 v3.16b, v5.16b, v5.16b zip1 v4.16b, v6.16b, v6.16b zip2 v5.16b, v6.16b, v6.16b zip1 v6.16b, v7.16b, v7.16b zip2 v7.16b, v7.16b, v7.16b add v0.8h, v0.8h, v31.8h add v1.8h, v1.8h, v31.8h add v2.8h, v2.8h, v31.8h add v3.8h, v3.8h, v31.8h add v4.8h, v4.8h, v31.8h tbl v0.16b, {v30.16b}, v0.16b add v5.8h, v5.8h, v31.8h tbl v1.16b, {v30.16b}, v1.16b add v6.8h, v6.8h, v31.8h tbl v2.16b, {v30.16b}, v2.16b add v7.8h, v7.8h, v31.8h tbl v3.16b, {v30.16b}, v3.16b tbl v4.16b, {v30.16b}, v4.16b tbl v5.16b, {v30.16b}, v5.16b st1 {v0.8h, v1.8h}, [x0], x1 tbl v6.16b, {v30.16b}, v6.16b st1 {v2.8h, v3.8h}, [x2], x1 tbl v7.16b, {v30.16b}, v7.16b st1 {v4.8h, v5.8h}, [x0], x1 st1 {v6.8h, v7.8h}, [x2], x1 b.gt 16b ret 320: AARCH64_VALID_JUMP_TARGET add x2, x0, x1 lsl x1, x1, #1 32: ld1 {v4.16b, v5.16b, v6.16b, v7.16b}, [x3], #64 subs w5, w5, #2 add v4.16b, v4.16b, v4.16b add v5.16b, v5.16b, v5.16b add v6.16b, v6.16b, v6.16b add v7.16b, v7.16b, v7.16b zip1 v0.16b, v4.16b, v4.16b zip2 v1.16b, v4.16b, v4.16b zip1 v2.16b, v5.16b, v5.16b zip2 v3.16b, v5.16b, v5.16b zip1 v4.16b, v6.16b, v6.16b zip2 v5.16b, v6.16b, v6.16b zip1 v6.16b, v7.16b, v7.16b zip2 v7.16b, v7.16b, v7.16b add v0.8h, v0.8h, v31.8h add v1.8h, v1.8h, v31.8h add v2.8h, v2.8h, v31.8h add v3.8h, v3.8h, v31.8h add v4.8h, v4.8h, v31.8h tbl v0.16b, {v30.16b}, v0.16b add v5.8h, v5.8h, v31.8h tbl v1.16b, {v30.16b}, v1.16b add v6.8h, v6.8h, v31.8h tbl v2.16b, {v30.16b}, v2.16b add v7.8h, v7.8h, v31.8h tbl v3.16b, {v30.16b}, v3.16b tbl v4.16b, {v30.16b}, v4.16b tbl v5.16b, {v30.16b}, v5.16b st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 tbl v6.16b, 
{v30.16b}, v6.16b tbl v7.16b, {v30.16b}, v7.16b st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], x1 b.gt 32b ret 640: AARCH64_VALID_JUMP_TARGET add x2, x0, #64 64: ld1 {v4.16b, v5.16b, v6.16b, v7.16b}, [x3], #64 subs w5, w5, #1 add v4.16b, v4.16b, v4.16b add v5.16b, v5.16b, v5.16b add v6.16b, v6.16b, v6.16b add v7.16b, v7.16b, v7.16b zip1 v0.16b, v4.16b, v4.16b zip2 v1.16b, v4.16b, v4.16b zip1 v2.16b, v5.16b, v5.16b zip2 v3.16b, v5.16b, v5.16b zip1 v4.16b, v6.16b, v6.16b zip2 v5.16b, v6.16b, v6.16b zip1 v6.16b, v7.16b, v7.16b zip2 v7.16b, v7.16b, v7.16b add v0.8h, v0.8h, v31.8h add v1.8h, v1.8h, v31.8h add v2.8h, v2.8h, v31.8h add v3.8h, v3.8h, v31.8h add v4.8h, v4.8h, v31.8h tbl v0.16b, {v30.16b}, v0.16b add v5.8h, v5.8h, v31.8h tbl v1.16b, {v30.16b}, v1.16b add v6.8h, v6.8h, v31.8h tbl v2.16b, {v30.16b}, v2.16b add v7.8h, v7.8h, v31.8h tbl v3.16b, {v30.16b}, v3.16b tbl v4.16b, {v30.16b}, v4.16b tbl v5.16b, {v30.16b}, v5.16b st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 tbl v6.16b, {v30.16b}, v6.16b tbl v7.16b, {v30.16b}, v7.16b st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], x1 b.gt 64b ret L(pal_pred_tbl): .hword L(pal_pred_tbl) - 640b .hword L(pal_pred_tbl) - 320b .hword L(pal_pred_tbl) - 160b .hword L(pal_pred_tbl) - 80b .hword L(pal_pred_tbl) - 40b endfunc // void ipred_cfl_128_16bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, // const int16_t *ac, const int alpha, // const int bitdepth_max); function ipred_cfl_128_16bpc_neon, export=1 dup v31.8h, w7 // bitdepth_max clz w9, w3 adr x7, L(ipred_cfl_128_tbl) sub w9, w9, #26 ldrh w9, [x7, w9, uxtw #1] urshr v0.8h, v31.8h, #1 dup v1.8h, w6 // alpha sub x7, x7, w9, uxtw add x6, x0, x1 lsl x1, x1, #1 movi v30.8h, #0 br x7 L(ipred_cfl_splat_w4): AARCH64_VALID_JUMP_TARGET ld1 {v4.8h, v5.8h}, [x5], #32 subs w4, w4, #4 smull v2.4s, v4.4h, v1.4h // diff = ac * alpha smull2 v3.4s, v4.8h, v1.8h smull v4.4s, v5.4h, v1.4h smull2 v5.4s, v5.8h, v1.8h cmlt v16.4s, v2.4s, #0 // sign cmlt v17.4s, v3.4s, #0 cmlt v18.4s, v4.4s, #0 cmlt v19.4s, v5.4s, #0 add v2.4s, v2.4s, v16.4s // diff + sign add v3.4s, v3.4s, v17.4s add v4.4s, v4.4s, v18.4s add v5.4s, v5.4s, v19.4s rshrn v2.4h, v2.4s, #6 // (diff + sign + 32) >> 6 = apply_sign() rshrn2 v2.8h, v3.4s, #6 rshrn v3.4h, v4.4s, #6 rshrn2 v3.8h, v5.4s, #6 add v2.8h, v2.8h, v0.8h // dc + apply_sign() add v3.8h, v3.8h, v0.8h smax v2.8h, v2.8h, v30.8h smax v3.8h, v3.8h, v30.8h smin v2.8h, v2.8h, v31.8h smin v3.8h, v3.8h, v31.8h st1 {v2.d}[0], [x0], x1 st1 {v2.d}[1], [x6], x1 st1 {v3.d}[0], [x0], x1 st1 {v3.d}[1], [x6], x1 b.gt L(ipred_cfl_splat_w4) ret L(ipred_cfl_splat_w8): AARCH64_VALID_JUMP_TARGET ld1 {v4.8h, v5.8h}, [x5], #32 subs w4, w4, #2 smull v2.4s, v4.4h, v1.4h // diff = ac * alpha smull2 v3.4s, v4.8h, v1.8h smull v4.4s, v5.4h, v1.4h smull2 v5.4s, v5.8h, v1.8h cmlt v16.4s, v2.4s, #0 // sign cmlt v17.4s, v3.4s, #0 cmlt v18.4s, v4.4s, #0 cmlt v19.4s, v5.4s, #0 add v2.4s, v2.4s, v16.4s // diff + sign add v3.4s, v3.4s, v17.4s add v4.4s, v4.4s, v18.4s add v5.4s, v5.4s, v19.4s rshrn v2.4h, v2.4s, #6 // (diff + sign + 32) >> 6 = apply_sign() rshrn2 v2.8h, v3.4s, #6 rshrn v3.4h, v4.4s, #6 rshrn2 v3.8h, v5.4s, #6 add v2.8h, v2.8h, v0.8h // dc + apply_sign() add v3.8h, v3.8h, v0.8h smax v2.8h, v2.8h, v30.8h smax v3.8h, v3.8h, v30.8h smin v2.8h, v2.8h, v31.8h smin v3.8h, v3.8h, v31.8h st1 {v2.8h}, [x0], x1 st1 {v3.8h}, [x6], x1 b.gt L(ipred_cfl_splat_w8) ret L(ipred_cfl_splat_w16): AARCH64_VALID_JUMP_TARGET add x7, x5, w3, uxtw #1 sub x1, x1, w3, uxtw #1 mov w9, w3 1: 
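// Two output rows are produced per pass of this loop: x5/x0 walk the AC
// samples and destination of the first row, while x7/x6 (set up above as
// x5 + w*2 bytes and x0 + stride) walk the second row.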
ld1 {v2.8h, v3.8h}, [x5], #32 ld1 {v4.8h, v5.8h}, [x7], #32 subs w3, w3, #16 smull v16.4s, v2.4h, v1.4h // diff = ac * alpha smull2 v17.4s, v2.8h, v1.8h smull v18.4s, v3.4h, v1.4h smull2 v19.4s, v3.8h, v1.8h smull v2.4s, v4.4h, v1.4h smull2 v3.4s, v4.8h, v1.8h smull v4.4s, v5.4h, v1.4h smull2 v5.4s, v5.8h, v1.8h cmlt v20.4s, v16.4s, #0 // sign cmlt v21.4s, v17.4s, #0 cmlt v22.4s, v18.4s, #0 cmlt v23.4s, v19.4s, #0 cmlt v24.4s, v2.4s, #0 cmlt v25.4s, v3.4s, #0 cmlt v26.4s, v4.4s, #0 cmlt v27.4s, v5.4s, #0 add v16.4s, v16.4s, v20.4s // diff + sign add v17.4s, v17.4s, v21.4s add v18.4s, v18.4s, v22.4s add v19.4s, v19.4s, v23.4s add v2.4s, v2.4s, v24.4s add v3.4s, v3.4s, v25.4s add v4.4s, v4.4s, v26.4s add v5.4s, v5.4s, v27.4s rshrn v16.4h, v16.4s, #6 // (diff + sign + 32) >> 6 = apply_sign() rshrn2 v16.8h, v17.4s, #6 rshrn v17.4h, v18.4s, #6 rshrn2 v17.8h, v19.4s, #6 rshrn v6.4h, v2.4s, #6 rshrn2 v6.8h, v3.4s, #6 rshrn v7.4h, v4.4s, #6 rshrn2 v7.8h, v5.4s, #6 add v2.8h, v16.8h, v0.8h // dc + apply_sign() add v3.8h, v17.8h, v0.8h add v4.8h, v6.8h, v0.8h add v5.8h, v7.8h, v0.8h smax v2.8h, v2.8h, v30.8h smax v3.8h, v3.8h, v30.8h smax v4.8h, v4.8h, v30.8h smax v5.8h, v5.8h, v30.8h smin v2.8h, v2.8h, v31.8h smin v3.8h, v3.8h, v31.8h smin v4.8h, v4.8h, v31.8h smin v5.8h, v5.8h, v31.8h st1 {v2.8h, v3.8h}, [x0], #32 st1 {v4.8h, v5.8h}, [x6], #32 b.gt 1b subs w4, w4, #2 add x5, x5, w9, uxtw #1 add x7, x7, w9, uxtw #1 add x0, x0, x1 add x6, x6, x1 mov w3, w9 b.gt 1b ret L(ipred_cfl_128_tbl): L(ipred_cfl_splat_tbl): .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w16) .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w16) .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w8) .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w4) endfunc // void ipred_cfl_top_16bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, // const int16_t *ac, const int alpha, // const int bitdepth_max); function ipred_cfl_top_16bpc_neon, export=1 dup v31.8h, w7 // bitdepth_max clz w9, w3 adr x7, L(ipred_cfl_top_tbl) sub w9, w9, #26 ldrh w9, [x7, w9, uxtw #1] dup v1.8h, w6 // alpha add x2, x2, #2 sub x7, x7, w9, uxtw add x6, x0, x1 lsl x1, x1, #1 movi v30.8h, #0 br x7 4: AARCH64_VALID_JUMP_TARGET ld1 {v0.4h}, [x2] addv h0, v0.4h urshr v0.4h, v0.4h, #2 dup v0.8h, v0.h[0] b L(ipred_cfl_splat_w4) 8: AARCH64_VALID_JUMP_TARGET ld1 {v0.8h}, [x2] addv h0, v0.8h urshr v0.4h, v0.4h, #3 dup v0.8h, v0.h[0] b L(ipred_cfl_splat_w8) 16: AARCH64_VALID_JUMP_TARGET ld1 {v2.8h, v3.8h}, [x2] addp v0.8h, v2.8h, v3.8h addv h0, v0.8h urshr v0.4h, v0.4h, #4 dup v0.8h, v0.h[0] b L(ipred_cfl_splat_w16) 32: AARCH64_VALID_JUMP_TARGET ld1 {v2.8h, v3.8h, v4.8h, v5.8h}, [x2] addp v2.8h, v2.8h, v3.8h addp v4.8h, v4.8h, v5.8h addp v0.8h, v2.8h, v4.8h uaddlv s0, v0.8h rshrn v0.4h, v0.4s, #5 dup v0.8h, v0.h[0] b L(ipred_cfl_splat_w16) L(ipred_cfl_top_tbl): .hword L(ipred_cfl_top_tbl) - 32b .hword L(ipred_cfl_top_tbl) - 16b .hword L(ipred_cfl_top_tbl) - 8b .hword L(ipred_cfl_top_tbl) - 4b endfunc // void ipred_cfl_left_16bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, // const int16_t *ac, const int alpha, // const int bitdepth_max); function ipred_cfl_left_16bpc_neon, export=1 dup v31.8h, w7 // bitdepth_max sub x2, x2, w4, uxtw #1 clz w9, w3 clz w8, w4 adr x10, L(ipred_cfl_splat_tbl) adr x7, L(ipred_cfl_left_tbl) sub w9, w9, #26 sub w8, w8, #26 ldrh w9, [x10, w9, uxtw #1] ldrh w8, [x7, w8, uxtw #1] dup v1.8h, w6 // alpha sub x9, x10, w9, uxtw 
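// Two jump tables are combined here: x7 selects the height-specific handler
// that sums the left edge into a DC value, and x9 selects the width-specific
// splat loop that each of those handlers tail-calls via br x9.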
sub x7, x7, w8, uxtw add x6, x0, x1 lsl x1, x1, #1 movi v30.8h, #0 br x7 L(ipred_cfl_left_h4): AARCH64_VALID_JUMP_TARGET ld1 {v0.4h}, [x2] addv h0, v0.4h urshr v0.4h, v0.4h, #2 dup v0.8h, v0.h[0] br x9 L(ipred_cfl_left_h8): AARCH64_VALID_JUMP_TARGET ld1 {v0.8h}, [x2] addv h0, v0.8h urshr v0.4h, v0.4h, #3 dup v0.8h, v0.h[0] br x9 L(ipred_cfl_left_h16): AARCH64_VALID_JUMP_TARGET ld1 {v2.8h, v3.8h}, [x2] addp v0.8h, v2.8h, v3.8h addv h0, v0.8h urshr v0.4h, v0.4h, #4 dup v0.8h, v0.h[0] br x9 L(ipred_cfl_left_h32): AARCH64_VALID_JUMP_TARGET ld1 {v2.8h, v3.8h, v4.8h, v5.8h}, [x2] addp v2.8h, v2.8h, v3.8h addp v4.8h, v4.8h, v5.8h addp v0.8h, v2.8h, v4.8h uaddlv s0, v0.8h rshrn v0.4h, v0.4s, #5 dup v0.8h, v0.h[0] br x9 L(ipred_cfl_left_tbl): .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h32) .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h16) .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h8) .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h4) endfunc // void ipred_cfl_16bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, // const int16_t *ac, const int alpha, // const int bitdepth_max); function ipred_cfl_16bpc_neon, export=1 dup v31.8h, w7 // bitdepth_max sub x2, x2, w4, uxtw #1 add w8, w3, w4 // width + height dup v1.8h, w6 // alpha clz w9, w3 clz w6, w4 dup v16.4s, w8 // width + height adr x7, L(ipred_cfl_tbl) rbit w8, w8 // rbit(width + height) sub w9, w9, #22 // 26 leading bits, minus table offset 4 sub w6, w6, #26 clz w8, w8 // ctz(width + height) ldrh w9, [x7, w9, uxtw #1] ldrh w6, [x7, w6, uxtw #1] neg w8, w8 // -ctz(width + height) sub x9, x7, w9, uxtw sub x7, x7, w6, uxtw ushr v16.4s, v16.4s, #1 // (width + height) >> 1 dup v17.4s, w8 // -ctz(width + height) add x6, x0, x1 lsl x1, x1, #1 movi v30.8h, #0 br x7 L(ipred_cfl_h4): AARCH64_VALID_JUMP_TARGET ld1 {v0.4h}, [x2], #8 uaddlv s0, v0.4h add x2, x2, #2 br x9 L(ipred_cfl_w4): AARCH64_VALID_JUMP_TARGET ld1 {v2.4h}, [x2] add v0.2s, v0.2s, v16.2s uaddlv s2, v2.4h cmp w4, #4 add v0.2s, v0.2s, v2.2s ushl v0.2s, v0.2s, v17.2s b.eq 1f // h = 8/16 cmp w4, #16 mov w16, #0x6667 mov w17, #0xAAAB csel w16, w16, w17, eq dup v16.2s, w16 mul v0.2s, v0.2s, v16.2s ushr v0.2s, v0.2s, #17 1: dup v0.8h, v0.h[0] b L(ipred_cfl_splat_w4) L(ipred_cfl_h8): AARCH64_VALID_JUMP_TARGET ld1 {v0.8h}, [x2], #16 uaddlv s0, v0.8h add x2, x2, #2 br x9 L(ipred_cfl_w8): AARCH64_VALID_JUMP_TARGET ld1 {v2.8h}, [x2] add v0.2s, v0.2s, v16.2s uaddlv s2, v2.8h cmp w4, #8 add v0.2s, v0.2s, v2.2s ushl v0.2s, v0.2s, v17.2s b.eq 1f // h = 4/16/32 cmp w4, #32 mov w16, #0x6667 mov w17, #0xAAAB csel w16, w16, w17, eq dup v16.2s, w16 mul v0.2s, v0.2s, v16.2s ushr v0.2s, v0.2s, #17 1: dup v0.8h, v0.h[0] b L(ipred_cfl_splat_w8) L(ipred_cfl_h16): AARCH64_VALID_JUMP_TARGET ld1 {v2.8h, v3.8h}, [x2], #32 addp v0.8h, v2.8h, v3.8h add x2, x2, #2 uaddlv s0, v0.8h br x9 L(ipred_cfl_w16): AARCH64_VALID_JUMP_TARGET ld1 {v2.8h, v3.8h}, [x2] add v0.2s, v0.2s, v16.2s addp v2.8h, v2.8h, v3.8h uaddlv s2, v2.8h cmp w4, #16 add v0.2s, v0.2s, v2.2s ushl v0.2s, v0.2s, v17.2s b.eq 1f // h = 4/8/32 tst w4, #(32+16+8) // 16 added to make a consecutive bitmask mov w16, #0x6667 mov w17, #0xAAAB csel w16, w16, w17, eq dup v16.2s, w16 mul v0.2s, v0.2s, v16.2s ushr v0.2s, v0.2s, #17 1: dup v0.8h, v0.h[0] b L(ipred_cfl_splat_w16) L(ipred_cfl_h32): AARCH64_VALID_JUMP_TARGET ld1 {v2.8h, v3.8h, v4.8h, v5.8h}, [x2], #64 addp v2.8h, v2.8h, v3.8h addp v4.8h, v4.8h, v5.8h addp v0.8h, v2.8h, v4.8h add x2, x2, #2 uaddlv s0, v0.8h br x9 L(ipred_cfl_w32): 
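// As in the w4/w8/w16 cases above, the sum is divided by width + height.
// The power-of-two part is removed by the ushl with -ctz(width + height);
// for rectangular blocks a factor of 3 or 5 remains, handled by a fixed-point
// multiply (0xAAAB ~= 2^17/3, 0x6667 ~= 2^17/5) followed by ushr #17.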
AARCH64_VALID_JUMP_TARGET ld1 {v2.8h, v3.8h, v4.8h, v5.8h}, [x2] add v0.4s, v0.4s, v16.4s addp v2.8h, v2.8h, v3.8h addp v4.8h, v4.8h, v5.8h addp v2.8h, v2.8h, v4.8h cmp w4, #32 uaddlv s2, v2.8h add v0.2s, v0.2s, v2.2s ushl v0.2s, v0.2s, v17.2s b.eq 1f // h = 8/16 cmp w4, #8 mov w16, #0x6667 mov w17, #0xAAAB csel w16, w16, w17, eq dup v16.2s, w16 mul v0.2s, v0.2s, v16.2s ushr v0.2s, v0.2s, #17 1: dup v0.8h, v0.h[0] b L(ipred_cfl_splat_w16) L(ipred_cfl_tbl): .hword L(ipred_cfl_tbl) - L(ipred_cfl_h32) .hword L(ipred_cfl_tbl) - L(ipred_cfl_h16) .hword L(ipred_cfl_tbl) - L(ipred_cfl_h8) .hword L(ipred_cfl_tbl) - L(ipred_cfl_h4) .hword L(ipred_cfl_tbl) - L(ipred_cfl_w32) .hword L(ipred_cfl_tbl) - L(ipred_cfl_w16) .hword L(ipred_cfl_tbl) - L(ipred_cfl_w8) .hword L(ipred_cfl_tbl) - L(ipred_cfl_w4) endfunc // void cfl_ac_420_16bpc_neon(int16_t *const ac, const pixel *const ypx, // const ptrdiff_t stride, const int w_pad, // const int h_pad, const int cw, const int ch); function ipred_cfl_ac_420_16bpc_neon, export=1 clz w8, w5 lsl w4, w4, #2 adr x7, L(ipred_cfl_ac_420_tbl) sub w8, w8, #27 ldrh w8, [x7, w8, uxtw #1] movi v24.4s, #0 movi v25.4s, #0 movi v26.4s, #0 movi v27.4s, #0 sub x7, x7, w8, uxtw sub w8, w6, w4 // height - h_pad rbit w9, w5 // rbit(width) rbit w10, w6 // rbit(height) clz w9, w9 // ctz(width) clz w10, w10 // ctz(height) add w9, w9, w10 // log2sz add x10, x1, x2 dup v31.4s, w9 lsl x2, x2, #1 neg v31.4s, v31.4s // -log2sz br x7 L(ipred_cfl_ac_420_w4): AARCH64_VALID_JUMP_TARGET 1: // Copy and subsample input ld1 {v0.8h}, [x1], x2 ld1 {v1.8h}, [x10], x2 ld1 {v2.8h}, [x1], x2 ld1 {v3.8h}, [x10], x2 addp v0.8h, v0.8h, v2.8h addp v1.8h, v1.8h, v3.8h add v0.8h, v0.8h, v1.8h shl v0.8h, v0.8h, #1 subs w8, w8, #2 st1 {v0.8h}, [x0], #16 uaddw v24.4s, v24.4s, v0.4h uaddw2 v25.4s, v25.4s, v0.8h b.gt 1b trn2 v1.2d, v0.2d, v0.2d trn2 v0.2d, v0.2d, v0.2d L(ipred_cfl_ac_420_w4_hpad): cbz w4, 3f 2: // Vertical padding (h_pad > 0) subs w4, w4, #4 st1 {v0.8h, v1.8h}, [x0], #32 uaddw v24.4s, v24.4s, v0.4h uaddw2 v25.4s, v25.4s, v0.8h uaddw v26.4s, v26.4s, v1.4h uaddw2 v27.4s, v27.4s, v1.8h b.gt 2b 3: L(ipred_cfl_ac_420_w4_calc_subtract_dc): // Aggregate the sums add v24.4s, v24.4s, v25.4s add v26.4s, v26.4s, v27.4s add v0.4s, v24.4s, v26.4s addv s0, v0.4s // sum sub x0, x0, w6, uxtw #3 urshl v4.2s, v0.2s, v31.2s // (sum + (1 << (log2sz - 1))) >>= log2sz dup v4.8h, v4.h[0] 6: // Subtract dc from ac ld1 {v0.8h, v1.8h}, [x0] subs w6, w6, #4 sub v0.8h, v0.8h, v4.8h sub v1.8h, v1.8h, v4.8h st1 {v0.8h, v1.8h}, [x0], #32 b.gt 6b ret L(ipred_cfl_ac_420_w8): AARCH64_VALID_JUMP_TARGET cbnz w3, L(ipred_cfl_ac_420_w8_wpad) 1: // Copy and subsample input, without padding ld1 {v0.8h, v1.8h}, [x1], x2 ld1 {v2.8h, v3.8h}, [x10], x2 ld1 {v4.8h, v5.8h}, [x1], x2 addp v0.8h, v0.8h, v1.8h ld1 {v6.8h, v7.8h}, [x10], x2 addp v2.8h, v2.8h, v3.8h addp v4.8h, v4.8h, v5.8h addp v6.8h, v6.8h, v7.8h add v0.8h, v0.8h, v2.8h add v4.8h, v4.8h, v6.8h shl v0.8h, v0.8h, #1 shl v1.8h, v4.8h, #1 subs w8, w8, #2 st1 {v0.8h, v1.8h}, [x0], #32 uaddw v24.4s, v24.4s, v0.4h uaddw2 v25.4s, v25.4s, v0.8h uaddw v26.4s, v26.4s, v1.4h uaddw2 v27.4s, v27.4s, v1.8h b.gt 1b mov v0.16b, v1.16b b L(ipred_cfl_ac_420_w8_hpad) L(ipred_cfl_ac_420_w8_wpad): 1: // Copy and subsample input, padding 4 ld1 {v0.8h}, [x1], x2 ld1 {v1.8h}, [x10], x2 ld1 {v2.8h}, [x1], x2 ld1 {v3.8h}, [x10], x2 addp v0.8h, v0.8h, v2.8h addp v1.8h, v1.8h, v3.8h add v0.8h, v0.8h, v1.8h shl v0.8h, v0.8h, #1 dup v1.4h, v0.h[3] dup v3.4h, v0.h[7] trn2 v2.2d, v0.2d, v0.2d subs w8, w8, #2 
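// The 4 padded columns of each output row are filled by replicating the last
// real AC value of that row (the dup from lanes 3 and 7 above).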
st1 {v0.4h, v1.4h, v2.4h, v3.4h}, [x0], #32 uaddw v24.4s, v24.4s, v0.4h uaddw v25.4s, v25.4s, v1.4h uaddw v26.4s, v26.4s, v2.4h uaddw v27.4s, v27.4s, v3.4h b.gt 1b trn1 v0.2d, v2.2d, v3.2d trn1 v1.2d, v2.2d, v3.2d L(ipred_cfl_ac_420_w8_hpad): cbz w4, 3f 2: // Vertical padding (h_pad > 0) subs w4, w4, #4 st1 {v0.8h, v1.8h}, [x0], #32 uaddw v24.4s, v24.4s, v0.4h uaddw2 v25.4s, v25.4s, v0.8h uaddw v26.4s, v26.4s, v1.4h uaddw2 v27.4s, v27.4s, v1.8h st1 {v0.8h, v1.8h}, [x0], #32 uaddw v24.4s, v24.4s, v0.4h uaddw2 v25.4s, v25.4s, v0.8h uaddw v26.4s, v26.4s, v1.4h uaddw2 v27.4s, v27.4s, v1.8h b.gt 2b 3: // Double the height and reuse the w4 summing/subtracting lsl w6, w6, #1 b L(ipred_cfl_ac_420_w4_calc_subtract_dc) L(ipred_cfl_ac_420_w16): AARCH64_VALID_JUMP_TARGET adr x7, L(ipred_cfl_ac_420_w16_tbl) ldrh w3, [x7, w3, uxtw #1] sub x7, x7, w3, uxtw br x7 L(ipred_cfl_ac_420_w16_wpad0): AARCH64_VALID_JUMP_TARGET 1: // Copy and subsample input, without padding ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x1], x2 ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x10], x2 addp v0.8h, v0.8h, v1.8h addp v2.8h, v2.8h, v3.8h addp v4.8h, v4.8h, v5.8h addp v6.8h, v6.8h, v7.8h ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x1], x2 add v0.8h, v0.8h, v4.8h ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x10], x2 add v2.8h, v2.8h, v6.8h addp v16.8h, v16.8h, v17.8h addp v18.8h, v18.8h, v19.8h addp v20.8h, v20.8h, v21.8h addp v22.8h, v22.8h, v23.8h add v16.8h, v16.8h, v20.8h add v18.8h, v18.8h, v22.8h shl v0.8h, v0.8h, #1 shl v1.8h, v2.8h, #1 shl v2.8h, v16.8h, #1 shl v3.8h, v18.8h, #1 subs w8, w8, #2 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 uaddw v24.4s, v24.4s, v0.4h uaddw2 v25.4s, v25.4s, v0.8h uaddw v26.4s, v26.4s, v1.4h uaddw2 v27.4s, v27.4s, v1.8h uaddw v24.4s, v24.4s, v2.4h uaddw2 v25.4s, v25.4s, v2.8h uaddw v26.4s, v26.4s, v3.4h uaddw2 v27.4s, v27.4s, v3.8h b.gt 1b mov v0.16b, v2.16b mov v1.16b, v3.16b b L(ipred_cfl_ac_420_w16_hpad) L(ipred_cfl_ac_420_w16_wpad1): AARCH64_VALID_JUMP_TARGET 1: // Copy and subsample input, padding 4 ldr q2, [x1, #32] ld1 {v0.8h, v1.8h}, [x1], x2 ldr q5, [x10, #32] ld1 {v3.8h, v4.8h}, [x10], x2 addp v2.8h, v2.8h, v2.8h addp v0.8h, v0.8h, v1.8h addp v5.8h, v5.8h, v5.8h addp v3.8h, v3.8h, v4.8h ldr q18, [x1, #32] add v2.4h, v2.4h, v5.4h ld1 {v16.8h, v17.8h}, [x1], x2 add v0.8h, v0.8h, v3.8h ldr q21, [x10, #32] ld1 {v19.8h, v20.8h}, [x10], x2 addp v18.8h, v18.8h, v18.8h addp v16.8h, v16.8h, v17.8h addp v21.8h, v21.8h, v21.8h addp v19.8h, v19.8h, v20.8h add v18.4h, v18.4h, v21.4h add v16.8h, v16.8h, v19.8h shl v1.4h, v2.4h, #1 shl v0.8h, v0.8h, #1 shl v3.4h, v18.4h, #1 shl v2.8h, v16.8h, #1 dup v4.4h, v1.h[3] dup v5.4h, v3.h[3] trn1 v1.2d, v1.2d, v4.2d trn1 v3.2d, v3.2d, v5.2d subs w8, w8, #2 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 uaddw v24.4s, v24.4s, v0.4h uaddw2 v25.4s, v25.4s, v0.8h uaddw v26.4s, v26.4s, v1.4h uaddw2 v27.4s, v27.4s, v1.8h uaddw v24.4s, v24.4s, v2.4h uaddw2 v25.4s, v25.4s, v2.8h uaddw v26.4s, v26.4s, v3.4h uaddw2 v27.4s, v27.4s, v3.8h b.gt 1b mov v0.16b, v2.16b mov v1.16b, v3.16b b L(ipred_cfl_ac_420_w16_hpad) L(ipred_cfl_ac_420_w16_wpad2): AARCH64_VALID_JUMP_TARGET 1: // Copy and subsample input, padding 8 ld1 {v0.8h, v1.8h}, [x1], x2 ld1 {v2.8h, v3.8h}, [x10], x2 ld1 {v4.8h, v5.8h}, [x1], x2 addp v0.8h, v0.8h, v1.8h ld1 {v6.8h, v7.8h}, [x10], x2 addp v2.8h, v2.8h, v3.8h addp v4.8h, v4.8h, v5.8h addp v6.8h, v6.8h, v7.8h add v0.8h, v0.8h, v2.8h add v4.8h, v4.8h, v6.8h shl v0.8h, v0.8h, #1 shl v2.8h, v4.8h, #1 dup v1.8h, v0.h[7] dup v3.8h, v2.h[7] subs w8, w8, #2 st1 {v0.8h, v1.8h, v2.8h, 
v3.8h}, [x0], #64 uaddw v24.4s, v24.4s, v0.4h uaddw2 v25.4s, v25.4s, v0.8h uaddw v26.4s, v26.4s, v1.4h uaddw2 v27.4s, v27.4s, v1.8h uaddw v24.4s, v24.4s, v2.4h uaddw2 v25.4s, v25.4s, v2.8h uaddw v26.4s, v26.4s, v3.4h uaddw2 v27.4s, v27.4s, v3.8h b.gt 1b mov v0.16b, v2.16b mov v1.16b, v3.16b b L(ipred_cfl_ac_420_w16_hpad) L(ipred_cfl_ac_420_w16_wpad3): AARCH64_VALID_JUMP_TARGET 1: // Copy and subsample input, padding 12 ld1 {v0.8h}, [x1], x2 ld1 {v2.8h}, [x10], x2 ld1 {v4.8h}, [x1], x2 ld1 {v6.8h}, [x10], x2 addp v0.8h, v0.8h, v4.8h addp v2.8h, v2.8h, v6.8h add v0.8h, v0.8h, v2.8h shl v0.8h, v0.8h, #1 dup v1.8h, v0.h[3] dup v3.8h, v0.h[7] trn2 v2.2d, v0.2d, v3.2d trn1 v0.2d, v0.2d, v1.2d subs w8, w8, #2 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 uaddw v24.4s, v24.4s, v0.4h uaddw2 v25.4s, v25.4s, v0.8h uaddw v26.4s, v26.4s, v1.4h uaddw2 v27.4s, v27.4s, v1.8h uaddw v24.4s, v24.4s, v2.4h uaddw2 v25.4s, v25.4s, v2.8h uaddw v26.4s, v26.4s, v3.4h uaddw2 v27.4s, v27.4s, v3.8h b.gt 1b mov v0.16b, v2.16b mov v1.16b, v3.16b L(ipred_cfl_ac_420_w16_hpad): cbz w4, 3f 2: // Vertical padding (h_pad > 0) subs w4, w4, #4 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 uaddw v24.4s, v24.4s, v0.4h uaddw2 v25.4s, v25.4s, v0.8h uaddw v26.4s, v26.4s, v1.4h uaddw2 v27.4s, v27.4s, v1.8h uaddw v24.4s, v24.4s, v2.4h uaddw2 v25.4s, v25.4s, v2.8h uaddw v26.4s, v26.4s, v3.4h uaddw2 v27.4s, v27.4s, v3.8h st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 uaddw v24.4s, v24.4s, v0.4h uaddw2 v25.4s, v25.4s, v0.8h uaddw v26.4s, v26.4s, v1.4h uaddw2 v27.4s, v27.4s, v1.8h uaddw v24.4s, v24.4s, v2.4h uaddw2 v25.4s, v25.4s, v2.8h uaddw v26.4s, v26.4s, v3.4h uaddw2 v27.4s, v27.4s, v3.8h b.gt 2b 3: // Quadruple the height and reuse the w4 summing/subtracting lsl w6, w6, #2 b L(ipred_cfl_ac_420_w4_calc_subtract_dc) L(ipred_cfl_ac_420_tbl): .hword L(ipred_cfl_ac_420_tbl) - L(ipred_cfl_ac_420_w16) .hword L(ipred_cfl_ac_420_tbl) - L(ipred_cfl_ac_420_w8) .hword L(ipred_cfl_ac_420_tbl) - L(ipred_cfl_ac_420_w4) .hword 0 L(ipred_cfl_ac_420_w16_tbl): .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad0) .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad1) .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad2) .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad3) endfunc // void cfl_ac_422_16bpc_neon(int16_t *const ac, const pixel *const ypx, // const ptrdiff_t stride, const int w_pad, // const int h_pad, const int cw, const int ch); function ipred_cfl_ac_422_16bpc_neon, export=1 clz w8, w5 lsl w4, w4, #2 adr x7, L(ipred_cfl_ac_422_tbl) sub w8, w8, #27 ldrh w8, [x7, w8, uxtw #1] movi v24.4s, #0 movi v25.4s, #0 movi v26.4s, #0 movi v27.4s, #0 sub x7, x7, w8, uxtw sub w8, w6, w4 // height - h_pad rbit w9, w5 // rbit(width) rbit w10, w6 // rbit(height) clz w9, w9 // ctz(width) clz w10, w10 // ctz(height) add w9, w9, w10 // log2sz add x10, x1, x2 dup v31.4s, w9 lsl x2, x2, #1 neg v31.4s, v31.4s // -log2sz br x7 L(ipred_cfl_ac_422_w4): AARCH64_VALID_JUMP_TARGET 1: // Copy and subsample input ld1 {v0.8h}, [x1], x2 ld1 {v1.8h}, [x10], x2 ld1 {v2.8h}, [x1], x2 ld1 {v3.8h}, [x10], x2 addp v0.8h, v0.8h, v1.8h addp v2.8h, v2.8h, v3.8h shl v0.8h, v0.8h, #2 shl v1.8h, v2.8h, #2 subs w8, w8, #4 st1 {v0.8h, v1.8h}, [x0], #32 uaddw v24.4s, v24.4s, v0.4h uaddw2 v25.4s, v25.4s, v0.8h uaddw v26.4s, v26.4s, v1.4h uaddw2 v27.4s, v27.4s, v1.8h b.gt 1b trn2 v0.2d, v1.2d, v1.2d trn2 v1.2d, v1.2d, v1.2d b L(ipred_cfl_ac_420_w4_hpad) L(ipred_cfl_ac_422_w8): AARCH64_VALID_JUMP_TARGET cbnz w3, L(ipred_cfl_ac_422_w8_wpad) 1: 
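// 4:2:2 only subsamples horizontally: each output is a sum of 2 luma pixels
// shifted left by 2, so the AC samples end up as luma << 3, the same scale as
// the 4:2:0 path (sum of 4, shl #1). That is what lets these paths reuse the
// hpad and subtract-DC code from cfl_ac_420 above.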
// Copy and subsample input, without padding ld1 {v0.8h, v1.8h}, [x1], x2 ld1 {v2.8h, v3.8h}, [x10], x2 ld1 {v4.8h, v5.8h}, [x1], x2 addp v0.8h, v0.8h, v1.8h ld1 {v6.8h, v7.8h}, [x10], x2 addp v2.8h, v2.8h, v3.8h addp v4.8h, v4.8h, v5.8h addp v6.8h, v6.8h, v7.8h shl v0.8h, v0.8h, #2 shl v1.8h, v2.8h, #2 shl v2.8h, v4.8h, #2 shl v3.8h, v6.8h, #2 subs w8, w8, #4 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 uaddw v24.4s, v24.4s, v0.4h uaddw2 v25.4s, v25.4s, v0.8h uaddw v26.4s, v26.4s, v1.4h uaddw2 v27.4s, v27.4s, v1.8h uaddw v24.4s, v24.4s, v2.4h uaddw2 v25.4s, v25.4s, v2.8h uaddw v26.4s, v26.4s, v3.4h uaddw2 v27.4s, v27.4s, v3.8h b.gt 1b mov v0.16b, v3.16b mov v1.16b, v3.16b b L(ipred_cfl_ac_420_w8_hpad) L(ipred_cfl_ac_422_w8_wpad): 1: // Copy and subsample input, padding 4 ld1 {v0.8h}, [x1], x2 ld1 {v1.8h}, [x10], x2 ld1 {v2.8h}, [x1], x2 ld1 {v3.8h}, [x10], x2 addp v0.8h, v0.8h, v1.8h addp v2.8h, v2.8h, v3.8h shl v0.8h, v0.8h, #2 shl v2.8h, v2.8h, #2 dup v4.4h, v0.h[3] dup v5.8h, v0.h[7] dup v6.4h, v2.h[3] dup v7.8h, v2.h[7] trn2 v1.2d, v0.2d, v5.2d trn1 v0.2d, v0.2d, v4.2d trn2 v3.2d, v2.2d, v7.2d trn1 v2.2d, v2.2d, v6.2d subs w8, w8, #4 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 uaddw v24.4s, v24.4s, v0.4h uaddw2 v25.4s, v25.4s, v0.8h uaddw v26.4s, v26.4s, v1.4h uaddw2 v27.4s, v27.4s, v1.8h uaddw v24.4s, v24.4s, v2.4h uaddw2 v25.4s, v25.4s, v2.8h uaddw v26.4s, v26.4s, v3.4h uaddw2 v27.4s, v27.4s, v3.8h b.gt 1b mov v0.16b, v3.16b mov v1.16b, v3.16b b L(ipred_cfl_ac_420_w8_hpad) L(ipred_cfl_ac_422_w16): AARCH64_VALID_JUMP_TARGET adr x7, L(ipred_cfl_ac_422_w16_tbl) ldrh w3, [x7, w3, uxtw #1] sub x7, x7, w3, uxtw br x7 L(ipred_cfl_ac_422_w16_wpad0): AARCH64_VALID_JUMP_TARGET 1: // Copy and subsample input, without padding ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x1], x2 ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x10], x2 addp v0.8h, v0.8h, v1.8h addp v2.8h, v2.8h, v3.8h addp v4.8h, v4.8h, v5.8h addp v6.8h, v6.8h, v7.8h shl v0.8h, v0.8h, #2 shl v1.8h, v2.8h, #2 shl v2.8h, v4.8h, #2 shl v3.8h, v6.8h, #2 subs w8, w8, #2 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 uaddw v24.4s, v24.4s, v0.4h uaddw2 v25.4s, v25.4s, v0.8h uaddw v26.4s, v26.4s, v1.4h uaddw2 v27.4s, v27.4s, v1.8h uaddw v24.4s, v24.4s, v2.4h uaddw2 v25.4s, v25.4s, v2.8h uaddw v26.4s, v26.4s, v3.4h uaddw2 v27.4s, v27.4s, v3.8h b.gt 1b mov v0.16b, v2.16b mov v1.16b, v3.16b b L(ipred_cfl_ac_420_w16_hpad) L(ipred_cfl_ac_422_w16_wpad1): AARCH64_VALID_JUMP_TARGET 1: // Copy and subsample input, padding 4 ldr q2, [x1, #32] ld1 {v0.8h, v1.8h}, [x1], x2 ldr q6, [x10, #32] ld1 {v4.8h, v5.8h}, [x10], x2 addp v2.8h, v2.8h, v2.8h addp v0.8h, v0.8h, v1.8h addp v6.8h, v6.8h, v6.8h addp v4.8h, v4.8h, v5.8h shl v1.4h, v2.4h, #2 shl v0.8h, v0.8h, #2 shl v3.4h, v6.4h, #2 shl v2.8h, v4.8h, #2 dup v4.4h, v1.h[3] dup v5.4h, v3.h[3] trn1 v1.2d, v1.2d, v4.2d trn1 v3.2d, v3.2d, v5.2d subs w8, w8, #2 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 uaddw v24.4s, v24.4s, v0.4h uaddw2 v25.4s, v25.4s, v0.8h uaddw v26.4s, v26.4s, v1.4h uaddw2 v27.4s, v27.4s, v1.8h uaddw v24.4s, v24.4s, v2.4h uaddw2 v25.4s, v25.4s, v2.8h uaddw v26.4s, v26.4s, v3.4h uaddw2 v27.4s, v27.4s, v3.8h b.gt 1b mov v0.16b, v2.16b mov v1.16b, v3.16b b L(ipred_cfl_ac_420_w16_hpad) L(ipred_cfl_ac_422_w16_wpad2): AARCH64_VALID_JUMP_TARGET 1: // Copy and subsample input, padding 8 ld1 {v0.8h, v1.8h}, [x1], x2 ld1 {v2.8h, v3.8h}, [x10], x2 addp v0.8h, v0.8h, v1.8h addp v2.8h, v2.8h, v3.8h shl v0.8h, v0.8h, #2 shl v2.8h, v2.8h, #2 dup v1.8h, v0.h[7] dup v3.8h, v2.h[7] subs w8, w8, #2 st1 {v0.8h, v1.8h, v2.8h, 
v3.8h}, [x0], #64 uaddw v24.4s, v24.4s, v0.4h uaddw2 v25.4s, v25.4s, v0.8h uaddw v26.4s, v26.4s, v1.4h uaddw2 v27.4s, v27.4s, v1.8h uaddw v24.4s, v24.4s, v2.4h uaddw2 v25.4s, v25.4s, v2.8h uaddw v26.4s, v26.4s, v3.4h uaddw2 v27.4s, v27.4s, v3.8h b.gt 1b mov v0.16b, v2.16b mov v1.16b, v3.16b b L(ipred_cfl_ac_420_w16_hpad) L(ipred_cfl_ac_422_w16_wpad3): AARCH64_VALID_JUMP_TARGET 1: // Copy and subsample input, padding 12 ld1 {v0.8h}, [x1], x2 ld1 {v2.8h}, [x10], x2 addp v0.8h, v0.8h, v0.8h addp v2.8h, v2.8h, v2.8h shl v0.4h, v0.4h, #2 shl v2.4h, v2.4h, #2 dup v1.8h, v0.h[3] dup v3.8h, v2.h[3] trn1 v0.2d, v0.2d, v1.2d trn1 v2.2d, v2.2d, v3.2d subs w8, w8, #2 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 uaddw v24.4s, v24.4s, v0.4h uaddw2 v25.4s, v25.4s, v0.8h uaddw v26.4s, v26.4s, v1.4h uaddw2 v27.4s, v27.4s, v1.8h uaddw v24.4s, v24.4s, v2.4h uaddw2 v25.4s, v25.4s, v2.8h uaddw v26.4s, v26.4s, v3.4h uaddw2 v27.4s, v27.4s, v3.8h b.gt 1b mov v0.16b, v2.16b mov v1.16b, v3.16b b L(ipred_cfl_ac_420_w16_hpad) L(ipred_cfl_ac_422_tbl): .hword L(ipred_cfl_ac_422_tbl) - L(ipred_cfl_ac_422_w16) .hword L(ipred_cfl_ac_422_tbl) - L(ipred_cfl_ac_422_w8) .hword L(ipred_cfl_ac_422_tbl) - L(ipred_cfl_ac_422_w4) .hword 0 L(ipred_cfl_ac_422_w16_tbl): .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad0) .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad1) .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad2) .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad3) endfunc // void cfl_ac_444_16bpc_neon(int16_t *const ac, const pixel *const ypx, // const ptrdiff_t stride, const int w_pad, // const int h_pad, const int cw, const int ch); function ipred_cfl_ac_444_16bpc_neon, export=1 clz w8, w5 lsl w4, w4, #2 adr x7, L(ipred_cfl_ac_444_tbl) sub w8, w8, #26 ldrh w8, [x7, w8, uxtw #1] movi v24.4s, #0 movi v25.4s, #0 movi v26.4s, #0 movi v27.4s, #0 sub x7, x7, w8, uxtw sub w8, w6, w4 // height - h_pad rbit w9, w5 // rbit(width) rbit w10, w6 // rbit(height) clz w9, w9 // ctz(width) clz w10, w10 // ctz(height) add w9, w9, w10 // log2sz add x10, x1, x2 dup v31.4s, w9 lsl x2, x2, #1 neg v31.4s, v31.4s // -log2sz br x7 L(ipred_cfl_ac_444_w4): AARCH64_VALID_JUMP_TARGET 1: // Copy and expand input ld1 {v0.4h}, [x1], x2 ld1 {v0.d}[1], [x10], x2 ld1 {v1.4h}, [x1], x2 ld1 {v1.d}[1], [x10], x2 shl v0.8h, v0.8h, #3 shl v1.8h, v1.8h, #3 subs w8, w8, #4 st1 {v0.8h, v1.8h}, [x0], #32 uaddw v24.4s, v24.4s, v0.4h uaddw2 v25.4s, v25.4s, v0.8h uaddw v26.4s, v26.4s, v1.4h uaddw2 v27.4s, v27.4s, v1.8h b.gt 1b trn2 v0.2d, v1.2d, v1.2d trn2 v1.2d, v1.2d, v1.2d b L(ipred_cfl_ac_420_w4_hpad) L(ipred_cfl_ac_444_w8): AARCH64_VALID_JUMP_TARGET 1: // Copy and expand input ld1 {v0.8h}, [x1], x2 ld1 {v1.8h}, [x10], x2 ld1 {v2.8h}, [x1], x2 shl v0.8h, v0.8h, #3 ld1 {v3.8h}, [x10], x2 shl v1.8h, v1.8h, #3 shl v2.8h, v2.8h, #3 shl v3.8h, v3.8h, #3 subs w8, w8, #4 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 uaddw v24.4s, v24.4s, v0.4h uaddw2 v25.4s, v25.4s, v0.8h uaddw v26.4s, v26.4s, v1.4h uaddw2 v27.4s, v27.4s, v1.8h uaddw v24.4s, v24.4s, v2.4h uaddw2 v25.4s, v25.4s, v2.8h uaddw v26.4s, v26.4s, v3.4h uaddw2 v27.4s, v27.4s, v3.8h b.gt 1b mov v0.16b, v3.16b mov v1.16b, v3.16b b L(ipred_cfl_ac_420_w8_hpad) L(ipred_cfl_ac_444_w16): AARCH64_VALID_JUMP_TARGET cbnz w3, L(ipred_cfl_ac_444_w16_wpad) 1: // Copy and expand input, without padding ld1 {v0.8h, v1.8h}, [x1], x2 ld1 {v2.8h, v3.8h}, [x10], x2 shl v0.8h, v0.8h, #3 shl v1.8h, v1.8h, #3 shl v2.8h, v2.8h, #3 shl v3.8h, v3.8h, #3 subs w8, w8, #2 st1 {v0.8h, 
v1.8h, v2.8h, v3.8h}, [x0], #64 uaddw v24.4s, v24.4s, v0.4h uaddw2 v25.4s, v25.4s, v0.8h uaddw v26.4s, v26.4s, v1.4h uaddw2 v27.4s, v27.4s, v1.8h uaddw v24.4s, v24.4s, v2.4h uaddw2 v25.4s, v25.4s, v2.8h uaddw v26.4s, v26.4s, v3.4h uaddw2 v27.4s, v27.4s, v3.8h b.gt 1b mov v0.16b, v2.16b mov v1.16b, v3.16b b L(ipred_cfl_ac_420_w16_hpad) L(ipred_cfl_ac_444_w16_wpad): 1: // Copy and expand input, padding 8 ld1 {v0.8h}, [x1], x2 ld1 {v2.8h}, [x10], x2 shl v0.8h, v0.8h, #3 shl v2.8h, v2.8h, #3 dup v1.8h, v0.h[7] dup v3.8h, v2.h[7] subs w8, w8, #2 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 uaddw v24.4s, v24.4s, v0.4h uaddw2 v25.4s, v25.4s, v0.8h uaddw v26.4s, v26.4s, v1.4h uaddw2 v27.4s, v27.4s, v1.8h uaddw v24.4s, v24.4s, v2.4h uaddw2 v25.4s, v25.4s, v2.8h uaddw v26.4s, v26.4s, v3.4h uaddw2 v27.4s, v27.4s, v3.8h b.gt 1b mov v0.16b, v2.16b mov v1.16b, v3.16b b L(ipred_cfl_ac_420_w16_hpad) L(ipred_cfl_ac_444_w32): AARCH64_VALID_JUMP_TARGET adr x7, L(ipred_cfl_ac_444_w32_tbl) ldrh w3, [x7, w3, uxtw] // (w3>>1) << 1 lsr x2, x2, #1 // Restore the stride to one line increments sub x7, x7, w3, uxtw br x7 L(ipred_cfl_ac_444_w32_wpad0): AARCH64_VALID_JUMP_TARGET 1: // Copy and expand input, without padding ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x1], x2 shl v0.8h, v0.8h, #3 shl v1.8h, v1.8h, #3 shl v2.8h, v2.8h, #3 shl v3.8h, v3.8h, #3 subs w8, w8, #1 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 uaddw v24.4s, v24.4s, v0.4h uaddw2 v25.4s, v25.4s, v0.8h uaddw v26.4s, v26.4s, v1.4h uaddw2 v27.4s, v27.4s, v1.8h uaddw v24.4s, v24.4s, v2.4h uaddw2 v25.4s, v25.4s, v2.8h uaddw v26.4s, v26.4s, v3.4h uaddw2 v27.4s, v27.4s, v3.8h b.gt 1b b L(ipred_cfl_ac_444_w32_hpad) L(ipred_cfl_ac_444_w32_wpad2): AARCH64_VALID_JUMP_TARGET 1: // Copy and expand input, padding 8 ld1 {v0.8h, v1.8h, v2.8h}, [x1], x2 shl v2.8h, v2.8h, #3 shl v0.8h, v0.8h, #3 shl v1.8h, v1.8h, #3 dup v3.8h, v2.h[7] subs w8, w8, #1 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 uaddw v24.4s, v24.4s, v0.4h uaddw2 v25.4s, v25.4s, v0.8h uaddw v26.4s, v26.4s, v1.4h uaddw2 v27.4s, v27.4s, v1.8h uaddw v24.4s, v24.4s, v2.4h uaddw2 v25.4s, v25.4s, v2.8h uaddw v26.4s, v26.4s, v3.4h uaddw2 v27.4s, v27.4s, v3.8h b.gt 1b b L(ipred_cfl_ac_444_w32_hpad) L(ipred_cfl_ac_444_w32_wpad4): AARCH64_VALID_JUMP_TARGET 1: // Copy and expand input, padding 16 ld1 {v0.8h, v1.8h}, [x1], x2 shl v1.8h, v1.8h, #3 shl v0.8h, v0.8h, #3 dup v2.8h, v1.h[7] dup v3.8h, v1.h[7] subs w8, w8, #1 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 uaddw v24.4s, v24.4s, v0.4h uaddw2 v25.4s, v25.4s, v0.8h uaddw v26.4s, v26.4s, v1.4h uaddw2 v27.4s, v27.4s, v1.8h uaddw v24.4s, v24.4s, v2.4h uaddw2 v25.4s, v25.4s, v2.8h uaddw v26.4s, v26.4s, v3.4h uaddw2 v27.4s, v27.4s, v3.8h b.gt 1b b L(ipred_cfl_ac_444_w32_hpad) L(ipred_cfl_ac_444_w32_wpad6): AARCH64_VALID_JUMP_TARGET 1: // Copy and expand input, padding 24 ld1 {v0.8h}, [x1], x2 shl v0.8h, v0.8h, #3 dup v1.8h, v0.h[7] dup v2.8h, v0.h[7] dup v3.8h, v0.h[7] subs w8, w8, #1 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 uaddw v24.4s, v24.4s, v0.4h uaddw2 v25.4s, v25.4s, v0.8h uaddw v26.4s, v26.4s, v1.4h uaddw2 v27.4s, v27.4s, v1.8h uaddw v24.4s, v24.4s, v2.4h uaddw2 v25.4s, v25.4s, v2.8h uaddw v26.4s, v26.4s, v3.4h uaddw2 v27.4s, v27.4s, v3.8h b.gt 1b L(ipred_cfl_ac_444_w32_hpad): cbz w4, 3f 2: // Vertical padding (h_pad > 0) subs w4, w4, #2 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 uaddw v24.4s, v24.4s, v0.4h uaddw2 v25.4s, v25.4s, v0.8h uaddw v26.4s, v26.4s, v1.4h uaddw2 v27.4s, v27.4s, v1.8h uaddw v24.4s, v24.4s, v2.4h uaddw2 v25.4s, v25.4s, v2.8h uaddw v26.4s, 
v26.4s, v3.4h uaddw2 v27.4s, v27.4s, v3.8h st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 uaddw v24.4s, v24.4s, v0.4h uaddw2 v25.4s, v25.4s, v0.8h uaddw v26.4s, v26.4s, v1.4h uaddw2 v27.4s, v27.4s, v1.8h uaddw v24.4s, v24.4s, v2.4h uaddw2 v25.4s, v25.4s, v2.8h uaddw v26.4s, v26.4s, v3.4h uaddw2 v27.4s, v27.4s, v3.8h b.gt 2b 3: // Multiply the height by eight and reuse the w4 subtracting lsl w6, w6, #3 b L(ipred_cfl_ac_420_w4_calc_subtract_dc) L(ipred_cfl_ac_444_tbl): .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w32) .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w16) .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w8) .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w4) L(ipred_cfl_ac_444_w32_tbl): .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad0) .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad2) .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad4) .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad6) endfunc av-scenechange-0.14.1/src/asm/arm/64/mc.S000064400000000000000000003600351046102023000156670ustar 00000000000000/* * Copyright © 2018, VideoLAN and dav1d authors * Copyright © 2018, Janne Grunau * Copyright © 2018, Martin Storsjo * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "src/asm/arm/asm.S" #include "util.S" .macro avg dst, t0, t1, t2, t3 ld1 {\t0\().8h,\t1\().8h}, [x2], 32 ld1 {\t2\().8h,\t3\().8h}, [x3], 32 add \t0\().8h, \t0\().8h, \t2\().8h add \t1\().8h, \t1\().8h, \t3\().8h sqrshrun \dst\().8b, \t0\().8h, #5 sqrshrun2 \dst\().16b, \t1\().8h, #5 .endm .macro w_avg dst, t0, t1, t2, t3 ld1 {\t0\().8h,\t1\().8h}, [x2], 32 ld1 {\t2\().8h,\t3\().8h}, [x3], 32 sub \t0\().8h, \t2\().8h, \t0\().8h sub \t1\().8h, \t3\().8h, \t1\().8h sqdmulh \t0\().8h, \t0\().8h, v30.8h sqdmulh \t1\().8h, \t1\().8h, v30.8h add \t0\().8h, \t2\().8h, \t0\().8h add \t1\().8h, \t3\().8h, \t1\().8h sqrshrun \dst\().8b, \t0\().8h, #4 sqrshrun2 \dst\().16b, \t1\().8h, #4 .endm .macro mask dst, t0, t1, t2, t3 ld1 {v30.16b}, [x6], 16 ld1 {\t0\().8h,\t1\().8h}, [x2], 32 mul v30.16b, v30.16b, v31.16b ld1 {\t2\().8h,\t3\().8h}, [x3], 32 shll v28.8h, v30.8b, #8 shll2 v29.8h, v30.16b, #8 sub \t0\().8h, \t2\().8h, \t0\().8h sub \t1\().8h, \t3\().8h, \t1\().8h sqdmulh \t0\().8h, \t0\().8h, v28.8h sqdmulh \t1\().8h, \t1\().8h, v29.8h add \t0\().8h, \t2\().8h, \t0\().8h add \t1\().8h, \t3\().8h, \t1\().8h sqrshrun \dst\().8b, \t0\().8h, #4 sqrshrun2 \dst\().16b, \t1\().8h, #4 .endm .macro bidir_fn type function \type\()_8bpc_neon, export=1 clz w4, w4 .ifc \type, w_avg dup v30.8h, w6 neg v30.8h, v30.8h shl v30.8h, v30.8h, #11 .endif .ifc \type, mask movi v31.16b, #256-2 .endif adr x7, L(\type\()_tbl) sub w4, w4, #24 ldrh w4, [x7, x4, lsl #1] \type v4, v0, v1, v2, v3 sub x7, x7, w4, uxtw br x7 40: AARCH64_VALID_JUMP_TARGET add x7, x0, x1 lsl x1, x1, #1 4: cmp w5, #4 st1 {v4.s}[0], [x0], x1 st1 {v4.s}[1], [x7], x1 st1 {v4.s}[2], [x0], x1 st1 {v4.s}[3], [x7], x1 b.eq 0f \type v5, v0, v1, v2, v3 cmp w5, #8 st1 {v5.s}[0], [x0], x1 st1 {v5.s}[1], [x7], x1 st1 {v5.s}[2], [x0], x1 st1 {v5.s}[3], [x7], x1 b.eq 0f \type v4, v0, v1, v2, v3 st1 {v4.s}[0], [x0], x1 st1 {v4.s}[1], [x7], x1 \type v5, v0, v1, v2, v3 st1 {v4.s}[2], [x0], x1 st1 {v4.s}[3], [x7], x1 st1 {v5.s}[0], [x0], x1 st1 {v5.s}[1], [x7], x1 st1 {v5.s}[2], [x0], x1 st1 {v5.s}[3], [x7], x1 ret 80: AARCH64_VALID_JUMP_TARGET add x7, x0, x1 lsl x1, x1, #1 8: st1 {v4.d}[0], [x0], x1 \type v5, v0, v1, v2, v3 st1 {v4.d}[1], [x7], x1 st1 {v5.d}[0], [x0], x1 subs w5, w5, #4 st1 {v5.d}[1], [x7], x1 b.le 0f \type v4, v0, v1, v2, v3 b 8b 16: AARCH64_VALID_JUMP_TARGET \type v5, v0, v1, v2, v3 st1 {v4.16b}, [x0], x1 \type v6, v0, v1, v2, v3 st1 {v5.16b}, [x0], x1 \type v7, v0, v1, v2, v3 st1 {v6.16b}, [x0], x1 subs w5, w5, #4 st1 {v7.16b}, [x0], x1 b.le 0f \type v4, v0, v1, v2, v3 b 16b 320: AARCH64_VALID_JUMP_TARGET add x7, x0, x1 lsl x1, x1, #1 32: \type v5, v0, v1, v2, v3 \type v6, v0, v1, v2, v3 st1 {v4.16b,v5.16b}, [x0], x1 \type v7, v0, v1, v2, v3 subs w5, w5, #2 st1 {v6.16b,v7.16b}, [x7], x1 b.le 0f \type v4, v0, v1, v2, v3 b 32b 640: AARCH64_VALID_JUMP_TARGET add x7, x0, x1 lsl x1, x1, #1 64: \type v5, v0, v1, v2, v3 \type v6, v0, v1, v2, v3 \type v7, v0, v1, v2, v3 \type v16, v0, v1, v2, v3 \type v17, v0, v1, v2, v3 st1 {v4.16b,v5.16b,v6.16b,v7.16b}, [x0], x1 \type v18, v0, v1, v2, v3 \type v19, v0, v1, v2, v3 subs w5, w5, #2 st1 {v16.16b,v17.16b,v18.16b,v19.16b}, [x7], x1 b.le 0f \type v4, v0, v1, v2, v3 b 64b 1280: AARCH64_VALID_JUMP_TARGET add x7, x0, #64 128: \type v5, v0, v1, v2, v3 \type v6, v0, v1, v2, v3 \type v7, v0, v1, v2, v3 \type v16, v0, v1, v2, v3 \type v17, v0, v1, v2, v3 st1 {v4.16b,v5.16b,v6.16b,v7.16b}, [x0], x1 \type v18, v0, v1, v2, v3 \type v19, v0, v1, v2, v3 subs w5, w5, #1 st1 {v16.16b,v17.16b,v18.16b,v19.16b}, 
[x7], x1 b.le 0f \type v4, v0, v1, v2, v3 b 128b 0: ret L(\type\()_tbl): .hword L(\type\()_tbl) - 1280b .hword L(\type\()_tbl) - 640b .hword L(\type\()_tbl) - 320b .hword L(\type\()_tbl) - 16b .hword L(\type\()_tbl) - 80b .hword L(\type\()_tbl) - 40b endfunc .endm bidir_fn avg bidir_fn w_avg bidir_fn mask .macro w_mask_fn type function w_mask_\type\()_8bpc_neon, export=1 clz w8, w4 adr x9, L(w_mask_\type\()_tbl) sub w8, w8, #24 ldrh w8, [x9, x8, lsl #1] sub x9, x9, w8, uxtw mov w10, #6903 dup v0.8h, w10 .if \type == 444 movi v1.16b, #64 .elseif \type == 422 dup v2.8b, w7 movi v3.8b, #129 sub v3.8b, v3.8b, v2.8b .elseif \type == 420 dup v2.8h, w7 movi v3.8h, #1, lsl #8 sub v3.8h, v3.8h, v2.8h .endif add x12, x0, x1 lsl x1, x1, #1 br x9 4: AARCH64_VALID_JUMP_TARGET ld1 {v4.8h, v5.8h}, [x2], #32 // tmp1 (four rows at once) ld1 {v6.8h, v7.8h}, [x3], #32 // tmp2 (four rows at once) subs w5, w5, #4 sub v16.8h, v6.8h, v4.8h sub v17.8h, v7.8h, v5.8h sabd v18.8h, v4.8h, v6.8h sabd v19.8h, v5.8h, v7.8h uqsub v18.8h, v0.8h, v18.8h uqsub v19.8h, v0.8h, v19.8h ushr v18.8h, v18.8h, #8 ushr v19.8h, v19.8h, #8 shl v20.8h, v18.8h, #9 shl v21.8h, v19.8h, #9 sqdmulh v20.8h, v20.8h, v16.8h sqdmulh v21.8h, v21.8h, v17.8h add v20.8h, v20.8h, v4.8h add v21.8h, v21.8h, v5.8h sqrshrun v22.8b, v20.8h, #4 sqrshrun v23.8b, v21.8h, #4 .if \type == 444 uzp1 v18.16b, v18.16b, v19.16b // Same as xtn, xtn2 sub v18.16b, v1.16b, v18.16b st1 {v18.16b}, [x6], #16 .elseif \type == 422 addp v18.8h, v18.8h, v19.8h xtn v18.8b, v18.8h uhsub v18.8b, v3.8b, v18.8b st1 {v18.8b}, [x6], #8 .elseif \type == 420 trn1 v24.2d, v18.2d, v19.2d trn2 v25.2d, v18.2d, v19.2d add v24.8h, v24.8h, v25.8h addp v18.8h, v24.8h, v24.8h sub v18.4h, v3.4h, v18.4h rshrn v18.8b, v18.8h, #2 st1 {v18.s}[0], [x6], #4 .endif st1 {v22.s}[0], [x0], x1 st1 {v22.s}[1], [x12], x1 st1 {v23.s}[0], [x0], x1 st1 {v23.s}[1], [x12], x1 b.gt 4b ret 8: AARCH64_VALID_JUMP_TARGET ld1 {v4.8h, v5.8h}, [x2], #32 ld1 {v6.8h, v7.8h}, [x3], #32 subs w5, w5, #2 sub v16.8h, v6.8h, v4.8h sub v17.8h, v7.8h, v5.8h sabd v18.8h, v4.8h, v6.8h sabd v19.8h, v5.8h, v7.8h uqsub v18.8h, v0.8h, v18.8h uqsub v19.8h, v0.8h, v19.8h ushr v18.8h, v18.8h, #8 ushr v19.8h, v19.8h, #8 shl v20.8h, v18.8h, #9 shl v21.8h, v19.8h, #9 sqdmulh v20.8h, v20.8h, v16.8h sqdmulh v21.8h, v21.8h, v17.8h add v20.8h, v20.8h, v4.8h add v21.8h, v21.8h, v5.8h sqrshrun v22.8b, v20.8h, #4 sqrshrun v23.8b, v21.8h, #4 .if \type == 444 uzp1 v18.16b, v18.16b, v19.16b // Same as xtn, xtn2 sub v18.16b, v1.16b, v18.16b st1 {v18.16b}, [x6], #16 .elseif \type == 422 addp v18.8h, v18.8h, v19.8h xtn v18.8b, v18.8h uhsub v18.8b, v3.8b, v18.8b st1 {v18.8b}, [x6], #8 .elseif \type == 420 add v18.8h, v18.8h, v19.8h addp v18.8h, v18.8h, v18.8h sub v18.4h, v3.4h, v18.4h rshrn v18.8b, v18.8h, #2 st1 {v18.s}[0], [x6], #4 .endif st1 {v22.8b}, [x0], x1 st1 {v23.8b}, [x12], x1 b.gt 8b ret 1280: 640: 320: 160: AARCH64_VALID_JUMP_TARGET mov w11, w4 sub x1, x1, w4, uxtw .if \type == 444 add x10, x6, w4, uxtw .elseif \type == 422 add x10, x6, x11, lsr #1 .endif add x9, x3, w4, uxtw #1 add x7, x2, w4, uxtw #1 161: mov w8, w4 16: ld1 {v4.8h, v5.8h}, [x2], #32 ld1 {v6.8h, v7.8h}, [x3], #32 ld1 {v16.8h, v17.8h}, [x7], #32 ld1 {v18.8h, v19.8h}, [x9], #32 subs w8, w8, #16 sub v6.8h, v6.8h, v4.8h sub v7.8h, v7.8h, v5.8h sub v18.8h, v18.8h, v16.8h sub v19.8h, v19.8h, v17.8h abs v20.8h, v6.8h abs v21.8h, v7.8h abs v22.8h, v18.8h abs v23.8h, v19.8h uqsub v20.8h, v0.8h, v20.8h uqsub v21.8h, v0.8h, v21.8h uqsub v22.8h, v0.8h, v22.8h uqsub v23.8h, v0.8h, v23.8h 
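// After the >> 8 below, v20-v23 hold 64 - m, where
// m = min(38 + ((|tmp1 - tmp2| + 8) >> 8), 64); the saturating subtract from
// 6903 computes this directly. The sqdmulh/add/sqrshrun sequence that follows
// then blends the two intermediates as (tmp1*m + tmp2*(64 - m)) >> 10, rounded.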
ushr v20.8h, v20.8h, #8 ushr v21.8h, v21.8h, #8 ushr v22.8h, v22.8h, #8 ushr v23.8h, v23.8h, #8 shl v24.8h, v20.8h, #9 shl v25.8h, v21.8h, #9 shl v26.8h, v22.8h, #9 shl v27.8h, v23.8h, #9 sqdmulh v24.8h, v24.8h, v6.8h sqdmulh v25.8h, v25.8h, v7.8h sqdmulh v26.8h, v26.8h, v18.8h sqdmulh v27.8h, v27.8h, v19.8h add v24.8h, v24.8h, v4.8h add v25.8h, v25.8h, v5.8h add v26.8h, v26.8h, v16.8h add v27.8h, v27.8h, v17.8h sqrshrun v24.8b, v24.8h, #4 sqrshrun v25.8b, v25.8h, #4 sqrshrun v26.8b, v26.8h, #4 sqrshrun v27.8b, v27.8h, #4 .if \type == 444 uzp1 v20.16b, v20.16b, v21.16b // Same as xtn, xtn2 uzp1 v21.16b, v22.16b, v23.16b // Ditto sub v20.16b, v1.16b, v20.16b sub v21.16b, v1.16b, v21.16b st1 {v20.16b}, [x6], #16 st1 {v21.16b}, [x10], #16 .elseif \type == 422 addp v20.8h, v20.8h, v21.8h addp v21.8h, v22.8h, v23.8h xtn v20.8b, v20.8h xtn v21.8b, v21.8h uhsub v20.8b, v3.8b, v20.8b uhsub v21.8b, v3.8b, v21.8b st1 {v20.8b}, [x6], #8 st1 {v21.8b}, [x10], #8 .elseif \type == 420 add v20.8h, v20.8h, v22.8h add v21.8h, v21.8h, v23.8h addp v20.8h, v20.8h, v21.8h sub v20.8h, v3.8h, v20.8h rshrn v20.8b, v20.8h, #2 st1 {v20.8b}, [x6], #8 .endif st1 {v24.8b, v25.8b}, [x0], #16 st1 {v26.8b, v27.8b}, [x12], #16 b.gt 16b subs w5, w5, #2 add x2, x2, w4, uxtw #1 add x3, x3, w4, uxtw #1 add x7, x7, w4, uxtw #1 add x9, x9, w4, uxtw #1 .if \type == 444 add x6, x6, w4, uxtw add x10, x10, w4, uxtw .elseif \type == 422 add x6, x6, x11, lsr #1 add x10, x10, x11, lsr #1 .endif add x0, x0, x1 add x12, x12, x1 b.gt 161b ret L(w_mask_\type\()_tbl): .hword L(w_mask_\type\()_tbl) - 1280b .hword L(w_mask_\type\()_tbl) - 640b .hword L(w_mask_\type\()_tbl) - 320b .hword L(w_mask_\type\()_tbl) - 160b .hword L(w_mask_\type\()_tbl) - 8b .hword L(w_mask_\type\()_tbl) - 4b endfunc .endm w_mask_fn 444 w_mask_fn 422 w_mask_fn 420 function blend_8bpc_neon, export=1 adr x6, L(blend_tbl) clz w3, w3 sub w3, w3, #26 ldrh w3, [x6, x3, lsl #1] sub x6, x6, w3, uxtw movi v4.16b, #64 add x8, x0, x1 lsl x1, x1, #1 br x6 4: AARCH64_VALID_JUMP_TARGET ld1 {v2.8b}, [x5], #8 ld1 {v1.d}[0], [x2], #8 ld1 {v0.s}[0], [x0] subs w4, w4, #2 ld1 {v0.s}[1], [x8] sub v3.8b, v4.8b, v2.8b umull v5.8h, v1.8b, v2.8b umlal v5.8h, v0.8b, v3.8b rshrn v6.8b, v5.8h, #6 st1 {v6.s}[0], [x0], x1 st1 {v6.s}[1], [x8], x1 b.gt 4b ret 8: AARCH64_VALID_JUMP_TARGET ld1 {v2.16b}, [x5], #16 ld1 {v1.16b}, [x2], #16 ld1 {v0.d}[0], [x0] ld1 {v0.d}[1], [x8] sub v3.16b, v4.16b, v2.16b subs w4, w4, #2 umull v5.8h, v1.8b, v2.8b umlal v5.8h, v0.8b, v3.8b umull2 v6.8h, v1.16b, v2.16b umlal2 v6.8h, v0.16b, v3.16b rshrn v7.8b, v5.8h, #6 rshrn2 v7.16b, v6.8h, #6 st1 {v7.d}[0], [x0], x1 st1 {v7.d}[1], [x8], x1 b.gt 8b ret 16: AARCH64_VALID_JUMP_TARGET ld1 {v1.16b, v2.16b}, [x5], #32 ld1 {v5.16b, v6.16b}, [x2], #32 ld1 {v0.16b}, [x0] subs w4, w4, #2 sub v7.16b, v4.16b, v1.16b sub v20.16b, v4.16b, v2.16b ld1 {v3.16b}, [x8] umull v16.8h, v5.8b, v1.8b umlal v16.8h, v0.8b, v7.8b umull2 v17.8h, v5.16b, v1.16b umlal2 v17.8h, v0.16b, v7.16b umull v21.8h, v6.8b, v2.8b umlal v21.8h, v3.8b, v20.8b umull2 v22.8h, v6.16b, v2.16b umlal2 v22.8h, v3.16b, v20.16b rshrn v18.8b, v16.8h, #6 rshrn2 v18.16b, v17.8h, #6 rshrn v19.8b, v21.8h, #6 rshrn2 v19.16b, v22.8h, #6 st1 {v18.16b}, [x0], x1 st1 {v19.16b}, [x8], x1 b.gt 16b ret 32: AARCH64_VALID_JUMP_TARGET ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x5], #64 ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x2], #64 ld1 {v20.16b, v21.16b}, [x0] subs w4, w4, #2 ld1 {v22.16b, v23.16b}, [x8] sub v5.16b, v4.16b, v0.16b sub v6.16b, v4.16b, v1.16b sub v30.16b, v4.16b, v2.16b 
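// Per-pixel blend, as in the smaller sizes above:
// dst = (tmp*m + dst*(64 - m) + 32) >> 6, with v4 = 64 and
// v5/v6/v30/v31 holding the 64 - m complements of the masks in v0-v3.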
sub v31.16b, v4.16b, v3.16b umull v24.8h, v16.8b, v0.8b umlal v24.8h, v20.8b, v5.8b umull2 v26.8h, v16.16b, v0.16b umlal2 v26.8h, v20.16b, v5.16b umull v28.8h, v17.8b, v1.8b umlal v28.8h, v21.8b, v6.8b umull2 v7.8h, v17.16b, v1.16b umlal2 v7.8h, v21.16b, v6.16b umull v27.8h, v18.8b, v2.8b umlal v27.8h, v22.8b, v30.8b umull2 v1.8h, v18.16b, v2.16b umlal2 v1.8h, v22.16b, v30.16b umull v29.8h, v19.8b, v3.8b umlal v29.8h, v23.8b, v31.8b umull2 v21.8h, v19.16b, v3.16b umlal2 v21.8h, v23.16b, v31.16b rshrn v24.8b, v24.8h, #6 rshrn2 v24.16b, v26.8h, #6 rshrn v25.8b, v28.8h, #6 rshrn2 v25.16b, v7.8h, #6 rshrn v27.8b, v27.8h, #6 rshrn2 v27.16b, v1.8h, #6 rshrn v28.8b, v29.8h, #6 rshrn2 v28.16b, v21.8h, #6 st1 {v24.16b, v25.16b}, [x0], x1 st1 {v27.16b, v28.16b}, [x8], x1 b.gt 32b ret L(blend_tbl): .hword L(blend_tbl) - 32b .hword L(blend_tbl) - 16b .hword L(blend_tbl) - 8b .hword L(blend_tbl) - 4b endfunc function blend_h_8bpc_neon, export=1 adr x6, L(blend_h_tbl) movrel x5, X(obmc_masks) add x5, x5, w4, uxtw sub w4, w4, w4, lsr #2 clz w7, w3 movi v4.16b, #64 add x8, x0, x1 lsl x1, x1, #1 sub w7, w7, #24 ldrh w7, [x6, x7, lsl #1] sub x6, x6, w7, uxtw br x6 2: AARCH64_VALID_JUMP_TARGET ld1 {v0.h}[0], [x5], #2 ld1 {v1.s}[0], [x2], #4 subs w4, w4, #2 ld1 {v2.h}[0], [x0] zip1 v0.8b, v0.8b, v0.8b sub v3.8b, v4.8b, v0.8b ld1 {v2.h}[1], [x8] umull v5.8h, v1.8b, v0.8b umlal v5.8h, v2.8b, v3.8b rshrn v5.8b, v5.8h, #6 st1 {v5.h}[0], [x0], x1 st1 {v5.h}[1], [x8], x1 b.gt 2b ret 4: AARCH64_VALID_JUMP_TARGET ld2r {v0.8b, v1.8b}, [x5], #2 ld1 {v2.8b}, [x2], #8 subs w4, w4, #2 ext v0.8b, v0.8b, v1.8b, #4 ld1 {v3.s}[0], [x0] sub v5.8b, v4.8b, v0.8b ld1 {v3.s}[1], [x8] umull v6.8h, v2.8b, v0.8b umlal v6.8h, v3.8b, v5.8b rshrn v6.8b, v6.8h, #6 st1 {v6.s}[0], [x0], x1 st1 {v6.s}[1], [x8], x1 b.gt 4b ret 8: AARCH64_VALID_JUMP_TARGET ld2r {v0.16b, v1.16b}, [x5], #2 ld1 {v2.16b}, [x2], #16 ld1 {v3.d}[0], [x0] ext v0.16b, v0.16b, v1.16b, #8 sub v5.16b, v4.16b, v0.16b ld1 {v3.d}[1], [x8] subs w4, w4, #2 umull v6.8h, v0.8b, v2.8b umlal v6.8h, v3.8b, v5.8b umull2 v7.8h, v0.16b, v2.16b umlal2 v7.8h, v3.16b, v5.16b rshrn v16.8b, v6.8h, #6 rshrn2 v16.16b, v7.8h, #6 st1 {v16.d}[0], [x0], x1 st1 {v16.d}[1], [x8], x1 b.gt 8b ret 16: AARCH64_VALID_JUMP_TARGET ld2r {v0.16b, v1.16b}, [x5], #2 ld1 {v2.16b, v3.16b}, [x2], #32 ld1 {v5.16b}, [x0] sub v7.16b, v4.16b, v0.16b sub v16.16b, v4.16b, v1.16b ld1 {v6.16b}, [x8] subs w4, w4, #2 umull v17.8h, v0.8b, v2.8b umlal v17.8h, v5.8b, v7.8b umull2 v18.8h, v0.16b, v2.16b umlal2 v18.8h, v5.16b, v7.16b umull v19.8h, v1.8b, v3.8b umlal v19.8h, v6.8b, v16.8b umull2 v20.8h, v1.16b, v3.16b umlal2 v20.8h, v6.16b, v16.16b rshrn v21.8b, v17.8h, #6 rshrn2 v21.16b, v18.8h, #6 rshrn v22.8b, v19.8h, #6 rshrn2 v22.16b, v20.8h, #6 st1 {v21.16b}, [x0], x1 st1 {v22.16b}, [x8], x1 b.gt 16b ret 1280: 640: 320: AARCH64_VALID_JUMP_TARGET sub x1, x1, w3, uxtw add x7, x2, w3, uxtw 321: ld2r {v0.16b, v1.16b}, [x5], #2 mov w6, w3 sub v20.16b, v4.16b, v0.16b sub v21.16b, v4.16b, v1.16b 32: ld1 {v16.16b, v17.16b}, [x2], #32 ld1 {v2.16b, v3.16b}, [x0] subs w6, w6, #32 umull v23.8h, v0.8b, v16.8b umlal v23.8h, v2.8b, v20.8b ld1 {v18.16b, v19.16b}, [x7], #32 umull2 v27.8h, v0.16b, v16.16b umlal2 v27.8h, v2.16b, v20.16b ld1 {v6.16b, v7.16b}, [x8] umull v24.8h, v0.8b, v17.8b umlal v24.8h, v3.8b, v20.8b umull2 v28.8h, v0.16b, v17.16b umlal2 v28.8h, v3.16b, v20.16b umull v25.8h, v1.8b, v18.8b umlal v25.8h, v6.8b, v21.8b umull2 v5.8h, v1.16b, v18.16b umlal2 v5.8h, v6.16b, v21.16b rshrn v29.8b, v23.8h, #6 rshrn2 v29.16b, 
v27.8h, #6 umull v26.8h, v1.8b, v19.8b umlal v26.8h, v7.8b, v21.8b umull2 v31.8h, v1.16b, v19.16b umlal2 v31.8h, v7.16b, v21.16b rshrn v30.8b, v24.8h, #6 rshrn2 v30.16b, v28.8h, #6 rshrn v23.8b, v25.8h, #6 rshrn2 v23.16b, v5.8h, #6 rshrn v24.8b, v26.8h, #6 st1 {v29.16b, v30.16b}, [x0], #32 rshrn2 v24.16b, v31.8h, #6 st1 {v23.16b, v24.16b}, [x8], #32 b.gt 32b subs w4, w4, #2 add x0, x0, x1 add x8, x8, x1 add x2, x2, w3, uxtw add x7, x7, w3, uxtw b.gt 321b ret L(blend_h_tbl): .hword L(blend_h_tbl) - 1280b .hword L(blend_h_tbl) - 640b .hword L(blend_h_tbl) - 320b .hword L(blend_h_tbl) - 16b .hword L(blend_h_tbl) - 8b .hword L(blend_h_tbl) - 4b .hword L(blend_h_tbl) - 2b endfunc function blend_v_8bpc_neon, export=1 adr x6, L(blend_v_tbl) movrel x5, X(obmc_masks) add x5, x5, w3, uxtw clz w3, w3 movi v4.16b, #64 add x8, x0, x1 lsl x1, x1, #1 sub w3, w3, #26 ldrh w3, [x6, x3, lsl #1] sub x6, x6, w3, uxtw br x6 20: AARCH64_VALID_JUMP_TARGET ld1r {v0.8b}, [x5] sub v1.8b, v4.8b, v0.8b 2: ld1 {v2.h}[0], [x2], #2 ld1 {v3.b}[0], [x0] subs w4, w4, #2 ld1 {v2.b}[1], [x2] ld1 {v3.b}[1], [x8] umull v5.8h, v2.8b, v0.8b umlal v5.8h, v3.8b, v1.8b rshrn v5.8b, v5.8h, #6 add x2, x2, #2 st1 {v5.b}[0], [x0], x1 st1 {v5.b}[1], [x8], x1 b.gt 2b ret 40: AARCH64_VALID_JUMP_TARGET ld1r {v0.2s}, [x5] sub x1, x1, #2 sub v1.8b, v4.8b, v0.8b 4: ld1 {v2.8b}, [x2], #8 ld1 {v3.s}[0], [x0] ld1 {v3.s}[1], [x8] subs w4, w4, #2 umull v5.8h, v2.8b, v0.8b umlal v5.8h, v3.8b, v1.8b rshrn v5.8b, v5.8h, #6 st1 {v5.h}[0], [x0], #2 st1 {v5.h}[2], [x8], #2 st1 {v5.b}[2], [x0], x1 st1 {v5.b}[6], [x8], x1 b.gt 4b ret 80: AARCH64_VALID_JUMP_TARGET ld1r {v0.2d}, [x5] sub x1, x1, #4 sub v1.16b, v4.16b, v0.16b 8: ld1 {v2.16b}, [x2], #16 ld1 {v3.d}[0], [x0] ld1 {v3.d}[1], [x8] subs w4, w4, #2 umull v5.8h, v0.8b, v2.8b umlal v5.8h, v3.8b, v1.8b umull2 v6.8h, v0.16b, v2.16b umlal2 v6.8h, v3.16b, v1.16b rshrn v7.8b, v5.8h, #6 rshrn2 v7.16b, v6.8h, #6 st1 {v7.s}[0], [x0], #4 st1 {v7.s}[2], [x8], #4 st1 {v7.h}[2], [x0], x1 st1 {v7.h}[6], [x8], x1 b.gt 8b ret 160: AARCH64_VALID_JUMP_TARGET ld1 {v0.16b}, [x5] sub x1, x1, #8 sub v2.16b, v4.16b, v0.16b 16: ld1 {v5.16b, v6.16b}, [x2], #32 ld1 {v7.16b}, [x0] subs w4, w4, #2 ld1 {v16.16b}, [x8] umull v17.8h, v5.8b, v0.8b umlal v17.8h, v7.8b, v2.8b umull2 v18.8h, v5.16b, v0.16b umlal2 v18.8h, v7.16b, v2.16b umull v20.8h, v6.8b, v0.8b umlal v20.8h, v16.8b, v2.8b umull2 v21.8h, v6.16b, v0.16b umlal2 v21.8h, v16.16b, v2.16b rshrn v19.8b, v17.8h, #6 rshrn2 v19.16b, v18.8h, #6 rshrn v22.8b, v20.8h, #6 rshrn2 v22.16b, v21.8h, #6 st1 {v19.8b}, [x0], #8 st1 {v22.8b}, [x8], #8 st1 {v19.s}[2], [x0], x1 st1 {v22.s}[2], [x8], x1 b.gt 16b ret 320: AARCH64_VALID_JUMP_TARGET ld1 {v0.16b, v1.16b}, [x5] sub x1, x1, #16 sub v2.16b, v4.16b, v0.16b sub v3.8b, v4.8b, v1.8b 32: ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x2], #64 ld1 {v5.16b, v6.16b}, [x0] subs w4, w4, #2 ld1 {v20.16b, v21.16b}, [x8] umull v22.8h, v16.8b, v0.8b umlal v22.8h, v5.8b, v2.8b umull2 v23.8h, v16.16b, v0.16b umlal2 v23.8h, v5.16b, v2.16b umull v28.8h, v17.8b, v1.8b umlal v28.8h, v6.8b, v3.8b umull v30.8h, v18.8b, v0.8b umlal v30.8h, v20.8b, v2.8b umull2 v31.8h, v18.16b, v0.16b umlal2 v31.8h, v20.16b, v2.16b umull v25.8h, v19.8b, v1.8b umlal v25.8h, v21.8b, v3.8b rshrn v24.8b, v22.8h, #6 rshrn2 v24.16b, v23.8h, #6 rshrn v28.8b, v28.8h, #6 rshrn v30.8b, v30.8h, #6 rshrn2 v30.16b, v31.8h, #6 rshrn v27.8b, v25.8h, #6 st1 {v24.16b}, [x0], #16 st1 {v30.16b}, [x8], #16 st1 {v28.8b}, [x0], x1 st1 {v27.8b}, [x8], x1 b.gt 32b ret L(blend_v_tbl): .hword 
L(blend_v_tbl) - 320b .hword L(blend_v_tbl) - 160b .hword L(blend_v_tbl) - 80b .hword L(blend_v_tbl) - 40b .hword L(blend_v_tbl) - 20b endfunc // This has got the same signature as the put_8tap functions, // and assumes that x8 is set to (clz(w)-24). function put_neon adr x9, L(put_tbl) ldrh w8, [x9, x8, lsl #1] sub x9, x9, w8, uxtw br x9 2: AARCH64_VALID_JUMP_TARGET ld1 {v0.h}[0], [x2], x3 ld1 {v1.h}[0], [x2], x3 subs w5, w5, #2 st1 {v0.h}[0], [x0], x1 st1 {v1.h}[0], [x0], x1 b.gt 2b ret 4: AARCH64_VALID_JUMP_TARGET ld1 {v0.s}[0], [x2], x3 ld1 {v1.s}[0], [x2], x3 subs w5, w5, #2 st1 {v0.s}[0], [x0], x1 st1 {v1.s}[0], [x0], x1 b.gt 4b ret 8: AARCH64_VALID_JUMP_TARGET ld1 {v0.8b}, [x2], x3 ld1 {v1.8b}, [x2], x3 subs w5, w5, #2 st1 {v0.8b}, [x0], x1 st1 {v1.8b}, [x0], x1 b.gt 8b ret 160: AARCH64_VALID_JUMP_TARGET add x8, x0, x1 lsl x1, x1, #1 add x9, x2, x3 lsl x3, x3, #1 16: ld1 {v0.16b}, [x2], x3 ld1 {v1.16b}, [x9], x3 subs w5, w5, #2 st1 {v0.16b}, [x0], x1 st1 {v1.16b}, [x8], x1 b.gt 16b ret 32: AARCH64_VALID_JUMP_TARGET ldp x6, x7, [x2] ldp x8, x9, [x2, #16] stp x6, x7, [x0] subs w5, w5, #1 stp x8, x9, [x0, #16] add x2, x2, x3 add x0, x0, x1 b.gt 32b ret 64: AARCH64_VALID_JUMP_TARGET ldp x6, x7, [x2] ldp x8, x9, [x2, #16] stp x6, x7, [x0] ldp x10, x11, [x2, #32] stp x8, x9, [x0, #16] subs w5, w5, #1 ldp x12, x13, [x2, #48] stp x10, x11, [x0, #32] stp x12, x13, [x0, #48] add x2, x2, x3 add x0, x0, x1 b.gt 64b ret 128: AARCH64_VALID_JUMP_TARGET ldp q0, q1, [x2] ldp q2, q3, [x2, #32] stp q0, q1, [x0] ldp q4, q5, [x2, #64] stp q2, q3, [x0, #32] ldp q6, q7, [x2, #96] subs w5, w5, #1 stp q4, q5, [x0, #64] stp q6, q7, [x0, #96] add x2, x2, x3 add x0, x0, x1 b.gt 128b ret L(put_tbl): .hword L(put_tbl) - 128b .hword L(put_tbl) - 64b .hword L(put_tbl) - 32b .hword L(put_tbl) - 160b .hword L(put_tbl) - 8b .hword L(put_tbl) - 4b .hword L(put_tbl) - 2b endfunc // This has got the same signature as the prep_8tap functions, // and assumes that x8 is set to (clz(w)-24), and x7 to w*2. 
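// prep copies a block of 8-bit source pixels into the 16-bit intermediate
// buffer, widening each pixel by the intermediate bits (ushll #4) so that the
// compound functions above (avg/w_avg/mask) can narrow back down with their
// rounding #5/#4 shifts.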
function prep_neon adr x9, L(prep_tbl) ldrh w8, [x9, x8, lsl #1] sub x9, x9, w8, uxtw br x9 4: AARCH64_VALID_JUMP_TARGET ld1 {v0.s}[0], [x1], x2 ld1 {v1.s}[0], [x1], x2 subs w4, w4, #2 ushll v0.8h, v0.8b, #4 ushll v1.8h, v1.8b, #4 st1 {v0.4h, v1.4h}, [x0], #16 b.gt 4b ret 8: AARCH64_VALID_JUMP_TARGET ld1 {v0.8b}, [x1], x2 ld1 {v1.8b}, [x1], x2 subs w4, w4, #2 ushll v0.8h, v0.8b, #4 ushll v1.8h, v1.8b, #4 st1 {v0.8h, v1.8h}, [x0], #32 b.gt 8b ret 160: AARCH64_VALID_JUMP_TARGET add x9, x1, x2 lsl x2, x2, #1 16: ld1 {v0.16b}, [x1], x2 ld1 {v1.16b}, [x9], x2 subs w4, w4, #2 ushll v4.8h, v0.8b, #4 ushll2 v5.8h, v0.16b, #4 ushll v6.8h, v1.8b, #4 ushll2 v7.8h, v1.16b, #4 st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64 b.gt 16b ret 320: AARCH64_VALID_JUMP_TARGET add x8, x0, w3, uxtw 32: ld1 {v0.16b, v1.16b}, [x1], x2 subs w4, w4, #2 ushll v4.8h, v0.8b, #4 ushll2 v5.8h, v0.16b, #4 ld1 {v2.16b, v3.16b}, [x1], x2 ushll v6.8h, v1.8b, #4 ushll2 v7.8h, v1.16b, #4 ushll v16.8h, v2.8b, #4 st1 {v4.8h, v5.8h}, [x0], x7 ushll2 v17.8h, v2.16b, #4 st1 {v6.8h, v7.8h}, [x8], x7 ushll v18.8h, v3.8b, #4 st1 {v16.8h, v17.8h}, [x0], x7 ushll2 v19.8h, v3.16b, #4 st1 {v18.8h, v19.8h}, [x8], x7 b.gt 32b ret 640: AARCH64_VALID_JUMP_TARGET add x8, x0, #32 mov x6, #64 64: ldp q0, q1, [x1] subs w4, w4, #1 ushll v4.8h, v0.8b, #4 ushll2 v5.8h, v0.16b, #4 ldp q2, q3, [x1, #32] ushll v6.8h, v1.8b, #4 ushll2 v7.8h, v1.16b, #4 add x1, x1, x2 ushll v16.8h, v2.8b, #4 st1 {v4.8h, v5.8h}, [x0], x6 ushll2 v17.8h, v2.16b, #4 ushll v18.8h, v3.8b, #4 st1 {v6.8h, v7.8h}, [x8], x6 ushll2 v19.8h, v3.16b, #4 st1 {v16.8h, v17.8h}, [x0], x6 st1 {v18.8h, v19.8h}, [x8], x6 b.gt 64b ret 1280: AARCH64_VALID_JUMP_TARGET add x8, x0, #64 mov x6, #128 128: ldp q0, q1, [x1] ldp q2, q3, [x1, #32] ushll v16.8h, v0.8b, #4 ushll2 v17.8h, v0.16b, #4 ushll v18.8h, v1.8b, #4 ushll2 v19.8h, v1.16b, #4 ushll v20.8h, v2.8b, #4 ushll2 v21.8h, v2.16b, #4 ldp q4, q5, [x1, #64] st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x0], x6 ushll v22.8h, v3.8b, #4 ushll2 v23.8h, v3.16b, #4 ushll v24.8h, v4.8b, #4 ushll2 v25.8h, v4.16b, #4 ushll v26.8h, v5.8b, #4 ushll2 v27.8h, v5.16b, #4 ldp q6, q7, [x1, #96] st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x8], x6 ushll v28.8h, v6.8b, #4 ushll2 v29.8h, v6.16b, #4 ushll v30.8h, v7.8b, #4 ushll2 v31.8h, v7.16b, #4 subs w4, w4, #1 add x1, x1, x2 st1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x0], x6 st1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x8], x6 b.gt 128b ret L(prep_tbl): .hword L(prep_tbl) - 1280b .hword L(prep_tbl) - 640b .hword L(prep_tbl) - 320b .hword L(prep_tbl) - 160b .hword L(prep_tbl) - 8b .hword L(prep_tbl) - 4b endfunc .macro load_slice s0, s1, strd, wd, d0, d1, d2, d3, d4, d5, d6 ld1 {\d0\wd}[0], [\s0], \strd ld1 {\d1\wd}[0], [\s1], \strd .ifnb \d2 ld1 {\d2\wd}[0], [\s0], \strd ld1 {\d3\wd}[0], [\s1], \strd .endif .ifnb \d4 ld1 {\d4\wd}[0], [\s0], \strd .endif .ifnb \d5 ld1 {\d5\wd}[0], [\s1], \strd .endif .ifnb \d6 ld1 {\d6\wd}[0], [\s0], \strd .endif .endm .macro load_reg s0, s1, strd, wd, d0, d1, d2, d3, d4, d5, d6 ld1 {\d0\wd}, [\s0], \strd ld1 {\d1\wd}, [\s1], \strd .ifnb \d2 ld1 {\d2\wd}, [\s0], \strd ld1 {\d3\wd}, [\s1], \strd .endif .ifnb \d4 ld1 {\d4\wd}, [\s0], \strd .endif .ifnb \d5 ld1 {\d5\wd}, [\s1], \strd .endif .ifnb \d6 ld1 {\d6\wd}, [\s0], \strd .endif .endm .macro load_h s0, s1, strd, d0, d1, d2, d3, d4, d5, d6 load_slice \s0, \s1, \strd, .h, \d0, \d1, \d2, \d3, \d4, \d5, \d6 .endm .macro load_s s0, s1, strd, d0, d1, d2, d3, d4, d5, d6 load_slice \s0, \s1, \strd, .s, \d0, \d1, \d2, \d3, \d4, \d5, \d6 .endm .macro load_8b s0, 
s1, strd, d0, d1, d2, d3, d4, d5, d6 load_reg \s0, \s1, \strd, .8b, \d0, \d1, \d2, \d3, \d4, \d5, \d6 .endm .macro load_16b s0, s1, strd, d0, d1, d2, d3, d4, d5, d6 load_reg \s0, \s1, \strd, .16b, \d0, \d1, \d2, \d3, \d4, \d5, \d6 .endm .macro interleave_1 wd, r0, r1, r2, r3, r4 trn1 \r0\wd, \r0\wd, \r1\wd trn1 \r1\wd, \r1\wd, \r2\wd .ifnb \r3 trn1 \r2\wd, \r2\wd, \r3\wd trn1 \r3\wd, \r3\wd, \r4\wd .endif .endm .macro interleave_1_h r0, r1, r2, r3, r4 interleave_1 .4h, \r0, \r1, \r2, \r3, \r4 .endm .macro interleave_1_s r0, r1, r2, r3, r4 interleave_1 .2s, \r0, \r1, \r2, \r3, \r4 .endm .macro interleave_2 wd, r0, r1, r2, r3, r4, r5 trn1 \r0\wd, \r0\wd, \r2\wd trn1 \r1\wd, \r1\wd, \r3\wd trn1 \r2\wd, \r2\wd, \r4\wd trn1 \r3\wd, \r3\wd, \r5\wd .endm .macro interleave_2_s r0, r1, r2, r3, r4, r5 interleave_2 .2s, \r0, \r1, \r2, \r3, \r4, \r5 .endm .macro uxtl_b r0, r1, r2, r3, r4, r5, r6 uxtl \r0\().8h, \r0\().8b uxtl \r1\().8h, \r1\().8b .ifnb \r2 uxtl \r2\().8h, \r2\().8b uxtl \r3\().8h, \r3\().8b .endif .ifnb \r4 uxtl \r4\().8h, \r4\().8b .endif .ifnb \r5 uxtl \r5\().8h, \r5\().8b .endif .ifnb \r6 uxtl \r6\().8h, \r6\().8b .endif .endm .macro mul_mla_4 d, s0, s1, s2, s3, wd mul \d\wd, \s0\wd, v0.h[0] mla \d\wd, \s1\wd, v0.h[1] mla \d\wd, \s2\wd, v0.h[2] mla \d\wd, \s3\wd, v0.h[3] .endm // Interleaving the mul/mla chains actually hurts performance // significantly on Cortex A53, thus keeping mul/mla tightly // chained like this. .macro mul_mla_8_0_4h d0, s0, s1, s2, s3, s4, s5, s6, s7 mul \d0\().4h, \s0\().4h, v0.h[0] mla \d0\().4h, \s1\().4h, v0.h[1] mla \d0\().4h, \s2\().4h, v0.h[2] mla \d0\().4h, \s3\().4h, v0.h[3] mla \d0\().4h, \s4\().4h, v0.h[4] mla \d0\().4h, \s5\().4h, v0.h[5] mla \d0\().4h, \s6\().4h, v0.h[6] mla \d0\().4h, \s7\().4h, v0.h[7] .endm .macro mul_mla_8_0 d0, s0, s1, s2, s3, s4, s5, s6, s7 mul \d0\().8h, \s0\().8h, v0.h[0] mla \d0\().8h, \s1\().8h, v0.h[1] mla \d0\().8h, \s2\().8h, v0.h[2] mla \d0\().8h, \s3\().8h, v0.h[3] mla \d0\().8h, \s4\().8h, v0.h[4] mla \d0\().8h, \s5\().8h, v0.h[5] mla \d0\().8h, \s6\().8h, v0.h[6] mla \d0\().8h, \s7\().8h, v0.h[7] .endm .macro mul_mla_8_1 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8 mul \d0\().8h, \s0\().8h, v0.h[0] mla \d0\().8h, \s1\().8h, v0.h[1] mla \d0\().8h, \s2\().8h, v0.h[2] mla \d0\().8h, \s3\().8h, v0.h[3] mla \d0\().8h, \s4\().8h, v0.h[4] mla \d0\().8h, \s5\().8h, v0.h[5] mla \d0\().8h, \s6\().8h, v0.h[6] mla \d0\().8h, \s7\().8h, v0.h[7] mul \d1\().8h, \s1\().8h, v0.h[0] mla \d1\().8h, \s2\().8h, v0.h[1] mla \d1\().8h, \s3\().8h, v0.h[2] mla \d1\().8h, \s4\().8h, v0.h[3] mla \d1\().8h, \s5\().8h, v0.h[4] mla \d1\().8h, \s6\().8h, v0.h[5] mla \d1\().8h, \s7\().8h, v0.h[6] mla \d1\().8h, \s8\().8h, v0.h[7] .endm .macro mul_mla_8_2 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9 mul \d0\().8h, \s0\().8h, v0.h[0] mla \d0\().8h, \s1\().8h, v0.h[1] mla \d0\().8h, \s2\().8h, v0.h[2] mla \d0\().8h, \s3\().8h, v0.h[3] mla \d0\().8h, \s4\().8h, v0.h[4] mla \d0\().8h, \s5\().8h, v0.h[5] mla \d0\().8h, \s6\().8h, v0.h[6] mla \d0\().8h, \s7\().8h, v0.h[7] mul \d1\().8h, \s2\().8h, v0.h[0] mla \d1\().8h, \s3\().8h, v0.h[1] mla \d1\().8h, \s4\().8h, v0.h[2] mla \d1\().8h, \s5\().8h, v0.h[3] mla \d1\().8h, \s6\().8h, v0.h[4] mla \d1\().8h, \s7\().8h, v0.h[5] mla \d1\().8h, \s8\().8h, v0.h[6] mla \d1\().8h, \s9\().8h, v0.h[7] .endm .macro sqrshrun_b shift, r0, r1, r2, r3 sqrshrun \r0\().8b, \r0\().8h, #\shift .ifnb \r1 sqrshrun \r1\().8b, \r1\().8h, #\shift .endif .ifnb \r2 sqrshrun \r2\().8b, \r2\().8h, #\shift sqrshrun \r3\().8b, 
\r3\().8h, #\shift .endif .endm .macro srshr_h shift, r0, r1, r2, r3 srshr \r0\().8h, \r0\().8h, #\shift .ifnb \r1 srshr \r1\().8h, \r1\().8h, #\shift .endif .ifnb \r2 srshr \r2\().8h, \r2\().8h, #\shift srshr \r3\().8h, \r3\().8h, #\shift .endif .endm .macro st_h strd, reg, lanes st1 {\reg\().h}[0], [x0], \strd st1 {\reg\().h}[1], [x8], \strd .if \lanes > 2 st1 {\reg\().h}[2], [x0], \strd st1 {\reg\().h}[3], [x8], \strd .endif .endm .macro st_s strd, r0, r1 st1 {\r0\().s}[0], [x0], \strd st1 {\r0\().s}[1], [x8], \strd .ifnb \r1 st1 {\r1\().s}[0], [x0], \strd st1 {\r1\().s}[1], [x8], \strd .endif .endm .macro st_d strd, r0, r1 st1 {\r0\().d}[0], [x0], \strd st1 {\r0\().d}[1], [x8], \strd .ifnb \r1 st1 {\r1\().d}[0], [x0], \strd st1 {\r1\().d}[1], [x8], \strd .endif .endm .macro shift_store_4 type, strd, r0, r1 .ifc \type, put sqrshrun_b 6, \r0, \r1 st_s \strd, \r0, \r1 .else srshr_h 2, \r0, \r1 st_d \strd, \r0, \r1 .endif .endm .macro st_reg strd, wd, r0, r1, r2, r3, r4, r5, r6, r7 st1 {\r0\wd}, [x0], \strd st1 {\r1\wd}, [x8], \strd .ifnb \r2 st1 {\r2\wd}, [x0], \strd st1 {\r3\wd}, [x8], \strd .endif .ifnb \r4 st1 {\r4\wd}, [x0], \strd st1 {\r5\wd}, [x8], \strd st1 {\r6\wd}, [x0], \strd st1 {\r7\wd}, [x8], \strd .endif .endm .macro st_8b strd, r0, r1, r2, r3, r4, r5, r6, r7 st_reg \strd, .8b, \r0, \r1, \r2, \r3, \r4, \r5, \r6, \r7 .endm .macro st_16b strd, r0, r1, r2, r3, r4, r5, r6, r7 st_reg \strd, .16b, \r0, \r1, \r2, \r3, \r4, \r5, \r6, \r7 .endm .macro shift_store_8 type, strd, r0, r1, r2, r3 .ifc \type, put sqrshrun_b 6, \r0, \r1, \r2, \r3 st_8b \strd, \r0, \r1, \r2, \r3 .else srshr_h 2, \r0, \r1, \r2, \r3 st_16b \strd, \r0, \r1, \r2, \r3 .endif .endm .macro shift_store_16 type, strd, r0, r1, r2, r3 .ifc \type, put sqrshrun \r0\().8b, \r0\().8h, #6 sqrshrun2 \r0\().16b, \r1\().8h, #6 sqrshrun \r2\().8b, \r2\().8h, #6 sqrshrun2 \r2\().16b, \r3\().8h, #6 st_16b \strd, \r0, \r2 .else srshr_h 2, \r0, \r1, \r2, \r3 st1 {\r0\().8h, \r1\().8h}, [x0], \strd st1 {\r2\().8h, \r3\().8h}, [x8], \strd .endif .endm .macro make_8tap_fn op, type, type_h, type_v function \op\()_8tap_\type\()_8bpc_neon, export=1 mov x8, \type_h mov x9, \type_v b \op\()_8tap_neon endfunc .endm // No spaces in these expressions, due to gas-preprocessor. 
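// For reference, these constants evaluate to REGULAR = 45, SMOOTH = 1980 and
// SHARP = 3885, i.e. two packed 7-bit fields of the form (filter_set * 15).
// A rough reading of filter_fn below (an interpretation, not upstream
// documentation): mx/my are multiplied by 0x4081 = (1<<14)|(1<<7)|1 to
// replicate them into three 7-bit lanes, this constant is added on top, and
// then either the upper field (bits [13:7], used when the relevant dimension
// is > 4) or the lower field (bits [6:0], used when <= 4) serves as
// "mx + set*15" to index X(mc_subpel_filters).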
#define REGULAR ((0*15<<7)|3*15) #define SMOOTH ((1*15<<7)|4*15) #define SHARP ((2*15<<7)|3*15) .macro filter_fn type, dst, d_strd, src, s_strd, w, h, mx, xmx, my, xmy, ds2, sr2, shift_hv make_8tap_fn \type, regular, REGULAR, REGULAR make_8tap_fn \type, regular_smooth, REGULAR, SMOOTH make_8tap_fn \type, regular_sharp, REGULAR, SHARP make_8tap_fn \type, smooth, SMOOTH, SMOOTH make_8tap_fn \type, smooth_regular, SMOOTH, REGULAR make_8tap_fn \type, smooth_sharp, SMOOTH, SHARP make_8tap_fn \type, sharp, SHARP, SHARP make_8tap_fn \type, sharp_regular, SHARP, REGULAR make_8tap_fn \type, sharp_smooth, SHARP, SMOOTH function \type\()_8tap_neon mov w10, #0x4081 // (1 << 14) | (1 << 7) | (1 << 0) mul \mx, \mx, w10 mul \my, \my, w10 add \mx, \mx, w8 // mx, 8tap_h, 4tap_h add \my, \my, w9 // my, 8tap_v, 4tap_v .ifc \type, prep uxtw \d_strd, \w lsl \d_strd, \d_strd, #1 .endif clz w8, \w tst \mx, #(0x7f << 14) sub w8, w8, #24 movrel x10, X(mc_subpel_filters), -8 b.ne L(\type\()_8tap_h) tst \my, #(0x7f << 14) b.ne L(\type\()_8tap_v) b \type\()_neon L(\type\()_8tap_h): cmp \w, #4 ubfx w9, \mx, #7, #7 and \mx, \mx, #0x7f b.le 4f mov \mx, w9 4: tst \my, #(0x7f << 14) add \xmx, x10, \mx, uxtw #3 b.ne L(\type\()_8tap_hv) adr x9, L(\type\()_8tap_h_tbl) ldrh w8, [x9, x8, lsl #1] sub x9, x9, w8, uxtw br x9 20: // 2xN h AARCH64_VALID_JUMP_TARGET .ifc \type, put add \xmx, \xmx, #2 ld1 {v0.s}[0], [\xmx] sub \src, \src, #1 add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \d_strd, \d_strd, #1 lsl \s_strd, \s_strd, #1 sxtl v0.8h, v0.8b 2: ld1 {v4.8b}, [\src], \s_strd ld1 {v6.8b}, [\sr2], \s_strd uxtl v4.8h, v4.8b uxtl v6.8h, v6.8b ext v5.16b, v4.16b, v4.16b, #2 ext v7.16b, v6.16b, v6.16b, #2 subs \h, \h, #2 trn1 v3.2s, v4.2s, v6.2s trn2 v6.2s, v4.2s, v6.2s trn1 v4.2s, v5.2s, v7.2s trn2 v7.2s, v5.2s, v7.2s mul v3.4h, v3.4h, v0.h[0] mla v3.4h, v4.4h, v0.h[1] mla v3.4h, v6.4h, v0.h[2] mla v3.4h, v7.4h, v0.h[3] srshr v3.4h, v3.4h, #2 sqrshrun v3.8b, v3.8h, #4 st1 {v3.h}[0], [\dst], \d_strd st1 {v3.h}[1], [\ds2], \d_strd b.gt 2b ret .endif 40: // 4xN h AARCH64_VALID_JUMP_TARGET add \xmx, \xmx, #2 ld1 {v0.s}[0], [\xmx] sub \src, \src, #1 add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \d_strd, \d_strd, #1 lsl \s_strd, \s_strd, #1 sxtl v0.8h, v0.8b 4: ld1 {v16.8b}, [\src], \s_strd ld1 {v20.8b}, [\sr2], \s_strd uxtl v16.8h, v16.8b uxtl v20.8h, v20.8b ext v17.16b, v16.16b, v16.16b, #2 ext v18.16b, v16.16b, v16.16b, #4 ext v19.16b, v16.16b, v16.16b, #6 ext v21.16b, v20.16b, v20.16b, #2 ext v22.16b, v20.16b, v20.16b, #4 ext v23.16b, v20.16b, v20.16b, #6 subs \h, \h, #2 mul v16.4h, v16.4h, v0.h[0] mla v16.4h, v17.4h, v0.h[1] mla v16.4h, v18.4h, v0.h[2] mla v16.4h, v19.4h, v0.h[3] mul v20.4h, v20.4h, v0.h[0] mla v20.4h, v21.4h, v0.h[1] mla v20.4h, v22.4h, v0.h[2] mla v20.4h, v23.4h, v0.h[3] srshr v16.4h, v16.4h, #2 srshr v20.4h, v20.4h, #2 .ifc \type, put sqrshrun v16.8b, v16.8h, #4 sqrshrun v20.8b, v20.8h, #4 st1 {v16.s}[0], [\dst], \d_strd st1 {v20.s}[0], [\ds2], \d_strd .else st1 {v16.4h}, [\dst], \d_strd st1 {v20.4h}, [\ds2], \d_strd .endif b.gt 4b ret 80: // 8xN h AARCH64_VALID_JUMP_TARGET ld1 {v0.8b}, [\xmx] sub \src, \src, #3 add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \d_strd, \d_strd, #1 lsl \s_strd, \s_strd, #1 sxtl v0.8h, v0.8b 8: ld1 {v16.8b, v17.8b}, [\src], \s_strd ld1 {v20.8b, v21.8b}, [\sr2], \s_strd uxtl v16.8h, v16.8b uxtl v17.8h, v17.8b uxtl v20.8h, v20.8b uxtl v21.8h, v21.8b mul v18.8h, v16.8h, v0.h[0] mul v22.8h, v20.8h, v0.h[0] .irpc i, 1234567 ext v19.16b, v16.16b, v17.16b, #(2*\i) ext 
v23.16b, v20.16b, v21.16b, #(2*\i) mla v18.8h, v19.8h, v0.h[\i] mla v22.8h, v23.8h, v0.h[\i] .endr subs \h, \h, #2 srshr v18.8h, v18.8h, #2 srshr v22.8h, v22.8h, #2 .ifc \type, put sqrshrun v18.8b, v18.8h, #4 sqrshrun v22.8b, v22.8h, #4 st1 {v18.8b}, [\dst], \d_strd st1 {v22.8b}, [\ds2], \d_strd .else st1 {v18.8h}, [\dst], \d_strd st1 {v22.8h}, [\ds2], \d_strd .endif b.gt 8b ret 160: 320: 640: 1280: // 16xN, 32xN, ... h AARCH64_VALID_JUMP_TARGET ld1 {v0.8b}, [\xmx] sub \src, \src, #3 add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \s_strd, \s_strd, #1 sxtl v0.8h, v0.8b sub \s_strd, \s_strd, \w, uxtw sub \s_strd, \s_strd, #8 .ifc \type, put lsl \d_strd, \d_strd, #1 sub \d_strd, \d_strd, \w, uxtw .endif 161: ld1 {v16.8b, v17.8b, v18.8b}, [\src], #24 ld1 {v20.8b, v21.8b, v22.8b}, [\sr2], #24 mov \mx, \w uxtl v16.8h, v16.8b uxtl v17.8h, v17.8b uxtl v18.8h, v18.8b uxtl v20.8h, v20.8b uxtl v21.8h, v21.8b uxtl v22.8h, v22.8b 16: mul v24.8h, v16.8h, v0.h[0] mul v25.8h, v17.8h, v0.h[0] mul v26.8h, v20.8h, v0.h[0] mul v27.8h, v21.8h, v0.h[0] .irpc i, 1234567 ext v28.16b, v16.16b, v17.16b, #(2*\i) ext v29.16b, v17.16b, v18.16b, #(2*\i) ext v30.16b, v20.16b, v21.16b, #(2*\i) ext v31.16b, v21.16b, v22.16b, #(2*\i) mla v24.8h, v28.8h, v0.h[\i] mla v25.8h, v29.8h, v0.h[\i] mla v26.8h, v30.8h, v0.h[\i] mla v27.8h, v31.8h, v0.h[\i] .endr srshr v24.8h, v24.8h, #2 srshr v25.8h, v25.8h, #2 srshr v26.8h, v26.8h, #2 srshr v27.8h, v27.8h, #2 subs \mx, \mx, #16 .ifc \type, put sqrshrun v24.8b, v24.8h, #4 sqrshrun2 v24.16b, v25.8h, #4 sqrshrun v26.8b, v26.8h, #4 sqrshrun2 v26.16b, v27.8h, #4 st1 {v24.16b}, [\dst], #16 st1 {v26.16b}, [\ds2], #16 .else st1 {v24.8h, v25.8h}, [\dst], #32 st1 {v26.8h, v27.8h}, [\ds2], #32 .endif b.le 9f mov v16.16b, v18.16b mov v20.16b, v22.16b ld1 {v17.8b, v18.8b}, [\src], #16 ld1 {v21.8b, v22.8b}, [\sr2], #16 uxtl v17.8h, v17.8b uxtl v18.8h, v18.8b uxtl v21.8h, v21.8b uxtl v22.8h, v22.8b b 16b 9: add \dst, \dst, \d_strd add \ds2, \ds2, \d_strd add \src, \src, \s_strd add \sr2, \sr2, \s_strd subs \h, \h, #2 b.gt 161b ret L(\type\()_8tap_h_tbl): .hword L(\type\()_8tap_h_tbl) - 1280b .hword L(\type\()_8tap_h_tbl) - 640b .hword L(\type\()_8tap_h_tbl) - 320b .hword L(\type\()_8tap_h_tbl) - 160b .hword L(\type\()_8tap_h_tbl) - 80b .hword L(\type\()_8tap_h_tbl) - 40b .hword L(\type\()_8tap_h_tbl) - 20b .hword 0 L(\type\()_8tap_v): cmp \h, #4 ubfx w9, \my, #7, #7 and \my, \my, #0x7f b.le 4f mov \my, w9 4: add \xmy, x10, \my, uxtw #3 adr x9, L(\type\()_8tap_v_tbl) ldrh w8, [x9, x8, lsl #1] sub x9, x9, w8, uxtw br x9 20: // 2xN v AARCH64_VALID_JUMP_TARGET .ifc \type, put b.gt 28f cmp \h, #2 add \xmy, \xmy, #2 ld1 {v0.s}[0], [\xmy] sub \src, \src, \s_strd add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 sxtl v0.8h, v0.8b // 2x2 v load_h \src, \sr2, \s_strd, v1, v2, v3, v4, v5 interleave_1_h v1, v2, v3, v4, v5 b.gt 24f uxtl_b v1, v2, v3, v4 mul_mla_4 v6, v1, v2, v3, v4, .4h sqrshrun_b 6, v6 st_h \d_strd, v6, 2 ret 24: // 2x4 v load_h \sr2, \src, \s_strd, v6, v7 interleave_1_h v5, v6, v7 interleave_2_s v1, v2, v3, v4, v5, v6 uxtl_b v1, v2, v3, v4 mul_mla_4 v6, v1, v2, v3, v4, .8h sqrshrun_b 6, v6 st_h \d_strd, v6, 4 ret 28: // 2x6, 2x8, 2x12, 2x16 v ld1 {v0.8b}, [\xmy] sub \sr2, \src, \s_strd, lsl #1 add \ds2, \dst, \d_strd sub \src, \sr2, \s_strd lsl \d_strd, \d_strd, #1 lsl \s_strd, \s_strd, #1 sxtl v0.8h, v0.8b load_h \src, \sr2, \s_strd, v1, v2, v3, v4, v5, v6, v7 interleave_1_h v1, v2, v3, v4, v5 interleave_1_h v5, v6, v7 interleave_2_s 
v1, v2, v3, v4, v5, v6 uxtl_b v1, v2, v3, v4 216: subs \h, \h, #4 load_h \sr2, \src, \s_strd, v16, v17, v18, v19 interleave_1_h v7, v16, v17, v18, v19 interleave_2_s v5, v6, v7, v16, v17, v18 uxtl_b v5, v6, v7, v16 mul_mla_8_0 v30, v1, v2, v3, v4, v5, v6, v7, v16 sqrshrun_b 6, v30 st_h \d_strd, v30, 4 b.le 0f cmp \h, #2 mov v1.16b, v5.16b mov v2.16b, v6.16b mov v3.16b, v7.16b mov v4.16b, v16.16b mov v5.16b, v17.16b mov v6.16b, v18.16b mov v7.16b, v19.16b b.eq 26f b 216b 26: load_h \sr2, \src, \s_strd, v16, v17 interleave_1_h v7, v16, v17 uxtl_b v5, v6, v7, v16 mul_mla_8_0_4h v30, v1, v2, v3, v4, v5, v6, v7, v16 sqrshrun_b 6, v30 st_h \d_strd, v30, 2 0: ret .endif 40: AARCH64_VALID_JUMP_TARGET b.gt 480f // 4x2, 4x4 v cmp \h, #2 add \xmy, \xmy, #2 ld1 {v0.s}[0], [\xmy] sub \src, \src, \s_strd add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 sxtl v0.8h, v0.8b load_s \src, \sr2, \s_strd, v1, v2, v3, v4, v5 interleave_1_s v1, v2, v3, v4, v5 uxtl_b v1, v2, v3, v4 mul_mla_4 v6, v1, v2, v3, v4, .8h shift_store_4 \type, \d_strd, v6 b.le 0f load_s \sr2, \src, \s_strd, v6, v7 interleave_1_s v5, v6, v7 uxtl_b v5, v6 mul_mla_4 v7, v3, v4, v5, v6, .8h shift_store_4 \type, \d_strd, v7 0: ret 480: // 4x6, 4x8, 4x12, 4x16 v ld1 {v0.8b}, [\xmy] sub \sr2, \src, \s_strd, lsl #1 add \ds2, \dst, \d_strd sub \src, \sr2, \s_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 sxtl v0.8h, v0.8b load_s \src, \sr2, \s_strd, v16, v17, v18, v19, v20, v21, v22 interleave_1_s v16, v17, v18 interleave_1_s v18, v19, v20, v21, v22 uxtl_b v16, v17 uxtl_b v18, v19, v20, v21 48: subs \h, \h, #4 load_s \sr2, \src, \s_strd, v23, v24, v25, v26 interleave_1_s v22, v23, v24, v25, v26 uxtl_b v22, v23, v24, v25 mul_mla_8_2 v1, v2, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25 shift_store_4 \type, \d_strd, v1, v2 b.le 0f load_s \sr2, \src, \s_strd, v27, v16 subs \h, \h, #2 interleave_1_s v26, v27, v16 uxtl_b v26, v27 mul_mla_8_0 v1, v20, v21, v22, v23, v24, v25, v26, v27 shift_store_4 \type, \d_strd, v1 b.le 0f load_s \sr2, \src, \s_strd, v17, v18 subs \h, \h, #2 interleave_1_s v16, v17, v18 uxtl_b v16, v17 mul_mla_8_0 v2, v22, v23, v24, v25, v26, v27, v16, v17 shift_store_4 \type, \d_strd, v2 b.le 0f subs \h, \h, #4 load_s \sr2, \src, \s_strd, v19, v20, v21, v22 interleave_1_s v18, v19, v20, v21, v22 uxtl_b v18, v19, v20, v21 mul_mla_8_2 v1, v2, v24, v25, v26, v27, v16, v17, v18, v19, v20, v21 shift_store_4 \type, \d_strd, v1, v2 b.gt 48b 0: ret 80: AARCH64_VALID_JUMP_TARGET b.gt 880f // 8x2, 8x4 v cmp \h, #2 add \xmy, \xmy, #2 ld1 {v0.s}[0], [\xmy] sub \src, \src, \s_strd add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 sxtl v0.8h, v0.8b load_8b \src, \sr2, \s_strd, v1, v2, v3, v4, v5 uxtl_b v1, v2, v3, v4, v5 mul_mla_4 v6, v1, v2, v3, v4, .8h mul_mla_4 v7, v2, v3, v4, v5, .8h shift_store_8 \type, \d_strd, v6, v7 b.le 0f load_8b \sr2, \src, \s_strd, v6, v7 uxtl_b v6, v7 mul_mla_4 v1, v3, v4, v5, v6, .8h mul_mla_4 v2, v4, v5, v6, v7, .8h shift_store_8 \type, \d_strd, v1, v2 0: ret 880: // 8x6, 8x8, 8x16, 8x32 v 1680: // 16x8, 16x16, ... 320: // 32x8, 32x16, ... 
640: 1280: AARCH64_VALID_JUMP_TARGET ld1 {v0.8b}, [\xmy] sub \src, \src, \s_strd sub \src, \src, \s_strd, lsl #1 sxtl v0.8h, v0.8b mov \my, \h 168: add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 load_8b \src, \sr2, \s_strd, v16, v17, v18, v19, v20, v21, v22 uxtl_b v16, v17, v18, v19, v20, v21, v22 88: subs \h, \h, #2 load_8b \sr2, \src, \s_strd, v23, v24 uxtl_b v23, v24 mul_mla_8_1 v1, v2, v16, v17, v18, v19, v20, v21, v22, v23, v24 shift_store_8 \type, \d_strd, v1, v2 b.le 9f subs \h, \h, #2 load_8b \sr2, \src, \s_strd, v25, v26 uxtl_b v25, v26 mul_mla_8_1 v3, v4, v18, v19, v20, v21, v22, v23, v24, v25, v26 shift_store_8 \type, \d_strd, v3, v4 b.le 9f subs \h, \h, #2 load_8b \sr2, \src, \s_strd, v27, v16 uxtl_b v27, v16 mul_mla_8_1 v1, v2, v20, v21, v22, v23, v24, v25, v26, v27, v16 shift_store_8 \type, \d_strd, v1, v2 b.le 9f subs \h, \h, #2 load_8b \sr2, \src, \s_strd, v17, v18 uxtl_b v17, v18 mul_mla_8_1 v3, v4, v22, v23, v24, v25, v26, v27, v16, v17, v18 shift_store_8 \type, \d_strd, v3, v4 b.le 9f subs \h, \h, #4 load_8b \sr2, \src, \s_strd, v19, v20, v21, v22 uxtl_b v19, v20, v21, v22 mul_mla_8_1 v1, v2, v24, v25, v26, v27, v16, v17, v18, v19, v20 mul_mla_8_1 v3, v4, v26, v27, v16, v17, v18, v19, v20, v21, v22 shift_store_8 \type, \d_strd, v1, v2, v3, v4 b.gt 88b 9: subs \w, \w, #8 b.le 0f asr \s_strd, \s_strd, #1 asr \d_strd, \d_strd, #1 msub \src, \s_strd, \xmy, \src msub \dst, \d_strd, \xmy, \dst sub \src, \src, \s_strd, lsl #3 mov \h, \my add \src, \src, #8 .ifc \type, put add \dst, \dst, #8 .else add \dst, \dst, #16 .endif b 168b 0: ret 160: AARCH64_VALID_JUMP_TARGET b.gt 1680b // 16x2, 16x4 v add \xmy, \xmy, #2 ld1 {v0.s}[0], [\xmy] sub \src, \src, \s_strd add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 sxtl v0.8h, v0.8b cmp \h, #2 load_16b \src, \sr2, \s_strd, v1, v2, v3, v4, v5 uxtl v16.8h, v1.8b uxtl v17.8h, v2.8b uxtl v18.8h, v3.8b uxtl v19.8h, v4.8b uxtl v20.8h, v5.8b uxtl2 v23.8h, v1.16b uxtl2 v24.8h, v2.16b uxtl2 v25.8h, v3.16b uxtl2 v26.8h, v4.16b uxtl2 v27.8h, v5.16b mul_mla_4 v1, v16, v17, v18, v19, .8h mul_mla_4 v16, v17, v18, v19, v20, .8h mul_mla_4 v2, v23, v24, v25, v26, .8h mul_mla_4 v17, v24, v25, v26, v27, .8h shift_store_16 \type, \d_strd, v1, v2, v16, v17 b.le 0f load_16b \sr2, \src, \s_strd, v6, v7 uxtl v21.8h, v6.8b uxtl v22.8h, v7.8b uxtl2 v28.8h, v6.16b uxtl2 v29.8h, v7.16b mul_mla_4 v1, v18, v19, v20, v21, .8h mul_mla_4 v3, v19, v20, v21, v22, .8h mul_mla_4 v2, v25, v26, v27, v28, .8h mul_mla_4 v4, v26, v27, v28, v29, .8h shift_store_16 \type, \d_strd, v1, v2, v3, v4 0: ret L(\type\()_8tap_v_tbl): .hword L(\type\()_8tap_v_tbl) - 1280b .hword L(\type\()_8tap_v_tbl) - 640b .hword L(\type\()_8tap_v_tbl) - 320b .hword L(\type\()_8tap_v_tbl) - 160b .hword L(\type\()_8tap_v_tbl) - 80b .hword L(\type\()_8tap_v_tbl) - 40b .hword L(\type\()_8tap_v_tbl) - 20b .hword 0 L(\type\()_8tap_hv): cmp \h, #4 ubfx w9, \my, #7, #7 and \my, \my, #0x7f b.le 4f mov \my, w9 4: add \xmy, x10, \my, uxtw #3 adr x9, L(\type\()_8tap_hv_tbl) ldrh w8, [x9, x8, lsl #1] sub x9, x9, w8, uxtw br x9 20: AARCH64_VALID_JUMP_TARGET .ifc \type, put add \xmx, \xmx, #2 ld1 {v0.s}[0], [\xmx] b.gt 280f add \xmy, \xmy, #2 ld1 {v1.s}[0], [\xmy] // 2x2, 2x4 hv sub \sr2, \src, #1 sub \src, \sr2, \s_strd add \ds2, \dst, \d_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 sxtl v0.8h, v0.8b sxtl v1.8h, v1.8b mov x15, x30 ld1 {v28.8b}, [\src], \s_strd uxtl v28.8h, v28.8b ext v29.16b, v28.16b, 
v28.16b, #2 mul v28.4h, v28.4h, v0.4h mul v29.4h, v29.4h, v0.4h addp v28.4h, v28.4h, v29.4h addp v16.4h, v28.4h, v28.4h srshr v16.4h, v16.4h, #2 bl L(\type\()_8tap_filter_2) trn1 v16.2s, v16.2s, v28.2s mov v17.8b, v28.8b 2: bl L(\type\()_8tap_filter_2) ext v18.8b, v17.8b, v28.8b, #4 smull v2.4s, v16.4h, v1.h[0] smlal v2.4s, v17.4h, v1.h[1] smlal v2.4s, v18.4h, v1.h[2] smlal v2.4s, v28.4h, v1.h[3] sqrshrn v2.4h, v2.4s, #\shift_hv sqxtun v2.8b, v2.8h subs \h, \h, #2 st1 {v2.h}[0], [\dst], \d_strd st1 {v2.h}[1], [\ds2], \d_strd b.le 0f mov v16.8b, v18.8b mov v17.8b, v28.8b b 2b 280: // 2x8, 2x16, 2x32 hv ld1 {v1.8b}, [\xmy] sub \src, \src, #1 sub \sr2, \src, \s_strd, lsl #1 sub \src, \sr2, \s_strd add \ds2, \dst, \d_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 sxtl v0.8h, v0.8b sxtl v1.8h, v1.8b mov x15, x30 ld1 {v28.8b}, [\src], \s_strd uxtl v28.8h, v28.8b ext v29.16b, v28.16b, v28.16b, #2 mul v28.4h, v28.4h, v0.4h mul v29.4h, v29.4h, v0.4h addp v28.4h, v28.4h, v29.4h addp v16.4h, v28.4h, v28.4h srshr v16.4h, v16.4h, #2 bl L(\type\()_8tap_filter_2) trn1 v16.2s, v16.2s, v28.2s mov v17.8b, v28.8b bl L(\type\()_8tap_filter_2) ext v18.8b, v17.8b, v28.8b, #4 mov v19.8b, v28.8b bl L(\type\()_8tap_filter_2) ext v20.8b, v19.8b, v28.8b, #4 mov v21.8b, v28.8b 28: bl L(\type\()_8tap_filter_2) ext v22.8b, v21.8b, v28.8b, #4 smull v2.4s, v16.4h, v1.h[0] smlal v2.4s, v17.4h, v1.h[1] smlal v2.4s, v18.4h, v1.h[2] smlal v2.4s, v19.4h, v1.h[3] smlal v2.4s, v20.4h, v1.h[4] smlal v2.4s, v21.4h, v1.h[5] smlal v2.4s, v22.4h, v1.h[6] smlal v2.4s, v28.4h, v1.h[7] sqrshrn v2.4h, v2.4s, #\shift_hv sqxtun v2.8b, v2.8h subs \h, \h, #2 st1 {v2.h}[0], [\dst], \d_strd st1 {v2.h}[1], [\ds2], \d_strd b.le 0f mov v16.8b, v18.8b mov v17.8b, v19.8b mov v18.8b, v20.8b mov v19.8b, v21.8b mov v20.8b, v22.8b mov v21.8b, v28.8b b 28b 0: ret x15 L(\type\()_8tap_filter_2): ld1 {v28.8b}, [\sr2], \s_strd ld1 {v30.8b}, [\src], \s_strd uxtl v28.8h, v28.8b uxtl v30.8h, v30.8b ext v29.16b, v28.16b, v28.16b, #2 ext v31.16b, v30.16b, v30.16b, #2 trn1 v27.2s, v28.2s, v30.2s trn2 v30.2s, v28.2s, v30.2s trn1 v28.2s, v29.2s, v31.2s trn2 v31.2s, v29.2s, v31.2s mul v27.4h, v27.4h, v0.h[0] mla v27.4h, v28.4h, v0.h[1] mla v27.4h, v30.4h, v0.h[2] mla v27.4h, v31.4h, v0.h[3] srshr v28.4h, v27.4h, #2 ret .endif 40: AARCH64_VALID_JUMP_TARGET add \xmx, \xmx, #2 ld1 {v0.s}[0], [\xmx] b.gt 480f add \xmy, \xmy, #2 ld1 {v1.s}[0], [\xmy] sub \sr2, \src, #1 sub \src, \sr2, \s_strd add \ds2, \dst, \d_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 sxtl v0.8h, v0.8b sxtl v1.8h, v1.8b mov x15, x30 // 4x2, 4x4 hv ld1 {v26.8b}, [\src], \s_strd uxtl v26.8h, v26.8b ext v28.16b, v26.16b, v26.16b, #2 ext v29.16b, v26.16b, v26.16b, #4 ext v30.16b, v26.16b, v26.16b, #6 mul v31.4h, v26.4h, v0.h[0] mla v31.4h, v28.4h, v0.h[1] mla v31.4h, v29.4h, v0.h[2] mla v31.4h, v30.4h, v0.h[3] srshr v16.4h, v31.4h, #2 bl L(\type\()_8tap_filter_4) mov v17.8b, v28.8b mov v18.8b, v29.8b 4: bl L(\type\()_8tap_filter_4) // Interleaving the mul/mla chains actually hurts performance // significantly on Cortex A53, thus keeping mul/mla tightly // chained like this. 
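// Rough scalar model of this 4xN hv loop (a sketch assuming the 4-tap
// horizontal/vertical filters fh[]/fv[] loaded above, not an exact spec):
//   mid[y][x] = (sum_i fh[i]*src[y][x+i] + 2) >> 2          // the srshr #2
//   out[y][x] = sum_j fv[j]*mid[y+j][x], rounded-shifted by shift_hv
// with the vertical sum kept in 32 bits (smull/smlal); the put path then
// clips to 8 bits via sqxtun, while the prep path stores the 16-bit value.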
smull v2.4s, v16.4h, v1.h[0] smlal v2.4s, v17.4h, v1.h[1] smlal v2.4s, v18.4h, v1.h[2] smlal v2.4s, v28.4h, v1.h[3] smull v3.4s, v17.4h, v1.h[0] smlal v3.4s, v18.4h, v1.h[1] smlal v3.4s, v28.4h, v1.h[2] smlal v3.4s, v29.4h, v1.h[3] sqrshrn v2.4h, v2.4s, #\shift_hv sqrshrn v3.4h, v3.4s, #\shift_hv subs \h, \h, #2 .ifc \type, put sqxtun v2.8b, v2.8h sqxtun v3.8b, v3.8h st1 {v2.s}[0], [\dst], \d_strd st1 {v3.s}[0], [\ds2], \d_strd .else st1 {v2.4h}, [\dst], \d_strd st1 {v3.4h}, [\ds2], \d_strd .endif b.le 0f mov v16.8b, v18.8b mov v17.8b, v28.8b mov v18.8b, v29.8b b 4b 480: // 4x8, 4x16, 4x32 hv ld1 {v1.8b}, [\xmy] sub \src, \src, #1 sub \sr2, \src, \s_strd, lsl #1 sub \src, \sr2, \s_strd add \ds2, \dst, \d_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 sxtl v0.8h, v0.8b sxtl v1.8h, v1.8b mov x15, x30 ld1 {v26.8b}, [\src], \s_strd uxtl v26.8h, v26.8b ext v28.16b, v26.16b, v26.16b, #2 ext v29.16b, v26.16b, v26.16b, #4 ext v30.16b, v26.16b, v26.16b, #6 mul v31.4h, v26.4h, v0.h[0] mla v31.4h, v28.4h, v0.h[1] mla v31.4h, v29.4h, v0.h[2] mla v31.4h, v30.4h, v0.h[3] srshr v16.4h, v31.4h, #2 bl L(\type\()_8tap_filter_4) mov v17.8b, v28.8b mov v18.8b, v29.8b bl L(\type\()_8tap_filter_4) mov v19.8b, v28.8b mov v20.8b, v29.8b bl L(\type\()_8tap_filter_4) mov v21.8b, v28.8b mov v22.8b, v29.8b 48: bl L(\type\()_8tap_filter_4) smull v2.4s, v16.4h, v1.h[0] smlal v2.4s, v17.4h, v1.h[1] smlal v2.4s, v18.4h, v1.h[2] smlal v2.4s, v19.4h, v1.h[3] smlal v2.4s, v20.4h, v1.h[4] smlal v2.4s, v21.4h, v1.h[5] smlal v2.4s, v22.4h, v1.h[6] smlal v2.4s, v28.4h, v1.h[7] smull v3.4s, v17.4h, v1.h[0] smlal v3.4s, v18.4h, v1.h[1] smlal v3.4s, v19.4h, v1.h[2] smlal v3.4s, v20.4h, v1.h[3] smlal v3.4s, v21.4h, v1.h[4] smlal v3.4s, v22.4h, v1.h[5] smlal v3.4s, v28.4h, v1.h[6] smlal v3.4s, v29.4h, v1.h[7] sqrshrn v2.4h, v2.4s, #\shift_hv sqrshrn v3.4h, v3.4s, #\shift_hv subs \h, \h, #2 .ifc \type, put sqxtun v2.8b, v2.8h sqxtun v3.8b, v3.8h st1 {v2.s}[0], [\dst], \d_strd st1 {v3.s}[0], [\ds2], \d_strd .else st1 {v2.4h}, [\dst], \d_strd st1 {v3.4h}, [\ds2], \d_strd .endif b.le 0f mov v16.8b, v18.8b mov v17.8b, v19.8b mov v18.8b, v20.8b mov v19.8b, v21.8b mov v20.8b, v22.8b mov v21.8b, v28.8b mov v22.8b, v29.8b b 48b 0: ret x15 L(\type\()_8tap_filter_4): ld1 {v26.8b}, [\sr2], \s_strd ld1 {v27.8b}, [\src], \s_strd uxtl v26.8h, v26.8b uxtl v27.8h, v27.8b ext v28.16b, v26.16b, v26.16b, #2 ext v29.16b, v26.16b, v26.16b, #4 ext v30.16b, v26.16b, v26.16b, #6 mul v31.4h, v26.4h, v0.h[0] mla v31.4h, v28.4h, v0.h[1] mla v31.4h, v29.4h, v0.h[2] mla v31.4h, v30.4h, v0.h[3] ext v28.16b, v27.16b, v27.16b, #2 ext v29.16b, v27.16b, v27.16b, #4 ext v30.16b, v27.16b, v27.16b, #6 mul v27.4h, v27.4h, v0.h[0] mla v27.4h, v28.4h, v0.h[1] mla v27.4h, v29.4h, v0.h[2] mla v27.4h, v30.4h, v0.h[3] srshr v28.4h, v31.4h, #2 srshr v29.4h, v27.4h, #2 ret 80: 160: 320: AARCH64_VALID_JUMP_TARGET b.gt 880f add \xmy, \xmy, #2 ld1 {v0.8b}, [\xmx] ld1 {v1.s}[0], [\xmy] sub \src, \src, #3 sub \src, \src, \s_strd sxtl v0.8h, v0.8b sxtl v1.8h, v1.8b mov x15, x30 mov \my, \h 164: // 8x2, 8x4, 16x2, 16x4, 32x2, 32x4 hv add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \d_strd, \d_strd, #1 lsl \s_strd, \s_strd, #1 bl L(\type\()_8tap_filter_8_first) bl L(\type\()_8tap_filter_8) mov v17.16b, v24.16b mov v18.16b, v25.16b 8: smull v2.4s, v16.4h, v1.h[0] smull2 v3.4s, v16.8h, v1.h[0] bl L(\type\()_8tap_filter_8) smull v4.4s, v17.4h, v1.h[0] smull2 v5.4s, v17.8h, v1.h[0] smlal v2.4s, v17.4h, v1.h[1] smlal2 v3.4s, v17.8h, v1.h[1] smlal v4.4s, v18.4h, v1.h[1] smlal2 
v5.4s, v18.8h, v1.h[1] smlal v2.4s, v18.4h, v1.h[2] smlal2 v3.4s, v18.8h, v1.h[2] smlal v4.4s, v24.4h, v1.h[2] smlal2 v5.4s, v24.8h, v1.h[2] smlal v2.4s, v24.4h, v1.h[3] smlal2 v3.4s, v24.8h, v1.h[3] smlal v4.4s, v25.4h, v1.h[3] smlal2 v5.4s, v25.8h, v1.h[3] sqrshrn v2.4h, v2.4s, #\shift_hv sqrshrn2 v2.8h, v3.4s, #\shift_hv sqrshrn v4.4h, v4.4s, #\shift_hv sqrshrn2 v4.8h, v5.4s, #\shift_hv subs \h, \h, #2 .ifc \type, put sqxtun v2.8b, v2.8h sqxtun v4.8b, v4.8h st1 {v2.8b}, [\dst], \d_strd st1 {v4.8b}, [\ds2], \d_strd .else st1 {v2.8h}, [\dst], \d_strd st1 {v4.8h}, [\ds2], \d_strd .endif b.le 9f mov v16.16b, v18.16b mov v17.16b, v24.16b mov v18.16b, v25.16b b 8b 9: subs \w, \w, #8 b.le 0f asr \s_strd, \s_strd, #1 asr \d_strd, \d_strd, #1 msub \src, \s_strd, \xmy, \src msub \dst, \d_strd, \xmy, \dst sub \src, \src, \s_strd, lsl #2 mov \h, \my add \src, \src, #8 .ifc \type, put add \dst, \dst, #8 .else add \dst, \dst, #16 .endif b 164b 880: // 8x8, 8x16, ..., 16x8, ..., 32x8, ... hv 640: 1280: AARCH64_VALID_JUMP_TARGET ld1 {v0.8b}, [\xmx] ld1 {v1.8b}, [\xmy] sub \src, \src, #3 sub \src, \src, \s_strd sub \src, \src, \s_strd, lsl #1 sxtl v0.8h, v0.8b sxtl v1.8h, v1.8b mov x15, x30 mov \my, \h 168: add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \d_strd, \d_strd, #1 lsl \s_strd, \s_strd, #1 bl L(\type\()_8tap_filter_8_first) bl L(\type\()_8tap_filter_8) mov v17.16b, v24.16b mov v18.16b, v25.16b bl L(\type\()_8tap_filter_8) mov v19.16b, v24.16b mov v20.16b, v25.16b bl L(\type\()_8tap_filter_8) mov v21.16b, v24.16b mov v22.16b, v25.16b 88: smull v2.4s, v16.4h, v1.h[0] smull2 v3.4s, v16.8h, v1.h[0] bl L(\type\()_8tap_filter_8) smull v4.4s, v17.4h, v1.h[0] smull2 v5.4s, v17.8h, v1.h[0] smlal v2.4s, v17.4h, v1.h[1] smlal2 v3.4s, v17.8h, v1.h[1] smlal v4.4s, v18.4h, v1.h[1] smlal2 v5.4s, v18.8h, v1.h[1] smlal v2.4s, v18.4h, v1.h[2] smlal2 v3.4s, v18.8h, v1.h[2] smlal v4.4s, v19.4h, v1.h[2] smlal2 v5.4s, v19.8h, v1.h[2] smlal v2.4s, v19.4h, v1.h[3] smlal2 v3.4s, v19.8h, v1.h[3] smlal v4.4s, v20.4h, v1.h[3] smlal2 v5.4s, v20.8h, v1.h[3] smlal v2.4s, v20.4h, v1.h[4] smlal2 v3.4s, v20.8h, v1.h[4] smlal v4.4s, v21.4h, v1.h[4] smlal2 v5.4s, v21.8h, v1.h[4] smlal v2.4s, v21.4h, v1.h[5] smlal2 v3.4s, v21.8h, v1.h[5] smlal v4.4s, v22.4h, v1.h[5] smlal2 v5.4s, v22.8h, v1.h[5] smlal v2.4s, v22.4h, v1.h[6] smlal2 v3.4s, v22.8h, v1.h[6] smlal v4.4s, v24.4h, v1.h[6] smlal2 v5.4s, v24.8h, v1.h[6] smlal v2.4s, v24.4h, v1.h[7] smlal2 v3.4s, v24.8h, v1.h[7] smlal v4.4s, v25.4h, v1.h[7] smlal2 v5.4s, v25.8h, v1.h[7] sqrshrn v2.4h, v2.4s, #\shift_hv sqrshrn2 v2.8h, v3.4s, #\shift_hv sqrshrn v4.4h, v4.4s, #\shift_hv sqrshrn2 v4.8h, v5.4s, #\shift_hv subs \h, \h, #2 .ifc \type, put sqxtun v2.8b, v2.8h sqxtun v4.8b, v4.8h st1 {v2.8b}, [\dst], \d_strd st1 {v4.8b}, [\ds2], \d_strd .else st1 {v2.8h}, [\dst], \d_strd st1 {v4.8h}, [\ds2], \d_strd .endif b.le 9f mov v16.16b, v18.16b mov v17.16b, v19.16b mov v18.16b, v20.16b mov v19.16b, v21.16b mov v20.16b, v22.16b mov v21.16b, v24.16b mov v22.16b, v25.16b b 88b 9: subs \w, \w, #8 b.le 0f asr \s_strd, \s_strd, #1 asr \d_strd, \d_strd, #1 msub \src, \s_strd, \xmy, \src msub \dst, \d_strd, \xmy, \dst sub \src, \src, \s_strd, lsl #3 mov \h, \my add \src, \src, #8 .ifc \type, put add \dst, \dst, #8 .else add \dst, \dst, #16 .endif b 168b 0: ret x15 L(\type\()_8tap_filter_8_first): ld1 {v28.8b, v29.8b}, [\src], \s_strd uxtl v28.8h, v28.8b uxtl v29.8h, v29.8b mul v16.8h, v28.8h, v0.h[0] ext v24.16b, v28.16b, v29.16b, #(2*1) ext v25.16b, v28.16b, v29.16b, #(2*2) ext v26.16b, 
v28.16b, v29.16b, #(2*3) ext v27.16b, v28.16b, v29.16b, #(2*4) mla v16.8h, v24.8h, v0.h[1] mla v16.8h, v25.8h, v0.h[2] mla v16.8h, v26.8h, v0.h[3] mla v16.8h, v27.8h, v0.h[4] ext v24.16b, v28.16b, v29.16b, #(2*5) ext v25.16b, v28.16b, v29.16b, #(2*6) ext v26.16b, v28.16b, v29.16b, #(2*7) mla v16.8h, v24.8h, v0.h[5] mla v16.8h, v25.8h, v0.h[6] mla v16.8h, v26.8h, v0.h[7] srshr v16.8h, v16.8h, #2 ret L(\type\()_8tap_filter_8): ld1 {v28.8b, v29.8b}, [\sr2], \s_strd ld1 {v30.8b, v31.8b}, [\src], \s_strd uxtl v28.8h, v28.8b uxtl v29.8h, v29.8b uxtl v30.8h, v30.8b uxtl v31.8h, v31.8b mul v24.8h, v28.8h, v0.h[0] mul v25.8h, v30.8h, v0.h[0] .irpc i, 1234567 ext v26.16b, v28.16b, v29.16b, #(2*\i) ext v27.16b, v30.16b, v31.16b, #(2*\i) mla v24.8h, v26.8h, v0.h[\i] mla v25.8h, v27.8h, v0.h[\i] .endr srshr v24.8h, v24.8h, #2 srshr v25.8h, v25.8h, #2 ret L(\type\()_8tap_hv_tbl): .hword L(\type\()_8tap_hv_tbl) - 1280b .hword L(\type\()_8tap_hv_tbl) - 640b .hword L(\type\()_8tap_hv_tbl) - 320b .hword L(\type\()_8tap_hv_tbl) - 160b .hword L(\type\()_8tap_hv_tbl) - 80b .hword L(\type\()_8tap_hv_tbl) - 40b .hword L(\type\()_8tap_hv_tbl) - 20b .hword 0 endfunc function \type\()_bilin_8bpc_neon, export=1 dup v1.16b, \mx dup v3.16b, \my mov w9, #16 sub w8, w9, \mx sub w9, w9, \my dup v0.16b, w8 dup v2.16b, w9 .ifc \type, prep uxtw \d_strd, \w lsl \d_strd, \d_strd, #1 .endif clz w8, \w sub w8, w8, #24 cbnz \mx, L(\type\()_bilin_h) cbnz \my, L(\type\()_bilin_v) b \type\()_neon L(\type\()_bilin_h): cbnz \my, L(\type\()_bilin_hv) adr x9, L(\type\()_bilin_h_tbl) ldrh w8, [x9, x8, lsl #1] sub x9, x9, w8, uxtw br x9 20: // 2xN h AARCH64_VALID_JUMP_TARGET .ifc \type, put add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \d_strd, \d_strd, #1 lsl \s_strd, \s_strd, #1 2: ld1 {v4.s}[0], [\src], \s_strd ld1 {v6.s}[0], [\sr2], \s_strd ext v5.8b, v4.8b, v4.8b, #1 ext v7.8b, v6.8b, v6.8b, #1 trn1 v4.4h, v4.4h, v6.4h trn1 v5.4h, v5.4h, v7.4h subs \h, \h, #2 umull v4.8h, v4.8b, v0.8b umlal v4.8h, v5.8b, v1.8b uqrshrn v4.8b, v4.8h, #4 st1 {v4.h}[0], [\dst], \d_strd st1 {v4.h}[1], [\ds2], \d_strd b.gt 2b ret .endif 40: // 4xN h AARCH64_VALID_JUMP_TARGET add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \d_strd, \d_strd, #1 lsl \s_strd, \s_strd, #1 4: ld1 {v4.8b}, [\src], \s_strd ld1 {v6.8b}, [\sr2], \s_strd ext v5.8b, v4.8b, v4.8b, #1 ext v7.8b, v6.8b, v6.8b, #1 trn1 v4.2s, v4.2s, v6.2s trn1 v5.2s, v5.2s, v7.2s subs \h, \h, #2 umull v4.8h, v4.8b, v0.8b umlal v4.8h, v5.8b, v1.8b .ifc \type, put uqrshrn v4.8b, v4.8h, #4 st1 {v4.s}[0], [\dst], \d_strd st1 {v4.s}[1], [\ds2], \d_strd .else st1 {v4.d}[0], [\dst], \d_strd st1 {v4.d}[1], [\ds2], \d_strd .endif b.gt 4b ret 80: // 8xN h AARCH64_VALID_JUMP_TARGET add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \d_strd, \d_strd, #1 lsl \s_strd, \s_strd, #1 8: ld1 {v4.16b}, [\src], \s_strd ld1 {v6.16b}, [\sr2], \s_strd ext v5.16b, v4.16b, v4.16b, #1 ext v7.16b, v6.16b, v6.16b, #1 subs \h, \h, #2 umull v4.8h, v4.8b, v0.8b umull v6.8h, v6.8b, v0.8b umlal v4.8h, v5.8b, v1.8b umlal v6.8h, v7.8b, v1.8b .ifc \type, put uqrshrn v4.8b, v4.8h, #4 uqrshrn v6.8b, v6.8h, #4 st1 {v4.8b}, [\dst], \d_strd st1 {v6.8b}, [\ds2], \d_strd .else st1 {v4.8h}, [\dst], \d_strd st1 {v6.8h}, [\ds2], \d_strd .endif b.gt 8b ret 160: 320: 640: 1280: // 16xN, 32xN, ... 
h AARCH64_VALID_JUMP_TARGET add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \s_strd, \s_strd, #1 sub \s_strd, \s_strd, \w, uxtw sub \s_strd, \s_strd, #8 .ifc \type, put lsl \d_strd, \d_strd, #1 sub \d_strd, \d_strd, \w, uxtw .endif 161: ld1 {v16.d}[1], [\src], #8 ld1 {v20.d}[1], [\sr2], #8 mov \mx, \w 16: ld1 {v18.16b}, [\src], #16 ld1 {v22.16b}, [\sr2], #16 ext v17.16b, v16.16b, v18.16b, #8 ext v19.16b, v16.16b, v18.16b, #9 ext v21.16b, v20.16b, v22.16b, #8 ext v23.16b, v20.16b, v22.16b, #9 umull v16.8h, v17.8b, v0.8b umull2 v17.8h, v17.16b, v0.16b umull v20.8h, v21.8b, v0.8b umull2 v21.8h, v21.16b, v0.16b umlal v16.8h, v19.8b, v1.8b umlal2 v17.8h, v19.16b, v1.16b umlal v20.8h, v23.8b, v1.8b umlal2 v21.8h, v23.16b, v1.16b subs \mx, \mx, #16 .ifc \type, put uqrshrn v16.8b, v16.8h, #4 uqrshrn2 v16.16b, v17.8h, #4 uqrshrn v20.8b, v20.8h, #4 uqrshrn2 v20.16b, v21.8h, #4 st1 {v16.16b}, [\dst], #16 st1 {v20.16b}, [\ds2], #16 .else st1 {v16.8h, v17.8h}, [\dst], #32 st1 {v20.8h, v21.8h}, [\ds2], #32 .endif b.le 9f mov v16.16b, v18.16b mov v20.16b, v22.16b b 16b 9: add \dst, \dst, \d_strd add \ds2, \ds2, \d_strd add \src, \src, \s_strd add \sr2, \sr2, \s_strd subs \h, \h, #2 b.gt 161b ret L(\type\()_bilin_h_tbl): .hword L(\type\()_bilin_h_tbl) - 1280b .hword L(\type\()_bilin_h_tbl) - 640b .hword L(\type\()_bilin_h_tbl) - 320b .hword L(\type\()_bilin_h_tbl) - 160b .hword L(\type\()_bilin_h_tbl) - 80b .hword L(\type\()_bilin_h_tbl) - 40b .hword L(\type\()_bilin_h_tbl) - 20b .hword 0 L(\type\()_bilin_v): cmp \h, #4 adr x9, L(\type\()_bilin_v_tbl) ldrh w8, [x9, x8, lsl #1] sub x9, x9, w8, uxtw br x9 20: // 2xN v AARCH64_VALID_JUMP_TARGET .ifc \type, put cmp \h, #2 add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 // 2x2 v ld1 {v16.h}[0], [\src], \s_strd b.gt 24f 22: ld1 {v17.h}[0], [\sr2], \s_strd ld1 {v18.h}[0], [\src], \s_strd trn1 v16.4h, v16.4h, v17.4h trn1 v17.4h, v17.4h, v18.4h umull v4.8h, v16.8b, v2.8b umlal v4.8h, v17.8b, v3.8b uqrshrn v4.8b, v4.8h, #4 st1 {v4.h}[0], [\dst] st1 {v4.h}[1], [\ds2] ret 24: // 2x4, 2x6, 2x8, ... 
v ld1 {v17.h}[0], [\sr2], \s_strd ld1 {v18.h}[0], [\src], \s_strd ld1 {v19.h}[0], [\sr2], \s_strd ld1 {v20.h}[0], [\src], \s_strd sub \h, \h, #4 trn1 v16.4h, v16.4h, v17.4h trn1 v17.4h, v17.4h, v18.4h trn1 v18.4h, v18.4h, v19.4h trn1 v19.4h, v19.4h, v20.4h trn1 v16.2s, v16.2s, v18.2s trn1 v17.2s, v17.2s, v19.2s umull v4.8h, v16.8b, v2.8b umlal v4.8h, v17.8b, v3.8b cmp \h, #2 uqrshrn v4.8b, v4.8h, #4 st1 {v4.h}[0], [\dst], \d_strd st1 {v4.h}[1], [\ds2], \d_strd st1 {v4.h}[2], [\dst], \d_strd st1 {v4.h}[3], [\ds2], \d_strd b.lt 0f mov v16.8b, v20.8b b.eq 22b b 24b 0: ret .endif 40: // 4xN v AARCH64_VALID_JUMP_TARGET add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 ld1 {v16.s}[0], [\src], \s_strd 4: ld1 {v17.s}[0], [\sr2], \s_strd ld1 {v18.s}[0], [\src], \s_strd trn1 v16.2s, v16.2s, v17.2s trn1 v17.2s, v17.2s, v18.2s umull v4.8h, v16.8b, v2.8b umlal v4.8h, v17.8b, v3.8b subs \h, \h, #2 .ifc \type, put uqrshrn v4.8b, v4.8h, #4 st1 {v4.s}[0], [\dst], \d_strd st1 {v4.s}[1], [\ds2], \d_strd .else st1 {v4.d}[0], [\dst], \d_strd st1 {v4.d}[1], [\ds2], \d_strd .endif b.le 0f mov v16.8b, v18.8b b 4b 0: ret 80: // 8xN v AARCH64_VALID_JUMP_TARGET add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 ld1 {v16.8b}, [\src], \s_strd 8: ld1 {v17.8b}, [\sr2], \s_strd ld1 {v18.8b}, [\src], \s_strd umull v4.8h, v16.8b, v2.8b umull v5.8h, v17.8b, v2.8b umlal v4.8h, v17.8b, v3.8b umlal v5.8h, v18.8b, v3.8b subs \h, \h, #2 .ifc \type, put uqrshrn v4.8b, v4.8h, #4 uqrshrn v5.8b, v5.8h, #4 st1 {v4.8b}, [\dst], \d_strd st1 {v5.8b}, [\ds2], \d_strd .else st1 {v4.8h}, [\dst], \d_strd st1 {v5.8h}, [\ds2], \d_strd .endif b.le 0f mov v16.8b, v18.8b b 8b 0: ret 160: // 16xN, 32xN, ... 320: 640: 1280: AARCH64_VALID_JUMP_TARGET mov \my, \h 1: add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 ld1 {v16.16b}, [\src], \s_strd 2: ld1 {v17.16b}, [\sr2], \s_strd ld1 {v18.16b}, [\src], \s_strd umull v4.8h, v16.8b, v2.8b umull2 v5.8h, v16.16b, v2.16b umull v6.8h, v17.8b, v2.8b umull2 v7.8h, v17.16b, v2.16b umlal v4.8h, v17.8b, v3.8b umlal2 v5.8h, v17.16b, v3.16b umlal v6.8h, v18.8b, v3.8b umlal2 v7.8h, v18.16b, v3.16b subs \h, \h, #2 .ifc \type, put uqrshrn v4.8b, v4.8h, #4 uqrshrn2 v4.16b, v5.8h, #4 uqrshrn v6.8b, v6.8h, #4 uqrshrn2 v6.16b, v7.8h, #4 st1 {v4.16b}, [\dst], \d_strd st1 {v6.16b}, [\ds2], \d_strd .else st1 {v4.8h, v5.8h}, [\dst], \d_strd st1 {v6.8h, v7.8h}, [\ds2], \d_strd .endif b.le 9f mov v16.16b, v18.16b b 2b 9: subs \w, \w, #16 b.le 0f asr \s_strd, \s_strd, #1 asr \d_strd, \d_strd, #1 msub \src, \s_strd, \xmy, \src msub \dst, \d_strd, \xmy, \dst sub \src, \src, \s_strd, lsl #1 mov \h, \my add \src, \src, #16 .ifc \type, put add \dst, \dst, #16 .else add \dst, \dst, #32 .endif b 1b 0: ret L(\type\()_bilin_v_tbl): .hword L(\type\()_bilin_v_tbl) - 1280b .hword L(\type\()_bilin_v_tbl) - 640b .hword L(\type\()_bilin_v_tbl) - 320b .hword L(\type\()_bilin_v_tbl) - 160b .hword L(\type\()_bilin_v_tbl) - 80b .hword L(\type\()_bilin_v_tbl) - 40b .hword L(\type\()_bilin_v_tbl) - 20b .hword 0 L(\type\()_bilin_hv): uxtl v2.8h, v2.8b uxtl v3.8h, v3.8b adr x9, L(\type\()_bilin_hv_tbl) ldrh w8, [x9, x8, lsl #1] sub x9, x9, w8, uxtw br x9 20: // 2xN hv AARCH64_VALID_JUMP_TARGET .ifc \type, put add \sr2, \src, \s_strd add \ds2, \dst, \d_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 ld1 {v28.s}[0], [\src], \s_strd ext v29.8b, v28.8b, v28.8b, #1 umull v16.8h, v28.8b, v0.8b umlal v16.8h, 
v29.8b, v1.8b 2: ld1 {v28.s}[0], [\sr2], \s_strd ld1 {v30.s}[0], [\src], \s_strd ext v29.8b, v28.8b, v28.8b, #1 ext v31.8b, v30.8b, v30.8b, #1 trn1 v28.4h, v28.4h, v30.4h trn1 v29.4h, v29.4h, v31.4h umull v17.8h, v28.8b, v0.8b umlal v17.8h, v29.8b, v1.8b trn1 v16.2s, v16.2s, v17.2s mul v4.4h, v16.4h, v2.4h mla v4.4h, v17.4h, v3.4h uqrshrn v4.8b, v4.8h, #8 subs \h, \h, #2 st1 {v4.h}[0], [\dst], \d_strd st1 {v4.h}[1], [\ds2], \d_strd b.le 0f trn2 v16.2s, v17.2s, v17.2s b 2b 0: ret .endif 40: // 4xN hv AARCH64_VALID_JUMP_TARGET add \sr2, \src, \s_strd add \ds2, \dst, \d_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 ld1 {v28.8b}, [\src], \s_strd ext v29.8b, v28.8b, v28.8b, #1 umull v16.8h, v28.8b, v0.8b umlal v16.8h, v29.8b, v1.8b 4: ld1 {v28.8b}, [\sr2], \s_strd ld1 {v30.8b}, [\src], \s_strd ext v29.8b, v28.8b, v28.8b, #1 ext v31.8b, v30.8b, v30.8b, #1 trn1 v28.2s, v28.2s, v30.2s trn1 v29.2s, v29.2s, v31.2s umull v17.8h, v28.8b, v0.8b umlal v17.8h, v29.8b, v1.8b trn1 v16.2d, v16.2d, v17.2d mul v4.8h, v16.8h, v2.8h mla v4.8h, v17.8h, v3.8h subs \h, \h, #2 .ifc \type, put uqrshrn v4.8b, v4.8h, #8 st1 {v4.s}[0], [\dst], \d_strd st1 {v4.s}[1], [\ds2], \d_strd .else urshr v4.8h, v4.8h, #4 st1 {v4.d}[0], [\dst], \d_strd st1 {v4.d}[1], [\ds2], \d_strd .endif b.le 0f trn2 v16.2d, v17.2d, v17.2d b 4b 0: ret 80: // 8xN, 16xN, ... hv 160: 320: 640: 1280: AARCH64_VALID_JUMP_TARGET mov \my, \h 1: add \sr2, \src, \s_strd add \ds2, \dst, \d_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 ld1 {v28.16b}, [\src], \s_strd ext v29.16b, v28.16b, v28.16b, #1 umull v16.8h, v28.8b, v0.8b umlal v16.8h, v29.8b, v1.8b 2: ld1 {v28.16b}, [\sr2], \s_strd ld1 {v30.16b}, [\src], \s_strd ext v29.16b, v28.16b, v28.16b, #1 ext v31.16b, v30.16b, v30.16b, #1 umull v17.8h, v28.8b, v0.8b umlal v17.8h, v29.8b, v1.8b umull v18.8h, v30.8b, v0.8b umlal v18.8h, v31.8b, v1.8b mul v4.8h, v16.8h, v2.8h mla v4.8h, v17.8h, v3.8h mul v5.8h, v17.8h, v2.8h mla v5.8h, v18.8h, v3.8h subs \h, \h, #2 .ifc \type, put uqrshrn v4.8b, v4.8h, #8 uqrshrn v5.8b, v5.8h, #8 st1 {v4.8b}, [\dst], \d_strd st1 {v5.8b}, [\ds2], \d_strd .else urshr v4.8h, v4.8h, #4 urshr v5.8h, v5.8h, #4 st1 {v4.8h}, [\dst], \d_strd st1 {v5.8h}, [\ds2], \d_strd .endif b.le 9f mov v16.16b, v18.16b b 2b 9: subs \w, \w, #8 b.le 0f asr \s_strd, \s_strd, #1 asr \d_strd, \d_strd, #1 msub \src, \s_strd, \xmy, \src msub \dst, \d_strd, \xmy, \dst sub \src, \src, \s_strd, lsl #1 mov \h, \my add \src, \src, #8 .ifc \type, put add \dst, \dst, #8 .else add \dst, \dst, #16 .endif b 1b 0: ret L(\type\()_bilin_hv_tbl): .hword L(\type\()_bilin_hv_tbl) - 1280b .hword L(\type\()_bilin_hv_tbl) - 640b .hword L(\type\()_bilin_hv_tbl) - 320b .hword L(\type\()_bilin_hv_tbl) - 160b .hword L(\type\()_bilin_hv_tbl) - 80b .hword L(\type\()_bilin_hv_tbl) - 40b .hword L(\type\()_bilin_hv_tbl) - 20b .hword 0 endfunc .endm filter_fn put, x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, x8, x9, 10 filter_fn prep, x0, x7, x1, x2, w3, w4, w5, x5, w6, x6, x8, x9, 6 .macro load_filter_row dst, src, inc asr w13, \src, #10 add \src, \src, \inc ldr \dst, [x11, w13, sxtw #3] .endm function warp_filter_horz_neon add w12, w5, #512 ld1 {v16.8b, v17.8b}, [x2], x3 load_filter_row d0, w12, w7 load_filter_row d1, w12, w7 load_filter_row d2, w12, w7 load_filter_row d3, w12, w7 load_filter_row d4, w12, w7 load_filter_row d5, w12, w7 load_filter_row d6, w12, w7 // subtract by 128 to allow using smull eor v16.8b, v16.8b, v22.8b eor v17.8b, v17.8b, v22.8b load_filter_row d7, w12, w7 ext v18.8b, v16.8b, v17.8b, #1 ext 
v19.8b, v16.8b, v17.8b, #2 smull v0.8h, v0.8b, v16.8b smull v1.8h, v1.8b, v18.8b ext v18.8b, v16.8b, v17.8b, #3 ext v20.8b, v16.8b, v17.8b, #4 smull v2.8h, v2.8b, v19.8b smull v3.8h, v3.8b, v18.8b ext v18.8b, v16.8b, v17.8b, #5 ext v19.8b, v16.8b, v17.8b, #6 smull v4.8h, v4.8b, v20.8b smull v5.8h, v5.8b, v18.8b ext v18.8b, v16.8b, v17.8b, #7 smull v6.8h, v6.8b, v19.8b smull v7.8h, v7.8b, v18.8b addp v0.8h, v0.8h, v1.8h addp v2.8h, v2.8h, v3.8h addp v4.8h, v4.8h, v5.8h addp v6.8h, v6.8h, v7.8h addp v0.8h, v0.8h, v2.8h addp v4.8h, v4.8h, v6.8h addp v0.8h, v0.8h, v4.8h add w5, w5, w8 ret endfunc // void dav1d_warp_affine_8x8_8bpc_neon( // pixel *dst, const ptrdiff_t dst_stride, // const pixel *src, const ptrdiff_t src_stride, // const int16_t *const abcd, int mx, int my) .macro warp t, shift function warp_affine_8x8\t\()_8bpc_neon, export=1 ldr x4, [x4] sbfx x7, x4, #0, #16 sbfx x8, x4, #16, #16 sbfx x9, x4, #32, #16 sbfx x4, x4, #48, #16 mov w10, #8 sub x2, x2, x3, lsl #1 sub x2, x2, x3 sub x2, x2, #3 movrel x11, X(mc_warp_filter), 64*8 mov x15, x30 .ifnb \t lsl x1, x1, #1 .endif movi v22.8b, #128 .ifb \t movi v23.8h, #128 .else movi v23.8h, #8, lsl #8 .endif bl warp_filter_horz_neon srshr v24.8h, v0.8h, #3 bl warp_filter_horz_neon srshr v25.8h, v0.8h, #3 bl warp_filter_horz_neon srshr v26.8h, v0.8h, #3 bl warp_filter_horz_neon srshr v27.8h, v0.8h, #3 bl warp_filter_horz_neon srshr v28.8h, v0.8h, #3 bl warp_filter_horz_neon srshr v29.8h, v0.8h, #3 bl warp_filter_horz_neon srshr v30.8h, v0.8h, #3 1: add w14, w6, #512 bl warp_filter_horz_neon srshr v31.8h, v0.8h, #3 load_filter_row d0, w14, w9 load_filter_row d1, w14, w9 load_filter_row d2, w14, w9 load_filter_row d3, w14, w9 load_filter_row d4, w14, w9 load_filter_row d5, w14, w9 load_filter_row d6, w14, w9 load_filter_row d7, w14, w9 transpose_8x8b_xtl v0, v1, v2, v3, v4, v5, v6, v7, sxtl // This ordering of smull/smlal/smull2/smlal2 is highly // beneficial for Cortex A53 here. 
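// Descriptive sketch of the vertical pass below (a reading of the code, not
// upstream documentation): after the transpose, v0-v7 appear to hold the 8
// vertical filter taps per output column, and v24-v31 the 8 horizontally
// filtered rows, so each lane accumulates
//   sum_k row[k][x] * vtap[k][x]
// in 32 bits, is narrowed with sqrshrn #shift, offset by v23 and, in the
// non-"t" variant, clipped to 8 bits with sqxtun.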
smull v16.4s, v24.4h, v0.4h smlal v16.4s, v25.4h, v1.4h smlal v16.4s, v26.4h, v2.4h smlal v16.4s, v27.4h, v3.4h smlal v16.4s, v28.4h, v4.4h smlal v16.4s, v29.4h, v5.4h smlal v16.4s, v30.4h, v6.4h smlal v16.4s, v31.4h, v7.4h smull2 v17.4s, v24.8h, v0.8h smlal2 v17.4s, v25.8h, v1.8h smlal2 v17.4s, v26.8h, v2.8h smlal2 v17.4s, v27.8h, v3.8h smlal2 v17.4s, v28.8h, v4.8h smlal2 v17.4s, v29.8h, v5.8h smlal2 v17.4s, v30.8h, v6.8h smlal2 v17.4s, v31.8h, v7.8h mov v24.16b, v25.16b mov v25.16b, v26.16b sqrshrn v16.4h, v16.4s, #\shift mov v26.16b, v27.16b sqrshrn2 v16.8h, v17.4s, #\shift mov v27.16b, v28.16b mov v28.16b, v29.16b add v16.8h, v16.8h, v23.8h .ifb \t sqxtun v16.8b, v16.8h .endif mov v29.16b, v30.16b mov v30.16b, v31.16b subs w10, w10, #1 .ifnb \t st1 {v16.8h}, [x0], x1 .else st1 {v16.8b}, [x0], x1 .endif add w6, w6, w4 b.gt 1b ret x15 endfunc .endm warp , 11 warp t, 7 // void dav1d_emu_edge_8bpc_neon( // const intptr_t bw, const intptr_t bh, // const intptr_t iw, const intptr_t ih, // const intptr_t x, const intptr_t y, // pixel *dst, const ptrdiff_t dst_stride, // const pixel *ref, const ptrdiff_t ref_stride) function emu_edge_8bpc_neon, export=1 ldp x8, x9, [sp] // ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride) // ref += iclip(x, 0, iw - 1) sub x12, x3, #1 // ih - 1 cmp x5, x3 sub x13, x2, #1 // iw - 1 csel x12, x12, x5, ge // min(y, ih - 1) cmp x4, x2 bic x12, x12, x12, asr #63 // max(min(y, ih - 1), 0) csel x13, x13, x4, ge // min(x, iw - 1) bic x13, x13, x13, asr #63 // max(min(x, iw - 1), 0) madd x8, x12, x9, x8 // ref += iclip() * stride add x8, x8, x13 // ref += iclip() // bottom_ext = iclip(y + bh - ih, 0, bh - 1) // top_ext = iclip(-y, 0, bh - 1) add x10, x5, x1 // y + bh neg x5, x5 // -y sub x10, x10, x3 // y + bh - ih sub x12, x1, #1 // bh - 1 cmp x10, x1 bic x5, x5, x5, asr #63 // max(-y, 0) csel x10, x10, x12, lt // min(y + bh - ih, bh-1) cmp x5, x1 bic x10, x10, x10, asr #63 // max(min(y + bh - ih, bh-1), 0) csel x5, x5, x12, lt // min(max(-y, 0), bh-1) // right_ext = iclip(x + bw - iw, 0, bw - 1) // left_ext = iclip(-x, 0, bw - 1) add x11, x4, x0 // x + bw neg x4, x4 // -x sub x11, x11, x2 // x + bw - iw sub x13, x0, #1 // bw - 1 cmp x11, x0 bic x4, x4, x4, asr #63 // max(-x, 0) csel x11, x11, x13, lt // min(x + bw - iw, bw-1) cmp x4, x0 bic x11, x11, x11, asr #63 // max(min(x + bw - iw, bw-1), 0) csel x4, x4, x13, lt // min(max(-x, 0), bw - 1) // center_h = bh - top_ext - bottom_ext // dst += top_ext * PXSTRIDE(dst_stride) // center_w = bw - left_ext - right_ext sub x1, x1, x5 // bh - top_ext madd x6, x5, x7, x6 sub x2, x0, x4 // bw - left_ext sub x1, x1, x10 // center_h = bh - top_ext - bottom_ext sub x2, x2, x11 // center_w = bw - left_ext - right_ext mov x14, x6 // backup of dst .macro v_loop need_left, need_right 0: .if \need_left ld1r {v0.16b}, [x8] mov x12, x6 // out = dst mov x3, x4 1: subs x3, x3, #16 st1 {v0.16b}, [x12], #16 b.gt 1b .endif mov x13, x8 add x12, x6, x4 // out = dst + left_ext mov x3, x2 1: ld1 {v0.16b, v1.16b}, [x13], #32 subs x3, x3, #32 st1 {v0.16b, v1.16b}, [x12], #32 b.gt 1b .if \need_right add x3, x8, x2 // in + center_w sub x3, x3, #1 // in + center_w - 1 add x12, x6, x4 // dst + left_ext ld1r {v0.16b}, [x3] add x12, x12, x2 // out = dst + left_ext + center_w mov x3, x11 1: subs x3, x3, #16 st1 {v0.16b}, [x12], #16 b.gt 1b .endif subs x1, x1, #1 // center_h-- add x6, x6, x7 add x8, x8, x9 b.gt 0b .endm cbz x4, 2f // need_left cbz x11, 3f // need_left + need_right v_loop 1, 1 b 5f 2: // !need_left cbz x11, 4f // !need_left + need_right 
v_loop 0, 1 b 5f 3: // need_left + !need_right v_loop 1, 0 b 5f 4: // !need_left + !need_right v_loop 0, 0 5: cbz x10, 3f // need_bottom sub x8, x6, x7 // ref = dst - stride mov x4, x0 1: ld1 {v0.16b, v1.16b}, [x8], #32 mov x3, x10 2: subs x3, x3, #1 st1 {v0.16b, v1.16b}, [x6], x7 b.gt 2b msub x6, x7, x10, x6 // dst -= bottom_ext * stride subs x4, x4, #32 // bw -= 32 add x6, x6, #32 // dst += 32 b.gt 1b 3: cbz x5, 3f // need_top msub x6, x7, x5, x14 // dst = stored_dst - top_ext * stride 1: ld1 {v0.16b, v1.16b}, [x14], #32 mov x3, x5 2: subs x3, x3, #1 st1 {v0.16b, v1.16b}, [x6], x7 b.gt 2b msub x6, x7, x5, x6 // dst -= top_ext * stride subs x0, x0, #32 // bw -= 32 add x6, x6, #32 // dst += 32 b.gt 1b 3: ret endfunc av-scenechange-0.14.1/src/asm/arm/64/mc16.S000064400000000000000000004312521046102023000160360ustar 00000000000000/* * Copyright © 2018, VideoLAN and dav1d authors * Copyright © 2018, Janne Grunau * Copyright © 2020, Martin Storsjo * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "src/asm/arm/asm.S" #include "util.S" #define PREP_BIAS 8192 .macro avg d0, d1, t0, t1, t2, t3 ld1 {\t0\().8h,\t1\().8h}, [x2], 32 ld1 {\t2\().8h,\t3\().8h}, [x3], 32 sqadd \t0\().8h, \t0\().8h, \t2\().8h sqadd \t1\().8h, \t1\().8h, \t3\().8h smax \t0\().8h, \t0\().8h, v28.8h // -2*PREP_BIAS - 1 << intermediate_bits smax \t1\().8h, \t1\().8h, v28.8h // -2*PREP_BIAS - 1 << intermediate_bits sqsub \t0\().8h, \t0\().8h, v28.8h // -2*PREP_BIAS - 1 << intermediate_bits sqsub \t1\().8h, \t1\().8h, v28.8h // -2*PREP_BIAS - 1 << intermediate_bits sshl \d0\().8h, \t0\().8h, v29.8h // -(intermediate_bits+1) sshl \d1\().8h, \t1\().8h, v29.8h // -(intermediate_bits+1) .endm .macro w_avg d0, d1, t0, t1, t2, t3 ld1 {\t0\().8h,\t1\().8h}, [x2], 32 ld1 {\t2\().8h,\t3\().8h}, [x3], 32 // This difference requires a 17 bit range, and all bits are // significant for the following multiplication. 
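// A rough reading of the blend below (sketch only): bidir_fn sets
// v27 = -weight for w_avg, so this computes
//   d = tmp2 + (((tmp1 - tmp2) * weight) >> 4)
// i.e. (weight*tmp1 + (16-weight)*tmp2)/16 up to rounding, evaluated in
// 32 bits because the difference of the 16-bit tmp1/tmp2 needs 17 bits.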
ssubl \d0\().4s, \t2\().4h, \t0\().4h ssubl2 \t0\().4s, \t2\().8h, \t0\().8h ssubl \d1\().4s, \t3\().4h, \t1\().4h ssubl2 \t1\().4s, \t3\().8h, \t1\().8h mul \d0\().4s, \d0\().4s, v27.4s mul \t0\().4s, \t0\().4s, v27.4s mul \d1\().4s, \d1\().4s, v27.4s mul \t1\().4s, \t1\().4s, v27.4s sshr \d0\().4s, \d0\().4s, #4 sshr \t0\().4s, \t0\().4s, #4 sshr \d1\().4s, \d1\().4s, #4 sshr \t1\().4s, \t1\().4s, #4 saddw \d0\().4s, \d0\().4s, \t2\().4h saddw2 \t0\().4s, \t0\().4s, \t2\().8h saddw \d1\().4s, \d1\().4s, \t3\().4h saddw2 \t1\().4s, \t1\().4s, \t3\().8h uzp1 \d0\().8h, \d0\().8h, \t0\().8h // Same as xtn, xtn2 uzp1 \d1\().8h, \d1\().8h, \t1\().8h // Ditto srshl \d0\().8h, \d0\().8h, v29.8h // -intermediate_bits srshl \d1\().8h, \d1\().8h, v29.8h // -intermediate_bits add \d0\().8h, \d0\().8h, v28.8h // PREP_BIAS >> intermediate_bits add \d1\().8h, \d1\().8h, v28.8h // PREP_BIAS >> intermediate_bits smin \d0\().8h, \d0\().8h, v31.8h // bitdepth_max smin \d1\().8h, \d1\().8h, v31.8h // bitdepth_max smax \d0\().8h, \d0\().8h, v30.8h // 0 smax \d1\().8h, \d1\().8h, v30.8h // 0 .endm .macro mask d0, d1, t0, t1, t2, t3 ld1 {v27.16b}, [x6], 16 ld1 {\t0\().8h,\t1\().8h}, [x2], 32 neg v27.16b, v27.16b ld1 {\t2\().8h,\t3\().8h}, [x3], 32 sxtl v26.8h, v27.8b sxtl2 v27.8h, v27.16b sxtl v24.4s, v26.4h sxtl2 v25.4s, v26.8h sxtl v26.4s, v27.4h sxtl2 v27.4s, v27.8h ssubl \d0\().4s, \t2\().4h, \t0\().4h ssubl2 \t0\().4s, \t2\().8h, \t0\().8h ssubl \d1\().4s, \t3\().4h, \t1\().4h ssubl2 \t1\().4s, \t3\().8h, \t1\().8h mul \d0\().4s, \d0\().4s, v24.4s mul \t0\().4s, \t0\().4s, v25.4s mul \d1\().4s, \d1\().4s, v26.4s mul \t1\().4s, \t1\().4s, v27.4s sshr \d0\().4s, \d0\().4s, #6 sshr \t0\().4s, \t0\().4s, #6 sshr \d1\().4s, \d1\().4s, #6 sshr \t1\().4s, \t1\().4s, #6 saddw \d0\().4s, \d0\().4s, \t2\().4h saddw2 \t0\().4s, \t0\().4s, \t2\().8h saddw \d1\().4s, \d1\().4s, \t3\().4h saddw2 \t1\().4s, \t1\().4s, \t3\().8h uzp1 \d0\().8h, \d0\().8h, \t0\().8h // Same as xtn, xtn2 uzp1 \d1\().8h, \d1\().8h, \t1\().8h // Ditto srshl \d0\().8h, \d0\().8h, v29.8h // -intermediate_bits srshl \d1\().8h, \d1\().8h, v29.8h // -intermediate_bits add \d0\().8h, \d0\().8h, v28.8h // PREP_BIAS >> intermediate_bits add \d1\().8h, \d1\().8h, v28.8h // PREP_BIAS >> intermediate_bits smin \d0\().8h, \d0\().8h, v31.8h // bitdepth_max smin \d1\().8h, \d1\().8h, v31.8h // bitdepth_max smax \d0\().8h, \d0\().8h, v30.8h // 0 smax \d1\().8h, \d1\().8h, v30.8h // 0 .endm .macro bidir_fn type, bdmax function \type\()_16bpc_neon, export=1 clz w4, w4 .ifnc \type, avg dup v31.8h, \bdmax // bitdepth_max movi v30.8h, #0 .endif clz w7, \bdmax sub w7, w7, #18 // intermediate_bits = clz(bitdepth_max) - 18 .ifc \type, avg mov w9, #1 mov w8, #-2*PREP_BIAS lsl w9, w9, w7 // 1 << intermediate_bits add w7, w7, #1 sub w8, w8, w9 // -2*PREP_BIAS - 1 << intermediate_bits neg w7, w7 // -(intermediate_bits+1) dup v28.8h, w8 // -2*PREP_BIAS - 1 << intermediate_bits dup v29.8h, w7 // -(intermediate_bits+1) .else mov w8, #PREP_BIAS lsr w8, w8, w7 // PREP_BIAS >> intermediate_bits neg w7, w7 // -intermediate_bits dup v28.8h, w8 // PREP_BIAS >> intermediate_bits dup v29.8h, w7 // -intermediate_bits .endif .ifc \type, w_avg dup v27.4s, w6 neg v27.4s, v27.4s .endif adr x7, L(\type\()_tbl) sub w4, w4, #24 \type v4, v5, v0, v1, v2, v3 ldrh w4, [x7, x4, lsl #1] sub x7, x7, w4, uxtw br x7 40: AARCH64_VALID_JUMP_TARGET add x7, x0, x1 lsl x1, x1, #1 4: subs w5, w5, #4 st1 {v4.d}[0], [x0], x1 st1 {v4.d}[1], [x7], x1 st1 {v5.d}[0], [x0], x1 st1 {v5.d}[1], [x7], x1 b.le 
0f \type v4, v5, v0, v1, v2, v3 b 4b 80: AARCH64_VALID_JUMP_TARGET add x7, x0, x1 lsl x1, x1, #1 8: st1 {v4.8h}, [x0], x1 subs w5, w5, #2 st1 {v5.8h}, [x7], x1 b.le 0f \type v4, v5, v0, v1, v2, v3 b 8b 16: AARCH64_VALID_JUMP_TARGET \type v6, v7, v0, v1, v2, v3 st1 {v4.8h, v5.8h}, [x0], x1 subs w5, w5, #2 st1 {v6.8h, v7.8h}, [x0], x1 b.le 0f \type v4, v5, v0, v1, v2, v3 b 16b 32: AARCH64_VALID_JUMP_TARGET \type v6, v7, v0, v1, v2, v3 subs w5, w5, #1 st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x1 b.le 0f \type v4, v5, v0, v1, v2, v3 b 32b 640: AARCH64_VALID_JUMP_TARGET add x7, x0, #64 64: \type v6, v7, v0, v1, v2, v3 \type v16, v17, v0, v1, v2, v3 st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x1 \type v18, v19, v0, v1, v2, v3 subs w5, w5, #1 st1 {v16.8h,v17.8h,v18.8h,v19.8h}, [x7], x1 b.le 0f \type v4, v5, v0, v1, v2, v3 b 64b 1280: AARCH64_VALID_JUMP_TARGET add x7, x0, #64 mov x8, #128 sub x1, x1, #128 128: \type v6, v7, v0, v1, v2, v3 \type v16, v17, v0, v1, v2, v3 st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x8 \type v18, v19, v0, v1, v2, v3 st1 {v16.8h,v17.8h,v18.8h,v19.8h}, [x7], x8 \type v4, v5, v0, v1, v2, v3 \type v6, v7, v0, v1, v2, v3 \type v16, v17, v0, v1, v2, v3 subs w5, w5, #1 st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x1 \type v18, v19, v0, v1, v2, v3 st1 {v16.8h,v17.8h,v18.8h,v19.8h}, [x7], x1 b.le 0f \type v4, v5, v0, v1, v2, v3 b 128b 0: ret L(\type\()_tbl): .hword L(\type\()_tbl) - 1280b .hword L(\type\()_tbl) - 640b .hword L(\type\()_tbl) - 32b .hword L(\type\()_tbl) - 16b .hword L(\type\()_tbl) - 80b .hword L(\type\()_tbl) - 40b endfunc .endm bidir_fn avg, w6 bidir_fn w_avg, w7 bidir_fn mask, w7 .macro w_mask_fn type function w_mask_\type\()_16bpc_neon, export=1 ldr w8, [sp] clz w9, w4 adr x10, L(w_mask_\type\()_tbl) dup v31.8h, w8 // bitdepth_max sub w9, w9, #24 clz w8, w8 // clz(bitdepth_max) ldrh w9, [x10, x9, lsl #1] sub x10, x10, w9, uxtw sub w8, w8, #12 // sh = intermediate_bits + 6 = clz(bitdepth_max) - 12 mov w9, #PREP_BIAS*64 neg w8, w8 // -sh mov w11, #27615 // (64 + 1 - 38)<> mask_sh ushr v21.8h, v21.8h, #10 add v4.4s, v4.4s, v30.4s // += PREP_BIAS*64 add v5.4s, v5.4s, v30.4s add v6.4s, v6.4s, v30.4s add v7.4s, v7.4s, v30.4s uxtl v22.4s, v20.4h uxtl2 v23.4s, v20.8h uxtl v24.4s, v21.4h uxtl2 v25.4s, v21.8h mla v4.4s, v16.4s, v22.4s // (tmp2-tmp1)*(64-m) mla v5.4s, v17.4s, v23.4s mla v6.4s, v18.4s, v24.4s mla v7.4s, v19.4s, v25.4s srshl v4.4s, v4.4s, v29.4s // (tmp1<<6 + (tmp2-tmp1)*(64-m) + (1 << (sh-1)) + PREP_BIAS*64) >> sh srshl v5.4s, v5.4s, v29.4s srshl v6.4s, v6.4s, v29.4s srshl v7.4s, v7.4s, v29.4s sqxtun v4.4h, v4.4s // iclip_pixel sqxtun2 v4.8h, v5.4s sqxtun v5.4h, v6.4s sqxtun2 v5.8h, v7.4s umin v4.8h, v4.8h, v31.8h // iclip_pixel umin v5.8h, v5.8h, v31.8h .if \type == 444 uzp1 v20.16b, v20.16b, v21.16b // 64 - m sub v20.16b, v1.16b, v20.16b // m st1 {v20.16b}, [x6], #16 .elseif \type == 422 addp v20.8h, v20.8h, v21.8h // (64 - m) + (64 - n) (column wise addition) xtn v20.8b, v20.8h uhsub v20.8b, v3.8b, v20.8b // ((129 - sign) - ((64 - m) + (64 - n)) >> 1 st1 {v20.8b}, [x6], #8 .elseif \type == 420 trn1 v24.2d, v20.2d, v21.2d trn2 v25.2d, v20.2d, v21.2d add v24.8h, v24.8h, v25.8h // (64 - my1) + (64 - my2) (row wise addition) addp v20.8h, v24.8h, v24.8h // (128 - m) + (128 - n) (column wise addition) sub v20.4h, v3.4h, v20.4h // (256 - sign) - ((128 - m) + (128 - n)) rshrn v20.8b, v20.8h, #2 // ((256 - sign) - ((128 - m) + (128 - n)) + 2) >> 2 st1 {v20.s}[0], [x6], #4 .endif st1 {v4.d}[0], [x0], x1 st1 {v4.d}[1], [x12], x1 st1 {v5.d}[0], [x0], x1 st1 {v5.d}[1], [x12], 
x1 b.gt 4b ret 8: AARCH64_VALID_JUMP_TARGET ld1 {v4.8h, v5.8h}, [x2], #32 // tmp1 ld1 {v6.8h, v7.8h}, [x3], #32 // tmp2 subs w5, w5, #2 sabd v20.8h, v4.8h, v6.8h // abs(tmp1 - tmp2) sabd v21.8h, v5.8h, v7.8h ssubl v16.4s, v6.4h, v4.4h // tmp2 - tmp1 (requires 17 bit) ssubl2 v17.4s, v6.8h, v4.8h ssubl v18.4s, v7.4h, v5.4h ssubl2 v19.4s, v7.8h, v5.8h uqsub v20.8h, v0.8h, v20.8h // 27615 - abs() uqsub v21.8h, v0.8h, v21.8h sshll2 v7.4s, v5.8h, #6 // tmp1 << 6 sshll v6.4s, v5.4h, #6 sshll2 v5.4s, v4.8h, #6 sshll v4.4s, v4.4h, #6 ushr v20.8h, v20.8h, #10 // 64-m = (27615 - abs()) >> mask_sh ushr v21.8h, v21.8h, #10 add v4.4s, v4.4s, v30.4s // += PREP_BIAS*64 add v5.4s, v5.4s, v30.4s add v6.4s, v6.4s, v30.4s add v7.4s, v7.4s, v30.4s uxtl v22.4s, v20.4h uxtl2 v23.4s, v20.8h uxtl v24.4s, v21.4h uxtl2 v25.4s, v21.8h mla v4.4s, v16.4s, v22.4s // (tmp2-tmp1)*(64-m) mla v5.4s, v17.4s, v23.4s mla v6.4s, v18.4s, v24.4s mla v7.4s, v19.4s, v25.4s srshl v4.4s, v4.4s, v29.4s // (tmp1<<6 + (tmp2-tmp1)*(64-m) + (1 << (sh-1)) + PREP_BIAS*64) >> sh srshl v5.4s, v5.4s, v29.4s srshl v6.4s, v6.4s, v29.4s srshl v7.4s, v7.4s, v29.4s sqxtun v4.4h, v4.4s // iclip_pixel sqxtun2 v4.8h, v5.4s sqxtun v5.4h, v6.4s sqxtun2 v5.8h, v7.4s umin v4.8h, v4.8h, v31.8h // iclip_pixel umin v5.8h, v5.8h, v31.8h .if \type == 444 uzp1 v20.16b, v20.16b, v21.16b // 64 - m sub v20.16b, v1.16b, v20.16b // m st1 {v20.16b}, [x6], #16 .elseif \type == 422 addp v20.8h, v20.8h, v21.8h // (64 - m) + (64 - n) (column wise addition) xtn v20.8b, v20.8h uhsub v20.8b, v3.8b, v20.8b // ((129 - sign) - ((64 - m) + (64 - n)) >> 1 st1 {v20.8b}, [x6], #8 .elseif \type == 420 add v20.8h, v20.8h, v21.8h // (64 - my1) + (64 - my2) (row wise addition) addp v20.8h, v20.8h, v20.8h // (128 - m) + (128 - n) (column wise addition) sub v20.4h, v3.4h, v20.4h // (256 - sign) - ((128 - m) + (128 - n)) rshrn v20.8b, v20.8h, #2 // ((256 - sign) - ((128 - m) + (128 - n)) + 2) >> 2 st1 {v20.s}[0], [x6], #4 .endif st1 {v4.8h}, [x0], x1 st1 {v5.8h}, [x12], x1 b.gt 8b ret 1280: 640: 320: 160: AARCH64_VALID_JUMP_TARGET mov w11, w4 sub x1, x1, w4, uxtw #1 .if \type == 444 add x10, x6, w4, uxtw .elseif \type == 422 add x10, x6, x11, lsr #1 .endif add x9, x3, w4, uxtw #1 add x7, x2, w4, uxtw #1 161: mov w8, w4 16: ld1 {v4.8h, v5.8h}, [x2], #32 // tmp1 ld1 {v16.8h, v17.8h}, [x3], #32 // tmp2 ld1 {v6.8h, v7.8h}, [x7], #32 ld1 {v18.8h, v19.8h}, [x9], #32 subs w8, w8, #16 sabd v20.8h, v4.8h, v16.8h // abs(tmp1 - tmp2) sabd v21.8h, v5.8h, v17.8h ssubl v22.4s, v16.4h, v4.4h // tmp2 - tmp1 (requires 17 bit) ssubl2 v23.4s, v16.8h, v4.8h ssubl v24.4s, v17.4h, v5.4h ssubl2 v25.4s, v17.8h, v5.8h uqsub v20.8h, v0.8h, v20.8h // 27615 - abs() uqsub v21.8h, v0.8h, v21.8h sshll2 v27.4s, v5.8h, #6 // tmp1 << 6 sshll v26.4s, v5.4h, #6 sshll2 v5.4s, v4.8h, #6 sshll v4.4s, v4.4h, #6 ushr v20.8h, v20.8h, #10 // 64-m = (27615 - abs()) >> mask_sh ushr v21.8h, v21.8h, #10 add v4.4s, v4.4s, v30.4s // += PREP_BIAS*64 add v5.4s, v5.4s, v30.4s add v26.4s, v26.4s, v30.4s add v27.4s, v27.4s, v30.4s uxtl v16.4s, v20.4h uxtl2 v17.4s, v20.8h uxtl v28.4s, v21.4h mla v4.4s, v22.4s, v16.4s // (tmp2-tmp1)*(64-m) uxtl2 v16.4s, v21.8h mla v5.4s, v23.4s, v17.4s mla v26.4s, v24.4s, v28.4s mla v27.4s, v25.4s, v16.4s srshl v4.4s, v4.4s, v29.4s // (tmp1<<6 + (tmp2-tmp1)*(64-m) + (1 << (sh-1)) + PREP_BIAS*64) >> sh srshl v5.4s, v5.4s, v29.4s srshl v26.4s, v26.4s, v29.4s srshl v27.4s, v27.4s, v29.4s sqxtun v4.4h, v4.4s // iclip_pixel sqxtun2 v4.8h, v5.4s sqxtun v5.4h, v26.4s sqxtun2 v5.8h, v27.4s // Start of other half sabd 
v22.8h, v6.8h, v18.8h // abs(tmp1 - tmp2) sabd v23.8h, v7.8h, v19.8h umin v4.8h, v4.8h, v31.8h // iclip_pixel umin v5.8h, v5.8h, v31.8h ssubl v16.4s, v18.4h, v6.4h // tmp2 - tmp1 (requires 17 bit) ssubl2 v17.4s, v18.8h, v6.8h ssubl v18.4s, v19.4h, v7.4h ssubl2 v19.4s, v19.8h, v7.8h uqsub v22.8h, v0.8h, v22.8h // 27615 - abs() uqsub v23.8h, v0.8h, v23.8h sshll v24.4s, v6.4h, #6 // tmp1 << 6 sshll2 v25.4s, v6.8h, #6 sshll v26.4s, v7.4h, #6 sshll2 v27.4s, v7.8h, #6 ushr v22.8h, v22.8h, #10 // 64-m = (27615 - abs()) >> mask_sh ushr v23.8h, v23.8h, #10 add v24.4s, v24.4s, v30.4s // += PREP_BIAS*64 add v25.4s, v25.4s, v30.4s add v26.4s, v26.4s, v30.4s add v27.4s, v27.4s, v30.4s uxtl v6.4s, v22.4h uxtl2 v7.4s, v22.8h uxtl v28.4s, v23.4h mla v24.4s, v16.4s, v6.4s // (tmp2-tmp1)*(64-m) uxtl2 v6.4s, v23.8h mla v25.4s, v17.4s, v7.4s mla v26.4s, v18.4s, v28.4s mla v27.4s, v19.4s, v6.4s srshl v24.4s, v24.4s, v29.4s // (tmp1<<6 + (tmp2-tmp1)*(64-m) + (1 << (sh-1)) + PREP_BIAS*64) >> sh srshl v25.4s, v25.4s, v29.4s srshl v26.4s, v26.4s, v29.4s srshl v27.4s, v27.4s, v29.4s sqxtun v6.4h, v24.4s // iclip_pixel sqxtun2 v6.8h, v25.4s sqxtun v7.4h, v26.4s sqxtun2 v7.8h, v27.4s umin v6.8h, v6.8h, v31.8h // iclip_pixel umin v7.8h, v7.8h, v31.8h .if \type == 444 uzp1 v20.16b, v20.16b, v21.16b // 64 - m uzp1 v21.16b, v22.16b, v23.16b sub v20.16b, v1.16b, v20.16b // m sub v21.16b, v1.16b, v21.16b st1 {v20.16b}, [x6], #16 st1 {v21.16b}, [x10], #16 .elseif \type == 422 addp v20.8h, v20.8h, v21.8h // (64 - m) + (64 - n) (column wise addition) addp v21.8h, v22.8h, v23.8h xtn v20.8b, v20.8h xtn v21.8b, v21.8h uhsub v20.8b, v3.8b, v20.8b // ((129 - sign) - ((64 - m) + (64 - n)) >> 1 uhsub v21.8b, v3.8b, v21.8b st1 {v20.8b}, [x6], #8 st1 {v21.8b}, [x10], #8 .elseif \type == 420 add v20.8h, v20.8h, v22.8h // (64 - my1) + (64 - my2) (row wise addition) add v21.8h, v21.8h, v23.8h addp v20.8h, v20.8h, v21.8h // (128 - m) + (128 - n) (column wise addition) sub v20.8h, v3.8h, v20.8h // (256 - sign) - ((128 - m) + (128 - n)) rshrn v20.8b, v20.8h, #2 // ((256 - sign) - ((128 - m) + (128 - n)) + 2) >> 2 st1 {v20.8b}, [x6], #8 .endif st1 {v4.8h, v5.8h}, [x0], #32 st1 {v6.8h, v7.8h}, [x12], #32 b.gt 16b subs w5, w5, #2 add x2, x2, w4, uxtw #1 add x3, x3, w4, uxtw #1 add x7, x7, w4, uxtw #1 add x9, x9, w4, uxtw #1 .if \type == 444 add x6, x6, w4, uxtw add x10, x10, w4, uxtw .elseif \type == 422 add x6, x6, x11, lsr #1 add x10, x10, x11, lsr #1 .endif add x0, x0, x1 add x12, x12, x1 b.gt 161b ret L(w_mask_\type\()_tbl): .hword L(w_mask_\type\()_tbl) - 1280b .hword L(w_mask_\type\()_tbl) - 640b .hword L(w_mask_\type\()_tbl) - 320b .hword L(w_mask_\type\()_tbl) - 160b .hword L(w_mask_\type\()_tbl) - 8b .hword L(w_mask_\type\()_tbl) - 4b endfunc .endm w_mask_fn 444 w_mask_fn 422 w_mask_fn 420 function blend_16bpc_neon, export=1 adr x6, L(blend_tbl) clz w3, w3 sub w3, w3, #26 ldrh w3, [x6, x3, lsl #1] sub x6, x6, w3, uxtw add x8, x0, x1 br x6 40: AARCH64_VALID_JUMP_TARGET lsl x1, x1, #1 4: ld1 {v2.8b}, [x5], #8 ld1 {v1.8h}, [x2], #16 ld1 {v0.d}[0], [x0] neg v2.8b, v2.8b // -m subs w4, w4, #2 ld1 {v0.d}[1], [x8] sxtl v2.8h, v2.8b shl v2.8h, v2.8h, #9 // -m << 9 sub v1.8h, v0.8h, v1.8h // a - b sqrdmulh v1.8h, v1.8h, v2.8h // ((a-b)*-m + 32) >> 6 add v0.8h, v0.8h, v1.8h st1 {v0.d}[0], [x0], x1 st1 {v0.d}[1], [x8], x1 b.gt 4b ret 80: AARCH64_VALID_JUMP_TARGET lsl x1, x1, #1 8: ld1 {v4.16b}, [x5], #16 ld1 {v2.8h, v3.8h}, [x2], #32 neg v5.16b, v4.16b // -m ld1 {v0.8h}, [x0] ld1 {v1.8h}, [x8] sxtl v4.8h, v5.8b sxtl2 v5.8h, v5.16b shl v4.8h, 
v4.8h, #9 // -m << 9 shl v5.8h, v5.8h, #9 sub v2.8h, v0.8h, v2.8h // a - b sub v3.8h, v1.8h, v3.8h subs w4, w4, #2 sqrdmulh v2.8h, v2.8h, v4.8h // ((a-b)*-m + 32) >> 6 sqrdmulh v3.8h, v3.8h, v5.8h add v0.8h, v0.8h, v2.8h add v1.8h, v1.8h, v3.8h st1 {v0.8h}, [x0], x1 st1 {v1.8h}, [x8], x1 b.gt 8b ret 160: AARCH64_VALID_JUMP_TARGET lsl x1, x1, #1 16: ld1 {v16.16b, v17.16b}, [x5], #32 ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64 subs w4, w4, #2 neg v18.16b, v16.16b // -m neg v19.16b, v17.16b ld1 {v0.8h, v1.8h}, [x0] sxtl v16.8h, v18.8b sxtl2 v17.8h, v18.16b sxtl v18.8h, v19.8b sxtl2 v19.8h, v19.16b ld1 {v2.8h, v3.8h}, [x8] shl v16.8h, v16.8h, #9 // -m << 9 shl v17.8h, v17.8h, #9 shl v18.8h, v18.8h, #9 shl v19.8h, v19.8h, #9 sub v4.8h, v0.8h, v4.8h // a - b sub v5.8h, v1.8h, v5.8h sub v6.8h, v2.8h, v6.8h sub v7.8h, v3.8h, v7.8h sqrdmulh v4.8h, v4.8h, v16.8h // ((a-b)*-m + 32) >> 6 sqrdmulh v5.8h, v5.8h, v17.8h sqrdmulh v6.8h, v6.8h, v18.8h sqrdmulh v7.8h, v7.8h, v19.8h add v0.8h, v0.8h, v4.8h add v1.8h, v1.8h, v5.8h add v2.8h, v2.8h, v6.8h add v3.8h, v3.8h, v7.8h st1 {v0.8h, v1.8h}, [x0], x1 st1 {v2.8h, v3.8h}, [x8], x1 b.gt 16b ret 32: AARCH64_VALID_JUMP_TARGET ld1 {v16.16b, v17.16b}, [x5], #32 ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64 subs w4, w4, #1 neg v18.16b, v16.16b // -m neg v19.16b, v17.16b sxtl v16.8h, v18.8b sxtl2 v17.8h, v18.16b sxtl v18.8h, v19.8b sxtl2 v19.8h, v19.16b ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0] shl v16.8h, v16.8h, #9 // -m << 9 shl v17.8h, v17.8h, #9 shl v18.8h, v18.8h, #9 shl v19.8h, v19.8h, #9 sub v4.8h, v0.8h, v4.8h // a - b sub v5.8h, v1.8h, v5.8h sub v6.8h, v2.8h, v6.8h sub v7.8h, v3.8h, v7.8h sqrdmulh v4.8h, v4.8h, v16.8h // ((a-b)*-m + 32) >> 6 sqrdmulh v5.8h, v5.8h, v17.8h sqrdmulh v6.8h, v6.8h, v18.8h sqrdmulh v7.8h, v7.8h, v19.8h add v0.8h, v0.8h, v4.8h add v1.8h, v1.8h, v5.8h add v2.8h, v2.8h, v6.8h add v3.8h, v3.8h, v7.8h st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 b.gt 32b ret L(blend_tbl): .hword L(blend_tbl) - 32b .hword L(blend_tbl) - 160b .hword L(blend_tbl) - 80b .hword L(blend_tbl) - 40b endfunc function blend_h_16bpc_neon, export=1 adr x6, L(blend_h_tbl) movrel x5, X(obmc_masks) add x5, x5, w4, uxtw sub w4, w4, w4, lsr #2 clz w7, w3 add x8, x0, x1 lsl x1, x1, #1 sub w7, w7, #24 ldrh w7, [x6, x7, lsl #1] sub x6, x6, w7, uxtw br x6 2: AARCH64_VALID_JUMP_TARGET ld2r {v2.8b, v3.8b}, [x5], #2 ld1 {v1.4h}, [x2], #8 ext v2.8b, v2.8b, v3.8b, #6 subs w4, w4, #2 neg v2.8b, v2.8b // -m ld1 {v0.s}[0], [x0] ld1 {v0.s}[1], [x8] sxtl v2.8h, v2.8b shl v2.4h, v2.4h, #9 // -m << 9 sub v1.4h, v0.4h, v1.4h // a - b sqrdmulh v1.4h, v1.4h, v2.4h // ((a-b)*-m + 32) >> 6 add v0.4h, v0.4h, v1.4h st1 {v0.s}[0], [x0], x1 st1 {v0.s}[1], [x8], x1 b.gt 2b ret 4: AARCH64_VALID_JUMP_TARGET ld2r {v2.8b, v3.8b}, [x5], #2 ld1 {v1.8h}, [x2], #16 ext v2.8b, v2.8b, v3.8b, #4 subs w4, w4, #2 neg v2.8b, v2.8b // -m ld1 {v0.d}[0], [x0] ld1 {v0.d}[1], [x8] sxtl v2.8h, v2.8b shl v2.8h, v2.8h, #9 // -m << 9 sub v1.8h, v0.8h, v1.8h // a - b sqrdmulh v1.8h, v1.8h, v2.8h // ((a-b)*-m + 32) >> 6 add v0.8h, v0.8h, v1.8h st1 {v0.d}[0], [x0], x1 st1 {v0.d}[1], [x8], x1 b.gt 4b ret 8: AARCH64_VALID_JUMP_TARGET ld2r {v4.8b, v5.8b}, [x5], #2 ld1 {v2.8h, v3.8h}, [x2], #32 neg v4.8b, v4.8b // -m neg v5.8b, v5.8b ld1 {v0.8h}, [x0] subs w4, w4, #2 sxtl v4.8h, v4.8b sxtl v5.8h, v5.8b ld1 {v1.8h}, [x8] shl v4.8h, v4.8h, #9 // -m << 9 shl v5.8h, v5.8h, #9 sub v2.8h, v0.8h, v2.8h // a - b sub v3.8h, v1.8h, v3.8h sqrdmulh v2.8h, v2.8h, v4.8h // ((a-b)*-m + 32) >> 6 sqrdmulh v3.8h, v3.8h, v5.8h add v0.8h, 
v0.8h, v2.8h add v1.8h, v1.8h, v3.8h st1 {v0.8h}, [x0], x1 st1 {v1.8h}, [x8], x1 b.gt 8b ret 16: AARCH64_VALID_JUMP_TARGET ld2r {v16.8b, v17.8b}, [x5], #2 ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64 neg v16.8b, v16.8b // -m neg v17.8b, v17.8b ld1 {v0.8h, v1.8h}, [x0] ld1 {v2.8h, v3.8h}, [x8] subs w4, w4, #2 sxtl v16.8h, v16.8b sxtl v17.8h, v17.8b shl v16.8h, v16.8h, #9 // -m << 9 shl v17.8h, v17.8h, #9 sub v4.8h, v0.8h, v4.8h // a - b sub v5.8h, v1.8h, v5.8h sub v6.8h, v2.8h, v6.8h sub v7.8h, v3.8h, v7.8h sqrdmulh v4.8h, v4.8h, v16.8h // ((a-b)*-m + 32) >> 6 sqrdmulh v5.8h, v5.8h, v16.8h sqrdmulh v6.8h, v6.8h, v17.8h sqrdmulh v7.8h, v7.8h, v17.8h add v0.8h, v0.8h, v4.8h add v1.8h, v1.8h, v5.8h add v2.8h, v2.8h, v6.8h add v3.8h, v3.8h, v7.8h st1 {v0.8h, v1.8h}, [x0], x1 st1 {v2.8h, v3.8h}, [x8], x1 b.gt 16b ret 1280: 640: 320: AARCH64_VALID_JUMP_TARGET sub x1, x1, w3, uxtw #1 add x7, x2, w3, uxtw #1 321: ld2r {v24.8b, v25.8b}, [x5], #2 mov w6, w3 neg v24.8b, v24.8b // -m neg v25.8b, v25.8b sxtl v24.8h, v24.8b sxtl v25.8h, v25.8b shl v24.8h, v24.8h, #9 // -m << 9 shl v25.8h, v25.8h, #9 32: ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x2], #64 ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0] subs w6, w6, #32 sub v16.8h, v0.8h, v16.8h // a - b sub v17.8h, v1.8h, v17.8h sub v18.8h, v2.8h, v18.8h sub v19.8h, v3.8h, v19.8h ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x7], #64 ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x8] sqrdmulh v16.8h, v16.8h, v24.8h // ((a-b)*-m + 32) >> 6 sqrdmulh v17.8h, v17.8h, v24.8h sqrdmulh v18.8h, v18.8h, v24.8h sqrdmulh v19.8h, v19.8h, v24.8h sub v20.8h, v4.8h, v20.8h // a - b sub v21.8h, v5.8h, v21.8h sub v22.8h, v6.8h, v22.8h sub v23.8h, v7.8h, v23.8h add v0.8h, v0.8h, v16.8h add v1.8h, v1.8h, v17.8h add v2.8h, v2.8h, v18.8h add v3.8h, v3.8h, v19.8h sqrdmulh v20.8h, v20.8h, v25.8h // ((a-b)*-m + 32) >> 6 sqrdmulh v21.8h, v21.8h, v25.8h sqrdmulh v22.8h, v22.8h, v25.8h sqrdmulh v23.8h, v23.8h, v25.8h st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 add v4.8h, v4.8h, v20.8h add v5.8h, v5.8h, v21.8h add v6.8h, v6.8h, v22.8h add v7.8h, v7.8h, v23.8h st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x8], #64 b.gt 32b subs w4, w4, #2 add x0, x0, x1 add x8, x8, x1 add x2, x2, w3, uxtw #1 add x7, x7, w3, uxtw #1 b.gt 321b ret L(blend_h_tbl): .hword L(blend_h_tbl) - 1280b .hword L(blend_h_tbl) - 640b .hword L(blend_h_tbl) - 320b .hword L(blend_h_tbl) - 16b .hword L(blend_h_tbl) - 8b .hword L(blend_h_tbl) - 4b .hword L(blend_h_tbl) - 2b endfunc function blend_v_16bpc_neon, export=1 adr x6, L(blend_v_tbl) movrel x5, X(obmc_masks) add x5, x5, w3, uxtw clz w3, w3 add x8, x0, x1 lsl x1, x1, #1 sub w3, w3, #26 ldrh w3, [x6, x3, lsl #1] sub x6, x6, w3, uxtw br x6 20: AARCH64_VALID_JUMP_TARGET ld1r {v2.8b}, [x5] neg v2.8b, v2.8b // -m sxtl v2.8h, v2.8b shl v2.4h, v2.4h, #9 // -m << 9 2: ld1 {v1.s}[0], [x2], #4 ld1 {v0.h}[0], [x0] subs w4, w4, #2 ld1 {v1.h}[1], [x2] ld1 {v0.h}[1], [x8] add x2, x2, #4 sub v1.4h, v0.4h, v1.4h // a - b sqrdmulh v1.4h, v1.4h, v2.4h // ((a-b)*-m + 32) >> 6 add v0.4h, v0.4h, v1.4h st1 {v0.h}[0], [x0], x1 st1 {v0.h}[1], [x8], x1 b.gt 2b ret 40: AARCH64_VALID_JUMP_TARGET ld1r {v2.2s}, [x5] sub x1, x1, #4 neg v2.8b, v2.8b // -m sxtl v2.8h, v2.8b shl v2.8h, v2.8h, #9 // -m << 9 4: ld1 {v1.8h}, [x2], #16 ld1 {v0.d}[0], [x0] ld1 {v0.d}[1], [x8] subs w4, w4, #2 sub v1.8h, v0.8h, v1.8h // a - b sqrdmulh v1.8h, v1.8h, v2.8h // ((a-b)*-m + 32) >> 6 add v0.8h, v0.8h, v1.8h st1 {v0.s}[0], [x0], #4 st1 {v0.s}[2], [x8], #4 st1 {v0.h}[2], [x0], x1 st1 {v0.h}[6], [x8], x1 b.gt 4b ret 80: AARCH64_VALID_JUMP_TARGET ld1 
{v4.8b}, [x5] sub x1, x1, #8 neg v4.8b, v4.8b // -m sxtl v4.8h, v4.8b shl v4.8h, v4.8h, #9 // -m << 9 8: ld1 {v2.8h, v3.8h}, [x2], #32 ld1 {v0.8h}, [x0] ld1 {v1.8h}, [x8] subs w4, w4, #2 sub v2.8h, v0.8h, v2.8h // a - b sub v3.8h, v1.8h, v3.8h sqrdmulh v2.8h, v2.8h, v4.8h // ((a-b)*-m + 32) >> 6 sqrdmulh v3.8h, v3.8h, v4.8h add v0.8h, v0.8h, v2.8h add v1.8h, v1.8h, v3.8h st1 {v0.d}[0], [x0], #8 st1 {v1.d}[0], [x8], #8 st1 {v0.s}[2], [x0], x1 st1 {v1.s}[2], [x8], x1 b.gt 8b ret 160: AARCH64_VALID_JUMP_TARGET ld1 {v16.16b}, [x5] sub x1, x1, #16 neg v17.16b, v16.16b // -m sxtl v16.8h, v17.8b sxtl2 v17.8h, v17.16b shl v16.8h, v16.8h, #9 // -m << 9 shl v17.4h, v17.4h, #9 16: ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64 ld1 {v0.8h, v1.8h}, [x0] subs w4, w4, #2 ld1 {v2.8h, v3.8h}, [x8] sub v4.8h, v0.8h, v4.8h // a - b sub v5.4h, v1.4h, v5.4h sub v6.8h, v2.8h, v6.8h sub v7.4h, v3.4h, v7.4h sqrdmulh v4.8h, v4.8h, v16.8h // ((a-b)*-m + 32) >> 6 sqrdmulh v5.4h, v5.4h, v17.4h sqrdmulh v6.8h, v6.8h, v16.8h sqrdmulh v7.4h, v7.4h, v17.4h add v0.8h, v0.8h, v4.8h add v1.4h, v1.4h, v5.4h add v2.8h, v2.8h, v6.8h add v3.4h, v3.4h, v7.4h st1 {v0.8h}, [x0], #16 st1 {v2.8h}, [x8], #16 st1 {v1.4h}, [x0], x1 st1 {v3.4h}, [x8], x1 b.gt 16b ret 320: AARCH64_VALID_JUMP_TARGET ld1 {v24.16b, v25.16b}, [x5] neg v26.16b, v24.16b // -m neg v27.8b, v25.8b sxtl v24.8h, v26.8b sxtl2 v25.8h, v26.16b sxtl v26.8h, v27.8b shl v24.8h, v24.8h, #9 // -m << 9 shl v25.8h, v25.8h, #9 shl v26.8h, v26.8h, #9 32: ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x2], #64 ld1 {v0.8h, v1.8h, v2.8h}, [x0] ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x2], #64 ld1 {v4.8h, v5.8h, v6.8h}, [x8] subs w4, w4, #2 sub v16.8h, v0.8h, v16.8h // a - b sub v17.8h, v1.8h, v17.8h sub v18.8h, v2.8h, v18.8h sub v20.8h, v4.8h, v20.8h sub v21.8h, v5.8h, v21.8h sub v22.8h, v6.8h, v22.8h sqrdmulh v16.8h, v16.8h, v24.8h // ((a-b)*-m + 32) >> 6 sqrdmulh v17.8h, v17.8h, v25.8h sqrdmulh v18.8h, v18.8h, v26.8h sqrdmulh v20.8h, v20.8h, v24.8h sqrdmulh v21.8h, v21.8h, v25.8h sqrdmulh v22.8h, v22.8h, v26.8h add v0.8h, v0.8h, v16.8h add v1.8h, v1.8h, v17.8h add v2.8h, v2.8h, v18.8h add v4.8h, v4.8h, v20.8h add v5.8h, v5.8h, v21.8h add v6.8h, v6.8h, v22.8h st1 {v0.8h, v1.8h, v2.8h}, [x0], x1 st1 {v4.8h, v5.8h, v6.8h}, [x8], x1 b.gt 32b ret L(blend_v_tbl): .hword L(blend_v_tbl) - 320b .hword L(blend_v_tbl) - 160b .hword L(blend_v_tbl) - 80b .hword L(blend_v_tbl) - 40b .hword L(blend_v_tbl) - 20b endfunc // This has got the same signature as the put_8tap functions, // and assumes that x9 is set to (clz(w)-24). 
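// Rough worked example of the table dispatch used below (illustrative only,
// assuming w is one of the supported block widths 2..128): clz(w) - 24 gives
// 6, 5, 4, 3, 2, 1, 0 for w = 2, 4, 8, 16, 32, 64, 128, indexing L(put_tbl)
// from its last entry (2b) back to its first (128b). Each .hword entry stores
// the distance from the table to its target label, so the branch target is
// recovered as table_address - entry. For w == 8:
//   x9  = clz(8) - 24 = 4              // set up by the caller
//   w9  = [L(put_tbl) + 4*2]           // .hword L(put_tbl) - 80b
//   x10 = L(put_tbl) - w9              // address of the 80: (8xN) loop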
function put_neon adr x10, L(put_tbl) ldrh w9, [x10, x9, lsl #1] sub x10, x10, w9, uxtw br x10 2: AARCH64_VALID_JUMP_TARGET ld1 {v0.s}[0], [x2], x3 ld1 {v1.s}[0], [x2], x3 subs w5, w5, #2 st1 {v0.s}[0], [x0], x1 st1 {v1.s}[0], [x0], x1 b.gt 2b ret 4: AARCH64_VALID_JUMP_TARGET ld1 {v0.4h}, [x2], x3 ld1 {v1.4h}, [x2], x3 subs w5, w5, #2 st1 {v0.4h}, [x0], x1 st1 {v1.4h}, [x0], x1 b.gt 4b ret 80: AARCH64_VALID_JUMP_TARGET add x8, x0, x1 lsl x1, x1, #1 add x9, x2, x3 lsl x3, x3, #1 8: ld1 {v0.8h}, [x2], x3 ld1 {v1.8h}, [x9], x3 subs w5, w5, #2 st1 {v0.8h}, [x0], x1 st1 {v1.8h}, [x8], x1 b.gt 8b ret 16: AARCH64_VALID_JUMP_TARGET ldp x6, x7, [x2] ldp x8, x9, [x2, #16] stp x6, x7, [x0] subs w5, w5, #1 stp x8, x9, [x0, #16] add x2, x2, x3 add x0, x0, x1 b.gt 16b ret 32: AARCH64_VALID_JUMP_TARGET ldp x6, x7, [x2] ldp x8, x9, [x2, #16] stp x6, x7, [x0] ldp x10, x11, [x2, #32] stp x8, x9, [x0, #16] subs w5, w5, #1 ldp x12, x13, [x2, #48] stp x10, x11, [x0, #32] stp x12, x13, [x0, #48] add x2, x2, x3 add x0, x0, x1 b.gt 32b ret 64: AARCH64_VALID_JUMP_TARGET ldp q0, q1, [x2] ldp q2, q3, [x2, #32] stp q0, q1, [x0] ldp q4, q5, [x2, #64] stp q2, q3, [x0, #32] ldp q6, q7, [x2, #96] subs w5, w5, #1 stp q4, q5, [x0, #64] stp q6, q7, [x0, #96] add x2, x2, x3 add x0, x0, x1 b.gt 64b ret 128: AARCH64_VALID_JUMP_TARGET ldp q0, q1, [x2] ldp q2, q3, [x2, #32] stp q0, q1, [x0] ldp q4, q5, [x2, #64] stp q2, q3, [x0, #32] ldp q6, q7, [x2, #96] subs w5, w5, #1 stp q4, q5, [x0, #64] ldp q16, q17, [x2, #128] stp q6, q7, [x0, #96] ldp q18, q19, [x2, #160] stp q16, q17, [x0, #128] ldp q20, q21, [x2, #192] stp q18, q19, [x0, #160] ldp q22, q23, [x2, #224] stp q20, q21, [x0, #192] stp q22, q23, [x0, #224] add x2, x2, x3 add x0, x0, x1 b.gt 128b ret L(put_tbl): .hword L(put_tbl) - 128b .hword L(put_tbl) - 64b .hword L(put_tbl) - 32b .hword L(put_tbl) - 16b .hword L(put_tbl) - 80b .hword L(put_tbl) - 4b .hword L(put_tbl) - 2b endfunc // This has got the same signature as the prep_8tap functions, // and assumes that x9 is set to (clz(w)-24), w7 to intermediate_bits and // x8 to w*2. 
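// Rough worked example of the prep scaling below (10-bit input assumed purely
// for illustration): with bitdepth_max = 1023 the caller sets
// w7 = intermediate_bits = clz(1023) - 18 = 4, so each loaded pixel p is
// stored as (p << 4) - PREP_BIAS, matching the
//   sshl v.8h, v.8h, v31.8h            // p << intermediate_bits
//   sub  v.8h, v.8h, v30.8h            // - PREP_BIAS
// pair repeated in every block-size branch. The movi #(PREP_BIAS >> 8), lsl #8
// rebuilds the PREP_BIAS constant (defined earlier in this file) from its high
// byte, which assumes PREP_BIAS is a multiple of 256.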
function prep_neon adr x10, L(prep_tbl) ldrh w9, [x10, x9, lsl #1] dup v31.8h, w7 // intermediate_bits movi v30.8h, #(PREP_BIAS >> 8), lsl #8 sub x10, x10, w9, uxtw br x10 40: AARCH64_VALID_JUMP_TARGET add x9, x1, x2 lsl x2, x2, #1 4: ld1 {v0.d}[0], [x1], x2 ld1 {v0.d}[1], [x9], x2 subs w4, w4, #2 sshl v0.8h, v0.8h, v31.8h sub v0.8h, v0.8h, v30.8h st1 {v0.8h}, [x0], #16 b.gt 4b ret 80: AARCH64_VALID_JUMP_TARGET add x9, x1, x2 lsl x2, x2, #1 8: ld1 {v0.8h}, [x1], x2 ld1 {v1.8h}, [x9], x2 subs w4, w4, #2 sshl v0.8h, v0.8h, v31.8h sshl v1.8h, v1.8h, v31.8h sub v0.8h, v0.8h, v30.8h sub v1.8h, v1.8h, v30.8h st1 {v0.8h, v1.8h}, [x0], #32 b.gt 8b ret 16: AARCH64_VALID_JUMP_TARGET ldp q0, q1, [x1] add x1, x1, x2 sshl v0.8h, v0.8h, v31.8h ldp q2, q3, [x1] add x1, x1, x2 subs w4, w4, #2 sshl v1.8h, v1.8h, v31.8h sshl v2.8h, v2.8h, v31.8h sshl v3.8h, v3.8h, v31.8h sub v0.8h, v0.8h, v30.8h sub v1.8h, v1.8h, v30.8h sub v2.8h, v2.8h, v30.8h sub v3.8h, v3.8h, v30.8h st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 b.gt 16b ret 32: AARCH64_VALID_JUMP_TARGET ldp q0, q1, [x1] sshl v0.8h, v0.8h, v31.8h ldp q2, q3, [x1, #32] add x1, x1, x2 sshl v1.8h, v1.8h, v31.8h sshl v2.8h, v2.8h, v31.8h sshl v3.8h, v3.8h, v31.8h subs w4, w4, #1 sub v0.8h, v0.8h, v30.8h sub v1.8h, v1.8h, v30.8h sub v2.8h, v2.8h, v30.8h sub v3.8h, v3.8h, v30.8h st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 b.gt 32b ret 64: AARCH64_VALID_JUMP_TARGET ldp q0, q1, [x1] subs w4, w4, #1 sshl v0.8h, v0.8h, v31.8h ldp q2, q3, [x1, #32] sshl v1.8h, v1.8h, v31.8h ldp q4, q5, [x1, #64] sshl v2.8h, v2.8h, v31.8h sshl v3.8h, v3.8h, v31.8h ldp q6, q7, [x1, #96] add x1, x1, x2 sshl v4.8h, v4.8h, v31.8h sshl v5.8h, v5.8h, v31.8h sshl v6.8h, v6.8h, v31.8h sshl v7.8h, v7.8h, v31.8h sub v0.8h, v0.8h, v30.8h sub v1.8h, v1.8h, v30.8h sub v2.8h, v2.8h, v30.8h sub v3.8h, v3.8h, v30.8h stp q0, q1, [x0] sub v4.8h, v4.8h, v30.8h sub v5.8h, v5.8h, v30.8h stp q2, q3, [x0, #32] sub v6.8h, v6.8h, v30.8h sub v7.8h, v7.8h, v30.8h stp q4, q5, [x0, #64] stp q6, q7, [x0, #96] add x0, x0, x8 b.gt 64b ret 128: AARCH64_VALID_JUMP_TARGET ldp q0, q1, [x1] subs w4, w4, #1 sshl v0.8h, v0.8h, v31.8h ldp q2, q3, [x1, #32] sshl v1.8h, v1.8h, v31.8h ldp q4, q5, [x1, #64] sshl v2.8h, v2.8h, v31.8h sshl v3.8h, v3.8h, v31.8h ldp q6, q7, [x1, #96] sshl v4.8h, v4.8h, v31.8h sshl v5.8h, v5.8h, v31.8h ldp q16, q17, [x1, #128] sshl v6.8h, v6.8h, v31.8h sshl v7.8h, v7.8h, v31.8h ldp q18, q19, [x1, #160] sshl v16.8h, v16.8h, v31.8h sshl v17.8h, v17.8h, v31.8h ldp q20, q21, [x1, #192] sshl v18.8h, v18.8h, v31.8h sshl v19.8h, v19.8h, v31.8h ldp q22, q23, [x1, #224] add x1, x1, x2 sshl v20.8h, v20.8h, v31.8h sshl v21.8h, v21.8h, v31.8h sshl v22.8h, v22.8h, v31.8h sshl v23.8h, v23.8h, v31.8h sub v0.8h, v0.8h, v30.8h sub v1.8h, v1.8h, v30.8h sub v2.8h, v2.8h, v30.8h sub v3.8h, v3.8h, v30.8h stp q0, q1, [x0] sub v4.8h, v4.8h, v30.8h sub v5.8h, v5.8h, v30.8h stp q2, q3, [x0, #32] sub v6.8h, v6.8h, v30.8h sub v7.8h, v7.8h, v30.8h stp q4, q5, [x0, #64] sub v16.8h, v16.8h, v30.8h sub v17.8h, v17.8h, v30.8h stp q6, q7, [x0, #96] sub v18.8h, v18.8h, v30.8h sub v19.8h, v19.8h, v30.8h stp q16, q17, [x0, #128] sub v20.8h, v20.8h, v30.8h sub v21.8h, v21.8h, v30.8h stp q18, q19, [x0, #160] sub v22.8h, v22.8h, v30.8h sub v23.8h, v23.8h, v30.8h stp q20, q21, [x0, #192] stp q22, q23, [x0, #224] add x0, x0, x8 b.gt 128b ret L(prep_tbl): .hword L(prep_tbl) - 128b .hword L(prep_tbl) - 64b .hword L(prep_tbl) - 32b .hword L(prep_tbl) - 16b .hword L(prep_tbl) - 80b .hword L(prep_tbl) - 40b endfunc .macro load_slice s0, s1, 
strd, wd, d0, d1, d2, d3, d4, d5, d6 ld1 {\d0\wd}[0], [\s0], \strd ld1 {\d1\wd}[0], [\s1], \strd .ifnb \d2 ld1 {\d2\wd}[0], [\s0], \strd ld1 {\d3\wd}[0], [\s1], \strd .endif .ifnb \d4 ld1 {\d4\wd}[0], [\s0], \strd .endif .ifnb \d5 ld1 {\d5\wd}[0], [\s1], \strd .endif .ifnb \d6 ld1 {\d6\wd}[0], [\s0], \strd .endif .endm .macro load_reg s0, s1, strd, wd, d0, d1, d2, d3, d4, d5, d6 ld1 {\d0\wd}, [\s0], \strd ld1 {\d1\wd}, [\s1], \strd .ifnb \d2 ld1 {\d2\wd}, [\s0], \strd ld1 {\d3\wd}, [\s1], \strd .endif .ifnb \d4 ld1 {\d4\wd}, [\s0], \strd .endif .ifnb \d5 ld1 {\d5\wd}, [\s1], \strd .endif .ifnb \d6 ld1 {\d6\wd}, [\s0], \strd .endif .endm .macro load_regpair s0, s1, strd, wd, d0, d1, d2, d3, d4, d5 ld1 {\d0\wd, \d1\wd}, [\s0], \strd .ifnb \d2 ld1 {\d2\wd, \d3\wd}, [\s1], \strd .endif .ifnb \d4 ld1 {\d4\wd, \d5\wd}, [\s0], \strd .endif .endm .macro load_s s0, s1, strd, d0, d1, d2, d3, d4, d5, d6 load_slice \s0, \s1, \strd, .s, \d0, \d1, \d2, \d3, \d4, \d5, \d6 .endm .macro load_4h s0, s1, strd, d0, d1, d2, d3, d4, d5, d6 load_reg \s0, \s1, \strd, .4h, \d0, \d1, \d2, \d3, \d4, \d5, \d6 .endm .macro load_8h s0, s1, strd, d0, d1, d2, d3, d4, d5, d6 load_reg \s0, \s1, \strd, .8h, \d0, \d1, \d2, \d3, \d4, \d5, \d6 .endm .macro load_16h s0, s1, strd, d0, d1, d2, d3, d4, d5 load_regpair \s0, \s1, \strd, .8h, \d0, \d1, \d2, \d3, \d4, \d5 .endm .macro interleave_1 wd, r0, r1, r2, r3, r4 trn1 \r0\wd, \r0\wd, \r1\wd trn1 \r1\wd, \r1\wd, \r2\wd .ifnb \r3 trn1 \r2\wd, \r2\wd, \r3\wd trn1 \r3\wd, \r3\wd, \r4\wd .endif .endm .macro interleave_1_s r0, r1, r2, r3, r4 interleave_1 .2s, \r0, \r1, \r2, \r3, \r4 .endm .macro umin_h c, wd, r0, r1, r2, r3 umin \r0\wd, \r0\wd, \c\wd .ifnb \r1 umin \r1\wd, \r1\wd, \c\wd .endif .ifnb \r2 umin \r2\wd, \r2\wd, \c\wd umin \r3\wd, \r3\wd, \c\wd .endif .endm .macro sub_h c, wd, r0, r1, r2, r3 sub \r0\wd, \r0\wd, \c\wd .ifnb \r1 sub \r1\wd, \r1\wd, \c\wd .endif .ifnb \r2 sub \r2\wd, \r2\wd, \c\wd sub \r3\wd, \r3\wd, \c\wd .endif .endm .macro smull_smlal_4 d, s0, s1, s2, s3 smull \d\().4s, \s0\().4h, v0.h[0] smlal \d\().4s, \s1\().4h, v0.h[1] smlal \d\().4s, \s2\().4h, v0.h[2] smlal \d\().4s, \s3\().4h, v0.h[3] .endm .macro smull2_smlal2_4 d, s0, s1, s2, s3 smull2 \d\().4s, \s0\().8h, v0.h[0] smlal2 \d\().4s, \s1\().8h, v0.h[1] smlal2 \d\().4s, \s2\().8h, v0.h[2] smlal2 \d\().4s, \s3\().8h, v0.h[3] .endm .macro smull_smlal_8 d, s0, s1, s2, s3, s4, s5, s6, s7 smull \d\().4s, \s0\().4h, v0.h[0] smlal \d\().4s, \s1\().4h, v0.h[1] smlal \d\().4s, \s2\().4h, v0.h[2] smlal \d\().4s, \s3\().4h, v0.h[3] smlal \d\().4s, \s4\().4h, v0.h[4] smlal \d\().4s, \s5\().4h, v0.h[5] smlal \d\().4s, \s6\().4h, v0.h[6] smlal \d\().4s, \s7\().4h, v0.h[7] .endm .macro smull2_smlal2_8 d, s0, s1, s2, s3, s4, s5, s6, s7 smull2 \d\().4s, \s0\().8h, v0.h[0] smlal2 \d\().4s, \s1\().8h, v0.h[1] smlal2 \d\().4s, \s2\().8h, v0.h[2] smlal2 \d\().4s, \s3\().8h, v0.h[3] smlal2 \d\().4s, \s4\().8h, v0.h[4] smlal2 \d\().4s, \s5\().8h, v0.h[5] smlal2 \d\().4s, \s6\().8h, v0.h[6] smlal2 \d\().4s, \s7\().8h, v0.h[7] .endm .macro sqrshrun_h shift, r0, r1, r2, r3 sqrshrun \r0\().4h, \r0\().4s, #\shift .ifnb \r1 sqrshrun2 \r0\().8h, \r1\().4s, #\shift .endif .ifnb \r2 sqrshrun \r2\().4h, \r2\().4s, #\shift sqrshrun2 \r2\().8h, \r3\().4s, #\shift .endif .endm .macro xtn_h r0, r1, r2, r3 uzp1 \r0\().8h, \r0\().8h, \r1\().8h // Same as xtn, xtn2 .ifnb \r2 uzp1 \r2\().8h, \r2\().8h, \r3\().8h // Ditto .endif .endm .macro srshl_s shift, r0, r1, r2, r3 srshl \r0\().4s, \r0\().4s, \shift\().4s srshl \r1\().4s, \r1\().4s, 
\shift\().4s .ifnb \r2 srshl \r2\().4s, \r2\().4s, \shift\().4s srshl \r3\().4s, \r3\().4s, \shift\().4s .endif .endm .macro st_s strd, reg, lanes st1 {\reg\().s}[0], [x0], \strd st1 {\reg\().s}[1], [x9], \strd .if \lanes > 2 st1 {\reg\().s}[2], [x0], \strd st1 {\reg\().s}[3], [x9], \strd .endif .endm .macro st_d strd, r0, r1 st1 {\r0\().d}[0], [x0], \strd st1 {\r0\().d}[1], [x9], \strd .ifnb \r1 st1 {\r1\().d}[0], [x0], \strd st1 {\r1\().d}[1], [x9], \strd .endif .endm .macro shift_store_4 type, strd, r0, r1, r2, r3 .ifc \type, put sqrshrun_h 6, \r0, \r1, \r2, \r3 umin_h v31, .8h, \r0, \r2 .else srshl_s v30, \r0, \r1, \r2, \r3 // -(6-intermediate_bits) xtn_h \r0, \r1, \r2, \r3 sub_h v29, .8h, \r0, \r2 // PREP_BIAS .endif st_d \strd, \r0, \r2 .endm .macro st_reg strd, wd, r0, r1, r2, r3, r4, r5, r6, r7 st1 {\r0\wd}, [x0], \strd st1 {\r1\wd}, [x9], \strd .ifnb \r2 st1 {\r2\wd}, [x0], \strd st1 {\r3\wd}, [x9], \strd .endif .ifnb \r4 st1 {\r4\wd}, [x0], \strd st1 {\r5\wd}, [x9], \strd st1 {\r6\wd}, [x0], \strd st1 {\r7\wd}, [x9], \strd .endif .endm .macro st_8h strd, r0, r1, r2, r3, r4, r5, r6, r7 st_reg \strd, .8h, \r0, \r1, \r2, \r3, \r4, \r5, \r6, \r7 .endm .macro shift_store_8 type, strd, r0, r1, r2, r3 .ifc \type, put sqrshrun_h 6, \r0, \r1, \r2, \r3 umin_h v31, .8h, \r0, \r2 .else srshl_s v30, \r0, \r1, \r2, \r3 // -(6-intermediate_bits) xtn_h \r0, \r1, \r2, \r3 sub_h v29, .8h, \r0, \r2 // PREP_BIAS .endif st_8h \strd, \r0, \r2 .endm .macro shift_store_16 type, strd, dst, r0, r1, r2, r3 .ifc \type, put sqrshrun_h 6, \r0, \r1, \r2, \r3 umin \r0\().8h, \r0\().8h, v31.8h umin \r1\().8h, \r2\().8h, v31.8h .else srshl_s v30, \r0, \r1, \r2, \r3 // -(6-intermediate_bits) xtn_h \r0, \r1, \r2, \r3 sub \r0\().8h, \r0\().8h, v29.8h sub \r1\().8h, \r2\().8h, v29.8h .endif st1 {\r0\().8h, \r1\().8h}, [\dst], \strd .endm .macro make_8tap_fn op, type, type_h, type_v function \op\()_8tap_\type\()_16bpc_neon, export=1 mov w9, \type_h mov w10, \type_v b \op\()_8tap_neon endfunc .endm // No spaces in these expressions, due to gas-preprocessor. 
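// A rough reading of the encoding below, based on the 0x4081 multiply and the
// ubfx/and extraction in \type\()_8tap_neon (not authoritative): each constant
// packs two base offsets into X(mc_subpel_filters), both multiples of 15 (each
// filter set appears to hold 15 entries, one per non-zero subpel fraction).
// The field shifted left by 7 selects the 8-tap set used for larger blocks,
// the low field the 4-tap set used when the filtered dimension is <= 4. The
// fraction mx/my is replicated into bits 0, 7 and 14 by the 0x4081 multiply,
// so after adding one of these constants a single register holds:
//   bits 14+  : mx             (tst #(0x7f << 14) - any filtering needed?)
//   bits 7-13 : mx + 8tap set  (ubfx #7, #7)
//   bits 0-6  : mx + 4tap set  (and #0x7f)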
#define REGULAR ((0*15<<7)|3*15) #define SMOOTH ((1*15<<7)|4*15) #define SHARP ((2*15<<7)|3*15) .macro filter_fn type, dst, d_strd, src, s_strd, w, h, mx, xmx, my, xmy, bdmax, ds2, sr2 make_8tap_fn \type, regular, REGULAR, REGULAR make_8tap_fn \type, regular_smooth, REGULAR, SMOOTH make_8tap_fn \type, regular_sharp, REGULAR, SHARP make_8tap_fn \type, smooth, SMOOTH, SMOOTH make_8tap_fn \type, smooth_regular, SMOOTH, REGULAR make_8tap_fn \type, smooth_sharp, SMOOTH, SHARP make_8tap_fn \type, sharp, SHARP, SHARP make_8tap_fn \type, sharp_regular, SHARP, REGULAR make_8tap_fn \type, sharp_smooth, SHARP, SMOOTH function \type\()_8tap_neon .ifc \bdmax, w8 ldr w8, [sp] .endif mov w11, #0x4081 // (1 << 14) | (1 << 7) | (1 << 0) mul \mx, \mx, w11 mul \my, \my, w11 add \mx, \mx, w9 // mx, 8tap_h, 4tap_h add \my, \my, w10 // my, 8tap_v, 4tap_v .ifc \type, prep uxtw \d_strd, \w lsl \d_strd, \d_strd, #1 .endif dup v31.8h, \bdmax // bitdepth_max clz \bdmax, \bdmax clz w9, \w sub \bdmax, \bdmax, #18 // intermediate_bits = clz(bitdepth_max) - 18 mov w12, #6 tst \mx, #(0x7f << 14) sub w9, w9, #24 add w13, w12, \bdmax // 6 + intermediate_bits sub w12, w12, \bdmax // 6 - intermediate_bits movrel x11, X(mc_subpel_filters), -8 b.ne L(\type\()_8tap_h) tst \my, #(0x7f << 14) b.ne L(\type\()_8tap_v) b \type\()_neon L(\type\()_8tap_h): cmp \w, #4 ubfx w10, \mx, #7, #7 and \mx, \mx, #0x7f b.le 4f mov \mx, w10 4: tst \my, #(0x7f << 14) add \xmx, x11, \mx, uxtw #3 b.ne L(\type\()_8tap_hv) adr x10, L(\type\()_8tap_h_tbl) dup v30.4s, w12 // 6 - intermediate_bits ldrh w9, [x10, x9, lsl #1] neg v30.4s, v30.4s // -(6-intermediate_bits) .ifc \type, put dup v29.8h, \bdmax // intermediate_bits .else movi v28.8h, #(PREP_BIAS >> 8), lsl #8 .endif sub x10, x10, w9, uxtw .ifc \type, put neg v29.8h, v29.8h // -intermediate_bits .endif br x10 20: // 2xN h AARCH64_VALID_JUMP_TARGET .ifc \type, put add \xmx, \xmx, #2 ld1 {v0.s}[0], [\xmx] sub \src, \src, #2 add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \d_strd, \d_strd, #1 lsl \s_strd, \s_strd, #1 sxtl v0.8h, v0.8b 2: ld1 {v4.8h}, [\src], \s_strd ld1 {v6.8h}, [\sr2], \s_strd ext v5.16b, v4.16b, v4.16b, #2 ext v7.16b, v6.16b, v6.16b, #2 subs \h, \h, #2 trn1 v3.2s, v4.2s, v6.2s trn2 v6.2s, v4.2s, v6.2s trn1 v4.2s, v5.2s, v7.2s trn2 v7.2s, v5.2s, v7.2s smull v3.4s, v3.4h, v0.h[0] smlal v3.4s, v4.4h, v0.h[1] smlal v3.4s, v6.4h, v0.h[2] smlal v3.4s, v7.4h, v0.h[3] srshl v3.4s, v3.4s, v30.4s // -(6-intermediate_bits) sqxtun v3.4h, v3.4s srshl v3.4h, v3.4h, v29.4h // -intermediate_bits umin v3.4h, v3.4h, v31.4h st1 {v3.s}[0], [\dst], \d_strd st1 {v3.s}[1], [\ds2], \d_strd b.gt 2b ret .endif 40: // 4xN h AARCH64_VALID_JUMP_TARGET add \xmx, \xmx, #2 ld1 {v0.s}[0], [\xmx] sub \src, \src, #2 add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \d_strd, \d_strd, #1 lsl \s_strd, \s_strd, #1 sxtl v0.8h, v0.8b 4: ld1 {v16.8h}, [\src], \s_strd ld1 {v20.8h}, [\sr2], \s_strd ext v17.16b, v16.16b, v16.16b, #2 ext v18.16b, v16.16b, v16.16b, #4 ext v19.16b, v16.16b, v16.16b, #6 ext v21.16b, v20.16b, v20.16b, #2 ext v22.16b, v20.16b, v20.16b, #4 ext v23.16b, v20.16b, v20.16b, #6 subs \h, \h, #2 smull v16.4s, v16.4h, v0.h[0] smlal v16.4s, v17.4h, v0.h[1] smlal v16.4s, v18.4h, v0.h[2] smlal v16.4s, v19.4h, v0.h[3] smull v20.4s, v20.4h, v0.h[0] smlal v20.4s, v21.4h, v0.h[1] smlal v20.4s, v22.4h, v0.h[2] smlal v20.4s, v23.4h, v0.h[3] srshl v16.4s, v16.4s, v30.4s // -(6-intermediate_bits) srshl v20.4s, v20.4s, v30.4s // -(6-intermediate_bits) .ifc \type, put sqxtun v16.4h, v16.4s sqxtun2 v16.8h, v20.4s 
srshl v16.8h, v16.8h, v29.8h // -intermediate_bits umin v16.8h, v16.8h, v31.8h .else uzp1 v16.8h, v16.8h, v20.8h // Same as xtn, xtn2 sub v16.8h, v16.8h, v28.8h // PREP_BIAS .endif st1 {v16.d}[0], [\dst], \d_strd st1 {v16.d}[1], [\ds2], \d_strd b.gt 4b ret 80: 160: 320: 640: 1280: // 8xN, 16xN, 32xN, ... h AARCH64_VALID_JUMP_TARGET ld1 {v0.8b}, [\xmx] sub \src, \src, #6 add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \s_strd, \s_strd, #1 sxtl v0.8h, v0.8b sub \s_strd, \s_strd, \w, uxtw #1 sub \s_strd, \s_strd, #16 .ifc \type, put lsl \d_strd, \d_strd, #1 sub \d_strd, \d_strd, \w, uxtw #1 .endif 81: ld1 {v16.8h, v17.8h}, [\src], #32 ld1 {v20.8h, v21.8h}, [\sr2], #32 mov \mx, \w 8: smull v18.4s, v16.4h, v0.h[0] smull2 v19.4s, v16.8h, v0.h[0] smull v22.4s, v20.4h, v0.h[0] smull2 v23.4s, v20.8h, v0.h[0] .irpc i, 1234567 ext v24.16b, v16.16b, v17.16b, #(2*\i) ext v25.16b, v20.16b, v21.16b, #(2*\i) smlal v18.4s, v24.4h, v0.h[\i] smlal2 v19.4s, v24.8h, v0.h[\i] smlal v22.4s, v25.4h, v0.h[\i] smlal2 v23.4s, v25.8h, v0.h[\i] .endr subs \mx, \mx, #8 srshl v18.4s, v18.4s, v30.4s // -(6-intermediate_bits) srshl v19.4s, v19.4s, v30.4s // -(6-intermediate_bits) srshl v22.4s, v22.4s, v30.4s // -(6-intermediate_bits) srshl v23.4s, v23.4s, v30.4s // -(6-intermediate_bits) .ifc \type, put sqxtun v18.4h, v18.4s sqxtun2 v18.8h, v19.4s sqxtun v22.4h, v22.4s sqxtun2 v22.8h, v23.4s srshl v18.8h, v18.8h, v29.8h // -intermediate_bits srshl v22.8h, v22.8h, v29.8h // -intermediate_bits umin v18.8h, v18.8h, v31.8h umin v22.8h, v22.8h, v31.8h .else uzp1 v18.8h, v18.8h, v19.8h // Same as xtn, xtn2 uzp1 v22.8h, v22.8h, v23.8h // Ditto sub v18.8h, v18.8h, v28.8h // PREP_BIAS sub v22.8h, v22.8h, v28.8h // PREP_BIAS .endif st1 {v18.8h}, [\dst], #16 st1 {v22.8h}, [\ds2], #16 b.le 9f mov v16.16b, v17.16b mov v20.16b, v21.16b ld1 {v17.8h}, [\src], #16 ld1 {v21.8h}, [\sr2], #16 b 8b 9: add \dst, \dst, \d_strd add \ds2, \ds2, \d_strd add \src, \src, \s_strd add \sr2, \sr2, \s_strd subs \h, \h, #2 b.gt 81b ret L(\type\()_8tap_h_tbl): .hword L(\type\()_8tap_h_tbl) - 1280b .hword L(\type\()_8tap_h_tbl) - 640b .hword L(\type\()_8tap_h_tbl) - 320b .hword L(\type\()_8tap_h_tbl) - 160b .hword L(\type\()_8tap_h_tbl) - 80b .hword L(\type\()_8tap_h_tbl) - 40b .hword L(\type\()_8tap_h_tbl) - 20b .hword 0 L(\type\()_8tap_v): cmp \h, #4 ubfx w10, \my, #7, #7 and \my, \my, #0x7f b.le 4f mov \my, w10 4: add \xmy, x11, \my, uxtw #3 .ifc \type, prep dup v30.4s, w12 // 6 - intermediate_bits movi v29.8h, #(PREP_BIAS >> 8), lsl #8 .endif adr x10, L(\type\()_8tap_v_tbl) ldrh w9, [x10, x9, lsl #1] .ifc \type, prep neg v30.4s, v30.4s // -(6-intermediate_bits) .endif sub x10, x10, w9, uxtw br x10 20: // 2xN v AARCH64_VALID_JUMP_TARGET .ifc \type, put b.gt 28f cmp \h, #2 add \xmy, \xmy, #2 ld1 {v0.s}[0], [\xmy] sub \src, \src, \s_strd add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 sxtl v0.8h, v0.8b // 2x2 v load_s \src, \sr2, \s_strd, v1, v2, v3, v4, v5 interleave_1_s v1, v2, v3, v4, v5 b.gt 24f smull_smlal_4 v6, v1, v2, v3, v4 sqrshrun_h 6, v6 umin_h v31, .8h, v6 st_s \d_strd, v6, 2 ret 24: // 2x4 v load_s \sr2, \src, \s_strd, v6, v7 interleave_1_s v5, v6, v7 smull_smlal_4 v16, v1, v2, v3, v4 smull_smlal_4 v17, v3, v4, v5, v6 sqrshrun_h 6, v16, v17 umin_h v31, .8h, v16 st_s \d_strd, v16, 4 ret 28: // 2x6, 2x8, 2x12, 2x16 v ld1 {v0.8b}, [\xmy] sub \sr2, \src, \s_strd, lsl #1 add \ds2, \dst, \d_strd sub \src, \sr2, \s_strd lsl \d_strd, \d_strd, #1 lsl \s_strd, \s_strd, #1 sxtl v0.8h, v0.8b load_s 
\src, \sr2, \s_strd, v1, v2, v3, v4, v5, v6, v7 interleave_1_s v1, v2, v3, v4, v5 interleave_1_s v5, v6, v7 216: subs \h, \h, #4 load_s \sr2, \src, \s_strd, v16, v17, v18, v19 interleave_1_s v7, v16, v17, v18, v19 smull_smlal_8 v24, v1, v2, v3, v4, v5, v6, v7, v16 smull_smlal_8 v25, v3, v4, v5, v6, v7, v16, v17, v18 sqrshrun_h 6, v24, v25 umin_h v31, .8h, v24 st_s \d_strd, v24, 4 b.le 0f cmp \h, #2 mov v1.16b, v5.16b mov v2.16b, v6.16b mov v3.16b, v7.16b mov v4.16b, v16.16b mov v5.16b, v17.16b mov v6.16b, v18.16b mov v7.16b, v19.16b b.eq 26f b 216b 26: load_s \sr2, \src, \s_strd, v16, v17 interleave_1_s v7, v16, v17 smull_smlal_8 v24, v1, v2, v3, v4, v5, v6, v7, v16 sqrshrun_h 6, v24 umin_h v31, .4h, v24 st_s \d_strd, v24, 2 0: ret .endif 40: AARCH64_VALID_JUMP_TARGET b.gt 480f // 4x2, 4x4 v cmp \h, #2 add \xmy, \xmy, #2 ld1 {v0.s}[0], [\xmy] sub \src, \src, \s_strd add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 sxtl v0.8h, v0.8b load_4h \src, \sr2, \s_strd, v1, v2, v3, v4, v5 smull_smlal_4 v6, v1, v2, v3, v4 smull_smlal_4 v7, v2, v3, v4, v5 shift_store_4 \type, \d_strd, v6, v7 b.le 0f load_4h \sr2, \src, \s_strd, v6, v7 smull_smlal_4 v1, v3, v4, v5, v6 smull_smlal_4 v2, v4, v5, v6, v7 shift_store_4 \type, \d_strd, v1, v2 0: ret 480: // 4x6, 4x8, 4x12, 4x16 v ld1 {v0.8b}, [\xmy] sub \sr2, \src, \s_strd, lsl #1 add \ds2, \dst, \d_strd sub \src, \sr2, \s_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 sxtl v0.8h, v0.8b load_4h \src, \sr2, \s_strd, v16, v17, v18, v19, v20, v21, v22 48: subs \h, \h, #4 load_4h \sr2, \src, \s_strd, v23, v24, v25, v26 smull_smlal_8 v1, v16, v17, v18, v19, v20, v21, v22, v23 smull_smlal_8 v2, v17, v18, v19, v20, v21, v22, v23, v24 smull_smlal_8 v3, v18, v19, v20, v21, v22, v23, v24, v25 smull_smlal_8 v4, v19, v20, v21, v22, v23, v24, v25, v26 shift_store_4 \type, \d_strd, v1, v2, v3, v4 b.le 0f cmp \h, #2 mov v16.8b, v20.8b mov v17.8b, v21.8b mov v18.8b, v22.8b mov v19.8b, v23.8b mov v20.8b, v24.8b mov v21.8b, v25.8b mov v22.8b, v26.8b b.eq 46f b 48b 46: load_4h \sr2, \src, \s_strd, v23, v24 smull_smlal_8 v1, v16, v17, v18, v19, v20, v21, v22, v23 smull_smlal_8 v2, v17, v18, v19, v20, v21, v22, v23, v24 shift_store_4 \type, \d_strd, v1, v2 0: ret 80: AARCH64_VALID_JUMP_TARGET b.gt 880f // 8x2, 8x4 v cmp \h, #2 add \xmy, \xmy, #2 ld1 {v0.s}[0], [\xmy] sub \src, \src, \s_strd add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 sxtl v0.8h, v0.8b load_8h \src, \sr2, \s_strd, v1, v2, v3, v4, v5 smull_smlal_4 v16, v1, v2, v3, v4 smull2_smlal2_4 v17, v1, v2, v3, v4 smull_smlal_4 v18, v2, v3, v4, v5 smull2_smlal2_4 v19, v2, v3, v4, v5 shift_store_8 \type, \d_strd, v16, v17, v18, v19 b.le 0f load_8h \sr2, \src, \s_strd, v6, v7 smull_smlal_4 v16, v3, v4, v5, v6 smull2_smlal2_4 v17, v3, v4, v5, v6 smull_smlal_4 v18, v4, v5, v6, v7 smull2_smlal2_4 v19, v4, v5, v6, v7 shift_store_8 \type, \d_strd, v16, v17, v18, v19 0: ret 880: // 8x6, 8x8, 8x16, 8x32 v 1680: // 16x8, 16x16, ... 320: // 32x8, 32x16, ... 
640: 1280: AARCH64_VALID_JUMP_TARGET ld1 {v0.8b}, [\xmy] sub \src, \src, \s_strd sub \src, \src, \s_strd, lsl #1 sxtl v0.8h, v0.8b mov \my, \h 168: add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 load_8h \src, \sr2, \s_strd, v16, v17, v18, v19, v20, v21, v22 88: subs \h, \h, #2 load_8h \sr2, \src, \s_strd, v23, v24 smull_smlal_8 v1, v16, v17, v18, v19, v20, v21, v22, v23 smull2_smlal2_8 v2, v16, v17, v18, v19, v20, v21, v22, v23 smull_smlal_8 v3, v17, v18, v19, v20, v21, v22, v23, v24 smull2_smlal2_8 v4, v17, v18, v19, v20, v21, v22, v23, v24 shift_store_8 \type, \d_strd, v1, v2, v3, v4 b.le 9f subs \h, \h, #2 load_8h \sr2, \src, \s_strd, v25, v26 smull_smlal_8 v1, v18, v19, v20, v21, v22, v23, v24, v25 smull2_smlal2_8 v2, v18, v19, v20, v21, v22, v23, v24, v25 smull_smlal_8 v3, v19, v20, v21, v22, v23, v24, v25, v26 smull2_smlal2_8 v4, v19, v20, v21, v22, v23, v24, v25, v26 shift_store_8 \type, \d_strd, v1, v2, v3, v4 b.le 9f mov v16.16b, v20.16b mov v17.16b, v21.16b mov v18.16b, v22.16b mov v19.16b, v23.16b mov v20.16b, v24.16b mov v21.16b, v25.16b mov v22.16b, v26.16b b 88b 9: subs \w, \w, #8 b.le 0f asr \s_strd, \s_strd, #1 asr \d_strd, \d_strd, #1 msub \src, \s_strd, \xmy, \src msub \dst, \d_strd, \xmy, \dst sub \src, \src, \s_strd, lsl #3 mov \h, \my add \src, \src, #16 add \dst, \dst, #16 b 168b 0: ret 160: AARCH64_VALID_JUMP_TARGET b.gt 1680b // 16x2, 16x4 v add \xmy, \xmy, #2 ld1 {v0.s}[0], [\xmy] sub \src, \src, \s_strd sxtl v0.8h, v0.8b load_16h \src, \src, \s_strd, v16, v17, v18, v19, v20, v21 16: load_16h \src, \src, \s_strd, v22, v23 subs \h, \h, #1 smull_smlal_4 v1, v16, v18, v20, v22 smull2_smlal2_4 v2, v16, v18, v20, v22 smull_smlal_4 v3, v17, v19, v21, v23 smull2_smlal2_4 v4, v17, v19, v21, v23 shift_store_16 \type, \d_strd, x0, v1, v2, v3, v4 b.le 0f mov v16.16b, v18.16b mov v17.16b, v19.16b mov v18.16b, v20.16b mov v19.16b, v21.16b mov v20.16b, v22.16b mov v21.16b, v23.16b b 16b 0: ret L(\type\()_8tap_v_tbl): .hword L(\type\()_8tap_v_tbl) - 1280b .hword L(\type\()_8tap_v_tbl) - 640b .hword L(\type\()_8tap_v_tbl) - 320b .hword L(\type\()_8tap_v_tbl) - 160b .hword L(\type\()_8tap_v_tbl) - 80b .hword L(\type\()_8tap_v_tbl) - 40b .hword L(\type\()_8tap_v_tbl) - 20b .hword 0 L(\type\()_8tap_hv): cmp \h, #4 ubfx w10, \my, #7, #7 and \my, \my, #0x7f b.le 4f mov \my, w10 4: add \xmy, x11, \my, uxtw #3 adr x10, L(\type\()_8tap_hv_tbl) dup v30.4s, w12 // 6 - intermediate_bits ldrh w9, [x10, x9, lsl #1] neg v30.4s, v30.4s // -(6-intermediate_bits) .ifc \type, put dup v29.4s, w13 // 6 + intermediate_bits .else movi v29.8h, #(PREP_BIAS >> 8), lsl #8 .endif sub x10, x10, w9, uxtw .ifc \type, put neg v29.4s, v29.4s // -(6+intermediate_bits) .endif br x10 20: AARCH64_VALID_JUMP_TARGET .ifc \type, put add \xmx, \xmx, #2 ld1 {v0.s}[0], [\xmx] b.gt 280f add \xmy, \xmy, #2 ld1 {v1.s}[0], [\xmy] // 2x2, 2x4 hv sub \sr2, \src, #2 sub \src, \sr2, \s_strd add \ds2, \dst, \d_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 sxtl v0.8h, v0.8b sxtl v1.8h, v1.8b mov x15, x30 ld1 {v27.8h}, [\src], \s_strd ext v28.16b, v27.16b, v27.16b, #2 smull v27.4s, v27.4h, v0.4h smull v28.4s, v28.4h, v0.4h addp v27.4s, v27.4s, v28.4s addp v16.4s, v27.4s, v27.4s srshl v16.2s, v16.2s, v30.2s // -(6-intermediate_bits) bl L(\type\()_8tap_filter_2) // The intermediates from the horizontal pass fit in 16 bit without // any bias; we could just as well keep them as .4s, but narrowing // them to .4h gives a significant speedup on out of order cores // (at the cost 
of a smaller slowdown on in-order cores such as A53). xtn v16.4h, v16.4s trn1 v16.2s, v16.2s, v24.2s mov v17.8b, v24.8b 2: bl L(\type\()_8tap_filter_2) ext v18.8b, v17.8b, v24.8b, #4 smull v2.4s, v16.4h, v1.h[0] smlal v2.4s, v17.4h, v1.h[1] smlal v2.4s, v18.4h, v1.h[2] smlal v2.4s, v24.4h, v1.h[3] srshl v2.4s, v2.4s, v29.4s // -(6+intermediate_bits) sqxtun v2.4h, v2.4s umin v2.4h, v2.4h, v31.4h subs \h, \h, #2 st1 {v2.s}[0], [\dst], \d_strd st1 {v2.s}[1], [\ds2], \d_strd b.le 0f mov v16.8b, v18.8b mov v17.8b, v24.8b b 2b 280: // 2x8, 2x16, 2x32 hv ld1 {v1.8b}, [\xmy] sub \src, \src, #2 sub \sr2, \src, \s_strd, lsl #1 sub \src, \sr2, \s_strd add \ds2, \dst, \d_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 sxtl v0.8h, v0.8b sxtl v1.8h, v1.8b mov x15, x30 ld1 {v27.8h}, [\src], \s_strd ext v28.16b, v27.16b, v27.16b, #2 smull v27.4s, v27.4h, v0.4h smull v28.4s, v28.4h, v0.4h addp v27.4s, v27.4s, v28.4s addp v16.4s, v27.4s, v27.4s srshl v16.2s, v16.2s, v30.2s // -(6-intermediate_bits) // The intermediates from the horizontal pass fit in 16 bit without // any bias; we could just as well keep them as .4s, but narrowing // them to .4h gives a significant speedup on out of order cores // (at the cost of a smaller slowdown on in-order cores such as A53). bl L(\type\()_8tap_filter_2) xtn v16.4h, v16.4s trn1 v16.2s, v16.2s, v24.2s mov v17.8b, v24.8b bl L(\type\()_8tap_filter_2) ext v18.8b, v17.8b, v24.8b, #4 mov v19.8b, v24.8b bl L(\type\()_8tap_filter_2) ext v20.8b, v19.8b, v24.8b, #4 mov v21.8b, v24.8b 28: bl L(\type\()_8tap_filter_2) ext v22.8b, v21.8b, v24.8b, #4 smull v3.4s, v16.4h, v1.h[0] smlal v3.4s, v17.4h, v1.h[1] smlal v3.4s, v18.4h, v1.h[2] smlal v3.4s, v19.4h, v1.h[3] smlal v3.4s, v20.4h, v1.h[4] smlal v3.4s, v21.4h, v1.h[5] smlal v3.4s, v22.4h, v1.h[6] smlal v3.4s, v24.4h, v1.h[7] srshl v3.4s, v3.4s, v29.4s // -(6+intermediate_bits) sqxtun v3.4h, v3.4s umin v3.4h, v3.4h, v31.4h subs \h, \h, #2 st1 {v3.s}[0], [\dst], \d_strd st1 {v3.s}[1], [\ds2], \d_strd b.le 0f mov v16.8b, v18.8b mov v17.8b, v19.8b mov v18.8b, v20.8b mov v19.8b, v21.8b mov v20.8b, v22.8b mov v21.8b, v24.8b b 28b 0: ret x15 L(\type\()_8tap_filter_2): ld1 {v25.8h}, [\sr2], \s_strd ld1 {v27.8h}, [\src], \s_strd ext v26.16b, v25.16b, v25.16b, #2 ext v28.16b, v27.16b, v27.16b, #2 trn1 v24.2s, v25.2s, v27.2s trn2 v27.2s, v25.2s, v27.2s trn1 v25.2s, v26.2s, v28.2s trn2 v28.2s, v26.2s, v28.2s smull v24.4s, v24.4h, v0.h[0] smlal v24.4s, v25.4h, v0.h[1] smlal v24.4s, v27.4h, v0.h[2] smlal v24.4s, v28.4h, v0.h[3] srshl v24.4s, v24.4s, v30.4s // -(6-intermediate_bits) xtn v24.4h, v24.4s ret .endif 40: AARCH64_VALID_JUMP_TARGET add \xmx, \xmx, #2 ld1 {v0.s}[0], [\xmx] b.gt 480f add \xmy, \xmy, #2 ld1 {v1.s}[0], [\xmy] sub \sr2, \src, #2 sub \src, \sr2, \s_strd add \ds2, \dst, \d_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 sxtl v0.8h, v0.8b sxtl v1.8h, v1.8b mov x15, x30 // 4x2, 4x4 hv ld1 {v25.8h}, [\src], \s_strd ext v26.16b, v25.16b, v25.16b, #2 ext v27.16b, v25.16b, v25.16b, #4 ext v28.16b, v25.16b, v25.16b, #6 smull v25.4s, v25.4h, v0.h[0] smlal v25.4s, v26.4h, v0.h[1] smlal v25.4s, v27.4h, v0.h[2] smlal v25.4s, v28.4h, v0.h[3] srshl v16.4s, v25.4s, v30.4s // -(6-intermediate_bits) // The intermediates from the horizontal pass fit in 16 bit without // any bias; we could just as well keep them as .4s, but narrowing // them to .4h gives a significant speedup on out of order cores // (at the cost of a smaller slowdown on in-order cores such as A53). 
xtn v16.4h, v16.4s bl L(\type\()_8tap_filter_4) mov v17.8b, v24.8b mov v18.8b, v25.8b 4: bl L(\type\()_8tap_filter_4) smull v2.4s, v16.4h, v1.h[0] smlal v2.4s, v17.4h, v1.h[1] smlal v2.4s, v18.4h, v1.h[2] smlal v2.4s, v24.4h, v1.h[3] smull v3.4s, v17.4h, v1.h[0] smlal v3.4s, v18.4h, v1.h[1] smlal v3.4s, v24.4h, v1.h[2] smlal v3.4s, v25.4h, v1.h[3] .ifc \type, put srshl v2.4s, v2.4s, v29.4s // -(6+intermediate_bits) srshl v3.4s, v3.4s, v29.4s // -(6+intermediate_bits) sqxtun v2.4h, v2.4s sqxtun2 v2.8h, v3.4s umin v2.8h, v2.8h, v31.8h .else rshrn v2.4h, v2.4s, #6 rshrn2 v2.8h, v3.4s, #6 sub v2.8h, v2.8h, v29.8h // PREP_BIAS .endif subs \h, \h, #2 st1 {v2.d}[0], [\dst], \d_strd st1 {v2.d}[1], [\ds2], \d_strd b.le 0f mov v16.8b, v18.8b mov v17.8b, v24.8b mov v18.8b, v25.8b b 4b 480: // 4x8, 4x16, 4x32 hv ld1 {v1.8b}, [\xmy] sub \src, \src, #2 sub \sr2, \src, \s_strd, lsl #1 sub \src, \sr2, \s_strd add \ds2, \dst, \d_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 sxtl v0.8h, v0.8b sxtl v1.8h, v1.8b mov x15, x30 ld1 {v25.8h}, [\src], \s_strd ext v26.16b, v25.16b, v25.16b, #2 ext v27.16b, v25.16b, v25.16b, #4 ext v28.16b, v25.16b, v25.16b, #6 smull v25.4s, v25.4h, v0.h[0] smlal v25.4s, v26.4h, v0.h[1] smlal v25.4s, v27.4h, v0.h[2] smlal v25.4s, v28.4h, v0.h[3] srshl v16.4s, v25.4s, v30.4s // -(6-intermediate_bits) // The intermediates from the horizontal pass fit in 16 bit without // any bias; we could just as well keep them as .4s, but narrowing // them to .4h gives a significant speedup on out of order cores // (at the cost of a smaller slowdown on in-order cores such as A53). xtn v16.4h, v16.4s bl L(\type\()_8tap_filter_4) mov v17.8b, v24.8b mov v18.8b, v25.8b bl L(\type\()_8tap_filter_4) mov v19.8b, v24.8b mov v20.8b, v25.8b bl L(\type\()_8tap_filter_4) mov v21.8b, v24.8b mov v22.8b, v25.8b 48: bl L(\type\()_8tap_filter_4) smull v3.4s, v16.4h, v1.h[0] smlal v3.4s, v17.4h, v1.h[1] smlal v3.4s, v18.4h, v1.h[2] smlal v3.4s, v19.4h, v1.h[3] smlal v3.4s, v20.4h, v1.h[4] smlal v3.4s, v21.4h, v1.h[5] smlal v3.4s, v22.4h, v1.h[6] smlal v3.4s, v24.4h, v1.h[7] smull v4.4s, v17.4h, v1.h[0] smlal v4.4s, v18.4h, v1.h[1] smlal v4.4s, v19.4h, v1.h[2] smlal v4.4s, v20.4h, v1.h[3] smlal v4.4s, v21.4h, v1.h[4] smlal v4.4s, v22.4h, v1.h[5] smlal v4.4s, v24.4h, v1.h[6] smlal v4.4s, v25.4h, v1.h[7] .ifc \type, put srshl v3.4s, v3.4s, v29.4s // -(6+intermediate_bits) srshl v4.4s, v4.4s, v29.4s // -(6+intermediate_bits) sqxtun v3.4h, v3.4s sqxtun2 v3.8h, v4.4s umin v3.8h, v3.8h, v31.8h .else rshrn v3.4h, v3.4s, #6 rshrn2 v3.8h, v4.4s, #6 sub v3.8h, v3.8h, v29.8h // PREP_BIAS .endif subs \h, \h, #2 st1 {v3.d}[0], [\dst], \d_strd st1 {v3.d}[1], [\ds2], \d_strd b.le 0f mov v16.8b, v18.8b mov v17.8b, v19.8b mov v18.8b, v20.8b mov v19.8b, v21.8b mov v20.8b, v22.8b mov v21.8b, v24.8b mov v22.8b, v25.8b b 48b 0: ret x15 L(\type\()_8tap_filter_4): ld1 {v24.8h}, [\sr2], \s_strd ld1 {v25.8h}, [\src], \s_strd ext v26.16b, v24.16b, v24.16b, #2 ext v27.16b, v24.16b, v24.16b, #4 ext v28.16b, v24.16b, v24.16b, #6 smull v24.4s, v24.4h, v0.h[0] smlal v24.4s, v26.4h, v0.h[1] smlal v24.4s, v27.4h, v0.h[2] smlal v24.4s, v28.4h, v0.h[3] ext v26.16b, v25.16b, v25.16b, #2 ext v27.16b, v25.16b, v25.16b, #4 ext v28.16b, v25.16b, v25.16b, #6 smull v25.4s, v25.4h, v0.h[0] smlal v25.4s, v26.4h, v0.h[1] smlal v25.4s, v27.4h, v0.h[2] smlal v25.4s, v28.4h, v0.h[3] srshl v24.4s, v24.4s, v30.4s // -(6-intermediate_bits) srshl v25.4s, v25.4s, v30.4s // -(6-intermediate_bits) xtn v24.4h, v24.4s xtn v25.4h, v25.4s ret 80: 160: 320: 
AARCH64_VALID_JUMP_TARGET b.gt 880f add \xmy, \xmy, #2 ld1 {v0.8b}, [\xmx] ld1 {v1.s}[0], [\xmy] sub \src, \src, #6 sub \src, \src, \s_strd sxtl v0.8h, v0.8b sxtl v1.8h, v1.8b mov x15, x30 mov \my, \h 164: // 8x2, 8x4, 16x2, 16x4, 32x2, 32x4 hv add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \d_strd, \d_strd, #1 lsl \s_strd, \s_strd, #1 ld1 {v27.8h, v28.8h}, [\src], \s_strd smull v24.4s, v27.4h, v0.h[0] smull2 v25.4s, v27.8h, v0.h[0] .irpc i, 1234567 ext v26.16b, v27.16b, v28.16b, #(2*\i) smlal v24.4s, v26.4h, v0.h[\i] smlal2 v25.4s, v26.8h, v0.h[\i] .endr srshl v24.4s, v24.4s, v30.4s // -(6-intermediate_bits) srshl v25.4s, v25.4s, v30.4s // -(6-intermediate_bits) // The intermediates from the horizontal pass fit in 16 bit without // any bias; we could just as well keep them as .4s, but narrowing // them to .4h gives a significant speedup on out of order cores // (at the cost of a smaller slowdown on in-order cores such as A53), // and conserves register space (no need to clobber v8-v15). uzp1 v16.8h, v24.8h, v25.8h // Same as xtn, xtn2 bl L(\type\()_8tap_filter_8) mov v17.16b, v23.16b mov v18.16b, v24.16b 8: smull v2.4s, v16.4h, v1.h[0] smull2 v3.4s, v16.8h, v1.h[0] bl L(\type\()_8tap_filter_8) smull v4.4s, v17.4h, v1.h[0] smull2 v5.4s, v17.8h, v1.h[0] smlal v2.4s, v17.4h, v1.h[1] smlal2 v3.4s, v17.8h, v1.h[1] smlal v4.4s, v18.4h, v1.h[1] smlal2 v5.4s, v18.8h, v1.h[1] smlal v2.4s, v18.4h, v1.h[2] smlal2 v3.4s, v18.8h, v1.h[2] smlal v4.4s, v23.4h, v1.h[2] smlal2 v5.4s, v23.8h, v1.h[2] smlal v2.4s, v23.4h, v1.h[3] smlal2 v3.4s, v23.8h, v1.h[3] smlal v4.4s, v24.4h, v1.h[3] smlal2 v5.4s, v24.8h, v1.h[3] .ifc \type, put srshl v2.4s, v2.4s, v29.4s // -(6+intermediate_bits) srshl v3.4s, v3.4s, v29.4s // -(6+intermediate_bits) srshl v4.4s, v4.4s, v29.4s // -(6+intermediate_bits) srshl v5.4s, v5.4s, v29.4s // -(6+intermediate_bits) sqxtun v2.4h, v2.4s sqxtun2 v2.8h, v3.4s sqxtun v3.4h, v4.4s sqxtun2 v3.8h, v5.4s umin v2.8h, v2.8h, v31.8h umin v3.8h, v3.8h, v31.8h .else rshrn v2.4h, v2.4s, #6 rshrn2 v2.8h, v3.4s, #6 rshrn v3.4h, v4.4s, #6 rshrn2 v3.8h, v5.4s, #6 sub v2.8h, v2.8h, v29.8h // PREP_BIAS sub v3.8h, v3.8h, v29.8h // PREP_BIAS .endif subs \h, \h, #2 st1 {v2.8h}, [\dst], \d_strd st1 {v3.8h}, [\ds2], \d_strd b.le 9f mov v16.16b, v18.16b mov v17.16b, v23.16b mov v18.16b, v24.16b b 8b 9: subs \w, \w, #8 b.le 0f asr \s_strd, \s_strd, #1 asr \d_strd, \d_strd, #1 msub \src, \s_strd, \xmy, \src msub \dst, \d_strd, \xmy, \dst sub \src, \src, \s_strd, lsl #2 mov \h, \my add \src, \src, #16 add \dst, \dst, #16 b 164b 880: // 8x8, 8x16, ..., 16x8, ..., 32x8, ... 
hv 640: 1280: AARCH64_VALID_JUMP_TARGET ld1 {v0.8b}, [\xmx] ld1 {v1.8b}, [\xmy] sub \src, \src, #6 sub \src, \src, \s_strd sub \src, \src, \s_strd, lsl #1 sxtl v0.8h, v0.8b sxtl v1.8h, v1.8b mov x15, x30 mov \my, \h 168: add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \d_strd, \d_strd, #1 lsl \s_strd, \s_strd, #1 ld1 {v27.8h, v28.8h}, [\src], \s_strd smull v24.4s, v27.4h, v0.h[0] smull2 v25.4s, v27.8h, v0.h[0] .irpc i, 1234567 ext v26.16b, v27.16b, v28.16b, #(2*\i) smlal v24.4s, v26.4h, v0.h[\i] smlal2 v25.4s, v26.8h, v0.h[\i] .endr srshl v24.4s, v24.4s, v30.4s // -(6-intermediate_bits) srshl v25.4s, v25.4s, v30.4s // -(6-intermediate_bits) // The intermediates from the horizontal pass fit in 16 bit without // any bias; we could just as well keep them as .4s, but narrowing // them to .4h gives a significant speedup on out of order cores // (at the cost of a smaller slowdown on in-order cores such as A53), // and conserves register space (no need to clobber v8-v15). uzp1 v16.8h, v24.8h, v25.8h // Same as xtn, xtn2 bl L(\type\()_8tap_filter_8) mov v17.16b, v23.16b mov v18.16b, v24.16b bl L(\type\()_8tap_filter_8) mov v19.16b, v23.16b mov v20.16b, v24.16b bl L(\type\()_8tap_filter_8) mov v21.16b, v23.16b mov v22.16b, v24.16b 88: smull v2.4s, v16.4h, v1.h[0] smull2 v3.4s, v16.8h, v1.h[0] bl L(\type\()_8tap_filter_8) smull v4.4s, v17.4h, v1.h[0] smull2 v5.4s, v17.8h, v1.h[0] smlal v2.4s, v17.4h, v1.h[1] smlal2 v3.4s, v17.8h, v1.h[1] smlal v4.4s, v18.4h, v1.h[1] smlal2 v5.4s, v18.8h, v1.h[1] smlal v2.4s, v18.4h, v1.h[2] smlal2 v3.4s, v18.8h, v1.h[2] smlal v4.4s, v19.4h, v1.h[2] smlal2 v5.4s, v19.8h, v1.h[2] smlal v2.4s, v19.4h, v1.h[3] smlal2 v3.4s, v19.8h, v1.h[3] smlal v4.4s, v20.4h, v1.h[3] smlal2 v5.4s, v20.8h, v1.h[3] smlal v2.4s, v20.4h, v1.h[4] smlal2 v3.4s, v20.8h, v1.h[4] smlal v4.4s, v21.4h, v1.h[4] smlal2 v5.4s, v21.8h, v1.h[4] smlal v2.4s, v21.4h, v1.h[5] smlal2 v3.4s, v21.8h, v1.h[5] smlal v4.4s, v22.4h, v1.h[5] smlal2 v5.4s, v22.8h, v1.h[5] smlal v2.4s, v22.4h, v1.h[6] smlal2 v3.4s, v22.8h, v1.h[6] smlal v4.4s, v23.4h, v1.h[6] smlal2 v5.4s, v23.8h, v1.h[6] smlal v2.4s, v23.4h, v1.h[7] smlal2 v3.4s, v23.8h, v1.h[7] smlal v4.4s, v24.4h, v1.h[7] smlal2 v5.4s, v24.8h, v1.h[7] .ifc \type, put srshl v2.4s, v2.4s, v29.4s // -(6+intermediate_bits) srshl v3.4s, v3.4s, v29.4s // -(6+intermediate_bits) srshl v4.4s, v4.4s, v29.4s // -(6+intermediate_bits) srshl v5.4s, v5.4s, v29.4s // -(6+intermediate_bits) sqxtun v2.4h, v2.4s sqxtun2 v2.8h, v3.4s sqxtun v3.4h, v4.4s sqxtun2 v3.8h, v5.4s umin v2.8h, v2.8h, v31.8h umin v3.8h, v3.8h, v31.8h .else rshrn v2.4h, v2.4s, #6 rshrn2 v2.8h, v3.4s, #6 rshrn v3.4h, v4.4s, #6 rshrn2 v3.8h, v5.4s, #6 sub v2.8h, v2.8h, v29.8h // PREP_BIAS sub v3.8h, v3.8h, v29.8h // PREP_BIAS .endif subs \h, \h, #2 st1 {v2.8h}, [\dst], \d_strd st1 {v3.8h}, [\ds2], \d_strd b.le 9f mov v16.16b, v18.16b mov v17.16b, v19.16b mov v18.16b, v20.16b mov v19.16b, v21.16b mov v20.16b, v22.16b mov v21.16b, v23.16b mov v22.16b, v24.16b b 88b 9: subs \w, \w, #8 b.le 0f asr \s_strd, \s_strd, #1 asr \d_strd, \d_strd, #1 msub \src, \s_strd, \xmy, \src msub \dst, \d_strd, \xmy, \dst sub \src, \src, \s_strd, lsl #3 mov \h, \my add \src, \src, #16 add \dst, \dst, #16 b 168b 0: ret x15 L(\type\()_8tap_filter_8): ld1 {v4.8h, v5.8h}, [\sr2], \s_strd ld1 {v6.8h, v7.8h}, [\src], \s_strd smull v25.4s, v4.4h, v0.h[0] smull2 v26.4s, v4.8h, v0.h[0] smull v27.4s, v6.4h, v0.h[0] smull2 v28.4s, v6.8h, v0.h[0] .irpc i, 1234567 ext v23.16b, v4.16b, v5.16b, #(2*\i) ext v24.16b, v6.16b, v7.16b, #(2*\i) 
smlal v25.4s, v23.4h, v0.h[\i] smlal2 v26.4s, v23.8h, v0.h[\i] smlal v27.4s, v24.4h, v0.h[\i] smlal2 v28.4s, v24.8h, v0.h[\i] .endr srshl v25.4s, v25.4s, v30.4s // -(6-intermediate_bits) srshl v26.4s, v26.4s, v30.4s // -(6-intermediate_bits) srshl v27.4s, v27.4s, v30.4s // -(6-intermediate_bits) srshl v28.4s, v28.4s, v30.4s // -(6-intermediate_bits) uzp1 v23.8h, v25.8h, v26.8h // Same as xtn, xtn2 uzp1 v24.8h, v27.8h, v28.8h // Ditto ret L(\type\()_8tap_hv_tbl): .hword L(\type\()_8tap_hv_tbl) - 1280b .hword L(\type\()_8tap_hv_tbl) - 640b .hword L(\type\()_8tap_hv_tbl) - 320b .hword L(\type\()_8tap_hv_tbl) - 160b .hword L(\type\()_8tap_hv_tbl) - 80b .hword L(\type\()_8tap_hv_tbl) - 40b .hword L(\type\()_8tap_hv_tbl) - 20b .hword 0 endfunc function \type\()_bilin_16bpc_neon, export=1 .ifc \bdmax, w8 ldr w8, [sp] .endif dup v1.8h, \mx dup v3.8h, \my mov w10, #16 sub w9, w10, \mx sub w10, w10, \my dup v0.8h, w9 dup v2.8h, w10 .ifc \type, prep uxtw \d_strd, \w lsl \d_strd, \d_strd, #1 .endif clz \bdmax, \bdmax // bitdepth_max clz w9, \w sub \bdmax, \bdmax, #18 // intermediate_bits = clz(bitdepth_max) - 18 mov w11, #4 sub w9, w9, #24 sub w11, w11, \bdmax // 4 - intermediate_bits add w12, \bdmax, #4 // 4 + intermediate_bits cbnz \mx, L(\type\()_bilin_h) cbnz \my, L(\type\()_bilin_v) b \type\()_neon L(\type\()_bilin_h): cbnz \my, L(\type\()_bilin_hv) adr x10, L(\type\()_bilin_h_tbl) dup v31.8h, w11 // 4 - intermediate_bits ldrh w9, [x10, x9, lsl #1] neg v31.8h, v31.8h // -(4-intermediate_bits) .ifc \type, put dup v30.8h, \bdmax // intermediate_bits .else movi v29.8h, #(PREP_BIAS >> 8), lsl #8 .endif sub x10, x10, w9, uxtw .ifc \type, put neg v30.8h, v30.8h // -intermediate_bits .endif br x10 20: // 2xN h AARCH64_VALID_JUMP_TARGET .ifc \type, put add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \d_strd, \d_strd, #1 lsl \s_strd, \s_strd, #1 2: ld1 {v4.4h}, [\src], \s_strd ld1 {v6.4h}, [\sr2], \s_strd ext v5.8b, v4.8b, v4.8b, #2 ext v7.8b, v6.8b, v6.8b, #2 trn1 v4.2s, v4.2s, v6.2s trn1 v5.2s, v5.2s, v7.2s subs \h, \h, #2 mul v4.4h, v4.4h, v0.4h mla v4.4h, v5.4h, v1.4h urshl v4.4h, v4.4h, v31.4h urshl v4.4h, v4.4h, v30.4h st1 {v4.s}[0], [\dst], \d_strd st1 {v4.s}[1], [\ds2], \d_strd b.gt 2b ret .endif 40: // 4xN h AARCH64_VALID_JUMP_TARGET add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \d_strd, \d_strd, #1 lsl \s_strd, \s_strd, #1 4: ld1 {v4.8h}, [\src], \s_strd ld1 {v6.8h}, [\sr2], \s_strd ext v5.16b, v4.16b, v4.16b, #2 ext v7.16b, v6.16b, v6.16b, #2 trn1 v4.2d, v4.2d, v6.2d trn1 v5.2d, v5.2d, v7.2d subs \h, \h, #2 mul v4.8h, v4.8h, v0.8h mla v4.8h, v5.8h, v1.8h urshl v4.8h, v4.8h, v31.8h .ifc \type, put urshl v4.8h, v4.8h, v30.8h .else sub v4.8h, v4.8h, v29.8h .endif st1 {v4.d}[0], [\dst], \d_strd st1 {v4.d}[1], [\ds2], \d_strd b.gt 4b ret 80: // 8xN h AARCH64_VALID_JUMP_TARGET add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \d_strd, \d_strd, #1 lsl \s_strd, \s_strd, #1 8: ldr h5, [\src, #16] ldr h7, [\sr2, #16] ld1 {v4.8h}, [\src], \s_strd ld1 {v6.8h}, [\sr2], \s_strd ext v5.16b, v4.16b, v5.16b, #2 ext v7.16b, v6.16b, v7.16b, #2 subs \h, \h, #2 mul v4.8h, v4.8h, v0.8h mla v4.8h, v5.8h, v1.8h mul v6.8h, v6.8h, v0.8h mla v6.8h, v7.8h, v1.8h urshl v4.8h, v4.8h, v31.8h urshl v6.8h, v6.8h, v31.8h .ifc \type, put urshl v4.8h, v4.8h, v30.8h urshl v6.8h, v6.8h, v30.8h .else sub v4.8h, v4.8h, v29.8h sub v6.8h, v6.8h, v29.8h .endif st1 {v4.8h}, [\dst], \d_strd st1 {v6.8h}, [\ds2], \d_strd b.gt 8b ret 160: 320: 640: 1280: // 16xN, 32xN, ... 
h AARCH64_VALID_JUMP_TARGET add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \s_strd, \s_strd, #1 sub \s_strd, \s_strd, \w, uxtw #1 sub \s_strd, \s_strd, #16 .ifc \type, put lsl \d_strd, \d_strd, #1 sub \d_strd, \d_strd, \w, uxtw #1 .endif 161: ld1 {v16.8h}, [\src], #16 ld1 {v21.8h}, [\sr2], #16 mov \mx, \w 16: ld1 {v17.8h, v18.8h}, [\src], #32 ld1 {v22.8h, v23.8h}, [\sr2], #32 ext v19.16b, v16.16b, v17.16b, #2 ext v20.16b, v17.16b, v18.16b, #2 ext v24.16b, v21.16b, v22.16b, #2 ext v25.16b, v22.16b, v23.16b, #2 mul v16.8h, v16.8h, v0.8h mla v16.8h, v19.8h, v1.8h mul v17.8h, v17.8h, v0.8h mla v17.8h, v20.8h, v1.8h mul v21.8h, v21.8h, v0.8h mla v21.8h, v24.8h, v1.8h mul v22.8h, v22.8h, v0.8h mla v22.8h, v25.8h, v1.8h urshl v16.8h, v16.8h, v31.8h urshl v17.8h, v17.8h, v31.8h urshl v21.8h, v21.8h, v31.8h urshl v22.8h, v22.8h, v31.8h subs \mx, \mx, #16 .ifc \type, put urshl v16.8h, v16.8h, v30.8h urshl v17.8h, v17.8h, v30.8h urshl v21.8h, v21.8h, v30.8h urshl v22.8h, v22.8h, v30.8h .else sub v16.8h, v16.8h, v29.8h sub v17.8h, v17.8h, v29.8h sub v21.8h, v21.8h, v29.8h sub v22.8h, v22.8h, v29.8h .endif st1 {v16.8h, v17.8h}, [\dst], #32 st1 {v21.8h, v22.8h}, [\ds2], #32 b.le 9f mov v16.16b, v18.16b mov v21.16b, v23.16b b 16b 9: add \dst, \dst, \d_strd add \ds2, \ds2, \d_strd add \src, \src, \s_strd add \sr2, \sr2, \s_strd subs \h, \h, #2 b.gt 161b ret L(\type\()_bilin_h_tbl): .hword L(\type\()_bilin_h_tbl) - 1280b .hword L(\type\()_bilin_h_tbl) - 640b .hword L(\type\()_bilin_h_tbl) - 320b .hword L(\type\()_bilin_h_tbl) - 160b .hword L(\type\()_bilin_h_tbl) - 80b .hword L(\type\()_bilin_h_tbl) - 40b .hword L(\type\()_bilin_h_tbl) - 20b .hword 0 L(\type\()_bilin_v): cmp \h, #4 adr x10, L(\type\()_bilin_v_tbl) .ifc \type, prep dup v31.8h, w11 // 4 - intermediate_bits .endif ldrh w9, [x10, x9, lsl #1] .ifc \type, prep movi v29.8h, #(PREP_BIAS >> 8), lsl #8 neg v31.8h, v31.8h // -(4-intermediate_bits) .endif sub x10, x10, w9, uxtw br x10 20: // 2xN v AARCH64_VALID_JUMP_TARGET .ifc \type, put cmp \h, #2 add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 // 2x2 v ld1 {v16.s}[0], [\src], \s_strd b.gt 24f 22: ld1 {v17.s}[0], [\sr2], \s_strd ld1 {v18.s}[0], [\src], \s_strd trn1 v16.2s, v16.2s, v17.2s trn1 v17.2s, v17.2s, v18.2s mul v4.4h, v16.4h, v2.4h mla v4.4h, v17.4h, v3.4h urshr v4.8h, v4.8h, #4 st1 {v4.s}[0], [\dst] st1 {v4.s}[1], [\ds2] ret 24: // 2x4, 2x6, 2x8, ... 
v ld1 {v17.s}[0], [\sr2], \s_strd ld1 {v18.s}[0], [\src], \s_strd ld1 {v19.s}[0], [\sr2], \s_strd ld1 {v20.s}[0], [\src], \s_strd sub \h, \h, #4 trn1 v16.2s, v16.2s, v17.2s trn1 v17.2s, v17.2s, v18.2s trn1 v18.2s, v18.2s, v19.2s trn1 v19.2s, v19.2s, v20.2s trn1 v16.2d, v16.2d, v18.2d trn1 v17.2d, v17.2d, v19.2d mul v4.8h, v16.8h, v2.8h mla v4.8h, v17.8h, v3.8h cmp \h, #2 urshr v4.8h, v4.8h, #4 st1 {v4.s}[0], [\dst], \d_strd st1 {v4.s}[1], [\ds2], \d_strd st1 {v4.s}[2], [\dst], \d_strd st1 {v4.s}[3], [\ds2], \d_strd b.lt 0f mov v16.8b, v20.8b b.eq 22b b 24b 0: ret .endif 40: // 4xN v AARCH64_VALID_JUMP_TARGET add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 ld1 {v16.4h}, [\src], \s_strd 4: ld1 {v17.4h}, [\sr2], \s_strd ld1 {v18.4h}, [\src], \s_strd trn1 v16.2d, v16.2d, v17.2d trn1 v17.2d, v17.2d, v18.2d mul v4.8h, v16.8h, v2.8h mla v4.8h, v17.8h, v3.8h subs \h, \h, #2 .ifc \type, put urshr v4.8h, v4.8h, #4 .else urshl v4.8h, v4.8h, v31.8h sub v4.8h, v4.8h, v29.8h .endif st1 {v4.d}[0], [\dst], \d_strd st1 {v4.d}[1], [\ds2], \d_strd b.le 0f mov v16.8b, v18.8b b 4b 0: ret 80: // 8xN v AARCH64_VALID_JUMP_TARGET add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 ld1 {v16.8h}, [\src], \s_strd 8: ld1 {v17.8h}, [\sr2], \s_strd ld1 {v18.8h}, [\src], \s_strd mul v4.8h, v16.8h, v2.8h mla v4.8h, v17.8h, v3.8h mul v5.8h, v17.8h, v2.8h mla v5.8h, v18.8h, v3.8h subs \h, \h, #2 .ifc \type, put urshr v4.8h, v4.8h, #4 urshr v5.8h, v5.8h, #4 .else urshl v4.8h, v4.8h, v31.8h urshl v5.8h, v5.8h, v31.8h sub v4.8h, v4.8h, v29.8h sub v5.8h, v5.8h, v29.8h .endif st1 {v4.8h}, [\dst], \d_strd st1 {v5.8h}, [\ds2], \d_strd b.le 0f mov v16.16b, v18.16b b 8b 0: ret 160: // 16xN, 32xN, ... 
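// Wide bilinear vertical path: the frame is processed in 16-pixel-wide column strips, two rows per iteration; the epilogue at 9: below steps to the next strip.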
320: 640: 1280: AARCH64_VALID_JUMP_TARGET mov \my, \h 1: add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 ld1 {v16.8h, v17.8h}, [\src], \s_strd 2: ld1 {v18.8h, v19.8h}, [\sr2], \s_strd ld1 {v20.8h, v21.8h}, [\src], \s_strd mul v4.8h, v16.8h, v2.8h mla v4.8h, v18.8h, v3.8h mul v5.8h, v17.8h, v2.8h mla v5.8h, v19.8h, v3.8h mul v6.8h, v18.8h, v2.8h mla v6.8h, v20.8h, v3.8h mul v7.8h, v19.8h, v2.8h mla v7.8h, v21.8h, v3.8h subs \h, \h, #2 .ifc \type, put urshr v4.8h, v4.8h, #4 urshr v5.8h, v5.8h, #4 urshr v6.8h, v6.8h, #4 urshr v7.8h, v7.8h, #4 .else urshl v4.8h, v4.8h, v31.8h urshl v5.8h, v5.8h, v31.8h urshl v6.8h, v6.8h, v31.8h urshl v7.8h, v7.8h, v31.8h sub v4.8h, v4.8h, v29.8h sub v5.8h, v5.8h, v29.8h sub v6.8h, v6.8h, v29.8h sub v7.8h, v7.8h, v29.8h .endif st1 {v4.8h, v5.8h}, [\dst], \d_strd st1 {v6.8h, v7.8h}, [\ds2], \d_strd b.le 9f mov v16.16b, v20.16b mov v17.16b, v21.16b b 2b 9: subs \w, \w, #16 b.le 0f asr \s_strd, \s_strd, #1 asr \d_strd, \d_strd, #1 msub \src, \s_strd, \xmy, \src msub \dst, \d_strd, \xmy, \dst sub \src, \src, \s_strd, lsl #1 mov \h, \my add \src, \src, #32 add \dst, \dst, #32 b 1b 0: ret L(\type\()_bilin_v_tbl): .hword L(\type\()_bilin_v_tbl) - 1280b .hword L(\type\()_bilin_v_tbl) - 640b .hword L(\type\()_bilin_v_tbl) - 320b .hword L(\type\()_bilin_v_tbl) - 160b .hword L(\type\()_bilin_v_tbl) - 80b .hword L(\type\()_bilin_v_tbl) - 40b .hword L(\type\()_bilin_v_tbl) - 20b .hword 0 L(\type\()_bilin_hv): adr x10, L(\type\()_bilin_hv_tbl) dup v31.8h, w11 // 4 - intermediate_bits ldrh w9, [x10, x9, lsl #1] neg v31.8h, v31.8h // -(4-intermediate_bits) .ifc \type, put dup v30.4s, w12 // 4 + intermediate_bits .else movi v29.8h, #(PREP_BIAS >> 8), lsl #8 .endif sub x10, x10, w9, uxtw .ifc \type, put neg v30.4s, v30.4s // -(4+intermediate_bits) .endif br x10 20: // 2xN hv AARCH64_VALID_JUMP_TARGET .ifc \type, put add \sr2, \src, \s_strd add \ds2, \dst, \d_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 ld1 {v20.4h}, [\src], \s_strd ext v21.8b, v20.8b, v20.8b, #2 mul v16.4h, v20.4h, v0.4h mla v16.4h, v21.4h, v1.4h urshl v16.4h, v16.4h, v31.4h 2: ld1 {v22.4h}, [\sr2], \s_strd ld1 {v24.4h}, [\src], \s_strd ext v23.8b, v22.8b, v22.8b, #2 ext v25.8b, v24.8b, v24.8b, #2 trn1 v22.2s, v22.2s, v24.2s trn1 v23.2s, v23.2s, v25.2s mul v17.4h, v22.4h, v0.4h mla v17.4h, v23.4h, v1.4h urshl v17.4h, v17.4h, v31.4h trn1 v16.2s, v16.2s, v17.2s umull v4.4s, v16.4h, v2.4h umlal v4.4s, v17.4h, v3.4h urshl v4.4s, v4.4s, v30.4s xtn v4.4h, v4.4s subs \h, \h, #2 st1 {v4.s}[0], [\dst], \d_strd st1 {v4.s}[1], [\ds2], \d_strd b.le 0f trn2 v16.2s, v17.2s, v17.2s b 2b 0: ret .endif 40: // 4xN hv AARCH64_VALID_JUMP_TARGET add \sr2, \src, \s_strd add \ds2, \dst, \d_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 ld1 {v20.8h}, [\src], \s_strd ext v21.16b, v20.16b, v20.16b, #2 mul v16.4h, v20.4h, v0.4h mla v16.4h, v21.4h, v1.4h urshl v16.4h, v16.4h, v31.4h 4: ld1 {v22.8h}, [\sr2], \s_strd ld1 {v24.8h}, [\src], \s_strd ext v23.16b, v22.16b, v22.16b, #2 ext v25.16b, v24.16b, v24.16b, #2 trn1 v22.2d, v22.2d, v24.2d trn1 v23.2d, v23.2d, v25.2d mul v17.8h, v22.8h, v0.8h mla v17.8h, v23.8h, v1.8h urshl v17.8h, v17.8h, v31.8h trn1 v16.2d, v16.2d, v17.2d umull v4.4s, v16.4h, v2.4h umlal v4.4s, v17.4h, v3.4h umull2 v5.4s, v16.8h, v2.8h umlal2 v5.4s, v17.8h, v3.8h .ifc \type, put urshl v4.4s, v4.4s, v30.4s urshl v5.4s, v5.4s, v30.4s uzp1 v4.8h, v4.8h, v5.8h // Same as xtn, xtn2 .else rshrn v4.4h, v4.4s, #4 rshrn2 v4.8h, v5.4s, #4 sub v4.8h, v4.8h, v29.8h 
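// prep: the sub above subtracts PREP_BIAS (v29) so the intermediate maps into the signed 16-bit prep buffer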
.endif subs \h, \h, #2 st1 {v4.d}[0], [\dst], \d_strd st1 {v4.d}[1], [\ds2], \d_strd b.le 0f trn2 v16.2d, v17.2d, v17.2d b 4b 0: ret 80: // 8xN, 16xN, ... hv 160: 320: 640: 1280: AARCH64_VALID_JUMP_TARGET mov \my, \h 1: add \sr2, \src, \s_strd add \ds2, \dst, \d_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 ldr h21, [\src, #16] ld1 {v20.8h}, [\src], \s_strd ext v21.16b, v20.16b, v21.16b, #2 mul v16.8h, v20.8h, v0.8h mla v16.8h, v21.8h, v1.8h urshl v16.8h, v16.8h, v31.8h 2: ldr h23, [\sr2, #16] ld1 {v22.8h}, [\sr2], \s_strd ldr h25, [\src, #16] ld1 {v24.8h}, [\src], \s_strd ext v23.16b, v22.16b, v23.16b, #2 ext v25.16b, v24.16b, v25.16b, #2 mul v17.8h, v22.8h, v0.8h mla v17.8h, v23.8h, v1.8h mul v18.8h, v24.8h, v0.8h mla v18.8h, v25.8h, v1.8h urshl v17.8h, v17.8h, v31.8h urshl v18.8h, v18.8h, v31.8h umull v4.4s, v16.4h, v2.4h umlal v4.4s, v17.4h, v3.4h umull2 v5.4s, v16.8h, v2.8h umlal2 v5.4s, v17.8h, v3.8h umull v6.4s, v17.4h, v2.4h umlal v6.4s, v18.4h, v3.4h umull2 v7.4s, v17.8h, v2.8h umlal2 v7.4s, v18.8h, v3.8h .ifc \type, put urshl v4.4s, v4.4s, v30.4s urshl v5.4s, v5.4s, v30.4s urshl v6.4s, v6.4s, v30.4s urshl v7.4s, v7.4s, v30.4s uzp1 v4.8h, v4.8h, v5.8h // Same as xtn, xtn2 uzp1 v5.8h, v6.8h, v7.8h // Ditto .else rshrn v4.4h, v4.4s, #4 rshrn2 v4.8h, v5.4s, #4 rshrn v5.4h, v6.4s, #4 rshrn2 v5.8h, v7.4s, #4 sub v4.8h, v4.8h, v29.8h sub v5.8h, v5.8h, v29.8h .endif subs \h, \h, #2 st1 {v4.8h}, [\dst], \d_strd st1 {v5.8h}, [\ds2], \d_strd b.le 9f mov v16.16b, v18.16b b 2b 9: subs \w, \w, #8 b.le 0f asr \s_strd, \s_strd, #1 asr \d_strd, \d_strd, #1 msub \src, \s_strd, \xmy, \src msub \dst, \d_strd, \xmy, \dst sub \src, \src, \s_strd, lsl #1 mov \h, \my add \src, \src, #16 add \dst, \dst, #16 b 1b 0: ret L(\type\()_bilin_hv_tbl): .hword L(\type\()_bilin_hv_tbl) - 1280b .hword L(\type\()_bilin_hv_tbl) - 640b .hword L(\type\()_bilin_hv_tbl) - 320b .hword L(\type\()_bilin_hv_tbl) - 160b .hword L(\type\()_bilin_hv_tbl) - 80b .hword L(\type\()_bilin_hv_tbl) - 40b .hword L(\type\()_bilin_hv_tbl) - 20b .hword 0 endfunc .endm filter_fn put, x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, w8, x9, x10 filter_fn prep, x0, x8, x1, x2, w3, w4, w5, x5, w6, x6, w7, x9, x10 .macro load_filter_row dst, src, inc asr w13, \src, #10 add \src, \src, \inc ldr \dst, [x11, w13, sxtw #3] .endm function warp_filter_horz_neon add w12, w5, #512 ld1 {v16.8h, v17.8h}, [x2], x3 load_filter_row d0, w12, w7 load_filter_row d1, w12, w7 load_filter_row d2, w12, w7 sxtl v0.8h, v0.8b load_filter_row d3, w12, w7 sxtl v1.8h, v1.8b load_filter_row d4, w12, w7 sxtl v2.8h, v2.8b load_filter_row d5, w12, w7 sxtl v3.8h, v3.8b load_filter_row d6, w12, w7 sxtl v4.8h, v4.8b load_filter_row d7, w12, w7 sxtl v5.8h, v5.8b ext v18.16b, v16.16b, v17.16b, #2*1 smull v8.4s, v16.4h, v0.4h smull2 v9.4s, v16.8h, v0.8h sxtl v6.8h, v6.8b ext v19.16b, v16.16b, v17.16b, #2*2 smull v10.4s, v18.4h, v1.4h smull2 v11.4s, v18.8h, v1.8h sxtl v7.8h, v7.8b ext v20.16b, v16.16b, v17.16b, #2*3 smull v0.4s, v19.4h, v2.4h smull2 v1.4s, v19.8h, v2.8h ext v21.16b, v16.16b, v17.16b, #2*4 addp v8.4s, v8.4s, v9.4s smull v2.4s, v20.4h, v3.4h smull2 v3.4s, v20.8h, v3.8h ext v22.16b, v16.16b, v17.16b, #2*5 addp v9.4s, v10.4s, v11.4s smull v10.4s, v21.4h, v4.4h smull2 v11.4s, v21.8h, v4.8h ext v23.16b, v16.16b, v17.16b, #2*6 addp v0.4s, v0.4s, v1.4s smull v18.4s, v22.4h, v5.4h smull2 v19.4s, v22.8h, v5.8h ext v16.16b, v16.16b, v17.16b, #2*7 addp v1.4s, v2.4s, v3.4s addp v2.4s, v10.4s, v11.4s smull v20.4s, v23.4h, v6.4h smull2 v21.4s, v23.8h, v6.8h addp v3.4s, v18.4s, 
v19.4s smull v22.4s, v16.4h, v7.4h smull2 v23.4s, v16.8h, v7.8h addp v4.4s, v20.4s, v21.4s addp v5.4s, v22.4s, v23.4s addp v8.4s, v8.4s, v9.4s addp v0.4s, v0.4s, v1.4s addp v2.4s, v2.4s, v3.4s addp v4.4s, v4.4s, v5.4s addp v16.4s, v8.4s, v0.4s addp v17.4s, v2.4s, v4.4s add w5, w5, w8 srshl v16.4s, v16.4s, v14.4s // -(7 - intermediate_bits) srshl v17.4s, v17.4s, v14.4s // -(7 - intermediate_bits) ret endfunc // void dav1d_warp_affine_8x8_16bpc_neon( // pixel *dst, const ptrdiff_t dst_stride, // const pixel *src, const ptrdiff_t src_stride, // const int16_t *const abcd, int mx, int my, // const int bitdepth_max) .macro warp t function warp_affine_8x8\t\()_16bpc_neon, export=1 stp d8, d9, [sp, #-0x40]! stp d10, d11, [sp, #0x10] stp d12, d13, [sp, #0x20] stp d14, d15, [sp, #0x30] .ifb \t dup v15.8h, w7 // bitdepth_max .else movi v15.8h, #(PREP_BIAS >> 8), lsl #8 .endif clz w7, w7 // intermediate_bits = clz(bitdepth_max) - 18 .ifb \t sub w8, w7, #11 // 7 + intermediate_bits = clz(bitdepth_max) - 18 + 7 .endif sub w7, w7, #25 // -(7 - intermediate_bits) .ifb \t neg w8, w8 // -(7 + intermediate_bits) .endif dup v14.4s, w7 // -(7 - intermediate_bits) .ifb \t dup v13.4s, w8 // -(7 + intermediate_bits) .endif ldr x4, [x4] sbfx x7, x4, #0, #16 sbfx x8, x4, #16, #16 sbfx x9, x4, #32, #16 sbfx x4, x4, #48, #16 mov w10, #8 sub x2, x2, x3, lsl #1 sub x2, x2, x3 sub x2, x2, #6 movrel x11, X(mc_warp_filter), 64*8 mov x15, x30 .ifnb \t lsl x1, x1, #1 .endif bl warp_filter_horz_neon uzp1 v24.8h, v16.8h, v17.8h // Same as xtn, xtn2 bl warp_filter_horz_neon uzp1 v25.8h, v16.8h, v17.8h // Ditto bl warp_filter_horz_neon uzp1 v26.8h, v16.8h, v17.8h // Ditto bl warp_filter_horz_neon uzp1 v27.8h, v16.8h, v17.8h // Ditto bl warp_filter_horz_neon uzp1 v28.8h, v16.8h, v17.8h // Ditto bl warp_filter_horz_neon uzp1 v29.8h, v16.8h, v17.8h // Ditto bl warp_filter_horz_neon uzp1 v30.8h, v16.8h, v17.8h // Ditto 1: add w14, w6, #512 bl warp_filter_horz_neon uzp1 v31.8h, v16.8h, v17.8h // Same as xtn, xtn2 load_filter_row d0, w14, w9 load_filter_row d1, w14, w9 load_filter_row d2, w14, w9 load_filter_row d3, w14, w9 load_filter_row d4, w14, w9 load_filter_row d5, w14, w9 load_filter_row d6, w14, w9 load_filter_row d7, w14, w9 transpose_8x8b_xtl v0, v1, v2, v3, v4, v5, v6, v7, sxtl // This ordering of smull/smlal/smull2/smlal2 is highly // beneficial for Cortex A53 here. 
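// Vertical pass: v24-v31 hold the eight horizontally filtered rows, and after
// the transpose above v0-v7 hold tap n of each column's vertical filter, so
// every smlal/smlal2 pair below adds one tap across all eight output columns
// (v16 accumulates columns 0-3, v17 columns 4-7).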
smull v16.4s, v24.4h, v0.4h smlal v16.4s, v25.4h, v1.4h smlal v16.4s, v26.4h, v2.4h smlal v16.4s, v27.4h, v3.4h smlal v16.4s, v28.4h, v4.4h smlal v16.4s, v29.4h, v5.4h smlal v16.4s, v30.4h, v6.4h smlal v16.4s, v31.4h, v7.4h smull2 v17.4s, v24.8h, v0.8h smlal2 v17.4s, v25.8h, v1.8h smlal2 v17.4s, v26.8h, v2.8h smlal2 v17.4s, v27.8h, v3.8h smlal2 v17.4s, v28.8h, v4.8h smlal2 v17.4s, v29.8h, v5.8h smlal2 v17.4s, v30.8h, v6.8h smlal2 v17.4s, v31.8h, v7.8h mov v24.16b, v25.16b mov v25.16b, v26.16b .ifb \t srshl v16.4s, v16.4s, v13.4s // -(7 + intermediate_bits) srshl v17.4s, v17.4s, v13.4s // -(7 + intermediate_bits) .else rshrn v16.4h, v16.4s, #7 rshrn2 v16.8h, v17.4s, #7 .endif mov v26.16b, v27.16b .ifb \t sqxtun v16.4h, v16.4s sqxtun2 v16.8h, v17.4s .else sub v16.8h, v16.8h, v15.8h // PREP_BIAS .endif mov v27.16b, v28.16b mov v28.16b, v29.16b .ifb \t umin v16.8h, v16.8h, v15.8h // bitdepth_max .endif mov v29.16b, v30.16b mov v30.16b, v31.16b subs w10, w10, #1 st1 {v16.8h}, [x0], x1 add w6, w6, w4 b.gt 1b ldp d14, d15, [sp, #0x30] ldp d12, d13, [sp, #0x20] ldp d10, d11, [sp, #0x10] ldp d8, d9, [sp], 0x40 ret x15 endfunc .endm warp warp t // void dav1d_emu_edge_16bpc_neon( // const intptr_t bw, const intptr_t bh, // const intptr_t iw, const intptr_t ih, // const intptr_t x, const intptr_t y, // pixel *dst, const ptrdiff_t dst_stride, // const pixel *ref, const ptrdiff_t ref_stride) function emu_edge_16bpc_neon, export=1 ldp x8, x9, [sp] // ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride) // ref += iclip(x, 0, iw - 1) sub x12, x3, #1 // ih - 1 cmp x5, x3 sub x13, x2, #1 // iw - 1 csel x12, x12, x5, ge // min(y, ih - 1) cmp x4, x2 bic x12, x12, x12, asr #63 // max(min(y, ih - 1), 0) csel x13, x13, x4, ge // min(x, iw - 1) bic x13, x13, x13, asr #63 // max(min(x, iw - 1), 0) madd x8, x12, x9, x8 // ref += iclip() * stride add x8, x8, x13, lsl #1 // ref += iclip() // bottom_ext = iclip(y + bh - ih, 0, bh - 1) // top_ext = iclip(-y, 0, bh - 1) add x10, x5, x1 // y + bh neg x5, x5 // -y sub x10, x10, x3 // y + bh - ih sub x12, x1, #1 // bh - 1 cmp x10, x1 bic x5, x5, x5, asr #63 // max(-y, 0) csel x10, x10, x12, lt // min(y + bh - ih, bh-1) cmp x5, x1 bic x10, x10, x10, asr #63 // max(min(y + bh - ih, bh-1), 0) csel x5, x5, x12, lt // min(max(-y, 0), bh-1) // right_ext = iclip(x + bw - iw, 0, bw - 1) // left_ext = iclip(-x, 0, bw - 1) add x11, x4, x0 // x + bw neg x4, x4 // -x sub x11, x11, x2 // x + bw - iw sub x13, x0, #1 // bw - 1 cmp x11, x0 bic x4, x4, x4, asr #63 // max(-x, 0) csel x11, x11, x13, lt // min(x + bw - iw, bw-1) cmp x4, x0 bic x11, x11, x11, asr #63 // max(min(x + bw - iw, bw-1), 0) csel x4, x4, x13, lt // min(max(-x, 0), bw - 1) // center_h = bh - top_ext - bottom_ext // dst += top_ext * PXSTRIDE(dst_stride) // center_w = bw - left_ext - right_ext sub x1, x1, x5 // bh - top_ext madd x6, x5, x7, x6 sub x2, x0, x4 // bw - left_ext sub x1, x1, x10 // center_h = bh - top_ext - bottom_ext sub x2, x2, x11 // center_w = bw - left_ext - right_ext mov x14, x6 // backup of dst .macro v_loop need_left, need_right 0: .if \need_left ld1r {v0.8h}, [x8] mov x12, x6 // out = dst mov x3, x4 mov v1.16b, v0.16b 1: subs x3, x3, #16 st1 {v0.8h, v1.8h}, [x12], #32 b.gt 1b .endif mov x13, x8 add x12, x6, x4, lsl #1 // out = dst + left_ext mov x3, x2 1: ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x13], #64 subs x3, x3, #32 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x12], #64 b.gt 1b .if \need_right add x3, x8, x2, lsl #1 // in + center_w sub x3, x3, #2 // in + center_w - 1 add x12, x6, x4, lsl #1 // dst + left_ext ld1r 
{v0.8h}, [x3] add x12, x12, x2, lsl #1 // out = dst + left_ext + center_w mov x3, x11 mov v1.16b, v0.16b 1: subs x3, x3, #16 st1 {v0.8h, v1.8h}, [x12], #32 b.gt 1b .endif subs x1, x1, #1 // center_h-- add x6, x6, x7 add x8, x8, x9 b.gt 0b .endm cbz x4, 2f // need_left cbz x11, 3f // need_left + need_right v_loop 1, 1 b 5f 2: // !need_left cbz x11, 4f // !need_left + need_right v_loop 0, 1 b 5f 3: // need_left + !need_right v_loop 1, 0 b 5f 4: // !need_left + !need_right v_loop 0, 0 5: cbz x10, 3f // need_bottom sub x8, x6, x7 // ref = dst - stride mov x4, x0 1: ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x8], #64 mov x3, x10 2: subs x3, x3, #1 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x7 b.gt 2b msub x6, x7, x10, x6 // dst -= bottom_ext * stride subs x4, x4, #32 // bw -= 32 add x6, x6, #64 // dst += 32 b.gt 1b 3: cbz x5, 3f // need_top msub x6, x7, x5, x14 // dst = stored_dst - top_ext * stride 1: ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x14], #64 mov x3, x5 2: subs x3, x3, #1 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x7 b.gt 2b msub x6, x7, x5, x6 // dst -= top_ext * stride subs x0, x0, #32 // bw -= 32 add x6, x6, #64 // dst += 32 b.gt 1b 3: ret endfunc av-scenechange-0.14.1/src/asm/arm/64/satd.S000064400000000000000000001076271046102023000162310ustar 00000000000000/* Copyright (c) 2022-2023, The rav1e contributors. All rights reserved * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include "src/asm/arm/asm.S" #include "util.S" .macro butterfly r0, r1, r2, r3, t=8h add \r0\().\t, \r2\().\t, \r3\().\t sub \r1\().\t, \r2\().\t, \r3\().\t .endm .macro butterflyw r0, r1, r2, r3, r4, r5 sxtl \r0\().4s, \r4\().4h sxtl2 \r2\().4s, \r4\().8h ssubw \r1\().4s, \r0\().4s, \r5\().4h ssubw2 \r3\().4s, \r2\().4s, \r5\().8h saddw \r0\().4s, \r0\().4s, \r5\().4h saddw2 \r2\().4s, \r2\().4s, \r5\().8h .endm .macro interleave r0, r1, r2, r3 zip1 \r0\().8h, \r2\().8h, \r3\().8h zip2 \r1\().8h, \r2\().8h, \r3\().8h .endm .macro interleave_pairs r0, r1, r2, r3 zip1 \r0\().4s, \r2\().4s, \r3\().4s zip2 \r1\().4s, \r2\().4s, \r3\().4s .endm .macro interleave_quads r0, r1, r2, r3 zip1 \r0\().2d, \r2\().2d, \r3\().2d zip2 \r1\().2d, \r2\().2d, \r3\().2d .endm .macro normalize_4 add w0, w0, 2 lsr w0, w0, 2 .endm .macro normalize_8 add w0, w0, 4 lsr w0, w0, 3 .endm // x0: src: *const u8, // x1: src_stride: isize, // x2: dst: *const u8, // x3: dst_stride: isize, function satd4x4_neon, export=1 #define src x0 #define src_stride x1 #define dst x2 #define dst_stride x3 ldr s0, [src] ldr s1, [dst] // subtract; cast to 16-bit usubl v0.8h, v0.8b, v1.8b ldr s1, [src, src_stride] ldr s2, [dst, dst_stride] usubl v1.8h, v1.8b, v2.8b // stride * 2 lsl x8, src_stride, 1 lsl x9, dst_stride, 1 ldr s2, [src, x8] ldr s3, [dst, x9] usubl v2.8h, v2.8b, v3.8b // stride * 3 add x8, src_stride, src_stride, lsl 1 add x9, dst_stride, dst_stride, lsl 1 ldr s3, [src, x8] ldr s4, [dst, x9] usubl v3.8h, v3.8b, v4.8b // pack rows 0-2, 1-3 mov v0.d[1], v2.d[0] mov v1.d[1], v3.d[0] // Horizontal transform // v0 0 1 2 3 8 9 10 11 // v1 4 5 6 7 12 13 14 15 butterfly v2, v3, v0, v1 // v2 [0+4][1+5][2+6][3+7] [8+12][9+13][10+14][11+15] // v3 [0-4][1-5][2-6][3-7] [8-12][9-13][10-14][11-15] interleave v0, v1, v2, v3 // v0 [ 0+4][ 0-4][ 1+5][ 1-5] [2 + 6][2 - 6][3 + 7][3 - 7] // v1 [8+12][8-12][9+13][9-13] [10+14][10-14][11+15][11-15] butterfly v2, v3, v0, v1 // v2 [0+4+8+12][0-4+8-12][1+5+9+13][1-5+9-13] [2+6+10+14][2-6+10-14][3+7+11+15][3-7+11-15] // v3 [0+4-8-12][0-4-8+12][1+5-9-13][1-5-9+13] [2+6-10-14][2-6-10+14][3+7-11-15][3-7-11+15] interleave_pairs v0, v1, v2, v3 // Vertical transform butterfly v2, v3, v0, v1 interleave v0, v1, v2, v3 butterfly v2, v3, v0, v1 // sum up transform abs v2.8h, v2.8h abs v3.8h, v3.8h add v0.8h, v2.8h, v3.8h addv h0, v0.8h fmov w0, s0 normalize_4 ret #undef src #undef src_stride #undef dst #undef dst_stride endfunc .macro DOUBLE_HADAMARD_4X4 hbd=0 // Horizontal transform butterfly v2, v3, v0, v1 butterfly v6, v7, v4, v5 interleave v0, v1, v2, v3 interleave v4, v5, v6, v7 butterfly v2, v3, v0, v1 butterfly v6, v7, v4, v5 interleave_pairs v0, v1, v2, v3 interleave_pairs v4, v5, v6, v7 // Vertical transform butterfly v2, v3, v0, v1 butterfly v6, v7, v4, v5 interleave v0, v1, v2, v3 interleave v4, v5, v6, v7 .if \hbd == 0 butterfly v2, v3, v0, v1 butterfly v6, v7, v4, v5 .else butterflyw v2, v3, v16, v17, v0, v1 butterflyw v6, v7, v18, v19, v4, v5 .endif .endm .macro SUM_DOUBLE_HADAMARD_4X4 abs v2.8h, v2.8h abs v3.8h, v3.8h abs v6.8h, v6.8h abs v7.8h, v7.8h add v0.8h, v2.8h, v3.8h add v1.8h, v6.8h, v7.8h add v0.8h, v0.8h, v1.8h addv h0, v0.8h fmov w0, s0 normalize_4 .endm function satd8x4_neon, export=1 #define src x0 #define src_stride x1 #define dst x2 #define dst_stride x3 // load 8 pixel row ldr d0, [src] ldr d1, [dst] usubl v0.8h, v0.8b, v1.8b ldr d1, [src, src_stride] ldr d2, [dst, dst_stride] usubl v1.8h, v1.8b, v2.8b lsl x8, src_stride, 1 lsl x9, dst_stride, 1 ldr d2, [src, x8] ldr d3, 
[dst, x9] usubl v2.8h, v2.8b, v3.8b // stride * 3 add x8, src_stride, src_stride, lsl 1 add x9, dst_stride, dst_stride, lsl 1 ldr d3, [src, x8] ldr d4, [dst, x9] usubl v3.8h, v3.8b, v4.8b // extract top 64 bits out of register // (4 x 16 bits = 64) ext v4.16b, v0.16b, v0.16b, 8 ext v5.16b, v1.16b, v1.16b, 8 // pack rows 0-2, 1-3 (set 1) mov v0.d[1], v2.d[0] mov v1.d[1], v3.d[0] // pack rows 0-2, 1-3 (set 2) mov v4.d[1], v2.d[1] mov v5.d[1], v3.d[1] // v2-3 temp registers for first 4x4 block// // 6-7 for second block DOUBLE_HADAMARD_4X4 SUM_DOUBLE_HADAMARD_4X4 ret #undef src #undef src_stride #undef dst #undef dst_stride endfunc .macro load_row n0, n1, src, dst, src_stride, dst_stride, should_add=1 ldr s\n0, [\src] ldr s\n1, [\dst] usubl v\n0\().8h, v\n0\().8b, v\n1\().8b .if \should_add != 0 add \src, \src, \src_stride add \dst, \dst, \dst_stride .endif .endm .macro load_row2 n0, n1, src, dst, src_stride, dst_stride ldr s\n0, [\src, \src_stride] ldr s\n1, [\dst, \dst_stride] usubl v\n0\().8h, v\n0\().8b, v\n1\().8b .endm function satd4x8_neon, export=1 #define src x0 #define src_stride x1 #define dst x2 #define dst_stride x3 // 0 * stride load_row 0, 1, src, dst, src_stride, dst_stride, 0 // 1 * stride load_row2 1, 2, src, dst, src_stride, dst_stride add src, src, src_stride, lsl 1 add dst, dst, dst_stride, lsl 1 // pattern repeats load_row 2, 3, src, dst, src_stride, dst_stride, 0 load_row2 3, 4, src, dst, src_stride, dst_stride add src, src, src_stride, lsl 1 add dst, dst, dst_stride, lsl 1 load_row 4, 5, src, dst, src_stride, dst_stride, 0 load_row2 5, 6, src, dst, src_stride, dst_stride add src, src, src_stride, lsl 1 add dst, dst, dst_stride, lsl 1 load_row 6, 7, src, dst, src_stride, dst_stride, 0 load_row2 7, 8, src, dst, src_stride, dst_stride // pack rows mov v0.d[1], v2.d[0] mov v1.d[1], v3.d[0] mov v4.d[1], v6.d[0] mov v5.d[1], v7.d[0] DOUBLE_HADAMARD_4X4 SUM_DOUBLE_HADAMARD_4X4 ret #undef src #undef src_stride #undef dst #undef dst_stride endfunc function satd16x4_neon, export=1 #define src x0 #define src_stride x1 #define dst x2 #define dst_stride x3 #define ROW1 v0 #define ROW2 v1 #define TMP1 v2 #define TMP2 v3 #define ROW3 v4 #define ROW4 v5 #define TMP3 v6 #define TMP4 v7 #define ROW5 v16 #define ROW6 v17 #define TMP5 v20 #define TMP6 v21 #define ROW7 v18 #define ROW8 v19 #define TMP7 v22 #define TMP8 v23 // load 16 pixel row ldr q0, [src] ldr q1, [dst] usubl2 v16.8h, v0.16b, v1.16b usubl v0.8h, v0.8b, v1.8b ldr q1, [src, src_stride] ldr q2, [dst, dst_stride] usubl2 v17.8h, v1.16b, v2.16b usubl v1.8h, v1.8b, v2.8b lsl x8, src_stride, 1 lsl x9, dst_stride, 1 ldr q2, [src, x8] ldr q3, [dst, x9] usubl2 v6.8h, v2.16b, v3.16b usubl v2.8h, v2.8b, v3.8b // stride * 3 add x8, src_stride, src_stride, lsl 1 add x9, dst_stride, dst_stride, lsl 1 ldr q3, [src, x8] ldr q4, [dst, x9] usubl2 v7.8h, v3.16b, v4.16b usubl v3.8h, v3.8b, v4.8b // swap high/low 64 bits ext v4.16b, v0.16b, v0.16b, 8 ext v5.16b, v1.16b, v1.16b, 8 mov v0.d[1], v2.d[0] mov v1.d[1], v3.d[0] ext v18.16b, v16.16b, v16.16b, 8 ext v19.16b, v17.16b, v17.16b, 8 mov v16.d[1], v6.d[0] mov v17.d[1], v7.d[0] // 2-3 free mov v4.d[1], v2.d[1] mov v5.d[1], v3.d[1] // 6-7 free mov v18.d[1], v6.d[1] mov v19.d[1], v7.d[1] // 0,1 2,3 // 4,5 6,7 // 16,17 20,21 // 18,19 22,23 // quadruple 4x4 hadamard butterfly TMP1, TMP2, ROW1, ROW2 butterfly TMP3, TMP4, ROW3, ROW4 butterfly TMP5, TMP6, ROW5, ROW6 butterfly TMP7, TMP8, ROW7, ROW8 interleave ROW1, ROW2, TMP1, TMP2 interleave ROW3, ROW4, TMP3, TMP4 interleave ROW5, ROW6, TMP5, TMP6 
interleave ROW7, ROW8, TMP7, TMP8 butterfly TMP1, TMP2, ROW1, ROW2 butterfly TMP3, TMP4, ROW3, ROW4 butterfly TMP5, TMP6, ROW5, ROW6 butterfly TMP7, TMP8, ROW7, ROW8 interleave_pairs ROW1, ROW2, TMP1, TMP2 interleave_pairs ROW3, ROW4, TMP3, TMP4 interleave_pairs ROW5, ROW6, TMP5, TMP6 interleave_pairs ROW7, ROW8, TMP7, TMP8 butterfly TMP1, TMP2, ROW1, ROW2 butterfly TMP3, TMP4, ROW3, ROW4 butterfly TMP5, TMP6, ROW5, ROW6 butterfly TMP7, TMP8, ROW7, ROW8 interleave ROW1, ROW2, TMP1, TMP2 interleave ROW3, ROW4, TMP3, TMP4 interleave ROW5, ROW6, TMP5, TMP6 interleave ROW7, ROW8, TMP7, TMP8 butterfly TMP1, TMP2, ROW1, ROW2 butterfly TMP3, TMP4, ROW3, ROW4 butterfly TMP5, TMP6, ROW5, ROW6 butterfly TMP7, TMP8, ROW7, ROW8 // absolute value of transform coefficients abs TMP1.8h, TMP1.8h abs TMP2.8h, TMP2.8h abs TMP3.8h, TMP3.8h abs TMP4.8h, TMP4.8h abs TMP5.8h, TMP5.8h abs TMP6.8h, TMP6.8h abs TMP7.8h, TMP7.8h abs TMP8.8h, TMP8.8h // stage 1 sum add TMP1.8h, TMP1.8h, TMP5.8h add TMP2.8h, TMP2.8h, TMP6.8h add TMP3.8h, TMP3.8h, TMP7.8h add TMP4.8h, TMP4.8h, TMP8.8h // stage 2 sum add TMP1.8h, TMP1.8h, TMP3.8h add TMP2.8h, TMP2.8h, TMP4.8h add v0.8h, TMP1.8h, TMP2.8h addv h0, v0.8h fmov w0, s0 normalize_4 ret #undef src #undef src_stride #undef dst #undef dst_stride #undef ROW1 #undef TMP1 #undef ROW2 #undef TMP2 #undef ROW3 #undef TMP3 #undef ROW4 #undef TMP4 #undef ROW5 #undef TMP5 #undef ROW6 #undef TMP6 #undef ROW7 #undef TMP7 #undef ROW8 #undef TMP8 endfunc function satd4x16_neon, export=1 #define src x0 #define src_stride x1 #define dst x2 #define dst_stride x3 load_row 0, 1, src, dst, src_stride, dst_stride, 0 load_row2 1, 2, src, dst, src_stride, dst_stride add src, src, src_stride, lsl 1 add dst, dst, dst_stride, lsl 1 load_row 2, 3, src, dst, src_stride, dst_stride, 0 load_row2 3, 4, src, dst, src_stride, dst_stride add src, src, src_stride, lsl 1 add dst, dst, dst_stride, lsl 1 load_row 4, 5, src, dst, src_stride, dst_stride, 0 load_row2 5, 6, src, dst, src_stride, dst_stride add src, src, src_stride, lsl 1 add dst, dst, dst_stride, lsl 1 load_row 6, 7, src, dst, src_stride, dst_stride, 0 load_row2 7, 16, src, dst, src_stride, dst_stride add src, src, src_stride, lsl 1 add dst, dst, dst_stride, lsl 1 load_row 16, 17, src, dst, src_stride, dst_stride, 0 load_row2 17, 18, src, dst, src_stride, dst_stride add src, src, src_stride, lsl 1 add dst, dst, dst_stride, lsl 1 load_row 18, 19, src, dst, src_stride, dst_stride, 0 load_row2 19, 20, src, dst, src_stride, dst_stride add src, src, src_stride, lsl 1 add dst, dst, dst_stride, lsl 1 load_row 20, 21, src, dst, src_stride, dst_stride, 0 load_row2 21, 22, src, dst, src_stride, dst_stride add src, src, src_stride, lsl 1 add dst, dst, dst_stride, lsl 1 load_row 22, 23, src, dst, src_stride, dst_stride, 0 load_row2 23, 24, src, dst, src_stride, dst_stride // pack rows mov v0.d[1], v2.d[0] mov v1.d[1], v3.d[0] mov v4.d[1], v6.d[0] mov v5.d[1], v7.d[0] mov v16.d[1], v18.d[0] mov v17.d[1], v19.d[0] mov v20.d[1], v22.d[0] mov v21.d[1], v23.d[0] butterfly v2, v3, v0, v1 butterfly v6, v7, v4, v5 butterfly v18, v19, v16, v17 butterfly v22, v23, v20, v21 interleave v0, v1, v2, v3 interleave v4, v5, v6, v7 interleave v16, v17, v18, v19 interleave v20, v21, v22, v23 butterfly v2, v3, v0, v1 butterfly v6, v7, v4, v5 butterfly v18, v19, v16, v17 butterfly v22, v23, v20, v21 interleave_pairs v0, v1, v2, v3 interleave_pairs v4, v5, v6, v7 interleave_pairs v16, v17, v18, v19 interleave_pairs v20, v21, v22, v23 butterfly v2, v3, v0, v1 butterfly v6, v7, v4, v5 
butterfly v18, v19, v16, v17 butterfly v22, v23, v20, v21 interleave v0, v1, v2, v3 interleave v4, v5, v6, v7 interleave v16, v17, v18, v19 interleave v20, v21, v22, v23 butterfly v2, v3, v0, v1 butterfly v6, v7, v4, v5 butterfly v18, v19, v16, v17 butterfly v22, v23, v20, v21 abs v2.8h, v2.8h abs v3.8h, v3.8h abs v6.8h, v6.8h abs v7.8h, v7.8h abs v18.8h, v18.8h abs v19.8h, v19.8h abs v22.8h, v22.8h abs v23.8h, v23.8h add v2.8h, v2.8h, v3.8h add v6.8h, v6.8h, v7.8h add v18.8h, v18.8h, v19.8h add v22.8h, v22.8h, v23.8h add v2.8h, v2.8h, v6.8h add v18.8h, v18.8h, v22.8h add v0.8h, v2.8h, v18.8h addv h0, v0.8h fmov w0, s0 normalize_4 ret #undef src #undef src_stride #undef dst #undef dst_stride endfunc .macro load_rows n0, n1, n2, src, dst, src_stride, dst_stride, n3=0, n4=0 .if \n3 == 0 ldr d\n0, [\src] ldr d\n1, [\dst] .else ldr q\n0, [\src] ldr q\n1, [\dst] usubl2 v\n3\().8h, v\n0\().16b, v\n1\().16b .endif usubl v\n0\().8h, v\n0\().8b, v\n1\().8b .if \n4 == 0 ldr d\n1, [\src, \src_stride] ldr d\n2, [\dst, \dst_stride] .else ldr q\n1, [\src, \src_stride] ldr q\n2, [\dst, \dst_stride] usubl2 v\n4\().8h, v\n1\().16b, v\n2\().16b .endif usubl v\n1\().8h, v\n1\().8b, v\n2\().8b add \src, \src, \src_stride, lsl 1 add \dst, \dst, \dst_stride, lsl 1 .endm .macro HADAMARD_8X8_H \ a0 a1 a2 a3 a4 a5 a6 a7 \ b0 b1 b2 b3 b4 b5 b6 b7 // Horizontal transform butterfly v\b0, v\b1, v\a0, v\a1 butterfly v\b2, v\b3, v\a2, v\a3 butterfly v\b4, v\b5, v\a4, v\a5 butterfly v\b6, v\b7, v\a6, v\a7 interleave v\a0, v\a1, v\b0, v\b1 interleave v\a2, v\a3, v\b2, v\b3 interleave v\a4, v\a5, v\b4, v\b5 interleave v\a6, v\a7, v\b6, v\b7 butterfly v\b0, v\b2, v\a0, v\a2 butterfly v\b1, v\b3, v\a1, v\a3 butterfly v\b4, v\b6, v\a4, v\a6 butterfly v\b5, v\b7, v\a5, v\a7 interleave_pairs v\a0, v\a2, v\b0, v\b2 interleave_pairs v\a1, v\a3, v\b1, v\b3 interleave_pairs v\a4, v\a6, v\b4, v\b6 interleave_pairs v\a5, v\a7, v\b5, v\b7 butterfly v\b0, v\b4, v\a0, v\a4 butterfly v\b1, v\b5, v\a1, v\a5 butterfly v\b2, v\b6, v\a2, v\a6 butterfly v\b3, v\b7, v\a3, v\a7 interleave_quads v\a0, v\a4, v\b0, v\b4 interleave_quads v\a1, v\a5, v\b1, v\b5 interleave_quads v\a2, v\a6, v\b2, v\b6 interleave_quads v\a3, v\a7, v\b3, v\b7 .endm .macro HADAMARD_8X8_V \ a0 a1 a2 a3 a4 a5 a6 a7 \ b0 b1 b2 b3 b4 b5 b6 b7 // Vertical transform butterfly v\b0, v\b1, v\a0, v\a1 butterfly v\b2, v\b3, v\a2, v\a3 butterfly v\b4, v\b5, v\a4, v\a5 butterfly v\b6, v\b7, v\a6, v\a7 butterfly v\a0, v\a2, v\b0, v\b2 butterfly v\a1, v\a3, v\b1, v\b3 butterfly v\a4, v\a6, v\b4, v\b6 butterfly v\a5, v\a7, v\b5, v\b7 butterfly v\b0, v\b4, v\a0, v\a4 butterfly v\b1, v\b5, v\a1, v\a5 butterfly v\b2, v\b6, v\a2, v\a6 butterfly v\b3, v\b7, v\a3, v\a7 .endm .macro SUM_HADAMARD_8X8 \ a0 a1 a2 a3 a4 a5 a6 a7 \ b0 b1 b2 b3 b4 b5 b6 b7 // absolute value of transform coefficients abs v\b0\().8h, v\b0\().8h abs v\b1\().8h, v\b1\().8h abs v\b2\().8h, v\b2\().8h abs v\b3\().8h, v\b3\().8h abs v\b4\().8h, v\b4\().8h abs v\b5\().8h, v\b5\().8h abs v\b6\().8h, v\b6\().8h abs v\b7\().8h, v\b7\().8h // stage 1 sum sxtl v\a0\().4s, v\b0\().4h sxtl v\a1\().4s, v\b1\().4h sxtl v\a2\().4s, v\b2\().4h sxtl v\a3\().4s, v\b3\().4h saddw2 v\a0\().4s, v\a0\().4s, v\b0\().8h saddw2 v\a1\().4s, v\a1\().4s, v\b1\().8h saddw2 v\a2\().4s, v\a2\().4s, v\b2\().8h saddw2 v\a3\().4s, v\a3\().4s, v\b3\().8h saddw v\a0\().4s, v\a0\().4s, v\b4\().4h saddw2 v\a1\().4s, v\a1\().4s, v\b4\().8h saddw v\a2\().4s, v\a2\().4s, v\b5\().4h saddw2 v\a3\().4s, v\a3\().4s, v\b5\().8h saddw v\a0\().4s, v\a0\().4s, 
v\b6\().4h saddw2 v\a1\().4s, v\a1\().4s, v\b6\().8h saddw v\a2\().4s, v\a2\().4s, v\b7\().4h saddw2 v\a3\().4s, v\a3\().4s, v\b7\().8h // stage 2 sum add v\a0\().4s, v\a0\().4s, v\a1\().4s add v\a2\().4s, v\a2\().4s, v\a3\().4s // stage 3 sum add v0.4s, v\a0\().4s, v\a2\().4s addv s0, v0.4s .endm .macro SATD_8X8 \ a0 a1 a2 a3 a4 a5 a6 a7 \ b0 b1 b2 b3 b4 b5 b6 b7 HADAMARD_8X8_H \ \a0, \a1, \a2, \a3, \a4, \a5, \a6, \a7, \ \b0, \b1, \b2, \b3, \b4, \b5, \b6, \b7 HADAMARD_8X8_V \ \a0, \a1, \a2, \a3, \a4, \a5, \a6, \a7, \ \b0, \b1, \b2, \b3, \b4, \b5, \b6, \b7 SUM_HADAMARD_8X8 \ \a0, \a1, \a2, \a3, \a4, \a5, \a6, \a7, \ \b0, \b1, \b2, \b3, \b4, \b5, \b6, \b7 .endm function satd8x8_neon, export=1 #define src x0 #define src_stride x1 #define dst x2 #define dst_stride x3 #define subtotal w9 #define total w10 #define height w13 mov height, 8 mov total, wzr // 0, 1; 2, 3 // 4, 5; 6, 7 // 16, 17; 20, 21 // 18, 19; 22, 23 L(satd_w8): load_rows 0, 1, 2, src, dst, src_stride, dst_stride load_rows 4, 5, 6, src, dst, src_stride, dst_stride load_rows 16, 17, 20, src, dst, src_stride, dst_stride load_rows 18, 19, 22, src, dst, src_stride, dst_stride SATD_8X8 \ 0, 1, 4, 5, 16, 17, 18, 19, \ 2, 3, 6, 7, 20, 21, 22, 23 fmov subtotal, s0 add total, subtotal, total subs height, height, #8 bne L(satd_w8) mov w0, total normalize_8 ret #undef src #undef src_stride #undef dst #undef dst_stride #undef subtotal #undef total #undef height endfunc .macro DOUBLE_HADAMARD_8X8 \ a0 a1 a2 a3 a4 a5 a6 a7 \ b0 b1 b2 b3 b4 b5 b6 b7 \ c0 c1 c2 c3 c4 c5 c6 c7 // Horizontal transform butterfly v\b0, v\b1, v\a0, v\a1 butterfly v\b2, v\b3, v\a2, v\a3 butterfly v\b4, v\b5, v\a4, v\a5 butterfly v\b6, v\b7, v\a6, v\a7 butterfly v\a0, v\a1, v\c0, v\c1 butterfly v\a2, v\a3, v\c2, v\c3 butterfly v\a4, v\a5, v\c4, v\c5 butterfly v\a6, v\a7, v\c6, v\c7 interleave v\c0, v\c1, v\b0, v\b1 interleave v\c2, v\c3, v\b2, v\b3 interleave v\c4, v\c5, v\b4, v\b5 interleave v\c6, v\c7, v\b6, v\b7 interleave v\b0, v\b1, v\a0, v\a1 interleave v\b2, v\b3, v\a2, v\a3 interleave v\b4, v\b5, v\a4, v\a5 interleave v\b6, v\b7, v\a6, v\a7 butterfly v\a0, v\a2, v\c0, v\c2 butterfly v\a1, v\a3, v\c1, v\c3 butterfly v\a4, v\a6, v\c4, v\c6 butterfly v\a5, v\a7, v\c5, v\c7 butterfly v\c0, v\c2, v\b0, v\b2 butterfly v\c1, v\c3, v\b1, v\b3 butterfly v\c4, v\c6, v\b4, v\b6 butterfly v\c5, v\c7, v\b5, v\b7 interleave_pairs v\b0, v\b2, v\a0, v\a2 interleave_pairs v\b1, v\b3, v\a1, v\a3 interleave_pairs v\b4, v\b6, v\a4, v\a6 interleave_pairs v\b5, v\b7, v\a5, v\a7 interleave_pairs v\a0, v\a2, v\c0, v\c2 interleave_pairs v\a1, v\a3, v\c1, v\c3 interleave_pairs v\a4, v\a6, v\c4, v\c6 interleave_pairs v\a5, v\a7, v\c5, v\c7 butterfly v\c0, v\c4, v\b0, v\b4 butterfly v\c1, v\c5, v\b1, v\b5 butterfly v\c2, v\c6, v\b2, v\b6 butterfly v\c3, v\c7, v\b3, v\b7 butterfly v\b0, v\b4, v\a0, v\a4 butterfly v\b1, v\b5, v\a1, v\a5 butterfly v\b2, v\b6, v\a2, v\a6 butterfly v\b3, v\b7, v\a3, v\a7 interleave_quads v\a0, v\a4, v\c0, v\c4 interleave_quads v\a1, v\a5, v\c1, v\c5 interleave_quads v\a2, v\a6, v\c2, v\c6 interleave_quads v\a3, v\a7, v\c3, v\c7 interleave_quads v\c0, v\c4, v\b0, v\b4 interleave_quads v\c1, v\c5, v\b1, v\b5 interleave_quads v\c2, v\c6, v\b2, v\b6 interleave_quads v\c3, v\c7, v\b3, v\b7 // Vertical transform butterfly v\b0, v\b1, v\a0, v\a1 butterfly v\b2, v\b3, v\a2, v\a3 butterfly v\b4, v\b5, v\a4, v\a5 butterfly v\b6, v\b7, v\a6, v\a7 butterfly v\a0, v\a1, v\c0, v\c1 butterfly v\a2, v\a3, v\c2, v\c3 butterfly v\a4, v\a5, v\c4, v\c5 butterfly v\a6, v\a7, 
v\c6, v\c7 butterfly v\c0, v\c2, v\b0, v\b2 butterfly v\c1, v\c3, v\b1, v\b3 butterfly v\c4, v\c6, v\b4, v\b6 butterfly v\c5, v\c7, v\b5, v\b7 butterfly v\b0, v\b2, v\a0, v\a2 butterfly v\b1, v\b3, v\a1, v\a3 butterfly v\b4, v\b6, v\a4, v\a6 butterfly v\b5, v\b7, v\a5, v\a7 butterfly v\a0, v\a4, v\c0, v\c4 butterfly v\a1, v\a5, v\c1, v\c5 butterfly v\a2, v\a6, v\c2, v\c6 butterfly v\a3, v\a7, v\c3, v\c7 butterfly v\c0, v\c4, v\b0, v\b4 butterfly v\c1, v\c5, v\b1, v\b5 butterfly v\c2, v\c6, v\b2, v\b6 butterfly v\c3, v\c7, v\b3, v\b7 .endm .macro SUM_DOUBLE_HADAMARD_8X8 \ a0 a1 a2 a3 a4 a5 a6 a7 \ b0 b1 b2 b3 b4 b5 b6 b7 \ c0 c1 c2 c3 c4 c5 c6 c7 // absolute value of transform coefficients abs v\a0\().8h, v\a0\().8h abs v\a1\().8h, v\a1\().8h abs v\a2\().8h, v\a2\().8h abs v\a3\().8h, v\a3\().8h abs v\a4\().8h, v\a4\().8h abs v\a5\().8h, v\a5\().8h abs v\a6\().8h, v\a6\().8h abs v\a7\().8h, v\a7\().8h abs v\c0\().8h, v\c0\().8h abs v\c1\().8h, v\c1\().8h abs v\c2\().8h, v\c2\().8h abs v\c3\().8h, v\c3\().8h abs v\c4\().8h, v\c4\().8h abs v\c5\().8h, v\c5\().8h abs v\c6\().8h, v\c6\().8h abs v\c7\().8h, v\c7\().8h // stage 1 sum sxtl v\b0\().4s, v\a0\().4h sxtl v\b1\().4s, v\a1\().4h sxtl v\b2\().4s, v\a2\().4h sxtl v\b3\().4s, v\a3\().4h sxtl v\b4\().4s, v\a4\().4h sxtl v\b5\().4s, v\a5\().4h sxtl v\b6\().4s, v\a6\().4h sxtl v\b7\().4s, v\a7\().4h saddw2 v\b0\().4s, v\b0\().4s, v\a0\().8h saddw2 v\b1\().4s, v\b1\().4s, v\a1\().8h saddw2 v\b2\().4s, v\b2\().4s, v\a2\().8h saddw2 v\b3\().4s, v\b3\().4s, v\a3\().8h saddw2 v\b4\().4s, v\b4\().4s, v\a4\().8h saddw2 v\b5\().4s, v\b5\().4s, v\a5\().8h saddw2 v\b6\().4s, v\b6\().4s, v\a6\().8h saddw2 v\b7\().4s, v\b7\().4s, v\a7\().8h saddw v\b0\().4s, v\b0\().4s, v\c0\().4h saddw2 v\b1\().4s, v\b1\().4s, v\c0\().8h saddw v\b2\().4s, v\b2\().4s, v\c1\().4h saddw2 v\b3\().4s, v\b3\().4s, v\c1\().8h saddw v\b4\().4s, v\b4\().4s, v\c2\().4h saddw2 v\b5\().4s, v\b5\().4s, v\c2\().8h saddw v\b6\().4s, v\b6\().4s, v\c3\().4h saddw2 v\b7\().4s, v\b7\().4s, v\c3\().8h saddw v\b0\().4s, v\b0\().4s, v\c4\().4h saddw2 v\b1\().4s, v\b1\().4s, v\c4\().8h saddw v\b2\().4s, v\b2\().4s, v\c5\().4h saddw2 v\b3\().4s, v\b3\().4s, v\c5\().8h saddw v\b4\().4s, v\b4\().4s, v\c6\().4h saddw2 v\b5\().4s, v\b5\().4s, v\c6\().8h saddw v\b6\().4s, v\b6\().4s, v\c7\().4h saddw2 v\b7\().4s, v\b7\().4s, v\c7\().8h // stage 2 sum add v\b0\().4s, v\b0\().4s, v\b1\().4s add v\b2\().4s, v\b2\().4s, v\b3\().4s add v\b4\().4s, v\b4\().4s, v\b5\().4s add v\b6\().4s, v\b6\().4s, v\b7\().4s // stage 3 sum add v\b0\().4s, v\b0\().4s, v\b2\().4s add v\b4\().4s, v\b4\().4s, v\b6\().4s // stage 4 sum add v0.4s, v\b0\().4s, v\b4\().4s addv s0, v0.4s .endm function satd16x8_neon, export=1 #define src x0 #define src_stride x1 #define dst x2 #define dst_stride x3 #define subtotal w9 #define total w10 #define w_ext x11 #define w_bak w11 #define width w12 #define height w13 mov height, 8 mov width, 16 sxtw w_ext, width mov total, wzr // 0, 1; 2, 3; 24, 25 // 4, 5; 6, 7; 26, 27 // 16, 17; 20, 21; 28, 29 // 18, 19; 22, 23; 30, 31 L(satd_w16up): load_rows 0, 1, 2, src, dst, src_stride, dst_stride, 24, 25 load_rows 4, 5, 6, src, dst, src_stride, dst_stride, 26, 27 load_rows 16, 17, 20, src, dst, src_stride, dst_stride, 28, 29 load_rows 18, 19, 22, src, dst, src_stride, dst_stride, 30, 31 DOUBLE_HADAMARD_8X8 \ 0, 1, 4, 5, 16, 17, 18, 19, \ 2, 3, 6, 7, 20, 21, 22, 23, \ 24, 25, 26, 27, 28, 29, 30, 31 SUM_DOUBLE_HADAMARD_8X8 \ 0, 1, 4, 5, 16, 17, 18, 19, \ 2, 3, 6, 7, 20, 21, 22, 23, \ 24, 25, 26, 27, 
28, 29, 30, 31 fmov subtotal, s0 add total, subtotal, total sub src, src, src_stride, lsl 3 sub dst, dst, dst_stride, lsl 3 add src, src, #16 add dst, dst, #16 subs width, width, #16 bne L(satd_w16up) sub src, src, w_ext sub dst, dst, w_ext add src, src, src_stride, lsl 3 add dst, dst, dst_stride, lsl 3 subs height, height, #8 mov width, w_bak bne L(satd_w16up) mov w0, total normalize_8 ret #undef src #undef src_stride #undef dst #undef dst_stride #undef w_ext #undef w_bak #undef subtotal #undef total #undef height #undef width endfunc .macro satd_x8up width, height function satd\width\()x\height\()_neon, export=1 mov w13, \height .if \width == 8 mov w10, wzr b L(satd_w8) .else mov w12, \width sxtw x11, w12 mov w10, wzr b L(satd_w16up) .endif endfunc .endm satd_x8up 8, 16 satd_x8up 8, 32 satd_x8up 16, 16 satd_x8up 16, 32 satd_x8up 16, 64 satd_x8up 32, 8 satd_x8up 32, 16 satd_x8up 32, 32 satd_x8up 32, 64 satd_x8up 64, 16 satd_x8up 64, 32 satd_x8up 64, 64 satd_x8up 64, 128 satd_x8up 128, 64 satd_x8up 128, 128 .macro load_rows_hbd n0, n1, n2, src, dst, src_stride, dst_stride ldr q\n0, [\src] ldr q\n1, [\dst] sub v\n0\().8h, v\n0\().8h, v\n1\().8h ldr q\n1, [\src, \src_stride] ldr q\n2, [\dst, \dst_stride] sub v\n1\().8h, v\n1\().8h, v\n2\().8h add \src, \src, \src_stride, lsl 1 add \dst, \dst, \dst_stride, lsl 1 .endm .macro HADAMARD_8X8_V_HBD \ a0 a1 a2 a3 a4 a5 a6 a7 \ b0 b1 b2 b3 b4 b5 b6 b7 \ c0 c1 c2 c3 c4 c5 c6 c7 // Vertical transform butterflyw v\b0, v\b1, v\c0, v\c1, v\a0, v\a1 butterflyw v\b2, v\b3, v\c2, v\c3, v\a2, v\a3 butterflyw v\b4, v\b5, v\c4, v\c5, v\a4, v\a5 butterflyw v\b6, v\b7, v\c6, v\c7, v\a6, v\a7 butterfly v\a0, v\a2, v\b0, v\b2, 4s butterfly v\a1, v\a3, v\b1, v\b3, 4s butterfly v\a4, v\a6, v\b4, v\b6, 4s butterfly v\a5, v\a7, v\b5, v\b7, 4s butterfly v\b0, v\b2, v\c0, v\c2, 4s butterfly v\b1, v\b3, v\c1, v\c3, 4s butterfly v\b4, v\b6, v\c4, v\c6, 4s butterfly v\b5, v\b7, v\c5, v\c7, 4s butterfly v\c0, v\c4, v\a0, v\a4, 4s butterfly v\c1, v\c5, v\a1, v\a5, 4s butterfly v\c2, v\c6, v\a2, v\a6, 4s butterfly v\c3, v\c7, v\a3, v\a7, 4s butterfly v\a0, v\a4, v\b0, v\b4, 4s butterfly v\a1, v\a5, v\b1, v\b5, 4s butterfly v\a2, v\a6, v\b2, v\b6, 4s butterfly v\a3, v\a7, v\b3, v\b7, 4s .endm .macro SUM_HADAMARD_8X8_HBD \ a0 a1 a2 a3 a4 a5 a6 a7 \ c0 c1 c2 c3 c4 c5 c6 c7 // absolute value of transform coefficients abs v\a0\().4s, v\a0\().4s abs v\a1\().4s, v\a1\().4s abs v\a2\().4s, v\a2\().4s abs v\a3\().4s, v\a3\().4s abs v\a4\().4s, v\a4\().4s abs v\a5\().4s, v\a5\().4s abs v\a6\().4s, v\a6\().4s abs v\a7\().4s, v\a7\().4s abs v\c0\().4s, v\c0\().4s abs v\c1\().4s, v\c1\().4s abs v\c2\().4s, v\c2\().4s abs v\c3\().4s, v\c3\().4s abs v\c4\().4s, v\c4\().4s abs v\c5\().4s, v\c5\().4s abs v\c6\().4s, v\c6\().4s abs v\c7\().4s, v\c7\().4s // stage 1 sum add v\a0\().4s, v\a0\().4s, v\a1\().4s add v\a2\().4s, v\a2\().4s, v\a3\().4s add v\a4\().4s, v\a4\().4s, v\a5\().4s add v\a6\().4s, v\a6\().4s, v\a7\().4s add v\c0\().4s, v\c0\().4s, v\c1\().4s add v\c2\().4s, v\c2\().4s, v\c3\().4s add v\c4\().4s, v\c4\().4s, v\c5\().4s add v\c6\().4s, v\c6\().4s, v\c7\().4s // stage 2 sum add v\a0\().4s, v\a0\().4s, v\a2\().4s add v\a4\().4s, v\a4\().4s, v\a6\().4s add v\c0\().4s, v\c0\().4s, v\c2\().4s add v\c4\().4s, v\c4\().4s, v\c6\().4s // stage 3 sum add v\a0\().4s, v\a0\().4s, v\a4\().4s add v\c0\().4s, v\c0\().4s, v\c4\().4s // stage 4 sum add v0.4s, v\a0\().4s, v\c0\().4s addv s0, v0.4s .endm .macro SATD_8X8_HBD \ a0 a1 a2 a3 a4 a5 a6 a7 \ b0 b1 b2 b3 b4 b5 b6 b7 \ c0 c1 c2 c3 c4 c5 
c6 c7 HADAMARD_8X8_H \ \a0, \a1, \a2, \a3, \a4, \a5, \a6, \a7, \ \b0, \b1, \b2, \b3, \b4, \b5, \b6, \b7 HADAMARD_8X8_V_HBD \ \a0, \a1, \a2, \a3, \a4, \a5, \a6, \a7, \ \b0, \b1, \b2, \b3, \b4, \b5, \b6, \b7, \ \c0, \c1, \c2, \c3, \c4, \c5, \c6, \c7 SUM_HADAMARD_8X8_HBD \ \a0, \a1, \a2, \a3, \a4, \a5, \a6, \a7, \ \c0, \c1, \c2, \c3, \c4, \c5, \c6, \c7 .endm function satd8x8_hbd_neon, export=1 #define src x0 #define src_stride x1 #define dst x2 #define dst_stride x3 #define subtotal w9 #define total w10 #define w_ext x11 #define w_bak w11 #define width w12 #define height w13 mov height, 8 mov width, 8 sxtw w_ext, width mov total, wzr // 0, 1; 2, 3; 24, 25 // 4, 5; 6, 7; 26, 27 // 16, 17; 20, 21; 28, 29 // 18, 19; 22, 23; 30, 31 L(satd_w8up_hbd): load_rows_hbd 0, 1, 2, src, dst, src_stride, dst_stride load_rows_hbd 4, 5, 6, src, dst, src_stride, dst_stride load_rows_hbd 16, 17, 20, src, dst, src_stride, dst_stride load_rows_hbd 18, 19, 22, src, dst, src_stride, dst_stride SATD_8X8_HBD \ 0, 1, 4, 5, 16, 17, 18, 19, \ 2, 3, 6, 7, 20, 21, 22, 23, \ 24, 25, 26, 27, 28, 29, 30, 31 fmov subtotal, s0 add total, subtotal, total sub src, src, src_stride, lsl 3 sub dst, dst, dst_stride, lsl 3 add src, src, #16 add dst, dst, #16 subs width, width, #8 bne L(satd_w8up_hbd) sub src, src, w_ext, lsl 1 sub dst, dst, w_ext, lsl 1 add src, src, src_stride, lsl 3 add dst, dst, dst_stride, lsl 3 subs height, height, #8 mov width, w_bak bne L(satd_w8up_hbd) mov w0, total normalize_8 ret #undef src #undef src_stride #undef dst #undef dst_stride #undef w_ext #undef w_bak #undef subtotal #undef total #undef height #undef width endfunc .macro satd_x8up_hbd width, height function satd\width\()x\height\()_hbd_neon, export=1 mov w13, \height mov w12, \width sxtw x11, w12 mov w10, wzr b L(satd_w8up_hbd) endfunc .endm satd_x8up_hbd 8, 16 satd_x8up_hbd 8, 32 satd_x8up_hbd 16, 8 satd_x8up_hbd 16, 16 satd_x8up_hbd 16, 32 satd_x8up_hbd 16, 64 satd_x8up_hbd 32, 8 satd_x8up_hbd 32, 16 satd_x8up_hbd 32, 32 satd_x8up_hbd 32, 64 satd_x8up_hbd 64, 16 satd_x8up_hbd 64, 32 satd_x8up_hbd 64, 64 satd_x8up_hbd 64, 128 satd_x8up_hbd 128, 64 satd_x8up_hbd 128, 128 // x0: src: *const u16, // x1: src_stride: isize, // x2: dst: *const u16, // x3: dst_stride: isize, function satd4x4_hbd_neon, export=1 #define src x0 #define src_stride x1 #define dst x2 #define dst_stride x3 #define subtotal w9 #define total w10 #define height w13 mov height, 4 mov total, wzr L(satd_w4_hbd): ldr d0, [src] ldr d1, [dst] sub v0.8h, v0.8h, v1.8h ldr d1, [src, src_stride] ldr d2, [dst, dst_stride] sub v1.8h, v1.8h, v2.8h add src, src, src_stride, lsl 1 add dst, dst, dst_stride, lsl 1 ldr d2, [src] ldr d3, [dst] sub v2.8h, v2.8h, v3.8h ldr d3, [src, src_stride] ldr d4, [dst, src_stride] sub v3.8h, v3.8h, v4.8h add src, src, src_stride, lsl 1 add dst, dst, dst_stride, lsl 1 // pack rows 0-2, 1-3 mov v0.d[1], v2.d[0] mov v1.d[1], v3.d[0] // Horizontal transform butterfly v2, v3, v0, v1 interleave v0, v1, v2, v3 butterfly v2, v3, v0, v1 interleave_pairs v0, v1, v2, v3 // Vertical transform butterfly v2, v3, v0, v1 interleave v0, v1, v2, v3 butterflyw v2, v3, v4, v5, v0, v1 // absolute value of transform coefficients abs v2.4s, v2.4s abs v3.4s, v3.4s abs v4.4s, v4.4s abs v5.4s, v5.4s // stage 1 sum add v2.4s, v2.4s, v3.4s add v4.4s, v4.4s, v5.4s // stage 2 sum add v0.4s, v2.4s, v4.4s addv s0, v0.4s fmov subtotal, s0 add total, subtotal, total subs height, height, #4 bne L(satd_w4_hbd) mov w0, total normalize_4 ret #undef src #undef src_stride #undef dst #undef 
dst_stride #undef subtotal #undef total #undef height endfunc function satd4x8_hbd_neon, export=1 mov w13, 8 mov w10, wzr b L(satd_w4_hbd) endfunc function satd4x16_hbd_neon, export=1 mov w13, 16 mov w10, wzr b L(satd_w4_hbd) endfunc .macro SUM_DOUBLE_HADAMARD_4X4_HBD \ a0 a1 a2 a3 c0 c1 c2 c3 // absolute value of transform coefficients abs v\a0\().4s, v\a0\().4s abs v\a1\().4s, v\a1\().4s abs v\a2\().4s, v\a2\().4s abs v\a3\().4s, v\a3\().4s abs v\c0\().4s, v\c0\().4s abs v\c1\().4s, v\c1\().4s abs v\c2\().4s, v\c2\().4s abs v\c3\().4s, v\c3\().4s // stage 1 sum add v\a0\().4s, v\a0\().4s, v\a1\().4s add v\a2\().4s, v\a2\().4s, v\a3\().4s add v\c0\().4s, v\c0\().4s, v\c1\().4s add v\c2\().4s, v\c2\().4s, v\c3\().4s // stage 2 sum add v\a0\().4s, v\a0\().4s, v\a2\().4s add v\c0\().4s, v\c0\().4s, v\c2\().4s // stage 3 sum add v0.4s, v\a0\().4s, v\c0\().4s addv s0, v0.4s .endm function satd8x4_hbd_neon, export=1 #define src x0 #define src_stride x1 #define dst x2 #define dst_stride x3 #define subtotal w9 #define total w10 #define width w12 mov width, 8 mov total, wzr L(satd_h4_hbd): ldr q0, [src] ldr q1, [dst] sub v0.8h, v0.8h, v1.8h ldr q1, [src, src_stride] ldr q2, [dst, dst_stride] sub v1.8h, v1.8h, v2.8h lsl x8, src_stride, 1 lsl x9, dst_stride, 1 ldr q2, [src, x8] ldr q3, [dst, x9] sub v2.8h, v2.8h, v3.8h add x8, src_stride, src_stride, lsl 1 add x9, dst_stride, dst_stride, lsl 1 ldr q3, [src, x8] ldr q4, [dst, x9] sub v3.8h, v3.8h, v4.8h ext v4.16b, v0.16b, v0.16b, 8 ext v5.16b, v1.16b, v1.16b, 8 mov v0.d[1], v2.d[0] mov v1.d[1], v3.d[0] mov v4.d[1], v2.d[1] mov v5.d[1], v3.d[1] DOUBLE_HADAMARD_4X4 hbd=1 SUM_DOUBLE_HADAMARD_4X4_HBD 2, 3, 16, 17, 6, 7, 18, 19 fmov subtotal, s0 add total, subtotal, total add src, src, #16 add dst, dst, #16 subs width, width, #8 bne L(satd_h4_hbd) mov w0, total normalize_4 ret #undef src #undef src_stride #undef dst #undef dst_stride #undef subtotal #undef total #undef width endfunc function satd16x4_hbd_neon, export=1 mov w12, 16 mov w10, wzr b L(satd_h4_hbd) endfunc av-scenechange-0.14.1/src/asm/arm/64/util.S000064400000000000000000000235471046102023000162510ustar 00000000000000/****************************************************************************** * Copyright © 2018, VideoLAN and dav1d authors * Copyright © 2015 Martin Storsjo * Copyright © 2015 Janne Grunau * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #ifndef DAV1D_SRC_ARM_64_UTIL_S #define DAV1D_SRC_ARM_64_UTIL_S #include "config.h" #include "src/asm/arm/asm.S" .macro movrel rd, val, offset=0 #if defined(__APPLE__) .if \offset < 0 adrp \rd, \val@PAGE add \rd, \rd, \val@PAGEOFF sub \rd, \rd, -(\offset) .else adrp \rd, \val+(\offset)@PAGE add \rd, \rd, \val+(\offset)@PAGEOFF .endif #elif defined(PIC) && defined(_WIN32) .if \offset < 0 adrp \rd, \val add \rd, \rd, :lo12:\val sub \rd, \rd, -(\offset) .else adrp \rd, \val+(\offset) add \rd, \rd, :lo12:\val+(\offset) .endif #elif defined(PIC) adrp \rd, \val+(\offset) add \rd, \rd, :lo12:\val+(\offset) #else ldr \rd, =\val+\offset #endif .endm .macro sub_sp space #ifdef _WIN32 .if \space > 8192 // Here, we'd need to touch two (or more) pages while decrementing // the stack pointer. .error "sub_sp_align doesn't support values over 8K at the moment" .elseif \space > 4096 sub x16, sp, #4096 ldr xzr, [x16] sub sp, x16, #(\space - 4096) .else sub sp, sp, #\space .endif #else .if \space >= 4096 sub sp, sp, #(\space)/4096*4096 .endif .if (\space % 4096) != 0 sub sp, sp, #(\space)%4096 .endif #endif .endm .macro transpose_8x8b_xtl r0, r1, r2, r3, r4, r5, r6, r7, xtl // a0 b0 a1 b1 a2 b2 a3 b3 a4 b4 a5 b5 a6 b6 a7 b7 zip1 \r0\().16b, \r0\().16b, \r1\().16b // c0 d0 c1 d1 c2 d2 d3 d3 c4 d4 c5 d5 c6 d6 d7 d7 zip1 \r2\().16b, \r2\().16b, \r3\().16b // e0 f0 e1 f1 e2 f2 e3 f3 e4 f4 e5 f5 e6 f6 e7 f7 zip1 \r4\().16b, \r4\().16b, \r5\().16b // g0 h0 g1 h1 g2 h2 h3 h3 g4 h4 g5 h5 g6 h6 h7 h7 zip1 \r6\().16b, \r6\().16b, \r7\().16b // a0 b0 c0 d0 a2 b2 c2 d2 a4 b4 c4 d4 a6 b6 c6 d6 trn1 \r1\().8h, \r0\().8h, \r2\().8h // a1 b1 c1 d1 a3 b3 c3 d3 a5 b5 c5 d5 a7 b7 c7 d7 trn2 \r3\().8h, \r0\().8h, \r2\().8h // e0 f0 g0 h0 e2 f2 g2 h2 e4 f4 g4 h4 e6 f6 g6 h6 trn1 \r5\().8h, \r4\().8h, \r6\().8h // e1 f1 g1 h1 e3 f3 g3 h3 e5 f5 g5 h5 e7 f7 g7 h7 trn2 \r7\().8h, \r4\().8h, \r6\().8h // a0 b0 c0 d0 e0 f0 g0 h0 a4 b4 c4 d4 e4 f4 g4 h4 trn1 \r0\().4s, \r1\().4s, \r5\().4s // a2 b2 c2 d2 e2 f2 g2 h2 a6 b6 c6 d6 e6 f6 g6 h6 trn2 \r2\().4s, \r1\().4s, \r5\().4s // a1 b1 c1 d1 e1 f1 g1 h1 a5 b5 c5 d5 e5 f5 g5 h5 trn1 \r1\().4s, \r3\().4s, \r7\().4s // a3 b3 c3 d3 e3 f3 g3 h3 a7 b7 c7 d7 e7 f7 g7 h7 trn2 \r3\().4s, \r3\().4s, \r7\().4s \xtl\()2 \r4\().8h, \r0\().16b \xtl \r0\().8h, \r0\().8b \xtl\()2 \r6\().8h, \r2\().16b \xtl \r2\().8h, \r2\().8b \xtl\()2 \r5\().8h, \r1\().16b \xtl \r1\().8h, \r1\().8b \xtl\()2 \r7\().8h, \r3\().16b \xtl \r3\().8h, \r3\().8b .endm .macro transpose_8x8h r0, r1, r2, r3, r4, r5, r6, r7, t8, t9 trn1 \t8\().8h, \r0\().8h, \r1\().8h trn2 \t9\().8h, \r0\().8h, \r1\().8h trn1 \r1\().8h, \r2\().8h, \r3\().8h trn2 \r3\().8h, \r2\().8h, \r3\().8h trn1 \r0\().8h, \r4\().8h, \r5\().8h trn2 \r5\().8h, \r4\().8h, \r5\().8h trn1 \r2\().8h, \r6\().8h, \r7\().8h trn2 \r7\().8h, \r6\().8h, \r7\().8h trn1 \r4\().4s, \r0\().4s, \r2\().4s trn2 \r2\().4s, \r0\().4s, \r2\().4s trn1 \r6\().4s, \r5\().4s, \r7\().4s trn2 
\r7\().4s, \r5\().4s, \r7\().4s trn1 \r5\().4s, \t9\().4s, \r3\().4s trn2 \t9\().4s, \t9\().4s, \r3\().4s trn1 \r3\().4s, \t8\().4s, \r1\().4s trn2 \t8\().4s, \t8\().4s, \r1\().4s trn1 \r0\().2d, \r3\().2d, \r4\().2d trn2 \r4\().2d, \r3\().2d, \r4\().2d trn1 \r1\().2d, \r5\().2d, \r6\().2d trn2 \r5\().2d, \r5\().2d, \r6\().2d trn2 \r6\().2d, \t8\().2d, \r2\().2d trn1 \r2\().2d, \t8\().2d, \r2\().2d trn1 \r3\().2d, \t9\().2d, \r7\().2d trn2 \r7\().2d, \t9\().2d, \r7\().2d .endm .macro transpose_8x16b r0, r1, r2, r3, r4, r5, r6, r7, t8, t9 trn1 \t8\().16b, \r0\().16b, \r1\().16b trn2 \t9\().16b, \r0\().16b, \r1\().16b trn1 \r1\().16b, \r2\().16b, \r3\().16b trn2 \r3\().16b, \r2\().16b, \r3\().16b trn1 \r0\().16b, \r4\().16b, \r5\().16b trn2 \r5\().16b, \r4\().16b, \r5\().16b trn1 \r2\().16b, \r6\().16b, \r7\().16b trn2 \r7\().16b, \r6\().16b, \r7\().16b trn1 \r4\().8h, \r0\().8h, \r2\().8h trn2 \r2\().8h, \r0\().8h, \r2\().8h trn1 \r6\().8h, \r5\().8h, \r7\().8h trn2 \r7\().8h, \r5\().8h, \r7\().8h trn1 \r5\().8h, \t9\().8h, \r3\().8h trn2 \t9\().8h, \t9\().8h, \r3\().8h trn1 \r3\().8h, \t8\().8h, \r1\().8h trn2 \t8\().8h, \t8\().8h, \r1\().8h trn1 \r0\().4s, \r3\().4s, \r4\().4s trn2 \r4\().4s, \r3\().4s, \r4\().4s trn1 \r1\().4s, \r5\().4s, \r6\().4s trn2 \r5\().4s, \r5\().4s, \r6\().4s trn2 \r6\().4s, \t8\().4s, \r2\().4s trn1 \r2\().4s, \t8\().4s, \r2\().4s trn1 \r3\().4s, \t9\().4s, \r7\().4s trn2 \r7\().4s, \t9\().4s, \r7\().4s .endm .macro transpose_4x16b r0, r1, r2, r3, t4, t5, t6, t7 trn1 \t4\().16b, \r0\().16b, \r1\().16b trn2 \t5\().16b, \r0\().16b, \r1\().16b trn1 \t6\().16b, \r2\().16b, \r3\().16b trn2 \t7\().16b, \r2\().16b, \r3\().16b trn1 \r0\().8h, \t4\().8h, \t6\().8h trn2 \r2\().8h, \t4\().8h, \t6\().8h trn1 \r1\().8h, \t5\().8h, \t7\().8h trn2 \r3\().8h, \t5\().8h, \t7\().8h .endm .macro transpose_4x4h r0, r1, r2, r3, t4, t5, t6, t7 trn1 \t4\().4h, \r0\().4h, \r1\().4h trn2 \t5\().4h, \r0\().4h, \r1\().4h trn1 \t6\().4h, \r2\().4h, \r3\().4h trn2 \t7\().4h, \r2\().4h, \r3\().4h trn1 \r0\().2s, \t4\().2s, \t6\().2s trn2 \r2\().2s, \t4\().2s, \t6\().2s trn1 \r1\().2s, \t5\().2s, \t7\().2s trn2 \r3\().2s, \t5\().2s, \t7\().2s .endm .macro transpose_4x4s r0, r1, r2, r3, t4, t5, t6, t7 trn1 \t4\().4s, \r0\().4s, \r1\().4s trn2 \t5\().4s, \r0\().4s, \r1\().4s trn1 \t6\().4s, \r2\().4s, \r3\().4s trn2 \t7\().4s, \r2\().4s, \r3\().4s trn1 \r0\().2d, \t4\().2d, \t6\().2d trn2 \r2\().2d, \t4\().2d, \t6\().2d trn1 \r1\().2d, \t5\().2d, \t7\().2d trn2 \r3\().2d, \t5\().2d, \t7\().2d .endm .macro transpose_4x8h r0, r1, r2, r3, t4, t5, t6, t7 trn1 \t4\().8h, \r0\().8h, \r1\().8h trn2 \t5\().8h, \r0\().8h, \r1\().8h trn1 \t6\().8h, \r2\().8h, \r3\().8h trn2 \t7\().8h, \r2\().8h, \r3\().8h trn1 \r0\().4s, \t4\().4s, \t6\().4s trn2 \r2\().4s, \t4\().4s, \t6\().4s trn1 \r1\().4s, \t5\().4s, \t7\().4s trn2 \r3\().4s, \t5\().4s, \t7\().4s .endm #endif /* DAV1D_SRC_ARM_64_UTIL_S */ av-scenechange-0.14.1/src/asm/arm/asm.S000064400000000000000000000212161046102023000156120ustar 00000000000000/* * Copyright © 2018, VideoLAN and dav1d authors * Copyright © 2018, Janne Grunau * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. 
Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef DAV1D_SRC_ARM_ASM_S #define DAV1D_SRC_ARM_ASM_S #include "config.h" #if ARCH_AARCH64 #define x18 do_not_use_x18 #define w18 do_not_use_w18 /* Support macros for * - Armv8.3-A Pointer Authentication and * - Armv8.5-A Branch Target Identification * features which require emitting a .note.gnu.property section with the * appropriate architecture-dependent feature bits set. * * |AARCH64_SIGN_LINK_REGISTER| and |AARCH64_VALIDATE_LINK_REGISTER| expand to * PACIxSP and AUTIxSP, respectively. |AARCH64_SIGN_LINK_REGISTER| should be * used immediately before saving the LR register (x30) to the stack. * |AARCH64_VALIDATE_LINK_REGISTER| should be used immediately after restoring * it. Note |AARCH64_SIGN_LINK_REGISTER|'s modifications to LR must be undone * with |AARCH64_VALIDATE_LINK_REGISTER| before RET. The SP register must also * have the same value at the two points. For example: * * .global f * f: * AARCH64_SIGN_LINK_REGISTER * stp x29, x30, [sp, #-96]! * mov x29, sp * ... * ldp x29, x30, [sp], #96 * AARCH64_VALIDATE_LINK_REGISTER * ret * * |AARCH64_VALID_CALL_TARGET| expands to BTI 'c'. Either it, or * |AARCH64_SIGN_LINK_REGISTER|, must be used at every point that may be an * indirect call target. In particular, all symbols exported from a file must * begin with one of these macros. For example, a leaf function that does not * save LR can instead use |AARCH64_VALID_CALL_TARGET|: * * .globl return_zero * return_zero: * AARCH64_VALID_CALL_TARGET * mov x0, #0 * ret * * A non-leaf function which does not immediately save LR may need both macros * because |AARCH64_SIGN_LINK_REGISTER| appears late. For example, the function * may jump to an alternate implementation before setting up the stack: * * .globl with_early_jump * with_early_jump: * AARCH64_VALID_CALL_TARGET * cmp x0, #128 * b.lt .Lwith_early_jump_128 * AARCH64_SIGN_LINK_REGISTER * stp x29, x30, [sp, #-96]! * mov x29, sp * ... * ldp x29, x30, [sp], #96 * AARCH64_VALIDATE_LINK_REGISTER * ret * * .Lwith_early_jump_128: * ... * ret * * These annotations are only required with indirect calls. Private symbols that * are only the target of direct calls do not require annotations. Also note * that |AARCH64_VALID_CALL_TARGET| is only valid for indirect calls (BLR), not * indirect jumps (BR). Indirect jumps in assembly are supported through * |AARCH64_VALID_JUMP_TARGET|. Landing Pads which shall serve for jumps and * calls can be created using |AARCH64_VALID_JUMP_CALL_TARGET|. * * Although not necessary, it is safe to use these macros in 32-bit ARM * assembly. 
This may be used to simplify dual 32-bit and 64-bit files. * * References: * - "ELF for the Arm® 64-bit Architecture" * https: *github.com/ARM-software/abi-aa/blob/master/aaelf64/aaelf64.rst * - "Providing protection for complex software" * https://developer.arm.com/architectures/learn-the-architecture/providing-protection-for-complex-software */ #if defined(__ARM_FEATURE_BTI_DEFAULT) && (__ARM_FEATURE_BTI_DEFAULT == 1) #define GNU_PROPERTY_AARCH64_BTI (1 << 0) // Has Branch Target Identification #define AARCH64_VALID_JUMP_CALL_TARGET hint #38 // BTI 'jc' #define AARCH64_VALID_CALL_TARGET hint #34 // BTI 'c' #define AARCH64_VALID_JUMP_TARGET hint #36 // BTI 'j' #else #define GNU_PROPERTY_AARCH64_BTI 0 // No Branch Target Identification #define AARCH64_VALID_JUMP_CALL_TARGET #define AARCH64_VALID_CALL_TARGET #define AARCH64_VALID_JUMP_TARGET #endif #if defined(__ARM_FEATURE_PAC_DEFAULT) #if ((__ARM_FEATURE_PAC_DEFAULT & (1 << 0)) != 0) // authentication using key A #define AARCH64_SIGN_LINK_REGISTER paciasp #define AARCH64_VALIDATE_LINK_REGISTER autiasp #elif ((__ARM_FEATURE_PAC_DEFAULT & (1 << 1)) != 0) // authentication using key B #define AARCH64_SIGN_LINK_REGISTER pacibsp #define AARCH64_VALIDATE_LINK_REGISTER autibsp #else #error Pointer authentication defines no valid key! #endif #if ((__ARM_FEATURE_PAC_DEFAULT & (1 << 2)) != 0) // authentication of leaf functions #error Authentication of leaf functions is enabled but not supported in dav1d! #endif #define GNU_PROPERTY_AARCH64_PAC (1 << 1) #elif defined(__APPLE__) && defined(__arm64e__) #define GNU_PROPERTY_AARCH64_PAC 0 #define AARCH64_SIGN_LINK_REGISTER pacibsp #define AARCH64_VALIDATE_LINK_REGISTER autibsp #else /* __ARM_FEATURE_PAC_DEFAULT */ #define GNU_PROPERTY_AARCH64_PAC 0 #define AARCH64_SIGN_LINK_REGISTER #define AARCH64_VALIDATE_LINK_REGISTER #endif /* !__ARM_FEATURE_PAC_DEFAULT */ #if (GNU_PROPERTY_AARCH64_BTI != 0 || GNU_PROPERTY_AARCH64_PAC != 0) && defined(__ELF__) .pushsection .note.gnu.property, "a" .balign 8 .long 4 .long 0x10 .long 0x5 .asciz "GNU" .long 0xc0000000 /* GNU_PROPERTY_AARCH64_FEATURE_1_AND */ .long 4 .long (GNU_PROPERTY_AARCH64_BTI | GNU_PROPERTY_AARCH64_PAC) .long 0 .popsection #endif /* (GNU_PROPERTY_AARCH64_BTI != 0 || GNU_PROPERTY_AARCH64_PAC != 0) && defined(__ELF__) */ #endif /* ARCH_AARCH64 */ #if ARCH_ARM .syntax unified #ifdef __ELF__ .arch armv7-a .fpu neon .eabi_attribute 10, 0 // suppress Tag_FP_arch .eabi_attribute 12, 0 // suppress Tag_Advanced_SIMD_arch .section .note.GNU-stack,"",%progbits // Mark stack as non-executable #endif /* __ELF__ */ #ifdef _WIN32 #define CONFIG_THUMB 1 #else #define CONFIG_THUMB 0 #endif #if CONFIG_THUMB .thumb #define A @ #define T #else #define A #define T @ #endif /* CONFIG_THUMB */ #endif /* ARCH_ARM */ #if !defined(PIC) #if defined(__PIC__) #define PIC __PIC__ #elif defined(__pic__) #define PIC __pic__ #endif #endif #ifndef PRIVATE_PREFIX #define PRIVATE_PREFIX dav1d_ #endif #define PASTE(a,b) a ## b #define CONCAT(a,b) PASTE(a,b) #ifdef PREFIX #define EXTERN CONCAT(_,PRIVATE_PREFIX) #else #define EXTERN PRIVATE_PREFIX #endif .macro function name, export=0, align=2 .macro endfunc #ifdef __ELF__ .size \name, . 
- \name #endif #if HAVE_AS_FUNC .endfunc #endif .purgem endfunc .endm .text .align \align .if \export .global EXTERN\name #ifdef __ELF__ .type EXTERN\name, %function .hidden EXTERN\name #elif defined(__MACH__) .private_extern EXTERN\name #endif #if HAVE_AS_FUNC .func EXTERN\name #endif EXTERN\name: .else #ifdef __ELF__ .type \name, %function #endif #if HAVE_AS_FUNC .func \name #endif .endif \name: #if ARCH_AARCH64 .if \export AARCH64_VALID_CALL_TARGET .endif #endif .endm .macro const name, export=0, align=2 .macro endconst #ifdef __ELF__ .size \name, . - \name #endif .purgem endconst .endm #if defined(_WIN32) .section .rdata #elif !defined(__MACH__) .section .rodata #else .const_data #endif .align \align .if \export .global EXTERN\name #ifdef __ELF__ .hidden EXTERN\name #elif defined(__MACH__) .private_extern EXTERN\name #endif EXTERN\name: .endif \name: .endm #ifdef __APPLE__ #define L(x) L ## x #else #define L(x) .L ## x #endif #define X(x) CONCAT(EXTERN, x) #endif /* DAV1D_SRC_ARM_ASM_S */ av-scenechange-0.14.1/src/asm/arm/tables.S000064400000000000000000000431121046102023000163030ustar 00000000000000/* * Copyright (c) 2019-2022, The rav1e contributors. All rights reserved * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ /* * Copyright © 2018, VideoLAN and dav1d authors * Copyright © 2018, Two Orioles, LLC * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "src/asm/arm/asm.S" const mc_subpel_filters, export=1, align=3 .byte 0, 1, -3, 63, 4, -1, 0, 0 /* REGULAR */ .byte 0, 1, -5, 61, 9, -2, 0, 0 .byte 0, 1, -6, 58, 14, -4, 1, 0 .byte 0, 1, -7, 55, 19, -5, 1, 0 .byte 0, 1, -7, 51, 24, -6, 1, 0 .byte 0, 1, -8, 47, 29, -6, 1, 0 .byte 0, 1, -7, 42, 33, -6, 1, 0 .byte 0, 1, -7, 38, 38, -7, 1, 0 .byte 0, 1, -6, 33, 42, -7, 1, 0 .byte 0, 1, -6, 29, 47, -8, 1, 0 .byte 0, 1, -6, 24, 51, -7, 1, 0 .byte 0, 1, -5, 19, 55, -7, 1, 0 .byte 0, 1, -4, 14, 58, -6, 1, 0 .byte 0, 0, -2, 9, 61, -5, 1, 0 .byte 0, 0, -1, 4, 63, -3, 1, 0 .byte 0, 1, 14, 31, 17, 1, 0, 0 /* SMOOTH */ .byte 0, 0, 13, 31, 18, 2, 0, 0 .byte 0, 0, 11, 31, 20, 2, 0, 0 .byte 0, 0, 10, 30, 21, 3, 0, 0 .byte 0, 0, 9, 29, 22, 4, 0, 0 .byte 0, 0, 8, 28, 23, 5, 0, 0 .byte 0, -1, 8, 27, 24, 6, 0, 0 .byte 0, -1, 7, 26, 26, 7, -1, 0 .byte 0, 0, 6, 24, 27, 8, -1, 0 .byte 0, 0, 5, 23, 28, 8, 0, 0 .byte 0, 0, 4, 22, 29, 9, 0, 0 .byte 0, 0, 3, 21, 30, 10, 0, 0 .byte 0, 0, 2, 20, 31, 11, 0, 0 .byte 0, 0, 2, 18, 31, 13, 0, 0 .byte 0, 0, 1, 17, 31, 14, 1, 0 .byte -1, 1, -3, 63, 4, -1, 1, 0 /* SHARP */ .byte -1, 3, -6, 62, 8, -3, 2, -1 .byte -1, 4, -9, 60, 13, -5, 3, -1 .byte -2, 5, -11, 58, 19, -7, 3, -1 .byte -2, 5, -11, 54, 24, -9, 4, -1 .byte -2, 5, -12, 50, 30, -10, 4, -1 .byte -2, 5, -12, 45, 35, -11, 5, -1 .byte -2, 6, -12, 40, 40, -12, 6, -2 .byte -1, 5, -11, 35, 45, -12, 5, -2 .byte -1, 4, -10, 30, 50, -12, 5, -2 .byte -1, 4, -9, 24, 54, -11, 5, -2 .byte -1, 3, -7, 19, 58, -11, 5, -2 .byte -1, 3, -5, 13, 60, -9, 4, -1 .byte -1, 2, -3, 8, 62, -6, 3, -1 .byte 0, 1, -1, 4, 63, -3, 1, -1 .byte 0, 0, -2, 63, 4, -1, 0, 0 /* REGULAR 4 */ .byte 0, 0, -4, 61, 9, -2, 0, 0 .byte 0, 0, -5, 58, 14, -3, 0, 0 .byte 0, 0, -6, 55, 19, -4, 0, 0 .byte 0, 0, -6, 51, 24, -5, 0, 0 .byte 0, 0, -7, 47, 29, -5, 0, 0 .byte 0, 0, -6, 42, 33, -5, 0, 0 .byte 0, 0, -6, 38, 38, -6, 0, 0 .byte 0, 0, -5, 33, 42, -6, 0, 0 .byte 0, 0, -5, 29, 47, -7, 0, 0 .byte 0, 0, -5, 24, 51, -6, 0, 0 .byte 0, 0, -4, 19, 55, -6, 0, 0 .byte 0, 0, -3, 14, 58, -5, 0, 0 .byte 0, 0, -2, 9, 61, -4, 0, 0 .byte 0, 0, -1, 4, 63, -2, 0, 0 .byte 0, 0, 15, 31, 17, 1, 0, 0 /* SMOOTH 4 */ .byte 0, 0, 13, 31, 18, 2, 0, 0 .byte 0, 0, 11, 31, 20, 2, 0, 0 .byte 0, 0, 10, 30, 21, 3, 0, 0 .byte 0, 0, 9, 29, 22, 4, 0, 0 .byte 0, 0, 8, 28, 23, 5, 0, 0 .byte 0, 0, 7, 27, 24, 6, 0, 0 .byte 0, 0, 6, 26, 26, 6, 0, 0 .byte 0, 0, 6, 24, 27, 7, 0, 0 .byte 0, 0, 5, 23, 28, 8, 0, 0 .byte 0, 0, 4, 22, 29, 9, 0, 0 .byte 0, 0, 3, 21, 30, 10, 0, 0 .byte 0, 0, 2, 20, 31, 11, 0, 0 .byte 0, 0, 2, 18, 31, 13, 0, 0 .byte 0, 0, 1, 17, 31, 15, 0, 0 /* Bilin scaled being very rarely used, add a new table entry * and use the put/prep_8tap_scaled code, thus acting as a * scaled bilinear filter. 
*/ .byte 0, 0, 0, 60, 4, 0, 0, 0 .byte 0, 0, 0, 56, 8, 0, 0, 0 .byte 0, 0, 0, 52, 12, 0, 0, 0 .byte 0, 0, 0, 48, 16, 0, 0, 0 .byte 0, 0, 0, 44, 20, 0, 0, 0 .byte 0, 0, 0, 40, 24, 0, 0, 0 .byte 0, 0, 0, 36, 28, 0, 0, 0 .byte 0, 0, 0, 32, 32, 0, 0, 0 .byte 0, 0, 0, 28, 36, 0, 0, 0 .byte 0, 0, 0, 24, 40, 0, 0, 0 .byte 0, 0, 0, 20, 44, 0, 0, 0 .byte 0, 0, 0, 16, 48, 0, 0, 0 .byte 0, 0, 0, 12, 52, 0, 0, 0 .byte 0, 0, 0, 8, 56, 0, 0, 0 .byte 0, 0, 0, 4, 60, 0, 0, 0 endconst const filter_intra_taps, export=1, align=4 .byte -6, 10, -5, 2, -3, 1, -3, 1 /* 0 */ .byte -4, 6, -3, 2, -3, 2, -3, 1 .byte 0, 0, 10, 0, 1, 10, 1, 2 .byte 0, 0, 6, 0, 2, 6, 2, 2 .byte 0, 12, 0, 9, 0, 7, 10, 5 .byte 0, 2, 0, 2, 0, 2, 6, 3 .byte 0, 0, 0, 0, 0, 0, 0, 0 .byte 12, 0, 9, 0, 7, 0, 5, 0 .byte -10, 16, -6, 0, -4, 0, -2, 0 /* 1 */ .byte -10, 16, -6, 0, -4, 0, -2, 0 .byte 0, 0, 16, 0, 0, 16, 0, 0 .byte 0, 0, 16, 0, 0, 16, 0, 0 .byte 0, 10, 0, 6, 0, 4, 16, 2 .byte 0, 0, 0, 0, 0, 0, 16, 0 .byte 0, 0, 0, 0, 0, 0, 0, 0 .byte 10, 0, 6, 0, 4, 0, 2, 0 .byte -8, 8, -8, 0, -8, 0, -8, 0 /* 2 */ .byte -4, 4, -4, 0, -4, 0, -4, 0 .byte 0, 0, 8, 0, 0, 8, 0, 0 .byte 0, 0, 4, 0, 0, 4, 0, 0 .byte 0, 16, 0, 16, 0, 16, 8, 16 .byte 0, 0, 0, 0, 0, 0, 4, 0 .byte 0, 0, 0, 0, 0, 0, 0, 0 .byte 16, 0, 16, 0, 16, 0, 16, 0 .byte -2, 8, -1, 3, -1, 2, 0, 1 /* 3 */ .byte -1, 4, -1, 3, -1, 2, -1, 2 .byte 0, 0, 8, 0, 3, 8, 2, 3 .byte 0, 0, 4, 0, 3, 4, 2, 3 .byte 0, 10, 0, 6, 0, 4, 8, 2 .byte 0, 3, 0, 4, 0, 4, 4, 3 .byte 0, 0, 0, 0, 0, 0, 0, 0 .byte 10, 0, 6, 0, 4, 0, 3, 0 .byte -12, 14, -10, 0, -9, 0, -8, 0 /* 4 */ .byte -10, 12, -9, 1, -8, 0, -7, 0 .byte 0, 0, 14, 0, 0, 14, 0, 0 .byte 0, 0, 12, 0, 0, 12, 0, 1 .byte 0, 14, 0, 12, 0, 11, 14, 10 .byte 0, 0, 0, 0, 0, 1, 12, 1 .byte 0, 0, 0, 0, 0, 0, 0, 0 .byte 14, 0, 12, 0, 11, 0, 9, 0 endconst const sgr_x_by_x, export=1, align=4 .byte 255, 128, 85, 64, 51, 43, 37, 32, 28, 26, 23, 21, 20, 18, 17 .byte 16, 15, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 9, 9 .byte 8, 8, 8, 8, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6 .byte 6, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 4, 4, 4 .byte 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3 .byte 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 .byte 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2 .byte 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 .byte 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 .byte 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 .byte 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 .byte 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 .byte 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 .byte 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 .byte 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 .byte 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 .byte 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 .byte 0 endconst const mc_warp_filter, export=1, align=3 /* [-1, 0) */ .byte 0, 0, 127, 1, 0, 0, 0, 0, 0, - 1, 127, 2, 0, 0, 0, 0 .byte 1, - 3, 127, 4, - 1, 0, 0, 0, 1, - 4, 126, 6, - 2, 1, 0, 0 .byte 1, - 5, 126, 8, - 3, 1, 0, 0, 1, - 6, 125, 11, - 4, 1, 0, 0 .byte 1, - 7, 124, 13, - 4, 1, 0, 0, 2, - 8, 123, 15, - 5, 1, 0, 0 .byte 2, - 9, 122, 18, - 6, 1, 0, 0, 2, -10, 121, 20, - 6, 1, 0, 0 .byte 2, -11, 120, 22, - 7, 2, 0, 0, 2, -12, 119, 25, - 8, 2, 0, 0 .byte 3, -13, 117, 27, - 8, 2, 0, 0, 3, -13, 116, 29, - 9, 2, 0, 0 .byte 3, -14, 114, 32, -10, 3, 0, 0, 3, -15, 113, 35, -10, 2, 0, 0 .byte 3, -15, 111, 37, -11, 3, 0, 0, 3, -16, 109, 40, -11, 3, 0, 0 .byte 3, -16, 108, 42, -12, 3, 0, 0, 4, -17, 106, 45, -13, 3, 0, 0 .byte 4, -17, 104, 47, -13, 3, 0, 0, 4, -17, 102, 50, -14, 3, 0, 0 .byte 4, -17, 100, 52, -14, 3, 0, 
0, 4, -18, 98, 55, -15, 4, 0, 0 .byte 4, -18, 96, 58, -15, 3, 0, 0, 4, -18, 94, 60, -16, 4, 0, 0 .byte 4, -18, 91, 63, -16, 4, 0, 0, 4, -18, 89, 65, -16, 4, 0, 0 .byte 4, -18, 87, 68, -17, 4, 0, 0, 4, -18, 85, 70, -17, 4, 0, 0 .byte 4, -18, 82, 73, -17, 4, 0, 0, 4, -18, 80, 75, -17, 4, 0, 0 .byte 4, -18, 78, 78, -18, 4, 0, 0, 4, -17, 75, 80, -18, 4, 0, 0 .byte 4, -17, 73, 82, -18, 4, 0, 0, 4, -17, 70, 85, -18, 4, 0, 0 .byte 4, -17, 68, 87, -18, 4, 0, 0, 4, -16, 65, 89, -18, 4, 0, 0 .byte 4, -16, 63, 91, -18, 4, 0, 0, 4, -16, 60, 94, -18, 4, 0, 0 .byte 3, -15, 58, 96, -18, 4, 0, 0, 4, -15, 55, 98, -18, 4, 0, 0 .byte 3, -14, 52, 100, -17, 4, 0, 0, 3, -14, 50, 102, -17, 4, 0, 0 .byte 3, -13, 47, 104, -17, 4, 0, 0, 3, -13, 45, 106, -17, 4, 0, 0 .byte 3, -12, 42, 108, -16, 3, 0, 0, 3, -11, 40, 109, -16, 3, 0, 0 .byte 3, -11, 37, 111, -15, 3, 0, 0, 2, -10, 35, 113, -15, 3, 0, 0 .byte 3, -10, 32, 114, -14, 3, 0, 0, 2, - 9, 29, 116, -13, 3, 0, 0 .byte 2, - 8, 27, 117, -13, 3, 0, 0, 2, - 8, 25, 119, -12, 2, 0, 0 .byte 2, - 7, 22, 120, -11, 2, 0, 0, 1, - 6, 20, 121, -10, 2, 0, 0 .byte 1, - 6, 18, 122, - 9, 2, 0, 0, 1, - 5, 15, 123, - 8, 2, 0, 0 .byte 1, - 4, 13, 124, - 7, 1, 0, 0, 1, - 4, 11, 125, - 6, 1, 0, 0 .byte 1, - 3, 8, 126, - 5, 1, 0, 0, 1, - 2, 6, 126, - 4, 1, 0, 0 .byte 0, - 1, 4, 127, - 3, 1, 0, 0, 0, 0, 2, 127, - 1, 0, 0, 0 /* [0, 1) */ .byte 0, 0, 0, 127, 1, 0, 0, 0, 0, 0, -1, 127, 2, 0, 0, 0 .byte 0, 1, -3, 127, 4, -2, 1, 0, 0, 1, -5, 127, 6, -2, 1, 0 .byte 0, 2, -6, 126, 8, -3, 1, 0, -1, 2, -7, 126, 11, -4, 2, -1 .byte -1, 3, -8, 125, 13, -5, 2, -1, -1, 3, -10, 124, 16, -6, 3, -1 .byte -1, 4, -11, 123, 18, -7, 3, -1, -1, 4, -12, 122, 20, -7, 3, -1 .byte -1, 4, -13, 121, 23, -8, 3, -1, -2, 5, -14, 120, 25, -9, 4, -1 .byte -1, 5, -15, 119, 27, -10, 4, -1, -1, 5, -16, 118, 30, -11, 4, -1 .byte -2, 6, -17, 116, 33, -12, 5, -1, -2, 6, -17, 114, 35, -12, 5, -1 .byte -2, 6, -18, 113, 38, -13, 5, -1, -2, 7, -19, 111, 41, -14, 6, -2 .byte -2, 7, -19, 110, 43, -15, 6, -2, -2, 7, -20, 108, 46, -15, 6, -2 .byte -2, 7, -20, 106, 49, -16, 6, -2, -2, 7, -21, 104, 51, -16, 7, -2 .byte -2, 7, -21, 102, 54, -17, 7, -2, -2, 8, -21, 100, 56, -18, 7, -2 .byte -2, 8, -22, 98, 59, -18, 7, -2, -2, 8, -22, 96, 62, -19, 7, -2 .byte -2, 8, -22, 94, 64, -19, 7, -2, -2, 8, -22, 91, 67, -20, 8, -2 .byte -2, 8, -22, 89, 69, -20, 8, -2, -2, 8, -22, 87, 72, -21, 8, -2 .byte -2, 8, -21, 84, 74, -21, 8, -2, -2, 8, -22, 82, 77, -21, 8, -2 .byte -2, 8, -21, 79, 79, -21, 8, -2, -2, 8, -21, 77, 82, -22, 8, -2 .byte -2, 8, -21, 74, 84, -21, 8, -2, -2, 8, -21, 72, 87, -22, 8, -2 .byte -2, 8, -20, 69, 89, -22, 8, -2, -2, 8, -20, 67, 91, -22, 8, -2 .byte -2, 7, -19, 64, 94, -22, 8, -2, -2, 7, -19, 62, 96, -22, 8, -2 .byte -2, 7, -18, 59, 98, -22, 8, -2, -2, 7, -18, 56, 100, -21, 8, -2 .byte -2, 7, -17, 54, 102, -21, 7, -2, -2, 7, -16, 51, 104, -21, 7, -2 .byte -2, 6, -16, 49, 106, -20, 7, -2, -2, 6, -15, 46, 108, -20, 7, -2 .byte -2, 6, -15, 43, 110, -19, 7, -2, -2, 6, -14, 41, 111, -19, 7, -2 .byte -1, 5, -13, 38, 113, -18, 6, -2, -1, 5, -12, 35, 114, -17, 6, -2 .byte -1, 5, -12, 33, 116, -17, 6, -2, -1, 4, -11, 30, 118, -16, 5, -1 .byte -1, 4, -10, 27, 119, -15, 5, -1, -1, 4, -9, 25, 120, -14, 5, -2 .byte -1, 3, -8, 23, 121, -13, 4, -1, -1, 3, -7, 20, 122, -12, 4, -1 .byte -1, 3, -7, 18, 123, -11, 4, -1, -1, 3, -6, 16, 124, -10, 3, -1 .byte -1, 2, -5, 13, 125, -8, 3, -1, -1, 2, -4, 11, 126, -7, 2, -1 .byte 0, 1, -3, 8, 126, -6, 2, 0, 0, 1, -2, 6, 127, -5, 1, 0 .byte 0, 1, -2, 4, 127, -3, 1, 0, 0, 0, 0, 2, 127, -1, 0, 0 /* 
[1, 2) */ .byte 0, 0, 0, 1, 127, 0, 0, 0, 0, 0, 0, - 1, 127, 2, 0, 0 .byte 0, 0, 1, - 3, 127, 4, - 1, 0, 0, 0, 1, - 4, 126, 6, - 2, 1 .byte 0, 0, 1, - 5, 126, 8, - 3, 1, 0, 0, 1, - 6, 125, 11, - 4, 1 .byte 0, 0, 1, - 7, 124, 13, - 4, 1, 0, 0, 2, - 8, 123, 15, - 5, 1 .byte 0, 0, 2, - 9, 122, 18, - 6, 1, 0, 0, 2, -10, 121, 20, - 6, 1 .byte 0, 0, 2, -11, 120, 22, - 7, 2, 0, 0, 2, -12, 119, 25, - 8, 2 .byte 0, 0, 3, -13, 117, 27, - 8, 2, 0, 0, 3, -13, 116, 29, - 9, 2 .byte 0, 0, 3, -14, 114, 32, -10, 3, 0, 0, 3, -15, 113, 35, -10, 2 .byte 0, 0, 3, -15, 111, 37, -11, 3, 0, 0, 3, -16, 109, 40, -11, 3 .byte 0, 0, 3, -16, 108, 42, -12, 3, 0, 0, 4, -17, 106, 45, -13, 3 .byte 0, 0, 4, -17, 104, 47, -13, 3, 0, 0, 4, -17, 102, 50, -14, 3 .byte 0, 0, 4, -17, 100, 52, -14, 3, 0, 0, 4, -18, 98, 55, -15, 4 .byte 0, 0, 4, -18, 96, 58, -15, 3, 0, 0, 4, -18, 94, 60, -16, 4 .byte 0, 0, 4, -18, 91, 63, -16, 4, 0, 0, 4, -18, 89, 65, -16, 4 .byte 0, 0, 4, -18, 87, 68, -17, 4, 0, 0, 4, -18, 85, 70, -17, 4 .byte 0, 0, 4, -18, 82, 73, -17, 4, 0, 0, 4, -18, 80, 75, -17, 4 .byte 0, 0, 4, -18, 78, 78, -18, 4, 0, 0, 4, -17, 75, 80, -18, 4 .byte 0, 0, 4, -17, 73, 82, -18, 4, 0, 0, 4, -17, 70, 85, -18, 4 .byte 0, 0, 4, -17, 68, 87, -18, 4, 0, 0, 4, -16, 65, 89, -18, 4 .byte 0, 0, 4, -16, 63, 91, -18, 4, 0, 0, 4, -16, 60, 94, -18, 4 .byte 0, 0, 3, -15, 58, 96, -18, 4, 0, 0, 4, -15, 55, 98, -18, 4 .byte 0, 0, 3, -14, 52, 100, -17, 4, 0, 0, 3, -14, 50, 102, -17, 4 .byte 0, 0, 3, -13, 47, 104, -17, 4, 0, 0, 3, -13, 45, 106, -17, 4 .byte 0, 0, 3, -12, 42, 108, -16, 3, 0, 0, 3, -11, 40, 109, -16, 3 .byte 0, 0, 3, -11, 37, 111, -15, 3, 0, 0, 2, -10, 35, 113, -15, 3 .byte 0, 0, 3, -10, 32, 114, -14, 3, 0, 0, 2, - 9, 29, 116, -13, 3 .byte 0, 0, 2, - 8, 27, 117, -13, 3, 0, 0, 2, - 8, 25, 119, -12, 2 .byte 0, 0, 2, - 7, 22, 120, -11, 2, 0, 0, 1, - 6, 20, 121, -10, 2 .byte 0, 0, 1, - 6, 18, 122, - 9, 2, 0, 0, 1, - 5, 15, 123, - 8, 2 .byte 0, 0, 1, - 4, 13, 124, - 7, 1, 0, 0, 1, - 4, 11, 125, - 6, 1 .byte 0, 0, 1, - 3, 8, 126, - 5, 1, 0, 0, 1, - 2, 6, 126, - 4, 1 .byte 0, 0, 0, - 1, 4, 127, - 3, 1, 0, 0, 0, 0, 2, 127, - 1, 0 /* dummy (replicate row index 191) */ .byte 0, 0, 0, 0, 2, 127, - 1, 0 endconst const sm_weights, export=1 .byte 0, 0 /* Unused, because we always offset by bs, which is at least 2. 
*/ .byte 255, 128 /* bs = 2 */ .byte 255, 149, 85, 64 /* bs = 4 */ .byte 255, 197, 146, 105, 73, 50, 37, 32 /* bs = 8 */ .byte 255, 225, 196, 170, 145, 123, 102, 84 /* bs = 16 */ .byte 68, 54, 43, 33, 26, 20, 17, 16 .byte 255, 240, 225, 210, 196, 182, 169, 157 /* bs =32 */ .byte 145, 133, 122, 111, 101, 92, 83, 74 .byte 66, 59, 52, 45, 39, 34, 29, 25 .byte 21, 17, 14, 12, 10, 9, 8, 8 .byte 255, 248, 240, 233, 225, 218, 210, 203 /* bs = 64 */ .byte 196, 189, 182, 176, 169, 163, 156, 150 .byte 144, 138, 133, 127, 121, 116, 111, 106 .byte 101, 96, 91, 86, 82, 77, 73, 69 .byte 65, 61, 57, 54, 50, 47, 44, 41 .byte 38, 35, 32, 29, 27, 25, 22, 20 .byte 18, 16, 15, 13, 12, 10, 9, 8 .byte 7, 6, 6, 5, 5, 4, 4, 4 endconst const obmc_masks, export=1, align=4 .byte 0, 0 /* Unused */ .byte 19, 0 /* 2 */ .byte 25, 14, 5, 0 /* 4 */ .byte 28, 22, 16, 11, 7, 3, 0, 0 /* 8 */ .byte 30, 27, 24, 21, 18, 15, 12, 10 .byte 8, 6, 4, 3, 0, 0, 0, 0 /* 16 */ .byte 31, 29, 28, 26, 24, 23, 21, 20 .byte 19, 17, 16, 14, 13, 12, 11, 9 .byte 8, 7, 6, 5, 4, 4, 3, 2 .byte 0, 0, 0, 0, 0, 0, 0, 0 /* 32 */ endconst av-scenechange-0.14.1/src/asm/x86/ipred16_avx2.asm000064400000000000000000005406761046102023000175100ustar 00000000000000; Copyright © 2021, VideoLAN and dav1d authors ; Copyright © 2021, Two Orioles, LLC ; All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions are met: ; ; 1. Redistributions of source code must retain the above copyright notice, this ; list of conditions and the following disclaimer. ; ; 2. Redistributions in binary form must reproduce the above copyright notice, ; this list of conditions and the following disclaimer in the documentation ; and/or other materials provided with the distribution. ; ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
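; NOTE on the weight tables below (descriptive comment; assumes the standard AV1 sm_weights[] values reproduced by SMOOTH_WEIGHTS):
; the 1-D copy is stored pre-shifted as sm_weights[i] << 7 so that pmulhrsw, which computes (a*b + 0x4000) >> 15 on signed words,
; evaluates to (a*sm_weights[i] + 128) >> 8 in a single instruction. For example, the 50% weight 128 is stored as 128 << 7 = 0x4000,
; and pmulhrsw(x, 0x4000) = (x + 1) >> 1. The 2-D copy is stored as interleaved (w, 256-w) word pairs so that pmaddwd can blend an
; interleaved (top, bottom) or (left, right) pair as w*p0 + (256-w)*p1 with one multiply-add.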
%include "config.asm" %include "src/asm/x86/x86inc.asm" SECTION_RODATA 64 %macro SMOOTH_WEIGHTS 1-* const smooth_weights_1d_16bpc ; sm_weights[] << 7 %rep %0 dw %1*128 %rotate 1 %endrep const smooth_weights_2d_16bpc ; sm_weights[], 256 - sm_weights[] %rep %0 dw %1, 256-%1 %rotate 1 %endrep %endmacro SMOOTH_WEIGHTS 0, 0, 255, 128, 255, 149, 85, 64, \ 255, 197, 146, 105, 73, 50, 37, 32, \ 255, 225, 196, 170, 145, 123, 102, 84, \ 68, 54, 43, 33, 26, 20, 17, 16, \ 255, 240, 225, 210, 196, 182, 169, 157, \ 145, 133, 122, 111, 101, 92, 83, 74, \ 66, 59, 52, 45, 39, 34, 29, 25, \ 21, 17, 14, 12, 10, 9, 8, 8, \ 255, 248, 240, 233, 225, 218, 210, 203, \ 196, 189, 182, 176, 169, 163, 156, 150, \ 144, 138, 133, 127, 121, 116, 111, 106, \ 101, 96, 91, 86, 82, 77, 73, 69, \ 65, 61, 57, 54, 50, 47, 44, 41, \ 38, 35, 32, 29, 27, 25, 22, 20, \ 18, 16, 15, 13, 12, 10, 9, 8, \ 7, 6, 6, 5, 5, 4, 4, 4 %if ARCH_X86_64 ipred_hv_shuf: db 6, 7, 6, 7, 0, 1, 2, 3, 2, 3, 2, 3, 8, 9, 10, 11 db 4, 5, 4, 5, 4, 5, 6, 7, 0, 1, 0, 1, 12, 13, 14, 15 filter_shuf1: db 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 14, 15, 12, 13, -1, -1 filter_shuf2: db 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 2, 3, -1, -1 filter_shuf3: db 12, 13, 0, 1, 2, 3, 4, 5, 6, 7, 10, 11, 8, 9, -1, -1 pal_pred_shuf: db 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 z_base_inc: dw 0*64, 1*64, 2*64, 3*64, 4*64, 5*64, 6*64, 7*64 dw 8*64, 9*64, 10*64, 11*64, 12*64, 13*64, 14*64, 15*64 z_filter_t0: db 55,127, 39,127, 39,127, 7, 15, 31, 7, 15, 31, 0, 3, 31, 0 z_filter_t1: db 39, 63, 19, 47, 19, 47, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0 z_filter_wh: db 7, 7, 11, 11, 15, 15, 19, 19, 19, 23, 23, 23, 31, 31, 31, 39 db 39, 39, 47, 47, 47, 63, 63, 63, 79, 79, 79, -1 pw_m1024: times 2 dw -1024 pw_1to16: dw 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 pw_16to1: dw 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1 z2_ymul: dw 1, 2, 1, 2, 1, 2, 1, 2, 3, 4, 3, 4, 3, 4, 3, 4 z2_ymul8: dw 1, 2, 5, 6, 3, 4, 7, 8, 5, 6, 16, 16, 7, 8 pb_90: times 4 db 90 z2_y_shuf_h4: dd 3, 7, 2, 6, 1, 5, 0, 4 z_upsample: db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 z2_x_shuf: db 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9 z2_y_shuf: db 6, 7, 14, 15, 4, 5, 12, 13, 4, 5, 12, 13, 2, 3, 10, 11 z2_y_shuf_us: db 6, 7, 14, 15, 2, 3, 10, 11, 4, 5, 12, 13, 0, 1, 8, 9 z_filter_k: dw 4, 4, 5, 5, 4, 4 dw 8, 8, 6, 6, 4, 4 dw 0, 0, 0, 0, 2, 2 %define pw_2 (z_filter_k+32) %define pw_4 (z_filter_k+ 0) %define pw_16 (z2_ymul8 +20) pw_1: times 2 dw 1 pw_3: times 2 dw 3 pw_62: times 2 dw 62 pw_512: times 2 dw 512 pw_2048: times 2 dw 2048 pd_8: dd 8 %macro JMP_TABLE 3-* %xdefine %1_%2_table (%%table - 2*4) %xdefine %%base mangle(private_prefix %+ _%1_%2) %%table: %rep %0 - 2 dd %%base %+ .%3 - (%%table - 2*4) %rotate 1 %endrep %endmacro %define ipred_dc_splat_16bpc_avx2_table (ipred_dc_16bpc_avx2_table + 10*4) %define ipred_cfl_splat_16bpc_avx2_table (ipred_cfl_16bpc_avx2_table + 8*4) JMP_TABLE ipred_dc_16bpc, avx2, h4, h8, h16, h32, h64, w4, w8, w16, w32, w64, \ s4-10*4, s8-10*4, s16-10*4, s32-10*4, s64-10*4 JMP_TABLE ipred_dc_left_16bpc, avx2, h4, h8, h16, h32, h64 JMP_TABLE ipred_h_16bpc, avx2, w4, w8, w16, w32, w64 JMP_TABLE ipred_paeth_16bpc, avx2, w4, w8, w16, w32, w64 JMP_TABLE ipred_smooth_16bpc, avx2, w4, w8, w16, w32, w64 JMP_TABLE ipred_smooth_h_16bpc, avx2, w4, w8, w16, w32, w64 JMP_TABLE ipred_smooth_v_16bpc, avx2, w4, w8, w16, w32, w64 JMP_TABLE ipred_z1_16bpc, avx2, w4, w8, w16, w32, w64 JMP_TABLE ipred_z2_16bpc, avx2, w4, w8, w16, w32, w64 JMP_TABLE ipred_z3_16bpc, avx2, h4, h8, h16, 
h32, h64 JMP_TABLE ipred_filter_16bpc, avx2, w4, w8, w16, w32 JMP_TABLE ipred_cfl_16bpc, avx2, h4, h8, h16, h32, w4, w8, w16, w32, \ s4-8*4, s8-8*4, s16-8*4, s32-8*4 JMP_TABLE ipred_cfl_left_16bpc, avx2, h4, h8, h16, h32 JMP_TABLE ipred_cfl_ac_444_16bpc, avx2, w4, w8, w16, w32 JMP_TABLE pal_pred_16bpc, avx2, w4, w8, w16, w32, w64 cextern dr_intra_derivative cextern filter_intra_taps SECTION .text INIT_YMM avx2 cglobal ipred_dc_top_16bpc, 3, 7, 6, dst, stride, tl, w, h movifnidn hd, hm add tlq, 2 movd xm4, wd pxor xm3, xm3 pavgw xm4, xm3 tzcnt wd, wd movd xm5, wd movu m0, [tlq] lea r5, [ipred_dc_left_16bpc_avx2_table] movsxd r6, [r5+wq*4] add r6, r5 add r5, ipred_dc_splat_16bpc_avx2_table-ipred_dc_left_16bpc_avx2_table movsxd wq, [r5+wq*4] add wq, r5 jmp r6 cglobal ipred_dc_left_16bpc, 3, 7, 6, dst, stride, tl, w, h, stride3 mov hd, hm sub tlq, hq movd xm4, hd sub tlq, hq pxor xm3, xm3 pavgw xm4, xm3 tzcnt r6d, hd movd xm5, r6d movu m0, [tlq] lea r5, [ipred_dc_left_16bpc_avx2_table] movsxd r6, [r5+r6*4] add r6, r5 add r5, ipred_dc_splat_16bpc_avx2_table-ipred_dc_left_16bpc_avx2_table tzcnt wd, wd movsxd wq, [r5+wq*4] add wq, r5 jmp r6 .h64: paddw m0, [tlq+96] paddw m0, [tlq+64] .h32: paddw m0, [tlq+32] .h16: vextracti128 xm1, m0, 1 paddw xm0, xm1 .h8: psrldq xm1, xm0, 8 paddw xm0, xm1 .h4: punpcklwd xm0, xm3 psrlq xm1, xm0, 32 paddd xm0, xm1 psrldq xm1, xm0, 8 paddd xm0, xm1 paddd xm0, xm4 psrld xm0, xm5 lea stride3q, [strideq*3] vpbroadcastw m0, xm0 mova m1, m0 mova m2, m0 mova m3, m0 jmp wq cglobal ipred_dc_16bpc, 3, 7, 6, dst, stride, tl, w, h, stride3 movifnidn hd, hm tzcnt r6d, hd lea r5d, [wq+hq] movd xm4, r5d tzcnt r5d, r5d movd xm5, r5d lea r5, [ipred_dc_16bpc_avx2_table] tzcnt wd, wd movsxd r6, [r5+r6*4] movsxd wq, [r5+wq*4+5*4] pxor m3, m3 psrlw xm4, 1 add r6, r5 add wq, r5 lea stride3q, [strideq*3] jmp r6 .h4: movq xm0, [tlq-8] jmp wq .w4: movq xm1, [tlq+2] paddw m0, m4 paddw m0, m1 psrlq m1, m0, 32 paddw m0, m1 psrld m1, m0, 16 paddw m0, m1 cmp hd, 4 jg .w4_mul psrlw xm0, 3 jmp .w4_end .w4_mul: vextracti128 xm1, m0, 1 paddw xm0, xm1 lea r2d, [hq*2] mov r6d, 0xAAAB6667 shrx r6d, r6d, r2d punpckhwd xm1, xm0, xm3 punpcklwd xm0, xm3 paddd xm0, xm1 movd xm1, r6d psrld xm0, 2 pmulhuw xm0, xm1 psrlw xm0, 1 .w4_end: vpbroadcastw xm0, xm0 .s4: movq [dstq+strideq*0], xm0 movq [dstq+strideq*1], xm0 movq [dstq+strideq*2], xm0 movq [dstq+stride3q ], xm0 lea dstq, [dstq+strideq*4] sub hd, 4 jg .s4 RET ALIGN function_align .h8: mova xm0, [tlq-16] jmp wq .w8: vextracti128 xm1, m0, 1 paddw xm0, [tlq+2] paddw xm0, xm4 paddw xm0, xm1 psrld xm1, xm0, 16 paddw xm0, xm1 pblendw xm0, xm3, 0xAA psrlq xm1, xm0, 32 paddd xm0, xm1 psrldq xm1, xm0, 8 paddd xm0, xm1 psrld xm0, xm5 cmp hd, 8 je .w8_end mov r6d, 0xAAAB mov r2d, 0x6667 cmp hd, 32 cmovz r6d, r2d movd xm1, r6d pmulhuw xm0, xm1 psrlw xm0, 1 .w8_end: vpbroadcastw xm0, xm0 .s8: mova [dstq+strideq*0], xm0 mova [dstq+strideq*1], xm0 mova [dstq+strideq*2], xm0 mova [dstq+stride3q ], xm0 lea dstq, [dstq+strideq*4] sub hd, 4 jg .s8 RET ALIGN function_align .h16: mova m0, [tlq-32] jmp wq .w16: paddw m0, [tlq+2] vextracti128 xm1, m0, 1 paddw xm0, xm4 paddw xm0, xm1 punpckhwd xm1, xm0, xm3 punpcklwd xm0, xm3 paddd xm0, xm1 psrlq xm1, xm0, 32 paddd xm0, xm1 psrldq xm1, xm0, 8 paddd xm0, xm1 psrld xm0, xm5 cmp hd, 16 je .w16_end mov r6d, 0xAAAB mov r2d, 0x6667 test hb, 8|32 cmovz r6d, r2d movd xm1, r6d pmulhuw xm0, xm1 psrlw xm0, 1 .w16_end: vpbroadcastw m0, xm0 .s16: mova [dstq+strideq*0], m0 mova [dstq+strideq*1], m0 mova [dstq+strideq*2], m0 mova 
[dstq+stride3q ], m0 lea dstq, [dstq+strideq*4] sub hd, 4 jg .s16 RET ALIGN function_align .h32: mova m0, [tlq-64] paddw m0, [tlq-32] jmp wq .w32: paddw m0, [tlq+ 2] paddw m0, [tlq+34] vextracti128 xm1, m0, 1 paddw xm0, xm4 paddw xm0, xm1 punpcklwd xm1, xm0, xm3 punpckhwd xm0, xm3 paddd xm0, xm1 psrlq xm1, xm0, 32 paddd xm0, xm1 psrldq xm1, xm0, 8 paddd xm0, xm1 psrld xm0, xm5 cmp hd, 32 je .w32_end lea r2d, [hq*2] mov r6d, 0x6667AAAB shrx r6d, r6d, r2d movd xm1, r6d pmulhuw xm0, xm1 psrlw xm0, 1 .w32_end: vpbroadcastw m0, xm0 mova m1, m0 .s32: mova [dstq+strideq*0+32*0], m0 mova [dstq+strideq*0+32*1], m1 mova [dstq+strideq*1+32*0], m0 mova [dstq+strideq*1+32*1], m1 mova [dstq+strideq*2+32*0], m0 mova [dstq+strideq*2+32*1], m1 mova [dstq+stride3q +32*0], m0 mova [dstq+stride3q +32*1], m1 lea dstq, [dstq+strideq*4] sub hd, 4 jg .s32 RET ALIGN function_align .h64: mova m0, [tlq-128] mova m1, [tlq- 96] paddw m0, [tlq- 64] paddw m1, [tlq- 32] paddw m0, m1 jmp wq .w64: movu m1, [tlq+ 2] paddw m0, [tlq+34] paddw m1, [tlq+66] paddw m0, [tlq+98] paddw m0, m1 vextracti128 xm1, m0, 1 paddw xm0, xm1 punpcklwd xm1, xm0, xm3 punpckhwd xm0, xm3 paddd xm1, xm4 paddd xm0, xm1 psrlq xm1, xm0, 32 paddd xm0, xm1 psrldq xm1, xm0, 8 paddd xm0, xm1 psrld xm0, xm5 cmp hd, 64 je .w64_end mov r6d, 0x6667AAAB shrx r6d, r6d, hd movd xm1, r6d pmulhuw xm0, xm1 psrlw xm0, 1 .w64_end: vpbroadcastw m0, xm0 mova m1, m0 mova m2, m0 mova m3, m0 .s64: mova [dstq+strideq*0+32*0], m0 mova [dstq+strideq*0+32*1], m1 mova [dstq+strideq*0+32*2], m2 mova [dstq+strideq*0+32*3], m3 mova [dstq+strideq*1+32*0], m0 mova [dstq+strideq*1+32*1], m1 mova [dstq+strideq*1+32*2], m2 mova [dstq+strideq*1+32*3], m3 lea dstq, [dstq+strideq*2] sub hd, 2 jg .s64 RET cglobal ipred_dc_128_16bpc, 2, 7, 6, dst, stride, tl, w, h, stride3 mov r6d, r8m shr r6d, 11 lea r5, [ipred_dc_splat_16bpc_avx2_table] tzcnt wd, wd movifnidn hd, hm movsxd wq, [r5+wq*4] vpbroadcastd m0, [r5-ipred_dc_splat_16bpc_avx2_table+pw_512+r6*4] mova m1, m0 mova m2, m0 mova m3, m0 add wq, r5 lea stride3q, [strideq*3] jmp wq cglobal ipred_v_16bpc, 3, 7, 6, dst, stride, tl, w, h, stride3 movifnidn hd, hm movu m0, [tlq+ 2] movu m1, [tlq+34] movu m2, [tlq+66] movu m3, [tlq+98] lea r5, [ipred_dc_splat_16bpc_avx2_table] tzcnt wd, wd movsxd wq, [r5+wq*4] add wq, r5 lea stride3q, [strideq*3] jmp wq %macro IPRED_H 2 ; w, store_type vpbroadcastw m0, [tlq-2] vpbroadcastw m1, [tlq-4] vpbroadcastw m2, [tlq-6] vpbroadcastw m3, [tlq-8] sub tlq, 8 mov%2 [dstq+strideq*0], m0 mov%2 [dstq+strideq*1], m1 mov%2 [dstq+strideq*2], m2 mov%2 [dstq+stride3q ], m3 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w%1 RET ALIGN function_align %endmacro cglobal ipred_h_16bpc, 3, 6, 4, dst, stride, tl, w, h, stride3 movifnidn hd, hm lea r5, [ipred_h_16bpc_avx2_table] tzcnt wd, wd movsxd wq, [r5+wq*4] add wq, r5 lea stride3q, [strideq*3] jmp wq INIT_XMM avx2 .w4: IPRED_H 4, q .w8: IPRED_H 8, a INIT_YMM avx2 .w16: IPRED_H 16, a .w32: vpbroadcastw m0, [tlq-2] vpbroadcastw m1, [tlq-4] vpbroadcastw m2, [tlq-6] vpbroadcastw m3, [tlq-8] sub tlq, 8 mova [dstq+strideq*0+32*0], m0 mova [dstq+strideq*0+32*1], m0 mova [dstq+strideq*1+32*0], m1 mova [dstq+strideq*1+32*1], m1 mova [dstq+strideq*2+32*0], m2 mova [dstq+strideq*2+32*1], m2 mova [dstq+stride3q +32*0], m3 mova [dstq+stride3q +32*1], m3 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w32 RET .w64: vpbroadcastw m0, [tlq-2] vpbroadcastw m1, [tlq-4] sub tlq, 4 mova [dstq+strideq*0+32*0], m0 mova [dstq+strideq*0+32*1], m0 mova [dstq+strideq*0+32*2], m0 mova [dstq+strideq*0+32*3], m0 
mova [dstq+strideq*1+32*0], m1 mova [dstq+strideq*1+32*1], m1 mova [dstq+strideq*1+32*2], m1 mova [dstq+strideq*1+32*3], m1 lea dstq, [dstq+strideq*2] sub hd, 2 jg .w64 RET %macro PAETH 3 ; top, signed_ldiff, ldiff paddw m0, m%2, m1 psubw m7, m3, m0 ; tldiff psubw m0, m%1 ; tdiff pabsw m7, m7 pabsw m0, m0 pminsw m7, m0 pcmpeqw m0, m7 pcmpgtw m7, m%3, m7 vpblendvb m0, m3, m%1, m0 vpblendvb m0, m1, m0, m7 %endmacro cglobal ipred_paeth_16bpc, 3, 6, 8, dst, stride, tl, w, h %define base r5-ipred_paeth_16bpc_avx2_table movifnidn hd, hm lea r5, [ipred_paeth_16bpc_avx2_table] tzcnt wd, wd movsxd wq, [r5+wq*4] vpbroadcastw m3, [tlq] ; topleft add wq, r5 jmp wq .w4: vpbroadcastq m2, [tlq+2] ; top movsldup m6, [base+ipred_hv_shuf] lea r3, [strideq*3] psubw m4, m2, m3 pabsw m5, m4 .w4_loop: sub tlq, 8 vpbroadcastq m1, [tlq] pshufb m1, m6 ; left PAETH 2, 4, 5 vextracti128 xm1, m0, 1 movq [dstq+strideq*0], xm0 movq [dstq+strideq*1], xm1 movhps [dstq+strideq*2], xm0 movhps [dstq+r3 ], xm1 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w4_loop RET ALIGN function_align .w8: vbroadcasti128 m2, [tlq+2] movsldup m6, [base+ipred_hv_shuf] psubw m4, m2, m3 pabsw m5, m4 .w8_loop: sub tlq, 4 vpbroadcastd m1, [tlq] pshufb m1, m6 PAETH 2, 4, 5 mova [dstq+strideq*0], xm0 vextracti128 [dstq+strideq*1], m0, 1 lea dstq, [dstq+strideq*2] sub hd, 2 jg .w8_loop RET ALIGN function_align .w16: movu m2, [tlq+2] psubw m4, m2, m3 pabsw m5, m4 .w16_loop: sub tlq, 2 vpbroadcastw m1, [tlq] PAETH 2, 4, 5 mova [dstq], m0 add dstq, strideq dec hd jg .w16_loop RET ALIGN function_align .w32: movu m2, [tlq+2] movu m6, [tlq+34] %if WIN64 movaps r4m, xmm8 movaps r6m, xmm9 %endif psubw m4, m2, m3 psubw m8, m6, m3 pabsw m5, m4 pabsw m9, m8 .w32_loop: sub tlq, 2 vpbroadcastw m1, [tlq] PAETH 2, 4, 5 mova [dstq+32*0], m0 PAETH 6, 8, 9 mova [dstq+32*1], m0 add dstq, strideq dec hd jg .w32_loop %if WIN64 movaps xmm8, r4m movaps xmm9, r6m %endif RET ALIGN function_align .w64: WIN64_SPILL_XMM 16 movu m2, [tlq+ 2] movu m6, [tlq+34] movu m10, [tlq+66] movu m13, [tlq+98] psubw m4, m2, m3 psubw m8, m6, m3 psubw m11, m10, m3 psubw m14, m13, m3 pabsw m5, m4 pabsw m9, m8 pabsw m12, m11 pabsw m15, m14 .w64_loop: sub tlq, 2 vpbroadcastw m1, [tlq] PAETH 2, 4, 5 mova [dstq+32*0], m0 PAETH 6, 8, 9 mova [dstq+32*1], m0 PAETH 10, 11, 12 mova [dstq+32*2], m0 PAETH 13, 14, 15 mova [dstq+32*3], m0 add dstq, strideq dec hd jg .w64_loop RET cglobal ipred_smooth_v_16bpc, 3, 7, 6, dst, stride, tl, w, h, weights %define base r6-ipred_smooth_v_16bpc_avx2_table lea r6, [ipred_smooth_v_16bpc_avx2_table] tzcnt wd, wm mov hd, hm movsxd wq, [r6+wq*4] lea weightsq, [base+smooth_weights_1d_16bpc+hq*4] neg hq vpbroadcastw m5, [tlq+hq*2] ; bottom add wq, r6 jmp wq .w4: vpbroadcastq m4, [tlq+2] ; top movsldup m3, [base+ipred_hv_shuf] lea r6, [strideq*3] psubw m4, m5 ; top - bottom .w4_loop: vpbroadcastq m0, [weightsq+hq*2] pshufb m0, m3 pmulhrsw m0, m4 paddw m0, m5 vextracti128 xm1, m0, 1 movhps [dstq+strideq*0], xm1 movhps [dstq+strideq*1], xm0 movq [dstq+strideq*2], xm1 movq [dstq+r6 ], xm0 lea dstq, [dstq+strideq*4] add hq, 4 jl .w4_loop .ret: RET .w8: vbroadcasti128 m4, [tlq+2] movsldup m3, [base+ipred_hv_shuf] lea r6, [strideq*3] psubw m4, m5 .w8_loop: vpbroadcastd m0, [weightsq+hq*2+0] vpbroadcastd m1, [weightsq+hq*2+4] pshufb m0, m3 pshufb m1, m3 pmulhrsw m0, m4 pmulhrsw m1, m4 paddw m0, m5 paddw m1, m5 vextracti128 [dstq+strideq*0], m0, 1 mova [dstq+strideq*1], xm0 vextracti128 [dstq+strideq*2], m1, 1 mova [dstq+r6 ], xm1 lea dstq, [dstq+strideq*4] add hq, 4 jl .w8_loop RET 
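; NOTE (descriptive comment): each SMOOTH_V row blends the top row with the sample the code labels "bottom" (the bottom-left
; neighbour broadcast into m5). With the per-column difference d = top[x] - bottom precomputed (m4 above), pmulhrsw against the
; pre-shifted 1-D weight gives (d * sm_weights[y] + 128) >> 8, and adding m5 back yields
; (sm_weights[y]*top[x] + (256 - sm_weights[y])*bottom + 128) >> 8, i.e. the AV1 SMOOTH_V blend. The wider block sizes below
; repeat the same structure with more delta registers.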
.w16: movu m4, [tlq+2] lea r6, [strideq*3] psubw m4, m5 .w16_loop: vpbroadcastw m0, [weightsq+hq*2+0] vpbroadcastw m1, [weightsq+hq*2+2] vpbroadcastw m2, [weightsq+hq*2+4] vpbroadcastw m3, [weightsq+hq*2+6] REPX {pmulhrsw x, m4}, m0, m1, m2, m3 REPX {paddw x, m5}, m0, m1, m2, m3 mova [dstq+strideq*0], m0 mova [dstq+strideq*1], m1 mova [dstq+strideq*2], m2 mova [dstq+r6 ], m3 lea dstq, [dstq+strideq*4] add hq, 4 jl .w16_loop RET .w32: WIN64_SPILL_XMM 7 movu m4, [tlq+ 2] movu m6, [tlq+34] psubw m4, m5 psubw m6, m5 .w32_loop: vpbroadcastw m1, [weightsq+hq*2+0] vpbroadcastw m3, [weightsq+hq*2+2] pmulhrsw m0, m4, m1 pmulhrsw m1, m6 pmulhrsw m2, m4, m3 pmulhrsw m3, m6 REPX {paddw x, m5}, m0, m1, m2, m3 mova [dstq+strideq*0+32*0], m0 mova [dstq+strideq*0+32*1], m1 mova [dstq+strideq*1+32*0], m2 mova [dstq+strideq*1+32*1], m3 lea dstq, [dstq+strideq*2] add hq, 2 jl .w32_loop RET .w64: WIN64_SPILL_XMM 8 movu m3, [tlq+ 2] movu m4, [tlq+34] movu m6, [tlq+66] movu m7, [tlq+98] REPX {psubw x, m5}, m3, m4, m6, m7 .w64_loop: vpbroadcastw m2, [weightsq+hq*2] pmulhrsw m0, m3, m2 pmulhrsw m1, m4, m2 paddw m0, m5 paddw m1, m5 mova [dstq+32*0], m0 pmulhrsw m0, m6, m2 mova [dstq+32*1], m1 pmulhrsw m1, m7, m2 paddw m0, m5 paddw m1, m5 mova [dstq+32*2], m0 mova [dstq+32*3], m1 add dstq, strideq inc hq jl .w64_loop RET cglobal ipred_smooth_h_16bpc, 3, 7, 6, dst, stride, tl, w, h, stride3 %define base r6-ipred_smooth_h_16bpc_avx2_table lea r6, [ipred_smooth_h_16bpc_avx2_table] mov wd, wm movifnidn hd, hm vpbroadcastw m5, [tlq+wq*2] ; right tzcnt wd, wd add hd, hd movsxd wq, [r6+wq*4] sub tlq, hq lea stride3q, [strideq*3] add wq, r6 jmp wq .w4: vpbroadcastq m4, [base+smooth_weights_1d_16bpc+4*2] movsldup m3, [base+ipred_hv_shuf] .w4_loop: vpbroadcastq m0, [tlq+hq-8] ; left pshufb m0, m3 psubw m0, m5 ; left - right pmulhrsw m0, m4 paddw m0, m5 vextracti128 xm1, m0, 1 movq [dstq+strideq*0], xm0 movq [dstq+strideq*1], xm1 movhps [dstq+strideq*2], xm0 movhps [dstq+stride3q ], xm1 lea dstq, [dstq+strideq*4] sub hd, 4*2 jg .w4_loop RET .w8: vbroadcasti128 m4, [base+smooth_weights_1d_16bpc+8*2] movsldup m3, [base+ipred_hv_shuf] .w8_loop: vpbroadcastd m0, [tlq+hq-4] vpbroadcastd m1, [tlq+hq-8] pshufb m0, m3 pshufb m1, m3 psubw m0, m5 psubw m1, m5 pmulhrsw m0, m4 pmulhrsw m1, m4 paddw m0, m5 paddw m1, m5 mova [dstq+strideq*0], xm0 vextracti128 [dstq+strideq*1], m0, 1 mova [dstq+strideq*2], xm1 vextracti128 [dstq+stride3q ], m1, 1 lea dstq, [dstq+strideq*4] sub hq, 4*2 jg .w8_loop RET .w16: movu m4, [base+smooth_weights_1d_16bpc+16*2] .w16_loop: vpbroadcastq m3, [tlq+hq-8] punpcklwd m3, m3 psubw m3, m5 pshufd m0, m3, q3333 pshufd m1, m3, q2222 pshufd m2, m3, q1111 pshufd m3, m3, q0000 REPX {pmulhrsw x, m4}, m0, m1, m2, m3 REPX {paddw x, m5}, m0, m1, m2, m3 mova [dstq+strideq*0], m0 mova [dstq+strideq*1], m1 mova [dstq+strideq*2], m2 mova [dstq+stride3q ], m3 lea dstq, [dstq+strideq*4] sub hq, 4*2 jg .w16_loop RET .w32: WIN64_SPILL_XMM 7 movu m4, [base+smooth_weights_1d_16bpc+32*2] movu m6, [base+smooth_weights_1d_16bpc+32*3] .w32_loop: vpbroadcastw m1, [tlq+hq-2] vpbroadcastw m3, [tlq+hq-4] psubw m1, m5 psubw m3, m5 pmulhrsw m0, m4, m1 pmulhrsw m1, m6 pmulhrsw m2, m4, m3 pmulhrsw m3, m6 REPX {paddw x, m5}, m0, m1, m2, m3 mova [dstq+strideq*0+32*0], m0 mova [dstq+strideq*0+32*1], m1 mova [dstq+strideq*1+32*0], m2 mova [dstq+strideq*1+32*1], m3 lea dstq, [dstq+strideq*2] sub hq, 2*2 jg .w32_loop RET .w64: WIN64_SPILL_XMM 8 movu m3, [base+smooth_weights_1d_16bpc+32*4] movu m4, [base+smooth_weights_1d_16bpc+32*5] movu m6, 
[base+smooth_weights_1d_16bpc+32*6] movu m7, [base+smooth_weights_1d_16bpc+32*7] .w64_loop: vpbroadcastw m2, [tlq+hq-2] psubw m2, m5 pmulhrsw m0, m3, m2 pmulhrsw m1, m4, m2 paddw m0, m5 paddw m1, m5 mova [dstq+32*0], m0 pmulhrsw m0, m6, m2 mova [dstq+32*1], m1 pmulhrsw m1, m7, m2 paddw m0, m5 paddw m1, m5 mova [dstq+32*2], m0 mova [dstq+32*3], m1 add dstq, strideq sub hq, 1*2 jg .w64_loop RET %macro SMOOTH_2D_END 6 ; src[1-2], mul[1-2], add[1-2] pmaddwd m0, m%1, m%3 pmaddwd m1, m%2, m%4 paddd m0, m%5 paddd m1, m%6 psrld m0, 8 psrld m1, 8 packssdw m0, m1 pavgw m0, m5 %endmacro cglobal ipred_smooth_16bpc, 3, 7, 6, dst, stride, tl, w, h, v_weights %define base r6-ipred_smooth_16bpc_avx2_table lea r6, [ipred_smooth_16bpc_avx2_table] mov wd, wm vpbroadcastw m4, [tlq+wq*2] ; right tzcnt wd, wd mov hd, hm sub tlq, hq sub tlq, hq movsxd wq, [r6+wq*4] pxor m5, m5 add wq, r6 lea v_weightsq, [base+smooth_weights_2d_16bpc+hq*4] jmp wq .w4: WIN64_SPILL_XMM 11 vpbroadcastw m0, [tlq] ; bottom vpbroadcastq m6, [tlq+hq*2+2] movsldup m7, [base+ipred_hv_shuf] movshdup m9, [base+ipred_hv_shuf] vbroadcasti128 m10, [base+smooth_weights_2d_16bpc+4*4] punpcklwd m6, m0 ; top, bottom punpcklqdq m8, m9, m9 punpckhqdq m9, m9 lea r3, [strideq*3] .w4_loop: vpbroadcastq m3, [tlq+hq*2-8] vbroadcasti128 m1, [v_weightsq] pshufb m3, m7 punpcklwd m2, m3, m4 ; left, right punpckhwd m3, m4 pmaddwd m2, m10 pmaddwd m3, m10 pshufb m0, m1, m8 pshufb m1, m9 SMOOTH_2D_END 0, 1, 6, 6, 2, 3 vextracti128 xm1, m0, 1 movq [dstq+strideq*0], xm0 movq [dstq+strideq*1], xm1 movhps [dstq+strideq*2], xm0 movhps [dstq+r3 ], xm1 lea dstq, [dstq+strideq*4] add v_weightsq, 16 sub hd, 4 jg .w4_loop RET .w8: %assign stack_offset stack_offset - stack_size_padded WIN64_SPILL_XMM 12 vpbroadcastw m0, [tlq] ; bottom vbroadcasti128 m7, [tlq+hq*2+2] movsldup m8, [base+ipred_hv_shuf] movshdup m9, [base+ipred_hv_shuf] vbroadcasti128 m10, [base+smooth_weights_2d_16bpc+8*4+16*0] vbroadcasti128 m11, [base+smooth_weights_2d_16bpc+8*4+16*1] punpcklwd m6, m7, m0 ; top, bottom punpckhwd m7, m0 .w8_loop: vpbroadcastd m3, [tlq+hq*2-4] vpbroadcastq m1, [v_weightsq] pshufb m3, m8 punpcklwd m2, m3, m4 ; left, right punpckhwd m3, m4 pmaddwd m2, m10 pmaddwd m3, m11 pshufb m1, m9 SMOOTH_2D_END 1, 1, 6, 7, 2, 3 mova [dstq+strideq*0], xm0 vextracti128 [dstq+strideq*1], m0, 1 lea dstq, [dstq+strideq*2] add v_weightsq, 8 sub hd, 2 jg .w8_loop RET .w16: %assign stack_offset stack_offset - stack_size_padded WIN64_SPILL_XMM 11 vpbroadcastw m0, [tlq] ; bottom movu m7, [tlq+hq*2+2] mova xm8, [base+smooth_weights_2d_16bpc+16*4+16*0] mova xm9, [base+smooth_weights_2d_16bpc+16*4+16*1] vinserti128 m8, [base+smooth_weights_2d_16bpc+16*4+16*2], 1 vinserti128 m9, [base+smooth_weights_2d_16bpc+16*4+16*3], 1 punpcklwd m6, m7, m0 ; top, bottom punpckhwd m7, m0 .w16_loop: vpbroadcastd m3, [tlq+hq*2-4] vpbroadcastd m1, [v_weightsq+0] punpcklwd m3, m4 ; left, right pshufd m2, m3, q1111 pmaddwd m10, m8, m2 pmaddwd m2, m9 pshufd m3, m3, q0000 SMOOTH_2D_END 1, 1, 6, 7, 10, 2 vpbroadcastd m1, [v_weightsq+4] pmaddwd m2, m8, m3 pmaddwd m3, m9 mova [dstq+strideq*0], m0 SMOOTH_2D_END 1, 1, 6, 7, 2, 3 mova [dstq+strideq*1], m0 lea dstq, [dstq+strideq*2] add v_weightsq, 8 sub hq, 2 jg .w16_loop RET .w32: %assign stack_offset stack_offset - stack_size_padded WIN64_SPILL_XMM 15 vpbroadcastw m0, [tlq] ; bottom movu m7, [tlq+hq*2+ 2] movu m9, [tlq+hq*2+34] mova xm10, [base+smooth_weights_2d_16bpc+32*4+16*0] mova xm11, [base+smooth_weights_2d_16bpc+32*4+16*1] vinserti128 m10, 
[base+smooth_weights_2d_16bpc+32*4+16*2], 1 vinserti128 m11, [base+smooth_weights_2d_16bpc+32*4+16*3], 1 mova xm12, [base+smooth_weights_2d_16bpc+32*4+16*4] mova xm13, [base+smooth_weights_2d_16bpc+32*4+16*5] vinserti128 m12, [base+smooth_weights_2d_16bpc+32*4+16*6], 1 vinserti128 m13, [base+smooth_weights_2d_16bpc+32*4+16*7], 1 punpcklwd m6, m7, m0 punpckhwd m7, m0 punpcklwd m8, m9, m0 punpckhwd m9, m0 .w32_loop: vpbroadcastw m3, [tlq+hq*2-2] vpbroadcastd m14, [v_weightsq] punpcklwd m3, m4 pmaddwd m1, m10, m3 pmaddwd m2, m11, m3 pmaddwd m0, m6, m14 paddd m0, m1 pmaddwd m1, m7, m14 paddd m1, m2 pmaddwd m2, m12, m3 pmaddwd m3, m13 psrld m0, 8 psrld m1, 8 packssdw m0, m1 pavgw m0, m5 mova [dstq+32*0], m0 SMOOTH_2D_END 14, 14, 8, 9, 2, 3 mova [dstq+32*1], m0 add dstq, strideq add v_weightsq, 4 dec hd jg .w32_loop RET .w64: %assign stack_offset stack_offset - stack_size_padded PROLOGUE 0, 11, 16, dst, stride, tl, tl_base, h, v_weights, dummy, v_weights_base, x, y, dst_base mov dst_baseq, dstq mov tl_baseq, tlq mov v_weights_baseq, v_weightsq xor xq, xq .w64_loop_x: mov yq, hq lea tlq, [tl_baseq+hq*2] vpbroadcastw m0, [tl_baseq] ; bottom movu m7, [tlq+xq*2+ 2] movu m9, [tlq+xq*2+34] mova xm10, [base+smooth_weights_2d_16bpc+64*4+16*0] mova xm11, [base+smooth_weights_2d_16bpc+64*4+16*1] vinserti128 m10, [base+smooth_weights_2d_16bpc+64*4+16*2], 1 vinserti128 m11, [base+smooth_weights_2d_16bpc+64*4+16*3], 1 mova xm12, [base+smooth_weights_2d_16bpc+64*4+16*4] mova xm13, [base+smooth_weights_2d_16bpc+64*4+16*5] vinserti128 m12, [base+smooth_weights_2d_16bpc+64*4+16*6], 1 vinserti128 m13, [base+smooth_weights_2d_16bpc+64*4+16*7], 1 punpcklwd m6, m7, m0 punpckhwd m7, m0 punpcklwd m8, m9, m0 punpckhwd m9, m0 lea tlq, [tl_baseq-2] .w64_loop_y: vpbroadcastw m3, [tlq+yq*2] vpbroadcastd m1, [v_weightsq] punpcklwd m3, m4 pmaddwd m14, m10, m3 pmaddwd m15, m11, m3 pmaddwd m2, m12, m3 pmaddwd m3, m13 pmaddwd m0, m6, m1 paddd m0, m14 pmaddwd m14, m7, m1 paddd m14, m15 psrld m0, 8 psrld m14, 8 packssdw m0, m14 pavgw m0, m5 mova [dstq+32*0], m0 SMOOTH_2D_END 8, 9, 1, 1, 2, 3 mova [dstq+32*1], m0 add dstq, strideq add v_weightsq, 4 dec yq jg .w64_loop_y lea dstq, [dst_baseq+32*2] add r6, 16*8 mov v_weightsq, v_weights_baseq add xq, 32 test xb, 64 jz .w64_loop_x RET cglobal ipred_z1_16bpc, 3, 8, 0, dst, stride, tl, w, h, angle, dx, maxbase %assign org_stack_offset stack_offset lea r6, [ipred_z1_16bpc_avx2_table] tzcnt wd, wm movifnidn angled, anglem movifnidn hd, hm lea r7, [dr_intra_derivative] movsxd wq, [r6+wq*4] add tlq, 2 add wq, r6 mov dxd, angled and dxd, 0x7e add angled, 165 ; ~90 movzx dxd, word [r7+dxq] xor angled, 0x4ff ; d = 90 - angle vpbroadcastd m5, [pw_62] jmp wq .w4: ALLOC_STACK -64, 7 cmp angleb, 40 jae .w4_no_upsample lea r3d, [angleq-1024] sar r3d, 7 add r3d, hd jg .w4_no_upsample ; !enable_intra_edge_filter || h > 8 || (h == 8 && is_sm) vpbroadcastw xm3, [tlq+14] movu xm1, [tlq+ 0] ; 1 2 3 4 5 6 7 8 palignr xm0, xm3, xm1, 4 ; 3 4 5 6 7 8 8 8 paddw xm0, [tlq- 2] ; 0 1 2 3 4 5 6 7 add dxd, dxd palignr xm2, xm3, xm1, 2 ; 2 3 4 5 6 7 8 8 paddw xm2, xm1 ; -1 * a + 9 * b + 9 * c + -1 * d psubw xm0, xm2, xm0 ; = (b + c - a - d + (b + c) << 3 + 8) >> 4 psraw xm0, 3 ; = ((b + c - a - d) >> 3 + b + c + 1) >> 1 pxor xm4, xm4 paddw xm2, xm0 vpbroadcastw xm0, r8m ; pixel_max mova [rsp+32], xm3 movd xm3, dxd pmaxsw xm2, xm4 mov r3d, dxd pavgw xm2, xm4 vpbroadcastw m3, xm3 pminsw xm2, xm0 punpcklwd xm0, xm1, xm2 punpckhwd xm1, xm2 lea r5, [strideq*3] pslldq m2, m3, 8 mova [rsp+ 0], xm0 mova [rsp+16], xm1 
paddw m6, m3, m3 paddw m3, m2 vpblendd m4, m6, 0xf0 paddw m6, m6 paddw m3, m4 ; xpos0 xpos1 xpos2 xpos3 vbroadcasti128 m4, [z_upsample] .w4_upsample_loop: lea r2d, [r3+dxq] shr r3d, 6 ; base0 movu xm1, [rsp+r3*2] lea r3d, [r2+dxq] shr r2d, 6 ; base1 movu xm2, [rsp+r2*2] lea r2d, [r3+dxq] shr r3d, 6 ; base2 vinserti128 m1, [rsp+r3*2], 1 ; 0 2 lea r3d, [r2+dxq] shr r2d, 6 ; base3 vinserti128 m2, [rsp+r2*2], 1 ; 1 3 pshufb m1, m4 pshufb m2, m4 punpcklqdq m0, m1, m2 punpckhqdq m1, m2 pand m2, m5, m3 ; frac psllw m2, 9 ; (a * (64 - frac) + b * frac + 32) >> 6 psubw m1, m0 ; = a + (((b - a) * frac + 32) >> 6) pmulhrsw m1, m2 ; = a + (((b - a) * (frac << 9) + 16384) >> 15) paddw m3, m6 ; xpos += dx paddw m0, m1 vextracti128 xm1, m0, 1 movq [dstq+strideq*0], xm0 movhps [dstq+strideq*1], xm0 movq [dstq+strideq*2], xm1 movhps [dstq+r5 ], xm1 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w4_upsample_loop RET ALIGN function_align .filter_strength: ; w4/w8/w16 %define base r3-z_filter_t0 movd xm0, maxbased lea r3, [z_filter_t0] movd xm1, angled shr angled, 8 ; is_sm << 1 vpbroadcastb m0, xm0 vpbroadcastb m1, xm1 pcmpeqb m0, [base+z_filter_wh] mova xm2, [r3+angleq*8] pand m0, m1 pcmpgtb m0, m2 pmovmskb r5d, m0 ret .w4_no_upsample: mov maxbased, 7 test angled, 0x400 ; !enable_intra_edge_filter jnz .w4_main lea maxbased, [hq+3] call .filter_strength mov maxbased, 7 test r5d, r5d jz .w4_main ; filter_strength == 0 popcnt r5d, r5d vpbroadcastw xm3, [tlq+14] mova xm0, [tlq- 2] ; 0 1 2 3 4 5 6 7 vpbroadcastd xm1, [base+z_filter_k-4+r5*4+12*1] vpbroadcastd xm4, [base+z_filter_k-4+r5*4+12*0] palignr xm2, xm3, xm0, 4 ; 2 3 4 5 6 7 8 8 pmullw xm1, [tlq+ 0] ; 1 2 3 4 5 6 7 8 paddw xm2, xm0 pmullw xm2, xm4 movd [rsp+16], xm3 cmp r5d, 3 jne .w4_3tap paddw xm1, xm2 palignr xm2, xm3, xm0, 6 ; 3 4 5 6 7 8 8 8 pblendw xm0, [tlq-4], 0xfe ; 0 0 1 2 3 4 5 6 movzx r3d, word [tlq+14] movzx r2d, word [tlq+12] inc maxbased paddw xm2, xm0 sub r2d, r3d paddw xm2, xm2 lea r2d, [r2+r3*8+4] shr r2d, 3 ; (1 * top[6] + 7 * top[7] + 4) >> 3 mov [rsp+16], r2w .w4_3tap: pxor xm0, xm0 paddw xm1, xm2 mov tlq, rsp psrlw xm1, 3 cmp hd, 8 sbb maxbased, -1 pavgw xm0, xm1 mova [tlq], xm0 .w4_main: movd xm3, dxd vpbroadcastq m1, [z_base_inc] vpbroadcastw m6, [tlq+maxbaseq*2] ; top[max_base_x] shl maxbased, 6 vpbroadcastw m3, xm3 movd xm0, maxbased mov r3d, dxd ; xpos vpbroadcastw m0, xm0 paddw m4, m3, m3 psubw m1, m0 ; -max_base_x vpblendd m3, m4, 0xcc paddw m0, m4, m3 vpblendd m3, m0, 0xf0 ; xpos0 xpos1 xpos2 xpos3 paddw m4, m4 paddw m3, m1 .w4_loop: lea r5d, [r3+dxq] shr r3d, 6 ; base0 movu xm1, [tlq+r3*2] lea r3d, [r5+dxq] shr r5d, 6 ; base1 movu xm2, [tlq+r5*2] lea r5d, [r3+dxq] shr r3d, 6 ; base2 vinserti128 m1, [tlq+r3*2], 1 ; 0 2 lea r3d, [r5+dxq] shr r5d, 6 ; base3 vinserti128 m2, [tlq+r5*2], 1 ; 1 3 punpcklqdq m0, m1, m2 psrldq m1, 2 pslldq m2, 6 vpblendd m1, m2, 0xcc pand m2, m5, m3 psllw m2, 9 psubw m1, m0 pmulhrsw m1, m2 psraw m2, m3, 15 ; xpos < max_base_x paddw m3, m4 paddw m0, m1 vpblendvb m0, m6, m0, m2 vextracti128 xm1, m0, 1 movq [dstq+strideq*0], xm0 movhps [dstq+strideq*1], xm0 lea dstq, [dstq+strideq*2] movq [dstq+strideq*0], xm1 movhps [dstq+strideq*1], xm1 sub hd, 4 jz .w4_end lea dstq, [dstq+strideq*2] cmp r3d, maxbased jb .w4_loop lea r6, [strideq*3] .w4_end_loop: movq [dstq+strideq*0], xm6 movq [dstq+strideq*1], xm6 movq [dstq+strideq*2], xm6 movq [dstq+r6 ], xm6 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w4_end_loop .w4_end: RET .w8: %assign stack_offset org_stack_offset ALLOC_STACK -64, 7 lea r3d, [angleq+216] mov r3b, hb 
cmp r3d, 8 ja .w8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || h > 8 movu m2, [tlq+2] ; 2 3 4 5 6 7 8 9 a b c d e f g _ movu m0, [tlq+4] ; 3 4 5 6 7 8 9 a b c d e f g _ _ movu m1, [tlq+0] ; 1 2 3 4 5 6 7 8 9 a b c d e f g cmp hd, 4 jne .w8_upsample_h8 ; awkward single-pixel edge case vpblendd m0, m2, 0x20 ; 3 4 5 6 7 8 9 a b c c _ _ _ _ _ .w8_upsample_h8: paddw m2, m1 paddw m0, [tlq-2] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f add dxd, dxd psubw m0, m2, m0 psraw m0, 3 pxor m4, m4 paddw m2, m0 vpbroadcastw m0, r8m movd xm3, dxd pmaxsw m2, m4 mov r3d, dxd pavgw m2, m4 vpbroadcastw m3, xm3 pminsw m2, m0 punpcklwd m0, m1, m2 punpckhwd m1, m2 vbroadcasti128 m4, [z_upsample] mova [rsp+ 0], xm0 mova [rsp+16], xm1 paddw m6, m3, m3 vextracti128 [rsp+32], m0, 1 vextracti128 [rsp+48], m1, 1 vpblendd m3, m6, 0xf0 ; xpos0 xpos1 .w8_upsample_loop: lea r2d, [r3+dxq] shr r3d, 6 ; base0 movu xm1, [rsp+r3*2] movu xm2, [rsp+r3*2+16] lea r3d, [r2+dxq] shr r2d, 6 ; base1 vinserti128 m1, [rsp+r2*2], 1 vinserti128 m2, [rsp+r2*2+16], 1 pshufb m1, m4 pshufb m2, m4 punpcklqdq m0, m1, m2 punpckhqdq m1, m2 pand m2, m5, m3 psllw m2, 9 psubw m1, m0 pmulhrsw m1, m2 paddw m3, m6 paddw m0, m1 mova [dstq+strideq*0], xm0 vextracti128 [dstq+strideq*1], m0, 1 lea dstq, [dstq+strideq*2] sub hd, 2 jg .w8_upsample_loop RET .w8_no_intra_edge_filter: and maxbased, 7 or maxbased, 8 ; imin(h+7, 15) jmp .w8_main .w8_no_upsample: lea maxbased, [hq+7] test angled, 0x400 jnz .w8_no_intra_edge_filter call .filter_strength test r5d, r5d jz .w8_main popcnt r5d, r5d vpbroadcastd m1, [base+z_filter_k-4+r5*4+12*1] vpbroadcastd m4, [base+z_filter_k-4+r5*4+12*0] mova m0, [tlq-2] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f movu m2, [tlq+0] ; 1 2 3 4 5 6 7 8 9 a b c d e f g pmullw m1, m2 cmp hd, 8 jl .w8_filter_h4 punpckhwd m2, m2 vpblendd m3, m2, [tlq+2], 0x7f ; 2 3 4 5 6 7 8 9 a b c d e f g g je .w8_filter_end ; 8x4 and 8x8 are always 3-tap movzx r3d, word [tlq+30] mov maxbased, 16 mov [rsp+32], r3d cmp r5d, 3 jne .w8_filter_end punpcklwd xm6, xm0, xm0 vpblendd m2, [tlq+4], 0x7f ; 3 4 5 6 7 8 9 a b c d e f g g g vpblendd m6, [tlq-4], 0xfe ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e movzx r5d, word [tlq+28] mov [rsp+34], r3w paddw m2, m6 sub r5d, r3d inc maxbased paddw m2, m2 lea r3d, [r5+r3*8+4] paddw m1, m2 shr r3d, 3 mov [rsp+32], r3w jmp .w8_filter_end .w8_filter_h4: pshuflw m3, m2, q3321 vinserti128 m3, [tlq+2], 0 ; 2 3 4 5 6 7 8 9 a b c c _ _ _ _ .w8_filter_end: paddw m0, m3 pmullw m0, m4 mov tlq, rsp pxor m2, m2 paddw m0, m1 psrlw m0, 3 pavgw m0, m2 mova [tlq], m0 .w8_main: movd xm3, dxd vbroadcasti128 m1, [z_base_inc] vpbroadcastw m6, [tlq+maxbaseq*2] shl maxbased, 6 vpbroadcastw m3, xm3 movd xm0, maxbased mov r3d, dxd vpbroadcastw m0, xm0 paddw m4, m3, m3 psubw m1, m0 vpblendd m3, m4, 0xf0 ; xpos0 xpos1 paddw m3, m1 .w8_loop: lea r5d, [r3+dxq] shr r3d, 6 movu xm0, [tlq+r3*2] movu xm1, [tlq+r3*2+2] lea r3d, [r5+dxq] shr r5d, 6 vinserti128 m0, [tlq+r5*2], 1 vinserti128 m1, [tlq+r5*2+2], 1 pand m2, m5, m3 psllw m2, 9 psubw m1, m0 pmulhrsw m1, m2 psraw m2, m3, 15 paddw m3, m4 paddw m0, m1 vpblendvb m0, m6, m0, m2 mova [dstq+strideq*0], xm0 vextracti128 [dstq+strideq*1], m0, 1 sub hd, 2 jz .w8_end lea dstq, [dstq+strideq*2] cmp r3d, maxbased jb .w8_loop .w8_end_loop: mova [dstq+strideq*0], xm6 mova [dstq+strideq*1], xm6 lea dstq, [dstq+strideq*2] sub hd, 2 jg .w8_end_loop .w8_end: RET .w16_no_intra_edge_filter: and maxbased, 15 or maxbased, 16 ; imin(h+15, 31) jmp .w16_main .w16: %assign stack_offset org_stack_offset ALLOC_STACK -96, 7 lea maxbased, 
[hq+15] test angled, 0x400 jnz .w16_no_intra_edge_filter call .filter_strength test r5d, r5d jz .w16_main popcnt r5d, r5d mova m0, [tlq-2] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f paddw m1, m0, [tlq+2] ; 2 3 4 5 6 7 8 9 a b c d e f g h cmp r5d, 3 jne .w16_filter_3tap vpbroadcastd m2, [base+pw_3] punpcklwd xm0, xm0 vpblendd m0, [tlq-4], 0xfe ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e paddw m1, [tlq+0] ; 1 2 3 4 5 6 7 8 9 a b c d e f g paddw m0, m2 pavgw m0, [tlq+4] ; 3 4 5 6 7 8 9 a b c d e f g h i paddw m0, m1 psrlw m0, 2 movu m3, [tlq+32] ; 2 3 4 5 6 7 8 9 a b c d e f g h paddw m2, [tlq+28] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f paddw m1, m3, [tlq+30] ; 1 2 3 4 5 6 7 8 9 a b c d e f g cmp hd, 8 jl .w16_filter_5tap_h4 punpckhwd m3, m3 je .w16_filter_5tap_h8 vpblendd m4, m3, [tlq+36], 0x7f ; 4 5 6 7 8 9 a b c d e f g h h h vpblendd m3, [tlq+34], 0x7f ; 3 4 5 6 7 8 9 a b c d e f g h h movzx r3d, word [tlq+62] movzx r2d, word [tlq+60] pavgw m2, m4 sub r2d, r3d paddw m1, m3 lea r2d, [r2+r3*8+4] paddw m1, m2 shr r2d, 3 psrlw m1, 2 mov [rsp+66], r3w mov [rsp+64], r2w mov tlq, rsp mov r3d, 33 cmp hd, 16 cmovg maxbased, r3d jmp .w16_filter_end2 .w16_filter_5tap_h8: vpblendd xm4, xm3, [tlq+36], 0x07 ; 4 5 6 7 8 9 9 9 vpblendd xm3, [tlq+34], 0x07 ; 3 4 5 6 7 8 9 9 pavgw xm2, xm4 paddw xm1, xm3 paddw xm1, xm2 psrlw xm1, 2 jmp .w16_filter_end2 .w16_filter_5tap_h4: pshuflw xm4, xm3, q3332 ; 4 5 5 5 pshuflw xm3, xm3, q3321 ; 3 4 5 5 pavgw xm2, xm4 paddw xm1, xm3 paddw xm1, xm2 psrlw xm1, 2 jmp .w16_filter_end2 .w16_filter_3tap: vpbroadcastd m3, [base+z_filter_k-4+r5*4+12*1] vpbroadcastd m4, [base+z_filter_k-4+r5*4+12*0] pmullw m0, m3, [tlq+0] ; 1 2 3 4 5 6 7 8 9 a b c d e f g movu m2, [tlq+32] ; 1 2 3 4 5 6 7 8 9 a b c d e f g pmullw m1, m4 pmullw m3, m2 paddw m0, m1 cmp hd, 8 je .w16_filter_3tap_h8 jl .w16_filter_3tap_h4 punpckhwd m2, m2 vpblendd m2, [tlq+34], 0x7f ; 2 3 4 5 6 7 8 9 a b c d e f g g jmp .w16_filter_end .w16_filter_3tap_h4: pshuflw xm2, xm2, q3321 ; 2 3 4 4 _ _ _ _ jmp .w16_filter_end .w16_filter_3tap_h8: psrldq xm2, 2 pshufhw xm2, xm2, q2210 ; 2 3 4 5 6 7 8 8 .w16_filter_end: paddw m2, [tlq+30] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f pmullw m2, m4 psrlw m0, 3 pxor m1, m1 paddw m2, m3 psrlw m2, 3 pavgw m0, m1 pavgw m1, m2 .w16_filter_end2: mov tlq, rsp mova [tlq+ 0], m0 mova [tlq+32], m1 .w16_main: movd xm4, dxd vpbroadcastw m6, [tlq+maxbaseq*2] shl maxbased, 6 vpbroadcastw m4, xm4 movd xm0, maxbased mov r3d, dxd vpbroadcastw m0, xm0 paddw m3, m4, [z_base_inc] psubw m3, m0 .w16_loop: lea r5d, [r3+dxq] shr r3d, 6 movu m0, [tlq+r3*2] movu m1, [tlq+r3*2+2] lea r3d, [r5+dxq] shr r5d, 6 pand m2, m5, m3 psllw m2, 9 psubw m1, m0 pmulhrsw m1, m2 psraw m2, m3, 15 paddw m3, m4 paddw m1, m0 movu m0, [tlq+r5*2] vpblendvb m2, m6, m1, m2 movu m1, [tlq+r5*2+2] mova [dstq+strideq*0], m2 pand m2, m5, m3 psllw m2, 9 psubw m1, m0 pmulhrsw m1, m2 psraw m2, m3, 15 paddw m3, m4 paddw m0, m1 vpblendvb m0, m6, m0, m2 mova [dstq+strideq*1], m0 sub hd, 2 jz .w16_end lea dstq, [dstq+strideq*2] cmp r3d, maxbased jb .w16_loop .w16_end_loop: mova [dstq+strideq*0], m6 mova [dstq+strideq*1], m6 lea dstq, [dstq+strideq*2] sub hd, 2 jg .w16_end_loop .w16_end: RET .w32: %assign stack_offset org_stack_offset ALLOC_STACK -160, 8 lea maxbased, [hq+31] mov r3d, 63 cmp hd, 32 cmova maxbased, r3d test angled, 0x400 jnz .w32_main vpbroadcastd m2, [pw_3] mova m0, [tlq-2] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f punpcklwd xm1, xm0, xm0 vpblendd m1, [tlq-4], 0xfe ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e paddw m0, [tlq+0] ; 1 2 3 4 5 6 7 8 9 a b c d e f g paddw m1, m2 
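; Sketch of what the w32 edge filtering below computes (a reading of the
; arithmetic, not a statement from the original source): the pw_3 bias, the two
; paddw taps and the pavgw of the outer neighbours reduce to (roughly, up to the
; intermediate truncation of pavgw/psrlw) the strength-3 5-tap smoother
;   out = (p[-2] + 2*p[-1] + 2*p[0] + 2*p[1] + p[2] + 4) >> 3
; i.e. the {2, 4, 4, 4, 2}/16 kernel, with pavgw folding the two outer taps.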
paddw m0, [tlq+2] ; 2 3 4 5 6 7 8 9 a b c d e f g h pavgw m1, [tlq+4] ; 3 4 5 6 7 8 9 a b c d e f g h i mov r3, rsp paddw m0, m1 lea r5d, [maxbaseq-31] psrlw m0, 2 mova [r3], m0 .w32_filter_loop: mova m0, [tlq+30] paddw m1, m2, [tlq+28] add tlq, 32 paddw m0, [tlq+0] pavgw m1, [tlq+4] paddw m0, [tlq+2] add r3, 32 paddw m0, m1 psrlw m0, 2 mova [r3], m0 sub r5d, 16 jg .w32_filter_loop movu m0, [tlq+32] ; 2 3 4 5 6 7 8 9 a b c d e f g h punpckhwd m1, m0, m0 paddw m2, [tlq+28] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f paddw m0, [tlq+30] ; 1 2 3 4 5 6 7 8 9 a b c d e f g jl .w32_filter_h8 vpblendd m3, m1, [tlq+36], 0x7f ; 4 5 6 7 8 9 a b c d e f g h h h vpblendd m1, [tlq+34], 0x7f ; 3 4 5 6 7 8 9 a b c d e f g h h movzx r5d, word [tlq+62] movzx r2d, word [tlq+60] pavgw m2, m3 sub r2d, r5d paddw m0, m1 lea r2d, [r2+r5*8+4] paddw m0, m2 shr r2d, 3 psrlw m0, 2 mova [r3+32], m0 mov [r3+66], r5w mov [r3+64], r2w mov tlq, rsp mov r3d, 65 cmp hd, 64 cmove maxbased, r3d jmp .w32_main .w32_filter_h8: vpblendd xm3, xm1, [tlq+36], 0x07 ; 4 5 6 7 8 9 9 9 vpblendd xm1, [tlq+34], 0x07 ; 3 4 5 6 7 8 9 9 pavgw xm2, xm3 paddw xm0, xm1 mov tlq, rsp paddw xm0, xm2 psrlw xm0, 2 mova [r3+32], xm0 .w32_main: movd xm4, dxd vpbroadcastw m6, [tlq+maxbaseq*2] shl maxbased, 6 vpbroadcastw m4, xm4 movd xm0, maxbased mov r5d, dxd vpbroadcastd m7, [pw_m1024] ; -16 * 64 vpbroadcastw m0, xm0 paddw m3, m4, [z_base_inc] psubw m3, m0 .w32_loop: mov r3d, r5d shr r3d, 6 movu m0, [tlq+r3*2] movu m1, [tlq+r3*2+2] pand m2, m5, m3 psllw m2, 9 psubw m1, m0 pmulhrsw m1, m2 paddw m0, m1 psraw m1, m3, 15 vpblendvb m0, m6, m0, m1 mova [dstq+32*0], m0 movu m0, [tlq+r3*2+32] movu m1, [tlq+r3*2+34] add r5d, dxd psubw m1, m0 pmulhrsw m1, m2 pcmpgtw m2, m7, m3 paddw m3, m4 paddw m0, m1 vpblendvb m0, m6, m0, m2 mova [dstq+32*1], m0 dec hd jz .w32_end add dstq, strideq cmp r5d, maxbased jb .w32_loop .w32_end_loop: mova [dstq+32*0], m6 mova [dstq+32*1], m6 add dstq, strideq dec hd jg .w32_end_loop .w32_end: RET .w64: %assign stack_offset org_stack_offset ALLOC_STACK -256, 10 lea maxbased, [hq+63] test angled, 0x400 jnz .w64_main vpbroadcastd m2, [pw_3] mova m0, [tlq-2] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f punpcklwd xm1, xm0, xm0 vpblendd m1, [tlq-4], 0xfe ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e paddw m0, [tlq+0] ; 1 2 3 4 5 6 7 8 9 a b c d e f g paddw m1, m2 paddw m0, [tlq+2] ; 2 3 4 5 6 7 8 9 a b c d e f g h pavgw m1, [tlq+4] ; 3 4 5 6 7 8 9 a b c d e f g h i mov r3, rsp paddw m0, m1 lea r5d, [hq+32] psrlw m0, 2 mova [r3], m0 .w64_filter_loop: mova m0, [tlq+30] paddw m1, m2, [tlq+28] add tlq, 32 paddw m0, [tlq+0] pavgw m1, [tlq+4] paddw m0, [tlq+2] add r3, 32 paddw m0, m1 psrlw m0, 2 mova [r3], m0 sub r5d, 16 jg .w64_filter_loop movu m0, [tlq+32] ; 2 3 4 5 6 7 8 9 a b c d e f g h punpckhwd m1, m0, m0 paddw m2, [tlq+28] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f paddw m0, [tlq+30] ; 1 2 3 4 5 6 7 8 9 a b c d e f g vpblendd m3, m1, [tlq+36], 0x7f ; 4 5 6 7 8 9 a b c d e f g h h h vpblendd m1, [tlq+34], 0x7f ; 3 4 5 6 7 8 9 a b c d e f g h h pavgw m2, m3 paddw m0, m1 paddw m0, m2 mov tlq, rsp psrlw m0, 2 mova [r3+32], m0 .w64_main: movd xm4, dxd vpbroadcastw m6, [tlq+maxbaseq*2] shl maxbased, 6 vpbroadcastw m4, xm4 movd xm0, maxbased mov r5d, dxd vpbroadcastd m7, [pw_m1024] ; -16 * 64 vpbroadcastw m0, xm0 paddw m3, m4, [z_base_inc] paddw m8, m7, m7 ; -32 * 64 psubw m3, m0 paddw m9, m8, m7 ; -48 * 64 .w64_loop: mov r3d, r5d shr r3d, 6 movu m0, [tlq+r3*2] movu m1, [tlq+r3*2+2] pand m2, m5, m3 psllw m2, 9 psubw m1, m0 pmulhrsw m1, m2 paddw m0, m1 psraw m1, m3, 15 vpblendvb m0, 
m6, m0, m1 mova [dstq+32*0], m0 movu m0, [tlq+r3*2+32] movu m1, [tlq+r3*2+34] psubw m1, m0 pmulhrsw m1, m2 paddw m0, m1 pcmpgtw m1, m7, m3 vpblendvb m0, m6, m0, m1 mova [dstq+32*1], m0 movu m0, [tlq+r3*2+64] movu m1, [tlq+r3*2+66] psubw m1, m0 pmulhrsw m1, m2 paddw m0, m1 pcmpgtw m1, m8, m3 vpblendvb m0, m6, m0, m1 mova [dstq+32*2], m0 movu m0, [tlq+r3*2+96] movu m1, [tlq+r3*2+98] add r5d, dxd psubw m1, m0 pmulhrsw m1, m2 pcmpgtw m2, m9, m3 paddw m3, m4 paddw m0, m1 vpblendvb m0, m6, m0, m2 mova [dstq+32*3], m0 dec hd jz .w64_end add dstq, strideq cmp r5d, maxbased jb .w64_loop .w64_end_loop: mova [dstq+32*0], m6 mova [dstq+32*1], m6 mova [dstq+32*2], m6 mova [dstq+32*3], m6 add dstq, strideq dec hd jg .w64_end_loop .w64_end: RET cglobal ipred_z2_16bpc, 3, 12, 12, 352, dst, stride, tl, w, h, angle, dx, dy %define base r9-z_filter_t0 lea r9, [ipred_z2_16bpc_avx2_table] tzcnt wd, wm movifnidn angled, anglem movifnidn hd, hm lea dxq, [dr_intra_derivative-90] movsxd wq, [r9+wq*4] mova m1, [tlq- 0] movzx dyd, angleb xor angled, 0x400 mova m2, [tlq- 32] mov r8, dxq sub dxq, dyq mova m3, [tlq- 64] add wq, r9 add r9, z_filter_t0-ipred_z2_16bpc_avx2_table mova m4, [tlq- 96] and dyd, ~1 mova m5, [tlq-128] and dxq, ~1 movzx dyd, word [r8+dyq] ; angle - 90 movzx dxd, word [dxq+270] ; 180 - angle vpbroadcastd m11, [base+pw_62] mova [rsp+128], m1 mova [rsp+ 96], m2 mova [rsp+ 64], m3 neg dxd mova [rsp+ 32], m4 neg dyq mova [rsp+ 0], m5 jmp wq .w4: vbroadcasti128 m10, [base+z2_x_shuf] vpbroadcastq m6, [base+z_base_inc+2] lea r8d, [dxq+(65<<6)] ; xpos mov r10d, (63-4)<<6 test angled, 0x400 jnz .w4_main ; !enable_intra_edge_filter lea r3d, [hq+2] add angled, 1022 shl r3d, 6 test r3d, angled jnz .w4_no_upsample_above ; angle >= 130 || h > 8 || (is_sm && h == 8) movq xm0, [tlq+2] ; 1 2 3 4 movq xm1, [tlq+0] ; 0 1 2 3 pshuflw xm2, xm0, q3321 ; 2 3 4 4 pshuflw xm3, xm1, q2100 ; 0 0 1 2 vpbroadcastw xm4, r8m ; pixel_max vbroadcasti128 m10, [base+z_upsample] paddw xm1, xm0 paddw xm2, xm3 lea r8d, [r8+dxq+(1<<6)] psubw xm2, xm1, xm2 add dxd, dxd psraw xm2, 3 pxor xm3, xm3 sub r10d, 3<<6 paddw xm1, xm2 paddw m6, m6 pmaxsw xm1, xm3 sub angled, 1075 ; angle - 53 pavgw xm1, xm3 lea r3d, [hq+3] pminsw xm1, xm4 xor angled, 0x7f ; 180 - angle punpcklwd xm1, xm0 movu [rsp+130], xm1 call .filter_strength jmp .w4_filter_left ALIGN function_align .filter_strength: movd xm8, r3d mov r3d, angled movd xm7, angled vpbroadcastb m8, xm8 shr r3d, 8 ; is_sm << 1 vpbroadcastb m7, xm7 pcmpeqb m8, [base+z_filter_wh] mova xm9, [r9+r3*8] pand m0, m8, m7 pcmpgtb m0, m9 pmovmskb r3d, m0 ret ALIGN function_align .upsample_left: ; h4/h8 mova xm0, [tlq-16] ; 8 7 6 5 4 3 2 1 movu xm1, [tlq-14] ; 7 6 5 4 3 2 1 0 vpbroadcastw xm4, r8m ; pixel_max cmp hd, 8 je .upsample_left_h8 pshufhw xm2, xm0, q2100 ; _ _ _ _ 4 4 3 2 pshufhw xm3, xm1, q3321 ; _ _ _ _ 2 1 0 0 jmp .upsample_left_end .upsample_left_h8: pblendw xm2, xm0, [tlq-18], 0xfe ; 8 8 7 6 5 4 3 2 pblendw xm3, xm1, [tlq-12], 0x7f ; 6 5 4 3 2 1 0 0 .upsample_left_end: paddw xm1, xm0 paddw xm2, xm3 psubw xm2, xm1, xm2 add dyq, dyq psraw xm2, 3 pxor xm3, xm3 paddw xm1, xm2 pmaxsw xm1, xm3 pavgw xm1, xm3 pminsw xm1, xm4 punpcklwd xm2, xm0, xm1 punpckhwd xm0, xm1 mova [rsp+ 96+gprsize], xm2 mova [rsp+112+gprsize], xm0 ret .w4_no_upsample_above: lea r3d, [hq+3] sub angled, 1112 ; angle - 90 call .filter_strength test r3d, r3d jz .w4_no_filter_above popcnt r3d, r3d vpbroadcastd xm4, [base+z_filter_k-4+r3*4+12*1] vpbroadcastd xm5, [base+z_filter_k-4+r3*4+12*0] psrldq xm0, xm1, 2 ; 1 2 3 4 pshuflw 
xm2, xm1, q2100 ; 0 0 1 2 pmullw xm4, xm0 pshuflw xm3, xm0, q3321 ; 2 3 4 4 paddw xm1, xm3 pshuflw xm3, xm0, q3332 ; 3 4 4 4 pmullw xm1, xm5 vpbroadcastd xm5, [base+z_filter_k-4+r3*4+12*2] paddw xm2, xm3 vpbroadcastd xm3, r6m ; max_width pmullw xm2, xm5 packssdw xm3, xm3 paddw xm1, xm4 paddw xm1, xm2 psubw xm3, [base+pw_1to16] pxor xm4, xm4 psrlw xm1, 3 pminsw xm3, xm11 ; clip to byte range since there's no variable word blend pavgw xm1, xm4 vpblendvb xm1, xm0, xm3 movq [rsp+130], xm1 .w4_no_filter_above: lea r3d, [hq+2] add angled, 973 ; angle + 883 shl r3d, 6 test r3d, angled jz .w4_upsample_left ; angle <= 140 || h > 8 || (is_sm && h == 8) vpbroadcastd xm0, [base+pb_90] psubb xm0, xm7 ; 180 - angle pand xm0, xm8 ; reuse from previous filter_strength call pcmpgtb xm0, xm9 pmovmskb r3d, xm0 .w4_filter_left: test r3d, r3d jz .w4_main popcnt r3d, r3d mova m0, [tlq-32] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f vpbroadcastd m5, r7m ; max_height cmp r3d, 3 je .w4_filter_left_s3 vpbroadcastd m2, [base+z_filter_k-4+r3*4+12*1] vpbroadcastd m3, [base+z_filter_k-4+r3*4+12*0] pmullw m2, m0 cmp hd, 8 jl .w4_filter_left_h4 movu m4, [tlq-34] punpcklwd m1, m0, m0 vpblendd m1, m4, 0xee ; 0 0 1 2 3 4 5 6 8 8 9 a b c d e je .w4_filter_left_end vpblendd m1, m4, 0x10 ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e jmp .w4_filter_left_end .w4_upsample_left: call .upsample_left mov r11, -16 vbroadcasti128 m9, [base+z_upsample] jmp .w4_main_upsample_left .w4_filter_left_s3: ; can only be h16 movu m2, [tlq-30] ; 1 2 3 4 5 6 7 8 9 a b c d e f g vpbroadcastd m4, [base+pw_3] paddw m1, m0, m2 punpckhwd m2, m2 vpblendd m2, [tlq-28], 0x7f ; 2 3 4 5 6 7 8 9 a b c d e f g g punpcklwd xm3, xm0, xm0 paddw m2, m4 vpblendd m4, m3, [tlq-34], 0xfe ; 0 0 1 2 3 4 5 6 8 8 9 a b c d e vpblendd m3, [tlq-36], 0xfe ; 0 0 0 1 2 3 4 5 6 8 8 9 a b c d paddw m1, m4 pavgw m2, m3 paddw m1, m2 psrlw m1, 2 jmp .w4_filter_left_end2 .w4_filter_left_h4: pshufhw m1, m0, q2100 ; _ _ _ _ _ _ _ _ _ _ _ _ c c d e .w4_filter_left_end: paddw m1, [tlq-30] ; 1 2 3 4 5 6 7 8 9 a b c d e f g pmullw m1, m3 paddw m1, m2 pxor m2, m2 psrlw m1, 3 pavgw m1, m2 .w4_filter_left_end2: packssdw m5, m5 psubw m5, [base+pw_16to1] pminsw m5, m11 vpblendvb m1, m0, m5 mova [rsp+96], m1 .w4_main: vbroadcasti128 m9, [base+z2_x_shuf] mov r11, -8 .w4_main_upsample_left: movd xm5, dyd mova m4, [base+z2_y_shuf_h4] mov r2d, r8d movd xm0, dxd vpbroadcastw m5, xm5 rorx r5, dyq, 5 lea r8d, [dyq*3] pmullw m5, [base+z2_ymul] rorx r9, dyq, 4 sar dyd, 6 vpbroadcastw m0, xm0 sar r8d, 6 pand m5, m11 ; frac_y neg dyd psllw m5, 9 add r5d, dyd add r8d, dyd add r9d, dyd paddw m7, m0, m0 lea dyq, [rsp+dyq*2+126] vpblendd m0, m7, 0xcc add dyq, r11 neg r5d paddw m1, m0, m7 neg r8d vpblendd m0, m1, 0xf0 ; xpos0 xpos1 xpos2 xpos3 neg r9d paddw m7, m7 paddw m6, m0 .w4_loop: lea r3d, [r2+dxq] shr r2d, 6 ; base_x0 movu xm1, [rsp+r2*2] lea r2d, [r3+dxq] shr r3d, 6 ; base_x1 movu xm3, [rsp+r3*2] lea r3d, [r2+dxq] shr r2d, 6 ; base_x2 vinserti128 m1, [rsp+r2*2], 1 lea r2d, [r3+dxq] shr r3d, 6 ; base_x3 vinserti128 m3, [rsp+r3*2], 1 pshufb m1, m10 ; a0 a1 a2 a3 A0 A1 A2 A3 pshufb m3, m10 ; b0 b1 b2 b3 B0 B1 B2 B3 pand m2, m11, m6 punpcklqdq m0, m1, m3 punpckhqdq m1, m3 psllw m2, 9 psubw m1, m0 pmulhrsw m1, m2 paddw m0, m1 cmp r3d, 64 jge .w4_toponly movu xm2, [dyq] vinserti128 m2, [dyq+r8*2], 1 movu xm3, [dyq+r5*2] vinserti128 m3, [dyq+r9*2], 1 pshufb m2, m9 pshufb m3, m9 punpckhwd m1, m2, m3 ; a3 b3 a2 b2 a1 b1 a0 b0 punpcklwd m2, m3 psubw m2, m1 pmulhrsw m2, m5 psraw m3, m6, 15 ; base_x < topleft paddw m1, m2 vpermd m1, 
m4, m1 ; a0 b0 c0 d0 a1 b1 c1 d1 a2 b2 c2 d2 a3 b3 c3 d3 vpblendvb m0, m1, m3 .w4_toponly: paddw m6, m7 ; xpos += dx lea r3, [strideq*3] add dyq, r11 vextracti128 xm1, m0, 1 movq [dstq+strideq*0], xm0 movhps [dstq+strideq*1], xm0 movq [dstq+strideq*2], xm1 movhps [dstq+r3 ], xm1 sub hd, 4 jz .w4_end lea dstq, [dstq+strideq*4] cmp r2d, r10d jge .w4_loop .w4_leftonly_loop: movu xm1, [dyq] vinserti128 m1, [dyq+r8*2], 1 movu xm2, [dyq+r5*2] vinserti128 m2, [dyq+r9*2], 1 add dyq, r11 pshufb m1, m9 pshufb m2, m9 punpckhwd m0, m1, m2 punpcklwd m1, m2 psubw m1, m0 pmulhrsw m1, m5 paddw m0, m1 vpermd m0, m4, m0 vextracti128 xm1, m0, 1 movq [dstq+strideq*0], xm0 movhps [dstq+strideq*1], xm0 movq [dstq+strideq*2], xm1 movhps [dstq+r3 ], xm1 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w4_leftonly_loop .w4_end: RET .w8: mov r10d, hd test angled, 0x400 jnz .w8_main lea r3d, [angleq+126] xor r8d, r8d mov r3b, hb cmp r3d, 8 ja .w8_no_upsample_above ; angle >= 130 || h > 8 || is_sm movu xm0, [tlq+2] ; 1 2 3 4 5 6 7 8 mova xm1, [tlq+0] ; 0 1 2 3 4 5 6 7 pblendw xm2, xm0, [tlq+4], 0x7f ; 2 3 4 5 6 7 8 8 pblendw xm3, xm1, [tlq-2], 0xfe ; 0 0 1 2 3 4 5 6 vpbroadcastw xm4, r8m ; pixel_max paddw xm1, xm0 paddw xm2, xm3 not r8d psubw xm2, xm1, xm2 add dxd, dxd psraw xm2, 3 sub angled, 53 ; angle - 53 pxor xm3, xm3 paddw xm2, xm1 lea r3d, [hq+7] pmaxsw xm2, xm3 xor angled, 0x7f ; 180 - angle pavgw xm2, xm3 pminsw xm2, xm4 punpcklwd xm1, xm2, xm0 punpckhwd xm2, xm0 movu [rsp+130], xm1 movu [rsp+146], xm2 call .filter_strength jmp .w8_filter_left .w8_no_upsample_above: lea r3d, [hq+7] sub angled, 90 ; angle - 90 call .filter_strength test r3d, r3d jz .w8_no_filter_above popcnt r3d, r3d vpbroadcastd xm4, [base+z_filter_k-4+r3*4+12*1] vpbroadcastd xm5, [base+z_filter_k-4+r3*4+12*0] vpbroadcastd xm6, [base+z_filter_k-4+r3*4+12*2] movu xm0, [tlq+2] ; 1 2 3 4 5 6 7 8 x pblendw xm2, xm1, [tlq-2], 0xfe ; 0 0 1 2 3 4 5 6 x pmullw xm4, xm0 pblendw xm3, xm0, [tlq+4], 0x7f ; 2 3 4 5 6 7 8 8 x paddw xm1, xm3 vpblendd xm3, [tlq+6], 0x07 ; 3 4 5 6 7 8 8 8 x paddw xm2, xm3 vpbroadcastd xm3, r6m ; max_width pmullw xm1, xm5 pmullw xm2, xm6 packssdw xm3, xm3 paddw xm1, xm4 paddw xm1, xm2 psubw xm3, [base+pw_1to16] pxor xm4, xm4 psrlw xm1, 3 pminsw xm3, xm11 pavgw xm1, xm4 vpblendvb xm1, xm0, xm3 movu [rsp+130], xm1 .w8_no_filter_above: lea r3d, [angleq-51] mov r3b, hb cmp r3d, 8 jbe .w8_upsample_left ; angle > 140 && h <= 8 && !is_sm vpbroadcastd m0, [base+pb_90] psubb m0, m7 pand m0, m8 pcmpgtb m0, m9 pmovmskb r3d, m0 .w8_filter_left: test r3d, r3d jz .w8_main popcnt r3d, r3d cmp r3d, 3 jne .w8_filter_left_s12 vpbroadcastd m6, [base+pw_3] vpbroadcastd m7, [base+pw_16] cmp hd, 16 ; flags needed for later jmp .filter_left_s3b .w8_upsample_left: call .upsample_left vbroadcasti128 m7, [base+z2_y_shuf_us] lea r11, [rsp+118] mov r8, -8 jmp .w8_main_upsample_left .w16_filter_left_s12: xor r8d, r8d .w8_filter_left_s12: mova m0, [tlq-32] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f vpbroadcastd m5, r7m ; max_height vpbroadcastd m2, [base+z_filter_k-4+r3*4+12*1] vpbroadcastd m3, [base+z_filter_k-4+r3*4+12*0] pmullw m2, m0 cmp hd, 8 jl .w8_filter_left_h4 movu m4, [tlq-34] punpcklwd m1, m0, m0 vpblendd m1, m4, 0xee ; 0 0 1 2 3 4 5 6 8 8 9 a b c d e je .w8_filter_left_end vpblendd m1, m4, 0x10 ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e jmp .w8_filter_left_end .w8_filter_left_h4: pshufhw m1, m0, q2100 ; _ _ _ _ _ _ _ _ _ _ _ _ c c d e .w8_filter_left_end: paddw m1, [tlq-30] ; 1 2 3 4 5 6 7 8 9 a b c d e f g pmullw m1, m3 paddw m1, m2 pxor m2, m2 psrlw m1, 3 pavgw m1, 
m2 packssdw m5, m5 psubw m5, [base+pw_16to1] pminsw m5, m11 vpblendvb m1, m0, m5 mova [rsp+96], m1 test r8d, r8d jz .w8_main ; upsample_main vbroadcasti128 m10, [base+z_upsample] vbroadcasti128 m7, [base+z2_y_shuf] lea r5, [rsp+120] movd xm1, dyd vbroadcasti128 m4, [base+z_base_inc+2] movd xm2, dxd vpbroadcastw m1, xm1 vpbroadcastw m2, xm2 mov r7, dstq paddw m4, m4 pmullw m0, m1, [base+z2_ymul8] paddw m5, m2, m2 psllw xm1, 3 vpblendd m2, m5, 0xf0 lea r2d, [dxq+(66<<6)] ; xpos paddw m4, m2 pshufd m6, m0, q2020 psraw xm0, 6 pxor xm1, xm1 psubw xm8, xm1, xm0 pand m6, m11 punpckhwd xm9, xm8, xm1 psllw m6, 9 punpcklwd xm8, xm1 .w8_upsample_above_loop: lea r3d, [r2+dxq] shr r2d, 6 movu xm1, [rsp+r2*2] movu xm2, [rsp+r2*2+16] lea r2d, [r3+dxq] shr r3d, 6 vinserti128 m1, [rsp+r3*2], 1 vinserti128 m2, [rsp+r3*2+16], 1 pshufb m1, m10 pshufb m2, m10 punpcklqdq m0, m1, m2 ; a0 b0 c0 d0 e0 f0 g0 h0 punpckhqdq m1, m2 pand m2, m11, m4 psubw m1, m0 psllw m2, 9 pmulhrsw m1, m2 paddw m0, m1 cmp r3d, 64 jge .w8_upsample_above_toponly mova m1, m5 vpgatherdq m3, [r5+xm9*2], m5 mova m5, m1 vpgatherdq m2, [r5+xm8*2], m1 pshufb m3, m7 pshufb m2, m7 punpckldq m1, m2, m3 punpckhdq m2, m3 psubw m2, m1 pmulhrsw m2, m6 paddw m1, m2 vpermq m1, m1, q3120 psraw m2, m4, 15 vpblendvb m0, m1, m2 .w8_upsample_above_toponly: paddw m4, m5 sub r5, 4 mova [dstq+strideq*0], xm0 vextracti128 [dstq+strideq*1], m0, 1 sub hd, 2 jz .w8_ret lea dstq, [dstq+strideq*2] jmp .w8_upsample_above_loop .w8_main: vbroadcasti128 m7, [base+z2_y_shuf] lea r11, [rsp+120] mov r8, -4 .w8_main_upsample_left: movd xm1, dyd vbroadcasti128 m4, [base+z_base_inc+2] movd xm2, dxd vpbroadcastw m1, xm1 vpbroadcastw m2, xm2 mov r7, dstq pmullw m0, m1, [base+z2_ymul8] paddw m5, m2, m2 psllw xm1, 3 vpblendd m2, m5, 0xf0 ; xpos0 xpos1 lea r9d, [dxq+(65<<6)] ; xpos paddw m4, m2 movd [rsp+284], xm1 .w8_loop0: mov r2d, r9d mova [rsp+288], m0 mov r5, r11 mova [rsp+320], m4 pshufd m6, m0, q2020 psraw xm0, 6 pxor xm1, xm1 psubw xm8, xm1, xm0 ; base_y pand m6, m11 ; frac_y punpckhwd xm9, xm8, xm1 ; base_y 2 3 6 7 psllw m6, 9 punpcklwd xm8, xm1 ; base_y 0 1 4 5 .w8_loop: lea r3d, [r2+dxq] shr r2d, 6 ; base_x0 movu xm0, [rsp+r2*2] movu xm1, [rsp+r2*2+2] lea r2d, [r3+dxq] shr r3d, 6 ; base_x1 vinserti128 m0, [rsp+r3*2], 1 vinserti128 m1, [rsp+r3*2+2], 1 pand m2, m11, m4 psubw m1, m0 psllw m2, 9 pmulhrsw m1, m2 paddw m0, m1 cmp r3d, 64 jge .w8_toponly mova m1, m5 vpgatherdq m3, [r5+xm9*2], m5 mova m5, m1 vpgatherdq m2, [r5+xm8*2], m1 pshufb m3, m7 ; c0 d0 c1 d1 g0 h0 g1 h1 pshufb m2, m7 ; a0 b0 a1 b1 e0 f0 e1 f1 punpckldq m1, m2, m3 ; a0 b0 c0 d0 a1 b1 c1 d1 e0 f0 g0 h0 e1 f1 g1 h1 punpckhdq m2, m3 psubw m2, m1 pmulhrsw m2, m6 paddw m1, m2 vpermq m1, m1, q3120 psraw m2, m4, 15 ; base_x < topleft vpblendvb m0, m1, m2 .w8_toponly: paddw m4, m5 ; xpos += dx add r5, r8 mova [dstq+strideq*0], xm0 vextracti128 [dstq+strideq*1], m0, 1 sub hd, 2 jz .w8_end lea dstq, [dstq+strideq*2] cmp r2d, (63-8)<<6 jge .w8_loop .w8_leftonly_loop: mova m0, m5 vpgatherdq m4, [r5+xm9*2], m5 mova m5, m0 vpgatherdq m3, [r5+xm8*2], m0 add r5, r8 pshufb m2, m4, m7 pshufb m1, m3, m7 punpckldq m0, m1, m2 punpckhdq m1, m2 psubw m1, m0 pmulhrsw m1, m6 paddw m0, m1 vpermq m0, m0, q3120 mova [dstq+strideq*0], xm0 vextracti128 [dstq+strideq*1], m0, 1 lea dstq, [dstq+strideq*2] sub hd, 2 jg .w8_leftonly_loop .w8_end: sub r10d, 1<<8 jl .w8_ret vpbroadcastd m0, [rsp+284] add r7, 16 paddw m0, [rsp+288] ; base_y += 8*dy add r9d, 8<<6 vpbroadcastd m4, [pw_512] movzx hd, r10b paddw m4, [rsp+320] ; base_x += 8*64 mov 
dstq, r7 jmp .w8_loop0 .w8_ret: RET .w16: movd xm0, [tlq+32] lea r10d, [hq+(1<<8)] movd [rsp+160], xm0 test angled, 0x400 jnz .w8_main lea r3d, [hq+15] sub angled, 90 call .filter_strength test r3d, r3d jz .w16_no_filter_above popcnt r3d, r3d vpbroadcastd m4, [base+z_filter_k-4+r3*4+12*1] vpbroadcastd m5, [base+z_filter_k-4+r3*4+12*0] vpbroadcastd m6, [base+z_filter_k-4+r3*4+12*2] movu m0, [tlq+2] ; 1 2 3 4 5 6 7 8 9 a b c d e f g punpcklwd xm2, xm1, xm1 vpblendd m2, [tlq-2], 0xfe ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e punpckhwd m3, m0, m0 pmullw m4, m0 vpblendd m3, [tlq+4], 0x7f ; 2 3 4 5 6 7 8 9 a b c d e f g g paddw m1, m3 vpblendd m3, [tlq+6], 0x7f ; 3 4 5 6 7 8 9 a b c d e f g g g paddw m2, m3 vpbroadcastd m3, r6m ; max_width pmullw m1, m5 pmullw m2, m6 packssdw m3, m3 paddw m1, m4 paddw m1, m2 psubw m3, [base+pw_1to16] pxor m4, m4 psrlw m1, 3 pminsw m3, m11 pavgw m1, m4 vpblendvb m1, m0, m3 movu [rsp+130], m1 .w16_no_filter_above: vpbroadcastd m0, [base+pb_90] psubb m0, m7 pand m0, m8 pcmpgtb m0, m9 pmovmskb r3d, m0 test r3d, r3d jz .w8_main popcnt r3d, r3d cmp r3d, 3 jne .w16_filter_left_s12 vpbroadcastd m6, [base+pw_3] vpbroadcastd m7, [base+pw_16] cmp hd, 4 jne .filter_left_s3 movq xm0, [tlq-8] ; 0 1 2 3 movq xm1, [tlq-6] ; 1 2 3 4 vpbroadcastd xm5, r7m ; max_height movq xm4, [base+pw_16to1+24] ; 4to1 pshuflw xm2, xm0, q2100 ; 0 0 1 2 pshuflw xm3, xm1, q3321 ; 2 3 4 4 paddw xm1, xm0 paddw xm1, xm2 pshuflw xm2, xm0, q1000 ; 0 0 0 1 paddw xm3, xm6 packssdw xm5, xm5 pavgw xm2, xm3 psubw xm5, xm4 paddw xm1, xm2 pminsw xm5, xm11 psrlw xm1, 2 vpblendvb xm1, xm0, xm5 movq [rsp+120], xm1 jmp .w8_main .w32: mova m2, [tlq+32] movd xm0, [tlq+64] lea r10d, [hq+(3<<8)] mova [rsp+160], m2 movd [rsp+192], xm0 test angled, 0x400 jnz .w8_main vpbroadcastd m6, [base+pw_3] vpbroadcastd m0, r6m ; max_width vpbroadcastd m7, [base+pw_16] mov r3d, 32 packssdw m0, m0 psubw m0, [base+pw_1to16] pminsw m8, m0, m11 psubw m9, m8, m7 .w32_filter_above: movu m0, [tlq+2] punpcklwd xm4, xm1, xm1 paddw m2, m6, [tlq+6] paddw m1, m0 vpblendd m4, [tlq-2], 0xfe ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e paddw m1, [tlq+4] movu m3, [tlq+r3+2] paddw m5, m6, [tlq+r3-2] pavgw m2, m4 punpckhwd m4, m3, m3 paddw m1, m2 vpblendd m2, m4, [tlq+r3+6], 0x7f ; 4 5 6 7 8 9 a b c d e f g h h h vpblendd m4, [tlq+r3+4], 0x7f ; 3 4 5 6 7 8 9 a b c d e f g h h pavgw m2, m5 paddw m5, m3, [tlq+r3] paddw m4, m5 psrlw m1, 2 paddw m2, m4 vpblendvb m1, m0, m8 psrlw m2, 2 vpblendvb m2, m3, m9 movu [rsp+130], m1 movu [rsp+r3+130], m2 .filter_left_s3: cmp hd, 16 jl .filter_left_s3_h8 ; h8 .filter_left_s3b: mova m0, [tlq-32] ; 2 3 4 5 6 7 8 9 a b c d e f g h movu m2, [tlq-30] ; 3 4 5 6 7 8 9 a b c d e f g h i vpbroadcastd m5, r7m ; max_height paddw m1, m0, m2 punpckhwd m2, m2 mov r3d, hd vpblendd m2, [tlq-28], 0x7f ; 4 5 6 7 8 9 a b c d e f g h i i packssdw m5, m5 not r3 psubw m5, [base+pw_16to1] paddw m2, m6 pminsw m8, m11, m5 je .filter_left_s3_end ; h16 paddw m1, [tlq-34] ; 1 2 3 4 5 6 7 8 9 a b c d e f g pavgw m2, [tlq-36] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f paddw m1, m2 psrlw m1, 2 vpblendvb m3, m1, m0, m8 mova m0, [tlq-64] ; 2 3 4 5 6 7 8 9 a b c d e f g h paddw m1, m0, [tlq-62] ; 3 4 5 6 7 8 9 a b c d e f g h i paddw m2, m6, [tlq-60] ; 4 5 6 7 8 9 a b c d e f g h i j psubw m8, m7 mova [rsp+96], m3 jnp .filter_left_s3_end ; h32 mova m5, [tlq-96] paddw m1, [tlq-66] pavgw m2, [tlq-68] paddw m1, m2 paddw m4, m5, [tlq-94] paddw m2, m6, [tlq-92] psrlw m1, 2 paddw m4, [tlq- 98] pavgw m2, [tlq-100] vpblendvb m3, m1, m0, m8 mova m0, [tlq-128] psubw m8, m7 paddw m4, 
m2 paddw m1, m0, [tlq-126] paddw m2, m6, [tlq-124] psrlw m4, 2 mova [rsp+64], m3 vpblendvb m4, m5, m8 psubw m8, m7 mova [rsp+32], m4 .filter_left_s3_end: punpcklwd xm3, xm0, xm0 vpblendd m4, m3, [tlq+r3*2], 0xfe ; 2 2 3 4 5 6 7 8 9 a b c d e f g vpblendd m3, [tlq+r3*2-2], 0xfe ; 2 2 2 3 4 5 6 7 8 9 a b c d e f paddw m1, m4 pavgw m2, m3 paddw m1, m2 psrlw m1, 2 vpblendvb m1, m0, m8 mova [rsp+r3*2+130], m1 jmp .w8_main .filter_left_s3_h8: mova xm0, [tlq-16] ; 0 1 2 3 4 5 6 7 movu xm3, [tlq-14] ; 1 2 3 4 5 6 7 8 pblendw xm2, xm0, [tlq-18], 0xfe ; 0 0 1 2 3 4 5 6 vpbroadcastd xm5, r7m ; max_height paddw xm1, xm0, xm3 pblendw xm3, [tlq-12], 0x7f ; 2 3 4 5 6 7 8 8 paddw xm1, xm2 vpblendd xm2, [tlq-20], 0x0e ; 0 0 0 1 2 3 4 5 paddw xm3, xm6 packssdw xm5, xm5 pavgw xm2, xm3 psubw xm5, [base+pw_16to1+16] ; 8to1 paddw xm1, xm2 pminsw xm5, xm11 psrlw xm1, 2 vpblendvb xm1, xm0, xm5 mova [rsp+112], xm1 jmp .w8_main .w64: mova m2, [tlq+ 32] mova m3, [tlq+ 64] mova m4, [tlq+ 96] movd xm0, [tlq+128] lea r10d, [hq+(7<<8)] mova [rsp+160], m2 mova [rsp+192], m3 mova [rsp+224], m4 movd [rsp+256], xm0 test angled, 0x400 jnz .w8_main vpbroadcastd m6, [base+pw_3] movu m0, [tlq+34] ; 2 3 4 5 6 7 8 9 a b c d e f g h paddw m2, m6, [tlq+30] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f paddw m5, m0, [tlq+32] ; 1 2 3 4 5 6 7 8 9 a b c d e f g pavgw m2, [tlq+38] ; 4 5 6 7 8 9 a b c d e f g h h h paddw m5, [tlq+36] ; 3 4 5 6 7 8 9 a b c d e f g h h movu m4, [tlq+66] paddw m3, m6, [tlq+62] paddw m7, m4, [tlq+64] pavgw m3, [tlq+70] paddw m7, [tlq+68] paddw m2, m5 vpbroadcastd m5, r6m ; max_width mov r3d, 96 packssdw m5, m5 paddw m3, m7 psubw m5, [base+pw_1to16] psrlw m2, 2 vpbroadcastd m7, [base+pw_16] psrlw m3, 2 pminsw m8, m11, m5 psubw m9, m8, m7 vpblendvb m2, m0, m9 psubw m9, m7 vpblendvb m3, m4, m9 psubw m9, m7 movu [rsp+162], m2 movu [rsp+194], m3 jmp .w32_filter_above cglobal ipred_z3_16bpc, 4, 9, 0, dst, stride, tl, w, h, angle, dy, org_w, maxbase %assign org_stack_offset stack_offset lea r6, [ipred_z3_16bpc_avx2_table] tzcnt hd, hm movifnidn angled, anglem lea r7, [dr_intra_derivative+45*2-1] sub tlq, 2 movsxd hq, [r6+hq*4] sub angled, 180 add hq, r6 mov dyd, angled neg dyd xor angled, 0x400 or dyq, ~0x7e movzx dyd, word [r7+dyq] vpbroadcastd m5, [pw_62] mov org_wd, wd jmp hq .h4: ALLOC_STACK -64, 7 lea r7, [strideq*3] cmp angleb, 40 jae .h4_no_upsample lea r4d, [angleq-1024] sar r4d, 7 add r4d, wd jg .h4_no_upsample ; !enable_intra_edge_filter || w > 8 || (w == 8 && is_sm) mova xm2, [tlq-14] ; 0 1 2 3 4 5 6 7 pblendw xm1, xm2, [tlq-16], 0xfe ; 0 0 1 2 3 4 5 6 vpblendd xm0, xm1, [tlq-18], 0x0e ; 0 0 0 1 2 3 4 5 pshufd xm3, xm1, q0000 paddw xm1, xm2 paddw xm0, [tlq-12] ; 1 2 3 4 5 6 7 8 vpbroadcastw xm4, r8m ; pixel_max add dyd, dyd psubw xm0, xm1, xm0 mova [rsp+ 0], xm3 movd xm3, dyd psraw xm0, 3 neg dyd paddw xm1, xm0 pxor xm0, xm0 lea r2d, [dyq+(16<<6)+63] ; ypos pmaxsw xm1, xm0 pavgw xm1, xm0 vpbroadcastw m3, xm3 pminsw xm1, xm4 punpckhwd xm0, xm1, xm2 punpcklwd xm1, xm2 paddw m2, m3, m3 mova [rsp+32], xm0 punpcklwd m3, m2 mova [rsp+16], xm1 paddw m4, m2, m2 paddw m2, m3 vpblendd m3, m2, 0xf0 ; ypos0 ypos1 ypos2 ypos3 .h4_upsample_loop: lea r4d, [r2+dyq] shr r2d, 6 movu xm1, [rsp+r2*2] lea r2d, [r4+dyq] shr r4d, 6 movu xm2, [rsp+r4*2] lea r4d, [r2+dyq] shr r2d, 6 vinserti128 m1, [rsp+r2*2], 1 lea r2d, [r4+dyq] shr r4d, 6 vinserti128 m2, [rsp+r4*2], 1 psrld m0, m1, 16 pblendw m0, m2, 0xaa ; a3 b3 a2 b2 a1 b1 a0 b0 c3 d3 c2 d2 c1 d1 c0 d0 pslld m2, 16 pblendw m1, m2, 0xaa pand m2, m5, m3 psllw m2, 9 psubw m1, m0 pmulhrsw 
m1, m2 paddw m3, m4 paddw m1, m0 vextracti128 xm2, m1, 1 punpckhdq xm0, xm1, xm2 ; a1 b1 c1 d1 a0 b0 c0 d0 punpckldq xm1, xm2 ; a3 b3 c3 d3 a2 b2 c2 d2 movhps [dstq+strideq*0], xm0 movq [dstq+strideq*1], xm0 movhps [dstq+strideq*2], xm1 movq [dstq+r7 ], xm1 add dstq, 8 sub wd, 4 jg .h4_upsample_loop RET ALIGN function_align .filter_strength: ; h4/h8/h16 %define base r4-z_filter_t0 lea r4, [z_filter_t0] movd xm0, maxbased movd xm1, angled shr angled, 8 ; is_sm << 1 vpbroadcastb m0, xm0 vpbroadcastb m1, xm1 pcmpeqb m0, [base+z_filter_wh] pand m0, m1 mova xm1, [r4+angleq*8] pcmpgtb m0, m1 pmovmskb r5d, m0 ret .h4_no_upsample: mov maxbased, 7 test angled, 0x400 ; !enable_intra_edge_filter jnz .h4_main lea maxbased, [wq+3] call .filter_strength mov maxbased, 7 test r5d, r5d jz .h4_main ; filter_strength == 0 popcnt r5d, r5d mova xm0, [tlq-14] ; 0 1 2 3 4 5 6 7 movu xm3, [tlq-12] ; 1 2 3 4 5 6 7 8 vpbroadcastd xm2, [base+z_filter_k-4+r5*4+12*1] vpbroadcastd xm4, [base+z_filter_k-4+r5*4+12*0] pmullw xm2, xm0 pblendw xm0, [tlq-16], 0xfe ; 0 0 1 2 3 4 5 6 paddw xm1, xm0, xm3 movd [rsp+12], xm0 pmullw xm1, xm4 cmp r5d, 3 jne .h4_filter_3tap pblendw xm3, [tlq-10], 0x7f ; 2 3 4 5 6 7 8 8 vpblendd xm0, [tlq-18], 0x0e ; 0 0 0 1 2 3 4 5 movzx r4d, word [tlq-14] movzx r2d, word [tlq-12] inc maxbased paddw xm1, xm2 paddw xm0, xm3 sub r2d, r4d paddw xm2, xm0, xm0 lea r2d, [r2+r4*8+4] shr r2d, 3 mov [rsp+14], r2w .h4_filter_3tap: pxor xm0, xm0 paddw xm1, xm2 lea tlq, [rsp+30] psrlw xm1, 3 cmp wd, 8 sbb maxbased, -1 pavgw xm0, xm1 mova [rsp+16], xm0 .h4_main: movd xm3, dyd neg maxbaseq vbroadcasti128 m1, [z_base_inc] vpbroadcastw m6, [tlq+maxbaseq*2] shl maxbased, 6 vpbroadcastw m3, xm3 lea r4d, [maxbaseq+3*64] neg dyq movd xm2, r4d sub tlq, 8 lea r4, [dyq+63] ; ypos punpcklwd m1, m1 paddw m0, m3, m3 vpbroadcastw m2, xm2 punpcklwd m3, m0 paddw m4, m0, m0 paddw m0, m3 psubw m2, m1 vpblendd m3, m0, 0xf0 ; ypos0 ypos1 ypos2 ypos3 or maxbased, 63 paddw m3, m2 .h4_loop: lea r5, [r4+dyq] sar r4, 6 ; base0 movu xm1, [tlq+r4*2] lea r4, [r5+dyq] sar r5, 6 ; base1 movu xm2, [tlq+r5*2] lea r5, [r4+dyq] sar r4, 6 ; base2 vinserti128 m1, [tlq+r4*2], 1 lea r4, [r5+dyq] sar r5, 6 ; base3 vinserti128 m2, [tlq+r5*2], 1 punpckhwd m0, m1, m2 punpcklwd m1, m2 pand m2, m5, m3 palignr m0, m1, 4 ; a3 b3 a2 b2 a1 b1 a0 b0 c3 d3 c2 d2 c1 d1 c0 d0 psllw m2, 9 psubw m1, m0 pmulhrsw m1, m2 psraw m2, m3, 15 ; ypos < max_base_y paddw m3, m4 paddw m1, m0 vpblendvb m1, m6, m1, m2 vextracti128 xm2, m1, 1 punpckhdq xm0, xm1, xm2 ; a1 b1 c1 d1 a0 b0 c0 d0 punpckldq xm1, xm2 ; a3 b3 c3 d3 a2 b2 c2 d2 movhps [dstq+strideq*0], xm0 movq [dstq+strideq*1], xm0 movhps [dstq+strideq*2], xm1 movq [dstq+r7 ], xm1 sub wd, 4 jz .h4_end add dstq, 8 cmp r4d, maxbased jg .h4_loop .h4_end_loop: movq [dstq+strideq*0], xm6 movq [dstq+strideq*1], xm6 movq [dstq+strideq*2], xm6 movq [dstq+r7 ], xm6 add dstq, 8 sub wd, 4 jg .h4_end_loop .h4_end: RET .h8: lea r4d, [angleq+216] %assign stack_offset org_stack_offset ALLOC_STACK -64, 8 mov r4b, wb lea r7, [strideq*3] cmp r4d, 8 ja .h8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || w > 8 mova m2, [tlq-30] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f paddw m1, m2, [tlq-32] ; _ 0 1 2 3 4 5 6 7 8 9 a b c d e movu m0, [tlq-34] ; _ _ 0 1 2 3 4 5 6 7 8 9 a b c d cmp wd, 8 je .h8_upsample_w8 pshufhw xm3, xm2, q1000 vpblendd m0, m3, 0x0f ; _ _ _ _ 4 4 4 5 6 7 8 9 a b c d .h8_upsample_w8: paddw m0, [tlq-28] ; 1 2 3 4 5 6 7 8 9 a b c d e f g vpbroadcastw m4, r8m ; pixel_max add dyd, dyd psubw m0, m1, m0 movd xm6, dyd psraw 
m0, 3 neg dyd paddw m1, m0 pxor m0, m0 pmaxsw m1, m0 lea r4d, [dyq+(16<<6)+63] ; ypos pavgw m1, m0 vpbroadcastw m6, xm6 pminsw m1, m4 punpckhwd m0, m1, m2 punpcklwd m1, m2 vextracti128 [rsp+48], m0, 1 vextracti128 [rsp+32], m1, 1 paddw m7, m6, m6 mova [rsp+16], xm0 mova [rsp+ 0], xm1 punpcklwd m6, m7 ; ypos0 ypos1 .h8_upsample_loop: lea r2d, [r4+dyq] shr r4d, 6 ; base0 movu m1, [rsp+r4*2] lea r4d, [r2+dyq] shr r2d, 6 ; base1 movu m2, [rsp+r2*2] lea r2d, [r4+dyq] shr r4d, 6 ; base2 movu m3, [rsp+r4*2] lea r4d, [r2+dyq] shr r2d, 6 ; base3 movu m4, [rsp+r2*2] psrld m0, m1, 16 pblendw m0, m2, 0xaa ; a7 b7 a6 b6 a5 b5 a4 b4 a3 b3 a2 b2 a1 b1 a0 b0 pslld m2, 16 pblendw m1, m2, 0xaa psrld m2, m3, 16 pblendw m2, m4, 0xaa ; c7 d7 c6 d6 c5 d5 c4 d4 c3 d3 c2 d2 c1 d1 c0 d0 pslld m4, 16 pblendw m3, m4, 0xaa pand m4, m5, m6 paddw m6, m7 psllw m4, 9 psubw m1, m0 pmulhrsw m1, m4 pand m4, m5, m6 psllw m4, 9 psubw m3, m2 pmulhrsw m3, m4 paddw m6, m7 lea r2, [dstq+strideq*4] paddw m1, m0 paddw m3, m2 punpckhdq m0, m1, m3 ; a5 b5 c5 d5 a4 b4 c4 d4 a1 b1 c1 d1 a0 b0 c0 d0 punpckldq m1, m3 ; a7 b7 c7 d7 a6 b6 c6 d6 a3 b3 c3 d3 a2 b2 c2 d2 vextracti128 xm2, m0, 1 vextracti128 xm3, m1, 1 movhps [r2 +strideq*0], xm0 movq [r2 +strideq*1], xm0 movhps [r2 +strideq*2], xm1 movq [r2 +r7 ], xm1 movhps [dstq+strideq*0], xm2 movq [dstq+strideq*1], xm2 movhps [dstq+strideq*2], xm3 movq [dstq+r7 ], xm3 add dstq, 8 sub wd, 4 jg .h8_upsample_loop RET .h8_no_intra_edge_filter: and maxbased, 7 or maxbased, 8 ; imin(w+7, 15) jmp .h8_main .h8_no_upsample: lea maxbased, [wq+7] test angled, 0x400 jnz .h8_no_intra_edge_filter call .filter_strength test r5d, r5d jz .h8_main popcnt r5d, r5d mova m0, [tlq-30] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f movu m3, [tlq-28] ; 1 2 3 4 5 6 7 8 9 a b c d e f g vpbroadcastd m2, [base+z_filter_k-4+r5*4+12*1] vpbroadcastd m4, [base+z_filter_k-4+r5*4+12*0] pmullw m2, m0 cmp wd, 8 jl .h8_filter_w4 punpcklwd xm0, xm0 vpblendd m1, m0, [tlq-32], 0xfe ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e movd [rsp+28], xm0 paddw m1, m3 mov r4d, 16 pmullw m1, m4 cmovg maxbased, r4d cmp r5d, 3 jne .h8_filter_3tap punpckhwd m3, m3 vpblendd m0, [tlq-34], 0xfe ; 0 0 0 1 2 3 4 5 6 7 8 9 a b c d vpblendd m3, [tlq-26], 0x7f ; 2 3 4 5 6 7 8 9 a b c d e f g g movzx r4d, word [tlq-30] movzx r2d, word [tlq-28] inc maxbased paddw m1, m2 paddw m0, m3 sub r2d, r4d paddw m2, m0, m0 lea r2d, [r2+r4*8+4] shr r2d, 3 mov [rsp+30], r2w jmp .h8_filter_3tap .h8_filter_w4: pshufhw xm1, xm0, q2100 vinserti128 m1, [tlq-16], 1 ; _ _ _ _ 4 4 5 6 7 8 9 a b c d e paddw m1, m3 pmullw m1, m4 .h8_filter_3tap: pxor m0, m0 paddw m1, m2 lea tlq, [rsp+62] psrlw m1, 3 pavgw m0, m1 mova [rsp+32], m0 .h8_main: movd xm4, dyd neg maxbaseq vbroadcasti128 m1, [z_base_inc] vpbroadcastw m7, [tlq+maxbaseq*2] shl maxbased, 6 vpbroadcastw m4, xm4 lea r4d, [maxbaseq+7*64] neg dyq movd xm2, r4d sub tlq, 16 lea r4, [dyq+63] paddw m6, m4, m4 vpbroadcastw m2, xm2 vpblendd m4, m6, 0xf0 ; ypos0 ypos1 psubw m2, m1 or maxbased, 63 paddw m4, m2 .h8_loop: lea r5, [r4+dyq] sar r4, 6 ; base0 movu xm0, [tlq+r4*2+2] movu xm1, [tlq+r4*2] lea r4, [r5+dyq] sar r5, 6 ; base1 vinserti128 m0, [tlq+r5*2+2], 1 vinserti128 m1, [tlq+r5*2], 1 lea r5, [r4+dyq] sar r4, 6 ; base2 pand m3, m5, m4 psllw m3, 9 psubw m1, m0 pmulhrsw m1, m3 psraw m3, m4, 15 paddw m4, m6 paddw m0, m1 movu xm1, [tlq+r4*2+2] movu xm2, [tlq+r4*2] lea r4, [r5+dyq] sar r5, 6 ; base3 vpblendvb m0, m7, m0, m3 vinserti128 m1, [tlq+r5*2+2], 1 vinserti128 m2, [tlq+r5*2], 1 pand m3, m5, m4 psllw m3, 9 psubw m2, m1 pmulhrsw m2, m3 psraw m3, 
m4, 15 paddw m4, m6 lea r5, [dstq+strideq*4] paddw m1, m2 vpblendvb m1, m7, m1, m3 punpckhwd m2, m0, m1 ; a3 c3 a2 c2 a1 c1 a0 c0 b3 d3 b2 d2 b1 d1 b0 d0 vextracti128 xm3, m2, 1 punpcklwd m0, m1 ; a7 c7 a6 c6 a5 c5 a4 c5 b7 d7 b6 d6 b5 d5 b4 d4 punpckhwd xm1, xm2, xm3 ; a1 b1 c1 d1 a0 b0 c0 d0 punpcklwd xm2, xm3 ; a3 b3 c3 d3 a2 b2 c2 d2 vextracti128 xm3, m0, 1 movhps [dstq+strideq*0], xm1 movq [dstq+strideq*1], xm1 movhps [dstq+strideq*2], xm2 movq [dstq+r7 ], xm2 punpckhwd xm1, xm0, xm3 ; a5 b5 c5 d5 a4 b4 c4 d4 punpcklwd xm0, xm3 ; a7 b7 c7 d7 a6 b6 c6 d6 movhps [r5 +strideq*0], xm1 movq [r5 +strideq*1], xm1 movhps [r5 +strideq*2], xm0 movq [r5 +r7 ], xm0 sub wd, 4 jz .h8_end add dstq, 8 cmp r4d, maxbased jg .h8_loop lea r6, [strideq*5] lea r2, [strideq+r7*2] ; stride*7 test wd, 4 jz .h8_end_loop movq [dstq+strideq*0], xm7 movq [dstq+strideq*1], xm7 movq [dstq+strideq*2], xm7 movq [dstq+r7 ], xm7 movq [dstq+strideq*4], xm7 movq [dstq+r6 ], xm7 movq [dstq+r7*2 ], xm7 movq [dstq+r2 ], xm7 add dstq, 8 sub wd, 4 jz .h8_end .h8_end_loop: mova [dstq+strideq*0], xm7 mova [dstq+strideq*1], xm7 mova [dstq+strideq*2], xm7 mova [dstq+r7 ], xm7 mova [dstq+strideq*4], xm7 mova [dstq+r6 ], xm7 mova [dstq+r7*2 ], xm7 mova [dstq+r2 ], xm7 add dstq, 16 sub wd, 8 jg .h8_end_loop .h8_end: RET .h16_no_intra_edge_filter: and maxbased, 15 or maxbased, 16 ; imin(w+15, 31) jmp .h16_main ALIGN function_align .h16: %assign stack_offset org_stack_offset ALLOC_STACK -96, 10 lea maxbased, [wq+15] lea r7, [strideq*3] test angled, 0x400 jnz .h16_no_intra_edge_filter call .filter_strength test r5d, r5d jz .h16_main ; filter_strength == 0 popcnt r5d, r5d movu m0, [tlq-28] ; 3 4 5 6 7 8 9 a b c d e f g h i paddw m1, m0, [tlq-32] ; 1 2 3 4 5 6 7 8 9 a b c d e f g vpbroadcastd m6, [base+z_filter_k-4+r5*4+12*1] vpbroadcastd m7, [base+z_filter_k-4+r5*4+12*0] pmullw m2, m6, [tlq-30] ; 2 3 4 5 6 7 8 9 a b c d e f g h pmullw m1, m7 paddw m1, m2 cmp wd, 8 jg .h16_filter_w16 mova xm3, [tlq-46] ; 0 1 2 3 4 5 6 7 pmullw xm6, xm3 jl .h16_filter_w4 pblendw xm3, [tlq-48], 0xfe ; 0 0 1 2 3 4 5 6 cmp r5d, 3 jne .h16_filter_w8_3tap vpblendd xm4, xm3, [tlq-50], 0x0e ; 0 0 0 1 2 3 4 5 .h16_filter_w8_5tap: punpckhwd m0, m0 vpblendd m0, [tlq-26], 0x7f ; 4 5 6 7 8 9 a b c d e f g h i i paddw xm4, [tlq-42] ; 2 3 4 5 6 7 8 9 paddw m0, [tlq-34] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f paddw xm4, xm4 paddw m0, m0 paddw xm6, xm4 paddw m1, m0 .h16_filter_w8_3tap: paddw xm3, [tlq-44] ; 1 2 3 4 5 6 7 8 pmullw xm3, xm7 pxor m0, m0 paddw xm3, xm6 psrlw xm3, 3 pavgw xm3, xm0 mova [rsp+48], xm3 jmp .h16_filter_end .h16_filter_w4: pshufhw xm3, xm3, q2100 ; _ _ _ _ 4 4 5 6 cmp r5d, 3 jne .h16_filter_w8_3tap pshufhw xm4, xm3, q2100 ; _ _ _ _ 4 4 4 5 jmp .h16_filter_w8_5tap .h16_filter_w16: mova m3, [tlq-62] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f pmullw m6, m3 punpcklwd xm3, xm3 vpblendd m4, m3, [tlq-64], 0xfe ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e paddw m4, [tlq-60] ; 1 2 3 4 5 6 7 8 9 a b c d e f g mov r4d, 32 cmp wd, 16 cmovg maxbased, r4d movd [rsp+28], xm3 pmullw m4, m7 cmp r5d, 3 jne .h16_filter_w16_3tap punpckhwd m0, m0 vpblendd m3, [tlq-66], 0xfe ; 0 0 0 1 2 3 4 5 6 7 8 9 a b c d vpblendd m0, [tlq-26], 0x7f ; 4 5 6 7 8 9 a b c d e f g h i i paddw m3, [tlq-58] ; 2 3 4 5 6 7 8 9 a b c d e f g h paddw m0, [tlq-34] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f movzx r4d, word [tlq-62] movzx r2d, word [tlq-60] or maxbased, 1 paddw m3, m3 sub r2d, r4d paddw m0, m0 lea r2d, [r2+r4*8+4] paddw m4, m3 shr r2d, 3 paddw m1, m0 mov [rsp+30], r2w .h16_filter_w16_3tap: pxor m0, m0 paddw m4, 
m6 psrlw m4, 3 pavgw m4, m0 mova [rsp+32], m4 .h16_filter_end: psrlw m1, 3 lea tlq, [rsp+94] pavgw m1, m0 mova [rsp+64], m1 .h16_main: movd xm8, dyd neg maxbaseq vpbroadcastw m9, [tlq+maxbaseq*2] shl maxbased, 6 vpbroadcastw m8, xm8 lea r4d, [maxbaseq+dyq+15*64] neg dyq movd xm7, r4d sub tlq, 32 lea r4, [dyq+63] vpbroadcastw m7, xm7 or maxbased, 63 psubw m7, [z_base_inc] .h16_loop: lea r5, [r4+dyq] sar r4, 6 ; base0 movu m0, [tlq+r4*2+2] movu m2, [tlq+r4*2] lea r4, [r5+dyq] sar r5, 6 ; base1 movu m1, [tlq+r5*2+2] movu m3, [tlq+r5*2] lea r5, [r4+dyq] sar r4, 6 ; base3 pand m6, m5, m7 psllw m6, 9 psubw m2, m0 pmulhrsw m2, m6 psraw m6, m7, 15 paddw m7, m8 paddw m0, m2 movu m2, [tlq+r4*2+2] movu m4, [tlq+r4*2] lea r4, [r5+dyq] sar r5, 6 ; base3 vpblendvb m0, m9, m0, m6 pand m6, m5, m7 psllw m6, 9 psubw m3, m1 pmulhrsw m3, m6 psraw m6, m7, 15 paddw m7, m8 paddw m1, m3 vpblendvb m1, m9, m1, m6 pand m6, m5, m7 psllw m6, 9 psubw m4, m2 pmulhrsw m4, m6 psraw m6, m7, 15 paddw m7, m8 paddw m2, m4 movu m3, [tlq+r5*2+2] movu m4, [tlq+r5*2] vpblendvb m2, m9, m2, m6 pand m6, m5, m7 psllw m6, 9 psubw m4, m3 pmulhrsw m4, m6 psraw m6, m7, 15 paddw m7, m8 lea r5, [dstq+strideq*4] paddw m3, m4 vpblendvb m3, m9, m3, m6 punpckhwd m4, m0, m1 ; ab bb aa ba a9 b9 a8 b8 a3 b3 a2 b2 a1 b1 a0 b0 punpcklwd m0, m1 ; af bf ae be ad bd ac bc a7 b7 a6 b6 a5 b5 a4 b4 punpckhwd m1, m2, m3 ; cb db ca da c9 d9 c8 d8 c3 d3 c2 d2 c1 d1 c0 d0 punpcklwd m2, m3 ; cf df ce de cd dd cc dc c7 d7 c6 d6 c5 d5 c4 d4 punpckhdq m3, m4, m1 ; a9 b9 c9 d9 a8 b8 c8 d8 a1 b1 c1 d1 a0 b0 c0 d0 vextracti128 xm6, m3, 1 punpckldq m4, m1 ; ab bb cb db aa ba ca da a3 b3 c3 d3 a2 b2 c2 d2 punpckhdq m1, m0, m2 ; ad bd cd dd ac bc cc dc a5 b5 c5 d5 a4 b4 c4 d4 punpckldq m0, m2 ; af bf cf df ae be ce de a7 b7 c7 d7 a6 b6 c6 d6 vextracti128 xm2, m4, 1 movhps [dstq+strideq*0], xm6 movq [dstq+strideq*1], xm6 vextracti128 xm6, m1, 1 movhps [dstq+strideq*2], xm2 movq [dstq+r7 ], xm2 vextracti128 xm2, m0, 1 movhps [r5 +strideq*0], xm6 movq [r5 +strideq*1], xm6 movhps [r5 +strideq*2], xm2 movq [r5 +r7 ], xm2 lea r5, [dstq+strideq*8] movhps [r5 +strideq*0], xm3 movq [r5 +strideq*1], xm3 movhps [r5 +strideq*2], xm4 movq [r5 +r7 ], xm4 lea r5, [r5+strideq*4] movhps [r5 +strideq*0], xm1 movq [r5 +strideq*1], xm1 movhps [r5 +strideq*2], xm0 movq [r5 +r7 ], xm0 sub wd, 4 jz .h16_end add dstq, 8 cmp r4d, maxbased jg .h16_loop mov hd, 4 .h16_end_loop0: mov r6d, wd mov r2, dstq test wb, 4 jz .h16_end_loop movq [dstq+strideq*0], xm9 movq [dstq+strideq*1], xm9 movq [dstq+strideq*2], xm9 movq [dstq+r7 ], xm9 and r6d, 120 jz .h16_end_w4 add dstq, 8 .h16_end_loop: mova [dstq+strideq*0], xm9 mova [dstq+strideq*1], xm9 mova [dstq+strideq*2], xm9 mova [dstq+r7 ], xm9 add dstq, 16 sub r6d, 8 jg .h16_end_loop .h16_end_w4: lea dstq, [r2+strideq*4] dec hd jg .h16_end_loop0 .h16_end: RET .h32: %assign stack_offset org_stack_offset ALLOC_STACK -160, 9 lea maxbased, [wq+31] and maxbased, 31 or maxbased, 32 ; imin(w+31, 63) test angled, 0x400 jnz .h32_main vpbroadcastd m2, [pw_3] movu m0, [tlq-28] ; 3 4 5 6 7 8 9 a b c d e f g h i punpckhwd m1, m0, m0 vpblendd m1, [tlq-26], 0x7f ; 4 5 6 7 8 9 a b c d e f g h i i paddw m0, [tlq-30] ; 2 3 4 5 6 7 8 9 a b c d e f g h paddw m1, m2 paddw m0, [tlq-32] ; 1 2 3 4 5 6 7 8 9 a b c d e f g pavgw m1, [tlq-34] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f lea r4, [rsp+128] paddw m0, m1 lea r5d, [maxbaseq-31] psrlw m0, 2 mova [r4], m0 .h32_filter_loop: mova m0, [tlq-62] paddw m1, m2, [tlq-66] paddw m0, [tlq-64] pavgw m1, [tlq-58] paddw m0, [tlq-60] sub tlq, 32 
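; The .h32 filter loop applies the same 5-tap smoothing to the left edge,
; presumably for the same reason as the w32/w64 top-edge case: each iteration
; filters 16 pixels (32 bytes) from [tlq] and stores the result to the stack
; scratch buffer at r4, with m2 holding the pw_3 bias added before pavgw.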
sub r4, 32 paddw m0, m1 psrlw m0, 2 mova [r4], m0 sub r5d, 16 jg .h32_filter_loop jl .h32_filter_h8 mova m0, [tlq-62] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f punpcklwd xm1, xm0, xm0 paddw m2, [tlq-58] ; 2 3 4 5 6 7 8 9 a b c d e f g h paddw m0, [tlq-60] ; 1 2 3 4 5 6 7 8 9 a b c d e f g vpblendd m3, m1, [tlq-66], 0xfe ; 0 0 0 1 2 3 4 5 6 7 8 9 a b c d vpblendd m1, [tlq-64], 0xfe ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e movzx r5d, word [tlq-62] movzx r2d, word [tlq-60] pavgw m2, m3 sub r2d, r5d paddw m0, m1 lea r2d, [r2+r5*8+4] paddw m0, m2 shr r2d, 3 psrlw m0, 2 mova [r4-32], m0 mov [r4-36], r5w mov [r4-34], r2w lea tlq, [rsp+158] mov r4d, 65 cmp wd, 64 cmove maxbased, r4d jmp .h32_main .h32_filter_h8: mova xm0, [tlq-46] ; 0 1 2 3 4 5 6 7 pblendw xm1, xm0, [tlq-48], 0xfe ; 0 0 1 2 3 4 5 6 paddw xm2, [tlq-42] ; 2 3 4 5 6 7 8 9 paddw xm0, [tlq-44] ; 1 2 3 4 5 6 7 8 vpblendd xm3, xm1, [tlq-50], 0x0e ; 0 0 0 1 2 3 4 5 lea tlq, [rsp+158] pavgw xm2, xm3 paddw xm0, xm1 paddw xm0, xm2 psrlw xm0, 2 mova [r4-16], xm0 .h32_main: movd xm6, dyd neg maxbaseq vpbroadcastw m7, [tlq+maxbaseq*2] shl maxbased, 6 vpbroadcastw m6, xm6 lea r4d, [maxbaseq+dyq+15*64] neg dyq movd xm4, r4d vpbroadcastd m8, [pw_m1024] lea r4, [dyq+63] vpbroadcastw m4, xm4 or maxbased, 63 psubw m4, [z_base_inc] .h32_loop: mov r5, r4 sar r5, 6 movu m1, [tlq+r5*2-64] movu m0, [tlq+r5*2-62] pand m3, m5, m4 psllw m3, 9 psubw m1, m0 pmulhrsw m1, m3 pcmpgtw m2, m8, m4 paddw m0, m1 vpblendvb m0, m7, m0, m2 movu m2, [tlq+r5*2-32] movu m1, [tlq+r5*2-30] add r4, dyq sub rsp, 64 psubw m2, m1 pmulhrsw m2, m3 psraw m3, m4, 15 paddw m4, m6 mova [rsp+32*0], m0 paddw m1, m2 vpblendvb m1, m7, m1, m3 mova [rsp+32*1], m1 dec wd jz .h32_transpose cmp r4d, maxbased jg .h32_loop .h32_end_loop: sub rsp, 64 mova [rsp+32*0], m7 mova [rsp+32*1], m7 dec wd jg .h32_end_loop .h32_transpose: lea r3, [strideq*3] lea r4, [strideq*5] mov r8, dstq lea r5, [strideq+r3*2] .h32_transpose_loop0: lea r6, [rsp+32] lea r2, [r8+org_wq*2-16] .h32_transpose_loop: mova m0, [r6+64*7] mova m1, [r6+64*6] mova m2, [r6+64*5] mova m3, [r6+64*4] mova m4, [r6+64*3] mova m5, [r6+64*2] mova m6, [r6+64*1] mova m7, [r6+64*0] punpckhwd m8, m0, m1 ; a3 b3 a2 b2 a1 b1 a0 b0 punpcklwd m0, m1 ; a7 b7 a6 b6 a5 b5 a4 b4 punpckhwd m1, m2, m3 ; c3 d3 c2 d2 c1 d1 c0 d0 punpcklwd m2, m3 ; c7 d7 c6 d6 c5 d5 c4 d4 punpckhwd m3, m4, m5 ; e3 f3 e2 f2 e1 f1 e0 f0 punpcklwd m4, m5 ; e7 f7 e6 f6 e5 f5 e4 f4 punpckhwd m5, m6, m7 ; g3 h3 g2 h2 g1 h1 g0 h0 punpcklwd m6, m7 ; g7 h7 g6 h6 g5 h5 g4 h4 lea dstq, [r2+strideq*8] sub r6, 32 punpckhdq m7, m8, m1 ; a1 b1 c1 d1 a0 b0 c0 d0 punpckldq m8, m1 ; a3 b3 c3 d3 a2 b2 c2 d2 punpckhdq m1, m3, m5 ; e1 f1 g1 h1 e0 f0 g0 h0 punpckldq m3, m5 ; e3 f3 g3 h3 e2 f2 g2 h2 punpckhqdq m5, m7, m1 ; 8 0 vextracti128 [r2 +strideq*0], m5, 1 punpcklqdq m7, m1 ; 9 1 mova [dstq+strideq*0], xm5 punpckhqdq m1, m8, m3 ; 10 2 vextracti128 [r2 +strideq*1], m7, 1 punpcklqdq m8, m3 ; 11 3 mova [dstq+strideq*1], xm7 punpckhdq m3, m0, m2 ; a5 b5 c5 d5 a4 b4 c4 d4 vextracti128 [r2 +strideq*2], m1, 1 punpckldq m0, m2 ; a7 b7 c7 d7 a6 b6 c6 d6 mova [dstq+strideq*2], xm1 punpckhdq m2, m4, m6 ; e5 f5 g5 h5 e4 f4 g4 h4 vextracti128 [r2 +r3 ], m8, 1 punpckldq m4, m6 ; e7 f7 g7 h7 e6 f6 g6 h6 mova [dstq+r3 ], xm8 punpckhqdq m6, m3, m2 ; 12 4 vextracti128 [r2 +strideq*4], m6, 1 punpcklqdq m3, m2 ; 13 5 mova [dstq+strideq*4], xm6 punpckhqdq m2, m0, m4 ; 14 6 vextracti128 [r2 +r4 ], m3, 1 punpcklqdq m0, m4 ; 15 7 mova [dstq+r4 ], xm3 vextracti128 [r2 +r3*2 ], m2, 1 mova [dstq+r3*2 ], xm2 vextracti128 [r2 +r5 ], 
m0, 1 mova [dstq+r5 ], xm0 lea r2, [dstq+strideq*8] cmp r6, rsp jae .h32_transpose_loop add rsp, 64*8 sub org_wd, 8 jg .h32_transpose_loop0 .h32_end: RET .h64: %assign stack_offset org_stack_offset ALLOC_STACK -256, 10 lea maxbased, [wq+63] test angled, 0x400 jnz .h64_main vpbroadcastd m2, [pw_3] movu m0, [tlq-28] ; 3 4 5 6 7 8 9 a b c d e f g h i punpckhwd m1, m0, m0 vpblendd m1, [tlq-26], 0x7f ; 4 5 6 7 8 9 a b c d e f g h i i paddw m0, [tlq-30] ; 2 3 4 5 6 7 8 9 a b c d e f g h paddw m1, m2 paddw m0, [tlq-32] ; 1 2 3 4 5 6 7 8 9 a b c d e f g pavgw m1, [tlq-34] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f lea r4, [rsp+224] paddw m0, m1 lea r5d, [wq+32] psrlw m0, 2 mova [r4], m0 .h64_filter_loop: mova m0, [tlq-62] paddw m1, m2, [tlq-66] paddw m0, [tlq-64] pavgw m1, [tlq-58] paddw m0, [tlq-60] sub tlq, 32 sub r4, 32 paddw m0, m1 psrlw m0, 2 mova [r4], m0 sub r5d, 16 jg .h64_filter_loop mova m0, [tlq-62] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f punpcklwd xm1, xm0, xm0 paddw m2, [tlq-58] ; 2 3 4 5 6 7 8 9 a b c d e f g h paddw m0, [tlq-60] ; 1 2 3 4 5 6 7 8 9 a b c d e f g vpblendd m3, m1, [tlq-66], 0xfe ; 0 0 0 1 2 3 4 5 6 7 8 9 a b c d vpblendd m1, [tlq-64], 0xfe ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e lea tlq, [rsp+254] pavgw m2, m3 paddw m0, m1 paddw m0, m2 psrlw m0, 2 mova [r4-32], m0 .h64_main: neg maxbaseq movd xm4, dyd vpbroadcastw m6, [tlq+maxbaseq*2] shl maxbased, 6 vpbroadcastw m4, xm4 lea r4d, [maxbaseq+dyq+15*64] neg dyq vpbroadcastd m7, [pw_m1024] movd xm3, r4d lea r4, [dyq+63] paddw m8, m7, m7 vpbroadcastw m3, xm3 or maxbased, 63 paddw m9, m8, m7 psubw m3, [z_base_inc] .h64_loop: mov r5, r4 sar r5, 6 movu m1, [tlq+r5*2-128] movu m0, [tlq+r5*2-126] pand m2, m5, m3 psllw m2, 9 psubw m1, m0 pmulhrsw m1, m2 sub rsp, 128 paddw m0, m1 pcmpgtw m1, m9, m3 vpblendvb m0, m6, m0, m1 mova [rsp+32*0], m0 movu m1, [tlq+r5*2-96] movu m0, [tlq+r5*2-94] psubw m1, m0 pmulhrsw m1, m2 paddw m0, m1 pcmpgtw m1, m8, m3 vpblendvb m0, m6, m0, m1 mova [rsp+32*1], m0 movu m1, [tlq+r5*2-64] movu m0, [tlq+r5*2-62] psubw m1, m0 pmulhrsw m1, m2 paddw m0, m1 pcmpgtw m1, m7, m3 vpblendvb m0, m6, m0, m1 mova [rsp+32*2], m0 movu m1, [tlq+r5*2-32] movu m0, [tlq+r5*2-30] psubw m1, m0 pmulhrsw m1, m2 add r4, dyq psraw m2, m3, 15 paddw m3, m4 paddw m0, m1 vpblendvb m0, m6, m0, m2 mova [rsp+32*3], m0 dec wd jz .h64_transpose cmp r4d, maxbased jg .h64_loop .h64_end_loop: sub rsp, 128 mova [rsp+32*0], m6 mova [rsp+32*1], m6 mova [rsp+32*2], m6 mova [rsp+32*3], m6 dec wd jg .h64_end_loop .h64_transpose: lea r2, [strideq*3] lea r3, [strideq*5] mov r5, dstq lea r4, [strideq+r2*2] .h64_transpose_loop0: lea r6, [rsp+112] lea dstq, [r5+org_wq*2-32] .h64_transpose_loop: mova xm0, [r6+128*15] vinserti128 m0, [r6+128* 7], 1 mova xm1, [r6+128*14] vinserti128 m1, [r6+128* 6], 1 mova xm2, [r6+128*13] vinserti128 m2, [r6+128* 5], 1 mova xm3, [r6+128*12] vinserti128 m3, [r6+128* 4], 1 mova xm4, [r6+128*11] vinserti128 m4, [r6+128* 3], 1 mova xm5, [r6+128*10] vinserti128 m5, [r6+128* 2], 1 mova xm6, [r6+128* 9] vinserti128 m6, [r6+128* 1], 1 mova xm7, [r6+128* 8] vinserti128 m7, [r6+128* 0], 1 punpckhwd m8, m0, m1 punpcklwd m0, m1 punpckhwd m1, m2, m3 punpcklwd m2, m3 punpckhwd m3, m4, m5 punpcklwd m4, m5 punpckhwd m5, m6, m7 punpcklwd m6, m7 sub r6, 16 punpckhdq m7, m8, m1 punpckldq m8, m1 punpckhdq m1, m3, m5 punpckldq m3, m5 punpckhqdq m5, m7, m1 punpcklqdq m7, m1 punpckhqdq m1, m8, m3 punpcklqdq m8, m3 punpckhdq m3, m0, m2 mova [dstq+strideq*0], m5 punpckldq m0, m2 mova [dstq+strideq*1], m7 punpckhdq m2, m4, m6 mova [dstq+strideq*2], m1 punpckldq m4, 
m6 mova [dstq+r2 ], m8 punpckhqdq m6, m3, m2 mova [dstq+strideq*4], m6 punpcklqdq m3, m2 mova [dstq+r3 ], m3 punpckhqdq m2, m0, m4 mova [dstq+r2*2 ], m2 punpcklqdq m0, m4 mova [dstq+r4 ], m0 lea dstq, [dstq+strideq*8] cmp r6, rsp jae .h64_transpose_loop add rsp, 128*16 sub org_wd, 16 jg .h64_transpose_loop0 .h64_end: RET %macro FILTER_1BLK 5 ; dst, src, tmp, shuf, bdmax %ifnum %4 pshufb xm%2, xm%4 %else pshufb xm%2, %4 %endif vinserti128 m%2, xm%2, 1 pshufd m%1, m%2, q0000 pmaddwd m%1, m2 pshufd m%3, m%2, q1111 pmaddwd m%3, m3 paddd m%1, m1 paddd m%1, m%3 pshufd m%3, m%2, q2222 pmaddwd m%3, m4 paddd m%1, m%3 pshufd m%3, m%2, q3333 pmaddwd m%3, m5 paddd m%1, m%3 psrad m%1, 4 packusdw m%1, m%1 pminsw m%1, m%5 %endmacro %macro FILTER_2BLK 7 ; dst, src, tmp_dst, tmp_src, tmp, shuf, bdmax pshufb m%2, m%6 vpermq m%4, m%2, q3232 vinserti128 m%2, xm%2, 1 pshufd m%1, m%2, q0000 pshufd m%3, m%4, q0000 pmaddwd m%1, m2 pmaddwd m%3, m2 paddd m%1, m1 paddd m%3, m1 pshufd m%5, m%2, q1111 pmaddwd m%5, m3 paddd m%1, m%5 pshufd m%5, m%4, q1111 pmaddwd m%5, m3 paddd m%3, m%5 pshufd m%5, m%2, q2222 pmaddwd m%5, m4 paddd m%1, m%5 pshufd m%5, m%4, q2222 pmaddwd m%5, m4 paddd m%3, m%5 pshufd m%5, m%2, q3333 pmaddwd m%5, m5 paddd m%1, m%5 pshufd m%5, m%4, q3333 pmaddwd m%5, m5 paddd m%3, m%5 psrad m%1, 4 psrad m%3, 4 packusdw m%1, m%3 pminsw m%1, m%7 %endmacro ; The ipred_filter SIMD processes 4x2 blocks in the following order which ; increases parallelism compared to doing things row by row. One redundant ; block is calculated for w8 and w16, two for w32. ; w4 w8 w16 w32 ; 1 1 2 1 2 3 5 1 2 3 5 b c d f ; 2 2 3 2 4 5 7 2 4 5 7 c e f h ; 3 3 4 4 6 7 9 4 6 7 9 e g h j ; ___ 4 ___ 4 5 ___ 6 8 9 a ___ 6 8 9 a g i j k ___ ; 5 8 8 i cglobal ipred_filter_16bpc, 3, 9, 0, dst, stride, tl, w, h, filter %assign org_stack_offset stack_offset %define base r6-ipred_filter_16bpc_avx2_table lea r6, [filter_intra_taps] tzcnt wd, wm %ifidn filterd, filterm movzx filterd, filterb %else movzx filterd, byte filterm %endif shl filterd, 6 add filterq, r6 lea r6, [ipred_filter_16bpc_avx2_table] vbroadcasti128 m0, [tlq-6] movsxd wq, [r6+wq*4] vpbroadcastd m1, [base+pd_8] pmovsxbw m2, [filterq+16*0] pmovsxbw m3, [filterq+16*1] pmovsxbw m4, [filterq+16*2] pmovsxbw m5, [filterq+16*3] add wq, r6 mov hd, hm jmp wq .w4: WIN64_SPILL_XMM 10 mova xm8, [base+filter_shuf2] vpbroadcastw m9, r8m ; bitdepth_max lea r7, [6+hq*2] sub tlq, r7 jmp .w4_loop_start .w4_loop: pinsrq xm0, [tlq+hq*2], 0 lea dstq, [dstq+strideq*2] .w4_loop_start: FILTER_1BLK 6, 0, 7, 8, 9 vextracti128 xm0, m6, 1 movq [dstq+strideq*0], xm6 movq [dstq+strideq*1], xm0 sub hd, 2 jg .w4_loop RET ALIGN function_align .w8: %assign stack_offset stack_offset - stack_size_padded WIN64_SPILL_XMM 16 vbroadcasti128 m14, [base+filter_shuf3] vpbroadcastw m15, r8m ; bitdepth_max FILTER_1BLK 10, 0, 7, [base+filter_shuf2], 15 vpermq m6, m10, q1302 ; ____ ____ | ____ 4321 pslldq m8, m0, 4 psrldq m7, m6, 2 psrldq m0, m6, 10 punpcklwd m7, m0 vpblendd m8, m6, 0x33 ; _0__ 4321 | ____ 4321 vpblendd m8, m7, 0x40 ; _056 4321 | ____ 4321 vpblendd m8, [tlq-6], 0x30 ; _056 4321 | ____ 4321 lea r7, [16+hq*2] sub tlq, r7 jmp .w8_loop_start .w8_loop: vpermq m8, m9, q1302 ; ____ 4321 | ____ 4321 vpermq m6, m9, q2031 psrldq m0, m6, 2 psrldq m6, 10 punpcklwd m6, m0 vpblendd m8, m7, 0x80 ; _0__ 4321 | ____ 4321 vpblendd m8, m6, 0x40 ; _056 4321 | ____ 4321 mova m10, m9 .w8_loop_start: vpblendd m8, [tlq+hq*2], 0x0C ; _056 4321 | _056 4321 call .main vpblendd m10, m9, 0xCC mova [dstq+strideq*0], xm10 vextracti128 
[dstq+strideq*1], m10, 1 lea dstq, [dstq+strideq*2] sub hd, 2 jg .w8_loop RET ALIGN function_align .w16: %assign stack_offset stack_offset - stack_size_padded ALLOC_STACK 32, 16 vpbroadcastw m15, r8m ; bitdepth_max sub hd, 2 TAIL_CALL .w16_main, 0 .w16_main: mova xm10, [base+filter_shuf2] FILTER_1BLK 13, 0, 6, 10, 15 vpermq m12, m13, q3120 mova xm14, [base+filter_shuf3] vinserti128 m14, [base+filter_shuf1], 1 vpbroadcastq m0, [tlq+10] vpblendd m0, [tlq-16], 0x4C ; ___0 4321 | _056 ____ psrldq m6, m12, 8 vpblendd m0, m6, 0x03 ; ___0 4321 | _056 4321 punpcklwd m6, m12 vpblendd m0, m6, 0x80 ; 56_0 4321 | _056 4321 FILTER_2BLK 12, 0, 6, 7, 8, 14, 15 vpblendd m13, m12, 0xCC vpermq m12, m12, q2031 ; 6___ 5___ psrldq xm6, xm12, 2 psrldq xm8, xm12, 12 vpblendd xm6, xm8, 0x01 pblendw xm6, [tlq+10], 0xF8 ; 4321 056_ FILTER_1BLK 11, 6, 8, 10, 15 vpermq m11, m11, q3120 pshufd m9, m11, q1032 movu m8, [tlq+6] ; __43 210_ | ____ ____ pshufd m8, m8, q3021 ; __0_ 4321 | ____ ____ pshufhw m8, m8, q3201 ; ___0 4321 | ____ ____ vpblendd m9, m8, 0x70 ; ___0 4321 | ____ 4321 mova [dstq+strideq*0], xm13 vextracti128 [dstq+strideq*1], m13, 1 lea r7, [20+hq*2] sub tlq, r7 vpermq m6, m12, q0123 ; ____ 4321 | ____ 4321 jmp .w16_loop_start .w16_loop: vpermq m13, m13, q3322 vpermq m11, m9, q2020 vpermq m9, m9, q1302 vpermq m6, m12, q0123 psrldq m7, 4 vpblendd m13, m10, 0xCC vpblendd m9, m7, 0x40 mova m0, [rsp+8] mova [dstq+strideq*0], xm13 vextracti128 [dstq+strideq*1], m13, 1 .w16_loop_start: mova m13, m12 vpblendd m0, [tlq+hq*2], 0x0C psrldq m7, m12, 8 punpcklwd m7, m12 vpblendd m0, m6, 0x33 ; ___0 4321 | _056 4321 vpblendd m0, m7, 0x80 ; 56_0 4321 | _056 4321 FILTER_2BLK 10, 0, 6, 7, 8, 14, 15 vpermq m12, m10, q2031 mova [rsp+8], m0 psrldq m8, m11, 8 psrldq xm6, xm12, 2 psrldq xm7, xm12, 10 psrldq xm0, xm13, 2 punpcklwd m8, m11 punpcklwd xm7, xm6 vpblendd m8, m9, 0x73 ; 56_0 4321 | ____ 4321 vpblendd m8, m7, 0x04 ; 56_0 4321 | __56 4321 vpblendd m8, m0, 0x08 ; 56_0 4321 | _056 4321 call .main vpermq m8, m11, q3120 vpblendd m6, m8, m9, 0xCC mova [dstq+strideq*0+16], xm6 vextracti128 [dstq+strideq*1+16], m6, 1 lea dstq, [dstq+strideq*2] sub hd, 2 jg .w16_loop vpermq m8, m9, q3120 vextracti128 xm0, m8, 1 ; 4321 ____ pshufd xm11, xm11, q1032 vpblendd xm0, xm11, 0x02 ; 4321 0___ psrldq xm6, xm8, 2 psrldq xm7, xm8, 12 pblendw xm0, xm6, 0x4 ; 4321 05__ pblendw xm0, xm7, 0x2 ; 4321 056_ FILTER_1BLK 6, 0, 7, [base+filter_shuf2], 15 vpermq m12, m13, q1302 vpblendd m12, m10, 0xCC vpblendd m9, m6, 0xCC mova [dstq+strideq*0+ 0], xm12 mova [dstq+strideq*0+16], xm9 vextracti128 [dstq+strideq*1+ 0], m12, 1 vextracti128 [dstq+strideq*1+16], m9, 1 ret ALIGN function_align .w32: %assign stack_offset org_stack_offset ALLOC_STACK 64, 16 vpbroadcastw m15, r8m ; bitdepth_max sub hd, 2 lea r3, [dstq+32] lea r5d, [hd*2+20] call .w16_main mov dstq, r3 lea tlq, [tlq+r5+32] sub r5d, 20 shr r5d, 1 sub r5d, 2 lea r4, [dstq+strideq*2-2] DEFINE_ARGS dst, stride, tl, stride3, left, h lea stride3q, [strideq*3] movu m8, [tlq-6] ; 4321 0___ mova xm10, [base+filter_shuf2] pinsrw xm0, xm8, [dstq+strideq*0-2], 2 pinsrw xm0, xm0, [dstq+strideq*1-2], 1 ; 4321 056_ pinsrw xm9, [leftq+strideq*0], 5 pinsrw xm9, [leftq+strideq*1], 4 FILTER_1BLK 13, 0, 6, 10, 15 vpermq m12, m13, q3120 mova xm14, [base+filter_shuf3] vinserti128 m14, [base+filter_shuf1], 1 psrldq m6, m12, 8 punpcklwd m7, m6, m12 vpblendd m0, m6, 0x03 ; ___0 ____ | _0__ 4321 vpblendd m0, m7, 0x80 ; 56_0 ____ | _0__ 4321 vpblendd m0, m8, 0x30 ; 56_0 4321 | _0__ 4321 vpblendd m0, m9, 0x04 ; 56_0 
4321 | _056 4321 FILTER_2BLK 12, 0, 6, 7, 8, 14, 15 vpblendd m13, m12, 0xCC pinsrw xm9, [leftq+strideq*2], 3 pinsrw xm9, [leftq+stride3q ], 2 lea leftq, [leftq+strideq*4] pinsrw xm9, [leftq+strideq*0], 1 pinsrw xm9, [leftq+strideq*1], 0 movq [rsp+32], xm9 mov r7d, 1 pslldq m8, m9, 4 vpblendd m0, m8, 0x0C ; ___0 ____ | _056 ____ vpermq m12, m12, q2031 ; 6___ 5___ psrldq xm6, xm12, 2 psrldq xm7, xm12, 12 vpblendd xm6, xm7, 0x01 ; ____ _56_ pblendw xm6, [tlq+10], 0xF8 ; 4321 056_ FILTER_1BLK 11, 6, 7, 10, 15 vpermq m11, m11, q3120 pshufd m9, m11, q1032 vbroadcasti128 m8, [tlq+22] ; __43 210_ | ____ ____ pshufd m8, m8, q3021 ; __0_ 4321 | ____ ____ pshufhw m8, m8, q3201 ; ___0 4321 | ____ ____ vpblendd m9, m8, 0x70 ; ___0 4321 | ____ 4321 mova [dstq+strideq*0], xm13 vextracti128 [dstq+strideq*1], m13, 1 vpermq m6, m12, q0123 ; ____ 4321 | ____ 4321 jmp .w32_loop_start .w32_loop_last: mova m0, [rsp+0] jmp .w32_loop .w32_loop_left: mova m0, [rsp+0] vpblendd m0, [rsp+32+r7*4-12], 0x0C dec r7d jg .w32_loop cmp hd, 2 je .w32_loop pinsrw xm6, [rsp+32], 6 pinsrw xm6, [leftq+strideq*2], 5 pinsrw xm6, [leftq+stride3q ], 4 lea leftq, [leftq+strideq*4] pinsrw xm6, [leftq+strideq*0], 3 pinsrw xm6, [leftq+strideq*1], 2 pinsrw xm6, [leftq+strideq*2], 1 pinsrw xm6, [leftq+stride3q ], 0 lea leftq, [leftq+strideq*4] movu [rsp+36], xm6 pinsrw xm6, [leftq+strideq*0], 1 pinsrw xm6, [leftq+strideq*1], 0 movd [rsp+32], xm6 mov r7d, 4 .w32_loop: vpermq m13, m13, q3322 vpermq m11, m9, q2020 vpermq m9, m9, q1302 vpermq m6, m12, q0123 psrldq m7, 4 vpblendd m13, m10, 0xCC vpblendd m9, m7, 0x40 ; ___0 4321 | ____ 4321 mova [dstq+strideq*0], xm13 vextracti128 [dstq+strideq*1], m13, 1 .w32_loop_start: mova m13, m12 psrldq m7, m12, 8 punpcklwd m7, m12 vpblendd m0, m6, 0x33 ; ___0 4321 | _056 4321 vpblendd m0, m7, 0x80 ; 56_0 4321 | _056 4321 FILTER_2BLK 10, 0, 6, 7, 8, 14, 15 vpermq m12, m10, q2031 mova [rsp+0], m0 psrldq m8, m11, 8 psrldq xm6, xm12, 2 psrldq xm7, xm12, 10 psrldq xm0, xm13, 2 punpcklwd m8, m11 punpcklwd xm7, xm6 vpblendd m8, m9, 0x73 ; 56_0 4321 | ____ 4321 vpblendd m8, m7, 0x04 ; 56_0 4321 | __56 4321 vpblendd m8, m0, 0x08 ; 56_0 4321 | _056 4321 call .main vpermq m8, m11, q3120 vpblendd m6, m8, m9, 0xCC mova [dstq+strideq*0+16], xm6 vextracti128 [dstq+strideq*1+16], m6, 1 lea dstq, [dstq+strideq*2] sub hd, 2 jg .w32_loop_left jz .w32_loop_last vpermq m8, m9, q3120 vextracti128 xm0, m8, 1 ; 4321 ____ pshufd xm11, xm11, q1032 vpblendd xm0, xm11, 0x02 ; 4321 0___ psrldq xm6, xm8, 2 psrldq xm7, xm8, 12 pblendw xm0, xm6, 0x4 ; 4321 05__ pblendw xm0, xm7, 0x2 ; 4321 056_ FILTER_1BLK 6, 0, 7, [base+filter_shuf2], 15 vpermq m12, m13, q1302 vpblendd m12, m10, 0xCC vpblendd m9, m6, 0xCC mova [dstq+strideq*0+ 0], xm12 mova [dstq+strideq*0+16], xm9 vextracti128 [dstq+strideq*1+ 0], m12, 1 vextracti128 [dstq+strideq*1+16], m9, 1 RET .main: FILTER_2BLK 9, 8, 6, 7, 0, 14, 15 ret %if WIN64 DECLARE_REG_TMP 5 %else DECLARE_REG_TMP 7 %endif %macro IPRED_CFL 1 ; ac in, unpacked pixels out psignw m3, m%1, m1 pabsw m%1, m%1 pmulhrsw m%1, m2 psignw m%1, m3 paddw m%1, m0 %endmacro cglobal ipred_cfl_top_16bpc, 3, 7, 8, dst, stride, tl, w, h, ac, alpha movifnidn hd, hm add tlq, 2 movd xm4, wd pxor m6, m6 vpbroadcastw m7, r7m pavgw xm4, xm6 tzcnt wd, wd movd xm5, wd movu m0, [tlq] lea t0, [ipred_cfl_left_16bpc_avx2_table] movsxd r6, [t0+wq*4] add r6, t0 add t0, ipred_cfl_splat_16bpc_avx2_table-ipred_cfl_left_16bpc_avx2_table movsxd wq, [t0+wq*4] add wq, t0 movifnidn acq, acmp jmp r6 cglobal ipred_cfl_left_16bpc, 3, 7, 8, dst, 
stride, tl, w, h, ac, alpha mov hd, hm ; zero upper half sub tlq, hq movd xm4, hd sub tlq, hq pxor m6, m6 vpbroadcastw m7, r7m pavgw xm4, xm6 tzcnt r6d, hd movd xm5, r6d movu m0, [tlq] lea t0, [ipred_cfl_left_16bpc_avx2_table] movsxd r6, [t0+r6*4] add r6, t0 add t0, ipred_cfl_splat_16bpc_avx2_table-ipred_cfl_left_16bpc_avx2_table tzcnt wd, wd movsxd wq, [t0+wq*4] add wq, t0 movifnidn acq, acmp jmp r6 .h32: paddw m0, [tlq+32] .h16: vextracti128 xm1, m0, 1 paddw xm0, xm1 .h8: psrldq xm1, xm0, 8 paddw xm0, xm1 .h4: punpcklwd xm0, xm6 psrlq xm1, xm0, 32 paddd xm0, xm1 psrldq xm1, xm0, 8 paddd xm0, xm1 paddd xm0, xm4 psrld xm0, xm5 vpbroadcastw m0, xm0 jmp wq cglobal ipred_cfl_16bpc, 3, 7, 8, dst, stride, tl, w, h, ac, alpha movifnidn hd, hm movifnidn wd, wm tzcnt r6d, hd lea t0d, [wq+hq] movd xm4, t0d tzcnt t0d, t0d movd xm5, t0d lea t0, [ipred_cfl_16bpc_avx2_table] tzcnt wd, wd movsxd r6, [t0+r6*4] movsxd wq, [t0+wq*4+4*4] psrlw xm4, 1 pxor m6, m6 vpbroadcastw m7, r7m add r6, t0 add wq, t0 movifnidn acq, acmp jmp r6 .h4: movq xm0, [tlq-8] jmp wq .w4: movq xm1, [tlq+2] paddw m0, m4 paddw m0, m1 psrlq m1, m0, 32 paddw m0, m1 psrld m1, m0, 16 paddw m0, m1 cmp hd, 4 jg .w4_mul psrlw xm0, 3 jmp .w4_end .w4_mul: vextracti128 xm1, m0, 1 paddw xm0, xm1 lea r2d, [hq*2] mov r6d, 0xAAAB6667 shrx r6d, r6d, r2d punpckhwd xm1, xm0, xm6 punpcklwd xm0, xm6 paddd xm0, xm1 movd xm1, r6d psrld xm0, 2 pmulhuw xm0, xm1 psrlw xm0, 1 .w4_end: vpbroadcastw m0, xm0 .s4: vpbroadcastw m1, alpham lea r6, [strideq*3] pabsw m2, m1 psllw m2, 9 .s4_loop: mova m4, [acq] IPRED_CFL 4 pmaxsw m4, m6 pminsw m4, m7 vextracti128 xm5, m4, 1 movq [dstq+strideq*0], xm4 movq [dstq+strideq*2], xm5 movhps [dstq+strideq*1], xm4 movhps [dstq+r6 ], xm5 lea dstq, [dstq+strideq*4] add acq, 32 sub hd, 4 jg .s4_loop RET ALIGN function_align .h8: mova xm0, [tlq-16] jmp wq .w8: vextracti128 xm1, m0, 1 paddw xm0, [tlq+2] paddw xm0, xm4 paddw xm0, xm1 psrld xm1, xm0, 16 paddw xm0, xm1 pblendw xm0, xm6, 0xAA psrlq xm1, xm0, 32 paddd xm0, xm1 psrldq xm1, xm0, 8 paddd xm0, xm1 psrld xm0, xm5 cmp hd, 8 je .w8_end mov r6d, 0xAAAB mov r2d, 0x6667 cmp hd, 32 cmovz r6d, r2d movd xm1, r6d pmulhuw xm0, xm1 psrlw xm0, 1 .w8_end: vpbroadcastw m0, xm0 .s8: vpbroadcastw m1, alpham lea r6, [strideq*3] pabsw m2, m1 psllw m2, 9 .s8_loop: mova m4, [acq] mova m5, [acq+32] IPRED_CFL 4 IPRED_CFL 5 pmaxsw m4, m6 pmaxsw m5, m6 pminsw m4, m7 pminsw m5, m7 mova [dstq+strideq*0], xm4 mova [dstq+strideq*2], xm5 vextracti128 [dstq+strideq*1], m4, 1 vextracti128 [dstq+r6 ], m5, 1 lea dstq, [dstq+strideq*4] add acq, 64 sub hd, 4 jg .s8_loop RET ALIGN function_align .h16: mova m0, [tlq-32] jmp wq .w16: paddw m0, [tlq+2] vextracti128 xm1, m0, 1 paddw xm0, xm4 paddw xm0, xm1 punpckhwd xm1, xm0, xm6 punpcklwd xm0, xm6 paddd xm0, xm1 psrlq xm1, xm0, 32 paddd xm0, xm1 psrldq xm1, xm0, 8 paddd xm0, xm1 psrld xm0, xm5 cmp hd, 16 je .w16_end mov r6d, 0xAAAB mov r2d, 0x6667 test hb, 8|32 cmovz r6d, r2d movd xm1, r6d pmulhuw xm0, xm1 psrlw xm0, 1 .w16_end: vpbroadcastw m0, xm0 .s16: vpbroadcastw m1, alpham pabsw m2, m1 psllw m2, 9 .s16_loop: mova m4, [acq] mova m5, [acq+32] IPRED_CFL 4 IPRED_CFL 5 pmaxsw m4, m6 pmaxsw m5, m6 pminsw m4, m7 pminsw m5, m7 mova [dstq+strideq*0], m4 mova [dstq+strideq*1], m5 lea dstq, [dstq+strideq*2] add acq, 64 sub hd, 2 jg .s16_loop RET ALIGN function_align .h32: mova m0, [tlq-64] paddw m0, [tlq-32] jmp wq .w32: paddw m0, [tlq+ 2] paddw m0, [tlq+34] vextracti128 xm1, m0, 1 paddw xm0, xm4 paddw xm0, xm1 punpcklwd xm1, xm0, xm6 punpckhwd xm0, xm6 paddd xm0, 
xm1 psrlq xm1, xm0, 32 paddd xm0, xm1 psrldq xm1, xm0, 8 paddd xm0, xm1 psrld xm0, xm5 cmp hd, 32 je .w32_end lea r2d, [hq*2] mov r6d, 0x6667AAAB shrx r6d, r6d, r2d movd xm1, r6d pmulhuw xm0, xm1 psrlw xm0, 1 .w32_end: vpbroadcastw m0, xm0 .s32: vpbroadcastw m1, alpham pabsw m2, m1 psllw m2, 9 .s32_loop: mova m4, [acq] mova m5, [acq+32] IPRED_CFL 4 IPRED_CFL 5 pmaxsw m4, m6 pmaxsw m5, m6 pminsw m4, m7 pminsw m5, m7 mova [dstq+32*0], m4 mova [dstq+32*1], m5 add dstq, strideq add acq, 64 dec hd jg .s32_loop RET cglobal ipred_cfl_128_16bpc, 3, 7, 8, dst, stride, tl, w, h, ac, alpha mov r6d, r7m shr r6d, 11 lea t0, [ipred_cfl_splat_16bpc_avx2_table] tzcnt wd, wd movifnidn hd, hm movsxd wq, [t0+wq*4] vpbroadcastd m0, [t0-ipred_cfl_splat_16bpc_avx2_table+pw_512+r6*4] pxor m6, m6 vpbroadcastw m7, r7m add wq, t0 movifnidn acq, acmp jmp wq cglobal ipred_cfl_ac_420_16bpc, 4, 7, 6, ac, ypx, stride, wpad, hpad, w, h movifnidn hpadd, hpadm vpbroadcastd m5, [pw_2] mov hd, hm shl hpadd, 2 pxor m4, m4 sub hd, hpadd cmp dword wm, 8 jg .w16 je .w8 .w4: lea r3, [strideq*3] mov r5, acq .w4_loop: mova xm0, [ypxq+strideq*2] mova xm1, [ypxq+r3 ] vinserti128 m0, [ypxq+strideq*0], 1 vinserti128 m1, [ypxq+strideq*1], 1 lea ypxq, [ypxq+strideq*4] pmaddwd m0, m5 pmaddwd m1, m5 paddd m0, m1 vextracti128 xm1, m0, 1 paddd m4, m0 packssdw xm1, xm0 mova [acq], xm1 add acq, 16 sub hd, 2 jg .w4_loop test hpadd, hpadd jz .dc vpermq m1, m1, q1111 pslld xm0, 2 .w4_hpad_loop: mova [acq], m1 paddd m4, m0 add acq, 32 sub hpadd, 4 jg .w4_hpad_loop jmp .dc .w8: mov r5, acq test wpadd, wpadd jnz .w8_wpad1 .w8_loop: pmaddwd m0, m5, [ypxq+strideq*0] pmaddwd m1, m5, [ypxq+strideq*1] lea ypxq, [ypxq+strideq*2] paddd m0, m1 vextracti128 xm1, m0, 1 paddd m4, m0 packssdw xm1, xm0, xm1 mova [acq], xm1 add acq, 16 dec hd jg .w8_loop .w8_hpad: test hpadd, hpadd jz .dc vinserti128 m1, xm1, 1 pslld m0, 2 jmp .hpad .w8_wpad1: pmaddwd xm0, xm5, [ypxq+strideq*0] pmaddwd xm3, xm5, [ypxq+strideq*1] lea ypxq, [ypxq+strideq*2] paddd xm0, xm3 pshufd xm3, xm0, q3333 packssdw xm1, xm0, xm3 paddd xm0, xm3 paddd xm4, xm0 mova [acq], xm1 add acq, 16 dec hd jg .w8_wpad1 jmp .w8_hpad .w16_wpad: mova m0, [ypxq+strideq*0+ 0] mova m1, [ypxq+strideq*1+ 0] cmp wpadd, 2 jl .w16_wpad1 je .w16_wpad2 vpbroadcastd m2, [ypxq+strideq*0+12] vpbroadcastd m3, [ypxq+strideq*1+12] vpblendd m0, m2, 0xf0 vpblendd m1, m3, 0xf0 jmp .w16_wpad_end .w16_wpad2: vpbroadcastd m2, [ypxq+strideq*0+28] vpbroadcastd m3, [ypxq+strideq*1+28] jmp .w16_wpad_end .w16_wpad1: vpbroadcastd m2, [ypxq+strideq*0+44] vpbroadcastd m3, [ypxq+strideq*1+44] vinserti128 m2, [ypxq+strideq*0+32], 0 vinserti128 m3, [ypxq+strideq*1+32], 0 .w16_wpad_end: lea ypxq, [ypxq+strideq*2] REPX {pmaddwd x, m5}, m0, m1, m2, m3 paddd m0, m1 paddd m2, m3 packssdw m1, m0, m2 paddd m0, m2 vpermq m1, m1, q3120 paddd m4, m0 mova [acq], m1 add acq, 32 dec hd jg .w16_wpad jmp .w16_hpad .w16: mov r5, acq test wpadd, wpadd jnz .w16_wpad .w16_loop: pmaddwd m0, m5, [ypxq+strideq*0+ 0] pmaddwd m2, m5, [ypxq+strideq*0+32] pmaddwd m1, m5, [ypxq+strideq*1+ 0] pmaddwd m3, m5, [ypxq+strideq*1+32] lea ypxq, [ypxq+strideq*2] paddd m0, m1 paddd m2, m3 packssdw m1, m0, m2 paddd m0, m2 vpermq m1, m1, q3120 paddd m4, m0 mova [acq], m1 add acq, 32 dec hd jg .w16_loop .w16_hpad: add hpadd, hpadd jz .dc paddd m0, m0 .hpad: mova [acq+32*0], m1 paddd m4, m0 mova [acq+32*1], m1 add acq, 32*2 sub hpadd, 4 jg .hpad .dc: vextracti128 xm1, m4, 1 sub r5, acq ; -w*h*2 tzcnt r1d, r5d paddd xm4, xm1 sub r1d, 2 punpckhqdq xm1, xm4, xm4 movd xm0, r1d paddd 
xm1, xm4 pshuflw xm4, xm1, q1032 paddd xm1, xm4 psrld xm1, xm0 pxor xm0, xm0 pavgw xm1, xm0 vpbroadcastw m1, xm1 .dc_loop: mova m0, [acq+r5] psubw m0, m1 mova [acq+r5], m0 add r5, 32 jl .dc_loop RET cglobal ipred_cfl_ac_422_16bpc, 4, 7, 6, ac, ypx, stride, wpad, hpad, w, h movifnidn hpadd, hpadm vpbroadcastd m5, [pw_4] mov hd, hm shl hpadd, 2 pxor m4, m4 sub hd, hpadd cmp dword wm, 8 jg .w16 je .w8 .w4: lea r3, [strideq*3] mov r5, acq .w4_loop: mova xm0, [ypxq+strideq*0] mova xm1, [ypxq+strideq*1] vinserti128 m0, [ypxq+strideq*2], 1 vinserti128 m1, [ypxq+r3 ], 1 lea ypxq, [ypxq+strideq*4] pmaddwd m0, m5 pmaddwd m1, m5 paddd m4, m0 packssdw m0, m1 paddd m4, m1 mova [acq], m0 add acq, 32 sub hd, 4 jg .w4_loop test hpadd, hpadd jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).dc vextracti128 xm1, m1, 1 vpermq m0, m0, q3333 pslld xm1, 2 .w4_hpad_loop: mova [acq], m0 paddd m4, m1 add acq, 32 sub hpadd, 4 jg .w4_hpad_loop jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).dc .w8: mov r5, acq test wpadd, wpadd jnz .w8_wpad1 .w8_loop: pmaddwd m1, m5, [ypxq+strideq*0] pmaddwd m0, m5, [ypxq+strideq*1] lea ypxq, [ypxq+strideq*2] paddd m4, m1 packssdw m1, m0 paddd m4, m0 vpermq m2, m1, q3120 mova [acq], m2 add acq, 32 sub hd, 2 jg .w8_loop .w8_hpad: test hpadd, hpadd jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).dc vpermq m1, m1, q3131 pslld m0, 2 jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).hpad .w8_wpad1: vpbroadcastd m1, [ypxq+strideq*0+12] vpbroadcastd m0, [ypxq+strideq*1+12] vinserti128 m1, [ypxq+strideq*0+ 0], 0 vinserti128 m0, [ypxq+strideq*1+ 0], 0 lea ypxq, [ypxq+strideq*2] pmaddwd m1, m5 pmaddwd m0, m5 paddd m4, m1 packssdw m1, m0 paddd m4, m0 vpermq m2, m1, q3120 mova [acq], m2 add acq, 32 sub hd, 2 jg .w8_wpad1 jmp .w8_hpad .w16: mov r5, acq test wpadd, wpadd jnz .w16_wpad .w16_loop: pmaddwd m2, m5, [ypxq+strideq*0+ 0] pmaddwd m1, m5, [ypxq+strideq*0+32] pmaddwd m0, m5, [ypxq+strideq*1+ 0] pmaddwd m3, m5, [ypxq+strideq*1+32] lea ypxq, [ypxq+strideq*2] paddd m4, m2 packssdw m2, m1 paddd m4, m1 packssdw m1, m0, m3 paddd m0, m3 vpermq m2, m2, q3120 paddd m4, m0 vpermq m1, m1, q3120 mova [acq+32*0], m2 mova [acq+32*1], m1 add acq, 32*2 sub hd, 2 jg .w16_loop jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).w16_hpad .w16_wpad: mova m2, [ypxq+strideq*0+ 0] mova m0, [ypxq+strideq*1+ 0] cmp wpadd, 2 jl .w16_wpad1 je .w16_wpad2 vpbroadcastd m1, [ypxq+strideq*0+12] vpbroadcastd m3, [ypxq+strideq*1+12] vpblendd m2, m1, 0xf0 vpblendd m0, m3, 0xf0 jmp .w16_wpad_end .w16_wpad2: vpbroadcastd m1, [ypxq+strideq*0+28] vpbroadcastd m3, [ypxq+strideq*1+28] jmp .w16_wpad_end .w16_wpad1: vpbroadcastd m1, [ypxq+strideq*0+44] vpbroadcastd m3, [ypxq+strideq*1+44] vinserti128 m1, [ypxq+strideq*0+32], 0 vinserti128 m3, [ypxq+strideq*1+32], 0 .w16_wpad_end: lea ypxq, [ypxq+strideq*2] REPX {pmaddwd x, m5}, m2, m0, m1, m3 paddd m4, m2 packssdw m2, m1 paddd m4, m1 packssdw m1, m0, m3 paddd m0, m3 vpermq m2, m2, q3120 paddd m4, m0 vpermq m1, m1, q3120 mova [acq+32*0], m2 mova [acq+32*1], m1 add acq, 32*2 sub hd, 2 jg .w16_wpad jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).w16_hpad cglobal ipred_cfl_ac_444_16bpc, 4, 7, 6, ac, ypx, stride, wpad, hpad, w, h lea r6, [ipred_cfl_ac_444_16bpc_avx2_table] tzcnt wd, wm movifnidn hpadd, hpadm vpbroadcastd m5, [pw_1] movsxd wq, [r6+wq*4] shl hpadd, 2 add wq, r6 mov hd, hm pxor m4, m4 sub hd, hpadd jmp wq .w4: lea r3, [strideq*3] mov r5, acq .w4_loop: movq xm0, [ypxq+strideq*0] movhps xm0, [ypxq+strideq*1] 
vpbroadcastq m1, [ypxq+strideq*2] vpbroadcastq m2, [ypxq+r3 ] lea ypxq, [ypxq+strideq*4] vpblendd m0, m1, 0x30 vpblendd m0, m2, 0xc0 psllw m0, 3 pmaddwd m1, m0, m5 mova [acq], m0 add acq, 32 paddd m4, m1 sub hd, 4 jg .w4_loop test hpadd, hpadd jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).dc vpermq m0, m0, q3333 paddd m1, m1 mova [acq+32*0], m0 vpermq m1, m1, q3333 mova [acq+32*1], m0 add acq, 32*2 paddd m4, m1 jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).dc .w8: lea r3, [strideq*3] mov r5, acq .w8_loop: mova xm2, [ypxq+strideq*0] vinserti128 m2, [ypxq+strideq*1], 1 mova xm1, [ypxq+strideq*2] vinserti128 m1, [ypxq+r3 ], 1 lea ypxq, [ypxq+strideq*4] psllw m2, 3 psllw m1, 3 mova [acq+32*0], m2 pmaddwd m2, m5 mova [acq+32*1], m1 pmaddwd m0, m1, m5 add acq, 32*2 paddd m4, m2 paddd m4, m0 sub hd, 4 jg .w8_loop test hpadd, hpadd jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).dc vperm2i128 m1, m1, 0x11 pslld m0, 2 pxor m2, m2 vpblendd m0, m2, 0x0f jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).hpad .w16_wpad2: vpbroadcastw m3, [ypxq+strideq*0+14] vpbroadcastw m0, [ypxq+strideq*1+14] vpblendd m2, m3, 0xf0 vpblendd m1, m0, 0xf0 jmp .w16_wpad_end .w16: mov r5, acq .w16_loop: mova m2, [ypxq+strideq*0] mova m1, [ypxq+strideq*1] test wpadd, wpadd jnz .w16_wpad2 .w16_wpad_end: lea ypxq, [ypxq+strideq*2] psllw m2, 3 psllw m1, 3 mova [acq+32*0], m2 pmaddwd m2, m5 mova [acq+32*1], m1 pmaddwd m0, m1, m5 add acq, 32*2 paddd m4, m2 paddd m4, m0 sub hd, 2 jg .w16_loop add hpadd, hpadd jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).dc paddd m0, m0 jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).hpad .w32: mov r5, acq test wpadd, wpadd jnz .w32_wpad .w32_loop: mova m0, [ypxq+ 0] mova m1, [ypxq+32] add ypxq, strideq psllw m0, 3 psllw m1, 3 pmaddwd m2, m0, m5 mova [acq+32*0], m0 pmaddwd m3, m1, m5 mova [acq+32*1], m1 add acq, 32*2 paddd m2, m3 paddd m4, m2 dec hd jg .w32_loop .w32_hpad: test hpadd, hpadd jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).dc paddd m2, m2 .w32_hpad_loop: mova [acq+32*0], m0 mova [acq+32*1], m1 paddd m4, m2 mova [acq+32*2], m0 mova [acq+32*3], m1 add acq, 32*4 sub hpadd, 2 jg .w32_hpad_loop jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).dc .w32_wpad: mova m0, [ypxq+ 0] cmp wpadd, 4 jl .w32_wpad2 je .w32_wpad4 vpbroadcastw m1, [ypxq+14] vpblendd m0, m1, 0xf0 jmp .w32_wpad_end .w32_wpad4: vpbroadcastw m1, [ypxq+30] jmp .w32_wpad_end .w32_wpad2: vpbroadcastw m1, [ypxq+46] vinserti128 m1, [ypxq+32], 0 .w32_wpad_end: add ypxq, strideq psllw m0, 3 psllw m1, 3 pmaddwd m2, m0, m5 mova [acq+32*0], m0 pmaddwd m3, m1, m5 mova [acq+32*1], m1 add acq, 32*2 paddd m2, m3 paddd m4, m2 dec hd jg .w32_wpad jmp .w32_hpad cglobal pal_pred_16bpc, 4, 6, 5, dst, stride, pal, idx, w, h vbroadcasti128 m3, [palq] lea r2, [pal_pred_16bpc_avx2_table] tzcnt wd, wm vbroadcasti128 m4, [pal_pred_shuf] movifnidn hd, hm movsxd wq, [r2+wq*4] pshufb m3, m4 punpckhqdq m4, m3, m3 add wq, r2 DEFINE_ARGS dst, stride, stride3, idx, w, h lea stride3q, [strideq*3] jmp wq .w4: mova xm2, [idxq] add idxq, 16 pshufb xm1, xm3, xm2 pshufb xm2, xm4, xm2 punpcklbw xm0, xm1, xm2 punpckhbw xm1, xm2 movq [dstq+strideq*0], xm0 movq [dstq+strideq*2], xm1 movhps [dstq+strideq*1], xm0 movhps [dstq+stride3q ], xm1 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w4 RET .w8: movu m2, [idxq] ; only 16-byte alignment add idxq, 32 pshufb m1, m3, m2 pshufb m2, m4, m2 punpcklbw m0, m1, m2 punpckhbw m1, m2 mova [dstq+strideq*0], xm0 mova [dstq+strideq*1], xm1 
    vextracti128 [dstq+strideq*2], m0, 1
    vextracti128 [dstq+stride3q ], m1, 1
    lea          dstq, [dstq+strideq*4]
    sub          hd, 4
    jg           .w8
    RET
.w16:
    vpermq       m2, [idxq+ 0], q3120
    vpermq       m5, [idxq+32], q3120
    add          idxq, 64
    pshufb       m1, m3, m2
    pshufb       m2, m4, m2
    punpcklbw    m0, m1, m2
    punpckhbw    m1, m2
    mova         [dstq+strideq*0], m0
    mova         [dstq+strideq*1], m1
    pshufb       m1, m3, m5
    pshufb       m2, m4, m5
    punpcklbw    m0, m1, m2
    punpckhbw    m1, m2
    mova         [dstq+strideq*2], m0
    mova         [dstq+stride3q ], m1
    lea          dstq, [dstq+strideq*4]
    sub          hd, 4
    jg           .w16
    RET
.w32:
    vpermq       m2, [idxq+ 0], q3120
    vpermq       m5, [idxq+32], q3120
    add          idxq, 64
    pshufb       m1, m3, m2
    pshufb       m2, m4, m2
    punpcklbw    m0, m1, m2
    punpckhbw    m1, m2
    mova         [dstq+strideq*0+ 0], m0
    mova         [dstq+strideq*0+32], m1
    pshufb       m1, m3, m5
    pshufb       m2, m4, m5
    punpcklbw    m0, m1, m2
    punpckhbw    m1, m2
    mova         [dstq+strideq*1+ 0], m0
    mova         [dstq+strideq*1+32], m1
    lea          dstq, [dstq+strideq*2]
    sub          hd, 2
    jg           .w32
    RET
.w64:
    vpermq       m2, [idxq+ 0], q3120
    vpermq       m5, [idxq+32], q3120
    add          idxq, 64
    pshufb       m1, m3, m2
    pshufb       m2, m4, m2
    punpcklbw    m0, m1, m2
    punpckhbw    m1, m2
    mova         [dstq+ 0], m0
    mova         [dstq+32], m1
    pshufb       m1, m3, m5
    pshufb       m2, m4, m5
    punpcklbw    m0, m1, m2
    punpckhbw    m1, m2
    mova         [dstq+64], m0
    mova         [dstq+96], m1
    add          dstq, strideq
    dec          hd
    jg           .w64
    RET
%endif

av-scenechange-0.14.1/src/asm/x86/ipred16_avx512.asm

; Copyright © 2022, VideoLAN and dav1d authors
; Copyright © 2022, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
; list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
; this list of conditions and the following disclaimer in the documentation
; and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
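; 16 bpc intra prediction, AVX-512 (avx512icl) versions: this file contains the
; ipred_paeth, ipred_smooth_v, ipred_smooth_h, ipred_smooth, pal_pred and
; recursive ipred_filter kernels defined below.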
%include "config.asm" %include "src/asm/x86/x86inc.asm" %if ARCH_X86_64 SECTION_RODATA 64 ipred_shuf: db 14, 15, 14, 15, 0, 1, 2, 3, 6, 7, 6, 7, 0, 1, 2, 3 db 10, 11, 10, 11, 8, 9, 10, 11, 2, 3, 2, 3, 8, 9, 10, 11 db 12, 13, 12, 13, 4, 5, 6, 7, 4, 5, 4, 5, 4, 5, 6, 7 db 8, 9, 8, 9, 12, 13, 14, 15, 0, 1, 0, 1, 12, 13, 14, 15 smooth_perm: db 1, 2, 5, 6, 9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 30 db 33, 34, 37, 38, 41, 42, 45, 46, 49, 50, 53, 54, 57, 58, 61, 62 db 65, 66, 69, 70, 73, 74, 77, 78, 81, 82, 85, 86, 89, 90, 93, 94 db 97, 98,101,102,105,106,109,110,113,114,117,118,121,122,125,126 pal_pred_perm: db 0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39 db 8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47 db 16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55 db 24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63 filter_permA: times 4 db 6, 7, 8, 9, 14, 15, 4, 5 times 4 db 10, 11, 12, 13, 2, 3, -1, -1 filter_permB: times 4 db 22, 23, 24, 25, 30, 31, 6, 7 times 4 db 26, 27, 28, 29, 14, 15, -1, -1 filter_permC: dd 8 ; dq 8, 10, 1, 11, 0, 9 pw_1: times 2 dw 1 dd 10 filter_rnd: dd 32 dd 1 dd 8 dd 11 filter_shift: times 2 dw 6 dd 0 times 2 dw 4 dd 9 %macro JMP_TABLE 3-* %xdefine %1_%2_table (%%table - 2*4) %xdefine %%base mangle(private_prefix %+ _%1_%2) %%table: %rep %0 - 2 dd %%base %+ .%3 - (%%table - 2*4) %rotate 1 %endrep %endmacro JMP_TABLE ipred_paeth_16bpc, avx512icl, w4, w8, w16, w32, w64 JMP_TABLE ipred_smooth_16bpc, avx512icl, w4, w8, w16, w32, w64 JMP_TABLE ipred_smooth_h_16bpc, avx512icl, w4, w8, w16, w32, w64 JMP_TABLE ipred_smooth_v_16bpc, avx512icl, w4, w8, w16, w32, w64 JMP_TABLE pal_pred_16bpc, avx512icl, w4, w8, w16, w32, w64 cextern smooth_weights_1d_16bpc cextern smooth_weights_2d_16bpc cextern filter_intra_taps SECTION .text %macro PAETH 3 ; top, signed_ldiff, ldiff paddw m0, m%2, m2 psubw m1, m0, m3 ; tldiff psubw m0, m%1 ; tdiff pabsw m1, m1 pabsw m0, m0 pcmpgtw k1, m0, m1 pminsw m0, m1 pcmpgtw k2, m%3, m0 vpblendmw m0{k1}, m%1, m3 vpblendmw m0{k2}, m2, m0 %endmacro INIT_ZMM avx512icl cglobal ipred_paeth_16bpc, 3, 7, 10, dst, stride, tl, w, h %define base r6-ipred_paeth_16bpc_avx512icl_table lea r6, [ipred_paeth_16bpc_avx512icl_table] tzcnt wd, wm movifnidn hd, hm movsxd wq, [r6+wq*4] vpbroadcastw m3, [tlq] ; topleft add wq, r6 jmp wq .w4: vpbroadcastq m4, [tlq+2] ; top movsldup m7, [base+ipred_shuf] lea r6, [strideq*3] psubw m5, m4, m3 pabsw m6, m5 .w4_loop: sub tlq, 16 vbroadcasti32x4 m2, [tlq] pshufb m2, m7 ; left PAETH 4, 5, 6 vextracti32x4 xm1, m0, 2 vextracti32x4 xm8, ym0, 1 vextracti32x4 xm9, m0, 3 movq [dstq+strideq*0], xm0 movq [dstq+strideq*1], xm1 movq [dstq+strideq*2], xm8 movq [dstq+r6 ], xm9 sub hd, 8 jl .w4_end lea dstq, [dstq+strideq*4] movhps [dstq+strideq*0], xm0 movhps [dstq+strideq*1], xm1 movhps [dstq+strideq*2], xm8 movhps [dstq+r6 ], xm9 lea dstq, [dstq+strideq*4] jg .w4_loop .w4_end: RET .w8: vbroadcasti32x4 m4, [tlq+2] movsldup m7, [base+ipred_shuf] lea r6, [strideq*3] psubw m5, m4, m3 pabsw m6, m5 .w8_loop: sub tlq, 8 vpbroadcastq m2, [tlq] pshufb m2, m7 PAETH 4, 5, 6 mova [dstq+strideq*0], xm0 vextracti32x4 [dstq+strideq*1], m0, 2 vextracti32x4 [dstq+strideq*2], ym0, 1 vextracti32x4 [dstq+r6 ], m0, 3 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w8_loop RET .w16: vbroadcasti32x8 m4, [tlq+2] movsldup m7, [base+ipred_shuf] psubw m5, m4, m3 pabsw m6, m5 .w16_loop: sub tlq, 4 vpbroadcastd m2, [tlq] pshufb m2, m7 PAETH 4, 5, 6 mova [dstq+strideq*0], ym0 vextracti32x8 [dstq+strideq*1], m0, 1 lea dstq, 
[dstq+strideq*2] sub hd, 2 jg .w16_loop RET .w32: movu m4, [tlq+2] psubw m5, m4, m3 pabsw m6, m5 .w32_loop: sub tlq, 2 vpbroadcastw m2, [tlq] PAETH 4, 5, 6 mova [dstq], m0 add dstq, strideq dec hd jg .w32_loop RET .w64: movu m4, [tlq+ 2] movu m7, [tlq+66] psubw m5, m4, m3 psubw m8, m7, m3 pabsw m6, m5 pabsw m9, m8 .w64_loop: sub tlq, 2 vpbroadcastw m2, [tlq] PAETH 4, 5, 6 mova [dstq+64*0], m0 PAETH 7, 8, 9 mova [dstq+64*1], m0 add dstq, strideq dec hd jg .w64_loop RET cglobal ipred_smooth_v_16bpc, 3, 7, 7, dst, stride, tl, w, h, weights, stride3 %define base r6-$$ lea r6, [$$] tzcnt wd, wm mov hd, hm movsxd wq, [base+ipred_smooth_v_16bpc_avx512icl_table+wq*4] lea weightsq, [base+smooth_weights_1d_16bpc+hq*4] neg hq vpbroadcastw m6, [tlq+hq*2] ; bottom lea wq, [base+ipred_smooth_v_16bpc_avx512icl_table+wq] lea stride3q, [strideq*3] jmp wq .w4: vpbroadcastq m5, [tlq+2] ; top movsldup m4, [ipred_shuf] psubw m5, m6 ; top - bottom .w4_loop: vbroadcasti32x4 m3, [weightsq+hq*2] pshufb m3, m4 pmulhrsw m3, m5 paddw m3, m6 vextracti32x4 xm0, m3, 3 vextracti32x4 xm1, ym3, 1 vextracti32x4 xm2, m3, 2 movhps [dstq+strideq*0], xm0 movhps [dstq+strideq*1], xm1 movhps [dstq+strideq*2], xm2 movhps [dstq+stride3q ], xm3 add hq, 8 jg .end lea dstq, [dstq+strideq*4] movq [dstq+strideq*0], xm0 movq [dstq+strideq*1], xm1 movq [dstq+strideq*2], xm2 movq [dstq+stride3q ], xm3 lea dstq, [dstq+strideq*4] jl .w4_loop .end: RET .w8: vbroadcasti32x4 m5, [tlq+2] ; top movsldup m4, [ipred_shuf] psubw m5, m6 ; top - bottom .w8_loop: vpbroadcastq m0, [weightsq+hq*2] pshufb m0, m4 pmulhrsw m0, m5 paddw m0, m6 vextracti32x4 [dstq+strideq*0], m0, 3 vextracti32x4 [dstq+strideq*1], ym0, 1 vextracti32x4 [dstq+strideq*2], m0, 2 mova [dstq+stride3q ], xm0 lea dstq, [dstq+strideq*4] add hq, 4 jl .w8_loop RET .w16: vbroadcasti32x8 m5, [tlq+2] ; top movsldup m4, [ipred_shuf] psubw m5, m6 ; top - bottom .w16_loop: vpbroadcastd m0, [weightsq+hq*2+0] vpbroadcastd m1, [weightsq+hq*2+4] pshufb m0, m4 pshufb m1, m4 pmulhrsw m0, m5 pmulhrsw m1, m5 paddw m0, m6 paddw m1, m6 vextracti32x8 [dstq+strideq*0], m0, 1 mova [dstq+strideq*1], ym0 vextracti32x8 [dstq+strideq*2], m1, 1 mova [dstq+stride3q ], ym1 lea dstq, [dstq+strideq*4] add hq, 4 jl .w16_loop RET .w32: movu m5, [tlq+2] psubw m5, m6 .w32_loop: vpbroadcastw m0, [weightsq+hq*2+0] vpbroadcastw m1, [weightsq+hq*2+2] vpbroadcastw m2, [weightsq+hq*2+4] vpbroadcastw m3, [weightsq+hq*2+6] REPX {pmulhrsw x, m5}, m0, m1, m2, m3 REPX {paddw x, m6}, m0, m1, m2, m3 mova [dstq+strideq*0], m0 mova [dstq+strideq*1], m1 mova [dstq+strideq*2], m2 mova [dstq+stride3q ], m3 lea dstq, [dstq+strideq*4] add hq, 4 jl .w32_loop RET .w64: movu m4, [tlq+ 2] movu m5, [tlq+66] psubw m4, m6 psubw m5, m6 .w64_loop: vpbroadcastw m1, [weightsq+hq*2+0] vpbroadcastw m3, [weightsq+hq*2+2] pmulhrsw m0, m4, m1 pmulhrsw m1, m5 pmulhrsw m2, m4, m3 pmulhrsw m3, m5 REPX {paddw x, m6}, m0, m1, m2, m3 mova [dstq+strideq*0+64*0], m0 mova [dstq+strideq*0+64*1], m1 mova [dstq+strideq*1+64*0], m2 mova [dstq+strideq*1+64*1], m3 lea dstq, [dstq+strideq*2] add hq, 2 jl .w64_loop RET cglobal ipred_smooth_h_16bpc, 3, 7, 7, dst, stride, tl, w, h, stride3 lea r6, [$$] mov wd, wm movifnidn hd, hm vpbroadcastw m6, [tlq+wq*2] ; right tzcnt wd, wd add hd, hd movsxd wq, [base+ipred_smooth_h_16bpc_avx512icl_table+wq*4] sub tlq, hq lea stride3q, [strideq*3] lea wq, [base+ipred_smooth_h_16bpc_avx512icl_table+wq] jmp wq .w4: movsldup m4, [base+ipred_shuf] vpbroadcastq m5, [base+smooth_weights_1d_16bpc+4*2] .w4_loop: vbroadcasti32x4 m0, [tlq+hq-16] 
; left pshufb m0, m4 psubw m0, m6 ; left - right pmulhrsw m0, m5 paddw m0, m6 vextracti32x4 xm1, m0, 2 vextracti32x4 xm2, ym0, 1 vextracti32x4 xm3, m0, 3 movq [dstq+strideq*0], xm0 movq [dstq+strideq*1], xm1 movq [dstq+strideq*2], xm2 movq [dstq+stride3q ], xm3 sub hd, 8*2 jl .end lea dstq, [dstq+strideq*4] movhps [dstq+strideq*0], xm0 movhps [dstq+strideq*1], xm1 movhps [dstq+strideq*2], xm2 movhps [dstq+stride3q ], xm3 lea dstq, [dstq+strideq*4] jg .w4_loop .end: RET .w8: movsldup m4, [base+ipred_shuf] vbroadcasti32x4 m5, [base+smooth_weights_1d_16bpc+8*2] .w8_loop: vpbroadcastq m0, [tlq+hq-8] ; left pshufb m0, m4 psubw m0, m6 ; left - right pmulhrsw m0, m5 paddw m0, m6 mova [dstq+strideq*0], xm0 vextracti32x4 [dstq+strideq*1], m0, 2 vextracti32x4 [dstq+strideq*2], ym0, 1 vextracti32x4 [dstq+stride3q ], m0, 3 lea dstq, [dstq+strideq*4] sub hd, 4*2 jg .w8_loop RET .w16: movsldup m4, [base+ipred_shuf] vbroadcasti32x8 m5, [base+smooth_weights_1d_16bpc+16*2] .w16_loop: vpbroadcastd m0, [tlq+hq-4] vpbroadcastd m1, [tlq+hq-8] pshufb m0, m4 pshufb m1, m4 psubw m0, m6 psubw m1, m6 pmulhrsw m0, m5 pmulhrsw m1, m5 paddw m0, m6 paddw m1, m6 mova [dstq+strideq*0], ym0 vextracti32x8 [dstq+strideq*1], m0, 1 mova [dstq+strideq*2], ym1 vextracti32x8 [dstq+stride3q ], m1, 1 lea dstq, [dstq+strideq*4] sub hq, 4*2 jg .w16_loop RET .w32: movu m5, [base+smooth_weights_1d_16bpc+32*2] .w32_loop: vpbroadcastq m3, [tlq+hq-8] punpcklwd m3, m3 psubw m3, m6 pshufd m0, m3, q3333 pshufd m1, m3, q2222 pshufd m2, m3, q1111 pshufd m3, m3, q0000 REPX {pmulhrsw x, m5}, m0, m1, m2, m3 REPX {paddw x, m6}, m0, m1, m2, m3 mova [dstq+strideq*0], m0 mova [dstq+strideq*1], m1 mova [dstq+strideq*2], m2 mova [dstq+stride3q ], m3 lea dstq, [dstq+strideq*4] sub hq, 4*2 jg .w32_loop RET .w64: movu m4, [base+smooth_weights_1d_16bpc+64*2] movu m5, [base+smooth_weights_1d_16bpc+64*3] .w64_loop: vpbroadcastw m1, [tlq+hq-2] vpbroadcastw m3, [tlq+hq-4] psubw m1, m6 psubw m3, m6 pmulhrsw m0, m4, m1 pmulhrsw m1, m5 pmulhrsw m2, m4, m3 pmulhrsw m3, m5 REPX {paddw x, m6}, m0, m1, m2, m3 mova [dstq+strideq*0+64*0], m0 mova [dstq+strideq*0+64*1], m1 mova [dstq+strideq*1+64*0], m2 mova [dstq+strideq*1+64*1], m3 lea dstq, [dstq+strideq*2] sub hq, 2*2 jg .w64_loop RET cglobal ipred_smooth_16bpc, 3, 7, 16, dst, stride, tl, w, h, v_weights, stride3 lea r6, [$$] mov wd, wm movifnidn hd, hm vpbroadcastw m13, [tlq+wq*2] ; right tzcnt wd, wd add hd, hd movsxd wq, [base+ipred_smooth_16bpc_avx512icl_table+wq*4] mov r5d, 0x55555555 sub tlq, hq mova m14, [base+smooth_perm] kmovd k1, r5d vpbroadcastw m0, [tlq] ; bottom mov r5, 0x3333333333333333 pxor m15, m15 lea wq, [base+ipred_smooth_16bpc_avx512icl_table+wq] kmovq k2, r5 lea v_weightsq, [base+smooth_weights_2d_16bpc+hq*2] jmp wq .w4: vpbroadcastq m5, [tlq+hq+2] movshdup m3, [base+ipred_shuf] movsldup m4, [base+ipred_shuf] vbroadcasti32x4 m6, [base+smooth_weights_2d_16bpc+4*4] lea stride3q, [strideq*3] punpcklwd m5, m0 ; top, bottom .w4_loop: vbroadcasti32x4 m0, [v_weightsq] vpbroadcastq m2, [tlq+hq-8] mova m1, m13 pshufb m0, m3 pmaddwd m0, m5 pshufb m1{k2}, m2, m4 ; left, right vpdpwssd m0, m1, m6 vpermb m0, m14, m0 pavgw ym0, ym15 vextracti32x4 xm1, ym0, 1 movq [dstq+strideq*0], xm0 movq [dstq+strideq*1], xm1 movhps [dstq+strideq*2], xm0 movhps [dstq+stride3q ], xm1 lea dstq, [dstq+strideq*4] add v_weightsq, 4*4 sub hd, 4*2 jg .w4_loop RET .w8: vbroadcasti32x4 ym5, [tlq+hq+2] movshdup m6, [base+ipred_shuf] movsldup m7, [base+ipred_shuf] pmovzxwd m5, ym5 vbroadcasti32x8 m8, 
[base+smooth_weights_2d_16bpc+8*4] lea stride3q, [strideq*3] vpblendmw m5{k1}, m0, m5 ; top, bottom .w8_loop: vpbroadcastq m0, [v_weightsq+0] vpbroadcastq m1, [v_weightsq+8] vpbroadcastd m3, [tlq+hq-4] vpbroadcastd m4, [tlq+hq-8] pshufb m0, m6 pmaddwd m0, m5 pshufb m1, m6 pmaddwd m1, m5 mova m2, m13 pshufb m2{k2}, m3, m7 ; left, right mova m3, m13 pshufb m3{k2}, m4, m7 vpdpwssd m0, m2, m8 vpdpwssd m1, m3, m8 add v_weightsq, 4*4 vpermt2b m0, m14, m1 pavgw m0, m15 mova [dstq+strideq*0], xm0 vextracti32x4 [dstq+strideq*1], ym0, 1 vextracti32x4 [dstq+strideq*2], m0, 2 vextracti32x4 [dstq+stride3q ], m0, 3 lea dstq, [dstq+strideq*4] sub hd, 4*2 jg .w8_loop RET .w16: pmovzxwd m5, [tlq+hq+2] mova m6, [base+smooth_weights_2d_16bpc+16*4] vpblendmw m5{k1}, m0, m5 ; top, bottom .w16_loop: vpbroadcastd m0, [v_weightsq+0] vpbroadcastd m1, [v_weightsq+4] pmaddwd m0, m5 pmaddwd m1, m5 mova m2, m13 vpbroadcastw m2{k1}, [tlq+hq-2] ; left, right mova m3, m13 vpbroadcastw m3{k1}, [tlq+hq-4] vpdpwssd m0, m2, m6 vpdpwssd m1, m3, m6 add v_weightsq, 2*4 vpermt2b m0, m14, m1 pavgw m0, m15 mova [dstq+strideq*0], ym0 vextracti32x8 [dstq+strideq*1], m0, 1 lea dstq, [dstq+strideq*2] sub hq, 2*2 jg .w16_loop RET .w32: pmovzxwd m5, [tlq+hq+ 2] pmovzxwd m6, [tlq+hq+34] mova m7, [base+smooth_weights_2d_16bpc+32*4] mova m8, [base+smooth_weights_2d_16bpc+32*6] vpblendmw m5{k1}, m0, m5 ; top, bottom vpblendmw m6{k1}, m0, m6 .w32_loop: vpbroadcastd m2, [v_weightsq+0] vpbroadcastd m3, [v_weightsq+4] pmaddwd m0, m5, m2 pmaddwd m2, m6 pmaddwd m1, m5, m3 pmaddwd m3, m6 mova m4, m13 vpbroadcastw m4{k1}, [tlq+hq-2] ; left, right vpdpwssd m0, m4, m7 vpdpwssd m2, m4, m8 mova m4, m13 vpbroadcastw m4{k1}, [tlq+hq-4] vpdpwssd m1, m4, m7 vpdpwssd m3, m4, m8 add v_weightsq, 2*4 vpermt2b m0, m14, m2 vpermt2b m1, m14, m3 pavgw m0, m15 pavgw m1, m15 mova [dstq+strideq*0], m0 mova [dstq+strideq*1], m1 lea dstq, [dstq+strideq*2] sub hq, 2*2 jg .w32_loop RET .w64: pmovzxwd m5, [tlq+hq+ 2] pmovzxwd m6, [tlq+hq+34] pmovzxwd m7, [tlq+hq+66] pmovzxwd m8, [tlq+hq+98] mova m9, [base+smooth_weights_2d_16bpc+64*4] vpblendmw m5{k1}, m0, m5 ; top, bottom mova m10, [base+smooth_weights_2d_16bpc+64*5] vpblendmw m6{k1}, m0, m6 mova m11, [base+smooth_weights_2d_16bpc+64*6] vpblendmw m7{k1}, m0, m7 mova m12, [base+smooth_weights_2d_16bpc+64*7] vpblendmw m8{k1}, m0, m8 .w64_loop: vpbroadcastd m3, [v_weightsq] mova m4, m13 vpbroadcastw m4{k1}, [tlq+hq-2] ; left, right pmaddwd m0, m5, m3 pmaddwd m2, m6, m3 pmaddwd m1, m7, m3 pmaddwd m3, m8 vpdpwssd m0, m4, m9 vpdpwssd m2, m4, m10 vpdpwssd m1, m4, m11 vpdpwssd m3, m4, m12 add v_weightsq, 1*4 vpermt2b m0, m14, m2 vpermt2b m1, m14, m3 pavgw m0, m15 pavgw m1, m15 mova [dstq+64*0], m0 mova [dstq+64*1], m1 add dstq, strideq sub hd, 1*2 jg .w64_loop RET cglobal pal_pred_16bpc, 4, 7, 4, dst, stride, pal, idx, w, h, stride3 lea r6, [pal_pred_16bpc_avx512icl_table] tzcnt wd, wm mova m2, [pal_pred_perm] movsxd wq, [r6+wq*4] mova xm3, [palq] movifnidn hd, hm add wq, r6 lea stride3q, [strideq*3] jmp wq .w4: pmovzxbw ym0, [idxq] add idxq, 16 vpermw ym0, ym0, ym3 vextracti32x4 xm1, ym0, 1 movq [dstq+strideq*0], xm0 movhps [dstq+strideq*1], xm0 movq [dstq+strideq*2], xm1 movhps [dstq+stride3q ], xm1 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w4 RET .w8: pmovzxbw m0, [idxq] add idxq, 32 vpermw m0, m0, m3 mova [dstq+strideq*0], xm0 vextracti32x4 [dstq+strideq*1], ym0, 1 vextracti32x4 [dstq+strideq*2], m0, 2 vextracti32x4 [dstq+stride3q ], m0, 3 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w8 RET .w16: vpermb m1, m2, [idxq] add idxq, 
64 vpermw m0, m1, m3 psrlw m1, 8 vpermw m1, m1, m3 mova [dstq+strideq*0], ym0 vextracti32x8 [dstq+strideq*1], m0, 1 mova [dstq+strideq*2], ym1 vextracti32x8 [dstq+stride3q ], m1, 1 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w16 RET .w32: vpermb m1, m2, [idxq] add idxq, 64 vpermw m0, m1, m3 psrlw m1, 8 vpermw m1, m1, m3 mova [dstq+strideq*0], m0 mova [dstq+strideq*1], m1 lea dstq, [dstq+strideq*2] sub hd, 2 jg .w32 RET .w64: vpermb m1, m2, [idxq] add idxq, 64 vpermw m0, m1, m3 psrlw m1, 8 vpermw m1, m1, m3 mova [dstq+64*0], m0 mova [dstq+64*1], m1 add dstq, strideq dec hd jg .w64 RET ; The ipred_filter SIMD processes 4x2 blocks in the following order which ; increases parallelism compared to doing things row by row. ; w4 w8 w16 w32 ; 1 1 2 1 2 5 6 1 2 5 6 9 a d e ; 2 2 3 2 3 6 7 2 3 6 7 a b e f ; 3 3 4 3 4 7 8 3 4 7 8 b c f g ; 4 4 5 4 5 8 9 4 5 8 9 c d g h cglobal ipred_filter_16bpc, 4, 7, 14, dst, stride, tl, w, h, filter, top %define base r6-$$ lea r6, [$$] %ifidn filterd, filterm movzx filterd, filterb %else movzx filterd, byte filterm %endif shl filterd, 6 movifnidn hd, hm movu xm0, [tlq-6] pmovsxbw m7, [base+filter_intra_taps+filterq+32*0] pmovsxbw m8, [base+filter_intra_taps+filterq+32*1] mov r5d, r8m ; bitdepth_max movsldup m9, [base+filter_permA] movshdup m10, [base+filter_permA] shr r5d, 11 ; is_12bpc jnz .12bpc psllw m7, 2 ; upshift multipliers so that packusdw psllw m8, 2 ; will perform clipping for free .12bpc: vpbroadcastd m5, [base+filter_rnd+r5*8] vpbroadcastd m6, [base+filter_shift+r5*8] sub wd, 8 jl .w4 .w8: call .main4 movsldup m11, [filter_permB] lea r5d, [hq*2+2] movshdup m12, [filter_permB] lea topq, [tlq+2] mova m13, [filter_permC] sub hd, 4 vinserti32x4 ym0, [topq], 1 ; a0 b0 t0 t1 sub tlq, r5 %if WIN64 push r7 push r8 %endif mov r7, dstq mov r8d, hd .w8_loop: movlps xm4, xm0, [tlq+hq*2] call .main8 lea dstq, [dstq+strideq*2] sub hd, 2 jge .w8_loop test wd, wd jz .end mov r2d, 0x0d kmovb k1, r2d lea r2, [strideq*3] .w16: movd xmm0, [r7+strideq*1+12] vpblendd xmm0, [topq+8], 0x0e ; t1 t2 pinsrw xm4, xmm0, [r7+strideq*0+14], 2 call .main8 add r7, 16 vinserti32x4 ym0, [topq+16], 1 ; a2 b2 t2 t3 mov hd, r8d mov dstq, r7 add topq, 16 .w16_loop: movd xmm1, [dstq+strideq*2-4] punpcklwd xm4, xmm1, xmm0 movd xmm0, [dstq+r2-4] shufps xm4{k1}, xmm0, xm0, q3210 call .main8 lea dstq, [dstq+strideq*2] sub hd, 2 jge .w16_loop sub wd, 8 jg .w16 .end: vpermb m2, m11, m0 mova ym1, ym5 vpdpwssd m1, m2, m7 vpermb m2, m12, m0 vpdpwssd m1, m2, m8 %if WIN64 pop r8 pop r7 %endif vextracti32x8 ym2, m1, 1 paddd ym1, ym2 packusdw ym1, ym1 vpsrlvw ym1, ym6 vpermt2q m0, m13, m1 vextracti32x4 [dstq+strideq*0], m0, 2 vextracti32x4 [dstq+strideq*1], ym0, 1 RET .w4_loop: movlps xm0, [tlq-10] lea dstq, [dstq+strideq*2] sub tlq, 4 .w4: call .main4 movq [dstq+strideq*0], xm0 movhps [dstq+strideq*1], xm0 sub hd, 2 jg .w4_loop RET ALIGN function_align .main4: vpermb m2, m9, m0 mova ym1, ym5 vpdpwssd m1, m2, m7 vpermb m0, m10, m0 vpdpwssd m1, m0, m8 vextracti32x8 ym0, m1, 1 paddd ym0, ym1 vextracti32x4 xm1, ym0, 1 packusdw xm0, xm1 ; clip vpsrlvw xm0, xm6 ret ALIGN function_align .main8: vpermb m3, m11, m0 mova ym2, ym5 vpdpwssd m2, m3, m7 vpermb m3, m9, m4 mova ym1, ym5 vpdpwssd m1, m3, m7 vpermb m3, m12, m0 vpdpwssd m2, m3, m8 vpermb m3, m10, m4 vpdpwssd m1, m3, m8 vextracti32x8 ym4, m2, 1 vextracti32x8 ym3, m1, 1 paddd ym2, ym4 paddd ym1, ym3 packusdw ym1, ym2 ; clip vpsrlvw ym1, ym6 vpermt2q m0, m13, m1 ; c0 d0 b0 b1 a0 a1 vextracti32x4 [dstq+strideq*0], m0, 2 vextracti32x4 [dstq+strideq*1], ym0, 1 ret 
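; Note on the helpers above: .main4 evaluates one 4x2 filter-intra block and
; .main8 two of them per call. vpdpwssd accumulates the sign-extended taps in
; m7/m8 against the permuted edge pixels on top of the rounding constant in m5,
; packusdw clips the sums (the taps are pre-upshifted for 10 bpc so the
; unsigned pack saturates), and vpsrlvw applies the final downshift held in m6.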
%endif av-scenechange-0.14.1/src/asm/x86/ipred16_sse.asm000064400000000000000000004076101046102023000174100ustar 00000000000000; Copyright © 2021, VideoLAN and dav1d authors ; Copyright © 2021, Two Orioles, LLC ; All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions are met: ; ; 1. Redistributions of source code must retain the above copyright notice, this ; list of conditions and the following disclaimer. ; ; 2. Redistributions in binary form must reproduce the above copyright notice, ; this list of conditions and the following disclaimer in the documentation ; and/or other materials provided with the distribution. ; ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. %include "config.asm" %include "src/asm/x86/x86inc.asm" SECTION_RODATA filter_shuf: db 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 2, 3, -1, -1 pal_pred_shuf: db 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 z_base_inc: dw 0*64, 1*64, 2*64, 3*64, 4*64, 5*64, 6*64, 7*64 z_base_inc_z2: dw 7*64, 6*64, 5*64, 4*64, 3*64, 2*64, 1*64, 0*64 z_upsample: db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 z2_upsample_l: db -1, -1, -2, -1, -3, -1, -4, -1, 8, 9, 8, 9, 10, 11, 12, 13 db 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13 z2_top_shufA: db 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9 z2_top_shufB: db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 z2_left_shufA: db 14, 15, 12, 13, 10, 11, 8, 9, 12, 13, 10, 11, 8, 9, 6, 7 z2_left_shufB: db 14, 15, 10, 11, 6, 7, 2, 3, 12, 13, 8, 9, 4, 5, 0, 1 z_filt_wh16: db 19, 19, 19, 23, 23, 23, 31, 31, 31, 47, 47, 47, 79, 79, 79, -1 z_filt_t_w48: db 55,127, 7,127, 15, 31, 39, 31,127, 39,127, 39, 7, 15, 31, 15 db 39, 63, 3, 63, 3, 3, 19, 3, 47, 19, 47, 19, 3, 3, 3, 3 z_filt_t_w16: db 15, 31, 7, 15, 31, 7, 3, 31, 3, 3, 3, 3, 3, 3, 0, 0 z_filt_wh4: db 7, 7, 19, 7, z_filt_wh8: db 19, 19, 11, 19, 11, 15, 15, 15, 23, 23, 23, 23, 39, 39, 39, 39 ALIGN 8 pb_2_3: times 4 db 2, 3 z2_dy_offset: dw 96*64, 96*64, 95*64, 95*64 z_filt_k: times 4 dw 8 times 4 dw 6 times 4 dw 4 times 4 dw 5 pw_m3584: times 4 dw -3584 pw_m3072: times 4 dw -3072 pw_m2560: times 4 dw -2560 pw_m2048: times 4 dw -2048 pw_m1536: times 4 dw -1536 pw_m1024: times 4 dw -1024 pw_m512: times 4 dw -512 pw_1: times 4 dw 1 pw_2: times 4 dw 2 pw_3: times 4 dw 3 pw_62: times 4 dw 62 pw_256: times 4 dw 256 pw_512: times 4 dw 512 pw_2048: times 4 dw 2048 %define pw_4 (z_filt_k+8*2) %define pw_8 (z_filt_k+8*0) %define pw_m1to4 z2_upsample_l %macro JMP_TABLE 3-* %xdefine %1_%2_table (%%table - 2*4) %xdefine %%base mangle(private_prefix %+ _%1_%2) %%table: %rep %0 - 2 dd %%base %+ .%3 - (%%table - 2*4) %rotate 1 %endrep %endmacro %define ipred_dc_splat_16bpc_ssse3_table (ipred_dc_16bpc_ssse3_table + 10*4) %define 
ipred_dc_128_16bpc_ssse3_table (ipred_dc_16bpc_ssse3_table + 15*4) %define ipred_cfl_splat_16bpc_ssse3_table (ipred_cfl_16bpc_ssse3_table + 8*4) JMP_TABLE ipred_dc_left_16bpc, ssse3, h4, h8, h16, h32, h64 JMP_TABLE ipred_dc_16bpc, ssse3, h4, h8, h16, h32, h64, w4, w8, w16, w32, w64, \ s4-10*4, s8-10*4, s16-10*4, s32-10*4, s64-10*4, \ s4-15*4, s8-15*4, s16c-15*4, s32c-15*4, s64-15*4 JMP_TABLE ipred_h_16bpc, ssse3, w4, w8, w16, w32, w64 JMP_TABLE ipred_z1_16bpc, ssse3, w4, w8, w16, w32, w64 JMP_TABLE ipred_z2_16bpc, ssse3, w4, w8, w16, w32, w64 JMP_TABLE ipred_z3_16bpc, ssse3, h4, h8, h16, h32, h64 JMP_TABLE ipred_cfl_16bpc, ssse3, h4, h8, h16, h32, w4, w8, w16, w32, \ s4-8*4, s8-8*4, s16-8*4, s32-8*4 JMP_TABLE ipred_cfl_left_16bpc, ssse3, h4, h8, h16, h32 JMP_TABLE ipred_cfl_ac_444_16bpc, ssse3, w4, w8, w16, w32 JMP_TABLE pal_pred_16bpc, ssse3, w4, w8, w16, w32, w64 cextern smooth_weights_1d_16bpc cextern smooth_weights_2d_16bpc cextern dr_intra_derivative cextern filter_intra_taps SECTION .text INIT_XMM ssse3 cglobal ipred_dc_top_16bpc, 3, 7, 6, dst, stride, tl, w, h LEA r5, ipred_dc_left_16bpc_ssse3_table movd m4, wm tzcnt wd, wm add tlq, 2 movifnidn hd, hm pxor m3, m3 pavgw m4, m3 movd m5, wd movu m0, [tlq] movsxd r6, [r5+wq*4] add r6, r5 add r5, ipred_dc_128_16bpc_ssse3_table-ipred_dc_left_16bpc_ssse3_table movsxd wq, [r5+wq*4] add wq, r5 jmp r6 cglobal ipred_dc_left_16bpc, 3, 7, 6, dst, stride, tl, w, h, stride3 LEA r5, ipred_dc_left_16bpc_ssse3_table mov hd, hm movd m4, hm tzcnt r6d, hd sub tlq, hq tzcnt wd, wm pxor m3, m3 sub tlq, hq pavgw m4, m3 movd m5, r6d movu m0, [tlq] movsxd r6, [r5+r6*4] add r6, r5 add r5, ipred_dc_128_16bpc_ssse3_table-ipred_dc_left_16bpc_ssse3_table movsxd wq, [r5+wq*4] add wq, r5 jmp r6 .h64: movu m2, [tlq+112] movu m1, [tlq+ 96] paddw m0, m2 movu m2, [tlq+ 80] paddw m1, m2 movu m2, [tlq+ 64] paddw m0, m2 paddw m0, m1 .h32: movu m1, [tlq+ 48] movu m2, [tlq+ 32] paddw m1, m2 paddw m0, m1 .h16: movu m1, [tlq+ 16] paddw m0, m1 .h8: movhlps m1, m0 paddw m0, m1 .h4: punpcklwd m0, m3 paddd m4, m0 punpckhqdq m0, m0 paddd m0, m4 pshuflw m4, m0, q1032 paddd m0, m4 psrld m0, m5 lea stride3q, [strideq*3] pshuflw m0, m0, q0000 punpcklqdq m0, m0 jmp wq cglobal ipred_dc_16bpc, 4, 7, 6, dst, stride, tl, w, h, stride3 movifnidn hd, hm tzcnt r6d, hd lea r5d, [wq+hq] movd m4, r5d tzcnt r5d, r5d movd m5, r5d LEA r5, ipred_dc_16bpc_ssse3_table tzcnt wd, wd movsxd r6, [r5+r6*4] movsxd wq, [r5+wq*4+5*4] pxor m3, m3 psrlw m4, 1 add r6, r5 add wq, r5 lea stride3q, [strideq*3] jmp r6 .h4: movq m0, [tlq-8] jmp wq .w4: movq m1, [tlq+2] paddw m1, m0 punpckhwd m0, m3 punpcklwd m1, m3 paddd m0, m1 paddd m4, m0 punpckhqdq m0, m0 paddd m0, m4 pshuflw m1, m0, q1032 paddd m0, m1 cmp hd, 4 jg .w4_mul psrlw m0, 3 jmp .w4_end .w4_mul: mov r2d, 0xAAAB mov r3d, 0x6667 cmp hd, 16 cmove r2d, r3d psrld m0, 2 movd m1, r2d pmulhuw m0, m1 psrlw m0, 1 .w4_end: pshuflw m0, m0, q0000 .s4: movq [dstq+strideq*0], m0 movq [dstq+strideq*1], m0 movq [dstq+strideq*2], m0 movq [dstq+stride3q ], m0 lea dstq, [dstq+strideq*4] sub hd, 4 jg .s4 RET .h8: mova m0, [tlq-16] jmp wq .w8: movu m1, [tlq+2] paddw m0, m1 punpcklwd m1, m0, m3 punpckhwd m0, m3 paddd m0, m1 paddd m4, m0 punpckhqdq m0, m0 paddd m0, m4 pshuflw m1, m0, q1032 paddd m0, m1 psrld m0, m5 cmp hd, 8 je .w8_end mov r2d, 0xAAAB mov r3d, 0x6667 cmp hd, 32 cmove r2d, r3d movd m1, r2d pmulhuw m0, m1 psrlw m0, 1 .w8_end: pshuflw m0, m0, q0000 punpcklqdq m0, m0 .s8: mova [dstq+strideq*0], m0 mova [dstq+strideq*1], m0 mova [dstq+strideq*2], m0 mova 
[dstq+stride3q ], m0 lea dstq, [dstq+strideq*4] sub hd, 4 jg .s8 RET .h16: mova m0, [tlq-32] paddw m0, [tlq-16] jmp wq .w16: movu m1, [tlq+ 2] movu m2, [tlq+18] paddw m1, m2 paddw m0, m1 punpckhwd m1, m0, m3 punpcklwd m0, m3 paddd m0, m1 paddd m4, m0 punpckhqdq m0, m0 paddd m0, m4 pshuflw m1, m0, q1032 paddd m0, m1 psrld m0, m5 cmp hd, 16 je .w16_end mov r2d, 0xAAAB mov r3d, 0x6667 test hd, 8|32 cmovz r2d, r3d movd m1, r2d pmulhuw m0, m1 psrlw m0, 1 .w16_end: pshuflw m0, m0, q0000 punpcklqdq m0, m0 .s16c: mova m1, m0 .s16: mova [dstq+strideq*0+16*0], m0 mova [dstq+strideq*0+16*1], m1 mova [dstq+strideq*1+16*0], m0 mova [dstq+strideq*1+16*1], m1 mova [dstq+strideq*2+16*0], m0 mova [dstq+strideq*2+16*1], m1 mova [dstq+stride3q +16*0], m0 mova [dstq+stride3q +16*1], m1 lea dstq, [dstq+strideq*4] sub hd, 4 jg .s16 RET .h32: mova m0, [tlq-64] paddw m0, [tlq-48] paddw m0, [tlq-32] paddw m0, [tlq-16] jmp wq .w32: movu m1, [tlq+ 2] movu m2, [tlq+18] paddw m1, m2 movu m2, [tlq+34] paddw m0, m2 movu m2, [tlq+50] paddw m1, m2 paddw m0, m1 punpcklwd m1, m0, m3 punpckhwd m0, m3 paddd m0, m1 paddd m4, m0 punpckhqdq m0, m0 paddd m0, m4 pshuflw m1, m0, q1032 paddd m0, m1 psrld m0, m5 cmp hd, 32 je .w32_end mov r2d, 0xAAAB mov r3d, 0x6667 cmp hd, 8 cmove r2d, r3d movd m1, r2d pmulhuw m0, m1 psrlw m0, 1 .w32_end: pshuflw m0, m0, q0000 punpcklqdq m0, m0 .s32c: mova m1, m0 mova m2, m0 mova m3, m0 .s32: mova [dstq+strideq*0+16*0], m0 mova [dstq+strideq*0+16*1], m1 mova [dstq+strideq*0+16*2], m2 mova [dstq+strideq*0+16*3], m3 mova [dstq+strideq*1+16*0], m0 mova [dstq+strideq*1+16*1], m1 mova [dstq+strideq*1+16*2], m2 mova [dstq+strideq*1+16*3], m3 lea dstq, [dstq+strideq*2] sub hd, 2 jg .s32 RET .h64: mova m0, [tlq-128] mova m1, [tlq-112] paddw m0, [tlq- 96] paddw m1, [tlq- 80] paddw m0, [tlq- 64] paddw m1, [tlq- 48] paddw m0, [tlq- 32] paddw m1, [tlq- 16] paddw m0, m1 jmp wq .w64: movu m1, [tlq+ 2] movu m2, [tlq+ 18] paddw m1, m2 movu m2, [tlq+ 34] paddw m0, m2 movu m2, [tlq+ 50] paddw m1, m2 movu m2, [tlq+ 66] paddw m0, m2 movu m2, [tlq+ 82] paddw m1, m2 movu m2, [tlq+ 98] paddw m0, m2 movu m2, [tlq+114] paddw m1, m2 paddw m0, m1 punpcklwd m1, m0, m3 punpckhwd m0, m3 paddd m0, m1 paddd m4, m0 punpckhqdq m0, m0 paddd m0, m4 pshuflw m1, m0, q1032 paddd m0, m1 psrld m0, m5 cmp hd, 64 je .w64_end mov r2d, 0xAAAB mov r3d, 0x6667 cmp hd, 16 cmove r2d, r3d movd m1, r2d pmulhuw m0, m1 psrlw m0, 1 .w64_end: pshuflw m0, m0, q0000 punpcklqdq m0, m0 .s64: mova [dstq+16*0], m0 mova [dstq+16*1], m0 mova [dstq+16*2], m0 mova [dstq+16*3], m0 mova [dstq+16*4], m0 mova [dstq+16*5], m0 mova [dstq+16*6], m0 mova [dstq+16*7], m0 add dstq, strideq dec hd jg .s64 RET cglobal ipred_dc_128_16bpc, 2, 7, 6, dst, stride, tl, w, h, stride3 mov r6d, r8m LEA r5, ipred_dc_128_16bpc_ssse3_table tzcnt wd, wm shr r6d, 11 movifnidn hd, hm movsxd wq, [r5+wq*4] movddup m0, [r5-ipred_dc_128_16bpc_ssse3_table+pw_512+r6*8] add wq, r5 lea stride3q, [strideq*3] jmp wq cglobal ipred_v_16bpc, 4, 7, 6, dst, stride, tl, w, h, stride3 LEA r5, ipred_dc_splat_16bpc_ssse3_table movifnidn hd, hm movu m0, [tlq+ 2] movu m1, [tlq+ 18] movu m2, [tlq+ 34] movu m3, [tlq+ 50] cmp wd, 64 je .w64 tzcnt wd, wd movsxd wq, [r5+wq*4] add wq, r5 lea stride3q, [strideq*3] jmp wq .w64: WIN64_SPILL_XMM 8 movu m4, [tlq+ 66] movu m5, [tlq+ 82] movu m6, [tlq+ 98] movu m7, [tlq+114] .w64_loop: mova [dstq+16*0], m0 mova [dstq+16*1], m1 mova [dstq+16*2], m2 mova [dstq+16*3], m3 mova [dstq+16*4], m4 mova [dstq+16*5], m5 mova [dstq+16*6], m6 mova [dstq+16*7], m7 add dstq, strideq dec 
hd jg .w64_loop RET cglobal ipred_h_16bpc, 3, 6, 4, dst, stride, tl, w, h, stride3 %define base r5-ipred_h_16bpc_ssse3_table tzcnt wd, wm LEA r5, ipred_h_16bpc_ssse3_table movifnidn hd, hm movsxd wq, [r5+wq*4] movddup m2, [base+pw_256] movddup m3, [base+pb_2_3] add wq, r5 lea stride3q, [strideq*3] jmp wq .w4: sub tlq, 8 movq m3, [tlq] pshuflw m0, m3, q3333 pshuflw m1, m3, q2222 pshuflw m2, m3, q1111 pshuflw m3, m3, q0000 movq [dstq+strideq*0], m0 movq [dstq+strideq*1], m1 movq [dstq+strideq*2], m2 movq [dstq+stride3q ], m3 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w4 RET .w8: sub tlq, 8 movq m3, [tlq] punpcklwd m3, m3 pshufd m0, m3, q3333 pshufd m1, m3, q2222 pshufd m2, m3, q1111 pshufd m3, m3, q0000 mova [dstq+strideq*0], m0 mova [dstq+strideq*1], m1 mova [dstq+strideq*2], m2 mova [dstq+stride3q ], m3 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w8 RET .w16: sub tlq, 4 movd m1, [tlq] pshufb m0, m1, m3 pshufb m1, m2 mova [dstq+strideq*0+16*0], m0 mova [dstq+strideq*0+16*1], m0 mova [dstq+strideq*1+16*0], m1 mova [dstq+strideq*1+16*1], m1 lea dstq, [dstq+strideq*2] sub hd, 2 jg .w16 RET .w32: sub tlq, 4 movd m1, [tlq] pshufb m0, m1, m3 pshufb m1, m2 mova [dstq+strideq*0+16*0], m0 mova [dstq+strideq*0+16*1], m0 mova [dstq+strideq*0+16*2], m0 mova [dstq+strideq*0+16*3], m0 mova [dstq+strideq*1+16*0], m1 mova [dstq+strideq*1+16*1], m1 mova [dstq+strideq*1+16*2], m1 mova [dstq+strideq*1+16*3], m1 lea dstq, [dstq+strideq*2] sub hd, 2 jg .w32 RET .w64: sub tlq, 2 movd m0, [tlq] pshufb m0, m2 mova [dstq+16*0], m0 mova [dstq+16*1], m0 mova [dstq+16*2], m0 mova [dstq+16*3], m0 mova [dstq+16*4], m0 mova [dstq+16*5], m0 mova [dstq+16*6], m0 mova [dstq+16*7], m0 add dstq, strideq dec hd jg .w64 RET cglobal ipred_paeth_16bpc, 4, 6, 8, dst, stride, tl, w, h, left %define base r5-ipred_paeth_16bpc_ssse3_table movifnidn hd, hm pshuflw m4, [tlq], q0000 mov leftq, tlq add hd, hd punpcklqdq m4, m4 ; topleft sub leftq, hq and wd, ~7 jnz .w8 movddup m5, [tlq+2] ; top psubw m6, m5, m4 pabsw m7, m6 .w4_loop: movd m1, [leftq+hq-4] punpcklwd m1, m1 punpckldq m1, m1 ; left %macro PAETH 0 paddw m0, m6, m1 psubw m2, m4, m0 ; tldiff psubw m0, m5 ; tdiff pabsw m2, m2 pabsw m0, m0 pminsw m2, m0 pcmpeqw m0, m2 pand m3, m5, m0 pandn m0, m4 por m0, m3 pcmpgtw m3, m7, m2 pand m0, m3 pandn m3, m1 por m0, m3 %endmacro PAETH movhps [dstq+strideq*0], m0 movq [dstq+strideq*1], m0 lea dstq, [dstq+strideq*2] sub hd, 2*2 jg .w4_loop RET .w8: %if ARCH_X86_32 PUSH r6 %define r7d hm %assign regs_used 7 %elif WIN64 movaps r4m, m8 PUSH r7 %assign regs_used 8 %endif %if ARCH_X86_64 movddup m8, [pw_256] %endif lea tlq, [tlq+wq*2+2] neg wq mov r7d, hd .w8_loop0: movu m5, [tlq+wq*2] mov r6, dstq add dstq, 16 psubw m6, m5, m4 pabsw m7, m6 .w8_loop: movd m1, [leftq+hq-2] %if ARCH_X86_64 pshufb m1, m8 %else pshuflw m1, m1, q0000 punpcklqdq m1, m1 %endif PAETH mova [r6], m0 add r6, strideq sub hd, 1*2 jg .w8_loop mov hd, r7d add wq, 8 jl .w8_loop0 %if WIN64 movaps m8, r4m %endif RET %if ARCH_X86_64 DECLARE_REG_TMP 7 %else DECLARE_REG_TMP 4 %endif cglobal ipred_smooth_v_16bpc, 4, 6, 6, dst, stride, tl, w, h, weights LEA weightsq, smooth_weights_1d_16bpc mov hd, hm lea weightsq, [weightsq+hq*4] neg hq movd m5, [tlq+hq*2] ; bottom pshuflw m5, m5, q0000 punpcklqdq m5, m5 cmp wd, 4 jne .w8 movddup m4, [tlq+2] ; top lea r3, [strideq*3] psubw m4, m5 ; top - bottom .w4_loop: movq m1, [weightsq+hq*2] punpcklwd m1, m1 pshufd m0, m1, q1100 punpckhdq m1, m1 pmulhrsw m0, m4 pmulhrsw m1, m4 paddw m0, m5 paddw m1, m5 movq [dstq+strideq*0], m0 movhps 
[dstq+strideq*1], m0 movq [dstq+strideq*2], m1 movhps [dstq+r3 ], m1 lea dstq, [dstq+strideq*4] add hq, 4 jl .w4_loop RET .w8: %if ARCH_X86_32 PUSH r6 %assign regs_used 7 mov hm, hq %define hq hm %elif WIN64 PUSH r7 %assign regs_used 8 %endif .w8_loop0: mov t0, hq movu m4, [tlq+2] add tlq, 16 mov r6, dstq add dstq, 16 psubw m4, m5 .w8_loop: movq m3, [weightsq+t0*2] punpcklwd m3, m3 pshufd m0, m3, q0000 pshufd m1, m3, q1111 pshufd m2, m3, q2222 pshufd m3, m3, q3333 REPX {pmulhrsw x, m4}, m0, m1, m2, m3 REPX {paddw x, m5}, m0, m1, m2, m3 mova [r6+strideq*0], m0 mova [r6+strideq*1], m1 lea r6, [r6+strideq*2] mova [r6+strideq*0], m2 mova [r6+strideq*1], m3 lea r6, [r6+strideq*2] add t0, 4 jl .w8_loop sub wd, 8 jg .w8_loop0 RET cglobal ipred_smooth_h_16bpc, 3, 6, 6, dst, stride, tl, w, h, weights LEA weightsq, smooth_weights_1d_16bpc mov wd, wm movifnidn hd, hm movd m5, [tlq+wq*2] ; right sub tlq, 8 add hd, hd pshuflw m5, m5, q0000 sub tlq, hq punpcklqdq m5, m5 cmp wd, 4 jne .w8 movddup m4, [weightsq+4*2] lea r3, [strideq*3] .w4_loop: movq m1, [tlq+hq] ; left punpcklwd m1, m1 psubw m1, m5 ; left - right pshufd m0, m1, q3322 punpckldq m1, m1 pmulhrsw m0, m4 pmulhrsw m1, m4 paddw m0, m5 paddw m1, m5 movhps [dstq+strideq*0], m0 movq [dstq+strideq*1], m0 movhps [dstq+strideq*2], m1 movq [dstq+r3 ], m1 lea dstq, [dstq+strideq*4] sub hd, 4*2 jg .w4_loop RET .w8: lea weightsq, [weightsq+wq*4] neg wq %if ARCH_X86_32 PUSH r6 %assign regs_used 7 %define hd hm %elif WIN64 PUSH r7 %assign regs_used 8 %endif .w8_loop0: mov t0d, hd mova m4, [weightsq+wq*2] mov r6, dstq add dstq, 16 .w8_loop: movq m3, [tlq+t0*(1+ARCH_X86_32)] punpcklwd m3, m3 psubw m3, m5 pshufd m0, m3, q3333 pshufd m1, m3, q2222 pshufd m2, m3, q1111 pshufd m3, m3, q0000 REPX {pmulhrsw x, m4}, m0, m1, m2, m3 REPX {paddw x, m5}, m0, m1, m2, m3 mova [r6+strideq*0], m0 mova [r6+strideq*1], m1 lea r6, [r6+strideq*2] mova [r6+strideq*0], m2 mova [r6+strideq*1], m3 lea r6, [r6+strideq*2] sub t0d, 4*(1+ARCH_X86_64) jg .w8_loop add wq, 8 jl .w8_loop0 RET %if ARCH_X86_64 DECLARE_REG_TMP 10 %else DECLARE_REG_TMP 3 %endif cglobal ipred_smooth_16bpc, 3, 7, 8, dst, stride, tl, w, h, \ h_weights, v_weights, top LEA h_weightsq, smooth_weights_2d_16bpc mov wd, wm mov hd, hm movd m7, [tlq+wq*2] ; right lea v_weightsq, [h_weightsq+hq*8] neg hq movd m6, [tlq+hq*2] ; bottom pshuflw m7, m7, q0000 pshuflw m6, m6, q0000 cmp wd, 4 jne .w8 movq m4, [tlq+2] ; top mova m5, [h_weightsq+4*4] punpcklwd m4, m6 ; top, bottom pxor m6, m6 .w4_loop: movq m1, [v_weightsq+hq*4] sub tlq, 4 movd m3, [tlq] ; left pshufd m0, m1, q0000 pshufd m1, m1, q1111 pmaddwd m0, m4 punpcklwd m3, m7 ; left, right pmaddwd m1, m4 pshufd m2, m3, q1111 pshufd m3, m3, q0000 pmaddwd m2, m5 pmaddwd m3, m5 paddd m0, m2 paddd m1, m3 psrld m0, 8 psrld m1, 8 packssdw m0, m1 pavgw m0, m6 movq [dstq+strideq*0], m0 movhps [dstq+strideq*1], m0 lea dstq, [dstq+strideq*2] add hq, 2 jl .w4_loop RET .w8: %if ARCH_X86_32 lea h_weightsq, [h_weightsq+wq*4] mov t0, tlq mov r1m, tlq mov r2m, hq %define m8 [h_weightsq+16*0] %define m9 [h_weightsq+16*1] %else %if WIN64 movaps r4m, m8 movaps r6m, m9 PUSH r7 PUSH r8 %endif PUSH r9 PUSH r10 %assign regs_used 11 lea h_weightsq, [h_weightsq+wq*8] lea topq, [tlq+wq*2] neg wq mov r8, tlq mov r9, hq %endif punpcklqdq m6, m6 .w8_loop0: %if ARCH_X86_32 movu m5, [t0+2] add t0, 16 mov r0m, t0 %else movu m5, [topq+wq*2+2] mova m8, [h_weightsq+wq*4+16*0] mova m9, [h_weightsq+wq*4+16*1] %endif mov t0, dstq add dstq, 16 punpcklwd m4, m5, m6 punpckhwd m5, m6 .w8_loop: movd m1, 
[v_weightsq+hq*4] sub tlq, 2 movd m3, [tlq] ; left pshufd m1, m1, q0000 pmaddwd m0, m4, m1 pshuflw m3, m3, q0000 pmaddwd m1, m5 punpcklwd m3, m7 ; left, right pmaddwd m2, m8, m3 pmaddwd m3, m9 paddd m0, m2 paddd m1, m3 psrld m0, 8 psrld m1, 8 packssdw m0, m1 pxor m1, m1 pavgw m0, m1 mova [t0], m0 add t0, strideq inc hq jl .w8_loop %if ARCH_X86_32 mov t0, r0m mov tlq, r1m add h_weightsq, 16*2 mov hq, r2m sub dword wm, 8 jg .w8_loop0 %else mov tlq, r8 mov hq, r9 add wq, 8 jl .w8_loop0 %endif %if WIN64 movaps m8, r4m movaps m9, r6m %endif RET %if ARCH_X86_64 cglobal ipred_z1_16bpc, 3, 8, 8, 16*18, dst, stride, tl, w, h, angle, dx %define base r7-$$ %define bdmaxm r8m lea r7, [$$] %else cglobal ipred_z1_16bpc, 3, 7, 8, -16*18, dst, stride, tl, w, h, angle, dx %define base r1-$$ %define stridemp [rsp+4*0] %define bdmaxm [rsp+4*1] mov r3, r8m mov stridemp, r1 mov bdmaxm, r3 LEA r1, $$ %endif tzcnt wd, wm movifnidn angled, anglem movifnidn hd, hm add tlq, 2 movsxd wq, [base+ipred_z1_16bpc_ssse3_table+wq*4] mov dxd, angled movddup m0, [base+pw_256] and dxd, 0x7e movddup m7, [base+pw_62] add angled, 165 ; ~90 lea wq, [base+wq+ipred_z1_16bpc_ssse3_table] movzx dxd, word [base+dr_intra_derivative+dxq] xor angled, 0x4ff ; d = 90 - angle jmp wq .w4: lea r3d, [angleq+88] test r3d, 0x480 jnz .w4_no_upsample ; !enable_intra_edge_filter || angle >= 40 sar r3d, 9 add r3d, hd cmp r3d, 8 jg .w4_no_upsample ; h > 8 || (w == h && is_sm) movd m3, [tlq+14] movu m2, [tlq+ 0] ; 1 2 3 4 5 6 7 8 movd m1, bdmaxm pshufb m3, m0 palignr m4, m3, m2, 4 ; 3 4 5 6 7 8 8 8 paddw m4, [tlq- 2] ; 0 1 2 3 4 5 6 7 add dxd, dxd mova [rsp+32], m3 palignr m3, m2, 2 ; 2 3 4 5 6 7 8 8 pshufb m1, m0 paddw m3, m2 ; -1 * a + 9 * b + 9 * c + -1 * d psubw m5, m3, m4 ; = (b + c - a - d + (b + c) << 3 + 8) >> 4 movd m4, dxd psraw m5, 3 ; = ((b + c - a - d) >> 3 + b + c + 1) >> 1 paddw m3, m5 pxor m5, m5 pmaxsw m3, m5 mov r3d, dxd pavgw m3, m5 pshufb m4, m0 pminsw m3, m1 punpcklwd m1, m2, m3 punpckhwd m2, m3 mova m3, [base+z_upsample] movifnidn strideq, stridemp mova [rsp+ 0], m1 paddw m5, m4, m4 mova [rsp+16], m2 punpcklqdq m4, m5 ; xpos0 xpos1 .w4_upsample_loop: lea r2d, [r3+dxq] shr r3d, 6 ; base0 movu m1, [rsp+r3*2] lea r3d, [r2+dxq] shr r2d, 6 ; base1 movu m2, [rsp+r2*2] pshufb m1, m3 pshufb m2, m3 punpcklqdq m0, m1, m2 punpckhqdq m1, m2 pand m2, m7, m4 ; frac psllw m2, 9 ; (a * (64 - frac) + b * frac + 32) >> 6 psubw m1, m0 ; = a + (((b - a) * frac + 32) >> 6) pmulhrsw m1, m2 ; = a + (((b - a) * (frac << 9) + 16384) >> 15) paddw m4, m5 ; xpos += dx paddw m0, m1 movq [dstq+strideq*0], m0 movhps [dstq+strideq*1], m0 lea dstq, [dstq+strideq*2] sub hd, 2 jg .w4_upsample_loop RET .w4_no_upsample: mov r3d, 7 ; max_base test angled, 0x400 ; !enable_intra_edge_filter jnz .w4_main lea r3d, [hq+3] movd m1, r3d movd m3, angled shr angled, 8 ; is_sm << 1 pxor m2, m2 pshufb m1, m2 pshufb m3, m2 pcmpeqb m1, [base+z_filt_wh4] pand m1, m3 pcmpgtb m1, [base+z_filt_t_w48+angleq*8] pmovmskb r5d, m1 mov r3d, 7 test r5d, r5d jz .w4_main ; filter_strength == 0 pshuflw m1, [tlq-2], q0000 movu m2, [tlq+16*0] imul r5d, 0x55555555 movd m3, [tlq+r3*2] shr r5d, 30 ; filter_strength movd [rsp+12], m1 pshuflw m3, m3, q0000 mova [rsp+16*1], m2 lea r2d, [r3+2] movq [rsp+r3*2+18], m3 cmp hd, 8 cmovae r3d, r2d lea tlq, [rsp+16*1] call .filter_edge .w4_main: lea tlq, [tlq+r3*2] movd m4, dxd movddup m1, [base+z_base_inc] ; base_inc << 6 movd m6, [tlq] ; top[max_base_x] shl r3d, 6 movd m3, r3d pshufb m4, m0 mov r5d, dxd ; xpos pshufb m6, m0 sub r5, r3 pshufb m3, m0 paddw 
m5, m4, m4 psubw m3, m1 ; max_base_x punpcklqdq m4, m5 ; xpos0 xpos1 movifnidn strideq, stridemp .w4_loop: lea r3, [r5+dxq] sar r5, 6 ; base0 movq m0, [tlq+r5*2+0] movq m1, [tlq+r5*2+2] lea r5, [r3+dxq] sar r3, 6 ; base1 movhps m0, [tlq+r3*2+0] movhps m1, [tlq+r3*2+2] pand m2, m7, m4 psllw m2, 9 psubw m1, m0 pmulhrsw m1, m2 pcmpgtw m2, m3, m4 ; xpos < max_base_x paddw m4, m5 ; xpos += dx paddw m0, m1 pand m0, m2 pandn m2, m6 por m0, m2 movq [dstq+strideq*0], m0 movhps [dstq+strideq*1], m0 sub hd, 2 jz .w4_end lea dstq, [dstq+strideq*2] test r5d, r5d jl .w4_loop .w4_end_loop: movq [dstq+strideq*0], m6 movq [dstq+strideq*1], m6 lea dstq, [dstq+strideq*2] sub hd, 2 jg .w4_end_loop .w4_end: RET .w8: lea r3d, [angleq+88] and r3d, ~0x7f or r3d, hd cmp r3d, 8 ja .w8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || h > 8 movu m1, [tlq+ 0] ; 1 2 3 4 5 6 7 8 movu m5, [tlq+ 2] ; 2 3 4 5 6 7 8 9 movu m3, [tlq+ 4] ; 3 4 5 6 7 8 9 a paddw m5, m1 paddw m3, [tlq- 2] ; 0 1 2 3 4 5 6 7 psubw m2, m5, m3 movu m6, [tlq+18] ; a b c d e f g _ psraw m2, 3 movu m3, [tlq+20] ; b c d e f g _ _ paddw m5, m2 movu m2, [tlq+16] ; 9 a b c d e f g paddw m6, m2 add dxd, dxd cmp hd, 4 jne .w8_upsample_h8 ; awkward single-pixel edge case pshuflw m3, m3, q1110 ; b c c _ _ _ _ _ .w8_upsample_h8: paddw m3, [tlq+14] ; 8 9 a b c d e f psubw m4, m6, m3 movd m3, bdmaxm psraw m4, 3 mov r3d, dxd paddw m6, m4 pxor m4, m4 pmaxsw m5, m4 pmaxsw m6, m4 pshufb m3, m0 pavgw m5, m4 pavgw m6, m4 movd m4, dxd pminsw m5, m3 pminsw m6, m3 mova m3, [base+z_upsample] pshufb m4, m0 movifnidn strideq, stridemp punpcklwd m0, m1, m5 mova [rsp+ 0], m0 punpckhwd m1, m5 mova [rsp+16], m1 punpcklwd m0, m2, m6 mova [rsp+32], m0 punpckhwd m2, m6 mova [rsp+48], m2 mova m5, m4 .w8_upsample_loop: mov r2d, r3d shr r2d, 6 movu m1, [rsp+r2*2+ 0] movu m2, [rsp+r2*2+16] add r3d, dxd pshufb m1, m3 pshufb m2, m3 punpcklqdq m0, m1, m2 punpckhqdq m1, m2 pand m2, m7, m4 psllw m2, 9 psubw m1, m0 pmulhrsw m1, m2 paddw m4, m5 paddw m0, m1 mova [dstq], m0 add dstq, strideq dec hd jg .w8_upsample_loop RET .w8_no_upsample: lea r3d, [hq+7] movd m1, r3d and r3d, 7 or r3d, 8 ; imin(h+7, 15) test angled, 0x400 jnz .w8_main movd m3, angled shr angled, 8 ; is_sm << 1 pxor m2, m2 pshufb m1, m2 pshufb m3, m2 movu m2, [base+z_filt_wh8] psrldq m4, [base+z_filt_t_w48+angleq*8], 4 pcmpeqb m2, m1 pand m2, m3 pcmpgtb m2, m4 pmovmskb r5d, m2 test r5d, r5d jz .w8_main ; filter_strength == 0 pshuflw m1, [tlq-2], q0000 movu m2, [tlq+16*0] imul r5d, 0x55555555 movu m3, [tlq+16*1] movd m4, [tlq+r3*2] shr r5d, 30 ; filter_strength movd [rsp+12], m1 mova [rsp+16*1], m2 pshuflw m4, m4, q0000 mova [rsp+16*2], m3 lea r2d, [r3+2] movq [rsp+r3*2+18], m4 cmp hd, 16 cmovae r3d, r2d lea tlq, [rsp+16*1] call .filter_edge .w8_main: lea tlq, [tlq+r3*2] movd m5, dxd mova m4, [base+z_base_inc] shl r3d, 6 movd m6, [tlq] ; top[max_base_x] movd m1, r3d pshufb m5, m0 mov r5d, dxd ; xpos pshufb m1, m0 sub r5, r3 psubw m4, m1 ; max_base_x pshufb m6, m0 paddw m4, m5 movifnidn strideq, stridemp .w8_loop: mov r3, r5 sar r3, 6 movu m0, [tlq+r3*2+0] movu m1, [tlq+r3*2+2] pand m2, m7, m4 psllw m2, 9 psubw m1, m0 pmulhrsw m1, m2 psraw m2, m4, 15 ; xpos < max_base_x paddw m4, m5 ; xpos += dx paddw m0, m1 pand m0, m2 pandn m2, m6 por m0, m2 mova [dstq], m0 dec hd jz .w8_end add dstq, strideq add r5, dxq jl .w8_loop .w8_end_loop: mova [dstq], m6 add dstq, strideq dec hd jg .w8_end_loop .w8_end: RET .w16: %if ARCH_X86_32 %define strideq r3 %endif lea r3d, [hq+15] movd m1, r3d and r3d, 15 or r3d, 16 ; imin(h+15, 
31) test angled, 0x400 jnz .w16_main movd m3, angled shr angled, 8 ; is_sm << 1 pxor m2, m2 pshufb m1, m2 pshufb m3, m2 movq m4, [base+z_filt_t_w16+angleq*4] pcmpeqb m1, [base+z_filt_wh16] pand m1, m3 pcmpgtb m1, m4 pmovmskb r5d, m1 test r5d, r5d jz .w16_main ; filter_strength == 0 pshuflw m1, [tlq-2], q0000 movu m2, [tlq+16*0] imul r5d, 0x24924924 movu m3, [tlq+16*1] movu m4, [tlq+16*2] shr r5d, 30 movu m5, [tlq+16*3] movd m6, [tlq+r3*2] adc r5d, -1 ; filter_strength movd [rsp+12], m1 mova [rsp+16*1], m2 mova [rsp+16*2], m3 pshuflw m6, m6, q0000 mova [rsp+16*3], m4 mova [rsp+16*4], m5 lea r2d, [r3+2] movq [rsp+r3*2+18], m6 cmp hd, 32 cmovae r3d, r2d lea tlq, [rsp+16*1] call .filter_edge .w16_main: lea tlq, [tlq+r3*2] movd m5, dxd mova m4, [base+z_base_inc] shl r3d, 6 movd m6, [tlq] ; top[max_base_x] movd m1, r3d pshufb m5, m0 mov r5d, dxd ; xpos pshufb m1, m0 sub r5, r3 psubw m4, m1 ; max_base_x pshufb m6, m0 paddw m4, m5 .w16_loop: mov r3, r5 sar r3, 6 movu m0, [tlq+r3*2+ 0] movu m2, [tlq+r3*2+ 2] pand m3, m7, m4 psllw m3, 9 psubw m2, m0 pmulhrsw m2, m3 movu m1, [tlq+r3*2+16] paddw m0, m2 movu m2, [tlq+r3*2+18] psubw m2, m1 pmulhrsw m2, m3 movddup m3, [base+pw_m512] paddw m1, m2 psraw m2, m4, 15 pcmpgtw m3, m4 paddw m4, m5 pand m0, m2 pandn m2, m6 pand m1, m3 pandn m3, m6 por m0, m2 mova [dstq+16*0], m0 por m1, m3 mova [dstq+16*1], m1 dec hd jz .w16_end movifnidn strideq, stridemp add dstq, strideq add r5, dxq jl .w16_loop .w16_end_loop: mova [dstq+16*0], m6 mova [dstq+16*1], m6 add dstq, strideq dec hd jg .w16_end_loop .w16_end: RET .w32: lea r3d, [hq+31] and r3d, 31 or r3d, 32 ; imin(h+31, 63) test angled, 0x400 ; !enable_intra_edge_filter jnz .w32_main call .filter_copy lea r5d, [r3+2] cmp hd, 64 cmove r3d, r5d call .filter_edge_s3 .w32_main: lea tlq, [tlq+r3*2] movd m5, dxd mova m4, [base+z_base_inc] shl r3d, 6 movd m6, [tlq] ; top[max_base_x] movd m1, r3d pshufb m5, m0 mov r5d, dxd ; xpos pshufb m1, m0 sub r5, r3 psubw m4, m1 ; max_base_x pshufb m6, m0 paddw m4, m5 .w32_loop: mov r3, r5 sar r3, 6 movu m0, [tlq+r3*2+ 0] movu m2, [tlq+r3*2+ 2] pand m3, m7, m4 psllw m3, 9 psubw m2, m0 pmulhrsw m2, m3 movu m1, [tlq+r3*2+16] paddw m0, m2 movu m2, [tlq+r3*2+18] psubw m2, m1 pmulhrsw m2, m3 paddw m1, m2 psraw m2, m4, 15 pand m0, m2 pandn m2, m6 por m0, m2 movddup m2, [base+pw_m512] pcmpgtw m2, m4 pand m1, m2 pandn m2, m6 mova [dstq+16*0], m0 por m1, m2 mova [dstq+16*1], m1 movu m0, [tlq+r3*2+32] movu m2, [tlq+r3*2+34] psubw m2, m0 pmulhrsw m2, m3 movu m1, [tlq+r3*2+48] paddw m0, m2 movu m2, [tlq+r3*2+50] psubw m2, m1 pmulhrsw m2, m3 paddw m1, m2 movddup m2, [base+pw_m1024] movddup m3, [base+pw_m1536] pcmpgtw m2, m4 pcmpgtw m3, m4 paddw m4, m5 pand m0, m2 pandn m2, m6 pand m1, m3 pandn m3, m6 por m0, m2 mova [dstq+16*2], m0 por m1, m3 mova [dstq+16*3], m1 dec hd jz .w32_end movifnidn strideq, stridemp add dstq, strideq add r5, dxq jl .w32_loop .w32_end_loop: REPX {mova [dstq+16*x], m6}, 0, 1, 2, 3 add dstq, strideq dec hd jg .w32_end_loop .w32_end: RET .w64: lea r3d, [hq+63] test angled, 0x400 ; !enable_intra_edge_filter jnz .w64_main call .filter_copy call .filter_edge_s3 .w64_main: lea tlq, [tlq+r3*2] movd m5, dxd mova m4, [base+z_base_inc] shl r3d, 6 movd m6, [tlq] ; top[max_base_x] movd m1, r3d pshufb m5, m0 mov r5d, dxd ; xpos pshufb m1, m0 sub r5, r3 psubw m4, m1 ; max_base_x pshufb m6, m0 paddw m4, m5 .w64_loop: mov r3, r5 sar r3, 6 movu m0, [tlq+r3*2+ 0] movu m2, [tlq+r3*2+ 2] pand m3, m7, m4 psllw m3, 9 psubw m2, m0 pmulhrsw m2, m3 movu m1, [tlq+r3*2+16] paddw m0, m2 movu m2, 
[tlq+r3*2+18] psubw m2, m1 pmulhrsw m2, m3 paddw m1, m2 psraw m2, m4, 15 pand m0, m2 pandn m2, m6 por m0, m2 movddup m2, [base+pw_m512] pcmpgtw m2, m4 pand m1, m2 pandn m2, m6 mova [dstq+16*0], m0 por m1, m2 mova [dstq+16*1], m1 movu m0, [tlq+r3*2+32] movu m2, [tlq+r3*2+34] psubw m2, m0 pmulhrsw m2, m3 movu m1, [tlq+r3*2+48] paddw m0, m2 movu m2, [tlq+r3*2+50] psubw m2, m1 pmulhrsw m2, m3 paddw m1, m2 movddup m2, [base+pw_m1024] pcmpgtw m2, m4 pand m0, m2 pandn m2, m6 por m0, m2 movddup m2, [base+pw_m1536] pcmpgtw m2, m4 pand m1, m2 pandn m2, m6 mova [dstq+16*2], m0 por m1, m2 mova [dstq+16*3], m1 movu m0, [tlq+r3*2+64] movu m2, [tlq+r3*2+66] psubw m2, m0 pmulhrsw m2, m3 movu m1, [tlq+r3*2+80] paddw m0, m2 movu m2, [tlq+r3*2+82] psubw m2, m1 pmulhrsw m2, m3 paddw m1, m2 movddup m2, [base+pw_m2048] pcmpgtw m2, m4 pand m0, m2 pandn m2, m6 por m0, m2 movddup m2, [base+pw_m2560] pcmpgtw m2, m4 pand m1, m2 pandn m2, m6 mova [dstq+16*4], m0 por m1, m2 mova [dstq+16*5], m1 movu m0, [tlq+r3*2+96] movu m2, [tlq+r3*2+98] psubw m2, m0 pmulhrsw m2, m3 movu m1, [tlq+r3*2+112] paddw m0, m2 movu m2, [tlq+r3*2+114] psubw m2, m1 pmulhrsw m2, m3 paddw m1, m2 movddup m2, [base+pw_m3072] movddup m3, [base+pw_m3584] pcmpgtw m2, m4 pcmpgtw m3, m4 paddw m4, m5 pand m0, m2 pandn m2, m6 pand m1, m3 pandn m3, m6 por m0, m2 mova [dstq+16*6], m0 por m1, m3 mova [dstq+16*7], m1 dec hd jz .w64_end movifnidn strideq, stridemp add dstq, strideq add r5, dxq jl .w64_loop .w64_end_loop: REPX {mova [dstq+16*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7 add dstq, strideq dec hd jg .w64_end_loop .w64_end: RET ALIGN function_align .filter_copy: pshuflw m2, [tlq-2], q0000 pshuflw m3, [tlq+r3*2], q0000 xor r5d, r5d movd [rsp+gprsize+12], m2 .filter_copy_loop: movu m1, [tlq+r5*2+16*0] movu m2, [tlq+r5*2+16*1] add r5d, 16 mova [rsp+r5*2+gprsize-16*1], m1 mova [rsp+r5*2+gprsize-16*0], m2 cmp r5d, r3d jle .filter_copy_loop lea tlq, [rsp+gprsize+16*1] movq [tlq+r3*2+2], m3 ret .filter_edge: cmp r5d, 3 je .filter_edge_s3 movddup m4, [base+z_filt_k+r5*8-8] movddup m5, [base+z_filt_k+r5*8+8] xor r5d, r5d movddup m6, [base+pw_8] movu m2, [tlq-2] jmp .filter_edge_start .filter_edge_loop: movu m2, [tlq+r5*2-2] mova [tlq+r5*2-16], m1 .filter_edge_start: pmullw m1, m4, [tlq+r5*2] movu m3, [tlq+r5*2+2] paddw m2, m3 pmullw m2, m5 add r5d, 8 paddw m1, m6 paddw m1, m2 psrlw m1, 4 cmp r5d, r3d jl .filter_edge_loop mova [tlq+r5*2-16], m1 ret .filter_edge_s3: movddup m5, [base+pw_3] xor r5d, r5d movu m2, [tlq-2] movu m3, [tlq-4] jmp .filter_edge_s3_start .filter_edge_s3_loop: movu m2, [tlq+r5*2-2] movu m3, [tlq+r5*2-4] mova [tlq+r5*2-16], m1 .filter_edge_s3_start: paddw m2, [tlq+r5*2+0] paddw m3, m5 movu m1, [tlq+r5*2+2] movu m4, [tlq+r5*2+4] add r5d, 8 paddw m1, m2 pavgw m3, m4 paddw m1, m3 psrlw m1, 2 cmp r5d, r3d jl .filter_edge_s3_loop mova [tlq+r5*2-16], m1 ret %if ARCH_X86_64 cglobal ipred_z2_16bpc, 4, 12, 11, 16*24, dst, stride, tl, w, h, angle, dx, _, dy %define base r7-$$ %define maxwm r6m %define maxhm r7m %define bdmaxm r8m lea r7, [$$] mov hd, hm movddup m8, [base+pw_62] lea r9d, [wq-4] shl r9d, 6 mova m9, [base+z2_top_shufA] or r9d, hd mova m10, [base+z2_left_shufA] %else cglobal ipred_z2_16bpc, 4, 7, 8, -16*27, dst, _, tl, w, h, angle, dx %define base r1-$$ %define r9b byte [rsp+16*26+4*0] %define r9d dword [rsp+16*26+4*0] %define r10d dword [rsp+16*26+4*1] %define r11d dword [rsp+16*26+4*2] %define maxwm [rsp+16*2+4*0] %define maxhm [rsp+16*2+4*1] %define bdmaxm [rsp+16*2+4*2] %define stridemp [rsp+16*26+4*3] %define strideq r3 %define dyd r4 
%define dyq r4 mov stridemp, r1 mov r1d, r6m mov r4d, r7m mov r5d, r8m mov maxwm, r1d mov maxhm, r4d mov bdmaxm, r5d LEA r1, $$ lea hd, [wq-4] mova m0, [base+z2_top_shufA] shl hd, 6 mova m1, [base+z2_left_shufA] or hd, hm mova [rsp+16*24], m0 mov r9d, hd mova [rsp+16*25], m1 %endif tzcnt wd, wd movifnidn angled, anglem mova m0, [tlq-16*8] mova m1, [tlq-16*7] mova m2, [tlq-16*6] mova m3, [tlq-16*5] movsxd wq, [base+ipred_z2_16bpc_ssse3_table+wq*4] %if ARCH_X86_64 movzx dxd, angleb %else movzx dxd, byte anglem %endif mova m4, [tlq-16*4] mova m5, [tlq-16*3] mova m6, [tlq-16*2] mova m7, [tlq-16*1] mova [rsp+16* 5], m0 xor angled, 0x400 mova [rsp+16* 6], m1 mov dyd, dxd mova [rsp+16* 7], m2 neg dxq mova [rsp+16* 8], m3 and dyd, ~1 mova [rsp+16* 9], m4 and dxq, ~1 mova [rsp+16*10], m5 lea wq, [base+ipred_z2_16bpc_ssse3_table+wq] mova [rsp+16*11], m6 pxor m3, m3 mova [rsp+16*12], m7 movzx dyd, word [base+dr_intra_derivative+dyq-90] ; angle - 90 movzx dxd, word [base+dr_intra_derivative+dxq+180] ; 180 - angle movddup m0, [base+pw_256] ; 4<<6 movd m4, [tlq] movu m5, [tlq+16*0+2] movu m6, [tlq+16*1+2] movsldup m1, [base+z2_dy_offset] pshufb m4, m0 movq m7, [base+z_base_inc+2] mov r11d, (112-4)<<6 mova [rsp+16*13], m4 neg dxd mova [rsp+16*14], m5 or dyd, 4<<16 mova [rsp+16*15], m6 %if ARCH_X86_64 lea r10d, [dxq+(112<<6)] ; xpos %else mov [rsp+8*3], dyd lea r4d, [dxq+(112<<6)] mov r10d, r4d movzx hd, r9b %endif movq [rsp+8*0], m1 movq [rsp+8*1], m0 movq [rsp+8*2], m7 jmp wq .w4: test angled, 0x400 jnz .w4_main lea r3d, [hq+2] add angled, 1022 pshuflw m1, m5, q3333 shl r3d, 6 movq [rsp+16*14+8], m1 test r3d, angled jnz .w4_no_upsample_above ; angle >= 130 || h > 8 || (is_sm && h == 8) call .upsample_above sub angled, 1075 ; angle - 53 lea r3d, [hq+3] xor angled, 0x7f ; 180 - angle movd m2, r3d movd m7, angled shr angled, 8 ; is_sm << 1 pshufb m2, m3 pshufb m7, m3 pcmpeqb m2, [base+z_filt_wh4] pand m7, m2 pcmpgtb m7, [base+z_filt_t_w48+angleq*8] jmp .w8_filter_left .upsample_above: ; w4/w8 paddw m2, m5, [tlq] movu m1, [rsp+gprsize+16*14+2] movu m4, [rsp+gprsize+16*14-4] %if ARCH_X86_64 movd m6, r9m ; bdmax, offset due to call %else movd m6, [rsp+gprsize+16*2+4*2] %endif paddw m4, m1 psubw m1, m2, m4 pshufb m6, m0 psraw m1, 3 paddw m2, m1 add dxd, dxd pmaxsw m2, m3 paddw m7, m7 pavgw m2, m3 pminsw m2, m6 %if ARCH_X86_64 mova m9, [base+z2_top_shufB] lea r10d, [dxq+(113<<6)] mov r11d, (112-7)<<6 %else mova m1, [base+z2_top_shufB] lea r3d, [dxq+(113<<6)] mov dword [rsp+gprsize+16*26+4*2], (112-7)<<6 mov [rsp+gprsize+16*26+4*1], r3d mova [rsp+gprsize+16*24], m1 %endif punpcklwd m1, m2, m5 punpckhwd m2, m5 movq [rsp+gprsize+8*2], m7 mova [rsp+gprsize+16*14], m1 mova [rsp+gprsize+16*15], m2 ret .w4_no_upsample_above: lea r3d, [hq+3] mov [rsp+16*4], angled sub angled, 1112 ; angle - 90 movd m2, r3d mov r3d, 90 movd m1, angled sub r3d, angled ; 180 - angle shr angled, 8 ; is_sm << 1 mova m4, [base+z_filt_wh4] movd m7, r3d mova m5, [base+z_filt_t_w48+angleq*8] mov r3d, 4 call .w8_filter_top mov angled, [rsp+16*4] lea r3d, [hq+2] sub angled, 139 shl r3d, 6 test r3d, angled jnz .w8_filter_left ; angle <= 140 || h > 8 || (is_sm && h == 8) .upsample_left: ; w4/w8 mova m2, [tlq-16] lea r3d, [hq-4] movu m3, [tlq-14] movu m4, [rsp+16*12+4] pshufb m1, m2, [base+z2_upsample_l+r3*4] movd m6, bdmaxm pxor m5, m5 paddw m3, m2 paddw m4, m1 psubw m1, m3, m4 movshdup m4, [base+z2_dy_offset] psraw m1, 3 pshufb m6, m0 paddw m3, m1 pmaxsw m3, m5 pavgw m3, m5 pminsw m3, m6 %if ARCH_X86_64 mova m10, [base+z2_left_shufB] add dyd, dyd 
%else mova m1, [base+z2_left_shufB] shl dword [rsp+8*3], 1 mova [rsp+16*25], m1 %endif punpckhwd m1, m2, m3 punpcklwd m2, m3 movq [rsp+8*0], m4 mova [rsp+16*12], m1 mova [rsp+16*11], m2 .w4_main: movd m6, dxd %if ARCH_X86_64 movd m3, dyd %else movd m3, [rsp+8*3] %endif pshufb m6, m0 movddup m0, [rsp+8*2] paddw m7, m6, m6 movq m5, [base+pw_m1to4] pshuflw m4, m3, q0000 punpcklqdq m6, m7 pmullw m4, m5 pshuflw m3, m3, q1111 paddw m6, m0 mov r2d, r10d pshuflw m0, m4, q3333 psubw m4, [rsp+8*0] movq [rsp+8*3], m3 movq [rsp+8*5], m0 ; dy*4 mov r5, dstq .w4_loop0: mova [rsp+16*4], m6 movq [rsp+8*4], m4 %if ARCH_X86_64 pand m0, m8, m4 %else movq m0, [base+pw_62] pand m0, m4 %endif psraw m4, 6 psllw m0, 9 ; frac_y << 9 movq [rsp+8*7], m0 pabsw m4, m4 movq [rsp+8*6], m4 movzx hd, r9b .w4_loop: lea r3d, [r2+dxq] shr r2d, 6 ; base_x0 movu m2, [rsp+r2*2] lea r2d, [r3+dxq] shr r3d, 6 ; base_x1 movu m1, [rsp+r3*2] lea r3d, [r2+dxq] shr r2d, 6 ; base_x2 movu m3, [rsp+r2*2] lea r2d, [r3+dxq] shr r3d, 6 ; base_x3 movu m4, [rsp+r3*2] %if ARCH_X86_64 REPX {pshufb x, m9}, m2, m1, m3, m4 %else mova m0, [rsp+16*24] REPX {pshufb x, m0}, m2, m1, m3, m4 %endif punpcklqdq m0, m2, m1 punpckhqdq m2, m1 punpcklqdq m1, m3, m4 punpckhqdq m3, m4 %if ARCH_X86_64 pand m5, m8, m6 %else movddup m5, [base+pw_62] pand m5, m6 %endif psllw m5, 9 psubw m2, m0 pmulhrsw m2, m5 paddw m5, m6, m7 psubw m3, m1 paddw m0, m2 %if ARCH_X86_64 pand m2, m8, m5 %else movddup m2, [base+pw_62] pand m2, m5 %endif psllw m2, 9 pmulhrsw m3, m2 paddw m1, m3 cmp r3d, 111 ; topleft jge .w4_toponly mova [rsp+16*22], m0 mova [rsp+16*23], m1 movzx r3d, byte [rsp+8*6+0] ; base_y0 movu m3, [rsp+r3*2] movzx r3d, byte [rsp+8*6+2] ; base_y1 movu m2, [rsp+r3*2] movzx r3d, byte [rsp+8*6+4] ; base_y2 movu m4, [rsp+r3*2] movzx r3d, byte [rsp+8*6+6] ; base_y3 movu m0, [rsp+r3*2] %if ARCH_X86_64 REPX {pshufb x, m10}, m3, m2, m4, m0 %else mova m1, [rsp+16*25] REPX {pshufb x, m1}, m3, m2, m4, m0 %endif punpcklwd m1, m3, m2 punpckhwd m3, m2 ; 01 punpcklwd m2, m4, m0 punpckhwd m4, m0 ; 23 punpckldq m0, m1, m2 ; y0 d1 punpckhdq m1, m2 ; y2 y3 punpckldq m2, m3, m4 punpckhdq m3, m4 movddup m4, [rsp+8*7] psubw m2, m0 psubw m3, m1 pmulhrsw m2, m4 pmulhrsw m3, m4 psraw m6, 15 ; base_x < topleft psraw m4, m5, 15 paddw m0, m2 paddw m1, m3 pand m0, m6 pandn m6, [rsp+16*22] pand m1, m4 pandn m4, [rsp+16*23] por m0, m6 por m1, m4 .w4_toponly: movifnidn strideq, stridemp movq [dstq+strideq*0], m0 movhps [dstq+strideq*1], m0 lea dstq, [dstq+strideq*2] movq [dstq+strideq*0], m1 movhps [dstq+strideq*1], m1 sub hd, 4 jz .w4_end movq m4, [rsp+8*6] paddsw m6, m5, m7 ; xpos += dx movq m5, [rsp+8*3] psubw m4, m5 lea dstq, [dstq+strideq*2] movq [rsp+8*6], m4 cmp r2d, r11d jge .w4_loop .w4_leftonly_loop: movzx r2d, byte [rsp+8*6+0] ; base_y0 movu m3, [rsp+r2*2] movzx r2d, byte [rsp+8*6+2] ; base_y1 movu m2, [rsp+r2*2] movzx r2d, byte [rsp+8*6+4] ; base_y2 movu m6, [rsp+r2*2] movzx r2d, byte [rsp+8*6+6] ; base_y3 movu m0, [rsp+r2*2] psubw m4, m5 %if ARCH_X86_64 REPX {pshufb x, m10}, m3, m2, m6, m0 %else mova m1, [rsp+16*25] REPX {pshufb x, m1}, m3, m2, m6, m0 %endif movq [rsp+8*6], m4 punpcklwd m1, m3, m2 punpckhwd m3, m2 punpcklwd m2, m6, m0 punpckhwd m6, m0 punpckldq m0, m1, m2 punpckhdq m1, m2 punpckldq m2, m3, m6 punpckhdq m3, m6 movddup m6, [rsp+8*7] psubw m2, m0 psubw m3, m1 pmulhrsw m2, m6 pmulhrsw m3, m6 paddw m0, m2 paddw m1, m3 movq [dstq+strideq*0], m0 movhps [dstq+strideq*1], m0 lea dstq, [dstq+strideq*2] movq [dstq+strideq*0], m1 movhps [dstq+strideq*1], m1 lea dstq, 
[dstq+strideq*2] sub hd, 4 jg .w4_leftonly_loop .w4_end: sub r9d, 1<<8 jl .w4_ret movq m4, [rsp+8*5] add r5, 8 mov dstq, r5 paddw m4, [rsp+8*4] ; base_y += 4*dy movzx r2d, word [rsp+8*1] movddup m6, [rsp+8*1] paddw m6, [rsp+16*4] ; base_x += (4 << upsample_above) add r2d, r10d mov r10d, r2d jmp .w4_loop0 .w4_ret: RET .w8: test angled, 0x400 jnz .w4_main lea r3d, [angleq+126] pshufhw m1, m5, q3333 %if ARCH_X86_64 mov r3b, hb %else xor r3b, r3b or r3d, hd %endif movhps [rsp+16*15], m1 cmp r3d, 8 ja .w8_no_upsample_above ; angle >= 130 || h > 8 || is_sm call .upsample_above sub angled, 53 lea r3d, [hq+7] xor angled, 0x7f ; 180 - angle movu m1, [base+z_filt_wh8] movd m2, r3d movd m7, angled shr angled, 8 ; is_sm << 1 psrldq m4, [base+z_filt_t_w48+angleq*8], 4 pshufb m2, m3 pshufb m7, m3 pcmpeqb m2, m1 movq m1, [base+pw_512] pand m7, m2 pcmpgtb m7, m4 movq [rsp+8*1], m1 ; 8<<6 jmp .w8_filter_left .w8_no_upsample_above: lea r3d, [hq+7] mov [rsp+16*4], angled sub angled, 90 movd m2, r3d mov r3d, 90 movd m1, angled sub r3d, angled ; 180 - angle shr angled, 8 ; is_sm << 1 movu m4, [base+z_filt_wh8] movd m7, r3d psrldq m5, [base+z_filt_t_w48+angleq*8], 4 mov r3d, 8 call .w8_filter_top mov r3d, [rsp+16*4] sub r3d, 141 %if ARCH_X86_64 mov r3b, hb %else xor r3b, r3b or r3d, hd %endif cmp r3d, 8 jbe .upsample_left ; angle > 140 && h <= 8 && !is_sm .w8_filter_left: pmovmskb r5d, m7 test r5d, r5d jz .w4_main imul r5d, 0x55555555 neg hq mov r3, tlq movd m1, [tlq+hq*2] shr r5d, 30 ; filter_strength lea tlq, [rsp+16*13-2] pshuflw m1, m1, q0000 movq [tlq+hq*2-6], m1 call mangle(private_prefix %+ _ipred_z3_16bpc_ssse3).filter_edge jmp .filter_left_end .w8_filter_top: REPX {pshufb x, m3}, m2, m1, m7 pcmpeqb m2, m4 pand m1, m2 pand m7, m2 pcmpgtb m1, m5 pcmpgtb m7, m5 pmovmskb r5d, m1 test r5d, r5d jz .w8_filter_top_end ; filter_strength == 0 imul r5d, 0x55555555 mov [dstq], tlq lea tlq, [rsp+16*14+gprsize] shr r5d, 30 ; filter_strength call mangle(private_prefix %+ _ipred_z1_16bpc_ssse3).filter_edge %if ARCH_X86_64 mov r3d, r7m ; maxw, offset due to call %else mov r3d, [rsp+16*2+4*1] %endif mov tlq, [dstq] cmp r3d, 8 jge .w8_filter_top_end movu m1, [tlq+r3*2+16*0+2] movu m2, [tlq+r3*2+16*1+2] movu [rsp+r3*2+16*14+gprsize], m1 movu [rsp+r3*2+16*15+gprsize], m2 .w8_filter_top_end: ret .w16: test angled, 0x400 jnz .w4_main lea r3d, [hq+15] sub angled, 90 movd m2, r3d mov r3d, 90 movd m1, angled sub r3d, angled ; 180 - angle shr angled, 8 ; is_sm << 1 movd m7, r3d REPX {pshufb x, m3}, m2, m1, m7 movq m4, [base+z_filt_t_w16+angleq*4] pcmpeqb m2, [base+z_filt_wh16] pand m1, m2 pand m7, m2 pcmpgtb m1, m4 pcmpgtb m7, m4 pmovmskb r5d, m1 test r5d, r5d jz .w16_filter_left ; filter_strength == 0 imul r5d, 0x24924924 pshufhw m6, m6, q3333 mov [dstq], tlq lea tlq, [rsp+16*14] shr r5d, 30 movhps [tlq+16*2], m6 adc r5d, -1 ; filter_strength mov r3d, 16 call mangle(private_prefix %+ _ipred_z1_16bpc_ssse3).filter_edge mov r3d, maxwm mov tlq, [dstq] cmp r3d, 16 jge .w16_filter_left movu m1, [tlq+r3*2+16*0+2] movu m2, [tlq+r3*2+16*1+2] movu [rsp+r3*2+16*14], m1 movu [rsp+r3*2+16*15], m2 .w16_filter_left: pmovmskb r5d, m7 test r5d, r5d jz .w4_main imul r5d, 0x24924924 neg hq mov r3, tlq movd m1, [tlq+hq*2] shr r5d, 30 lea tlq, [rsp+16*13-2] pshuflw m1, m1, q0000 adc r5d, -1 ; filter_strength movq [tlq+hq*2-6], m1 call mangle(private_prefix %+ _ipred_z3_16bpc_ssse3).filter_edge jmp .filter_left_end .w32: movu m1, [tlq+16*2+2] movu m2, [tlq+16*3+2] mova [rsp+16*16], m1 mova [rsp+16*17], m2 test angled, 0x400 jnz .w4_main mov [dstq], 
tlq lea tlq, [rsp+16*14] pshufhw m2, m2, q3333 mov r3d, 32 movhps [tlq+16*4], m2 call mangle(private_prefix %+ _ipred_z1_16bpc_ssse3).filter_edge_s3 mov r3d, maxwm mov tlq, [dstq] cmp r3d, 32 jge .filter_left movu m1, [tlq+r3*2+16*0+2] movu m2, [tlq+r3*2+16*1+2] movu [rsp+r3*2+16*14], m1 movu [rsp+r3*2+16*15], m2 cmp r3d, 16 jge .filter_left movu m1, [tlq+r3*2+16*2+2] movu m2, [tlq+r3*2+16*3+2] movu [rsp+r3*2+16*16], m1 movu [rsp+r3*2+16*17], m2 .filter_left: neg hq mov r3, tlq pshuflw m1, [tlq+hq*2], q0000 lea tlq, [rsp+16*13-2] movq [tlq+hq*2-6], m1 call mangle(private_prefix %+ _ipred_z3_16bpc_ssse3).filter_edge_s3 .filter_left_end: mov r2d, maxhm cmp r2d, hd jge .w4_main neg r2 movu m1, [r3+r2*2-16*1] movu m2, [r3+r2*2-16*2] movu [rsp+r2*2+16*12], m1 movu [rsp+r2*2+16*11], m2 cmp r2d, -48 jle .w4_main movu m1, [r3+r2*2-16*3] movu m2, [r3+r2*2-16*4] movu [rsp+r2*2+16*10], m1 movu [rsp+r2*2+16* 9], m2 cmp r2d, -32 jle .w4_main movu m1, [r3+r2*2-16*5] movu m2, [r3+r2*2-16*6] movu [rsp+r2*2+16* 8], m1 movu [rsp+r2*2+16* 7], m2 cmp r2d, -16 jle .w4_main movu m1, [r3+r2*2-16*7] movu m2, [r3+r2*2-16*8] movu [rsp+r2*2+16* 6], m1 movu [rsp+r2*2+16* 5], m2 jmp .w4_main .w64: movu m1, [tlq+16*2+2] movu m2, [tlq+16*3+2] movu m3, [tlq+16*4+2] movu m4, [tlq+16*5+2] movu m5, [tlq+16*6+2] movu m6, [tlq+16*7+2] mov [dstq], tlq lea tlq, [rsp+16*14] mova [tlq+16*2], m1 mova [tlq+16*3], m2 mova [tlq+16*4], m3 mova [tlq+16*5], m4 mova [tlq+16*6], m5 mova [tlq+16*7], m6 test angled, 0x400 jnz .w4_main pshufhw m6, m6, q3333 mov r3d, 64 movhps [tlq+16*8], m6 call mangle(private_prefix %+ _ipred_z1_16bpc_ssse3).filter_edge_s3 mov r3d, maxwm mov tlq, [dstq] cmp r3d, 64 jge .filter_left movu m1, [tlq+r3*2+16*0+2] movu m2, [tlq+r3*2+16*1+2] movu [rsp+r3*2+16*14], m1 movu [rsp+r3*2+16*15], m2 cmp r3d, 48 jge .filter_left movu m1, [tlq+r3*2+16*2+2] movu m2, [tlq+r3*2+16*3+2] movu [rsp+r3*2+16*16], m1 movu [rsp+r3*2+16*17], m2 cmp r3d, 32 jge .filter_left movu m1, [tlq+r3*2+16*4+2] movu m2, [tlq+r3*2+16*5+2] movu [rsp+r3*2+16*18], m1 movu [rsp+r3*2+16*19], m2 cmp r3d, 16 jge .filter_left movu m1, [tlq+r3*2+16*6+2] movu m2, [tlq+r3*2+16*7+2] movu [rsp+r3*2+16*20], m1 movu [rsp+r3*2+16*21], m2 jmp .filter_left %if ARCH_X86_64 cglobal ipred_z3_16bpc, 4, 9, 8, 16*18, dst, stride, tl, w, h, angle, dy, _, org_w %define base r7-$$ lea r7, [$$] mov org_wd, wd %else cglobal ipred_z3_16bpc, 4, 7, 8, -16*18, dst, stride, tl, w, h, angle, dy %define base r1-$$ %define org_wd r5 %define org_wq r5 movd m6, r8m ; pixel_max mov [dstq+4*0], strideq LEA r1, $$ mov [dstq+4*1], wd %endif tzcnt hd, hm movifnidn angled, anglem sub tlq, 2 movsxd hq, [base+ipred_z3_16bpc_ssse3_table+hq*4] sub angled, 180 movddup m0, [base+pw_256] mov dyd, angled neg dyd xor angled, 0x400 movddup m7, [base+pw_62] or dyq, ~0x7e lea hq, [base+ipred_z3_16bpc_ssse3_table+hq] movzx dyd, word [base+dr_intra_derivative+45*2-1+dyq] jmp hq .h4: lea r4d, [angleq+88] test r4d, 0x480 jnz .h4_no_upsample ; !enable_intra_edge_filter || angle >= 40 sar r4d, 9 add r4d, wd cmp r4d, 8 jg .h4_no_upsample ; w > 8 || (w == 8 && is_sm) mova m2, [tlq-14] ; 7 6 5 4 3 2 1 0 movu m3, [tlq-12] ; 8 7 6 5 4 3 2 1 %if ARCH_X86_64 movd m6, r8m %endif pshufb m4, m2, m0 mov tlq, rsp palignr m1, m2, m4, 14 ; 8 8 7 6 5 4 3 2 add dyd, dyd palignr m5, m2, m4, 12 ; 8 8 8 7 6 5 4 3 paddw m1, m2 paddw m3, m5 psubw m5, m1, m3 mova m3, [base+z_upsample] mova [tlq+ 0], m4 movd m4, dyd psraw m5, 3 neg dyd paddw m1, m5 pxor m5, m5 lea r5d, [dyq+(16<<6)+63] ; ypos pmaxsw m1, m5 pshufb m6, m0 shl wd, 3 
pavgw m1, m5 pshufb m4, m0 pminsw m1, m6 sub rsp, wq punpckhwd m0, m1, m2 paddw m5, m4, m4 punpcklwd m1, m2 mova [tlq+32], m0 movsd m4, m5 mova [tlq+16], m1 .h4_upsample_loop: lea r4d, [r5+dyq] sar r5d, 6 movu m2, [tlq+r5*2] lea r5d, [r4+dyq] sar r4d, 6 movu m1, [tlq+r4*2] pshufb m2, m3 pshufb m1, m3 punpckhqdq m0, m1, m2 punpcklqdq m1, m2 pand m2, m7, m4 psllw m2, 9 psubw m1, m0 pmulhrsw m1, m2 paddw m4, m5 paddw m0, m1 mova [rsp+wq-16], m0 sub wd, 16 jg .h4_upsample_loop or r3d, 4*2 jmp .end_transpose .h4_no_upsample: mov r4d, 7 test angled, 0x400 ; !enable_intra_edge_filter jnz .h4_main lea r4d, [wq+3] movd m1, r4d movd m3, angled shr angled, 8 ; is_sm << 1 pxor m2, m2 pshufb m1, m2 pshufb m3, m2 pcmpeqb m1, [base+z_filt_wh4] pand m1, m3 pcmpgtb m1, [base+z_filt_t_w48+angleq*8] pmovmskb r5d, m1 mov r4d, 7 test r5d, r5d jz .h4_main ; filter_strength == 0 pshuflw m1, [tlq+2], q0000 imul r5d, 0x55555555 mova m2, [tlq-14] neg r4 movd m3, [tlq+r4*2] shr r5d, 30 movd [rsp+16*17], m1 pshuflw m3, m3, q0000 mova [rsp+16*16], m2 lea r2, [r4-2] movq [rsp+16*17+r4*2-10], m3 cmp wd, 8 cmovae r4, r2 lea tlq, [rsp+16*17-2] call .filter_edge .h4_main: movd m4, dyd sub tlq, r4 movddup m1, [base+z_base_inc_z2+8] ; base_inc << 6 sub tlq, r4 shl r4d, 6 movd m6, [tlq] movd m3, r4d pshufb m4, m0 neg dyq pshufb m6, m0 lea r5, [dyq+r4+63] ; ypos pshufb m3, m0 shl wd, 3 paddw m5, m4, m4 sub rsp, wq psubw m3, m1 ; max_base_y movsd m4, m5 ; ypos1 ypos0 .h4_loop: lea r4, [r5+dyq] sar r5, 6 movddup m0, [tlq+r5*2-6] movddup m1, [tlq+r5*2-8] lea r5, [r4+dyq] sar r4, 6 movlps m0, [tlq+r4*2-6] movlps m1, [tlq+r4*2-8] pand m2, m7, m4 psllw m2, 9 psubw m1, m0 pmulhrsw m1, m2 pcmpgtw m2, m3, m4 paddw m4, m5 paddw m0, m1 pand m0, m2 pandn m2, m6 por m0, m2 mova [rsp+wq-16], m0 sub wd, 16 jz .h4_transpose test r5d, r5d jg .h4_loop .h4_end_loop: mova [rsp+wq-16], m6 sub wd, 16 jg .h4_end_loop .h4_transpose: or r3d, 4*2 jmp .end_transpose .h8: lea r4d, [angleq+88] and r4d, ~0x7f or r4d, wd cmp r4d, 8 ja .h8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || w > 8 mova m2, [tlq-30] ; g f e d c b a 9 movu m1, [tlq-32] ; _ g f e d c b a movu m3, [tlq-16] ; 9 8 7 6 5 4 3 2 paddw m3, [tlq-14] ; 8 7 6 5 4 3 2 1 pshufd m4, m2, q2100 ; _ _ g f e d c b paddw m1, m2 movu m5, [tlq-28] ; f e d c b a 9 8 add dyd, dyd cmp wd, 8 je .h8_upsample_w8 pshufhw m4, m2, q1000 ; _ _ _ _ c c c b .h8_upsample_w8: paddw m4, m5 psubw m5, m1, m4 movu m4, [tlq-18] ; a 9 8 7 6 5 4 3 psraw m5, 3 paddw m1, m5 movu m5, [tlq-12] ; 7 6 5 4 3 2 1 0 %if ARCH_X86_64 movd m6, r8m ; pixel_max %endif paddw m4, m5 shl wd, 4 psubw m5, m3, m4 movd m4, dyd psraw m5, 3 neg dyd paddw m3, m5 pshufb m6, m0 mova m5, [tlq-14] pshufb m4, m0 pxor m0, m0 pmaxsw m1, m0 pmaxsw m3, m0 mov tlq, rsp pavgw m1, m0 pavgw m3, m0 sub rsp, wq pminsw m1, m6 pminsw m6, m3 mova m3, [base+z_upsample] lea r5d, [dyq+(16<<6)+63] ; ypos punpcklwd m0, m1, m2 mova [tlq+16*0], m0 punpckhwd m1, m2 mova [tlq+16*1], m1 punpcklwd m0, m6, m5 mova [tlq+16*2], m0 punpckhwd m6, m5 mova [tlq+16*3], m6 mova m5, m4 .h8_upsample_loop: mov r4d, r5d sar r4d, 6 movu m1, [tlq+r4*2+16*0] movu m2, [tlq+r4*2+16*1] add r5d, dyd pshufb m2, m3 pshufb m1, m3 punpckhqdq m0, m1, m2 punpcklqdq m1, m2 pand m2, m7, m4 psllw m2, 9 psubw m1, m0 pmulhrsw m1, m2 paddw m4, m5 paddw m0, m1 mova [rsp+wq-16], m0 sub wd, 16 jg .h8_upsample_loop or r3d, 8*2 jmp .end_transpose .h8_no_upsample: lea r4d, [wq+7] movd m1, r4d and r4d, 7 or r4d, 8 ; imin(w+7, 15) test angled, 0x400 jnz .h8_main movd m3, angled shr angled, 8 ; is_sm 
<< 1 pxor m2, m2 pshufb m1, m2 pshufb m3, m2 movu m2, [base+z_filt_wh8] psrldq m4, [base+z_filt_t_w48+angleq*8], 4 pcmpeqb m2, m1 pand m2, m3 pcmpgtb m2, m4 pmovmskb r5d, m2 test r5d, r5d jz .h8_main ; filter_strength == 0 pshuflw m1, [tlq+2], q0000 imul r5d, 0x55555555 mova m2, [tlq-16*1+2] neg r4 mova m3, [tlq-16*2+2] shr r5d, 30 movd m4, [tlq+r4*2] movd [rsp+16*17], m1 mova [rsp+16*16], m2 pshuflw m4, m4, q0000 mova [rsp+16*15], m3 lea r2, [r4-2] movq [rsp+16*17+r4*2-10], m4 cmp wd, 16 cmovae r4, r2 lea tlq, [rsp+16*17-2] call .filter_edge .h8_main: sub tlq, r4 movd m4, dyd sub tlq, r4 shl r4d, 6 movd m6, [tlq] movd m3, r4d pshufb m4, m0 neg dyq pshufb m6, m0 lea r5, [dyq+r4+63] pshufb m3, m0 shl wd, 4 mova m5, m4 sub rsp, wq psubw m3, [base+z_base_inc_z2] .h8_loop: mov r4, r5 sar r4, 6 movu m0, [tlq+r4*2-14] movu m1, [tlq+r4*2-16] pand m2, m7, m4 psllw m2, 9 psubw m1, m0 pmulhrsw m1, m2 pcmpgtw m2, m3, m4 paddw m4, m5 paddw m0, m1 pand m0, m2 pandn m2, m6 por m0, m2 mova [rsp+wq-16], m0 sub wd, 8*2 jz .h8_transpose add r5, dyq jg .h8_loop .h8_end_loop: mova [rsp+wq-16], m6 sub wd, 8*2 jg .h8_end_loop .h8_transpose: or r3d, 8*2 jmp .end_transpose .h16: lea r4d, [wq+15] movd m1, r4d and r4d, 15 or r4d, 16 ; imin(w+15, 31) test angled, 0x400 jnz .h16_main movd m3, angled shr angled, 8 ; is_sm << 1 pxor m2, m2 pshufb m1, m2 pshufb m3, m2 movq m4, [base+z_filt_t_w16+angleq*4] pcmpeqb m1, [base+z_filt_wh16] pand m1, m3 pcmpgtb m1, m4 pmovmskb r5d, m1 test r5d, r5d jz .h16_main ; filter_strength == 0 pshuflw m1, [tlq+2], q0000 mova m2, [tlq-16*1+2] imul r5d, 0x24924924 mova m3, [tlq-16*2+2] neg r4 mova m4, [tlq-16*3+2] shr r5d, 30 mova m5, [tlq-16*4+2] movd m6, [tlq+r4*2] adc r5d, -1 ; filter_strength movd [rsp+16*17], m1 mova [rsp+16*16], m2 mova [rsp+16*15], m3 pshuflw m6, m6, q0000 mova [rsp+16*14], m4 mova [rsp+16*13], m5 lea r2, [r4-2] movq [rsp+16*17+r4*2-10], m6 cmp wd, 32 cmovae r4, r2 lea tlq, [rsp+16*17-2] call .filter_edge .h16_main: sub tlq, r4 movd m5, dyd sub tlq, r4 shl r4d, 6 movd m6, [tlq] movd m3, r4d pshufb m5, m0 neg dyq pshufb m6, m0 lea r5, [dyq+r4+63] pshufb m3, m0 shl wd, 5 paddw m4, m5, [base+z_base_inc_z2] sub rsp, wq psubw m4, m3 .h16_loop: mov r4, r5 sar r4, 6 movu m0, [tlq+r4*2-14] movu m2, [tlq+r4*2-16] pand m3, m7, m4 psllw m3, 9 psubw m2, m0 pmulhrsw m2, m3 movu m1, [tlq+r4*2-30] paddw m0, m2 movu m2, [tlq+r4*2-32] psubw m2, m1 pmulhrsw m2, m3 movddup m3, [base+pw_m512] paddw m1, m2 psraw m2, m4, 15 pcmpgtw m3, m4 paddw m4, m5 pand m0, m2 pandn m2, m6 pand m1, m3 pandn m3, m6 por m0, m2 mova [rsp+wq-16*1], m0 por m1, m3 mova [rsp+wq-16*2], m1 sub wd, 16*2 jz .h16_transpose add r5, dyq jg .h16_loop .h16_end_loop: mova [rsp+wq-16*1], m6 mova [rsp+wq-16*2], m6 sub wd, 16*2 jg .h16_end_loop .h16_transpose: or r3d, 16*2 jmp .end_transpose .h32: lea r4d, [wq+31] and r4d, 31 or r4d, 32 ; imin(w+31, 63) test angled, 0x400 ; !enable_intra_edge_filter jnz .h32_main call .filter_copy lea r5, [r4-2] cmp wd, 64 cmove r4, r5 call .filter_edge_s3 .h32_main: sub tlq, r4 movd m5, dyd sub tlq, r4 shl r4d, 6 movd m6, [tlq] movd m3, r4d pshufb m5, m0 neg dyq pshufb m6, m0 lea r5, [dyq+r4+63] pshufb m3, m0 paddw m4, m5, [base+z_base_inc_z2] psubw m4, m3 .h32_loop: mov r4, r5 sar r4, 6 movu m0, [tlq+r4*2-14] movu m3, [tlq+r4*2-16] pand m2, m7, m4 psllw m2, 9 psubw m3, m0 pmulhrsw m3, m2 movu m1, [tlq+r4*2-30] paddw m0, m3 movu m3, [tlq+r4*2-32] psubw m3, m1 pmulhrsw m3, m2 sub rsp, 16*4 paddw m1, m3 psraw m3, m4, 15 pand m0, m3 pandn m3, m6 por m0, m3 movddup m3, [base+pw_m512] 
pcmpgtw m3, m4 pand m1, m3 pandn m3, m6 mova [rsp+16*3], m0 por m1, m3 mova [rsp+16*2], m1 movu m0, [tlq+r4*2-46] movu m3, [tlq+r4*2-48] psubw m3, m0 pmulhrsw m3, m2 movu m1, [tlq+r4*2-62] paddw m0, m3 movu m3, [tlq+r4*2-64] psubw m3, m1 pmulhrsw m3, m2 movddup m2, [base+pw_m1024] paddw m1, m3 movddup m3, [base+pw_m1536] pcmpgtw m2, m4 pcmpgtw m3, m4 paddw m4, m5 pand m0, m2 pandn m2, m6 pand m1, m3 pandn m3, m6 por m0, m2 mova [rsp+16*1], m0 por m1, m3 mova [rsp+16*0], m1 dec wd jz .h32_transpose add r5, dyq jg .h32_loop .h32_end_loop: sub rsp, 16*4 REPX {mova [rsp+16*x], m6}, 3, 2, 1, 0 dec wd jg .h32_end_loop .h32_transpose: or r3d, 32*2 jmp .end_transpose .h64: lea r4d, [wq+63] test angled, 0x400 ; !enable_intra_edge_filter jnz .h64_main call .filter_copy call .filter_edge_s3 .h64_main: sub tlq, r4 movd m5, dyd sub tlq, r4 shl r4d, 6 movd m6, [tlq] movd m3, r4d pshufb m5, m0 neg dyq pshufb m6, m0 lea r5, [dyq+r4+63] pshufb m3, m0 paddw m4, m5, [base+z_base_inc_z2] psubw m4, m3 .h64_loop: mov r4, r5 sar r4, 6 movu m0, [tlq+r4*2- 14] movu m3, [tlq+r4*2- 16] pand m2, m7, m4 psllw m2, 9 psubw m3, m0 pmulhrsw m3, m2 movu m1, [tlq+r4*2- 30] paddw m0, m3 movu m3, [tlq+r4*2- 32] psubw m3, m1 pmulhrsw m3, m2 sub rsp, 16*8 paddw m1, m3 psraw m3, m4, 15 pand m0, m3 pandn m3, m6 por m0, m3 movddup m3, [base+pw_m512] pcmpgtw m3, m4 pand m1, m3 pandn m3, m6 mova [rsp+16*7], m0 por m1, m3 mova [rsp+16*6], m1 movu m0, [tlq+r4*2- 46] movu m3, [tlq+r4*2- 48] psubw m3, m0 pmulhrsw m3, m2 movu m1, [tlq+r4*2- 62] paddw m0, m3 movu m3, [tlq+r4*2- 64] psubw m3, m1 pmulhrsw m3, m2 paddw m1, m3 movddup m3, [base+pw_m1024] pcmpgtw m3, m4 pand m0, m3 pandn m3, m6 por m0, m3 movddup m3, [base+pw_m1536] pcmpgtw m3, m4 pand m1, m3 pandn m3, m6 mova [rsp+16*5], m0 por m1, m3 mova [rsp+16*4], m1 movu m0, [tlq+r4*2- 78] movu m3, [tlq+r4*2- 80] psubw m3, m0 pmulhrsw m3, m2 movu m1, [tlq+r4*2- 94] paddw m0, m3 movu m3, [tlq+r4*2- 96] psubw m3, m1 pmulhrsw m3, m2 paddw m1, m3 movddup m3, [base+pw_m2048] pcmpgtw m3, m4 pand m0, m3 pandn m3, m6 por m0, m3 movddup m3, [base+pw_m2560] pcmpgtw m3, m4 pand m1, m3 pandn m3, m6 mova [rsp+16*3], m0 por m1, m3 mova [rsp+16*2], m1 movu m0, [tlq+r4*2-110] movu m3, [tlq+r4*2-112] psubw m3, m0 pmulhrsw m3, m2 movu m1, [tlq+r4*2-126] paddw m0, m3 movu m3, [tlq+r4*2-128] psubw m3, m1 pmulhrsw m3, m2 movddup m2, [base+pw_m3072] paddw m1, m3 movddup m3, [base+pw_m3584] pcmpgtw m2, m4 pcmpgtw m3, m4 paddw m4, m5 pand m0, m2 pandn m2, m6 pand m1, m3 pandn m3, m6 por m0, m2 mova [rsp+16*1], m0 por m1, m3 mova [rsp+16*0], m1 dec wd jz .h64_transpose add r5, dyq jg .h64_loop .h64_end_loop: sub rsp, 16*8 REPX {mova [rsp+16*x], m6}, 7, 6, 5, 4, 3, 2, 1, 0 dec wd jg .h64_end_loop .h64_transpose: add r3d, 64*2 .end_transpose: %if ARCH_X86_64 lea r7, [strideq*3] %else mov strideq, [dstq+4*0] mov org_wd, [dstq+4*1] %endif lea r4d, [r3*3] .end_transpose_loop: lea r2, [rsp+r3-8] lea r6, [dstq+org_wq*2-8] .end_transpose_loop_y: movq m0, [r2+r4 ] movq m1, [r2+r3*2] movq m2, [r2+r3*1] movq m3, [r2+r3*0] sub r2, 8 punpcklwd m0, m1 punpcklwd m2, m3 punpckhdq m1, m0, m2 punpckldq m0, m2 movhps [r6+strideq*0], m1 movq [r6+strideq*1], m1 %if ARCH_X86_64 movhps [r6+strideq*2], m0 movq [r6+r7 ], m0 lea r6, [r6+strideq*4] %else lea r6, [r6+strideq*2] movhps [r6+strideq*0], m0 movq [r6+strideq*1], m0 lea r6, [r6+strideq*2] %endif cmp r2, rsp jae .end_transpose_loop_y lea rsp, [rsp+r3*4] sub org_wd, 4 jg .end_transpose_loop RET .filter_copy: neg r4 pshuflw m2, [tlq+2], q0000 xor r5d, r5d pshuflw m3, [tlq+r4*2], 
q0000 movq [rsp+gprsize+16*17], m2 .filter_copy_loop: mova m1, [tlq+r5*2-16*1+2] mova m2, [tlq+r5*2-16*2+2] sub r5, 16 mova [rsp+r5*2+gprsize+16*18], m1 mova [rsp+r5*2+gprsize+16*17], m2 cmp r5d, r4d jg .filter_copy_loop lea tlq, [rsp+gprsize+16*17-2] movq [tlq+r4*2-8], m3 ret .filter_edge: cmp r5d, 3 je .filter_edge_s3 movddup m4, [base+z_filt_k+r5*8-8] movddup m5, [base+z_filt_k+r5*8+8] xor r5d, r5d movddup m6, [base+pw_8] movu m2, [tlq-12] jmp .filter_edge_start .filter_edge_loop: movu m2, [tlq+r5*2-12] mova [tlq+r5*2+2], m1 .filter_edge_start: pmullw m1, m4, [tlq+r5*2-14] movu m3, [tlq+r5*2-16] sub r5, 8 paddw m2, m3 pmullw m2, m5 paddw m1, m6 paddw m1, m2 psrlw m1, 4 cmp r5d, r4d jg .filter_edge_loop mova [tlq+r5*2+2], m1 neg r4d ret .filter_edge_s3: movddup m5, [base+pw_3] xor r5d, r5d movu m2, [tlq-12] movu m3, [tlq-10] jmp .filter_edge_s3_start .filter_edge_s3_loop: movu m2, [tlq+r5*2-12] movu m3, [tlq+r5*2-10] mova [tlq+r5*2+2], m1 .filter_edge_s3_start: paddw m2, [tlq+r5*2-14] paddw m3, m5 movu m1, [tlq+r5*2-16] movu m4, [tlq+r5*2-18] sub r5, 8 paddw m1, m2 pavgw m3, m4 paddw m1, m3 psrlw m1, 2 cmp r5d, r4d jg .filter_edge_s3_loop mova [tlq+r5*2+2], m1 neg r4d ret %if ARCH_X86_64 cglobal ipred_filter_16bpc, 4, 7, 16, dst, stride, tl, w, h, filter %else cglobal ipred_filter_16bpc, 4, 7, 8, -16*8, dst, stride, tl, w, h, filter %define m8 [esp+16*0] %define m9 [esp+16*1] %define m10 [esp+16*2] %define m11 [esp+16*3] %define m12 [esp+16*4] %define m13 [esp+16*5] %define m14 [esp+16*6] %define m15 [esp+16*7] %endif %define base r6-$$ movifnidn hd, hm movd m6, r8m ; bitdepth_max %ifidn filterd, filterm movzx filterd, filterb %else movzx filterd, byte filterm %endif LEA r6, $$ shl filterd, 6 movu m0, [tlq-6] ; __ l1 l0 tl t0 t1 t2 t3 mova m1, [base+filter_intra_taps+filterq+16*0] mova m2, [base+filter_intra_taps+filterq+16*1] mova m3, [base+filter_intra_taps+filterq+16*2] mova m4, [base+filter_intra_taps+filterq+16*3] pxor m5, m5 %if ARCH_X86_64 punpcklbw m8, m5, m1 ; place 8-bit coefficients in the upper punpckhbw m9, m5, m1 ; half of each 16-bit word to avoid punpcklbw m10, m5, m2 ; having to perform sign-extension. 
punpckhbw m11, m5, m2 punpcklbw m12, m5, m3 punpckhbw m13, m5, m3 punpcklbw m14, m5, m4 punpckhbw m15, m5, m4 %else punpcklbw m7, m5, m1 mova m8, m7 punpckhbw m7, m5, m1 mova m9, m7 punpcklbw m7, m5, m2 mova m10, m7 punpckhbw m7, m5, m2 mova m11, m7 punpcklbw m7, m5, m3 mova m12, m7 punpckhbw m7, m5, m3 mova m13, m7 punpcklbw m7, m5, m4 mova m14, m7 punpckhbw m7, m5, m4 mova m15, m7 %endif mova m7, [base+filter_shuf] add hd, hd mov r5, dstq pshuflw m6, m6, q0000 mov r6, tlq punpcklqdq m6, m6 sub tlq, hq .left_loop: pshufb m0, m7 ; tl t0 t1 t2 t3 l0 l1 __ pshufd m1, m0, q0000 pmaddwd m2, m8, m1 pmaddwd m1, m9 pshufd m4, m0, q1111 pmaddwd m3, m10, m4 pmaddwd m4, m11 paddd m2, m3 paddd m1, m4 pshufd m4, m0, q2222 pmaddwd m3, m12, m4 pmaddwd m4, m13 paddd m2, m3 paddd m1, m4 pshufd m3, m0, q3333 pmaddwd m0, m14, m3 pmaddwd m3, m15 paddd m0, m2 paddd m1, m3 psrad m0, 11 ; x >> 3 psrad m1, 11 packssdw m0, m1 pmaxsw m0, m5 pavgw m0, m5 ; (x + 8) >> 4 pminsw m0, m6 movq [dstq+strideq*0], m0 movhps [dstq+strideq*1], m0 movlps m0, [tlq+hq-10] lea dstq, [dstq+strideq*2] sub hd, 2*2 jg .left_loop sub wd, 4 jz .end sub tld, r6d ; -h*2 sub r6, r5 ; tl-dst .right_loop0: add r5, 8 mov hd, tld movu m0, [r5+r6] ; tl t0 t1 t2 t3 __ __ __ mov dstq, r5 .right_loop: pshufd m2, m0, q0000 pmaddwd m1, m8, m2 pmaddwd m2, m9 pshufd m4, m0, q1111 pmaddwd m3, m10, m4 pmaddwd m4, m11 pinsrw m0, [dstq+strideq*0-2], 5 paddd m1, m3 paddd m2, m4 pshufd m0, m0, q2222 movddup m4, [dstq+strideq*1-8] pmaddwd m3, m12, m0 pmaddwd m0, m13 paddd m1, m3 paddd m0, m2 pshuflw m2, m4, q3333 punpcklwd m2, m5 pmaddwd m3, m14, m2 pmaddwd m2, m15 paddd m1, m3 paddd m0, m2 psrad m1, 11 psrad m0, 11 packssdw m0, m1 pmaxsw m0, m5 pavgw m0, m5 pminsw m0, m6 movhps [dstq+strideq*0], m0 movq [dstq+strideq*1], m0 palignr m0, m4, 14 lea dstq, [dstq+strideq*2] add hd, 2*2 jl .right_loop sub wd, 4 jg .right_loop0 .end: RET %if UNIX64 DECLARE_REG_TMP 7 %else DECLARE_REG_TMP 5 %endif cglobal ipred_cfl_top_16bpc, 4, 7, 8, dst, stride, tl, w, h, ac LEA t0, ipred_cfl_left_16bpc_ssse3_table movd m4, wd tzcnt wd, wd movifnidn hd, hm add tlq, 2 movsxd r6, [t0+wq*4] movd m5, wd jmp mangle(private_prefix %+ _ipred_cfl_left_16bpc_ssse3.start) cglobal ipred_cfl_left_16bpc, 3, 7, 8, dst, stride, tl, w, h, ac, alpha movifnidn hd, hm LEA t0, ipred_cfl_left_16bpc_ssse3_table tzcnt wd, wm lea r6d, [hq*2] movd m4, hd sub tlq, r6 tzcnt r6d, hd movd m5, r6d movsxd r6, [t0+r6*4] .start: movd m7, r7m movu m0, [tlq] add r6, t0 add t0, ipred_cfl_splat_16bpc_ssse3_table-ipred_cfl_left_16bpc_ssse3_table movsxd wq, [t0+wq*4] pxor m6, m6 pshuflw m7, m7, q0000 pcmpeqw m3, m3 add wq, t0 movifnidn acq, acmp pavgw m4, m6 punpcklqdq m7, m7 jmp r6 .h32: movu m1, [tlq+48] movu m2, [tlq+32] paddw m0, m1 paddw m0, m2 .h16: movu m1, [tlq+16] paddw m0, m1 .h8: pshufd m1, m0, q1032 paddw m0, m1 .h4: pmaddwd m0, m3 psubd m4, m0 pshuflw m0, m4, q1032 paddd m0, m4 psrld m0, m5 pshuflw m0, m0, q0000 punpcklqdq m0, m0 jmp wq %macro IPRED_CFL 2 ; dst, src pabsw m%1, m%2 pmulhrsw m%1, m2 psignw m%2, m1 psignw m%1, m%2 paddw m%1, m0 pmaxsw m%1, m6 pminsw m%1, m7 %endmacro cglobal ipred_cfl_16bpc, 4, 7, 8, dst, stride, tl, w, h, ac, alpha movifnidn hd, hm tzcnt r6d, hd lea t0d, [wq+hq] movd m4, t0d tzcnt t0d, t0d movd m5, t0d LEA t0, ipred_cfl_16bpc_ssse3_table tzcnt wd, wd movd m7, r7m movsxd r6, [t0+r6*4] movsxd wq, [t0+wq*4+4*4] psrlw m4, 1 pxor m6, m6 pshuflw m7, m7, q0000 add r6, t0 add wq, t0 movifnidn acq, acmp pcmpeqw m3, m3 punpcklqdq m7, m7 jmp r6 .h4: movq m0, [tlq-8] jmp wq .w4: movq 
m1, [tlq+2] paddw m0, m1 pmaddwd m0, m3 psubd m4, m0 pshufd m0, m4, q1032 paddd m0, m4 pshuflw m4, m0, q1032 paddd m0, m4 cmp hd, 4 jg .w4_mul psrld m0, 3 jmp .w4_end .w4_mul: mov r6d, 0xAAAB mov r2d, 0x6667 cmp hd, 16 cmove r6d, r2d movd m1, r6d psrld m0, 2 pmulhuw m0, m1 psrlw m0, 1 .w4_end: pshuflw m0, m0, q0000 punpcklqdq m0, m0 .s4: movd m1, alpham lea r6, [strideq*3] pshuflw m1, m1, q0000 punpcklqdq m1, m1 pabsw m2, m1 psllw m2, 9 .s4_loop: mova m4, [acq+16*0] mova m5, [acq+16*1] add acq, 16*2 IPRED_CFL 3, 4 IPRED_CFL 4, 5 movq [dstq+strideq*0], m3 movhps [dstq+strideq*1], m3 movq [dstq+strideq*2], m4 movhps [dstq+r6 ], m4 lea dstq, [dstq+strideq*4] sub hd, 4 jg .s4_loop RET .h8: mova m0, [tlq-16] jmp wq .w8: movu m1, [tlq+2] paddw m0, m1 pmaddwd m0, m3 psubd m4, m0 pshufd m0, m4, q1032 paddd m0, m4 pshuflw m4, m0, q1032 paddd m0, m4 psrld m0, m5 cmp hd, 8 je .w8_end mov r6d, 0xAAAB mov r2d, 0x6667 cmp hd, 32 cmove r6d, r2d movd m1, r6d pmulhuw m0, m1 psrlw m0, 1 .w8_end: pshuflw m0, m0, q0000 punpcklqdq m0, m0 .s8: movd m1, alpham pshuflw m1, m1, q0000 punpcklqdq m1, m1 pabsw m2, m1 psllw m2, 9 .s8_loop: mova m4, [acq+16*0] mova m5, [acq+16*1] add acq, 16*2 IPRED_CFL 3, 4 IPRED_CFL 4, 5 mova [dstq+strideq*0], m3 mova [dstq+strideq*1], m4 lea dstq, [dstq+strideq*2] sub hd, 2 jg .s8_loop RET .h16: mova m0, [tlq-32] paddw m0, [tlq-16] jmp wq .w16: movu m1, [tlq+ 2] movu m2, [tlq+18] paddw m1, m2 paddw m0, m1 pmaddwd m0, m3 psubd m4, m0 pshufd m0, m4, q1032 paddd m0, m4 pshuflw m4, m0, q1032 paddd m0, m4 psrld m0, m5 cmp hd, 16 je .w16_end mov r6d, 0xAAAB mov r2d, 0x6667 test hd, 8|32 cmovz r6d, r2d movd m1, r6d pmulhuw m0, m1 psrlw m0, 1 .w16_end: pshuflw m0, m0, q0000 punpcklqdq m0, m0 .s16: movd m1, alpham pshuflw m1, m1, q0000 punpcklqdq m1, m1 pabsw m2, m1 psllw m2, 9 .s16_loop: mova m4, [acq+16*0] mova m5, [acq+16*1] add acq, 16*2 IPRED_CFL 3, 4 IPRED_CFL 4, 5 mova [dstq+16*0], m3 mova [dstq+16*1], m4 add dstq, strideq dec hd jg .s16_loop RET .h32: mova m0, [tlq-64] paddw m0, [tlq-48] paddw m0, [tlq-32] paddw m0, [tlq-16] jmp wq .w32: movu m1, [tlq+ 2] movu m2, [tlq+18] paddw m1, m2 movu m2, [tlq+34] paddw m1, m2 movu m2, [tlq+50] paddw m1, m2 paddw m0, m1 pmaddwd m0, m3 psubd m4, m0 pshufd m0, m4, q1032 paddd m0, m4 pshuflw m4, m0, q1032 paddd m0, m4 psrld m0, m5 cmp hd, 32 je .w32_end mov r6d, 0xAAAB mov r2d, 0x6667 cmp hd, 8 cmove r6d, r2d movd m1, r6d pmulhuw m0, m1 psrlw m0, 1 .w32_end: pshuflw m0, m0, q0000 punpcklqdq m0, m0 .s32: movd m1, alpham pshuflw m1, m1, q0000 punpcklqdq m1, m1 pabsw m2, m1 psllw m2, 9 .s32_loop: mova m4, [acq+16*0] mova m5, [acq+16*1] IPRED_CFL 3, 4 IPRED_CFL 4, 5 mova [dstq+16*0], m3 mova [dstq+16*1], m4 mova m4, [acq+16*2] mova m5, [acq+16*3] add acq, 16*4 IPRED_CFL 3, 4 IPRED_CFL 4, 5 mova [dstq+16*2], m3 mova [dstq+16*3], m4 add dstq, strideq dec hd jg .s32_loop RET cglobal ipred_cfl_128_16bpc, 3, 7, 8, dst, stride, tl, w, h, ac tzcnt wd, wm LEA t0, ipred_cfl_splat_16bpc_ssse3_table mov r6d, r7m movifnidn hd, hm shr r6d, 11 movd m7, r7m movsxd wq, [t0+wq*4] movddup m0, [t0-ipred_cfl_splat_16bpc_ssse3_table+pw_512+r6*8] pshuflw m7, m7, q0000 pxor m6, m6 add wq, t0 movifnidn acq, acmp punpcklqdq m7, m7 jmp wq cglobal ipred_cfl_ac_420_16bpc, 3, 7, 6, ac, ypx, stride, wpad, hpad, w, h movifnidn hpadd, hpadm %if ARCH_X86_32 && PIC pcmpeqw m5, m5 pabsw m5, m5 paddw m5, m5 %else movddup m5, [pw_2] %endif mov hd, hm shl hpadd, 2 pxor m4, m4 sub hd, hpadd cmp dword wm, 8 mov r5, acq jg .w16 je .w8 lea r3, [strideq*3] .w4_loop: pmaddwd m0, m5, 
[ypxq+strideq*0] pmaddwd m1, m5, [ypxq+strideq*1] pmaddwd m2, m5, [ypxq+strideq*2] pmaddwd m3, m5, [ypxq+r3 ] lea ypxq, [ypxq+strideq*4] paddd m0, m1 paddd m2, m3 paddd m4, m0 packssdw m0, m2 paddd m4, m2 mova [acq], m0 add acq, 16 sub hd, 2 jg .w4_loop test hpadd, hpadd jz .dc punpckhqdq m0, m0 pslld m2, 2 .w4_hpad: mova [acq+16*0], m0 paddd m4, m2 mova [acq+16*1], m0 add acq, 16*2 sub hpadd, 4 jg .w4_hpad jmp .dc .w8: %if ARCH_X86_32 cmp dword wpadm, 0 %else test wpadd, wpadd %endif jnz .w8_wpad1 .w8_loop: pmaddwd m0, m5, [ypxq+strideq*0+16*0] pmaddwd m2, m5, [ypxq+strideq*1+16*0] pmaddwd m1, m5, [ypxq+strideq*0+16*1] pmaddwd m3, m5, [ypxq+strideq*1+16*1] lea ypxq, [ypxq+strideq*2] paddd m0, m2 paddd m1, m3 paddd m2, m0, m1 packssdw m0, m1 paddd m4, m2 mova [acq], m0 add acq, 16 dec hd jg .w8_loop .w8_hpad: test hpadd, hpadd jz .dc pslld m2, 2 mova m1, m0 jmp .hpad .w8_wpad1: pmaddwd m0, m5, [ypxq+strideq*0] pmaddwd m1, m5, [ypxq+strideq*1] lea ypxq, [ypxq+strideq*2] paddd m0, m1 pshufd m1, m0, q3333 paddd m2, m0, m1 packssdw m0, m1 paddd m4, m2 mova [acq], m0 add acq, 16 dec hd jg .w8_wpad1 jmp .w8_hpad .w16_wpad3: pshufd m3, m0, q3333 mova m1, m3 mova m2, m3 jmp .w16_wpad_end .w16_wpad2: pshufd m1, m3, q3333 mova m2, m1 jmp .w16_wpad_end .w16_wpad1: pshufd m2, m1, q3333 jmp .w16_wpad_end .w16: movifnidn wpadd, wpadm WIN64_SPILL_XMM 7 .w16_loop: pmaddwd m0, m5, [ypxq+strideq*0+16*0] pmaddwd m6, m5, [ypxq+strideq*1+16*0] paddd m0, m6 cmp wpadd, 2 jg .w16_wpad3 pmaddwd m3, m5, [ypxq+strideq*0+16*1] pmaddwd m6, m5, [ypxq+strideq*1+16*1] paddd m3, m6 je .w16_wpad2 pmaddwd m1, m5, [ypxq+strideq*0+16*2] pmaddwd m6, m5, [ypxq+strideq*1+16*2] paddd m1, m6 jp .w16_wpad1 pmaddwd m2, m5, [ypxq+strideq*0+16*3] pmaddwd m6, m5, [ypxq+strideq*1+16*3] paddd m2, m6 .w16_wpad_end: lea ypxq, [ypxq+strideq*2] paddd m6, m0, m3 packssdw m0, m3 paddd m6, m1 mova [acq+16*0], m0 packssdw m1, m2 paddd m2, m6 mova [acq+16*1], m1 add acq, 16*2 paddd m4, m2 dec hd jg .w16_loop WIN64_RESTORE_XMM add hpadd, hpadd jz .dc paddd m2, m2 .hpad: mova [acq+16*0], m0 mova [acq+16*1], m1 paddd m4, m2 mova [acq+16*2], m0 mova [acq+16*3], m1 add acq, 16*4 sub hpadd, 4 jg .hpad .dc: sub r5, acq ; -w*h*2 pshufd m2, m4, q1032 tzcnt r1d, r5d paddd m2, m4 sub r1d, 2 pshufd m4, m2, q2301 movd m0, r1d paddd m2, m4 psrld m2, m0 pxor m0, m0 pavgw m2, m0 packssdw m2, m2 .dc_loop: mova m0, [acq+r5+16*0] mova m1, [acq+r5+16*1] psubw m0, m2 psubw m1, m2 mova [acq+r5+16*0], m0 mova [acq+r5+16*1], m1 add r5, 16*2 jl .dc_loop RET cglobal ipred_cfl_ac_422_16bpc, 3, 7, 6, ac, ypx, stride, wpad, hpad, w, h movifnidn hpadd, hpadm %if ARCH_X86_32 && PIC pcmpeqw m5, m5 pabsw m5, m5 psllw m5, 2 %else movddup m5, [pw_4] %endif mov hd, hm shl hpadd, 2 pxor m4, m4 sub hd, hpadd cmp dword wm, 8 mov r5, acq jg .w16 je .w8 lea r3, [strideq*3] .w4_loop: pmaddwd m0, m5, [ypxq+strideq*0] pmaddwd m3, m5, [ypxq+strideq*1] pmaddwd m1, m5, [ypxq+strideq*2] pmaddwd m2, m5, [ypxq+r3 ] lea ypxq, [ypxq+strideq*4] paddd m4, m0 packssdw m0, m3 paddd m3, m1 packssdw m1, m2 paddd m4, m2 paddd m4, m3 mova [acq+16*0], m0 mova [acq+16*1], m1 add acq, 16*2 sub hd, 4 jg .w4_loop test hpadd, hpadd jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc punpckhqdq m1, m1 pslld m2, 3 mova [acq+16*0], m1 mova [acq+16*1], m1 paddd m4, m2 mova [acq+16*2], m1 mova [acq+16*3], m1 add acq, 16*4 jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc .w8: %if ARCH_X86_32 cmp dword wpadm, 0 %else test wpadd, wpadd %endif jnz .w8_wpad1 .w8_loop: pmaddwd m0, m5, 
[ypxq+strideq*0+16*0] pmaddwd m2, m5, [ypxq+strideq*0+16*1] pmaddwd m1, m5, [ypxq+strideq*1+16*0] pmaddwd m3, m5, [ypxq+strideq*1+16*1] lea ypxq, [ypxq+strideq*2] paddd m4, m0 packssdw m0, m2 paddd m4, m2 mova [acq+16*0], m0 paddd m2, m1, m3 packssdw m1, m3 paddd m4, m2 mova [acq+16*1], m1 add acq, 16*2 sub hd, 2 jg .w8_loop .w8_hpad: test hpadd, hpadd jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc pslld m2, 2 mova m0, m1 jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).hpad .w8_wpad1: pmaddwd m0, m5, [ypxq+strideq*0] pmaddwd m1, m5, [ypxq+strideq*1] lea ypxq, [ypxq+strideq*2] pshufd m2, m0, q3333 pshufd m3, m1, q3333 paddd m4, m0 packssdw m0, m2 paddd m4, m2 paddd m2, m1, m3 packssdw m1, m3 paddd m4, m2 mova [acq+16*0], m0 mova [acq+16*1], m1 add acq, 16*2 sub hd, 2 jg .w8_wpad1 jmp .w8_hpad .w16_wpad3: pshufd m3, m0, q3333 mova m1, m3 mova m2, m3 jmp .w16_wpad_end .w16_wpad2: pshufd m1, m3, q3333 mova m2, m1 jmp .w16_wpad_end .w16_wpad1: pshufd m2, m1, q3333 jmp .w16_wpad_end .w16: movifnidn wpadd, wpadm WIN64_SPILL_XMM 7 .w16_loop: pmaddwd m0, m5, [ypxq+16*0] cmp wpadd, 2 jg .w16_wpad3 pmaddwd m3, m5, [ypxq+16*1] je .w16_wpad2 pmaddwd m1, m5, [ypxq+16*2] jp .w16_wpad1 pmaddwd m2, m5, [ypxq+16*3] .w16_wpad_end: add ypxq, strideq paddd m6, m0, m3 packssdw m0, m3 mova [acq+16*0], m0 paddd m6, m1 packssdw m1, m2 paddd m2, m6 mova [acq+16*1], m1 add acq, 16*2 paddd m4, m2 dec hd jg .w16_loop WIN64_RESTORE_XMM add hpadd, hpadd jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc paddd m2, m2 jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).hpad cglobal ipred_cfl_ac_444_16bpc, 3, 7, 6, ac, ypx, stride, wpad, hpad, w, h %define base r6-ipred_cfl_ac_444_16bpc_ssse3_table LEA r6, ipred_cfl_ac_444_16bpc_ssse3_table tzcnt wd, wm movifnidn hpadd, hpadm pxor m4, m4 movsxd wq, [r6+wq*4] movddup m5, [base+pw_1] add wq, r6 mov hd, hm shl hpadd, 2 sub hd, hpadd jmp wq .w4: lea r3, [strideq*3] mov r5, acq .w4_loop: movq m0, [ypxq+strideq*0] movhps m0, [ypxq+strideq*1] movq m1, [ypxq+strideq*2] movhps m1, [ypxq+r3 ] lea ypxq, [ypxq+strideq*4] psllw m0, 3 psllw m1, 3 mova [acq+16*0], m0 pmaddwd m0, m5 mova [acq+16*1], m1 pmaddwd m2, m5, m1 add acq, 16*2 paddd m4, m0 paddd m4, m2 sub hd, 4 jg .w4_loop test hpadd, hpadd jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc punpckhqdq m1, m1 mova [acq+16*0], m1 pslld m2, 2 mova [acq+16*1], m1 punpckhqdq m2, m2 mova [acq+16*2], m1 paddd m4, m2 mova [acq+16*3], m1 add acq, 16*4 jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc .w8: mov r5, acq .w8_loop: mova m0, [ypxq+strideq*0] mova m1, [ypxq+strideq*1] lea ypxq, [ypxq+strideq*2] psllw m0, 3 psllw m1, 3 mova [acq+16*0], m0 pmaddwd m0, m5 mova [acq+16*1], m1 pmaddwd m2, m5, m1 add acq, 16*2 paddd m4, m0 paddd m4, m2 sub hd, 2 jg .w8_loop .w8_hpad: test hpadd, hpadd jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc pslld m2, 2 mova m0, m1 jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).hpad .w16_wpad2: pshufhw m3, m2, q3333 pshufhw m1, m0, q3333 punpckhqdq m3, m3 punpckhqdq m1, m1 jmp .w16_wpad_end .w16: movifnidn wpadd, wpadm mov r5, acq .w16_loop: mova m2, [ypxq+strideq*0+16*0] mova m0, [ypxq+strideq*1+16*0] psllw m2, 3 psllw m0, 3 test wpadd, wpadd jnz .w16_wpad2 mova m3, [ypxq+strideq*0+16*1] mova m1, [ypxq+strideq*1+16*1] psllw m3, 3 psllw m1, 3 .w16_wpad_end: lea ypxq, [ypxq+strideq*2] mova [acq+16*0], m2 pmaddwd m2, m5 mova [acq+16*1], m3 pmaddwd m3, m5 paddd m4, m2 pmaddwd m2, m5, m0 mova [acq+16*2], m0 paddd m4, m3 
pmaddwd m3, m5, m1 mova [acq+16*3], m1 add acq, 16*4 paddd m2, m3 paddd m4, m2 sub hd, 2 jg .w16_loop add hpadd, hpadd jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc paddd m2, m2 jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).hpad .w32_wpad6: pshufhw m1, m0, q3333 punpckhqdq m1, m1 mova m2, m1 mova m3, m1 jmp .w32_wpad_end .w32_wpad4: pshufhw m2, m1, q3333 punpckhqdq m2, m2 mova m3, m2 jmp .w32_wpad_end .w32_wpad2: pshufhw m3, m2, q3333 punpckhqdq m3, m3 jmp .w32_wpad_end .w32: movifnidn wpadd, wpadm mov r5, acq WIN64_SPILL_XMM 8 .w32_loop: mova m0, [ypxq+16*0] psllw m0, 3 cmp wpadd, 4 jg .w32_wpad6 mova m1, [ypxq+16*1] psllw m1, 3 je .w32_wpad4 mova m2, [ypxq+16*2] psllw m2, 3 jnp .w32_wpad2 mova m3, [ypxq+16*3] psllw m3, 3 .w32_wpad_end: add ypxq, strideq pmaddwd m6, m5, m0 mova [acq+16*0], m0 pmaddwd m7, m5, m1 mova [acq+16*1], m1 paddd m6, m7 pmaddwd m7, m5, m2 mova [acq+16*2], m2 paddd m6, m7 pmaddwd m7, m5, m3 mova [acq+16*3], m3 add acq, 16*4 paddd m6, m7 paddd m4, m6 dec hd jg .w32_loop %if WIN64 mova m5, m6 WIN64_RESTORE_XMM SWAP 5, 6 %endif test hpadd, hpadd jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc .w32_hpad_loop: mova [acq+16*0], m0 mova [acq+16*1], m1 paddd m4, m6 mova [acq+16*2], m2 mova [acq+16*3], m3 add acq, 16*4 dec hpadd jg .w32_hpad_loop jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc cglobal pal_pred_16bpc, 4, 5, 5, dst, stride, pal, idx, w, h %define base r2-pal_pred_16bpc_ssse3_table %if ARCH_X86_32 %define hd r2d %endif mova m3, [palq] LEA r2, pal_pred_16bpc_ssse3_table tzcnt wd, wm pshufb m3, [base+pal_pred_shuf] movsxd wq, [r2+wq*4] pshufd m4, m3, q1032 add wq, r2 movifnidn hd, hm jmp wq .w4: mova m0, [idxq] add idxq, 16 pshufb m1, m3, m0 pshufb m2, m4, m0 punpcklbw m0, m1, m2 punpckhbw m1, m2 movq [dstq+strideq*0], m0 movhps [dstq+strideq*1], m0 lea dstq, [dstq+strideq*2] movq [dstq+strideq*0], m1 movhps [dstq+strideq*1], m1 lea dstq, [dstq+strideq*2] sub hd, 4 jg .w4 RET .w8: mova m0, [idxq] add idxq, 16 pshufb m1, m3, m0 pshufb m2, m4, m0 punpcklbw m0, m1, m2 punpckhbw m1, m2 mova [dstq+strideq*0], m0 mova [dstq+strideq*1], m1 lea dstq, [dstq+strideq*2] sub hd, 2 jg .w8 RET .w16: mova m0, [idxq] add idxq, 16 pshufb m1, m3, m0 pshufb m2, m4, m0 punpcklbw m0, m1, m2 punpckhbw m1, m2 mova [dstq+16*0], m0 mova [dstq+16*1], m1 add dstq, strideq dec hd jg .w16 RET .w32: mova m0, [idxq+16*0] pshufb m1, m3, m0 pshufb m2, m4, m0 punpcklbw m0, m1, m2 punpckhbw m1, m2 mova m2, [idxq+16*1] add idxq, 16*2 mova [dstq+16*0], m0 pshufb m0, m3, m2 mova [dstq+16*1], m1 pshufb m1, m4, m2 punpcklbw m2, m0, m1 punpckhbw m0, m1 mova [dstq+16*2], m2 mova [dstq+16*3], m0 add dstq, strideq dec hd jg .w32 RET .w64: mova m0, [idxq+16*0] pshufb m1, m3, m0 pshufb m2, m4, m0 punpcklbw m0, m1, m2 punpckhbw m1, m2 mova m2, [idxq+16*1] mova [dstq+16*0], m0 pshufb m0, m3, m2 mova [dstq+16*1], m1 pshufb m1, m4, m2 punpcklbw m2, m0, m1 punpckhbw m0, m1 mova m1, [idxq+16*2] mova [dstq+16*2], m2 pshufb m2, m3, m1 mova [dstq+16*3], m0 pshufb m0, m4, m1 punpcklbw m1, m2, m0 punpckhbw m2, m0 mova m0, [idxq+16*3] add idxq, 16*4 mova [dstq+16*4], m1 pshufb m1, m3, m0 mova [dstq+16*5], m2 pshufb m2, m4, m0 punpcklbw m0, m1, m2 punpckhbw m1, m2 mova [dstq+16*6], m0 mova [dstq+16*7], m1 add dstq, strideq dec hd jg .w64 RET av-scenechange-0.14.1/src/asm/x86/ipred_avx2.asm000064400000000000000000005565731046102023000173440ustar 00000000000000; Copyright © 2018-2021, VideoLAN and dav1d authors ; Copyright © 2018, Two Orioles, LLC ; All rights 
reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions are met: ; ; 1. Redistributions of source code must retain the above copyright notice, this ; list of conditions and the following disclaimer. ; ; 2. Redistributions in binary form must reproduce the above copyright notice, ; this list of conditions and the following disclaimer in the documentation ; and/or other materials provided with the distribution. ; ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. %include "config.asm" %include "src/asm/x86/x86inc.asm" %if ARCH_X86_64 SECTION_RODATA 64 %macro SMOOTH_WEIGHT_TABLE 1-* %rep %0 db %1-128, 127-%1 %rotate 1 %endrep %endmacro ; sm_weights[], but modified to precalculate x and 256-x with offsets to ; enable efficient use of pmaddubsw (which requires signed values) smooth_weights: SMOOTH_WEIGHT_TABLE \ 0, 0, 255, 128, 255, 149, 85, 64, \ 255, 197, 146, 105, 73, 50, 37, 32, \ 255, 225, 196, 170, 145, 123, 102, 84, \ 68, 54, 43, 33, 26, 20, 17, 16, \ 255, 240, 225, 210, 196, 182, 169, 157, \ 145, 133, 122, 111, 101, 92, 83, 74, \ 66, 59, 52, 45, 39, 34, 29, 25, \ 21, 17, 14, 12, 10, 9, 8, 8, \ 255, 248, 240, 233, 225, 218, 210, 203, \ 196, 189, 182, 176, 169, 163, 156, 150, \ 144, 138, 133, 127, 121, 116, 111, 106, \ 101, 96, 91, 86, 82, 77, 73, 69, \ 65, 61, 57, 54, 50, 47, 44, 41, \ 38, 35, 32, 29, 27, 25, 22, 20, \ 18, 16, 15, 13, 12, 10, 9, 8, \ 7, 6, 6, 5, 5, 4, 4, 4 pb_1to32: db 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 db 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32 pb_32to1: db 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17 pb_16to1: db 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1 z_filter_wh: db 7, 7, 11, 11, 15, 15, 19, 19, 19, 23, 23, 23, 31, 31, 31, 39 db 39, 39, 47, 47, 47, 63, 63, 63, 79, 79, 79, -1 z_filter_k: db 0, 16, 0, 16, 0, 20, 0, 20, 8, 16, 8, 16 db 32, 16, 32, 16, 24, 20, 24, 20, 16, 16, 16, 16 db 0, 0, 0, 0, 0, 0, 0, 0, 8, 0, 8, 0 z_filter_s: db 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7 db 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15 db 15, 15, 15, 15, 15, 15, 15, 15 ; should be in one cache line pb_128: times 4 db 128 ; those are just placed here for alignment. 
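; (annotation, hedged) The constant tables that follow — z3_shuf, z_upsample*,
; z*_shuf_*, z_base_inc, z2_base_inc, z2_ymul, z_transpose4 — appear to be
; pshufb masks and per-lane position increments consumed by the directional
; (z1/z2/z3) predictors below; pb_36_m4 looks like the (36, -4) byte pair for
; pmaddubsw, i.e. the -1/9/9/-1 intra edge upsampling kernel scaled by 4.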
pb_36_m4: times 2 db 36, -4 z3_shuf: db 8, 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0 z_filter_t0: db 55,127, 39,127, 39,127, 7, 15, 31, 7, 15, 31, 0, 3, 31, 0 z_filter_t1: db 39, 63, 19, 47, 19, 47, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0 z_upsample1: db 1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7 z_upsample2: db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 8, 8, 8 z2_upsample: db 7, 6, 15, 14, 5, 4, 13, 12, 3, 2, 11, 10, 1, 0, 9, 8 z1_shuf_w4: db 0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12 z2_shuf_h2: db 3, 2, 7, 6, 11, 10, 15, 14, 2, 1, 6, 5, 10, 9, 14, 13 z2_shuf_h4: db 7, 6, 15, 14, 6, 5, 14, 13, 5, 4, 13, 12, 4, 3, 12, 11 z3_shuf_w4: db 4, 3, 3, 2, 2, 1, 1, 0, 12, 11, 11, 10, 10, 9, 9, 8 z_transpose4: db 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 z_base_inc: dw 0*64, 1*64, 2*64, 3*64, 4*64, 5*64, 6*64, 7*64 dw 16*64, 17*64, 18*64, 19*64, 20*64, 21*64, 22*64, 23*64 z2_base_inc: dw 1*64, 2*64, 3*64, 4*64, 5*64, 6*64, 7*64, 8*64 dw 9*64, 10*64, 11*64, 12*64, 13*64, 14*64, 15*64, 16*64 z2_ymul: dw 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 z2_y_shuf_h4: db 90, 90, 90, 90, 14, 14, 14, 14, 27, 27, 27, 27, 31, 31, 31, 31 ; 2, 6, 3, 7 db 32, 32, 32, 32, 12, 12, 12, 12, 1, 0, 1, 0, 5, -1, -1, -1 ; 0, 4, 1, 5 ; vpermd indices in bits 4..6 of filter_shuf1: 0, 2, 6, 4, 1, 3, 7, 5 filter_shuf1: db 10, 4, 10, 4, 37, 6, 5, 6,103, 9, 7, 9, 72, -1, 8, -1 db 16, 4, 0, 4, 53, 6, 5, 6,119, 11, 7, 11, 95, -1, 15, -1 filter_shuf2: db 3, 4, 3, 4, 5, 6, 5, 6, 7, 2, 7, 2, 1, -1, 1, -1 filter_shuf3: db 3, 4, 3, 4, 5, 6, 5, 6, 7, 11, 7, 11; 15, -1, 15, -1 pb_127_m127: times 2 db 127, -127 ipred_v_shuf: db 0, 1, 0, 1, 4, 5, 4, 5, 8, 9, 8, 9, 12, 13, 12, 13 db 2, 3, 2, 3, 6, 7, 6, 7, 10, 11, 10, 11, 14, 15, 14, 15 ipred_h_shuf: db 7, 7, 7, 7, 3, 3, 3, 3, 5, 5, 5, 5, 1, 1, 1, 1 db 6, 6, 6, 6, 2, 2, 2, 2, 4, 4, 4, 4; 0, 0, 0, 0 pw_64: times 2 dw 64 cfl_ac_444_w16_pad1_shuffle: db 0, -1, 1, -1, 2, -1, 3, -1, 4, -1, 5, -1, 6, -1 times 9 db 7, -1 cfl_ac_w16_pad_shuffle: ; w=16, w_pad=1 db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 ; w=8, w_pad=1 as well as second half of previous one cfl_ac_w8_pad1_shuffle: db 0, 1, 2, 3, 4, 5 times 5 db 6, 7 ; w=16,w_pad=2 db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 times 8 db 14, 15 ; w=16,w_pad=3 db 0, 1, 2, 3, 4, 5 times 13 db 6, 7 pb_15to0: db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 %define pb_0to15 cfl_ac_w16_pad_shuffle %define pb_1 (ipred_h_shuf+12) %define pb_2 (ipred_h_shuf+20) %define pb_3 (ipred_h_shuf+ 4) %define pb_4 (ipred_h_shuf+24) %define pb_5 (ipred_h_shuf+ 8) %define pb_7 (ipred_h_shuf+ 0) %define pb_8 (z_upsample2 +12) %define pb_12 (z2_y_shuf_h4+20) %define pb_14 (z2_y_shuf_h4+ 4) %define pb_15 (z_filter_s +32) %define pb_27 (z2_y_shuf_h4+ 8) %define pb_31 (z2_y_shuf_h4+12) %define pb_32 (z2_y_shuf_h4+16) %define pb_90 (z2_y_shuf_h4+ 0) %define pw_1 (z2_y_shuf_h4+24) %define pw_8 (z_filter_k +32) pw_62: times 2 dw 62 pw_128: times 2 dw 128 pw_255: times 2 dw 255 pw_512: times 2 dw 512 %macro JMP_TABLE 3-* %xdefine %1_%2_table (%%table - 2*4) %xdefine %%base mangle(private_prefix %+ _%1_8bpc_%2) %%table: %rep %0 - 2 dd %%base %+ .%3 - (%%table - 2*4) %rotate 1 %endrep %endmacro %define ipred_dc_splat_avx2_table (ipred_dc_avx2_table + 10*4) %define ipred_cfl_splat_avx2_table (ipred_cfl_avx2_table + 8*4) JMP_TABLE ipred_smooth, avx2, w4, w8, w16, w32, w64 JMP_TABLE ipred_smooth_v, avx2, w4, w8, w16, w32, w64 JMP_TABLE ipred_smooth_h, avx2, w4, w8, w16, w32, w64 JMP_TABLE ipred_paeth, avx2, w4, w8, w16, w32, w64 
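; Note on the JMP_TABLE invocations: each one emits 32-bit (dd) offsets from
; the exported *_table symbol to the per-width (or per-height) entry points of
; the named function. Callers index the table with tzcnt of the block
; dimension, sign-extend the entry with movsxd, add the table address back,
; and jump (see e.g. ipred_dc_top_8bpc further down).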
JMP_TABLE ipred_filter, avx2, w4, w8, w16, w32 JMP_TABLE ipred_dc, avx2, h4, h8, h16, h32, h64, w4, w8, w16, w32, w64, \ s4-10*4, s8-10*4, s16-10*4, s32-10*4, s64-10*4 JMP_TABLE ipred_dc_left, avx2, h4, h8, h16, h32, h64 JMP_TABLE ipred_h, avx2, w4, w8, w16, w32, w64 JMP_TABLE ipred_z1, avx2, w4, w8, w16, w32, w64 JMP_TABLE ipred_z2, avx2, w4, w8, w16, w32, w64 JMP_TABLE ipred_z3, avx2, h4, h8, h16, h32, h64 JMP_TABLE ipred_cfl, avx2, h4, h8, h16, h32, w4, w8, w16, w32, \ s4-8*4, s8-8*4, s16-8*4, s32-8*4 JMP_TABLE ipred_cfl_left, avx2, h4, h8, h16, h32 JMP_TABLE ipred_cfl_ac_420, avx2, w16_pad1, w16_pad2, w16_pad3 JMP_TABLE ipred_cfl_ac_422, avx2, w16_pad1, w16_pad2, w16_pad3 JMP_TABLE ipred_cfl_ac_444, avx2, w32_pad1, w32_pad2, w32_pad3, w4, w8, w16, w32 JMP_TABLE pal_pred, avx2, w4, w8, w16, w32, w64 cextern dr_intra_derivative cextern filter_intra_taps SECTION .text INIT_YMM avx2 cglobal ipred_dc_top_8bpc, 3, 7, 6, dst, stride, tl, w, h lea r5, [ipred_dc_left_avx2_table] tzcnt wd, wm inc tlq movu m0, [tlq] movifnidn hd, hm mov r6d, 0x8000 shrx r6d, r6d, wd movd xm3, r6d movsxd r6, [r5+wq*4] pcmpeqd m2, m2 pmaddubsw m0, m2 add r6, r5 add r5, ipred_dc_splat_avx2_table-ipred_dc_left_avx2_table movsxd wq, [r5+wq*4] add wq, r5 jmp r6 cglobal ipred_dc_left_8bpc, 3, 7, 6, dst, stride, tl, w, h, stride3 mov hd, hm ; zero upper half tzcnt r6d, hd sub tlq, hq tzcnt wd, wm movu m0, [tlq] mov r5d, 0x8000 shrx r5d, r5d, r6d movd xm3, r5d lea r5, [ipred_dc_left_avx2_table] movsxd r6, [r5+r6*4] pcmpeqd m2, m2 pmaddubsw m0, m2 add r6, r5 add r5, ipred_dc_splat_avx2_table-ipred_dc_left_avx2_table movsxd wq, [r5+wq*4] add wq, r5 jmp r6 .h64: movu m1, [tlq+32] ; unaligned when jumping here from dc_top pmaddubsw m1, m2 paddw m0, m1 .h32: vextracti128 xm1, m0, 1 paddw xm0, xm1 .h16: punpckhqdq xm1, xm0, xm0 paddw xm0, xm1 .h8: psrlq xm1, xm0, 32 paddw xm0, xm1 .h4: pmaddwd xm0, xm2 pmulhrsw xm0, xm3 lea stride3q, [strideq*3] vpbroadcastb m0, xm0 mova m1, m0 jmp wq cglobal ipred_dc_8bpc, 3, 7, 6, dst, stride, tl, w, h, stride3 movifnidn hd, hm movifnidn wd, wm tzcnt r6d, hd lea r5d, [wq+hq] movd xm4, r5d tzcnt r5d, r5d movd xm5, r5d lea r5, [ipred_dc_avx2_table] tzcnt wd, wd movsxd r6, [r5+r6*4] movsxd wq, [r5+wq*4+5*4] pcmpeqd m3, m3 psrlw xm4, 1 add r6, r5 add wq, r5 lea stride3q, [strideq*3] jmp r6 .h4: movd xm0, [tlq-4] pmaddubsw xm0, xm3 jmp wq .w4: movd xm1, [tlq+1] pmaddubsw xm1, xm3 psubw xm0, xm4 paddw xm0, xm1 pmaddwd xm0, xm3 cmp hd, 4 jg .w4_mul psrlw xm0, 3 jmp .w4_end .w4_mul: punpckhqdq xm1, xm0, xm0 lea r2d, [hq*2] mov r6d, 0x55563334 paddw xm0, xm1 shrx r6d, r6d, r2d psrlq xm1, xm0, 32 paddw xm0, xm1 movd xm1, r6d psrlw xm0, 2 pmulhuw xm0, xm1 .w4_end: vpbroadcastb xm0, xm0 .s4: movd [dstq+strideq*0], xm0 movd [dstq+strideq*1], xm0 movd [dstq+strideq*2], xm0 movd [dstq+stride3q ], xm0 lea dstq, [dstq+strideq*4] sub hd, 4 jg .s4 RET ALIGN function_align .h8: movq xm0, [tlq-8] pmaddubsw xm0, xm3 jmp wq .w8: movq xm1, [tlq+1] vextracti128 xm2, m0, 1 pmaddubsw xm1, xm3 psubw xm0, xm4 paddw xm0, xm2 punpckhqdq xm2, xm0, xm0 paddw xm0, xm2 paddw xm0, xm1 psrlq xm1, xm0, 32 paddw xm0, xm1 pmaddwd xm0, xm3 psrlw xm0, xm5 cmp hd, 8 je .w8_end mov r6d, 0x5556 mov r2d, 0x3334 cmp hd, 32 cmove r6d, r2d movd xm1, r6d pmulhuw xm0, xm1 .w8_end: vpbroadcastb xm0, xm0 .s8: movq [dstq+strideq*0], xm0 movq [dstq+strideq*1], xm0 movq [dstq+strideq*2], xm0 movq [dstq+stride3q ], xm0 lea dstq, [dstq+strideq*4] sub hd, 4 jg .s8 RET ALIGN function_align .h16: mova xm0, [tlq-16] pmaddubsw xm0, xm3 jmp wq .w16: movu 
xm1, [tlq+1] vextracti128 xm2, m0, 1 pmaddubsw xm1, xm3 psubw xm0, xm4 paddw xm0, xm2 paddw xm0, xm1 punpckhqdq xm1, xm0, xm0 paddw xm0, xm1 psrlq xm1, xm0, 32 paddw xm0, xm1 pmaddwd xm0, xm3 psrlw xm0, xm5 cmp hd, 16 je .w16_end mov r6d, 0x5556 mov r2d, 0x3334 test hb, 8|32 cmovz r6d, r2d movd xm1, r6d pmulhuw xm0, xm1 .w16_end: vpbroadcastb xm0, xm0 .s16: mova [dstq+strideq*0], xm0 mova [dstq+strideq*1], xm0 mova [dstq+strideq*2], xm0 mova [dstq+stride3q ], xm0 lea dstq, [dstq+strideq*4] sub hd, 4 jg .s16 RET ALIGN function_align .h32: mova m0, [tlq-32] pmaddubsw m0, m3 jmp wq .w32: movu m1, [tlq+1] pmaddubsw m1, m3 paddw m0, m1 vextracti128 xm1, m0, 1 psubw xm0, xm4 paddw xm0, xm1 punpckhqdq xm1, xm0, xm0 paddw xm0, xm1 psrlq xm1, xm0, 32 paddw xm0, xm1 pmaddwd xm0, xm3 psrlw xm0, xm5 cmp hd, 32 je .w32_end lea r2d, [hq*2] mov r6d, 0x33345556 shrx r6d, r6d, r2d movd xm1, r6d pmulhuw xm0, xm1 .w32_end: vpbroadcastb m0, xm0 .s32: mova [dstq+strideq*0], m0 mova [dstq+strideq*1], m0 mova [dstq+strideq*2], m0 mova [dstq+stride3q ], m0 lea dstq, [dstq+strideq*4] sub hd, 4 jg .s32 RET ALIGN function_align .h64: mova m0, [tlq-64] mova m1, [tlq-32] pmaddubsw m0, m3 pmaddubsw m1, m3 paddw m0, m1 jmp wq .w64: movu m1, [tlq+ 1] movu m2, [tlq+33] pmaddubsw m1, m3 pmaddubsw m2, m3 paddw m0, m1 paddw m0, m2 vextracti128 xm1, m0, 1 psubw xm0, xm4 paddw xm0, xm1 punpckhqdq xm1, xm0, xm0 paddw xm0, xm1 psrlq xm1, xm0, 32 paddw xm0, xm1 pmaddwd xm0, xm3 psrlw xm0, xm5 cmp hd, 64 je .w64_end mov r6d, 0x33345556 shrx r6d, r6d, hd movd xm1, r6d pmulhuw xm0, xm1 .w64_end: vpbroadcastb m0, xm0 mova m1, m0 .s64: mova [dstq+strideq*0+32*0], m0 mova [dstq+strideq*0+32*1], m1 mova [dstq+strideq*1+32*0], m0 mova [dstq+strideq*1+32*1], m1 mova [dstq+strideq*2+32*0], m0 mova [dstq+strideq*2+32*1], m1 mova [dstq+stride3q +32*0], m0 mova [dstq+stride3q +32*1], m1 lea dstq, [dstq+strideq*4] sub hd, 4 jg .s64 RET cglobal ipred_dc_128_8bpc, 2, 7, 6, dst, stride, tl, w, h, stride3 lea r5, [ipred_dc_splat_avx2_table] tzcnt wd, wm movifnidn hd, hm movsxd wq, [r5+wq*4] vpbroadcastd m0, [r5-ipred_dc_splat_avx2_table+pb_128] mova m1, m0 add wq, r5 lea stride3q, [strideq*3] jmp wq cglobal ipred_v_8bpc, 3, 7, 6, dst, stride, tl, w, h, stride3 lea r5, [ipred_dc_splat_avx2_table] tzcnt wd, wm movu m0, [tlq+ 1] movu m1, [tlq+33] movifnidn hd, hm movsxd wq, [r5+wq*4] add wq, r5 lea stride3q, [strideq*3] jmp wq %macro IPRED_H 2 ; w, store_type vpbroadcastb m0, [tlq-1] vpbroadcastb m1, [tlq-2] vpbroadcastb m2, [tlq-3] sub tlq, 4 vpbroadcastb m3, [tlq+0] mov%2 [dstq+strideq*0], m0 mov%2 [dstq+strideq*1], m1 mov%2 [dstq+strideq*2], m2 mov%2 [dstq+stride3q ], m3 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w%1 RET ALIGN function_align %endmacro INIT_XMM avx2 cglobal ipred_h_8bpc, 3, 6, 4, dst, stride, tl, w, h, stride3 lea r5, [ipred_h_avx2_table] tzcnt wd, wm movifnidn hd, hm movsxd wq, [r5+wq*4] add wq, r5 lea stride3q, [strideq*3] jmp wq .w4: IPRED_H 4, d .w8: IPRED_H 8, q .w16: IPRED_H 16, a INIT_YMM avx2 .w32: IPRED_H 32, a .w64: vpbroadcastb m0, [tlq-1] vpbroadcastb m1, [tlq-2] vpbroadcastb m2, [tlq-3] sub tlq, 4 vpbroadcastb m3, [tlq+0] mova [dstq+strideq*0+32*0], m0 mova [dstq+strideq*0+32*1], m0 mova [dstq+strideq*1+32*0], m1 mova [dstq+strideq*1+32*1], m1 mova [dstq+strideq*2+32*0], m2 mova [dstq+strideq*2+32*1], m2 mova [dstq+stride3q +32*0], m3 mova [dstq+stride3q +32*1], m3 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w64 RET %macro PAETH 2 ; top, ldiff pavgb m1, m%1, m3 ; Calculating tldiff normally requires pxor m0, m%1, m3 ; 
10-bit intermediates, but we can do it pand m0, m4 ; in 8-bit with some tricks which avoids psubusb m2, m5, m1 ; having to unpack everything to 16-bit. psubb m1, m0 psubusb m1, m5 por m1, m2 paddusb m1, m1 por m1, m0 ; min(tldiff, 255) psubusb m2, m5, m3 psubusb m0, m3, m5 por m2, m0 ; tdiff pminub m2, m%2 pcmpeqb m0, m%2, m2 ; ldiff <= tdiff vpblendvb m0, m%1, m3, m0 pminub m1, m2 pcmpeqb m1, m2 ; ldiff <= tldiff || tdiff <= tldiff vpblendvb m0, m5, m0, m1 %endmacro cglobal ipred_paeth_8bpc, 3, 6, 9, dst, stride, tl, w, h %define base r5-ipred_paeth_avx2_table lea r5, [ipred_paeth_avx2_table] tzcnt wd, wm vpbroadcastb m5, [tlq] ; topleft movifnidn hd, hm movsxd wq, [r5+wq*4] vpbroadcastd m4, [base+pb_1] add wq, r5 jmp wq .w4: vpbroadcastd m6, [tlq+1] ; top mova m8, [base+ipred_h_shuf] lea r3, [strideq*3] psubusb m7, m5, m6 psubusb m0, m6, m5 por m7, m0 ; ldiff .w4_loop: sub tlq, 8 vpbroadcastq m3, [tlq] pshufb m3, m8 ; left PAETH 6, 7 vextracti128 xm1, m0, 1 movd [dstq+strideq*0], xm0 movd [dstq+strideq*1], xm1 pextrd [dstq+strideq*2], xm0, 2 pextrd [dstq+r3 ], xm1, 2 cmp hd, 4 je .ret lea dstq, [dstq+strideq*4] pextrd [dstq+strideq*0], xm0, 1 pextrd [dstq+strideq*1], xm1, 1 pextrd [dstq+strideq*2], xm0, 3 pextrd [dstq+r3 ], xm1, 3 lea dstq, [dstq+strideq*4] sub hd, 8 jg .w4_loop .ret: RET ALIGN function_align .w8: vpbroadcastq m6, [tlq+1] mova m8, [base+ipred_h_shuf] lea r3, [strideq*3] psubusb m7, m5, m6 psubusb m0, m6, m5 por m7, m0 .w8_loop: sub tlq, 4 vpbroadcastd m3, [tlq] pshufb m3, m8 PAETH 6, 7 vextracti128 xm1, m0, 1 movq [dstq+strideq*0], xm0 movq [dstq+strideq*1], xm1 movhps [dstq+strideq*2], xm0 movhps [dstq+r3 ], xm1 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w8_loop RET ALIGN function_align .w16: vbroadcasti128 m6, [tlq+1] mova xm8, xm4 ; lower half = 1, upper half = 0 psubusb m7, m5, m6 psubusb m0, m6, m5 por m7, m0 .w16_loop: sub tlq, 2 vpbroadcastd m3, [tlq] pshufb m3, m8 PAETH 6, 7 mova [dstq+strideq*0], xm0 vextracti128 [dstq+strideq*1], m0, 1 lea dstq, [dstq+strideq*2] sub hd, 2 jg .w16_loop RET ALIGN function_align .w32: movu m6, [tlq+1] psubusb m7, m5, m6 psubusb m0, m6, m5 por m7, m0 .w32_loop: dec tlq vpbroadcastb m3, [tlq] PAETH 6, 7 mova [dstq], m0 add dstq, strideq dec hd jg .w32_loop RET ALIGN function_align .w64: movu m6, [tlq+ 1] movu m7, [tlq+33] %if WIN64 movaps r4m, xmm9 %endif psubusb m8, m5, m6 psubusb m0, m6, m5 psubusb m9, m5, m7 psubusb m1, m7, m5 por m8, m0 por m9, m1 .w64_loop: dec tlq vpbroadcastb m3, [tlq] PAETH 6, 8 mova [dstq+32*0], m0 PAETH 7, 9 mova [dstq+32*1], m0 add dstq, strideq dec hd jg .w64_loop %if WIN64 movaps xmm9, r4m %endif RET %macro SMOOTH 6 ; src[1-2], mul[1-2], add[1-2] ; w * a = (w - 128) * a + 128 * a ; (256 - w) * b = (127 - w) * b + 129 * b pmaddubsw m0, m%3, m%1 pmaddubsw m1, m%4, m%2 paddw m0, m%5 paddw m1, m%6 psrlw m0, 8 psrlw m1, 8 packuswb m0, m1 %endmacro cglobal ipred_smooth_v_8bpc, 3, 7, 0, dst, stride, tl, w, h, weights %define base r6-ipred_smooth_v_avx2_table lea r6, [ipred_smooth_v_avx2_table] tzcnt wd, wm mov hd, hm movsxd wq, [r6+wq*4] vpbroadcastd m0, [base+pb_127_m127] vpbroadcastd m1, [base+pw_128] lea weightsq, [base+smooth_weights+hq*4] neg hq vpbroadcastb m5, [tlq+hq] ; bottom add wq, r6 jmp wq .w4: vpbroadcastd m2, [tlq+1] punpcklbw m2, m5 ; top, bottom mova m5, [base+ipred_v_shuf] lea r3, [strideq*3] punpckldq m4, m5, m5 punpckhdq m5, m5 pmaddubsw m3, m2, m0 paddw m1, m2 ; 1 * top + 256 * bottom + 128, overflow is ok paddw m3, m1 ; 128 * top + 129 * bottom + 128 .w4_loop: vbroadcasti128 m1, 
[weightsq+hq*2] pshufb m0, m1, m4 pshufb m1, m5 SMOOTH 0, 1, 2, 2, 3, 3 vextracti128 xm1, m0, 1 movd [dstq+strideq*0], xm0 movd [dstq+strideq*1], xm1 pextrd [dstq+strideq*2], xm0, 1 pextrd [dstq+r3 ], xm1, 1 cmp hd, -4 je .ret lea dstq, [dstq+strideq*4] pextrd [dstq+strideq*0], xm0, 2 pextrd [dstq+strideq*1], xm1, 2 pextrd [dstq+strideq*2], xm0, 3 pextrd [dstq+r3 ], xm1, 3 lea dstq, [dstq+strideq*4] add hq, 8 jl .w4_loop .ret: RET ALIGN function_align .w8: vpbroadcastq m2, [tlq+1] punpcklbw m2, m5 mova m5, [base+ipred_v_shuf] lea r3, [strideq*3] pshufd m4, m5, q0000 pshufd m5, m5, q1111 pmaddubsw m3, m2, m0 paddw m1, m2 paddw m3, m1 .w8_loop: vpbroadcastq m1, [weightsq+hq*2] pshufb m0, m1, m4 pshufb m1, m5 SMOOTH 0, 1, 2, 2, 3, 3 vextracti128 xm1, m0, 1 movq [dstq+strideq*0], xm0 movq [dstq+strideq*1], xm1 movhps [dstq+strideq*2], xm0 movhps [dstq+r3 ], xm1 lea dstq, [dstq+strideq*4] add hq, 4 jl .w8_loop RET ALIGN function_align .w16: WIN64_SPILL_XMM 7 vbroadcasti128 m3, [tlq+1] mova m6, [base+ipred_v_shuf] punpcklbw m2, m3, m5 punpckhbw m3, m5 pmaddubsw m4, m2, m0 pmaddubsw m5, m3, m0 paddw m0, m1, m2 paddw m1, m3 paddw m4, m0 paddw m5, m1 .w16_loop: vpbroadcastd m1, [weightsq+hq*2] pshufb m1, m6 SMOOTH 1, 1, 2, 3, 4, 5 mova [dstq+strideq*0], xm0 vextracti128 [dstq+strideq*1], m0, 1 lea dstq, [dstq+strideq*2] add hq, 2 jl .w16_loop RET ALIGN function_align .w32: %assign stack_offset stack_offset - stack_size_padded WIN64_SPILL_XMM 6 movu m3, [tlq+1] punpcklbw m2, m3, m5 punpckhbw m3, m5 pmaddubsw m4, m2, m0 pmaddubsw m5, m3, m0 paddw m0, m1, m2 paddw m1, m3 paddw m4, m0 paddw m5, m1 .w32_loop: vpbroadcastw m1, [weightsq+hq*2] SMOOTH 1, 1, 2, 3, 4, 5 mova [dstq], m0 add dstq, strideq inc hq jl .w32_loop RET ALIGN function_align .w64: WIN64_SPILL_XMM 11 movu m4, [tlq+ 1] movu m8, [tlq+33] punpcklbw m3, m4, m5 punpckhbw m4, m5 punpcklbw m7, m8, m5 punpckhbw m8, m5 pmaddubsw m5, m3, m0 pmaddubsw m6, m4, m0 pmaddubsw m9, m7, m0 pmaddubsw m10, m8, m0 paddw m2, m1, m3 paddw m5, m2 paddw m2, m1, m4 paddw m6, m2 paddw m0, m1, m7 paddw m9, m0 paddw m1, m8 paddw m10, m1 .w64_loop: vpbroadcastw m2, [weightsq+hq*2] SMOOTH 2, 2, 3, 4, 5, 6 mova [dstq+32*0], m0 SMOOTH 2, 2, 7, 8, 9, 10 mova [dstq+32*1], m0 add dstq, strideq inc hq jl .w64_loop RET %macro SETUP_STACK_FRAME 3 ; stack_size, regs_used, xmm_regs_used %assign stack_offset 0 %assign stack_size_padded 0 %assign regs_used %2 %xdefine rstk rsp SETUP_STACK_POINTER %1 %if regs_used != %2 && WIN64 PUSH r%2 %endif ALLOC_STACK %1, %3 %endmacro cglobal ipred_smooth_h_8bpc, 3, 7, 0, dst, stride, tl, w, h %define base r6-ipred_smooth_h_avx2_table lea r6, [ipred_smooth_h_avx2_table] mov wd, wm vpbroadcastb m3, [tlq+wq] ; right tzcnt wd, wd mov hd, hm movsxd wq, [r6+wq*4] vpbroadcastd m4, [base+pb_127_m127] vpbroadcastd m5, [base+pw_128] add wq, r6 jmp wq .w4: WIN64_SPILL_XMM 8 vpbroadcastq m6, [base+smooth_weights+4*2] mova m7, [base+ipred_h_shuf] sub tlq, 8 sub tlq, hq lea r3, [strideq*3] .w4_loop: vpbroadcastq m2, [tlq+hq] pshufb m2, m7 punpcklbw m1, m2, m3 ; left, right punpckhbw m2, m3 pmaddubsw m0, m1, m4 ; 127 * left - 127 * right paddw m0, m1 ; 128 * left + 129 * right pmaddubsw m1, m6 paddw m1, m5 paddw m0, m1 pmaddubsw m1, m2, m4 paddw m1, m2 pmaddubsw m2, m6 paddw m2, m5 paddw m1, m2 psrlw m0, 8 psrlw m1, 8 packuswb m0, m1 vextracti128 xm1, m0, 1 movd [dstq+strideq*0], xm0 movd [dstq+strideq*1], xm1 pextrd [dstq+strideq*2], xm0, 2 pextrd [dstq+r3 ], xm1, 2 cmp hd, 4 je .ret lea dstq, [dstq+strideq*4] pextrd [dstq+strideq*0], xm0, 1 pextrd 
[dstq+strideq*1], xm1, 1 pextrd [dstq+strideq*2], xm0, 3 pextrd [dstq+r3 ], xm1, 3 lea dstq, [dstq+strideq*4] sub hd, 8 jg .w4_loop .ret: RET ALIGN function_align .w8: %assign stack_offset stack_offset - stack_size_padded WIN64_SPILL_XMM 8 vbroadcasti128 m6, [base+smooth_weights+8*2] mova m7, [base+ipred_h_shuf] sub tlq, 4 lea r3, [strideq*3] sub tlq, hq .w8_loop: vpbroadcastd m2, [tlq+hq] pshufb m2, m7 punpcklbw m1, m2, m3 punpckhbw m2, m3 pmaddubsw m0, m1, m4 paddw m0, m1 pmaddubsw m1, m6 paddw m1, m5 paddw m0, m1 pmaddubsw m1, m2, m4 paddw m1, m2 pmaddubsw m2, m6 paddw m2, m5 paddw m1, m2 psrlw m0, 8 psrlw m1, 8 packuswb m0, m1 vextracti128 xm1, m0, 1 movq [dstq+strideq*0], xm0 movq [dstq+strideq*1], xm1 movhps [dstq+strideq*2], xm0 movhps [dstq+r3 ], xm1 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w8_loop RET ALIGN function_align .w16: SETUP_STACK_FRAME 32*4, 7, 8 lea r3, [rsp+64*2-4] call .prep ; only worthwhile for for w16 and above sub tlq, 2 vpbroadcastd xm6, [base+pb_1] mova xm7, [base+ipred_v_shuf+16] vinserti128 m7, [base+ipred_v_shuf+ 0], 1 vbroadcasti128 m4, [base+smooth_weights+16*2] vbroadcasti128 m5, [base+smooth_weights+16*3] .w16_loop: vpbroadcastd m1, [tlq+hq] vpbroadcastd m2, [r3+hq*2] pshufb m1, m6 punpcklbw m1, m3 pshufb m2, m7 SMOOTH 4, 5, 1, 1, 2, 2 mova [dstq+strideq*0], xm0 vextracti128 [dstq+strideq*1], m0, 1 lea dstq, [dstq+strideq*2] sub hd, 2 jg .w16_loop RET ALIGN function_align .w32: SETUP_STACK_FRAME 32*4, 7, 6 lea r3, [rsp+64*2-2] call .prep dec tlq mova xm4, [base+smooth_weights+16*4] vinserti128 m4, [base+smooth_weights+16*6], 1 mova xm5, [base+smooth_weights+16*5] vinserti128 m5, [base+smooth_weights+16*7], 1 .w32_loop: vpbroadcastb m1, [tlq+hq] punpcklbw m1, m3 vpbroadcastw m2, [r3+hq*2] SMOOTH 4, 5, 1, 1, 2, 2 mova [dstq], m0 add dstq, strideq dec hd jg .w32_loop RET ALIGN function_align .w64: SETUP_STACK_FRAME 32*4, 7, 9 lea r3, [rsp+64*2-2] call .prep add r6, smooth_weights+16*15-ipred_smooth_h_avx2_table dec tlq mova xm5, [r6-16*7] vinserti128 m5, [r6-16*5], 1 mova xm6, [r6-16*6] vinserti128 m6, [r6-16*4], 1 mova xm7, [r6-16*3] vinserti128 m7, [r6-16*1], 1 mova xm8, [r6-16*2] vinserti128 m8, [r6-16*0], 1 .w64_loop: vpbroadcastb m2, [tlq+hq] punpcklbw m2, m3 vpbroadcastw m4, [r3+hq*2] SMOOTH 5, 6, 2, 2, 4, 4 mova [dstq+32*0], m0 SMOOTH 7, 8, 2, 2, 4, 4 mova [dstq+32*1], m0 add dstq, strideq dec hd jg .w64_loop RET ALIGN function_align .prep: vpermq m2, [tlq-32*1], q3120 punpckhbw m1, m2, m3 punpcklbw m2, m3 pmaddubsw m0, m1, m4 ; 127 * left - 127 * right paddw m1, m5 ; 1 * left + 256 * right + 128 paddw m0, m1 ; 128 * left + 129 * right + 128 pmaddubsw m1, m2, m4 paddw m2, m5 paddw m1, m2 vpermq m2, [tlq-32*2], q3120 mova [rsp+gprsize+32*3], m0 mova [rsp+gprsize+32*2], m1 punpckhbw m1, m2, m3 punpcklbw m2, m3 pmaddubsw m0, m1, m4 paddw m1, m5 paddw m0, m1 pmaddubsw m1, m2, m4 paddw m2, m5 paddw m1, m2 mova [rsp+gprsize+32*1], m0 mova [rsp+gprsize+32*0], m1 sub r3, hq sub tlq, hq sub r3, hq ret %macro SMOOTH_2D_END 6 ; src[1-2], mul[1-2], add[1-2] pmaddubsw m0, m%3, m%1 pmaddubsw m1, m%4, m%2 %ifnum %5 paddw m0, m%5 %else paddw m0, %5 %endif %ifnum %6 paddw m1, m%6 %else paddw m1, %6 %endif pavgw m0, m2 pavgw m1, m3 psrlw m0, 8 psrlw m1, 8 packuswb m0, m1 %endmacro cglobal ipred_smooth_8bpc, 3, 7, 0, dst, stride, tl, w, h, v_weights %define base r6-ipred_smooth_avx2_table lea r6, [ipred_smooth_avx2_table] mov wd, wm vpbroadcastb m4, [tlq+wq] ; right tzcnt wd, wd mov hd, hm mov r5, tlq sub r5, hq movsxd wq, [r6+wq*4] vpbroadcastd m5, [base+pb_127_m127] 
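; Like ipred_smooth_v/ipred_smooth_h above, the 2D smooth predictor relies on
; the pmaddubsw identity noted in the SMOOTH macro: pmaddubsw needs a
; signed-byte operand, so a 0..255 blend weight w is stored in smooth_weights
; as the pair (w-128, 127-w), and the remaining constant part is precomputed
; once per block using the 127/-127 constant loaded above:
;   w*a + (256-w)*b = (w-128)*a + (127-w)*b + 128*a + 129*b
; Worked example with w = 149, a = 200, b = 100:
;   149*200 + 107*100 = 40500 = 21*200 - 22*100 + 128*200 + 129*100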
vpbroadcastb m0, [r5] ; bottom vpbroadcastd m3, [base+pw_255] add wq, r6 lea v_weightsq, [base+smooth_weights+hq*2] jmp wq .w4: WIN64_SPILL_XMM 12 mova m10, [base+ipred_h_shuf] vpbroadcastq m11, [base+smooth_weights+4*2] mova m7, [base+ipred_v_shuf] vpbroadcastd m8, [tlq+1] sub tlq, 8 lea r3, [strideq*3] sub tlq, hq punpcklbw m8, m0 ; top, bottom pshufd m6, m7, q2200 pshufd m7, m7, q3311 pmaddubsw m9, m8, m5 paddw m3, m8 ; 1 * top + 255 * bottom + 255 paddw m9, m3 ; 128 * top + 129 * bottom + 255 .w4_loop: vpbroadcastq m1, [tlq+hq] pshufb m1, m10 punpcklbw m0, m1, m4 ; left, right punpckhbw m1, m4 pmaddubsw m2, m0, m5 ; 127 * left - 127 * right pmaddubsw m3, m1, m5 paddw m2, m0 ; 128 * left + 129 * right paddw m3, m1 pmaddubsw m0, m11 pmaddubsw m1, m11 paddw m2, m0 paddw m3, m1 vbroadcasti128 m1, [v_weightsq] add v_weightsq, 16 pshufb m0, m1, m6 pshufb m1, m7 SMOOTH_2D_END 0, 1, 8, 8, 9, 9 vextracti128 xm1, m0, 1 movd [dstq+strideq*0], xm0 movd [dstq+strideq*1], xm1 pextrd [dstq+strideq*2], xm0, 2 pextrd [dstq+r3 ], xm1, 2 cmp hd, 4 je .ret lea dstq, [dstq+strideq*4] pextrd [dstq+strideq*0], xm0, 1 pextrd [dstq+strideq*1], xm1, 1 pextrd [dstq+strideq*2], xm0, 3 pextrd [dstq+r3 ], xm1, 3 lea dstq, [dstq+strideq*4] sub hd, 8 jg .w4_loop .ret: RET ALIGN function_align .w8: %assign stack_offset stack_offset - stack_size_padded WIN64_SPILL_XMM 12 mova m10, [base+ipred_h_shuf] vbroadcasti128 m11, [base+smooth_weights+8*2] mova m7, [base+ipred_v_shuf] vpbroadcastq m8, [tlq+1] sub tlq, 4 lea r3, [strideq*3] sub tlq, hq punpcklbw m8, m0 pshufd m6, m7, q0000 pshufd m7, m7, q1111 pmaddubsw m9, m8, m5 paddw m3, m8 paddw m9, m3 .w8_loop: vpbroadcastd m1, [tlq+hq] pshufb m1, m10 punpcklbw m0, m1, m4 punpckhbw m1, m4 pmaddubsw m2, m0, m5 pmaddubsw m3, m1, m5 paddw m2, m0 paddw m3, m1 pmaddubsw m0, m11 pmaddubsw m1, m11 paddw m2, m0 paddw m3, m1 vpbroadcastq m1, [v_weightsq] add v_weightsq, 8 pshufb m0, m1, m6 pshufb m1, m7 SMOOTH_2D_END 0, 1, 8, 8, 9, 9 vextracti128 xm1, m0, 1 movq [dstq+strideq*0], xm0 movq [dstq+strideq*1], xm1 movhps [dstq+strideq*2], xm0 movhps [dstq+r3 ], xm1 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w8_loop RET ALIGN function_align .w16: SETUP_STACK_FRAME 32*4, 7, 14 vbroadcasti128 m11, [tlq+1] lea r3, [rsp+64*2-4] punpcklbw m10, m11, m0 ; top, bottom punpckhbw m11, m0 call .prep_v sub tlq, 2 pmaddubsw m12, m10, m5 pmaddubsw m13, m11, m5 vpbroadcastd xm5, [base+pb_1] mova m9, [base+ipred_v_shuf] vbroadcasti128 m6, [base+smooth_weights+16*2] vbroadcasti128 m7, [base+smooth_weights+16*3] vperm2i128 m8, m9, m9, 0x01 paddw m0, m10, m3 paddw m3, m11 paddw m12, m0 paddw m13, m3 .w16_loop: vpbroadcastd m3, [tlq+hq] vpbroadcastd m0, [r3+hq*2] vpbroadcastd m1, [v_weightsq] add v_weightsq, 4 pshufb m3, m5 punpcklbw m3, m4 ; left, right pmaddubsw m2, m3, m6 pmaddubsw m3, m7 pshufb m0, m8 pshufb m1, m9 paddw m2, m0 paddw m3, m0 SMOOTH_2D_END 1, 1, 10, 11, 12, 13 mova [dstq+strideq*0], xm0 vextracti128 [dstq+strideq*1], m0, 1 lea dstq, [dstq+strideq*2] sub hd, 2 jg .w16_loop RET ALIGN function_align .w32: SETUP_STACK_FRAME 32*4, 7, 11 movu m8, [tlq+1] lea r3, [rsp+64*2-2] punpcklbw m7, m8, m0 punpckhbw m8, m0 call .prep_v dec tlq pmaddubsw m9, m7, m5 pmaddubsw m10, m8, m5 mova xm5, [base+smooth_weights+16*4] vinserti128 m5, [base+smooth_weights+16*6], 1 mova xm6, [base+smooth_weights+16*5] vinserti128 m6, [base+smooth_weights+16*7], 1 paddw m0, m7, m3 paddw m3, m8 paddw m9, m0 paddw m10, m3 .w32_loop: vpbroadcastb m3, [tlq+hq] punpcklbw m3, m4 vpbroadcastw m0, [r3+hq*2] vpbroadcastw m1, 
[v_weightsq] add v_weightsq, 2 pmaddubsw m2, m3, m5 pmaddubsw m3, m6 paddw m2, m0 paddw m3, m0 SMOOTH_2D_END 1, 1, 7, 8, 9, 10 mova [dstq], m0 add dstq, strideq dec hd jg .w32_loop RET ALIGN function_align .w64: SETUP_STACK_FRAME 32*8, 7, 16 movu m13, [tlq+1 ] movu m15, [tlq+33] add r6, smooth_weights+16*15-ipred_smooth_avx2_table lea r3, [rsp+64*2-2] punpcklbw m12, m13, m0 punpckhbw m13, m0 punpcklbw m14, m15, m0 punpckhbw m15, m0 call .prep_v dec tlq pmaddubsw m0, m12, m5 pmaddubsw m1, m13, m5 pmaddubsw m2, m14, m5 pmaddubsw m5, m15, m5 mova xm8, [r6-16*7] vinserti128 m8, [r6-16*5], 1 mova xm9, [r6-16*6] vinserti128 m9, [r6-16*4], 1 mova xm10, [r6-16*3] vinserti128 m10, [r6-16*1], 1 mova xm11, [r6-16*2] vinserti128 m11, [r6-16*0], 1 lea r6, [rsp+32*4] paddw m0, m3 paddw m1, m3 paddw m2, m3 paddw m3, m5 paddw m0, m12 paddw m1, m13 paddw m2, m14 paddw m3, m15 mova [r6+32*0], m0 mova [r6+32*1], m1 mova [r6+32*2], m2 mova [r6+32*3], m3 .w64_loop: vpbroadcastb m5, [tlq+hq] punpcklbw m5, m4 vpbroadcastw m6, [r3+hq*2] vpbroadcastw m7, [v_weightsq] add v_weightsq, 2 pmaddubsw m2, m5, m8 pmaddubsw m3, m5, m9 paddw m2, m6 paddw m3, m6 SMOOTH_2D_END 7, 7, 12, 13, [r6+32*0], [r6+32*1] mova [dstq+32*0], m0 pmaddubsw m2, m5, m10 pmaddubsw m3, m5, m11 paddw m2, m6 paddw m3, m6 SMOOTH_2D_END 7, 7, 14, 15, [r6+32*2], [r6+32*3] mova [dstq+32*1], m0 add dstq, strideq dec hd jg .w64_loop RET ALIGN function_align .prep_v: vpermq m2, [tlq-32*1], q3120 punpckhbw m1, m2, m4 punpcklbw m2, m4 pmaddubsw m0, m1, m5 ; 127 * left - 127 * right paddw m0, m1 ; 128 * left + 129 * right pmaddubsw m1, m2, m5 paddw m1, m2 vpermq m2, [tlq-32*2], q3120 mova [rsp+gprsize+32*3], m0 mova [rsp+gprsize+32*2], m1 punpckhbw m1, m2, m4 punpcklbw m2, m4 pmaddubsw m0, m1, m5 paddw m0, m1 pmaddubsw m1, m2, m5 paddw m1, m2 mova [rsp+gprsize+32*1], m0 mova [rsp+gprsize+32*0], m1 sub r3, hq sub tlq, hq sub r3, hq ret cglobal ipred_z1_8bpc, 3, 8, 0, dst, stride, tl, w, h, angle, dx, maxbase %assign org_stack_offset stack_offset lea r6, [ipred_z1_avx2_table] tzcnt wd, wm movifnidn angled, anglem movifnidn hd, hm lea r7, [dr_intra_derivative] inc tlq movsxd wq, [r6+wq*4] add wq, r6 mov dxd, angled and dxd, 0x7e add angled, 165 ; ~90 movzx dxd, word [r7+dxq] xor angled, 0x4ff ; d = 90 - angle vpbroadcastd m3, [pw_512] vpbroadcastd m4, [pw_62] vpbroadcastd m5, [pw_64] jmp wq .w4: cmp angleb, 40 jae .w4_no_upsample lea r3d, [angleq-1024] sar r3d, 7 add r3d, hd jg .w4_no_upsample ; !enable_intra_edge_filter || h > 8 || (h == 8 && is_sm) ALLOC_STACK -32, 8 mova xm1, [tlq-1] pshufb xm0, xm1, [z_upsample1] pshufb xm1, [z_upsample2] vpbroadcastd xm2, [pb_36_m4] ; upshifted by 2 to be able to reuse add dxd, dxd ; pw_512 (which is already in m3) pmaddubsw xm0, xm2 ; for rounding instead of pw_2048 pextrd [rsp+16], xm1, 3 ; top[max_base_x] pmaddubsw xm1, xm2 movd xm7, dxd mov r3d, dxd ; xpos vpbroadcastw m7, xm7 paddw xm1, xm0 movq xm0, [tlq] pmulhrsw xm1, xm3 pslldq m6, m7, 8 paddw xm2, xm7, xm7 lea r2, [strideq*3] paddw m6, m7 packuswb xm1, xm1 paddw m6, m2 ; xpos2 xpos3 xpos0 xpos1 punpcklbw xm0, xm1 psllw m7, 2 mova [rsp], xm0 .w4_upsample_loop: lea r5d, [r3+dxq] shr r3d, 6 ; base0 vpbroadcastq m1, [rsp+r3] lea r3d, [r5+dxq] shr r5d, 6 ; base1 vpbroadcastq m2, [rsp+r5] lea r5d, [r3+dxq] shr r3d, 6 ; base2 movq xm0, [rsp+r3] lea r3d, [r5+dxq] shr r5d, 6 ; base3 movhps xm0, [rsp+r5] vpblendd m1, m2, 0xc0 pand m2, m4, m6 ; frac vpblendd m0, m1, 0xf0 psubw m1, m5, m2 ; 64-frac psllw m2, 8 por m1, m2 ; 64-frac, frac pmaddubsw m0, m1 paddw m6, m7 ; xpos 
+= dx pmulhrsw m0, m3 packuswb m0, m0 vextracti128 xm1, m0, 1 movd [dstq+strideq*2], xm0 pextrd [dstq+r2 ], xm0, 1 movd [dstq+strideq*0], xm1 pextrd [dstq+strideq*1], xm1, 1 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w4_upsample_loop RET ALIGN function_align .filter_strength: ; w4/w8/w16 ; The C version uses a lot of branches, but we can do all the comparisons ; in parallel and use popcnt to get the final filter strength value. %define base r3-z_filter_t0 lea r3, [z_filter_t0] movd xm0, maxbased movd xm2, angled shr angled, 8 ; is_sm << 1 vpbroadcastb m0, xm0 vpbroadcastb m2, xm2 pcmpeqb m1, m0, [base+z_filter_wh] pand m1, m2 mova xm2, [r3+angleq*8] ; upper ymm half zero in both cases pcmpgtb m1, m2 pmovmskb r5d, m1 ret .w4_no_upsample: %assign stack_offset org_stack_offset ALLOC_STACK -16, 11 mov maxbased, 7 test angled, 0x400 ; !enable_intra_edge_filter jnz .w4_main lea maxbased, [hq+3] call .filter_strength mov maxbased, 7 test r5d, r5d jz .w4_main ; filter_strength == 0 popcnt r5d, r5d vpbroadcastd m7, [base+pb_8] vbroadcasti128 m2, [tlq-1] pminub m1, m7, [base+z_filter_s] vpbroadcastd m8, [base+z_filter_k-4+r5*4+12*0] pminub m7, [base+z_filter_s+8] vpbroadcastd m9, [base+z_filter_k-4+r5*4+12*1] vpbroadcastd m10, [base+z_filter_k-4+r5*4+12*2] pshufb m0, m2, m1 shufps m1, m7, q2121 pmaddubsw m0, m8 pshufb m1, m2, m1 pmaddubsw m1, m9 pshufb m2, m7 pmaddubsw m2, m10 paddw m0, m1 paddw m0, m2 pmulhrsw m0, m3 mov r3d, 9 mov tlq, rsp cmp hd, 4 cmovne maxbased, r3d vextracti128 xm1, m0, 1 packuswb xm0, xm1 mova [tlq], xm0 .w4_main: movd xm6, dxd vpbroadcastq m0, [z_base_inc] ; base_inc << 6 vpbroadcastb m7, [tlq+maxbaseq] shl maxbased, 6 vpbroadcastw m6, xm6 mov r3d, dxd ; xpos movd xm9, maxbased vpbroadcastw m9, xm9 vbroadcasti128 m8, [z1_shuf_w4] psrlw m7, 8 ; top[max_base_x] paddw m10, m6, m6 psubw m9, m0 ; max_base_x vpblendd m6, m10, 0xcc mova xm0, xm10 paddw m6, m0 ; xpos2 xpos3 xpos0 xpos1 paddw m10, m10 .w4_loop: lea r5d, [r3+dxq] shr r3d, 6 ; base0 vpbroadcastq m1, [tlq+r3] lea r3d, [r5+dxq] shr r5d, 6 ; base1 vpbroadcastq m2, [tlq+r5] lea r5d, [r3+dxq] shr r3d, 6 ; base2 movq xm0, [tlq+r3] lea r3d, [r5+dxq] shr r5d, 6 ; base3 movhps xm0, [tlq+r5] vpblendd m1, m2, 0xc0 pand m2, m4, m6 ; frac vpblendd m0, m1, 0xf0 psubw m1, m5, m2 ; 64-frac psllw m2, 8 pshufb m0, m8 por m1, m2 ; 64-frac, frac pmaddubsw m0, m1 pcmpgtw m1, m9, m6 ; base < max_base_x pmulhrsw m0, m3 paddw m6, m10 ; xpos += dx lea r5, [dstq+strideq*2] vpblendvb m0, m7, m0, m1 packuswb m0, m0 vextracti128 xm1, m0, 1 movd [r5 +strideq*0], xm0 pextrd [r5 +strideq*1], xm0, 1 movd [dstq+strideq*0], xm1 pextrd [dstq+strideq*1], xm1, 1 sub hd, 4 jz .w4_end lea dstq, [dstq+strideq*4] cmp r3d, maxbased jb .w4_loop packuswb xm7, xm7 lea r6, [strideq*3] .w4_end_loop: movd [dstq+strideq*0], xm7 movd [dstq+strideq*1], xm7 movd [dstq+strideq*2], xm7 movd [dstq+r6 ], xm7 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w4_end_loop .w4_end: RET ALIGN function_align .w8: lea r3d, [angleq+216] mov r3b, hb cmp r3d, 8 ja .w8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || h > 8 %assign stack_offset org_stack_offset ALLOC_STACK -32, 8 movu xm2, [z_filter_s+6] mova xm0, [tlq-1] movd xm6, hd vinserti128 m0, [tlq+7], 1 vpbroadcastb xm6, xm6 vbroadcasti128 m1, [z_upsample1] pminub xm6, xm2 vpbroadcastd m7, [pb_36_m4] vinserti128 m2, xm6, 1 add dxd, dxd pshufb m1, m0, m1 pshufb m2, m0, m2 movd xm6, dxd pmaddubsw m1, m7 pmaddubsw m2, m7 vpbroadcastw m6, xm6 mov r3d, dxd psrldq m0, 1 lea r2, [strideq*3] paddw m7, m6, m6 paddw m1, m2 vpblendd 
m6, m7, 0xf0 pmulhrsw m1, m3 pslldq m2, m7, 8 paddw m7, m7 paddw m6, m2 packuswb m1, m1 punpcklbw m0, m1 mova [rsp], m0 .w8_upsample_loop: lea r5d, [r3+dxq] shr r3d, 6 ; base0 movu xm0, [rsp+r3] lea r3d, [r5+dxq] shr r5d, 6 ; base1 vinserti128 m0, [rsp+r5], 1 lea r5d, [r3+dxq] shr r3d, 6 ; base2 pand m1, m4, m6 psubw m2, m5, m1 psllw m1, 8 por m2, m1 punpcklqdq m1, m2, m2 ; frac0 frac1 pmaddubsw m0, m1 movu xm1, [rsp+r3] lea r3d, [r5+dxq] shr r5d, 6 ; base3 vinserti128 m1, [rsp+r5], 1 punpckhqdq m2, m2 ; frac2 frac3 pmaddubsw m1, m2 pmulhrsw m0, m3 paddw m6, m7 pmulhrsw m1, m3 packuswb m0, m1 vextracti128 xm1, m0, 1 movq [dstq+strideq*0], xm0 movhps [dstq+strideq*2], xm0 movq [dstq+strideq*1], xm1 movhps [dstq+r2 ], xm1 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w8_upsample_loop RET .w8_no_intra_edge_filter: and maxbased, 7 or maxbased, 8 ; imin(h+7, 15) jmp .w8_main .w8_no_upsample: %assign stack_offset org_stack_offset ALLOC_STACK -32, 10 lea maxbased, [hq+7] test angled, 0x400 jnz .w8_no_intra_edge_filter call .filter_strength test r5d, r5d jz .w8_main ; filter_strength == 0 popcnt r5d, r5d movu xm2, [tlq] pminub xm1, xm0, [base+z_filter_s+14] vinserti128 m2, [tlq-1], 1 vinserti128 m1, [base+z_filter_s+ 0], 1 vpbroadcastd m7, [base+z_filter_k-4+r5*4+12*0] pminub xm0, [base+z_filter_s+22] vinserti128 m0, [base+z_filter_s+ 8], 1 pshufb m6, m2, m1 pmaddubsw m6, m7 vpbroadcastd m7, [base+z_filter_k-4+r5*4+12*1] movzx r3d, byte [tlq+15] shufps m1, m0, q2121 pshufb m1, m2, m1 pmaddubsw m1, m7 paddw m1, m6 sub r5d, 3 jnz .w8_3tap ; filter_strength == 3 uses a 5-tap filter instead of a 3-tap one, ; which also results in an awkward edge case where out[w*2] is ; slightly different from out[max_base_x] when h > w. vpbroadcastd m7, [z_filter_k+4*8] movzx r2d, byte [tlq+14] pshufb m2, m0 pmaddubsw m2, m7 sub r2d, r3d lea r2d, [r2+r3*8+4] shr r2d, 3 ; (tlq[w*2-2] + tlq[w*2-1]*7 + 4) >> 3 mov [rsp+16], r2b paddw m1, m2 .w8_3tap: pmulhrsw m1, m3 sar r5d, 1 mov tlq, rsp add r5d, 17 ; w*2 + (filter_strength == 3) cmp hd, 16 cmovns maxbased, r5d mov [tlq+r5], r3b vextracti128 xm0, m1, 1 packuswb xm0, xm1 mova [tlq], xm0 .w8_main: movd xm2, dxd vbroadcasti128 m0, [z_base_inc] vpbroadcastw m2, xm2 vpbroadcastb m7, [tlq+maxbaseq] shl maxbased, 6 movd xm9, maxbased vbroadcasti128 m8, [z_filter_s+2] vpbroadcastw m9, xm9 psrlw m7, 8 psubw m9, m0 mov r3d, dxd paddw m6, m2, m2 vpblendd m2, m6, 0xf0 .w8_loop: lea r5d, [r3+dxq] shr r3d, 6 pand m0, m4, m2 psubw m1, m5, m0 psllw m0, 8 por m1, m0 movu xm0, [tlq+r3] lea r3d, [r5+dxq] shr r5d, 6 ; base1 vinserti128 m0, [tlq+r5], 1 pshufb m0, m8 pmaddubsw m0, m1 pcmpgtw m1, m9, m2 paddw m2, m6 pmulhrsw m0, m3 vpblendvb m0, m7, m0, m1 vextracti128 xm1, m0, 1 packuswb xm0, xm1 movq [dstq+strideq*0], xm0 movhps [dstq+strideq*1], xm0 sub hd, 2 jz .w8_end lea dstq, [dstq+strideq*2] cmp r3d, maxbased jb .w8_loop packuswb xm7, xm7 .w8_end_loop: movq [dstq+strideq*0], xm7 movq [dstq+strideq*1], xm7 lea dstq, [dstq+strideq*2] sub hd, 2 jg .w8_end_loop .w8_end: RET .w16_no_intra_edge_filter: and maxbased, 15 or maxbased, 16 ; imin(h+15, 31) jmp .w16_main ALIGN function_align .w16: %assign stack_offset org_stack_offset ALLOC_STACK -64, 12 lea maxbased, [hq+15] test angled, 0x400 jnz .w16_no_intra_edge_filter call .filter_strength test r5d, r5d jz .w16_main ; filter_strength == 0 popcnt r5d, r5d vpbroadcastd m1, [base+pb_12] vbroadcasti128 m6, [base+z_filter_s+8] vinserti128 m2, m6, [base+z_filter_s], 0 vinserti128 m6, [base+z_filter_s+16], 1 mova xm10, [tlq-1] vinserti128 m10, 
[tlq+3], 1 vpbroadcastd m9, [base+z_filter_k-4+r5*4+12*0] vbroadcasti128 m7, [base+z_filter_s+14] vinserti128 m8, m7, [base+z_filter_s+6], 0 vinserti128 m7, [base+z_filter_s+22], 1 psubw m0, m1 movu xm11, [tlq+12] vinserti128 m11, [tlq+16], 1 pminub m8, m0 pminub m7, m0 pshufb m0, m10, m2 shufps m2, m6, q2121 pmaddubsw m0, m9 pshufb m1, m11, m8 shufps m8, m7, q2121 pmaddubsw m1, m9 vpbroadcastd m9, [base+z_filter_k-4+r5*4+12*1] movzx r3d, byte [tlq+31] pshufb m2, m10, m2 pmaddubsw m2, m9 pshufb m8, m11, m8 pmaddubsw m8, m9 paddw m0, m2 paddw m1, m8 sub r5d, 3 jnz .w16_3tap vpbroadcastd m9, [z_filter_k+4*8] movzx r2d, byte [tlq+30] pshufb m10, m6 pmaddubsw m10, m9 pshufb m11, m7 pmaddubsw m11, m9 sub r2d, r3d lea r2d, [r2+r3*8+4] shr r2d, 3 mov [rsp+32], r2b paddw m0, m10 paddw m1, m11 .w16_3tap: pmulhrsw m0, m3 pmulhrsw m1, m3 sar r5d, 1 mov tlq, rsp add r5d, 33 cmp hd, 32 cmovns maxbased, r5d mov [tlq+r5], r3b packuswb m0, m1 vpermq m0, m0, q3120 mova [tlq], m0 .w16_main: movd xm6, dxd vbroadcasti128 m0, [z_base_inc] vpbroadcastb m7, [tlq+maxbaseq] shl maxbased, 6 vpbroadcastw m6, xm6 movd xm9, maxbased vbroadcasti128 m8, [z_filter_s+2] vpbroadcastw m9, xm9 mov r3d, dxd psubw m9, m0 paddw m11, m6, m6 psubw m10, m9, m3 ; 64*8 vpblendd m6, m11, 0xf0 .w16_loop: lea r5d, [r3+dxq] shr r3d, 6 ; base0 pand m1, m4, m6 psubw m2, m5, m1 psllw m1, 8 por m2, m1 movu xm0, [tlq+r3+0] movu xm1, [tlq+r3+8] lea r3d, [r5+dxq] shr r5d, 6 ; base1 vinserti128 m0, [tlq+r5+0], 1 vinserti128 m1, [tlq+r5+8], 1 pshufb m0, m8 pshufb m1, m8 pmaddubsw m0, m2 pmaddubsw m1, m2 pmulhrsw m0, m3 pmulhrsw m1, m3 packuswb m0, m1 pcmpgtw m1, m9, m6 pcmpgtw m2, m10, m6 packsswb m1, m2 paddw m6, m11 vpblendvb m0, m7, m0, m1 mova [dstq+strideq*0], xm0 vextracti128 [dstq+strideq*1], m0, 1 sub hd, 2 jz .w16_end lea dstq, [dstq+strideq*2] cmp r3d, maxbased jb .w16_loop .w16_end_loop: mova [dstq+strideq*0], xm7 mova [dstq+strideq*1], xm7 lea dstq, [dstq+strideq*2] sub hd, 2 jg .w16_end_loop .w16_end: RET ALIGN function_align .w32: %assign stack_offset org_stack_offset ALLOC_STACK -96, 15 lea r3d, [hq+31] mov maxbased, 63 cmp hd, 32 cmovs maxbased, r3d test angled, 0x400 ; !enable_intra_edge_filter jnz .w32_main vbroadcasti128 m0, [pb_0to15] sub r3d, 29 ; h+2 movu xm13, [tlq+29] ; 32-39 movd xm1, r3d movu xm14, [tlq+37] ; 40-47 sub r3d, 8 ; h-6 vinserti128 m14, [tlq+51], 1 ; 56-63 vpbroadcastb xm1, xm1 mova xm11, [tlq- 1] ; 0- 7 vinserti128 m11, [tlq+13], 1 ; 16-23 movd xm2, r3d movu xm12, [tlq+ 5] ; 8-15 vinserti128 m12, [tlq+19], 1 ; 24-31 pminub xm1, xm0 ; clip 32x8 mova m7, [z_filter_s+0] pshufb xm13, xm1 vpbroadcastd m1, [pb_12] vpbroadcastb xm2, xm2 vinserti128 m13, [tlq+43], 1 ; 48-55 vinserti128 m8, m7, [z_filter_s+4], 1 vpblendd m2, m1, 0xf0 vinserti128 m7, [z_filter_s+12], 0 pminub m2, m0 ; clip 32x16 and 32x(32|64) vpbroadcastd m9, [z_filter_k+4*2+12*0] pshufb m14, m2 pshufb m0, m11, m8 shufps m8, m7, q1021 pmaddubsw m0, m9 pshufb m2, m12, m8 pmaddubsw m2, m9 pshufb m1, m13, m8 pmaddubsw m1, m9 pshufb m6, m14, m8 pmaddubsw m6, m9 vpbroadcastd m9, [z_filter_k+4*2+12*1] pshufb m10, m11, m8 shufps m8, m7, q2121 pmaddubsw m10, m9 paddw m0, m10 pshufb m10, m12, m8 pmaddubsw m10, m9 paddw m2, m10 pshufb m10, m13, m8 pmaddubsw m10, m9 paddw m1, m10 pshufb m10, m14, m8 pmaddubsw m10, m9 paddw m6, m10 vpbroadcastd m9, [z_filter_k+4*2+12*2] pshufb m11, m8 pmaddubsw m11, m9 pshufb m12, m7 pmaddubsw m12, m9 movzx r3d, byte [tlq+63] movzx r2d, byte [tlq+62] paddw m0, m11 paddw m2, m12 pshufb m13, m7 pmaddubsw m13, m9 pshufb m14, m7 
pmaddubsw m14, m9 paddw m1, m13 paddw m6, m14 sub r2d, r3d lea r2d, [r2+r3*8+4] ; edge case for 32x64 pmulhrsw m0, m3 pmulhrsw m2, m3 pmulhrsw m1, m3 pmulhrsw m6, m3 shr r2d, 3 mov [rsp+64], r2b mov tlq, rsp mov [tlq+65], r3b mov r3d, 65 cmp hd, 64 cmove maxbased, r3d packuswb m0, m2 packuswb m1, m6 mova [tlq+ 0], m0 mova [tlq+32], m1 .w32_main: movd xm6, dxd vpbroadcastb m7, [tlq+maxbaseq] shl maxbased, 6 vpbroadcastw m6, xm6 movd xm9, maxbased vbroadcasti128 m8, [z_filter_s+2] vpbroadcastw m9, xm9 mov r5d, dxd psubw m9, [z_base_inc] mova m11, m6 psubw m10, m9, m3 ; 64*8 .w32_loop: mov r3d, r5d shr r3d, 6 pand m1, m4, m6 psubw m2, m5, m1 psllw m1, 8 por m2, m1 movu m0, [tlq+r3+0] movu m1, [tlq+r3+8] add r5d, dxd pshufb m0, m8 pshufb m1, m8 pmaddubsw m0, m2 pmaddubsw m1, m2 pmulhrsw m0, m3 pmulhrsw m1, m3 packuswb m0, m1 pcmpgtw m1, m9, m6 pcmpgtw m2, m10, m6 packsswb m1, m2 paddw m6, m11 vpblendvb m0, m7, m0, m1 mova [dstq], m0 dec hd jz .w32_end add dstq, strideq cmp r5d, maxbased jb .w32_loop test hb, 1 jz .w32_end_loop mova [dstq], m7 add dstq, strideq dec hd jz .w32_end .w32_end_loop: mova [dstq+strideq*0], m7 mova [dstq+strideq*1], m7 lea dstq, [dstq+strideq*2] sub hd, 2 jg .w32_end_loop .w32_end: RET ALIGN function_align .w64: %assign stack_offset org_stack_offset ALLOC_STACK -128, 16 lea maxbased, [hq+63] test angled, 0x400 ; !enable_intra_edge_filter jnz .w64_main mova xm11, [tlq- 1] ; 0- 7 vinserti128 m11, [tlq+13], 1 ; 16-23 movu xm12, [tlq+ 5] ; 8-15 vinserti128 m12, [tlq+19], 1 ; 24-31 mova m7, [z_filter_s+0] vinserti128 m8, m7, [z_filter_s+4], 1 vinserti128 m7, [z_filter_s+12], 0 vpbroadcastd m9, [z_filter_k+4*2+12*0] movu xm13, [tlq+29] ; 32-39 vinserti128 m13, [tlq+43], 1 ; 48-55 movu xm14, [tlq+37] ; 40-47 vinserti128 m14, [tlq+51], 1 ; 56-63 pshufb m0, m11, m8 shufps m8, m7, q1021 pmaddubsw m0, m9 pshufb m2, m12, m8 pmaddubsw m2, m9 pshufb m1, m13, m8 pmaddubsw m1, m9 pshufb m6, m14, m8 pmaddubsw m6, m9 vpbroadcastd m9, [z_filter_k+4*2+12*1] pshufb m10, m11, m8 shufps m15, m8, m7, q2121 pmaddubsw m10, m9 paddw m0, m10 pshufb m10, m12, m15 pmaddubsw m10, m9 paddw m2, m10 pshufb m10, m13, m15 pmaddubsw m10, m9 paddw m1, m10 pshufb m10, m14, m15 pmaddubsw m10, m9 paddw m6, m10 vpbroadcastd m10, [z_filter_k+4*2+12*2] pshufb m11, m15 pmaddubsw m11, m10 pshufb m12, m7 pmaddubsw m12, m10 pshufb m13, m7 pmaddubsw m13, m10 pshufb m14, m7 pmaddubsw m14, m10 paddw m0, m11 paddw m2, m12 paddw m1, m13 paddw m6, m14 movu xm11, [tlq+ 61] ; 64- 71 vinserti128 m11, [tlq+ 75], 1 ; 80- 87 movu xm12, [tlq+ 69] ; 72- 79 vinserti128 m12, [tlq+ 83], 1 ; 88- 95 movu xm13, [tlq+ 93] ; 96-103 vinserti128 m13, [tlq+107], 1 ; 112-119 movu xm14, [tlq+101] ; 104-111 vinserti128 m14, [tlq+115], 1 ; 120-127 pmulhrsw m0, m3 pmulhrsw m2, m3 pmulhrsw m1, m3 pmulhrsw m6, m3 lea r3d, [hq-20] mov tlq, rsp packuswb m0, m2 packuswb m1, m6 vpbroadcastd xm2, [pb_14] vbroadcasti128 m6, [pb_0to15] mova [tlq+32*0], m0 mova [tlq+32*1], m1 movd xm0, r3d vpbroadcastd m1, [pb_12] vpbroadcastb m0, xm0 paddb m0, m2 pminub m0, m6 ; clip 64x16 and 64x32 pshufb m12, m0 pminub m1, m6 ; clip 64x64 pshufb m14, m1 pshufb m0, m11, m7 pmaddubsw m0, m10 pshufb m2, m12, m7 pmaddubsw m2, m10 pshufb m1, m13, m7 pmaddubsw m1, m10 pshufb m6, m14, m7 pmaddubsw m6, m10 pshufb m7, m11, m15 pmaddubsw m7, m9 pshufb m10, m12, m15 pmaddubsw m10, m9 paddw m0, m7 pshufb m7, m13, m15 pmaddubsw m7, m9 paddw m2, m10 pshufb m10, m14, m15 pmaddubsw m10, m9 paddw m1, m7 paddw m6, m10 vpbroadcastd m9, [z_filter_k+4*2+12*0] pshufb m11, m8 pmaddubsw m11, 
m9 pshufb m12, m8 pmaddubsw m12, m9 pshufb m13, m8 pmaddubsw m13, m9 pshufb m14, m8 pmaddubsw m14, m9 paddw m0, m11 paddw m2, m12 paddw m1, m13 paddw m6, m14 pmulhrsw m0, m3 pmulhrsw m2, m3 pmulhrsw m1, m3 pmulhrsw m6, m3 packuswb m0, m2 packuswb m1, m6 mova [tlq+32*2], m0 mova [tlq+32*3], m1 .w64_main: movd xm12, dxd vpbroadcastb m7, [tlq+maxbaseq] lea r3d, [dxq-64] shl maxbased, 6 vpbroadcastw m12, xm12 sub r3d, maxbased vbroadcasti128 m8, [z_filter_s+2] movd xm6, r3d mov r5d, dxd mova m10, [pb_1to32] vpbroadcastd m11, [pb_32] vpbroadcastw m6, xm6 .w64_loop: mov r3d, r5d shr r3d, 6 movu m0, [tlq+r3+ 0] movu m1, [tlq+r3+ 8] pand m2, m4, m6 psubw m9, m5, m2 psllw m2, 8 por m9, m2 pshufb m0, m8 pshufb m1, m8 pmaddubsw m0, m9 pmaddubsw m1, m9 psraw m2, m6, 6 pmulhrsw m0, m3 pmulhrsw m1, m3 packsswb m2, m2 paddb m2, m10 packuswb m0, m1 vpblendvb m0, m7, m0, m2 mova [dstq+ 0], m0 movu m0, [tlq+r3+32] movu m1, [tlq+r3+40] add r5d, dxd pshufb m0, m8 pshufb m1, m8 pmaddubsw m0, m9 pmaddubsw m1, m9 paddb m2, m11 pmulhrsw m0, m3 pmulhrsw m1, m3 paddw m6, m12 packuswb m0, m1 vpblendvb m0, m7, m0, m2 mova [dstq+32], m0 dec hd jz .w64_end add dstq, strideq cmp r5d, maxbased jb .w64_loop .w64_end_loop: mova [dstq+ 0], m7 mova [dstq+32], m7 add dstq, strideq dec hd jg .w64_end_loop .w64_end: RET cglobal ipred_z2_8bpc, 3, 10, 16, 224, dst, stride, tl, w, h, angle, dx, dy %define base r9-z_filter_t0 lea r9, [ipred_z2_avx2_table] tzcnt wd, wm movifnidn angled, anglem movifnidn hd, hm lea dxq, [dr_intra_derivative-90] movsxd wq, [r9+wq*4] movzx dyd, angleb xor angled, 0x400 mov r8, dxq sub dxq, dyq add wq, r9 add r9, z_filter_t0-ipred_z2_avx2_table mova m2, [tlq-64] mova m0, [tlq-32] mova m1, [tlq] and dyd, ~1 and dxq, ~1 movzx dyd, word [r8+dyq] ; angle - 90 movzx dxd, word [dxq+270] ; 180 - angle vpbroadcastd m13, [base+pw_512] vpbroadcastd m14, [base+pw_62] vpbroadcastd m15, [base+pw_64] mova [rsp+ 0], m2 mova [rsp+32], m0 mova [rsp+64], m1 neg dxd neg dyd jmp wq .w4: vpbroadcastq m6, [base+z2_base_inc] ; base_inc << 6 vbroadcasti128 m10, [base+z1_shuf_w4] vbroadcasti128 m11, [base+z2_shuf_h4] lea r2d, [dxq+(65<<6)] ; xpos movd xm5, dyd mov r8d, (63-4)<<6 mov dyq, -4 pshuflw xm5, xm5, q0000 pmullw xm5, [base+z2_ymul] test angled, 0x400 jnz .w4_main ; !enable_intra_edge_filter lea r3d, [hq+2] add angled, 1022 shl r3d, 6 test r3d, angled jnz .w4_no_upsample_above ; angle >= 130 || h > 8 || (is_sm && h == 8) vpbroadcastd xm3, [base+pb_4] call .upsample_above sub angled, 1075 ; angle - 53 lea r3d, [hq+3] xor angled, 0x7f ; 180 - angle call .filter_strength jmp .w4_filter_left ALIGN function_align .filter_strength: movd xm8, r3d mov r3d, angled movd xm7, angled vpbroadcastb m8, xm8 shr r3d, 8 ; is_sm << 1 vpbroadcastb m7, xm7 pcmpeqb m8, [base+z_filter_wh] mova xm9, [r9+r3*8] pand m0, m8, m7 pcmpgtb m0, m9 pmovmskb r3d, m0 ret ALIGN function_align .upsample_above: ; w4/w8 pshufb xm2, xm1, [base+z_upsample1-2] pminub xm3, [base+z_filter_s+4] vpbroadcastd xm4, [base+pb_36_m4] vbroadcasti128 m10, [base+pb_0to15] pshufb xm3, xm1, xm3 pmaddubsw xm2, xm4 pmaddubsw xm3, xm4 lea r2d, [r2+dxq+(1<<6)] add dxd, dxd paddw xm2, xm3 pmulhrsw xm2, xm13 sub r8d, 3<<6 paddw m6, m6 packuswb xm2, xm2 punpcklbw xm1, xm2 mova [rsp+gprsize+64], xm1 ret ALIGN function_align .upsample_left: ; h4/h8 mov r3d, hd and r3d, 4 movd xm2, [rsp+gprsize+64] movddup xm0, [rsp+gprsize+56] movd xm1, r3d palignr xm2, xm0, 1 vpbroadcastb xm1, xm1 pshufb xm2, [base+z_filter_s+18] vpbroadcastd xm3, [base+pb_36_m4] pmaxub xm1, [base+z_upsample1-2] 
pshufb xm1, xm0, xm1 pmaddubsw xm2, xm3 pmaddubsw xm1, xm3 paddw xm5, xm5 add dyq, dyq paddw xm1, xm2 pmulhrsw xm1, xm13 vbroadcasti128 m11, [base+z2_upsample] paddw xm5, xm15 packuswb xm1, xm1 punpcklbw xm0, xm1 mova [rsp+gprsize+48], xm0 ret .w4_no_upsample_above: lea r3d, [hq+3] sub angled, 1112 ; angle - 90 call .filter_strength test r3d, r3d jz .w4_no_filter_above popcnt r3d, r3d vpbroadcastd xm2, [base+pb_4] pminub xm2, [base+z_filter_s] vpbroadcastd xm0, [base+z_filter_k-4+r3*4+12*0] vpbroadcastd xm4, [base+z_filter_k-4+r3*4+12*1] pshufb xm3, xm1, xm2 ; 00 01 12 23 pshufd xm2, xm2, q0321 pmaddubsw xm0, xm3, xm0 pshufb xm2, xm1, xm2 ; 12 23 34 44 pmaddubsw xm2, xm4 vpbroadcastd xm4, [base+z_filter_k-4+r3*4+12*2] punpckhqdq xm3, xm3 ; 34 44 44 44 pmaddubsw xm3, xm4 movd xm4, r6m ; max_width pminsw xm4, xm15 vpbroadcastb xm4, xm4 paddw xm0, xm2 paddw xm0, xm3 pmulhrsw xm0, xm13 psubb xm4, [base+pb_1to32] psrlq xm1, 8 packuswb xm0, xm0 vpblendvb xm0, xm1, xm4 movd [rsp+65], xm0 .w4_no_filter_above: lea r3d, [hq+2] add angled, 973 ; angle + 883 shl r3d, 6 test r3d, angled jz .w4_upsample_left ; angle <= 140 || h > 8 || (is_sm && h == 8) vpbroadcastd xm0, [base+pb_90] psubb xm0, xm7 ; 180 - angle pand xm0, xm8 ; reuse from previous filter_strength call pcmpgtb xm0, xm9 pmovmskb r3d, xm0 .w4_filter_left: test r3d, r3d jz .w4_main popcnt r3d, r3d mov r5d, 10 cmp hd, 16 movu xm2, [rsp+49] vinserti128 m2, [rsp+43], 1 cmovs r5d, hd xor r5d, 15 ; h == 16 ? 5 : 15 - h movd xm0, r5d vbroadcasti128 m1, [base+z_filter_s+12] vbroadcasti128 m4, [base+z_filter_s+16] vinserti128 m3, m1, [z_filter_s+8], 1 ; 56 67 78 89 9a ab bc cd 55 55 56 67 78 89 9a ab vpblendd m1, m4, 0x0f ; 78 89 9a ab bc cd de ef 56 67 78 89 9a ab bc cd vinserti128 m4, [base+z_filter_s+20], 0 ; 9a ab bc cd de ef ff ff 78 89 9a ab bc cd de ef vpbroadcastb m0, xm0 pmaxub m0, m3 vpbroadcastd m3, [base+z_filter_k-4+r3*4+12*0] pshufb m0, m2, m0 pmaddubsw m0, m3 vpbroadcastd m3, [base+z_filter_k-4+r3*4+12*1] pshufb m1, m2, m1 pmaddubsw m1, m3 vpbroadcastd m3, [base+z_filter_k-4+r3*4+12*2] pshufb m2, m4 pmaddubsw m2, m3 movd xm4, r7m ; max_height pminsw xm4, xm15 vpbroadcastb xm4, xm4 psubb xm4, [base+pb_16to1] paddw m1, m0 paddw m1, m2 pmulhrsw m1, m13 vextracti128 xm0, m1, 1 packuswb xm0, xm1 vpblendvb xm0, [rsp+48], xm4 mova [rsp+48], xm0 jmp .w4_main .w4_upsample_left: call .upsample_left .w4_main: movd xm0, dxd mova m12, [base+z2_y_shuf_h4] lea r5, [rsp+56] ; left-7 vpbroadcastw m0, xm0 lea r9, [strideq*3] psraw xm1, xm5, 6 pand xm5, xm14 ; frac_y pxor xm2, xm2 paddw m7, m0, m0 psubw xm4, xm2, xm1 ; base_y vpblendd m0, m7, 0xcc mova xm1, xm7 punpcklwd xm4, xm2 paddw m0, m1 ; xpos2 xpos3 xpos0 xpos1 psubw xm1, xm15, xm5 ; 64-frac_y psllw xm5, 8 paddw m7, m7 paddw m6, m0 por xm5, xm1 ; 64-frac_y, frac_y vpbroadcastq m5, xm5 .w4_loop: lea r3d, [r2+dxq] shr r2d, 6 ; base_x0 vpbroadcastq m1, [rsp+r2] lea r2d, [r3+dxq] shr r3d, 6 ; base_x1 vpbroadcastq m2, [rsp+r3] lea r3d, [r2+dxq] shr r2d, 6 ; base_x2 movq xm0, [rsp+r2] lea r2d, [r3+dxq] shr r3d, 6 ; base_x3 movhps xm0, [rsp+r3] vpblendd m1, m2, 0xc0 pand m2, m14, m6 ; frac_x vpblendd m0, m1, 0xf0 psubw m1, m15, m2 ; 64-frac_x psllw m2, 8 pshufb m0, m10 por m1, m2 ; 64-frac_x, frac_x pmaddubsw m0, m1 cmp r3d, 64 jge .w4_toponly mova m1, m7 ; arbitrary negative value vpgatherdq m3, [r5+xm4], m1 pshufb m1, m3, m11 vpermd m1, m12, m1 pmaddubsw m1, m5 psraw m2, m6, 15 ; base_x < topleft vpblendvb m0, m1, m2 .w4_toponly: pmulhrsw m0, m13 paddw m6, m7 ; xpos += dx add r5, dyq packuswb m0, m0 
vextracti128 xm1, m0, 1 movd [dstq+strideq*2], xm0 pextrd [dstq+r9 ], xm0, 1 movd [dstq+strideq*0], xm1 pextrd [dstq+strideq*1], xm1, 1 sub hd, 4 jz .w4_end lea dstq, [dstq+strideq*4] cmp r2d, r8d jge .w4_loop .w4_leftonly_loop: mova m1, m7 vpgatherdq m2, [r5+xm4], m1 add r5, dyq pshufb m0, m2, m11 vpermd m0, m12, m0 pmaddubsw m0, m5 pmulhrsw m0, m13 packuswb m0, m0 vextracti128 xm1, m0, 1 movd [dstq+strideq*2], xm0 pextrd [dstq+r9 ], xm0, 1 movd [dstq+strideq*0], xm1 pextrd [dstq+strideq*1], xm1, 1 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w4_leftonly_loop .w4_end: RET .w8: vbroadcasti128 m6, [base+z2_base_inc] ; base_inc << 6 movd xm5, dyd vbroadcasti128 m10, [base+z_filter_s+2] vbroadcasti128 m11, [base+z2_shuf_h4] lea r2d, [dxq+(65<<6)] ; xpos vpbroadcastw xm5, xm5 mov r8d, (63-8)<<6 mov dyq, -4 pmullw xm5, [base+z2_ymul] test angled, 0x400 jnz .w8_main lea r3d, [angleq+126] mov r3b, hb cmp r3d, 8 ja .w8_no_upsample_above ; angle >= 130 || h > 8 || is_sm vpbroadcastd xm3, [base+pb_8] movhps [rsp+80], xm1 call .upsample_above sub angled, 53 ; angle - 53 lea r3d, [hq+7] xor angled, 0x7f ; 180 - angle call .filter_strength jmp .w8_filter_left .w8_no_upsample_above: lea r3d, [hq+7] sub angled, 90 ; angle - 90 call .filter_strength test r3d, r3d jz .w8_no_filter_above popcnt r3d, r3d vpbroadcastd xm3, [base+pb_8] pminub xm3, [base+z_filter_s+8] vpbroadcastd xm0, [base+z_filter_k-4+r3*4+12*0] vpbroadcastd xm4, [base+z_filter_k-4+r3*4+12*1] pshufb xm2, xm1, [base+z_filter_s] ; 00 01 12 23 34 45 56 67 pmaddubsw xm0, xm2, xm0 pshufb xm3, xm1, xm3 ; 34 45 56 67 78 88 88 88 shufps xm2, xm3, q2121 ; 12 23 34 45 56 67 78 88 pmaddubsw xm2, xm4 vpbroadcastd xm4, [base+z_filter_k-4+r3*4+12*2] pmaddubsw xm3, xm4 movd xm4, r6m ; max_width pminuw xm4, xm15 vpbroadcastb xm4, xm4 paddw xm0, xm2 paddw xm0, xm3 pmulhrsw xm0, xm13 psubb xm4, [base+pb_1to32] psrldq xm1, 1 packuswb xm0, xm0 vpblendvb xm0, xm1, xm4 movq [rsp+65], xm0 .w8_no_filter_above: lea r3d, [angleq-51] mov r3b, hb cmp r3d, 8 jbe .w8_upsample_left ; angle > 140 && h <= 8 && !is_sm vpbroadcastd m0, [base+pb_90] psubb m0, m7 pand m0, m8 pcmpgtb m0, m9 pmovmskb r3d, m0 .w8_filter_left: test r3d, r3d jz .w8_main popcnt r3d, r3d vpbroadcastd m7, [base+z_filter_k-4+r3*4+12*0] vpbroadcastd m8, [base+z_filter_k-4+r3*4+12*1] vpbroadcastd m9, [base+z_filter_k-4+r3*4+12*2] cmp hd, 32 jne .w8_filter_left_h16 movu xm2, [rsp+27] vinserti128 m2, [rsp+35], 1 vpbroadcastd xm0, [base+pb_5] vbroadcasti128 m3, [base+z_filter_s+ 8] vbroadcasti128 m1, [base+z_filter_s+12] vbroadcasti128 m4, [base+z_filter_s+16] pmaxub m3, m0 pshufb m3, m2, m3 pmaddubsw m3, m7 pshufb m1, m2, m1 pmaddubsw m1, m8 pshufb m2, m4 pmaddubsw m2, m9 paddw m3, m1 paddw m3, m2 pmulhrsw m3, m13 jmp .w8_filter_left_top16 .w8_filter_left_h16: mov r5d, 10 cmp hd, 16 cmovs r5d, hd xor r5d, 15 ; h == 16 ? 
5 : 15 - h movd xm0, r5d vpbroadcastb m0, xm0 .w8_filter_left_top16: vbroadcasti128 m1, [base+z_filter_s+12] vinserti128 m2, m1, [base+z_filter_s+8], 1 ; 56 67 78 89 9a ab bc cd 55 55 56 67 78 89 9a ab vbroadcasti128 m4, [base+z_filter_s+16] vpblendd m1, m4, 0x0f ; 78 89 9a ab bc cd de ef 56 67 78 89 9a ab bc cd vinserti128 m4, [base+z_filter_s+20], 0 ; 9a ab bc cd de ef ff ff 78 89 9a ab bc cd de ef pmaxub m0, m2 movu xm2, [rsp+49] vinserti128 m2, [rsp+43], 1 pshufb m0, m2, m0 pmaddubsw m0, m7 movd xm7, r7m ; max_height pshufb m1, m2, m1 pmaddubsw m1, m8 pshufb m2, m4 pmaddubsw m2, m9 pminsw xm7, xm15 paddw m1, m0 vpbroadcastb m7, xm7 paddw m1, m2 pmulhrsw m1, m13 psubb m7, [base+pb_32to1] packuswb m3, m1 vpermq m3, m3, q1320 vpblendvb m3, [rsp+32], m7 mova [rsp+32], m3 jmp .w8_main .w8_upsample_left: call .upsample_left .w8_main: movd xm3, dxd lea r5, [rsp+56] ; left-7 pshufd xm1, xm5, q3120 pand xm5, xm14 vpbroadcastw m3, xm3 pxor xm0, xm0 psubw xm2, xm15, xm5 psraw xm1, 6 lea r9, [strideq*3] paddw m7, m3, m3 psubw xm9, xm0, xm1 ; base_y psllw xm5, 8 punpcklwd xm8, xm9, xm0 ; base_y 0, 1, 4, 5 vpblendd m3, m7, 0xf0 ; xpos0 xpos1 por xm5, xm2 ; 64-frac_y, frac_y punpckhwd xm9, xm0 ; base_y 2, 3, 6, 7 paddw m6, m3 vinserti128 m12, m5, xm5, 1 .w8_loop: lea r3d, [r2+dxq] shr r2d, 6 ; base_x0 movu xm0, [rsp+r2] lea r2d, [r3+dxq] shr r3d, 6 ; base_x1 vinserti128 m0, [rsp+r3], 1 lea r3d, [r2+dxq] shr r2d, 6 ; base_x2 movu xm1, [rsp+r2] lea r2d, [r3+dxq] shr r3d, 6 ; base_x3 vinserti128 m1, [rsp+r3], 1 pand m2, m14, m6 paddsw m4, m6, m7 psubw m5, m15, m2 psllw m2, 8 pshufb m0, m10 por m2, m5 pmaddubsw m0, m2 pand m2, m14, m4 psubw m5, m15, m2 psllw m2, 8 pshufb m1, m10 por m2, m5 pmaddubsw m1, m2 cmp r3d, 64 jge .w8_toponly mova m5, m7 vpgatherdq m3, [r5+xm9], m7 mova m7, m5 vpgatherdq m2, [r5+xm8], m5 pshufb m3, m11 pshufb m2, m11 punpckldq m5, m2, m3 ; a0 b0 c0 d0 a1 b1 c1 d1 e0 f0 g0 h0 e1 f1 g1 h1 punpckhdq m2, m3 ; a2 b2 c2 d2 a3 b3 c3 d3 e2 f2 g2 h2 e3 f3 g3 h3 vpermq m5, m5, q3120 ; y0 y1 vpermq m2, m2, q3120 ; y2 y3 pmaddubsw m5, m12 pmaddubsw m2, m12 psraw m6, 15 ; base_x < topleft vpblendvb m0, m5, m6 psraw m3, m4, 15 vpblendvb m1, m2, m3 .w8_toponly: pmulhrsw m0, m13 pmulhrsw m1, m13 paddw m6, m4, m7 ; xpos += dx add r5, dyq packuswb m0, m1 vextracti128 xm1, m0, 1 movq [dstq+strideq*0], xm0 movhps [dstq+strideq*2], xm0 movq [dstq+strideq*1], xm1 movhps [dstq+r9 ], xm1 sub hd, 4 jz .w8_end lea dstq, [dstq+strideq*4] cmp r2d, r8d jge .w8_loop .w8_leftonly_loop: mova m0, m7 vpgatherdq m5, [r5+xm9], m7 mova m7, m0 vpgatherdq m3, [r5+xm8], m0 add r5, dyq pshufb m2, m5, m11 pshufb m1, m3, m11 punpckldq m0, m1, m2 punpckhdq m1, m2 vpermq m0, m0, q3120 vpermq m1, m1, q3120 pmaddubsw m0, m12 pmaddubsw m1, m12 pmulhrsw m0, m13 pmulhrsw m1, m13 packuswb m0, m1 vextracti128 xm1, m0, 1 movq [dstq+strideq*0], xm0 movhps [dstq+strideq*2], xm0 movq [dstq+strideq*1], xm1 movhps [dstq+r9 ], xm1 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w8_leftonly_loop .w8_end: RET .w16: mov r8d, hd test angled, 0x400 jnz .w16_main lea r3d, [hq+15] sub angled, 90 call .filter_strength test r3d, r3d jz .w16_no_filter_above popcnt r3d, r3d vbroadcasti128 m6, [tlq+1] mova xm2, [base+z_filter_s] vinserti128 m2, [base+z_filter_s+14], 1 ; 00 01 12 23 34 45 56 67 67 78 89 9a ab bc cd de movu xm3, [base+z_filter_s+8] vinserti128 m3, [base+z_filter_s+22], 1 ; 34 45 56 67 78 89 9a ab ab bc cd de ef ff ff ff vpblendd m1, m6, 0xf0 vpbroadcastd m0, [base+z_filter_k-4+r3*4+12*0] vpbroadcastd m4, [base+z_filter_k-4+r3*4+12*1] 
vpbroadcastd m5, [base+z_filter_k-4+r3*4+12*2] pshufb m2, m1, m2 pshufb m1, m3 pmaddubsw m0, m2, m0 shufps m2, m1, q2121 ; 12 23 34 45 56 67 78 89 89 9a ab bc cd de ef ff pmaddubsw m2, m4 pmaddubsw m1, m5 movd xm4, r6m ; max_width pminsw xm4, xm15 vpbroadcastb xm4, xm4 paddw m0, m2 paddw m0, m1 pmulhrsw m0, m13 psubb xm4, [base+pb_1to32] vextracti128 xm2, m0, 1 packuswb xm0, xm2 vpblendvb xm0, xm6, xm4 movu [rsp+65], xm0 .w16_no_filter_above: vpbroadcastd m0, [base+pb_90] psubb m0, m7 pand m0, m8 pcmpgtb m0, m9 pmovmskb r3d, m0 test r3d, r3d jz .w16_main popcnt r3d, r3d vpbroadcastd m7, [base+z_filter_k-4+r3*4+12*0] vpbroadcastd m8, [base+z_filter_k-4+r3*4+12*1] vpbroadcastd m9, [base+z_filter_k-4+r3*4+12*2] .w16_filter_left: movd xm6, r7m ; max_height pminsw xm6, xm15 vpbroadcastb m6, xm6 cmp hd, 32 jl .w16_filter_left_h16 vpbroadcastd xm0, [base+pb_5] vbroadcasti128 m10, [base+z_filter_s+ 8] vbroadcasti128 m11, [base+z_filter_s+12] vbroadcasti128 m12, [base+z_filter_s+16] je .w16_filter_left_h32 movu m3, [tlq-69] movu m5, [tlq-61] pmaxub m1, m10, m0 pshufb m1, m3, m1 pmaddubsw m1, m7 pshufb m2, m3, m11 pmaddubsw m2, m8 pshufb m3, m12 pmaddubsw m3, m9 paddw m1, m2 pshufb m2, m5, m10 pmaddubsw m2, m7 pshufb m4, m5, m11 pmaddubsw m4, m8 pshufb m5, m12 pmaddubsw m5, m9 paddw m1, m3 vpbroadcastd m3, [base+pb_32] paddb m3, [base+pb_32to1] paddw m2, m4 paddw m2, m5 pmulhrsw m1, m13 pmulhrsw m2, m13 psubb m3, m6, m3 packuswb m1, m2 vpblendvb m1, [tlq-64], m3 mova [rsp], m1 jmp .w16_filter_left_top32 .w16_filter_left_h32: pmaxub m10, m0 .w16_filter_left_top32: movu xm2, [tlq-37] vinserti128 m2, [tlq-29], 1 pshufb m3, m2, m10 pshufb m1, m2, m11 pshufb m2, m12 pmaddubsw m3, m7 pmaddubsw m1, m8 pmaddubsw m2, m9 paddw m3, m1 paddw m3, m2 pmulhrsw m3, m13 jmp .w16_filter_left_top16 .w16_filter_left_h16: mov r5d, 10 cmp hd, 16 cmovs r5d, hd xor r5d, 15 ; h == 16 ? 
5 : 15 - h movd xm0, r5d vpbroadcastb m0, xm0 .w16_filter_left_top16: movu xm2, [tlq-15] vinserti128 m2, [tlq-21], 1 vbroadcasti128 m1, [base+z_filter_s+12] vbroadcasti128 m4, [base+z_filter_s+16] vinserti128 m5, m1, [base+z_filter_s+8], 1 ; 56 67 78 89 9a ab bc cd 34 45 56 67 78 89 9a ab vpblendd m1, m4, 0x0f ; 78 89 9a ab bc cd de ef 56 67 78 89 9a ab bc cd vinserti128 m4, [base+z_filter_s+20], 0 ; 9a ab bc cd de ef ff ff 78 89 9a ab bc cd de ef pmaxub m0, m5 pshufb m0, m2, m0 pmaddubsw m0, m7 pshufb m1, m2, m1 pmaddubsw m1, m8 pshufb m2, m4 pmaddubsw m2, m9 psubb m6, [base+pb_32to1] paddw m1, m0 paddw m1, m2 pmulhrsw m1, m13 packuswb m3, m1 vpermq m3, m3, q1320 vpblendvb m3, [tlq-32], m6 mova [rsp+32], m3 .w16_main: movd xm1, dyd vbroadcasti128 m10, [base+z_filter_s+2] movd xm7, dxd vbroadcasti128 m11, [base+z2_shuf_h2] vpbroadcastw m1, xm1 vpbroadcastw m7, xm7 mov r7, dstq pmullw m0, m1, [base+z2_ymul] psllw xm1, 4 paddw m6, m7, [base+z2_base_inc] lea r9d, [dxq+(65<<6)] ; xpos movd [rsp+156], xm1 .w16_loop0: mov r2d, r9d mova [rsp+160], m0 lea r5, [rsp+60] ; left-3 mova [rsp+192], m6 pxor m1, m1 psraw m2, m0, 6 pand m0, m14 psubw m9, m1, m2 ; base_y psubw m12, m15, m0 punpcklwd m8, m9, m1 ; base_y 0, 1, 2, 3, 8, 9, 10, 11 psllw m0, 8 punpckhwd m9, m1 ; base_y 4, 5, 6, 7, 12, 13, 14, 15 por m12, m0 ; 64-frac_y, frac_y .w16_loop: lea r3d, [r2+dxq] shr r2d, 6 ; base_x0 movu xm0, [rsp+r2] vinserti128 m0, [rsp+r2+8], 1 lea r2d, [r3+dxq] shr r3d, 6 ; base_x1 movu xm1, [rsp+r3] vinserti128 m1, [rsp+r3+8], 1 pand m2, m14, m6 paddsw m5, m6, m7 psubw m3, m15, m2 psllw m2, 8 pshufb m0, m10 por m2, m3 pmaddubsw m0, m2 pand m2, m14, m5 psubw m3, m15, m2 psllw m2, 8 pshufb m1, m10 por m2, m3 pmaddubsw m1, m2 cmp r3d, 64 jge .w16_toponly punpckhwd m2, m5, m5 ; mask out unnecessary loads vpgatherdd m4, [r5+m9], m2 punpcklwd m2, m5, m5 vpgatherdd m3, [r5+m8], m2 pshufb m4, m11 ; e0 f0 g0 h0 e1 f1 g1 h1 m0 n0 o0 p0 m1 n1 o1 p1 pshufb m3, m11 ; a0 b0 c0 d0 a1 b1 c1 d1 i0 j0 k0 l0 i1 j1 k1 l1 punpcklqdq m2, m3, m4 ; y0 punpckhqdq m3, m4 ; y1 pmaddubsw m2, m12 pmaddubsw m3, m12 psraw m6, 15 ; base_x < topleft vpblendvb m0, m2, m6 psraw m6, m5, 15 vpblendvb m1, m3, m6 .w16_toponly: pmulhrsw m0, m13 pmulhrsw m1, m13 paddw m6, m5, m7 ; xpos += dx sub r5, 2 packuswb m0, m1 vpermq m0, m0, q3120 mova [dstq+strideq*0], xm0 vextracti128 [dstq+strideq*1], m0, 1 sub hd, 2 jz .w16_end lea dstq, [dstq+strideq*2] cmp r2d, (63-16)<<6 jge .w16_loop .w16_leftonly_loop: mova m0, m7 vpgatherdd m4, [r5+m9], m7 mova m7, m0 vpgatherdd m3, [r5+m8], m0 sub r5, 2 pshufb m2, m4, m11 pshufb m1, m3, m11 punpcklqdq m0, m1, m2 punpckhqdq m1, m2 pmaddubsw m0, m12 pmaddubsw m1, m12 pmulhrsw m0, m13 pmulhrsw m1, m13 packuswb m0, m1 vpermq m0, m0, q3120 mova [dstq+strideq*0], xm0 vextracti128 [dstq+strideq*1], m0, 1 lea dstq, [dstq+strideq*2] sub hd, 2 jg .w16_leftonly_loop .w16_end: sub r8d, 1<<8 jl .w16_ret vpbroadcastd m0, [rsp+156] paddw m0, [rsp+160] ; base_y += 16*dy paddw m6, m13, [rsp+192] add r7, 16 add r9d, 16<<6 movzx hd, r8b mov dstq, r7 paddw m6, m13 ; base_x += 16*64 jmp .w16_loop0 .w16_ret: RET .w32: mova m2, [tlq+32] lea r8d, [hq+(1<<8)] mova [rsp+96], m2 test angled, 0x400 jnz .w16_main vpbroadcastd m7, [base+z_filter_k+4*2+12*0] vpbroadcastd m8, [base+z_filter_k+4*2+12*1] vpbroadcastd m9, [base+z_filter_k+4*2+12*2] mova xm5, [base+z_filter_s] vinserti128 m5, [base+z_filter_s+10], 1 ; 00 01 12 23 34 45 56 67 45 56 67 78 89 9a ab bc vinserti128 m1, [tlq+11], 1 movu xm6, [base+z_filter_s+12] vinserti128 m6, 
[base+z_filter_s+22], 1 ; 56 67 78 89 9a ab bc cd ab bc cd de ef ff ff ff movu xm3, [tlq+ 6] vinserti128 m3, [tlq+17], 1 movd xm0, r6m ; max_width pminsw xm0, xm15 vpbroadcastb m10, xm0 .w32_filter_above: pshufb m0, m1, m5 shufps m4, m5, m6, q1021 ; 12 23 34 45 56 67 78 89 67 78 89 9a ab bc cd de pmaddubsw m0, m7 pshufb m2, m1, m4 shufps m5, m6, q2132 ; 34 45 56 67 78 89 9a ab 89 9a ab bc cd de ef ff pmaddubsw m2, m8 pshufb m1, m5 pmaddubsw m1, m9 paddw m0, m2 paddw m0, m1 pshufb m1, m3, m4 pmaddubsw m1, m7 pshufb m2, m3, m5 pmaddubsw m2, m8 pshufb m3, m6 pmaddubsw m3, m9 paddw m1, m2 paddw m1, m3 pmulhrsw m0, m13 pmulhrsw m1, m13 psubb m10, [base+pb_1to32] packuswb m0, m1 vpblendvb m0, [tlq+1], m10 movu [rsp+65], m0 jmp .w16_filter_left .w64: mova m2, [tlq+32] mov r3d, [tlq+64] lea r8d, [hq+(3<<8)] mova [rsp+ 96], m2 mov [rsp+128], r3d test angled, 0x400 jnz .w16_main vpbroadcastd m7, [base+z_filter_k+4*2+12*0] vpbroadcastd m8, [base+z_filter_k+4*2+12*1] vpbroadcastd m9, [base+z_filter_k+4*2+12*2] movu xm6, [base+z_filter_s+ 4] vinserti128 m6, [base+z_filter_s+10], 1 ; 12 23 34 45 56 67 78 89 45 56 67 78 89 9a ab bc movu xm3, [tlq+30] vinserti128 m3, [tlq+43], 1 movu xm5, [base+z_filter_s+16] vinserti128 m5, [base+z_filter_s+22], 1 ; 78 89 9a ab bc cd de ef ab bc cd de ef ff ff ff pshufb m0, m3, m6 shufps m4, m6, m5, q1021 ; 34 45 56 67 78 89 9a ab 67 78 89 9a ab bc cd de pmaddubsw m0, m7 pshufb m2, m3, m4 shufps m6, m5, q2132 ; 56 67 78 89 9a ab bc cd 89 9a ab bc cd de ef ff pmaddubsw m2, m8 pshufb m3, m6 pmaddubsw m3, m9 paddw m0, m2 paddw m0, m3 movu xm2, [tlq+36] vinserti128 m2, [tlq+49], 1 pshufb m4, m2, m4 pmaddubsw m4, m7 pshufb m3, m2, m6 pmaddubsw m3, m8 pshufb m2, m5 pmaddubsw m2, m9 movd xm5, r6m ; max_width pminsw xm5, xm15 vpbroadcastb m10, xm5 paddw m3, m4 paddw m2, m3 vpbroadcastd m3, [base+pb_32] pmulhrsw m0, m13 pmulhrsw m2, m13 mova xm5, [base+z_filter_s] vinserti128 m5, [base+z_filter_s+6], 1 psubb m3, m10, m3 psubb m3, [base+pb_1to32] vinserti128 m1, [tlq+13], 1 packuswb m0, m2 vpblendvb m0, [tlq+33], m3 movu xm3, [tlq+ 6] vinserti128 m3, [tlq+19], 1 movu [rsp+97], m0 jmp .w32_filter_above cglobal ipred_z3_8bpc, 4, 9, 0, dst, stride, tl, w, h, angle, dy, org_w, maxbase %assign org_stack_offset stack_offset lea r6, [ipred_z3_avx2_table] tzcnt hd, hm movifnidn angled, anglem lea r7, [dr_intra_derivative+45*2-1] dec tlq movsxd hq, [r6+hq*4] sub angled, 180 add hq, r6 mov dyd, angled neg dyd xor angled, 0x400 or dyq, ~0x7e movzx dyd, word [r7+dyq] vpbroadcastd m3, [pw_512] vpbroadcastd m4, [pw_62] vpbroadcastd m5, [pw_64] mov org_wd, wd jmp hq .h4: lea r7, [strideq*3] cmp angleb, 40 jae .h4_no_upsample lea r4d, [angleq-1024] sar r4d, 7 add r4d, wd jg .h4_no_upsample ; !enable_intra_edge_filter || w > 8 || (w == 8 && is_sm) ALLOC_STACK -32, 9 movu xm8, [tlq-7] pshufb xm0, xm8, [z_upsample1-4] vpbroadcastb xm2, xm8 pshufb xm1, xm8, [z_filter_s+2] mova [rsp+16], xm2 ; top[max_base_y] vpbroadcastd xm2, [pb_36_m4] add dyd, dyd pmaddubsw xm0, xm2 pmaddubsw xm1, xm2 movd xm7, dyd mov r2d, dyd vpbroadcastw m7, xm7 paddw xm1, xm0 pmulhrsw xm1, xm3 pslldq m6, m7, 8 paddw xm2, xm7, xm7 paddw m6, m7 packuswb xm1, xm1 paddw m6, m2 punpcklbw xm1, xm8 mova xm8, [z_transpose4] psllw m7, 2 pshufb xm1, [pb_15to0] mova [rsp], xm1 .h4_upsample_loop: lea r4d, [r2+dyq] shr r2d, 6 vpbroadcastq m1, [rsp+r2] lea r2d, [r4+dyq] shr r4d, 6 vpbroadcastq m2, [rsp+r4] lea r4d, [r2+dyq] shr r2d, 6 movq xm0, [rsp+r2] lea r2d, [r4+dyq] shr r4d, 6 movhps xm0, [rsp+r4] vpblendd m1, m2, 0xc0 pand m2, m4, m6 
vpblendd m0, m1, 0xf0 psubw m1, m5, m2 psllw m2, 8 por m1, m2 pmaddubsw m0, m1 paddw m6, m7 pmulhrsw m0, m3 vextracti128 xm1, m0, 1 packuswb xm1, xm0 pshufb xm1, xm8 movd [dstq+strideq*0], xm1 pextrd [dstq+strideq*1], xm1, 1 pextrd [dstq+strideq*2], xm1, 2 pextrd [dstq+r7 ], xm1, 3 add dstq, 4 sub wd, 4 jg .h4_upsample_loop RET ALIGN function_align .filter_strength: ; h4/h8/h16 %define base r4-z_filter_t0 lea r4, [z_filter_t0] movd xm0, maxbased movd xm2, angled shr angled, 8 ; is_sm << 1 vpbroadcastb m0, xm0 vpbroadcastb m2, xm2 pcmpeqb m1, m0, [base+z_filter_wh] pand m1, m2 mova xm2, [r4+angleq*8] pcmpgtb m1, m2 pmovmskb r5d, m1 ret .h4_no_upsample: %assign stack_offset org_stack_offset ALLOC_STACK -16, 12 mov maxbased, 7 test angled, 0x400 ; !enable_intra_edge_filter jnz .h4_main lea maxbased, [wq+3] call .filter_strength mov maxbased, 7 test r5d, r5d jz .h4_main ; filter_strength == 0 popcnt r5d, r5d vpbroadcastd m7, [base+pb_7] vbroadcasti128 m2, [tlq-14] pmaxub m1, m7, [base+z_filter_s-4] vpbroadcastd m8, [base+z_filter_k-4+r5*4+12*0] pmaxub m7, [base+z_filter_s+4] vpbroadcastd m9, [base+z_filter_k-4+r5*4+12*1] vpbroadcastd m10, [base+z_filter_k-4+r5*4+12*2] pshufb m0, m2, m1 shufps m1, m7, q2121 pmaddubsw m0, m8 pshufb m1, m2, m1 pmaddubsw m1, m9 pshufb m2, m7 pmaddubsw m2, m10 paddw m0, m1 paddw m0, m2 pmulhrsw m0, m3 mov r4d, 9 lea tlq, [rsp+15] cmp wd, 4 cmovne maxbased, r4d vextracti128 xm1, m0, 1 packuswb xm0, xm1 mova [rsp], xm0 .h4_main: movd xm6, dyd vpbroadcastq m0, [z_base_inc] ; base_inc << 6 mov r4, tlq sub tlq, 4 neg dyq vpbroadcastw m6, xm6 sub r4, maxbaseq shl maxbased, 6 vpbroadcastb m7, [r4] lea r4, [dyq+63] ; ypos movd xm9, maxbased not maxbased vbroadcasti128 m8, [z3_shuf_w4] add maxbased, 64 vpbroadcastw m9, xm9 psrlw m7, 8 ; top[max_base_y] paddw m10, m6, m6 psubw m9, m0 ; max_base_y vpblendd m6, m10, 0xcc mova xm0, xm10 paddw m6, m0 ; ypos2 ypos3 ypos0 ypos1 paddw m10, m10 mova xm11, [z_transpose4] .h4_loop: lea r5, [r4+dyq] sar r4, 6 ; base0 vpbroadcastq m1, [tlq+r4] lea r4, [r5+dyq] sar r5, 6 ; base1 vpbroadcastq m2, [tlq+r5] lea r5, [r4+dyq] sar r4, 6 ; base2 movq xm0, [tlq+r4] lea r4, [r5+dyq] sar r5, 6 ; base3 movhps xm0, [tlq+r5] vpblendd m1, m2, 0xc0 pand m2, m4, m6 ; frac vpblendd m0, m1, 0xf0 psubw m1, m5, m2 ; 64-frac psllw m2, 8 pshufb m0, m8 por m1, m2 ; 64-frac, frac pmaddubsw m0, m1 pcmpgtw m1, m9, m6 ; base < max_base_y pmulhrsw m0, m3 paddw m6, m10 ; ypos += dy vpblendvb m0, m7, m0, m1 vextracti128 xm1, m0, 1 packuswb xm1, xm0 pshufb xm1, xm11 ; transpose movd [dstq+strideq*0], xm1 pextrd [dstq+strideq*1], xm1, 1 pextrd [dstq+strideq*2], xm1, 2 pextrd [dstq+r7 ], xm1, 3 sub wd, 4 jz .h4_end add dstq, 4 cmp r4d, maxbased jg .h4_loop packuswb xm7, xm7 .h4_end_loop: movd [dstq+strideq*0], xm7 movd [dstq+strideq*1], xm7 movd [dstq+strideq*2], xm7 movd [dstq+r7 ], xm7 add dstq, 4 sub wd, 4 jg .h4_end_loop .h4_end: RET ALIGN function_align .h8: lea r4d, [angleq+216] mov r4b, wb cmp r4d, 8 ja .h8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || w > 8 %assign stack_offset org_stack_offset ALLOC_STACK -32, 8 and r4d, 4 mova xm0, [tlq-15] vinserti128 m0, [tlq- 9], 1 movd xm1, r4d movu xm2, [z_filter_s+2] vinserti128 m2, [z_filter_s+6], 1 vpbroadcastb xm1, xm1 ; w & 4 vpbroadcastd m7, [pb_36_m4] pmaxub xm1, [z_upsample1-4] ; clip 4x8 vinserti128 m1, [z_upsample1], 1 add dyd, dyd pshufb m1, m0, m1 pshufb m2, m0, m2 vinserti128 m0, [tlq-7], 1 movd xm6, dyd pmaddubsw m1, m7 pmaddubsw m2, m7 vpbroadcastw m6, xm6 mov r2d, dyd lea r5, [strideq*3] 
paddw m7, m6, m6 paddw m1, m2 vpblendd m6, m7, 0xf0 pmulhrsw m1, m3 pslldq m2, m7, 8 paddw m7, m7 paddw m6, m2 vbroadcasti128 m2, [pb_15to0] packuswb m1, m1 punpcklbw m1, m0 pshufb m1, m2 vextracti128 [rsp+ 0], m1, 1 mova [rsp+16], xm1 .h8_upsample_loop: lea r4d, [r2+dyq] shr r2d, 6 ; base0 movu xm0, [rsp+r2] lea r2d, [r4+dyq] shr r4d, 6 ; base1 vinserti128 m0, [rsp+r4], 1 lea r4d, [r2+dyq] shr r2d, 6 ; base2 pand m1, m4, m6 psubw m2, m5, m1 psllw m1, 8 por m2, m1 punpcklqdq m1, m2, m2 ; frac0 frac1 pmaddubsw m0, m1 movu xm1, [rsp+r2] lea r2d, [r4+dyq] shr r4d, 6 ; base3 vinserti128 m1, [rsp+r4], 1 punpckhqdq m2, m2 ; frac2 frac3 pmaddubsw m1, m2 pmulhrsw m0, m3 paddw m6, m7 pmulhrsw m1, m3 lea r4, [dstq+strideq*4] psllw m1, 8 por m0, m1 vextracti128 xm1, m0, 1 punpcklbw xm2, xm0, xm1 punpckhbw xm0, xm1 movd [dstq+strideq*0], xm2 pextrd [dstq+strideq*1], xm2, 1 pextrd [dstq+strideq*2], xm2, 2 pextrd [dstq+r5 ], xm2, 3 movd [r4 +strideq*0], xm0 pextrd [r4 +strideq*1], xm0, 1 pextrd [r4 +strideq*2], xm0, 2 pextrd [r4 +r5 ], xm0, 3 add dstq, 4 sub wd, 4 jg .h8_upsample_loop RET .h8_no_intra_edge_filter: and maxbased, 7 or maxbased, 8 ; imin(w+7, 15) jmp .h8_main .h8_no_upsample: %assign stack_offset org_stack_offset ALLOC_STACK -32, 10 lea maxbased, [wq+7] test angled, 0x400 jnz .h8_no_intra_edge_filter call .filter_strength test r5d, r5d jz .h8_main ; filter_strength == 0 popcnt r5d, r5d vpbroadcastd xm6, [base+pb_15] pcmpeqb xm1, xm1 psubusb xm6, xm0 psubb xm6, xm1 ; w == 4 ? 5 : 1 movu xm2, [tlq-16] pmaxub xm1, xm6, [base+z_filter_s] vinserti128 m2, [tlq-14], 1 vinserti128 m1, [base+z_filter_s+12], 1 vpbroadcastd m7, [base+z_filter_k-4+r5*4+12*0] pmaxub xm6, [base+z_filter_s+ 8] vinserti128 m6, [base+z_filter_s+20], 1 pshufb m0, m2, m1 pmaddubsw m0, m7 vpbroadcastd m7, [base+z_filter_k-4+r5*4+12*1] movzx r4d, byte [tlq-15] shufps m1, m6, q2121 pshufb m1, m2, m1 pmaddubsw m1, m7 paddw m0, m1 sub r5d, 3 jnz .h8_3tap vpbroadcastd m7, [z_filter_k+4*8] movzx r2d, byte [tlq-14] pshufb m2, m6 pmaddubsw m2, m7 sub r2d, r4d lea r2d, [r2+r4*8+4] shr r2d, 3 mov [rsp+15], r2b paddw m0, m2 .h8_3tap: pmulhrsw m0, m3 sar r5d, 1 lea tlq, [rsp+31] add r5d, 17 cmp wd, 16 cmovns maxbased, r5d neg r5 mov [tlq+r5], r4b vextracti128 xm1, m0, 1 packuswb xm0, xm1 mova [tlq-15], xm0 .h8_main: movd xm2, dyd vbroadcasti128 m0, [z_base_inc] mov r4, tlq sub tlq, 8 neg dyq vpbroadcastw m2, xm2 sub r4, maxbaseq shl maxbased, 6 vpbroadcastb m7, [r4] lea r4, [dyq+63] movd xm9, maxbased not maxbased vbroadcasti128 m8, [z3_shuf] add maxbased, 64 vpbroadcastw m9, xm9 psrlw m7, 8 psubw m9, m0 paddw m6, m2, m2 vpblendd m2, m6, 0x0f .h8_loop: lea r5, [r4+dyq] sar r4, 6 pand m0, m4, m2 psubw m1, m5, m0 psllw m0, 8 por m1, m0 vbroadcasti128 m0, [tlq+r4] lea r4, [r5+dyq] sar r5, 6 vinserti128 m0, [tlq+r5], 0 sub rsp, 8*2 pshufb m0, m8 pmaddubsw m0, m1 pcmpgtw m1, m9, m2 paddw m2, m6 pmulhrsw m0, m3 vpblendvb m0, m7, m0, m1 vextracti128 xm1, m0, 1 psllw xm0, 8 por xm0, xm1 ; interleave rows (partial transpose) mova [rsp], xm0 sub wd, 2 jz .h8_transpose cmp r4d, maxbased jg .h8_loop packuswb xm0, xm7, xm7 .h8_end_loop: sub rsp, 8*2 mova [rsp], xm0 sub wd, 2 jg .h8_end_loop .h8_transpose: mova xm2, [rsp+16*1] sub org_wd, 8 lea r2, [strideq*3] lea r6, [dstq+org_wq] cmovns dstq, r6 punpcklwd xm1, xm2, xm0 punpckhwd xm2, xm0 lea r6, [dstq+strideq*4] jge .h8_w8 add rsp, 16*2 movd [dstq+strideq*0], xm1 pextrd [dstq+strideq*1], xm1, 1 pextrd [dstq+strideq*2], xm1, 2 pextrd [dstq+r2 ], xm1, 3 movd [r6 +strideq*0], xm2 pextrd [r6 
+strideq*1], xm2, 1 pextrd [r6 +strideq*2], xm2, 2 pextrd [r6 +r2 ], xm2, 3 jmp .h8_end .h8_w8_loop: mova xm0, [rsp+16*0] mova xm2, [rsp+16*1] punpcklwd xm1, xm2, xm0 punpckhwd xm2, xm0 .h8_w8: ; w8/w16/w32 mova xm0, [rsp+16*2] mova xm4, [rsp+16*3] add rsp, 16*4 punpcklwd xm3, xm4, xm0 punpckhwd xm4, xm0 punpckldq xm0, xm3, xm1 punpckhdq xm3, xm1 punpckldq xm1, xm4, xm2 punpckhdq xm4, xm2 movq [dstq+strideq*0], xm0 movhps [dstq+strideq*1], xm0 movq [dstq+strideq*2], xm3 movhps [dstq+r2 ], xm3 movq [r6 +strideq*0], xm1 movhps [r6 +strideq*1], xm1 movq [r6 +strideq*2], xm4 movhps [r6 +r2 ], xm4 sub dstq, 8 sub r6, 8 sub org_wd, 8 jge .h8_w8_loop .h8_end: RET .h16_no_intra_edge_filter: and maxbased, 15 or maxbased, 16 ; imin(w+15, 31) jmp .h16_main ALIGN function_align .h16: %assign stack_offset org_stack_offset ALLOC_STACK -64, 12 lea maxbased, [wq+15] test angled, 0x400 jnz .h16_no_intra_edge_filter call .filter_strength test r5d, r5d jz .h16_main ; filter_strength == 0 popcnt r5d, r5d vpbroadcastd m11, [base+pb_27] vpbroadcastd m1, [base+pb_1] vbroadcasti128 m6, [base+z_filter_s+12] vinserti128 m2, m6, [base+z_filter_s+4], 0 vinserti128 m6, [base+z_filter_s+20], 1 movu xm10, [tlq-18] vinserti128 m10, [tlq-14], 1 vpbroadcastd m9, [base+z_filter_k-4+r5*4+12*0] vbroadcasti128 m7, [base+z_filter_s+8] vinserti128 m8, m7, [base+z_filter_s+0], 0 vinserti128 m7, [base+z_filter_s+16], 1 psubusb m11, m0 por m1, m11 movu xm11, [tlq-32] vinserti128 m11, [tlq-28], 1 pmaxub m8, m1 pmaxub m7, m1 pshufb m0, m10, m2 shufps m2, m6, q2121 pmaddubsw m0, m9 pshufb m1, m11, m8 shufps m8, m7, q2121 pmaddubsw m1, m9 vpbroadcastd m9, [base+z_filter_k-4+r5*4+12*1] movzx r4d, byte [tlq-31] pshufb m2, m10, m2 pmaddubsw m2, m9 pshufb m8, m11, m8 pmaddubsw m8, m9 paddw m0, m2 paddw m1, m8 sub r5d, 3 jnz .h16_3tap vpbroadcastd m9, [z_filter_k+4*8] movzx r2d, byte [tlq-30] pshufb m10, m6 pmaddubsw m10, m9 pshufb m11, m7 pmaddubsw m11, m9 sub r2d, r4d lea r2d, [r2+r4*8+4] shr r2d, 3 mov [rsp+31], r2b paddw m0, m10 paddw m1, m11 .h16_3tap: pmulhrsw m0, m3 pmulhrsw m1, m3 sar r5d, 1 lea tlq, [rsp+63] add r5d, 33 cmp wd, 32 cmovns maxbased, r5d neg r5 mov [tlq+r5], r4b packuswb m0, m1 vpermq m0, m0, q2031 mova [tlq-31], m0 .h16_main: movd xm6, dyd vbroadcasti128 m0, [z_base_inc] mov r4, tlq sub tlq, 8 neg dyq vpbroadcastw m6, xm6 sub r4, maxbaseq shl maxbased, 6 vpbroadcastb m7, [r4] lea r4, [dyq+63] movd xm9, maxbased not maxbased vbroadcasti128 m8, [z3_shuf] add maxbased, 64 vpbroadcastw m9, xm9 psubw m9, m0 paddw m11, m6, m6 psubw m10, m9, m3 ; 64*8 vpblendd m6, m11, 0xf0 .h16_loop: lea r5, [r4+dyq] sar r4, 6 pand m1, m4, m6 psubw m2, m5, m1 psllw m1, 8 por m2, m1 movu xm0, [tlq+r4-0] movu xm1, [tlq+r4-8] lea r4, [r5+dyq] sar r5, 6 vinserti128 m0, [tlq+r5-0], 1 vinserti128 m1, [tlq+r5-8], 1 sub rsp, 32 pshufb m0, m8 pshufb m1, m8 pmaddubsw m0, m2 pmaddubsw m1, m2 pmulhrsw m0, m3 pmulhrsw m1, m3 packuswb m0, m1 pcmpgtw m1, m9, m6 pcmpgtw m2, m10, m6 packsswb m1, m2 paddw m6, m11 vpblendvb m0, m7, m0, m1 vpermq m0, m0, q3120 mova [rsp], m0 sub wd, 2 jz .h16_transpose cmp r4d, maxbased jg .h16_loop mova m0, m7 .h16_end_loop: sub rsp, 32 mova [rsp], m7 sub wd, 2 jg .h16_end_loop .h16_transpose: mova m2, [rsp+32*1] sub org_wd, 8 lea r2, [strideq*3] lea r6, [dstq+org_wq] cmovns dstq, r6 punpcklbw m1, m2, m0 punpckhbw m2, m0 lea r3, [strideq*5] punpcklbw m0, m1, m2 punpckhbw m1, m2 lea r4, [strideq+r2*2] ; stride*7 jge .h16_w8 add rsp, 32*2 movd [dstq+strideq*0], xm0 pextrd [dstq+strideq*1], xm0, 1 pextrd [dstq+strideq*2], xm0, 
2 pextrd [dstq+r2 ], xm0, 3 vextracti128 xm0, m0, 1 movd [dstq+strideq*4], xm1 pextrd [dstq+r3 ], xm1, 1 pextrd [dstq+r2*2 ], xm1, 2 pextrd [dstq+r4 ], xm1, 3 lea dstq, [dstq+strideq*8] vextracti128 xm1, m1, 1 movd [dstq+strideq*0], xm0 pextrd [dstq+strideq*1], xm0, 1 pextrd [dstq+strideq*2], xm0, 2 pextrd [dstq+r2 ], xm0, 3 movd [dstq+strideq*4], xm1 pextrd [dstq+r3 ], xm1, 1 pextrd [dstq+r2*2 ], xm1, 2 pextrd [dstq+r4 ], xm1, 3 jmp .h16_end .h16_w8_loop: mova m0, [rsp+32*0] mova m2, [rsp+32*1] punpcklbw m1, m2, m0 punpckhbw m2, m0 punpcklbw m0, m1, m2 punpckhbw m1, m2 .h16_w8: mova m2, [rsp+32*2] mova m4, [rsp+32*3] lea r6, [dstq+strideq*8] add rsp, 32*4 punpcklbw m3, m4, m2 punpckhbw m4, m2 punpcklbw m2, m3, m4 punpckhbw m3, m4 punpckldq m4, m2, m0 punpckhdq m2, m0 punpckldq m0, m3, m1 punpckhdq m3, m1 movq [dstq+strideq*0], xm4 movhps [dstq+strideq*1], xm4 vextracti128 xm4, m4, 1 movq [dstq+strideq*2], xm2 movhps [dstq+r2 ], xm2 vextracti128 xm2, m2, 1 movq [dstq+strideq*4], xm0 movhps [dstq+r3 ], xm0 vextracti128 xm0, m0, 1 movq [dstq+r2*2 ], xm3 movhps [dstq+r4 ], xm3 vextracti128 xm3, m3, 1 movq [r6+strideq*0], xm4 movhps [r6+strideq*1], xm4 movq [r6+strideq*2], xm2 movhps [r6+r2 ], xm2 movq [r6+strideq*4], xm0 movhps [r6+r3 ], xm0 movq [r6+r2*2 ], xm3 movhps [r6+r4 ], xm3 sub dstq, 8 sub org_wd, 8 jge .h16_w8_loop .h16_end: RET ALIGN function_align .h32: %assign stack_offset org_stack_offset ALLOC_STACK -96, 15 lea maxbased, [wq+31] and maxbased, 31 or maxbased, 32 ; imin(w+31, 63) test angled, 0x400 ; !enable_intra_edge_filter jnz .h32_main vbroadcasti128 m0, [pb_0to15] mov r4d, 21 mov r5d, 3 movu xm11, [tlq-66] ; 56-63 vinserti128 m11, [tlq-52], 1 ; 40-47 sub r4d, wd ; 21-w cmovns r5d, r4d movu xm12, [tlq-58] ; 48-55 vinserti128 m12, [tlq-44], 1 ; 32-39 sub r4d, 8 ; 13-w movd xm1, r5d movu xm13, [tlq-34] ; 24-31 vinserti128 m13, [tlq-20], 1 ; 8-15 movd xm2, r4d vpbroadcastb m1, xm1 movu xm14, [tlq-28] ; 16-23 vinserti128 m14, [tlq-14], 1 ; 0- 7 vpbroadcastb m2, xm2 pmaxsb m1, m0 ; clip 16x32 and (32|64)x32 movu m7, [z_filter_s+4] pshufb m11, m1 vinserti128 m8, m7, [z_filter_s+8], 1 vinserti128 m7, [z_filter_s+16], 0 pmaxsb m2, m0 ; clip 8x32 vpbroadcastd m9, [z_filter_k+4*2+12*0] pshufb m12, m2 pshufb m0, m11, m8 pmaddubsw m0, m9 pshufb m2, m12, m8 pmaddubsw m2, m9 pshufb m1, m13, m8 pmaddubsw m1, m9 shufps m8, m7, q1021 pshufb m6, m14, m8 pmaddubsw m6, m9 vpbroadcastd m9, [z_filter_k+4*2+12*1] pshufb m10, m11, m8 pmaddubsw m10, m9 paddw m0, m10 pshufb m10, m12, m8 pmaddubsw m10, m9 paddw m2, m10 pshufb m10, m13, m8 pmaddubsw m10, m9 shufps m8, m7, q2121 paddw m1, m10 pshufb m10, m14, m8 pmaddubsw m10, m9 paddw m6, m10 vpbroadcastd m9, [z_filter_k+4*2+12*2] pshufb m11, m8 pmaddubsw m11, m9 pshufb m12, m8 pmaddubsw m12, m9 movzx r4d, byte [tlq-63] movzx r2d, byte [tlq-62] paddw m0, m11 paddw m2, m12 pshufb m13, m8 pmaddubsw m13, m9 pshufb m14, m7 pmaddubsw m14, m9 paddw m1, m13 paddw m6, m14 sub r2d, r4d lea r2d, [r2+r4*8+4] ; edge case for 64x32 pmulhrsw m0, m3 pmulhrsw m2, m3 pmulhrsw m1, m3 pmulhrsw m6, m3 shr r2d, 3 mov [rsp+31], r2b lea tlq, [rsp+95] mov [tlq-65], r4b mov r4d, 65 cmp wd, 64 cmove maxbased, r4d packuswb m0, m2 packuswb m1, m6 mova [tlq-63], m0 mova [tlq-31], m1 .h32_main: movd xm6, dyd mov r4, tlq sub tlq, 8 neg dyq vpbroadcastw m6, xm6 sub r4, maxbaseq shl maxbased, 6 vpbroadcastb m7, [r4] lea r4, [dyq+63] movd xm9, maxbased not maxbased vbroadcasti128 m8, [z3_shuf] add maxbased, 64 vpbroadcastw m9, xm9 psubw m9, [z_base_inc] mova m11, m6 psubw m10, m9, m3 ; 
64*8 .h32_loop: mov r5, r4 sar r5, 6 pand m1, m4, m6 psubw m2, m5, m1 psllw m1, 8 por m2, m1 movu xm0, [tlq+r5- 0] vinserti128 m0, [tlq+r5-16], 1 movu xm1, [tlq+r5- 8] vinserti128 m1, [tlq+r5-24], 1 sub rsp, 32 add r4, dyq pshufb m0, m8 pshufb m1, m8 pmaddubsw m0, m2 pmaddubsw m1, m2 pmulhrsw m0, m3 pmulhrsw m1, m3 packuswb m0, m1 pcmpgtw m1, m9, m6 pcmpgtw m2, m10, m6 packsswb m1, m2 paddw m6, m11 vpblendvb m0, m7, m0, m1 mova [rsp], m0 dec wd jz .h32_transpose cmp r4d, maxbased jg .h32_loop .h32_end_loop: sub rsp, 32 mova [rsp], m7 dec wd jg .h32_end_loop .h32_transpose: lea dstq, [dstq+org_wq-8] lea r2, [strideq*3] lea r3, [strideq*5] lea r4, [strideq+r2*2] ; stride*7 .h32_w8_loop: mova m7, [rsp+32*0] mova m6, [rsp+32*1] mova m5, [rsp+32*2] mova m4, [rsp+32*3] mova m3, [rsp+32*4] mova m2, [rsp+32*5] mova m1, [rsp+32*6] mova m0, [rsp+32*7] lea r6, [dstq+strideq*8] add rsp, 32*8 punpcklbw m8, m0, m1 punpckhbw m0, m1 punpcklbw m1, m2, m3 punpckhbw m2, m3 punpcklbw m3, m4, m5 punpckhbw m4, m5 punpcklbw m5, m6, m7 punpckhbw m6, m7 punpcklwd m7, m8, m1 punpckhwd m8, m1 punpcklwd m1, m0, m2 punpckhwd m0, m2 punpcklwd m2, m3, m5 punpckhwd m3, m5 punpcklwd m5, m4, m6 punpckhwd m4, m6 punpckldq m6, m7, m2 punpckhdq m7, m2 punpckldq m2, m8, m3 punpckhdq m8, m3 punpckldq m3, m1, m5 punpckhdq m1, m5 punpckldq m5, m0, m4 punpckhdq m0, m4 movq [dstq+strideq*0], xm6 movhps [dstq+strideq*1], xm6 vextracti128 xm6, m6, 1 movq [dstq+strideq*2], xm7 movhps [dstq+r2 ], xm7 vextracti128 xm7, m7, 1 movq [dstq+strideq*4], xm2 movhps [dstq+r3 ], xm2 vextracti128 xm2, m2, 1 movq [dstq+r2*2 ], xm8 movhps [dstq+r4 ], xm8 vextracti128 xm8, m8, 1 movq [r6+strideq*0], xm3 movhps [r6+strideq*1], xm3 vextracti128 xm3, m3, 1 movq [r6+strideq*2], xm1 movhps [r6+r2 ], xm1 vextracti128 xm1, m1, 1 movq [r6+strideq*4], xm5 movhps [r6+r3 ], xm5 vextracti128 xm5, m5, 1 movq [r6+r2*2 ], xm0 movhps [r6+r4 ], xm0 lea r6, [r6+strideq*8] vextracti128 xm0, m0, 1 movq [r6+strideq*0], xm6 movhps [r6+strideq*1], xm6 movq [r6+strideq*2], xm7 movhps [r6+r2 ], xm7 movq [r6+strideq*4], xm2 movhps [r6+r3 ], xm2 movq [r6+r2*2 ], xm8 movhps [r6+r4 ], xm8 lea r6, [r6+strideq*8] movq [r6+strideq*0], xm3 movhps [r6+strideq*1], xm3 movq [r6+strideq*2], xm1 movhps [r6+r2 ], xm1 movq [r6+strideq*4], xm5 movhps [r6+r3 ], xm5 movq [r6+r2*2 ], xm0 movhps [r6+r4 ], xm0 sub dstq, 8 sub org_wd, 8 jg .h32_w8_loop RET ALIGN function_align .h64: %assign stack_offset org_stack_offset ALLOC_STACK -128, 16 lea maxbased, [wq+63] test angled, 0x400 ; !enable_intra_edge_filter jnz .h64_main mov r4d, 21 vpbroadcastb xm11, [tlq-127] vpblendd xm11, [tlq-130], 0x0e ; 120-127 sub r4d, wd ; 21-w mov r5d, 3 vinserti128 m11, [tlq-116], 1 ; 104-111 movu m7, [z_filter_s+4] cmp wd, 32 cmove r4d, r5d vinserti128 m8, m7, [z_filter_s+8], 1 vbroadcasti128 m6, [pb_0to15] movd xm1, r4d vpbroadcastd m9, [z_filter_k+4*2+12*0] movu xm12, [tlq-122] ; 112-119 vinserti128 m12, [tlq-108], 1 ; 96-103 vpbroadcastb m1, xm1 movu xm13, [tlq- 98] ; 88- 95 vinserti128 m13, [tlq- 84], 1 ; 72- 79 movu xm14, [tlq- 90] ; 80- 87 vinserti128 m14, [tlq- 76], 1 ; 64- 71 vinserti128 m7, [z_filter_s+16], 0 pshufb m0, m11, m8 pmaddubsw m0, m9 pshufb m2, m12, m8 pmaddubsw m2, m9 pmaxsb m1, m6 ; clip (16|32)x64 pshufb m13, m1 pshufb m1, m13, m8 pmaddubsw m1, m9 pshufb m6, m14, m8 pmaddubsw m6, m9 vpbroadcastd m9, [z_filter_k+4*2+12*1] shufps m15, m8, m7, q1021 pshufb m10, m11, m15 pmaddubsw m10, m9 paddw m0, m10 pshufb m10, m12, m15 pmaddubsw m10, m9 paddw m2, m10 pshufb m10, m13, m15 pmaddubsw m10, m9 
paddw m1, m10 pshufb m10, m14, m15 pmaddubsw m10, m9 paddw m6, m10 vpbroadcastd m9, [z_filter_k+4*2+12*2] shufps m10, m8, m7, q2132 pshufb m11, m10 pmaddubsw m11, m9 pshufb m12, m10 pmaddubsw m12, m9 pshufb m13, m10 pmaddubsw m13, m9 pshufb m14, m10 pmaddubsw m14, m9 paddw m0, m11 paddw m2, m12 paddw m1, m13 paddw m6, m14 movu xm11, [tlq-66] ; 56-63 vinserti128 m11, [tlq-52], 1 ; 40-47 movu xm12, [tlq-58] ; 48-55 vinserti128 m12, [tlq-44], 1 ; 32-39 movu xm13, [tlq-34] ; 24-31 vinserti128 m13, [tlq-20], 1 ; 8-15 movu xm14, [tlq-28] ; 16-23 vinserti128 m14, [tlq-14], 1 ; 0- 7 pmulhrsw m0, m3 pmulhrsw m2, m3 pmulhrsw m1, m3 pmulhrsw m6, m3 lea tlq, [rsp+127] packuswb m0, m2 packuswb m1, m6 mova [tlq-127], m0 mova [tlq- 95], m1 pshufb m0, m11, m10 pmaddubsw m0, m9 pshufb m2, m12, m10 pmaddubsw m2, m9 pshufb m1, m13, m10 pmaddubsw m1, m9 pshufb m6, m14, m7 pmaddubsw m6, m9 vpbroadcastd m9, [z_filter_k+4*2+12*1] pshufb m7, m11, m15 pmaddubsw m7, m9 paddw m0, m7 pshufb m7, m12, m15 pmaddubsw m7, m9 paddw m2, m7 pshufb m7, m13, m15 pmaddubsw m7, m9 paddw m1, m7 pshufb m7, m14, m10 pmaddubsw m7, m9 paddw m6, m7 vpbroadcastd m9, [z_filter_k+4*2+12*0] pshufb m11, m8 pmaddubsw m11, m9 pshufb m12, m8 pmaddubsw m12, m9 pshufb m13, m8 pmaddubsw m13, m9 pshufb m14, m15 pmaddubsw m14, m9 paddw m0, m11 paddw m2, m12 paddw m1, m13 paddw m6, m14 pmulhrsw m0, m3 pmulhrsw m2, m3 pmulhrsw m1, m3 pmulhrsw m6, m3 packuswb m0, m2 packuswb m1, m6 mova [tlq-63], m0 mova [tlq-31], m1 .h64_main: movd xm12, dyd neg maxbaseq vbroadcasti128 m8, [z3_shuf] vpbroadcastb m7, [tlq+maxbaseq] shl maxbased, 6 vpbroadcastw m12, xm12 lea r5d, [dyq+maxbaseq-64] neg dyq or maxbased, 63 lea r4, [dyq+63] movd xm6, r5d mova xm10, [pb_1to32+16] vinserti128 m10, [pb_1to32], 1 vpbroadcastd m11, [pb_32] vpbroadcastw m6, xm6 .h64_loop: mov r5, r4 sar r5, 6 movu m0, [tlq+r5-24] movu m1, [tlq+r5-32] pand m2, m4, m6 psubw m9, m5, m2 psllw m2, 8 por m9, m2 pshufb m0, m8 pshufb m1, m8 pmaddubsw m0, m9 pmaddubsw m1, m9 psraw m2, m6, 6 sub rsp, 64 pmulhrsw m0, m3 pmulhrsw m1, m3 packsswb m2, m2 paddb m2, m10 packuswb m0, m1 vpblendvb m0, m7, m0, m2 mova [rsp+32], m0 movu m0, [tlq+r5-56] movu m1, [tlq+r5-64] add r4, dyq pshufb m0, m8 pshufb m1, m8 pmaddubsw m0, m9 pmaddubsw m1, m9 paddb m2, m11 pmulhrsw m0, m3 pmulhrsw m1, m3 paddw m6, m12 packuswb m0, m1 vpblendvb m0, m7, m0, m2 mova [rsp], m0 dec wd jz .h64_transpose cmp r4d, maxbased jg .h64_loop .h64_end_loop: sub rsp, 64 mova [rsp+32], m7 mova [rsp+ 0], m7 dec wd jg .h64_end_loop .h64_transpose: lea r2, [strideq*3] lea r3, [strideq*5] imul r5, strideq, -8 lea dstq, [dstq+org_wq-16] lea r4, [strideq+r2*2] ; stride*7 .h64_transpose_loop0: lea r6, [rsp+16*3] .h64_transpose_loop: mova xm0, [r6+64*15] vinserti128 m0, [r6+64* 7], 1 mova xm1, [r6+64*14] vinserti128 m1, [r6+64* 6], 1 mova xm2, [r6+64*13] vinserti128 m2, [r6+64* 5], 1 mova xm3, [r6+64*12] vinserti128 m3, [r6+64* 4], 1 mova xm4, [r6+64*11] vinserti128 m4, [r6+64* 3], 1 mova xm5, [r6+64*10] vinserti128 m5, [r6+64* 2], 1 mova xm6, [r6+64* 9] vinserti128 m6, [r6+64* 1], 1 mova xm7, [r6+64* 8] vinserti128 m7, [r6+64* 0], 1 sub r6, 16 punpcklbw m8, m0, m1 punpckhbw m0, m1 punpcklbw m1, m2, m3 punpckhbw m2, m3 punpcklbw m3, m4, m5 punpckhbw m4, m5 punpcklbw m5, m6, m7 punpckhbw m6, m7 punpcklwd m7, m8, m1 punpckhwd m8, m1 punpcklwd m1, m0, m2 punpckhwd m0, m2 punpcklwd m2, m3, m5 punpckhwd m3, m5 punpcklwd m5, m4, m6 punpckhwd m4, m6 punpckldq m6, m7, m2 punpckhdq m7, m2 punpckldq m2, m8, m3 punpckhdq m8, m3 punpckldq m3, m1, m5 punpckhdq m1, 
m5 punpckldq m5, m0, m4 punpckhdq m0, m4 vpermq m6, m6, q3120 vpermq m7, m7, q3120 vpermq m2, m2, q3120 vpermq m8, m8, q3120 vpermq m3, m3, q3120 vpermq m1, m1, q3120 vpermq m5, m5, q3120 vpermq m0, m0, q3120 mova [dstq+strideq*0], xm6 vextracti128 [dstq+strideq*1], m6, 1 mova [dstq+strideq*2], xm7 vextracti128 [dstq+r2 ], m7, 1 mova [dstq+strideq*4], xm2 vextracti128 [dstq+r3 ], m2, 1 mova [dstq+r2*2 ], xm8 vextracti128 [dstq+r4 ], m8, 1 sub dstq, r5 mova [dstq+strideq*0], xm3 vextracti128 [dstq+strideq*1], m3, 1 mova [dstq+strideq*2], xm1 vextracti128 [dstq+r2 ], m1, 1 mova [dstq+strideq*4], xm5 vextracti128 [dstq+r3 ], m5, 1 mova [dstq+r2*2 ], xm0 vextracti128 [dstq+r4 ], m0, 1 sub dstq, r5 cmp r6, rsp jae .h64_transpose_loop add rsp, 64*16 lea dstq, [dstq+r5*8-16] sub org_wd, 16 jg .h64_transpose_loop0 .h64_end: RET %macro FILTER_XMM 4 ; dst, src, tmp, shuf %ifnum %4 pshufb xm%2, xm%4 %else pshufb xm%2, %4 %endif pshufd xm%1, xm%2, q0000 ; p0 p1 pmaddubsw xm%1, xm2 pshufd xm%3, xm%2, q1111 ; p2 p3 pmaddubsw xm%3, xm3 paddw xm%1, xm1 paddw xm%1, xm%3 pshufd xm%3, xm%2, q2222 ; p4 p5 pmaddubsw xm%3, xm4 paddw xm%1, xm%3 pshufd xm%3, xm%2, q3333 ; p6 __ pmaddubsw xm%3, xm5 paddw xm%1, xm%3 psraw xm%1, 4 packuswb xm%1, xm%1 %endmacro %macro FILTER_YMM 4 ; dst, src, tmp, shuf pshufb m%2, m%4 pshufd m%1, m%2, q0000 pmaddubsw m%1, m2 pshufd m%3, m%2, q1111 pmaddubsw m%3, m3 paddw m%1, m1 paddw m%1, m%3 pshufd m%3, m%2, q2222 pmaddubsw m%3, m4 paddw m%1, m%3 pshufd m%3, m%2, q3333 pmaddubsw m%3, m5 paddw m%1, m%3 psraw m%1, 4 vperm2i128 m%3, m%1, m%1, 0x01 packuswb m%1, m%3 %endmacro ; The ipred_filter SIMD processes 4x2 blocks in the following order which ; increases parallelism compared to doing things row by row. One redundant ; block is calculated for w8 and w16, two for w32. 
; w4 w8 w16 w32 ; 1 1 2 1 2 3 5 1 2 3 5 b c d f ; 2 2 3 2 4 5 7 2 4 5 7 c e f h ; 3 3 4 4 6 7 9 4 6 7 9 e g h j ; ___ 4 ___ 4 5 ___ 6 8 9 a ___ 6 8 9 a g i j k ___ ; 5 8 8 i cglobal ipred_filter_8bpc, 3, 7, 0, dst, stride, tl, w, h, filter %define base r6-ipred_filter_avx2_table lea r6, [filter_intra_taps] tzcnt wd, wm %ifidn filterd, filterm movzx filterd, filterb %else movzx filterd, byte filterm %endif shl filterd, 6 add filterq, r6 lea r6, [ipred_filter_avx2_table] movq xm0, [tlq-3] ; _ 6 5 0 1 2 3 4 movsxd wq, [r6+wq*4] vpbroadcastd m1, [base+pw_8] vbroadcasti128 m2, [filterq+16*0] vbroadcasti128 m3, [filterq+16*1] vbroadcasti128 m4, [filterq+16*2] vbroadcasti128 m5, [filterq+16*3] add wq, r6 mov hd, hm jmp wq .w4: WIN64_SPILL_XMM 9 mova xm8, [base+filter_shuf2] sub tlq, 3 sub tlq, hq jmp .w4_loop_start .w4_loop: pinsrd xm0, xm6, [tlq+hq], 0 lea dstq, [dstq+strideq*2] .w4_loop_start: FILTER_XMM 6, 0, 7, 8 movd [dstq+strideq*0], xm6 pextrd [dstq+strideq*1], xm6, 1 sub hd, 2 jg .w4_loop RET ALIGN function_align .w8: %assign stack_offset stack_offset - stack_size_padded WIN64_SPILL_XMM 10 mova m8, [base+filter_shuf1] FILTER_XMM 7, 0, 6, [base+filter_shuf2] vpbroadcastd m0, [tlq+4] vpbroadcastd m6, [tlq+5] sub tlq, 4 sub tlq, hq vpbroadcastq m7, xm7 vpblendd m7, m6, 0x20 .w8_loop: vpbroadcastd xm6, [tlq+hq] palignr m6, m0, 12 vpblendd m0, m6, m7, 0xeb ; _ _ _ _ 1 2 3 4 6 5 0 _ _ _ _ _ ; 0 _ _ _ 1 2 3 4 _ _ _ 5 _ _ _ 6 mova xm6, xm7 call .main vpblendd xm6, xm7, 0x0c pshufd xm6, xm6, q3120 movq [dstq+strideq*0], xm6 movhps [dstq+strideq*1], xm6 lea dstq, [dstq+strideq*2] sub hd, 2 jg .w8_loop RET ALIGN function_align .w16: %if WIN64 %assign stack_offset stack_offset - stack_size_padded %assign xmm_regs_used 15 %assign stack_size_padded 0x98 SUB rsp, stack_size_padded %endif sub hd, 2 TAIL_CALL .w16_main, 0 .w16_main: %if WIN64 movaps [rsp+0xa8], xmm6 movaps [rsp+0xb8], xmm7 movaps [rsp+0x28], xmm8 movaps [rsp+0x38], xmm9 movaps [rsp+0x48], xmm10 movaps [rsp+0x58], xmm11 movaps [rsp+0x68], xmm12 movaps [rsp+0x78], xmm13 movaps [rsp+0x88], xmm14 %endif FILTER_XMM 12, 0, 7, [base+filter_shuf2] vpbroadcastd m0, [tlq+5] vpblendd m0, [tlq-12], 0x14 mova m8, [base+filter_shuf1] vpbroadcastq m7, xm12 vpblendd m0, m7, 0xc2 ; _ _ _ _ 1 2 3 4 6 5 0 _ _ _ _ _ ; 0 _ _ _ 1 2 3 4 _ _ _ 5 _ _ _ 6 call .main ; c0 d0 a1 b1 a1 b1 c0 d0 movlps xm9, xm7, [tlq+5] ; _ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6 vinserti128 m14, m8, [base+filter_shuf3], 0 vpblendd xm12, xm7, 0x0c ; a0 b0 a1 b1 FILTER_XMM 6, 9, 10, 14 vpbroadcastq m6, xm6 ; a2 b2 __ __ __ __ a2 b2 vpbroadcastd m9, [tlq+13] vpbroadcastd m10, [tlq+12] psrld m11, m8, 4 vpblendd m6, m9, 0x20 ; top sub tlq, 6 sub tlq, hq .w16_loop: vpbroadcastd xm9, [tlq+hq] palignr m9, m0, 12 vpblendd m0, m9, m7, 0xe2 ; _ _ _ _ 1 2 3 4 6 5 0 _ _ _ _ _ ; 0 _ _ _ 1 2 3 4 _ _ _ 5 _ _ _ 6 mova xm13, xm7 call .main ; e0 f0 c1 d1 c1 d1 e0 f0 vpblendd m9, m12, m10, 0xf0 vpblendd m12, m6, 0xc0 pshufd m9, m9, q3333 vpblendd m9, m6, 0xee vpblendd m10, m9, m7, 0x0c ; _ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6 ; 0 _ _ _ 1 2 3 4 _ _ _ 5 _ _ _ 6 FILTER_YMM 6, 10, 9, 14 ; c2 d2 a3 b3 a3 b3 c2 d2 vpblendd m12, m6, 0x30 ; a0 b0 a1 b1 a3 b3 a2 b2 vpermd m9, m11, m12 ; a0 a1 a2 a3 b0 b1 b2 b3 vpblendd xm12, xm13, xm7, 0x0c ; c0 d0 c1 d1 mova [dstq+strideq*0], xm9 vextracti128 [dstq+strideq*1], m9, 1 lea dstq, [dstq+strideq*2] sub hd, 2 jg .w16_loop vpblendd xm7, xm6, xm10, 0x04 ; _ _ _ 5 _ _ _ 6 0 _ _ _ 1 2 3 4 pshufd xm7, xm7, q1032 ; 0 _ _ _ 1 2 3 4 _ _ _ 5 _ _ _ 6 FILTER_XMM 0, 7, 9, [base+filter_shuf1+16] 
vpblendd xm6, xm0, 0x0c ; c2 d2 c3 d3 shufps xm0, xm12, xm6, q2020 ; c0 c1 c2 c3 shufps xm6, xm12, xm6, q3131 ; d0 d1 d2 d3 mova [dstq+strideq*0], xm0 mova [dstq+strideq*1], xm6 ret ALIGN function_align .w32: sub rsp, stack_size_padded sub hd, 2 lea r3, [dstq+16] lea r5d, [hq-2] call .w16_main add tlq, r5 mov dstq, r3 lea r3, [strideq-4] lea r4, [r3+strideq*2] movq xm0, [tlq+21] pinsrd xm0, [dstq-4], 2 pinsrd xm0, [dstq+r3*1], 3 FILTER_XMM 12, 0, 7, 14 ; a0 b0 a0 b0 movq xm7, [dstq+r3*2] pinsrd xm7, [dstq+r4], 2 palignr xm7, xm0, 12 ; 0 _ _ _ _ _ _ _ _ _ _ 5 _ _ _ 6 vpbroadcastd m0, [tlq+28] vpbroadcastd m9, [tlq+29] vbroadcasti128 m8, [base+filter_shuf1+16] vpblendd m0, m9, 0x20 vpblendd m0, m7, 0x0f vpbroadcastq m7, xm12 vpblendd m0, m7, 0xc2 ; 0 _ _ _ 1 2 3 4 _ _ _ 5 _ _ _ 6 call .main ; c0 d0 a1 b1 a1 b1 c0 d0 add r3, 2 lea r4, [r4+strideq*2] movlps xm9, xm7, [tlq+29] ; _ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6 vpblendd xm12, xm7, 0x0c ; a0 b0 a1 b1 FILTER_XMM 6, 9, 10, 14 vpbroadcastq m6, xm6 ; a2 b2 __ __ __ __ a2 b2 vpbroadcastd m9, [tlq+37] vpbroadcastd m10, [tlq+36] vpblendd m6, m9, 0x20 ; top .w32_loop: movq xm9, [dstq+r3*4] pinsrd xm9, [dstq+r4], 2 .w32_loop_last: palignr m9, m0, 12 vpblendd m0, m9, m7, 0xe2 ; 0 _ _ _ 1 2 3 4 _ _ _ 5 _ _ _ 6 mova xm13, xm7 ; c0 d0 call .main ; e0 f0 c1 d1 c1 d1 e0 f0 vpblendd m9, m12, m10, 0xf0 vpblendd m12, m6, 0xc0 pshufd m9, m9, q3333 vpblendd m9, m6, 0xee vpblendd m10, m9, m7, 0x0c ; _ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6 ; 0 _ _ _ 1 2 3 4 _ _ _ 5 _ _ _ 6 FILTER_YMM 6, 10, 9, 14 ; c2 d2 a3 b3 a3 b3 c2 d2 vpblendd m12, m6, 0x30 ; a0 b0 a1 b1 a3 b3 a2 b2 vpermd m9, m11, m12 ; a0 a1 a2 a3 b0 b1 b2 b3 vpblendd xm12, xm13, xm7, 0x0c ; c0 d0 c1 d1 mova [dstq+strideq*0], xm9 vextracti128 [dstq+strideq*1], m9, 1 lea dstq, [dstq+strideq*2] sub r5d, 2 jg .w32_loop jz .w32_loop_last vpblendd xm7, xm6, xm10, 0x04 ; _ _ _ 5 _ _ _ 6 0 _ _ _ 1 2 3 4 pshufd xm7, xm7, q1032 ; 0 _ _ _ 1 2 3 4 _ _ _ 5 _ _ _ 6 FILTER_XMM 0, 7, 9, [base+filter_shuf1+16] vpblendd xm6, xm0, 0x0c ; c2 d2 c3 d3 shufps xm0, xm12, xm6, q2020 ; c0 c1 c2 c3 shufps xm6, xm12, xm6, q3131 ; d0 d1 d2 d3 mova [dstq+strideq*0], xm0 mova [dstq+strideq*1], xm6 RET ALIGN function_align .main: FILTER_YMM 7, 0, 9, 8 ret %if WIN64 DECLARE_REG_TMP 5 %else DECLARE_REG_TMP 7 %endif %macro IPRED_CFL 1 ; ac in, unpacked pixels out psignw m3, m%1, m1 pabsw m%1, m%1 pmulhrsw m%1, m2 psignw m%1, m3 paddw m%1, m0 %endmacro cglobal ipred_cfl_top_8bpc, 3, 7, 6, dst, stride, tl, w, h, ac, alpha lea t0, [ipred_cfl_left_avx2_table] tzcnt wd, wm inc tlq movu m0, [tlq] movifnidn hd, hm mov r6d, 0x8000 shrx r6d, r6d, wd movd xm3, r6d movsxd r6, [t0+wq*4] pcmpeqd m2, m2 pmaddubsw m0, m2 add r6, t0 add t0, ipred_cfl_splat_avx2_table-ipred_cfl_left_avx2_table movsxd wq, [t0+wq*4] add wq, t0 movifnidn acq, acmp jmp r6 cglobal ipred_cfl_left_8bpc, 3, 7, 6, dst, stride, tl, w, h, ac, alpha mov hd, hm ; zero upper half tzcnt r6d, hd sub tlq, hq tzcnt wd, wm movu m0, [tlq] mov t0d, 0x8000 shrx t0d, t0d, r6d movd xm3, t0d lea t0, [ipred_cfl_left_avx2_table] movsxd r6, [t0+r6*4] pcmpeqd m2, m2 pmaddubsw m0, m2 add r6, t0 add t0, ipred_cfl_splat_avx2_table-ipred_cfl_left_avx2_table movsxd wq, [t0+wq*4] add wq, t0 movifnidn acq, acmp jmp r6 .h32: vextracti128 xm1, m0, 1 paddw xm0, xm1 .h16: punpckhqdq xm1, xm0, xm0 paddw xm0, xm1 .h8: psrlq xm1, xm0, 32 paddw xm0, xm1 .h4: pmaddwd xm0, xm2 pmulhrsw xm0, xm3 vpbroadcastw m0, xm0 jmp wq cglobal ipred_cfl_8bpc, 3, 7, 6, dst, stride, tl, w, h, ac, alpha movifnidn hd, hm movifnidn wd, wm tzcnt r6d, 
hd lea t0d, [wq+hq] movd xm4, t0d tzcnt t0d, t0d movd xm5, t0d lea t0, [ipred_cfl_avx2_table] tzcnt wd, wd movsxd r6, [t0+r6*4] movsxd wq, [t0+wq*4+4*4] pcmpeqd m3, m3 psrlw xm4, 1 add r6, t0 add wq, t0 movifnidn acq, acmp jmp r6 .h4: movd xm0, [tlq-4] pmaddubsw xm0, xm3 jmp wq .w4: movd xm1, [tlq+1] pmaddubsw xm1, xm3 psubw xm0, xm4 paddw xm0, xm1 pmaddwd xm0, xm3 cmp hd, 4 jg .w4_mul psrlw xm0, 3 jmp .w4_end .w4_mul: punpckhqdq xm1, xm0, xm0 lea r2d, [hq*2] mov r6d, 0x55563334 paddw xm0, xm1 shrx r6d, r6d, r2d psrlq xm1, xm0, 32 paddw xm0, xm1 movd xm1, r6d psrlw xm0, 2 pmulhuw xm0, xm1 .w4_end: vpbroadcastw m0, xm0 .s4: vpbroadcastw m1, alpham lea r6, [strideq*3] pabsw m2, m1 psllw m2, 9 .s4_loop: mova m4, [acq] IPRED_CFL 4 packuswb m4, m4 vextracti128 xm5, m4, 1 movd [dstq+strideq*0], xm4 pextrd [dstq+strideq*1], xm4, 1 movd [dstq+strideq*2], xm5 pextrd [dstq+r6 ], xm5, 1 lea dstq, [dstq+strideq*4] add acq, 32 sub hd, 4 jg .s4_loop RET ALIGN function_align .h8: movq xm0, [tlq-8] pmaddubsw xm0, xm3 jmp wq .w8: movq xm1, [tlq+1] vextracti128 xm2, m0, 1 pmaddubsw xm1, xm3 psubw xm0, xm4 paddw xm0, xm2 punpckhqdq xm2, xm0, xm0 paddw xm0, xm2 paddw xm0, xm1 psrlq xm1, xm0, 32 paddw xm0, xm1 pmaddwd xm0, xm3 psrlw xm0, xm5 cmp hd, 8 je .w8_end mov r6d, 0x5556 mov r2d, 0x3334 cmp hd, 32 cmove r6d, r2d movd xm1, r6d pmulhuw xm0, xm1 .w8_end: vpbroadcastw m0, xm0 .s8: vpbroadcastw m1, alpham lea r6, [strideq*3] pabsw m2, m1 psllw m2, 9 .s8_loop: mova m4, [acq] mova m5, [acq+32] IPRED_CFL 4 IPRED_CFL 5 packuswb m4, m5 vextracti128 xm5, m4, 1 movq [dstq+strideq*0], xm4 movq [dstq+strideq*1], xm5 movhps [dstq+strideq*2], xm4 movhps [dstq+r6 ], xm5 lea dstq, [dstq+strideq*4] add acq, 64 sub hd, 4 jg .s8_loop RET ALIGN function_align .h16: mova xm0, [tlq-16] pmaddubsw xm0, xm3 jmp wq .w16: movu xm1, [tlq+1] vextracti128 xm2, m0, 1 pmaddubsw xm1, xm3 psubw xm0, xm4 paddw xm0, xm2 paddw xm0, xm1 punpckhqdq xm1, xm0, xm0 paddw xm0, xm1 psrlq xm1, xm0, 32 paddw xm0, xm1 pmaddwd xm0, xm3 psrlw xm0, xm5 cmp hd, 16 je .w16_end mov r6d, 0x5556 mov r2d, 0x3334 test hb, 8|32 cmovz r6d, r2d movd xm1, r6d pmulhuw xm0, xm1 .w16_end: vpbroadcastw m0, xm0 .s16: vpbroadcastw m1, alpham pabsw m2, m1 psllw m2, 9 .s16_loop: mova m4, [acq] mova m5, [acq+32] IPRED_CFL 4 IPRED_CFL 5 packuswb m4, m5 vpermq m4, m4, q3120 mova [dstq+strideq*0], xm4 vextracti128 [dstq+strideq*1], m4, 1 lea dstq, [dstq+strideq*2] add acq, 64 sub hd, 2 jg .s16_loop RET ALIGN function_align .h32: mova m0, [tlq-32] pmaddubsw m0, m3 jmp wq .w32: movu m1, [tlq+1] pmaddubsw m1, m3 paddw m0, m1 vextracti128 xm1, m0, 1 psubw xm0, xm4 paddw xm0, xm1 punpckhqdq xm1, xm0, xm0 paddw xm0, xm1 psrlq xm1, xm0, 32 paddw xm0, xm1 pmaddwd xm0, xm3 psrlw xm0, xm5 cmp hd, 32 je .w32_end lea r2d, [hq*2] mov r6d, 0x33345556 shrx r6d, r6d, r2d movd xm1, r6d pmulhuw xm0, xm1 .w32_end: vpbroadcastw m0, xm0 .s32: vpbroadcastw m1, alpham pabsw m2, m1 psllw m2, 9 .s32_loop: mova m4, [acq] mova m5, [acq+32] IPRED_CFL 4 IPRED_CFL 5 packuswb m4, m5 vpermq m4, m4, q3120 mova [dstq], m4 add dstq, strideq add acq, 64 dec hd jg .s32_loop RET cglobal ipred_cfl_128_8bpc, 3, 7, 6, dst, stride, tl, w, h, ac, alpha lea t0, [ipred_cfl_splat_avx2_table] tzcnt wd, wm movifnidn hd, hm movsxd wq, [t0+wq*4] vpbroadcastd m0, [t0-ipred_cfl_splat_avx2_table+pw_128] add wq, t0 movifnidn acq, acmp jmp wq cglobal ipred_cfl_ac_420_8bpc, 4, 9, 5, ac, y, stride, wpad, hpad, w, h, sz, ac_bak movifnidn hpadd, hpadm movifnidn wd, wm mov hd, hm mov szd, wd mov ac_bakq, acq imul szd, hd shl hpadd, 2 
sub hd, hpadd vpbroadcastd m2, [pb_2] pxor m4, m4 cmp wd, 8 jg .w16 je .w8 ; fall-through DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h, sz, ac_bak .w4: lea stride3q, [strideq*3] .w4_loop: movq xm0, [yq] movq xm1, [yq+strideq] movhps xm0, [yq+strideq*2] movhps xm1, [yq+stride3q] pmaddubsw xm0, xm2 pmaddubsw xm1, xm2 paddw xm0, xm1 mova [acq], xm0 paddw xm4, xm0 lea yq, [yq+strideq*4] add acq, 16 sub hd, 2 jg .w4_loop test hpadd, hpadd jz .calc_avg vpermq m0, m0, q1111 .w4_hpad_loop: mova [acq], m0 paddw m4, m0 add acq, 32 sub hpadd, 4 jg .w4_hpad_loop jmp .calc_avg .w8: lea stride3q, [strideq*3] test wpadd, wpadd jnz .w8_wpad .w8_loop: mova xm0, [yq] mova xm1, [yq+strideq] vinserti128 m0, [yq+strideq*2], 1 vinserti128 m1, [yq+stride3q], 1 pmaddubsw m0, m2 pmaddubsw m1, m2 paddw m0, m1 mova [acq], m0 paddw m4, m0 lea yq, [yq+strideq*4] add acq, 32 sub hd, 2 jg .w8_loop test hpadd, hpadd jz .calc_avg jmp .w8_hpad .w8_wpad: vbroadcasti128 m3, [cfl_ac_w8_pad1_shuffle] .w8_wpad_loop: movq xm0, [yq] movq xm1, [yq+strideq] vinserti128 m0, [yq+strideq*2], 1 vinserti128 m1, [yq+stride3q], 1 pmaddubsw m0, m2 pmaddubsw m1, m2 paddw m0, m1 pshufb m0, m3 mova [acq], m0 paddw m4, m0 lea yq, [yq+strideq*4] add acq, 32 sub hd, 2 jg .w8_wpad_loop test hpadd, hpadd jz .calc_avg .w8_hpad: vpermq m0, m0, q3232 .w8_hpad_loop: mova [acq], m0 paddw m4, m0 add acq, 32 sub hpadd, 2 jg .w8_hpad_loop jmp .calc_avg .w16: test wpadd, wpadd jnz .w16_wpad .w16_loop: mova m0, [yq] mova m1, [yq+strideq] pmaddubsw m0, m2 pmaddubsw m1, m2 paddw m0, m1 mova [acq], m0 paddw m4, m0 lea yq, [yq+strideq*2] add acq, 32 dec hd jg .w16_loop test hpadd, hpadd jz .calc_avg jmp .w16_hpad_loop .w16_wpad: DEFINE_ARGS ac, y, stride, wpad, hpad, iptr, h, sz, ac_bak lea iptrq, [ipred_cfl_ac_420_avx2_table] shl wpadd, 2 mova m3, [iptrq+cfl_ac_w16_pad_shuffle- \ ipred_cfl_ac_420_avx2_table+wpadq*8-32] movsxd wpadq, [iptrq+wpadq+4] add iptrq, wpadq jmp iptrq .w16_pad3: vpbroadcastq m0, [yq] vpbroadcastq m1, [yq+strideq] jmp .w16_wpad_end .w16_pad2: vbroadcasti128 m0, [yq] vbroadcasti128 m1, [yq+strideq] jmp .w16_wpad_end .w16_pad1: mova m0, [yq] mova m1, [yq+strideq] ; fall-through .w16_wpad_end: pmaddubsw m0, m2 pmaddubsw m1, m2 paddw m0, m1 pshufb m0, m3 mova [acq], m0 paddw m4, m0 lea yq, [yq+strideq*2] add acq, 32 dec hd jz .w16_wpad_done jmp iptrq .w16_wpad_done: test hpadd, hpadd jz .calc_avg .w16_hpad_loop: mova [acq], m0 paddw m4, m0 add acq, 32 dec hpadd jg .w16_hpad_loop ; fall-through .calc_avg: vpbroadcastd m2, [pw_1] pmaddwd m0, m4, m2 vextracti128 xm1, m0, 1 tzcnt r1d, szd paddd xm0, xm1 movd xm2, r1d movd xm3, szd punpckhqdq xm1, xm0, xm0 paddd xm0, xm1 psrad xm3, 1 psrlq xm1, xm0, 32 paddd xm0, xm3 paddd xm0, xm1 psrad xm0, xm2 vpbroadcastw m0, xm0 .sub_loop: mova m1, [ac_bakq] psubw m1, m0 mova [ac_bakq], m1 add ac_bakq, 32 sub szd, 16 jg .sub_loop RET cglobal ipred_cfl_ac_422_8bpc, 4, 9, 6, ac, y, stride, wpad, hpad, w, h, sz, ac_bak movifnidn hpadd, hpadm movifnidn wd, wm mov hd, hm mov szd, wd mov ac_bakq, acq imul szd, hd shl hpadd, 2 sub hd, hpadd vpbroadcastd m2, [pb_4] pxor m4, m4 pxor m5, m5 cmp wd, 8 jg .w16 je .w8 ; fall-through DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h, sz, ac_bak .w4: lea stride3q, [strideq*3] .w4_loop: movq xm1, [yq] movhps xm1, [yq+strideq] movq xm0, [yq+strideq*2] movhps xm0, [yq+stride3q] pmaddubsw xm0, xm2 pmaddubsw xm1, xm2 mova [acq], xm1 mova [acq+16], xm0 paddw xm4, xm0 paddw xm5, xm1 lea yq, [yq+strideq*4] add acq, 32 sub hd, 4 jg .w4_loop test hpadd, hpadd jz .calc_avg 
vpermq m0, m0, q1111 .w4_hpad_loop: mova [acq], m0 paddw m4, m0 add acq, 32 sub hpadd, 4 jg .w4_hpad_loop jmp .calc_avg .w8: lea stride3q, [strideq*3] test wpadd, wpadd jnz .w8_wpad .w8_loop: mova xm1, [yq] vinserti128 m1, [yq+strideq], 1 mova xm0, [yq+strideq*2] vinserti128 m0, [yq+stride3q], 1 pmaddubsw m0, m2 pmaddubsw m1, m2 mova [acq], m1 mova [acq+32], m0 paddw m4, m0 paddw m5, m1 lea yq, [yq+strideq*4] add acq, 64 sub hd, 4 jg .w8_loop test hpadd, hpadd jz .calc_avg jmp .w8_hpad .w8_wpad: vbroadcasti128 m3, [cfl_ac_w8_pad1_shuffle] .w8_wpad_loop: movq xm1, [yq] vinserti128 m1, [yq+strideq], 1 movq xm0, [yq+strideq*2] vinserti128 m0, [yq+stride3q], 1 pmaddubsw m0, m2 pmaddubsw m1, m2 pshufb m0, m3 pshufb m1, m3 mova [acq], m1 mova [acq+32], m0 paddw m4, m0 paddw m5, m1 lea yq, [yq+strideq*4] add acq, 64 sub hd, 4 jg .w8_wpad_loop test hpadd, hpadd jz .calc_avg .w8_hpad: vpermq m0, m0, q3232 .w8_hpad_loop: mova [acq], m0 paddw m4, m0 add acq, 32 sub hpadd, 2 jg .w8_hpad_loop jmp .calc_avg .w16: test wpadd, wpadd jnz .w16_wpad .w16_loop: mova m1, [yq] mova m0, [yq+strideq] pmaddubsw m0, m2 pmaddubsw m1, m2 mova [acq], m1 mova [acq+32], m0 paddw m4, m0 paddw m5, m1 lea yq, [yq+strideq*2] add acq, 64 sub hd, 2 jg .w16_loop test hpadd, hpadd jz .calc_avg jmp .w16_hpad_loop .w16_wpad: DEFINE_ARGS ac, y, stride, wpad, hpad, iptr, h, sz, ac_bak lea iptrq, [ipred_cfl_ac_422_avx2_table] shl wpadd, 2 mova m3, [iptrq+cfl_ac_w16_pad_shuffle- \ ipred_cfl_ac_422_avx2_table+wpadq*8-32] movsxd wpadq, [iptrq+wpadq+4] add iptrq, wpadq jmp iptrq .w16_pad3: vpbroadcastq m1, [yq] vpbroadcastq m0, [yq+strideq] jmp .w16_wpad_end .w16_pad2: vbroadcasti128 m1, [yq] vbroadcasti128 m0, [yq+strideq] jmp .w16_wpad_end .w16_pad1: mova m1, [yq] mova m0, [yq+strideq] ; fall-through .w16_wpad_end: pmaddubsw m0, m2 pmaddubsw m1, m2 pshufb m0, m3 pshufb m1, m3 mova [acq], m1 mova [acq+32], m0 paddw m4, m0 paddw m5, m1 lea yq, [yq+strideq*2] add acq, 64 sub hd, 2 jz .w16_wpad_done jmp iptrq .w16_wpad_done: test hpadd, hpadd jz .calc_avg .w16_hpad_loop: mova [acq], m0 mova [acq+32], m0 paddw m4, m0 paddw m5, m0 add acq, 64 sub hpadd, 2 jg .w16_hpad_loop ; fall-through .calc_avg: vpbroadcastd m2, [pw_1] pmaddwd m5, m5, m2 pmaddwd m0, m4, m2 paddd m0, m5 vextracti128 xm1, m0, 1 tzcnt r1d, szd paddd xm0, xm1 movd xm2, r1d movd xm3, szd punpckhqdq xm1, xm0, xm0 paddd xm0, xm1 psrad xm3, 1 psrlq xm1, xm0, 32 paddd xm0, xm3 paddd xm0, xm1 psrad xm0, xm2 vpbroadcastw m0, xm0 .sub_loop: mova m1, [ac_bakq] psubw m1, m0 mova [ac_bakq], m1 add ac_bakq, 32 sub szd, 16 jg .sub_loop RET cglobal ipred_cfl_ac_444_8bpc, 4, 9, 6, ac, y, stride, wpad, hpad, w, h, sz, ac_bak movifnidn hpadd, hpadm movifnidn wd, wm mov hd, hm mov szd, wd imul szd, hd shl hpadd, 2 sub hd, hpadd pxor m4, m4 vpbroadcastd m5, [pw_1] tzcnt r8d, wd lea r5, [ipred_cfl_ac_444_avx2_table] movsxd r8, [r5+r8*4+12] add r5, r8 DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h, sz, ac_bak mov ac_bakq, acq jmp r5 .w4: lea stride3q, [strideq*3] pxor xm2, xm2 .w4_loop: movd xm1, [yq] movd xm0, [yq+strideq*2] pinsrd xm1, [yq+strideq], 1 pinsrd xm0, [yq+stride3q], 1 punpcklbw xm1, xm2 punpcklbw xm0, xm2 psllw xm1, 3 psllw xm0, 3 mova [acq], xm1 mova [acq+16], xm0 paddw xm1, xm0 paddw xm4, xm1 lea yq, [yq+strideq*4] add acq, 32 sub hd, 4 jg .w4_loop test hpadd, hpadd jz .calc_avg_mul pshufd xm0, xm0, q3232 paddw xm1, xm0, xm0 .w4_hpad_loop: mova [acq], xm0 mova [acq+16], xm0 paddw xm4, xm1 add acq, 32 sub hpadd, 4 jg .w4_hpad_loop jmp .calc_avg_mul .w8: lea stride3q, 
[strideq*3] pxor m2, m2 .w8_loop: movq xm1, [yq] movq xm0, [yq+strideq*2] vinserti128 m1, [yq+strideq], 1 vinserti128 m0, [yq+stride3q], 1 punpcklbw m1, m2 punpcklbw m0, m2 psllw m1, 3 psllw m0, 3 mova [acq], m1 mova [acq+32], m0 paddw m1, m0 paddw m4, m1 lea yq, [yq+strideq*4] add acq, 64 sub hd, 4 jg .w8_loop test hpadd, hpadd jz .calc_avg_mul vpermq m0, m0, q3232 paddw m1, m0, m0 .w8_hpad_loop: mova [acq], m0 mova [acq+32], m0 paddw m4, m1 add acq, 64 sub hpadd, 4 jg .w8_hpad_loop jmp .calc_avg_mul .w16: test wpadd, wpadd jnz .w16_wpad .w16_loop: pmovzxbw m1, [yq] pmovzxbw m0, [yq+strideq] psllw m1, 3 psllw m0, 3 mova [acq], m1 mova [acq+32], m0 paddw m1, m0 pmaddwd m1, m5 paddd m4, m1 lea yq, [yq+strideq*2] add acq, 64 sub hd, 2 jg .w16_loop test hpadd, hpadd jz .calc_avg jmp .w16_hpad .w16_wpad: mova m3, [cfl_ac_444_w16_pad1_shuffle] .w16_wpad_loop: vpbroadcastq m1, [yq] vpbroadcastq m0, [yq+strideq] pshufb m1, m3 pshufb m0, m3 psllw m1, 3 psllw m0, 3 mova [acq], m1 mova [acq+32], m0 paddw m1, m0 pmaddwd m1, m5 paddd m4, m1 lea yq, [yq+strideq*2] add acq, 64 sub hd, 2 jg .w16_wpad_loop test hpadd, hpadd jz .calc_avg .w16_hpad: paddw m1, m0, m0 pmaddwd m1, m5 .w16_hpad_loop: mova [acq], m0 mova [acq+32], m0 paddd m4, m1 add acq, 64 sub hpadd, 2 jg .w16_hpad_loop jmp .calc_avg .w32: test wpadd, wpadd jnz .w32_wpad .w32_loop: pmovzxbw m1, [yq] pmovzxbw m0, [yq+16] psllw m1, 3 psllw m0, 3 mova [acq], m1 mova [acq+32], m0 paddw m2, m1, m0 pmaddwd m2, m5 paddd m4, m2 add yq, strideq add acq, 64 dec hd jg .w32_loop test hpadd, hpadd jz .calc_avg jmp .w32_hpad_loop .w32_wpad: DEFINE_ARGS ac, y, stride, wpad, hpad, iptr, h, sz, ac_bak lea iptrq, [ipred_cfl_ac_444_avx2_table] add wpadd, wpadd mova m3, [iptrq+cfl_ac_444_w16_pad1_shuffle-ipred_cfl_ac_444_avx2_table] movsxd wpadq, [iptrq+wpadq+4] add iptrq, wpadq jmp iptrq .w32_pad3: vpbroadcastq m1, [yq] pshufb m1, m3 vpermq m0, m1, q3232 jmp .w32_wpad_end .w32_pad2: pmovzxbw m1, [yq] pshufhw m0, m1, q3333 vpermq m0, m0, q3333 jmp .w32_wpad_end .w32_pad1: pmovzxbw m1, [yq] vpbroadcastq m0, [yq+16] pshufb m0, m3 ; fall-through .w32_wpad_end: psllw m1, 3 psllw m0, 3 mova [acq], m1 mova [acq+32], m0 paddw m2, m1, m0 pmaddwd m2, m5 paddd m4, m2 add yq, strideq add acq, 64 dec hd jz .w32_wpad_done jmp iptrq .w32_wpad_done: test hpadd, hpadd jz .calc_avg .w32_hpad_loop: mova [acq], m1 mova [acq+32], m0 paddd m4, m2 add acq, 64 dec hpadd jg .w32_hpad_loop jmp .calc_avg .calc_avg_mul: pmaddwd m4, m5 .calc_avg: vextracti128 xm1, m4, 1 tzcnt r1d, szd paddd xm0, xm4, xm1 movd xm2, r1d movd xm3, szd punpckhqdq xm1, xm0, xm0 paddd xm0, xm1 psrad xm3, 1 psrlq xm1, xm0, 32 paddd xm0, xm3 paddd xm0, xm1 psrad xm0, xm2 vpbroadcastw m0, xm0 .sub_loop: mova m1, [ac_bakq] psubw m1, m0 mova [ac_bakq], m1 add ac_bakq, 32 sub szd, 16 jg .sub_loop RET cglobal pal_pred_8bpc, 4, 6, 5, dst, stride, pal, idx, w, h vbroadcasti128 m4, [palq] lea r2, [pal_pred_avx2_table] tzcnt wd, wm movifnidn hd, hm movsxd wq, [r2+wq*4] packuswb m4, m4 add wq, r2 lea r2, [strideq*3] jmp wq .w4: pshufb xm0, xm4, [idxq] add idxq, 16 movd [dstq+strideq*0], xm0 pextrd [dstq+strideq*1], xm0, 1 pextrd [dstq+strideq*2], xm0, 2 pextrd [dstq+r2 ], xm0, 3 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w4 RET ALIGN function_align .w8: pshufb xm0, xm4, [idxq+16*0] pshufb xm1, xm4, [idxq+16*1] add idxq, 16*2 movq [dstq+strideq*0], xm0 movhps [dstq+strideq*1], xm0 movq [dstq+strideq*2], xm1 movhps [dstq+r2 ], xm1 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w8 RET ALIGN function_align .w16: pshufb m0, m4, 
[idxq+32*0] pshufb m1, m4, [idxq+32*1] add idxq, 32*2 mova [dstq+strideq*0], xm0 vextracti128 [dstq+strideq*1], m0, 1 mova [dstq+strideq*2], xm1 vextracti128 [dstq+r2 ], m1, 1 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w16 RET ALIGN function_align .w32: pshufb m0, m4, [idxq+32*0] pshufb m1, m4, [idxq+32*1] pshufb m2, m4, [idxq+32*2] pshufb m3, m4, [idxq+32*3] add idxq, 32*4 mova [dstq+strideq*0], m0 mova [dstq+strideq*1], m1 mova [dstq+strideq*2], m2 mova [dstq+r2 ], m3 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w32 RET ALIGN function_align .w64: pshufb m0, m4, [idxq+32*0] pshufb m1, m4, [idxq+32*1] pshufb m2, m4, [idxq+32*2] pshufb m3, m4, [idxq+32*3] add idxq, 32*4 mova [dstq+strideq*0+32*0], m0 mova [dstq+strideq*0+32*1], m1 mova [dstq+strideq*1+32*0], m2 mova [dstq+strideq*1+32*1], m3 lea dstq, [dstq+strideq*2] sub hd, 2 jg .w64 RET %endif av-scenechange-0.14.1/src/asm/x86/ipred_avx512.asm000064400000000000000000001467621046102023000175050ustar 00000000000000; Copyright © 2020, VideoLAN and dav1d authors ; Copyright © 2020, Two Orioles, LLC ; All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions are met: ; ; 1. Redistributions of source code must retain the above copyright notice, this ; list of conditions and the following disclaimer. ; ; 2. Redistributions in binary form must reproduce the above copyright notice, ; this list of conditions and the following disclaimer in the documentation ; and/or other materials provided with the distribution. ; ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
%include "config.asm" %include "src/asm/x86/x86inc.asm" %if ARCH_X86_64 SECTION_RODATA 64 %macro SMOOTH_WEIGHT_TABLE 1-* %rep %0 db %1-128, 127-%1 %rotate 1 %endrep %endmacro smooth_weights: SMOOTH_WEIGHT_TABLE \ 0, 0, 255, 128, 255, 149, 85, 64, \ 255, 197, 146, 105, 73, 50, 37, 32, \ 255, 225, 196, 170, 145, 123, 102, 84, \ 68, 54, 43, 33, 26, 20, 17, 16, \ 255, 240, 225, 210, 196, 182, 169, 157, \ 145, 133, 122, 111, 101, 92, 83, 74, \ 66, 59, 52, 45, 39, 34, 29, 25, \ 21, 17, 14, 12, 10, 9, 8, 8, \ 255, 248, 240, 233, 225, 218, 210, 203, \ 196, 189, 182, 176, 169, 163, 156, 150, \ 144, 138, 133, 127, 121, 116, 111, 106, \ 101, 96, 91, 86, 82, 77, 73, 69, \ 65, 61, 57, 54, 50, 47, 44, 41, \ 38, 35, 32, 29, 27, 25, 22, 20, \ 18, 16, 15, 13, 12, 10, 9, 8, \ 7, 6, 6, 5, 5, 4, 4, 4 ; dav1d_filter_intra_taps[], reordered for VNNI: p1 p2 p3 p4, p6 p5 p0 __ filter_taps: db 10, 0, 0, 0, 2, 10, 0, 0, 1, 1, 10, 0, 1, 1, 2, 10 db 6, 0, 0, 0, 2, 6, 0, 0, 2, 2, 6, 0, 1, 2, 2, 6 db 0, 12, -6, 0, 0, 9, -5, 0, 0, 7, -3, 0, 0, 5, -3, 0 db 12, 2, -4, 0, 9, 2, -3, 0, 7, 2, -3, 0, 5, 3, -3, 0 db 16, 0, 0, 0, 0, 16, 0, 0, 0, 0, 16, 0, 0, 0, 0, 16 db 16, 0, 0, 0, 0, 16, 0, 0, 0, 0, 16, 0, 0, 0, 0, 16 db 0, 10,-10, 0, 0, 6, -6, 0, 0, 4, -4, 0, 0, 2, -2, 0 db 10, 0,-10, 0, 6, 0, -6, 0, 4, 0, -4, 0, 2, 0, -2, 0 db 8, 0, 0, 0, 0, 8, 0, 0, 0, 0, 8, 0, 0, 0, 0, 8 db 4, 0, 0, 0, 0, 4, 0, 0, 0, 0, 4, 0, 0, 0, 0, 4 db 0, 16, -8, 0, 0, 16, -8, 0, 0, 16, -8, 0, 0, 16, -8, 0 db 16, 0, -4, 0, 16, 0, -4, 0, 16, 0, -4, 0, 16, 0, -4, 0 db 8, 0, 0, 0, 3, 8, 0, 0, 2, 3, 8, 0, 1, 2, 3, 8 db 4, 0, 0, 0, 3, 4, 0, 0, 2, 3, 4, 0, 2, 2, 3, 4 db 0, 10, -2, 0, 0, 6, -1, 0, 0, 4, -1, 0, 0, 2, 0, 0 db 10, 3, -1, 0, 6, 4, -1, 0, 4, 4, -1, 0, 3, 3, -1, 0 db 14, 0, 0, 0, 0, 14, 0, 0, 0, 0, 14, 0, 0, 0, 0, 14 db 12, 0, 0, 0, 1, 12, 0, 0, 0, 0, 12, 0, 0, 0, 1, 12 db 0, 14,-12, 0, 0, 12,-10, 0, 0, 11, -9, 0, 0, 10, -8, 0 db 14, 0,-10, 0, 12, 0, -9, 0, 11, 1, -8, 0, 9, 1, -7, 0 filter_perm: db 0, 1, 2, 3, 24, 25, 26, 27, 4, 5, 6, 7, 28, 29, 30, 31 db 15, 11, 7, 3, 15, 11, 7, 3, 15, 11, 7, 3, 15, 11, 7,131 db 31, 27, 23, 19, 31, 27, 23, 19, 31, 27, 23, 19, 31, 27, 23,147 db 47, 43, 39, 35, 47, 43, 39, 35, 47, 43, 39, 35, 47, 43, 39,163 filter_end: dd 2, 3, 16, 17, -1, -1, 20, 21, 0, 6, 24, 30, 1, 7, 25, 31 smooth_shuf: db 7, 7, 7, 7, 0, 1, 0, 1, 3, 3, 3, 3, 8, 9, 8, 9 db 5, 5, 5, 5, 4, 5, 4, 5, 1, 1, 1, 1, 12, 13, 12, 13 db 6, 6, 6, 6, 2, 3, 2, 3, 2, 2, 2, 2, 10, 11, 10, 11 db 4, 4, 4, 4, 6, 7, 6, 7, 0, 0, 0, 0, 14, 15, 14, 15 smooth_endA: db 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 db 33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63 db 65, 67, 69, 71, 73, 75, 77, 79, 81, 83, 85, 87, 89, 91, 93, 95 db 97, 99,101,103,105,107,109,111,113,115,117,119,121,123,125,127 smooth_endB: db 1, 3, 5, 7, 9, 11, 13, 15, 65, 67, 69, 71, 73, 75, 77, 79 db 17, 19, 21, 23, 25, 27, 29, 31, 81, 83, 85, 87, 89, 91, 93, 95 db 33, 35, 37, 39, 41, 43, 45, 47, 97, 99,101,103,105,107,109,111 db 49, 51, 53, 55, 57, 59, 61, 63,113,115,117,119,121,123,125,127 ipred_h_shuf: db 7, 7, 7, 7, 6, 6, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4 db 3, 3, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0 pb_127_m127: times 2 db 127, -127 pb_128: times 4 db 128 pw_128: times 2 dw 128 pw_255: times 2 dw 255 %define pb_1 (ipred_h_shuf+24) %define pb_2 (ipred_h_shuf+20) %define pb_3 (ipred_h_shuf+16) %define pd_8 (filter_taps+128) %macro JMP_TABLE 3-* %xdefine %1_%2_table (%%table - 2*4) %xdefine %%base mangle(private_prefix %+ _%1_%2) %%table: %rep %0 - 2 dd %%base %+ .%3 - 
(%%table - 2*4) %rotate 1 %endrep %endmacro %define ipred_dc_splat_8bpc_avx512icl_table (ipred_dc_8bpc_avx512icl_table + 10*4) JMP_TABLE ipred_h_8bpc, avx512icl, w4, w8, w16, w32, w64 JMP_TABLE ipred_paeth_8bpc, avx512icl, w4, w8, w16, w32, w64 JMP_TABLE ipred_smooth_8bpc, avx512icl, w4, w8, w16, w32, w64 JMP_TABLE ipred_smooth_v_8bpc, avx512icl, w4, w8, w16, w32, w64 JMP_TABLE ipred_smooth_h_8bpc, avx512icl, w4, w8, w16, w32, w64 JMP_TABLE ipred_dc_8bpc, avx512icl, h4, h8, h16, h32, h64, w4, w8, w16, w32, w64, \ s4-10*4, s8-10*4, s16-10*4, s32-10*4, s64-10*4 JMP_TABLE ipred_dc_left_8bpc, avx512icl, h4, h8, h16, h32, h64 JMP_TABLE pal_pred_8bpc, avx512icl, w4, w8, w16, w32, w64 SECTION .text INIT_ZMM avx512icl cglobal ipred_dc_top_8bpc, 3, 7, 5, dst, stride, tl, w, h lea r5, [ipred_dc_left_8bpc_avx512icl_table] movd xm0, wm tzcnt wd, wm inc tlq movifnidn hd, hm movu ym1, [tlq] movd xmm3, wd movsxd r6, [r5+wq*4] vpbroadcastd ym2, [r5-ipred_dc_left_8bpc_avx512icl_table+pb_1] psrld xm0, 1 vpdpbusd ym0, ym1, ym2 add r6, r5 add r5, ipred_dc_splat_8bpc_avx512icl_table-ipred_dc_left_8bpc_avx512icl_table movsxd wq, [r5+wq*4] add wq, r5 jmp r6 cglobal ipred_dc_left_8bpc, 3, 7, 5, dst, stride, tl, w, h, stride3 lea r5, [ipred_dc_left_8bpc_avx512icl_table] mov hd, hm tzcnt r6d, hd sub tlq, hq tzcnt wd, wm movd xm0, hm movu ym1, [tlq] movd xmm3, r6d movsxd r6, [r5+r6*4] vpbroadcastd ym2, [r5-ipred_dc_left_8bpc_avx512icl_table+pb_1] psrld xm0, 1 vpdpbusd ym0, ym1, ym2 add r6, r5 add r5, ipred_dc_splat_8bpc_avx512icl_table-ipred_dc_left_8bpc_avx512icl_table movsxd wq, [r5+wq*4] add wq, r5 jmp r6 .h64: movu ym1, [tlq+32] ; unaligned when jumping here from dc_top vpdpbusd ym0, ym1, ym2 .h32: vextracti32x4 xm1, ym0, 1 paddd xm0, xm1 .h16: punpckhqdq xm1, xm0, xm0 paddd xm0, xm1 .h8: psrlq xm1, xm0, 32 paddd xm0, xm1 .h4: vpsrlvd xm0, xmm3 lea stride3q, [strideq*3] vpbroadcastb m0, xm0 jmp wq cglobal ipred_dc_8bpc, 3, 7, 5, dst, stride, tl, w, h, stride3 movifnidn hd, hm movifnidn wd, wm tzcnt r6d, hd lea r5d, [wq+hq] movd xm0, r5d tzcnt r5d, r5d movd xmm4, r5d lea r5, [ipred_dc_8bpc_avx512icl_table] tzcnt wd, wd movsxd r6, [r5+r6*4] movsxd wq, [r5+wq*4+5*4] vpbroadcastd ym3, [r5-ipred_dc_8bpc_avx512icl_table+pb_1] psrld xm0, 1 add r6, r5 add wq, r5 lea stride3q, [strideq*3] jmp r6 .h4: movd xmm1, [tlq-4] vpdpbusd xm0, xmm1, xm3 jmp wq .w4: movd xmm1, [tlq+1] vpdpbusd xm0, xmm1, xm3 cmp hd, 4 jg .w4_mul psrlw xmm0, xm0, 3 jmp .w4_end .w4_mul: punpckhqdq xmm1, xm0, xm0 lea r2d, [hq*2] mov r6d, 0x55563334 paddd xmm1, xm0 shrx r6d, r6d, r2d psrlq xmm0, xmm1, 32 paddd xmm0, xmm1 movd xmm1, r6d psrld xmm0, 2 pmulhuw xmm0, xmm1 .w4_end: vpbroadcastb xm0, xmm0 .s4: movd [dstq+strideq*0], xm0 movd [dstq+strideq*1], xm0 movd [dstq+strideq*2], xm0 movd [dstq+stride3q ], xm0 lea dstq, [dstq+strideq*4] sub hd, 4 jg .s4 RET .h8: movq xmm1, [tlq-8] vpdpbusd xm0, xmm1, xm3 jmp wq .w8: movq xmm1, [tlq+1] vextracti32x4 xm2, ym0, 1 vpdpbusd xm0, xmm1, xm3 paddd xmm2, xm2, xm0 punpckhqdq xmm0, xmm2, xmm2 paddd xmm0, xmm2 psrlq xmm1, xmm0, 32 paddd xmm0, xmm1 vpsrlvd xmm0, xmm4 cmp hd, 8 je .w8_end mov r6d, 0x5556 mov r2d, 0x3334 cmp hd, 32 cmove r6d, r2d movd xmm1, r6d pmulhuw xmm0, xmm1 .w8_end: vpbroadcastb xm0, xmm0 .s8: movq [dstq+strideq*0], xm0 movq [dstq+strideq*1], xm0 movq [dstq+strideq*2], xm0 movq [dstq+stride3q ], xm0 lea dstq, [dstq+strideq*4] sub hd, 4 jg .s8 RET .h16: mova xmm1, [tlq-16] vpdpbusd xm0, xmm1, xm3 jmp wq .w16: movu xmm1, [tlq+1] vextracti32x4 xm2, ym0, 1 vpdpbusd xm0, xmm1, xm3 paddd xmm2, xm2, 
xm0 punpckhqdq xmm0, xmm2, xmm2 paddd xmm0, xmm2 psrlq xmm1, xmm0, 32 paddd xmm0, xmm1 vpsrlvd xmm0, xmm4 cmp hd, 16 je .w16_end mov r6d, 0x5556 mov r2d, 0x3334 test hb, 8|32 cmovz r6d, r2d movd xmm1, r6d pmulhuw xmm0, xmm1 .w16_end: vpbroadcastb xm0, xmm0 .s16: mova [dstq+strideq*0], xm0 mova [dstq+strideq*1], xm0 mova [dstq+strideq*2], xm0 mova [dstq+stride3q ], xm0 lea dstq, [dstq+strideq*4] sub hd, 4 jg .s16 RET .h32: mova ym1, [tlq-32] vpdpbusd ym0, ym1, ym3 jmp wq .w32: movu ym1, [tlq+1] vpdpbusd ym0, ym1, ym3 vextracti32x4 xm1, ym0, 1 paddd xmm1, xm1, xm0 punpckhqdq xmm0, xmm1, xmm1 paddd xmm0, xmm1 psrlq xmm1, xmm0, 32 paddd xmm0, xmm1 vpsrlvd xmm0, xmm4 cmp hd, 32 je .w32_end lea r2d, [hq*2] mov r6d, 0x33345556 shrx r6d, r6d, r2d movd xmm1, r6d pmulhuw xmm0, xmm1 .w32_end: vpbroadcastb ym0, xmm0 .s32: mova [dstq+strideq*0], ym0 mova [dstq+strideq*1], ym0 mova [dstq+strideq*2], ym0 mova [dstq+stride3q ], ym0 lea dstq, [dstq+strideq*4] sub hd, 4 jg .s32 RET .h64: mova ym1, [tlq-64] mova ym2, [tlq-32] vpdpbusd ym0, ym1, ym3 vpdpbusd ym0, ym2, ym3 jmp wq .w64: movu ym1, [tlq+ 1] movu ym2, [tlq+33] vpdpbusd ym0, ym1, ym3 vpdpbusd ym0, ym2, ym3 vextracti32x4 xm1, ym0, 1 paddd xmm1, xm1, xm0 punpckhqdq xmm0, xmm1, xmm1 paddd xmm0, xmm1 psrlq xmm1, xmm0, 32 paddd xmm0, xmm1 vpsrlvd xmm0, xmm4 cmp hd, 64 je .w64_end mov r6d, 0x33345556 shrx r6d, r6d, hd movd xmm1, r6d pmulhuw xmm0, xmm1 .w64_end: vpbroadcastb m0, xmm0 .s64: mova [dstq+strideq*0], m0 mova [dstq+strideq*1], m0 mova [dstq+strideq*2], m0 mova [dstq+stride3q ], m0 lea dstq, [dstq+strideq*4] sub hd, 4 jg .s64 RET cglobal ipred_dc_128_8bpc, 2, 7, 5, dst, stride, tl, w, h, stride3 lea r5, [ipred_dc_splat_8bpc_avx512icl_table] tzcnt wd, wm movifnidn hd, hm movsxd wq, [r5+wq*4] vpbroadcastd m0, [r5-ipred_dc_splat_8bpc_avx512icl_table+pb_128] add wq, r5 lea stride3q, [strideq*3] jmp wq cglobal ipred_v_8bpc, 3, 7, 5, dst, stride, tl, w, h, stride3 lea r5, [ipred_dc_splat_8bpc_avx512icl_table] tzcnt wd, wm movu m0, [tlq+1] movifnidn hd, hm movsxd wq, [r5+wq*4] add wq, r5 lea stride3q, [strideq*3] jmp wq cglobal ipred_h_8bpc, 3, 7, 8, dst, stride, tl, w, h, stride3 %define base r6-ipred_h_8bpc_avx512icl_table lea r6, [ipred_h_8bpc_avx512icl_table] tzcnt wd, wm mov hd, hm movsxd wq, [r6+wq*4] lea stride3q, [strideq*3] sub tlq, hq add wq, r6 jmp wq .w4: mova xmm1, [base+ipred_h_shuf+16] .w4_loop: movd xmm0, [tlq+hq-4] pshufb xmm0, xmm1 movd [dstq+strideq*0], xmm0 pextrd [dstq+strideq*1], xmm0, 1 pextrd [dstq+strideq*2], xmm0, 2 pextrd [dstq+stride3q ], xmm0, 3 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w4_loop RET .w8: movsldup xmm2, [base+ipred_h_shuf+16] movshdup xmm3, [base+ipred_h_shuf+16] .w8_loop: movd xmm1, [tlq+hq-4] pshufb xmm0, xmm1, xmm2 pshufb xmm1, xmm3 movq [dstq+strideq*0], xmm0 movq [dstq+strideq*1], xmm1 movhps [dstq+strideq*2], xmm0 movhps [dstq+stride3q ], xmm1 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w8_loop RET .w16: movsldup m1, [base+smooth_shuf] .w16_loop: vpbroadcastd m0, [tlq+hq-4] pshufb m0, m1 mova [dstq+strideq*0], xm0 vextracti32x4 [dstq+strideq*1], m0, 2 vextracti32x4 [dstq+strideq*2], ym0, 1 vextracti32x4 [dstq+stride3q ], m0, 3 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w16 RET .w32: vpbroadcastd ym3, [base+pb_1] vpord m2, m3, [base+pb_2] {1to16} .w32_loop: vpbroadcastd m1, [tlq+hq-4] pshufb m0, m1, m2 pshufb m1, m3 mova [dstq+strideq*0], ym0 vextracti32x8 [dstq+strideq*1], m0, 1 mova [dstq+strideq*2], ym1 vextracti32x8 [dstq+stride3q ], m1, 1 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w32_loop RET .w64: 
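; w64: a dword of four left pixels is broadcast and pshufb'd with the byte
; indices 3, 2, 1 and 0 to splat one pixel across each full 64-byte row, so
; each pass of the loop below writes four rows.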
vpbroadcastd m4, [base+pb_3] vpbroadcastd m5, [base+pb_2] vpbroadcastd m6, [base+pb_1] pxor m7, m7 .w64_loop: vpbroadcastd m3, [tlq+hq-4] pshufb m0, m3, m4 pshufb m1, m3, m5 pshufb m2, m3, m6 pshufb m3, m7 mova [dstq+strideq*0], m0 mova [dstq+strideq*1], m1 mova [dstq+strideq*2], m2 mova [dstq+stride3q ], m3 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w64_loop RET %macro PAETH 0 psubusb m1, m5, m4 psubusb m0, m4, m5 por m1, m0 ; tdiff pavgb m2, m6, m4 vpcmpub k1, m1, m7, 1 ; tdiff < ldiff vpblendmb m0{k1}, m4, m6 vpternlogd m4, m6, m8, 0x28 ; (m4 ^ m6) & m8 psubusb m3, m5, m2 psubb m2, m4 psubusb m2, m5 por m2, m3 pminub m1, m7 paddusb m2, m2 por m2, m4 ; min(tldiff, 255) vpcmpub k1, m2, m1, 1 ; tldiff < ldiff && tldiff < tdiff vmovdqu8 m0{k1}, m5 %endmacro cglobal ipred_paeth_8bpc, 3, 7, 10, dst, stride, tl, w, h, top, stride3 lea r6, [ipred_paeth_8bpc_avx512icl_table] tzcnt wd, wm vpbroadcastb m5, [tlq] ; topleft mov hd, hm movsxd wq, [r6+wq*4] vpbroadcastd m8, [r6-ipred_paeth_8bpc_avx512icl_table+pb_1] lea topq, [tlq+1] sub tlq, hq add wq, r6 lea stride3q, [strideq*3] jmp wq INIT_YMM avx512icl .w4: vpbroadcastd m6, [topq] mova m9, [ipred_h_shuf] psubusb m7, m5, m6 psubusb m0, m6, m5 por m7, m0 ; ldiff .w4_loop: vpbroadcastq m4, [tlq+hq-8] pshufb m4, m9 ; left PAETH movd [dstq+strideq*0], xm0 pextrd [dstq+strideq*1], xm0, 1 pextrd [dstq+strideq*2], xm0, 2 pextrd [dstq+stride3q ], xm0, 3 sub hd, 8 jl .w4_ret vextracti32x4 xm0, m0, 1 lea dstq, [dstq+strideq*4] movd [dstq+strideq*0], xm0 pextrd [dstq+strideq*1], xm0, 1 pextrd [dstq+strideq*2], xm0, 2 pextrd [dstq+stride3q ], xm0, 3 lea dstq, [dstq+strideq*4] jg .w4_loop .w4_ret: RET INIT_ZMM avx512icl .w8: vpbroadcastq m6, [topq] movsldup m9, [smooth_shuf] psubusb m7, m5, m6 psubusb m0, m6, m5 por m7, m0 .w8_loop: vpbroadcastq m4, [tlq+hq-8] pshufb m4, m9 PAETH vextracti32x4 xm1, m0, 2 vextracti32x4 xm2, ym0, 1 vextracti32x4 xm3, m0, 3 movq [dstq+strideq*0], xm0 movq [dstq+strideq*1], xm1 movq [dstq+strideq*2], xm2 movq [dstq+stride3q ], xm3 sub hd, 8 jl .w8_ret lea dstq, [dstq+strideq*4] movhps [dstq+strideq*0], xm0 movhps [dstq+strideq*1], xm1 movhps [dstq+strideq*2], xm2 movhps [dstq+stride3q ], xm3 lea dstq, [dstq+strideq*4] jg .w8_loop .w8_ret: RET .w16: vbroadcasti32x4 m6, [topq] movsldup m9, [smooth_shuf] psubusb m7, m5, m6 psubusb m0, m6, m5 por m7, m0 .w16_loop: vpbroadcastd m4, [tlq+hq-4] pshufb m4, m9 PAETH mova [dstq+strideq*0], xm0 vextracti32x4 [dstq+strideq*1], m0, 2 vextracti32x4 [dstq+strideq*2], ym0, 1 vextracti32x4 [dstq+stride3q ], m0, 3 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w16_loop RET .w32: vbroadcasti32x8 m6, [topq] mova ym9, ym8 psubusb m7, m5, m6 psubusb m0, m6, m5 por m7, m0 .w32_loop: vpbroadcastd m4, [tlq+hq-2] pshufb m4, m9 PAETH mova [dstq+strideq*0], ym0 vextracti32x8 [dstq+strideq*1], m0, 1 lea dstq, [dstq+strideq*2] sub hd, 2 jg .w32_loop RET .w64: movu m6, [topq] psubusb m7, m5, m6 psubusb m0, m6, m5 por m7, m0 .w64_loop: vpbroadcastb m4, [tlq+hq-1] PAETH mova [dstq], m0 add dstq, strideq dec hd jg .w64_loop RET cglobal ipred_smooth_v_8bpc, 3, 7, 7, dst, stride, tl, w, h, weights, stride3 %define base r6-ipred_smooth_v_8bpc_avx512icl_table lea r6, [ipred_smooth_v_8bpc_avx512icl_table] tzcnt wd, wm mov hd, hm movsxd wq, [r6+wq*4] vpbroadcastd m0, [base+pb_127_m127] vpbroadcastd m1, [base+pw_128] lea weightsq, [base+smooth_weights+hq*4] neg hq vpbroadcastb m4, [tlq+hq] ; bottom add wq, r6 lea stride3q, [strideq*3] jmp wq .w4: vpbroadcastd m2, [tlq+1] movshdup m5, [smooth_shuf] mova ym6, [smooth_endA] 
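; smooth_v blends the top row towards the 'bottom' pixel (m4), one weight per
; output row:
;   pred = (w*top + (256 - w)*bottom + 128) >> 8
; The weights table stores each w as the signed byte pair (w - 128, 127 - w)
; so pmaddubsw can be used, and the remaining constant term
; 128*top + 129*bottom + 128 is precomputed once below; adding the two parts
; recovers the full expression before the final >> 8.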
punpcklbw m2, m4 ; top, bottom pmaddubsw m3, m2, m0 paddw m1, m2 ; 1 * top + 256 * bottom + 128, overflow is ok paddw m3, m1 ; 128 * top + 129 * bottom + 128 .w4_loop: vbroadcasti32x4 m0, [weightsq+hq*2] pshufb m0, m5 pmaddubsw m0, m2, m0 paddw m0, m3 vpermb m0, m6, m0 vextracti32x4 xm1, ym0, 1 movd [dstq+strideq*0], xm0 movd [dstq+strideq*1], xm1 pextrd [dstq+strideq*2], xm0, 2 pextrd [dstq+stride3q ], xm1, 2 add hq, 8 jg .ret lea dstq, [dstq+strideq*4] pextrd [dstq+strideq*0], xm0, 1 pextrd [dstq+strideq*1], xm1, 1 pextrd [dstq+strideq*2], xm0, 3 pextrd [dstq+stride3q ], xm1, 3 lea dstq, [dstq+strideq*4] jl .w4_loop .ret: RET .w8: vpbroadcastq m2, [tlq+1] movshdup m5, [smooth_shuf] mova ym6, [smooth_endA] punpcklbw m2, m4 pmaddubsw m3, m2, m0 paddw m1, m2 paddw m3, m1 .w8_loop: vpbroadcastq m0, [weightsq+hq*2] pshufb m0, m5 pmaddubsw m0, m2, m0 paddw m0, m3 vpermb m0, m6, m0 vextracti32x4 xm1, ym0, 1 movq [dstq+strideq*0], xm0 movq [dstq+strideq*1], xm1 movhps [dstq+strideq*2], xm0 movhps [dstq+stride3q ], xm1 lea dstq, [dstq+strideq*4] add hq, 4 jl .w8_loop RET .w16: vbroadcasti32x4 m3, [tlq+1] movshdup m6, [smooth_shuf] mova m7, [smooth_endB] punpcklbw m2, m3, m4 punpckhbw m3, m4 pmaddubsw m4, m2, m0 pmaddubsw m5, m3, m0 paddw m0, m1, m2 paddw m1, m3 paddw m4, m0 paddw m5, m1 .w16_loop: vpbroadcastq m1, [weightsq+hq*2] pshufb m1, m6 pmaddubsw m0, m2, m1 pmaddubsw m1, m3, m1 paddw m0, m4 paddw m1, m5 vpermt2b m0, m7, m1 mova [dstq+strideq*0], xm0 vextracti32x4 [dstq+strideq*1], m0, 2 vextracti32x4 [dstq+strideq*2], ym0, 1 vextracti32x4 [dstq+stride3q ], m0, 3 lea dstq, [dstq+strideq*4] add hq, 4 jl .w16_loop RET .w32: vbroadcasti32x8 m3, [tlq+1] movshdup m6, [smooth_shuf] mova m7, [smooth_endB] punpcklbw m2, m3, m4 punpckhbw m3, m4 pmaddubsw m4, m2, m0 pmaddubsw m5, m3, m0 paddw m0, m1, m2 paddw m1, m3 paddw m4, m0 paddw m5, m1 .w32_loop: vpbroadcastd m1, [weightsq+hq*2] pshufb m1, m6 pmaddubsw m0, m2, m1 pmaddubsw m1, m3, m1 paddw m0, m4 paddw m1, m5 vpermt2b m0, m7, m1 mova [dstq+strideq*0], ym0 vextracti32x8 [dstq+strideq*1], m0, 1 lea dstq, [dstq+strideq*2] add hq, 2 jl .w32_loop RET .w64: movu m3, [tlq+1] mova m6, [smooth_endB] punpcklbw m2, m3, m4 punpckhbw m3, m4 pmaddubsw m4, m2, m0 pmaddubsw m5, m3, m0 paddw m0, m1, m2 paddw m1, m3 paddw m4, m0 paddw m5, m1 .w64_loop: vpbroadcastw m1, [weightsq+hq*2] pmaddubsw m0, m2, m1 pmaddubsw m1, m3, m1 paddw m0, m4 paddw m1, m5 vpermt2b m0, m6, m1 mova [dstq], m0 add dstq, strideq inc hq jl .w64_loop RET cglobal ipred_smooth_h_8bpc, 4, 7, 11, dst, stride, tl, w, h, stride3 %define base r5-ipred_smooth_h_8bpc_avx512icl_table lea r5, [ipred_smooth_h_8bpc_avx512icl_table] mov r6d, wd tzcnt wd, wd vpbroadcastb m4, [tlq+r6] ; right mov hd, hm movsxd wq, [r5+wq*4] vpbroadcastd m5, [base+pb_127_m127] vpbroadcastd m6, [base+pw_128] sub tlq, hq add wq, r5 vpmovb2m k1, m6 lea stride3q, [strideq*3] jmp wq .w4: movsldup m3, [smooth_shuf] vpbroadcastq m7, [smooth_weights+4*2] mova ym8, [smooth_endA] .w4_loop: vpbroadcastq m0, [tlq+hq-8] mova m2, m4 vpshufb m2{k1}, m0, m3 ; left, right pmaddubsw m0, m2, m5 pmaddubsw m1, m2, m7 paddw m2, m6 paddw m0, m2 paddw m0, m1 vpermb m0, m8, m0 vextracti32x4 xm1, ym0, 1 movd [dstq+strideq*0], xm0 movd [dstq+strideq*1], xm1 pextrd [dstq+strideq*2], xm0, 2 pextrd [dstq+stride3q ], xm1, 2 sub hd, 8 jl .ret lea dstq, [dstq+strideq*4] pextrd [dstq+strideq*0], xm0, 1 pextrd [dstq+strideq*1], xm1, 1 pextrd [dstq+strideq*2], xm0, 3 pextrd [dstq+stride3q ], xm1, 3 lea dstq, [dstq+strideq*4] jg .w4_loop .ret: RET .w8: 
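; smooth_h is the horizontal counterpart of smooth_v: each left pixel is
; blended towards the 'right' pixel (m4) using the per-column weights for this
; block width. k1 (set from pw_128 in the prologue) selects the even bytes, so
; the merge-masked vpshufb fills those lanes with left pixels while the odd
; lanes keep 'right', producing the (left, right) pairs that pmaddubsw consumes.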
movsldup m3, [smooth_shuf] vbroadcasti32x4 m7, [smooth_weights+8*2] mova ym8, [smooth_endA] .w8_loop: vpbroadcastd m0, [tlq+hq-4] mova m2, m4 vpshufb m2{k1}, m0, m3 pmaddubsw m0, m2, m5 pmaddubsw m1, m2, m7 paddw m2, m6 paddw m0, m2 paddw m0, m1 vpermb m0, m8, m0 vextracti32x4 xm1, ym0, 1 movq [dstq+strideq*0], xm0 movq [dstq+strideq*1], xm1 movhps [dstq+strideq*2], xm0 movhps [dstq+stride3q ], xm1 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w8_loop RET .w16: movsldup m7, [smooth_shuf] vbroadcasti32x4 m8, [smooth_weights+16*2] vbroadcasti32x4 m9, [smooth_weights+16*3] mova m10, [smooth_endB] .w16_loop: vpbroadcastd m0, [tlq+hq-4] mova m3, m4 vpshufb m3{k1}, m0, m7 pmaddubsw m2, m3, m5 pmaddubsw m0, m3, m8 pmaddubsw m1, m3, m9 paddw m3, m6 paddw m2, m3 paddw m0, m2 paddw m1, m2 vpermt2b m0, m10, m1 mova [dstq+strideq*0], xm0 vextracti32x4 [dstq+strideq*1], m0, 2 vextracti32x4 [dstq+strideq*2], ym0, 1 vextracti32x4 [dstq+stride3q ], m0, 3 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w16_loop RET .w32: mova m10, [smooth_endA] vpbroadcastd ym7, [pb_1] vbroadcasti32x8 m8, [smooth_weights+32*2] vbroadcasti32x8 m9, [smooth_weights+32*3] vshufi32x4 m10, m10, q3120 .w32_loop: vpbroadcastd m0, [tlq+hq-2] mova m3, m4 vpshufb m3{k1}, m0, m7 pmaddubsw m2, m3, m5 pmaddubsw m0, m3, m8 pmaddubsw m1, m3, m9 paddw m3, m6 paddw m2, m3 paddw m0, m2 paddw m1, m2 vpermt2b m0, m10, m1 mova [dstq+strideq*0], ym0 vextracti32x8 [dstq+strideq*1], m0, 1 lea dstq, [dstq+strideq*2] sub hd, 2 jg .w32_loop RET .w64: mova m7, [smooth_weights+64*2] mova m8, [smooth_weights+64*3] mova m9, [smooth_endA] .w64_loop: mova m3, m4 vpbroadcastb m3{k1}, [tlq+hq-1] pmaddubsw m2, m3, m5 pmaddubsw m0, m3, m7 pmaddubsw m1, m3, m8 paddw m3, m6 paddw m2, m3 paddw m0, m2 paddw m1, m2 vpermt2b m0, m9, m1 mova [dstq], m0 add dstq, strideq dec hd jg .w64_loop RET cglobal ipred_smooth_8bpc, 4, 7, 16, dst, stride, tl, w, h, v_weights, stride3 %define base r5-ipred_smooth_8bpc_avx512icl_table lea r5, [ipred_smooth_8bpc_avx512icl_table] mov r6d, wd tzcnt wd, wd mov hd, hm vpbroadcastb m6, [tlq+r6] ; right sub tlq, hq movsxd wq, [r5+wq*4] vpbroadcastd m7, [base+pb_127_m127] vpbroadcastb m0, [tlq] ; bottom vpbroadcastd m1, [base+pw_255] add wq, r5 lea v_weightsq, [base+smooth_weights+hq*2] vpmovb2m k1, m1 lea stride3q, [strideq*3] jmp wq .w4: vpbroadcastd m8, [tlq+hq+1] movsldup m4, [smooth_shuf] movshdup m5, [smooth_shuf] vpbroadcastq m9, [smooth_weights+4*2] mova ym11, [smooth_endA] punpcklbw m8, m0 ; top, bottom pmaddubsw m10, m8, m7 paddw m1, m8 ; 1 * top + 256 * bottom + 255 paddw m10, m1 ; 128 * top + 129 * bottom + 255 .w4_loop: vpbroadcastq m1, [tlq+hq-8] vbroadcasti32x4 m0, [v_weightsq] add v_weightsq, 16 mova m2, m6 vpshufb m2{k1}, m1, m4 ; left, right pmaddubsw m1, m2, m7 ; 127 * left - 127 * right pshufb m0, m5 pmaddubsw m0, m8, m0 paddw m1, m2 ; 128 * left + 129 * right pmaddubsw m2, m9 paddw m0, m10 paddw m1, m2 pavgw m0, m1 vpermb m0, m11, m0 vextracti32x4 xm1, ym0, 1 movd [dstq+strideq*0], xm0 movd [dstq+strideq*1], xm1 pextrd [dstq+strideq*2], xm0, 2 pextrd [dstq+stride3q ], xm1, 2 sub hd, 8 jl .ret lea dstq, [dstq+strideq*4] pextrd [dstq+strideq*0], xm0, 1 pextrd [dstq+strideq*1], xm1, 1 pextrd [dstq+strideq*2], xm0, 3 pextrd [dstq+stride3q ], xm1, 3 lea dstq, [dstq+strideq*4] jg .w4_loop .ret: RET .w8: vpbroadcastq m8, [tlq+hq+1] movsldup m4, [smooth_shuf] movshdup m5, [smooth_shuf] vbroadcasti32x4 m9, [smooth_weights+8*2] mova ym11, [smooth_endA] punpcklbw m8, m0 pmaddubsw m10, m8, m7 paddw m1, m8 paddw m10, m1 .w8_loop: vpbroadcastd 
m1, [tlq+hq-4] vpbroadcastq m0, [v_weightsq] add v_weightsq, 8 mova m2, m6 vpshufb m2{k1}, m1, m4 pmaddubsw m1, m2, m7 pshufb m0, m5 pmaddubsw m0, m8, m0 paddw m1, m2 pmaddubsw m2, m9 paddw m0, m10 paddw m1, m2 pavgw m0, m1 vpermb m0, m11, m0 vextracti32x4 xm1, ym0, 1 movq [dstq+strideq*0], xm0 movq [dstq+strideq*1], xm1 movhps [dstq+strideq*2], xm0 movhps [dstq+stride3q ], xm1 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w8_loop RET .w16: vbroadcasti32x4 m9, [tlq+hq+1] movsldup m5, [smooth_shuf] movshdup m10, [smooth_shuf] vbroadcasti32x4 m11, [smooth_weights+16*2] vbroadcasti32x4 m12, [smooth_weights+16*3] mova m15, [smooth_endB] punpcklbw m8, m9, m0 punpckhbw m9, m0 pmaddubsw m13, m8, m7 pmaddubsw m14, m9, m7 paddw m0, m1, m8 paddw m1, m9 paddw m13, m0 paddw m14, m1 .w16_loop: vpbroadcastd m0, [tlq+hq-4] vpbroadcastq m1, [v_weightsq] add v_weightsq, 8 mova m4, m6 vpshufb m4{k1}, m0, m5 pmaddubsw m2, m4, m7 pshufb m1, m10 pmaddubsw m0, m8, m1 pmaddubsw m1, m9, m1 paddw m2, m4 pmaddubsw m3, m4, m11 pmaddubsw m4, m12 paddw m0, m13 paddw m1, m14 paddw m3, m2 paddw m4, m2 pavgw m0, m3 pavgw m1, m4 vpermt2b m0, m15, m1 mova [dstq+strideq*0], xm0 vextracti32x4 [dstq+strideq*1], m0, 2 vextracti32x4 [dstq+strideq*2], ym0, 1 vextracti32x4 [dstq+stride3q ], m0, 3 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w16_loop RET .w32: vbroadcasti32x8 m9, [tlq+hq+1] movshdup m10, [smooth_shuf] mova m12, [smooth_weights+32*2] vpbroadcastd ym5, [pb_1] mova m15, [smooth_endB] punpcklbw m8, m9, m0 punpckhbw m9, m0 pmaddubsw m13, m8, m7 pmaddubsw m14, m9, m7 vshufi32x4 m11, m12, m12, q2020 vshufi32x4 m12, m12, q3131 paddw m0, m1, m8 paddw m1, m9 paddw m13, m0 paddw m14, m1 .w32_loop: vpbroadcastd m0, [tlq+hq-2] vpbroadcastd m1, [v_weightsq] add v_weightsq, 4 mova m4, m6 vpshufb m4{k1}, m0, m5 pmaddubsw m2, m4, m7 pshufb m1, m10 pmaddubsw m0, m8, m1 pmaddubsw m1, m9, m1 paddw m2, m4 pmaddubsw m3, m4, m11 pmaddubsw m4, m12 paddw m0, m13 paddw m1, m14 paddw m3, m2 paddw m4, m2 pavgw m0, m3 pavgw m1, m4 vpermt2b m0, m15, m1 mova [dstq+strideq*0], ym0 vextracti32x8 [dstq+strideq*1], m0, 1 lea dstq, [dstq+strideq*2] sub hd, 2 jg .w32_loop RET .w64: movu m9, [tlq+hq+1] mova m11, [smooth_weights+64*2] mova m2, [smooth_weights+64*3] mova m14, [smooth_endB] punpcklbw m8, m9, m0 punpckhbw m9, m0 pmaddubsw m12, m8, m7 pmaddubsw m13, m9, m7 vshufi32x4 m10, m11, m2, q2020 vshufi32x4 m11, m2, q3131 paddw m0, m1, m8 paddw m1, m9 paddw m12, m0 paddw m13, m1 .w64_loop: mova m4, m6 vpbroadcastb m4{k1}, [tlq+hq-1] vpbroadcastw m1, [v_weightsq] add v_weightsq, 2 pmaddubsw m2, m4, m7 pmaddubsw m0, m8, m1 pmaddubsw m1, m9, m1 paddw m2, m4 pmaddubsw m3, m4, m10 pmaddubsw m4, m11 paddw m0, m12 paddw m1, m13 paddw m3, m2 paddw m4, m2 pavgw m0, m3 pavgw m1, m4 vpermt2b m0, m14, m1 mova [dstq], m0 add dstq, strideq dec hd jg .w64_loop RET cglobal pal_pred_8bpc, 4, 7, 5, dst, stride, pal, idx, w, h, stride3 lea r6, [pal_pred_8bpc_avx512icl_table] tzcnt wd, wm vbroadcasti32x4 m4, [palq] movifnidn hd, hm movsxd wq, [r6+wq*4] packuswb m4, m4 add wq, r6 lea stride3q, [strideq*3] jmp wq .w4: pshufb xmm0, xm4, [idxq] add idxq, 16 movd [dstq+strideq*0], xmm0 pextrd [dstq+strideq*1], xmm0, 1 pextrd [dstq+strideq*2], xmm0, 2 pextrd [dstq+stride3q ], xmm0, 3 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w4 RET .w8: pshufb xmm0, xm4, [idxq+16*0] pshufb xmm1, xm4, [idxq+16*1] add idxq, 16*2 movq [dstq+strideq*0], xmm0 movhps [dstq+strideq*1], xmm0 movq [dstq+strideq*2], xmm1 movhps [dstq+stride3q ], xmm1 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w8 RET .w16: pshufb 
m0, m4, [idxq] add idxq, 64 mova [dstq+strideq*0], xm0 vextracti32x4 [dstq+strideq*1], ym0, 1 vextracti32x4 [dstq+strideq*2], m0, 2 vextracti32x4 [dstq+stride3q ], m0, 3 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w16 RET .w32: pshufb m0, m4, [idxq+64*0] pshufb m1, m4, [idxq+64*1] add idxq, 64*2 mova [dstq+strideq*0], ym0 vextracti32x8 [dstq+strideq*1], m0, 1 mova [dstq+strideq*2], ym1 vextracti32x8 [dstq+stride3q ], m1, 1 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w32 RET .w64: pshufb m0, m4, [idxq+64*0] pshufb m1, m4, [idxq+64*1] pshufb m2, m4, [idxq+64*2] pshufb m3, m4, [idxq+64*3] add idxq, 64*4 mova [dstq+strideq*0], m0 mova [dstq+strideq*1], m1 mova [dstq+strideq*2], m2 mova [dstq+stride3q ], m3 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w64 RET ; The ipred_filter code processes 4x2 blocks in the following order ; which increases parallelism compared to doing things row by row. ; Some redundant blocks are calculated for w > 4. ; w4 w8 w16 w32 ; 1 1 2 1 2 3 4 1 2 3 4 9 a b c ; 2 2 3 2 3 4 5 2 3 4 5 a b c d ; 3 3 4 3 4 5 6 3 4 5 6 b c d e ; 4 4 5 4 5 6 7 4 5 6 7 c d e f ; 5 5 6 5 6 7 8 5 6 7 8 d e f g ; 6 6 7 6 7 8 9 6 7 8 9 e f g h ; 7 7 8 7 8 9 a 7 8 9 a f g h i ; ___ 8 ___ 8 9 ___ 8 9 a b ___ 8 9 a b g h i j ___ ; 9 9 a b h i j ; a b i j ; b j cglobal ipred_filter_8bpc, 4, 7, 14, dst, stride, tl, w, h, flt %define base r6-filter_taps lea r6, [filter_taps] %ifidn fltd, fltm movzx fltd, fltb %else movzx fltd, byte fltm %endif vpbroadcastd xmm2, [tlq+1] ; t0 t0 t0 t0 movifnidn hd, hm shl fltd, 6 vpbroadcastd m6, [base+pd_8] vpbroadcastd xmm3, [tlq-2] ; l1 l0 tl __ vbroadcasti32x4 m7, [r6+fltq+16*0] ; p1 p2 p3 p4 vbroadcasti32x4 m8, [r6+fltq+16*1] vbroadcasti32x4 m9, [r6+fltq+16*2] ; p6 p5 p0 __ vbroadcasti32x4 m10, [r6+fltq+16*3] mova xmm0, xm6 vpdpbusd xmm0, xmm2, xm7 mova xmm1, xm6 vpdpbusd xmm1, xmm2, xm8 vpdpbusd xmm0, xmm3, xm9 vpdpbusd xmm1, xmm3, xm10 packssdw xmm0, xmm1 cmp wd, 8 jb .w4 vpbroadcastd ym2, [tlq+5] mova m11, [base+filter_perm] mov r5, 0xffffffffffff000f psrldq xmm2, 1 ; __ t0 kmovq k1, r5 ; 0x000f psraw xm5, xmm0, 4 packuswb xmm2, xm5 ; __ t0 a0 b0 pshufd ym2{k1}, ymm2, q3333 ; b0 b0 b0 b0 t1 t1 t1 t1 je .w8 kxnorb k3, k3, k3 ; 0x00ff vpbroadcastd xm3, [tlq-4] kandnq k2, k3, k1 ; 0xffffffffffff0000 vpermb ym3{k2}, ym11, ymm2 ; l3 l2 l1 __ b3 a3 t3 __ mova ym0, ym6 vpdpbusd ym0, ym2, ym7 mova ym1, ym6 vpdpbusd ym1, ym2, ym8 pshufb ym5{k2}, ym2, ym11 ; a0 b0 __ t0 vpbroadcastd m2, [tlq+9] vpdpbusd ym0, ym3, ym9 vpdpbusd ym1, ym3, ym10 vpbroadcastd xm3, [tlq-6] ; l5 l4 l3 __ kunpckbw k4, k1, k3 ; 0x0fff packssdw ym0, ym1 psraw ym0, 4 ; a0 d0 a1 b1 packuswb ym5, ym0 ; a0 b0 c0 d0 __ t1 a1 b1 pshufd m2{k3}, m5, q3333 ; d0 d0 d0 d0 b1 b1 b1 b1 t2 t2 t2 t2 vpermb m3{k2}, m11, m5 ; l5 l4 l3 __ d3 c3 b3 __ b7 a7 t7 __ mova m4, m6 vpdpbusd m4, m2, m7 mova m1, m6 vpdpbusd m1, m2, m8 psrldq m0, m2, 1 ; __ d0 __ b0 __ t0 vpbroadcastd m2, [tlq+13] vpdpbusd m4, m3, m9 vpdpbusd m1, m3, m10 mova m12, [base+filter_end] lea r5d, [hq-6] mov r6, dstq cmovp hd, r5d ; w == 16 ? 
h : h - 6 packssdw m4, m1 psraw m4, 4 ; e0 f0 c1 d1 a2 b2 packuswb m0, m4 ; __ d0 e0 f0 __ b1 c1 d1 __ t2 a2 b2 pshufd m2{k4}, m0, q3333 ; f0 f0 f0 f0 d1 d1 d1 d1 b2 b2 b2 b2 t3 t3 t3 t3 .w16_loop: vpbroadcastd xm3, [tlq-8] vpermb m3{k2}, m11, m0 ; l7 l6 l5 __ f3 e3 d3 __ d7 c7 b7 __ bb ab tb __ mova m1, m6 vpdpbusd m1, m2, m7 mova m0, m6 vpdpbusd m0, m2, m8 sub tlq, 2 vpdpbusd m1, m3, m9 vpdpbusd m0, m3, m10 packssdw m1, m0 mova m0, m4 psraw m4, m1, 4 ; g0 h0 e1 f1 c2 d2 a3 b3 packuswb m0, m4 ; e0 f0 g0 h0 c1 d1 e1 f1 a2 b2 c2 d2 __ __ a3 b3 pshufd m2, m0, q3333 ; h0 h0 h0 h0 f1 f1 f1 f1 d2 d2 d2 d2 b3 b3 b3 b3 vpermt2d m5, m12, m0 ; c0 d0 e0 f0 __ __ c1 d1 a0 a1 a2 a3 b0 b1 b2 b3 vextracti32x4 [dstq+strideq*0], m5, 2 vextracti32x4 [dstq+strideq*1], m5, 3 lea dstq, [dstq+strideq*2] sub hd, 2 jg .w16_loop cmp wd, 16 je .ret mova xm13, [filter_perm+16] mova xmm3, [r6+strideq*0] punpckhdq xmm3, [r6+strideq*1] vpbroadcastd m2{k1}, [tlq+r5+17] ; t4 t4 t4 t4 f1 f1 f1 f1 d2 d2 d2 d2 b3 b3 b3 b3 pinsrb xm3, xmm3, [tlq+r5+16], 7 pshufb xm3, xm13 vpermb m3{k2}, m11, m0 ; bf af tf __ h3 g3 f3 __ f7 e7 d7 __ db cb bb __ mova m0, m6 vpdpbusd m0, m2, m7 mova m1, m6 vpdpbusd m1, m2, m8 kunpckbw k5, k3, k1 ; 0xff0f lea r3, [strideq*3] vpdpbusd m0, m3, m9 vpdpbusd m1, m3, m10 packssdw m0, m1 psraw m0, 4 ; a4 b4 g1 h1 e2 f2 c3 d3 packuswb m4, m0 ; g0 h0 a4 b4 e1 f1 g1 h1 c2 d2 e2 f2 __ __ c3 d3 vpblendmb m1{k3}, m4, m2 ; __ t4 a4 b4 e1 f1 g1 h1 c2 d2 e2 f2 __ __ c3 d3 vpbroadcastd ym2, [tlq+r5+21] pshufd m2{k5}, m4, q3333 ; b4 b4 b4 b4 t5 t5 t5 t5 f2 f2 f2 f2 d3 d3 d3 d3 vpermt2d m5, m12, m4 ; e0 f0 g0 h0 __ __ e1 f1 c0 c1 c2 c3 d0 d1 d2 d3 vextracti32x4 [dstq+strideq*0], m5, 2 vextracti32x4 [dstq+strideq*1], m5, 3 punpckhqdq xmm3, [r6+r3] pinsrb xmm3, [r6+strideq*2+15], 11 pshufb xm3, xmm3, xm13 vpermb m3{k2}, m11, m1 ; df cf bf __ bj aj tj __ h7 g7 f7 __ fb eb db __ mova m4, m6 vpdpbusd m4, m2, m7 mova m1, m6 vpdpbusd m1, m2, m8 kxnord k3, k3, k4 ; 0xfffff0ff lea r4, [strideq*5] vpdpbusd m4, m3, m9 vpdpbusd m1, m3, m10 packssdw m4, m1 psraw m4, 4 ; c4 d4 a5 b5 g2 h2 e3 f3 packuswb m0, m4 ; a4 b4 c4 d4 g1 h1 a5 b5 e2 f2 g2 h2 __ __ e3 f3 vpblendmw m1{k3}, m2, m0 ; a4 b4 c4 d4 __ t5 a5 b5 e2 f2 g2 h2 __ __ e3 f3 vpbroadcastd m2, [tlq+r5+25] pshufd m2{k3}, m0, q3333 ; d4 d4 d4 d4 b5 b5 b5 b5 t6 t6 t6 t6 f3 f3 f3 f3 vpermt2d m5, m12, m0 ; g0 h0 a4 b4 __ __ g1 h1 e0 e1 e2 e3 f0 f1 f2 f3 vextracti32x4 [dstq+strideq*2], m5, 2 vextracti32x4 [dstq+r3 ], m5, 3 punpckhqdq xmm3, [r6+r4] pinsrb xmm3, [r6+strideq*4+15], 11 pshufb xm3, xmm3, xm13 vpermb m3{k2}, m11, m1 ; ff ef df __ dj cj bj __ bn an tn __ hb hb fb __ mova m0, m6 vpdpbusd m0, m2, m7 mova m1, m6 vpdpbusd m1, m2, m8 kunpckwd k1, k1, k2 ; 0x000f0000 vpdpbusd m0, m3, m9 vpdpbusd m1, m3, m10 packssdw m0, m1 psraw m0, 4 ; e4 f4 c5 d5 a6 b6 g3 h3 packuswb m4, m0 ; c4 d4 e4 f4 a5 b5 c5 d5 g2 h2 a6 b6 __ __ g3 h3 vpblendmw m1{k1}, m4, m2 ; c4 d4 e4 f4 a5 b5 c5 d5 __ t6 a6 b6 __ __ g3 h3 vpbroadcastd m2, [tlq+r5+29] pshufd m2{k4}, m4, q3333 ; f4 f4 f4 f4 d5 d5 d5 d5 b6 b6 b6 b6 t7 t7 t7 t7 vpermt2d m5, m12, m4 ; a4 b4 c4 d4 __ __ a5 b5 g0 g1 g2 g3 h0 h1 h2 h3 vextracti32x4 [dstq+strideq*4], m5, 2 vextracti32x4 [dstq+r4 ], m5, 3 lea r0, [strideq+r3*2] .w32_loop: punpckhqdq xmm3, [r6+r0] pinsrb xmm3, [r6+r3*2+15], 11 pshufb xm3, xmm3, xm13 vpermb m3{k2}, m11, m1 ; hf gf ff __ fj ej dj __ dn cn bn __ br ar tr __ .w32_loop_tail: mova m4, m6 vpdpbusd m4, m2, m7 mova m1, m6 vpdpbusd m1, m2, m8 vpdpbusd m4, m3, m9 vpdpbusd m1, m3, m10 packssdw m4, m1 mova m1, m0 psraw 
m0, m4, 4 ; g4 h4 e5 f5 c6 d6 a7 b7 packuswb m1, m0 ; e4 f4 g4 h4 c5 d5 e5 f5 a6 b6 c6 d6 __ __ a7 b7 pshufd m2, m1, q3333 ; h4 h4 h4 h4 f5 f5 f5 f5 d6 d6 d6 d6 b7 b7 b7 b7 vpermt2d m5, m12, m1 ; c4 d4 e4 f4 __ __ c5 d5 a4 a5 a6 a7 b4 b5 b6 b7 vextracti32x4 [r6+strideq*0+16], m5, 2 vextracti32x4 [r6+strideq*1+16], m5, 3 lea r6, [r6+strideq*2] sub r5d, 2 jg .w32_loop vpermb m3, m11, m1 cmp r5d, -6 jg .w32_loop_tail .ret: RET .w8: vpermb ym3, ym11, ymm2 .w8_loop: vpbroadcastd ym3{k1}, [tlq-4] ; l3 l2 l1 __ b3 a3 t3 __ mova ym0, ym6 vpdpbusd ym0, ym2, ym7 mova ym1, ym6 vpdpbusd ym1, ym2, ym8 sub tlq, 2 vpdpbusd ym0, ym3, ym9 vpdpbusd ym1, ym3, ym10 mova ym3, ym5 packssdw ym0, ym1 psraw ym5, ym0, 4 ; c0 d0 a1 b1 packuswb ym3, ym5 ; a0 b0 c0 d0 __ __ a1 b1 pshufd ym2, ym3, q3333 ; d0 d0 d0 d0 b1 b1 b1 b1 vpermb ym3, ym11, ym3 ; a0 a1 b0 b1 movq [dstq+strideq*0], xm3 movhps [dstq+strideq*1], xm3 lea dstq, [dstq+strideq*2] sub hd, 2 jg .w8_loop RET .w4_loop: vpbroadcastd xmm3, [tlq-4] ; l3 l2 l1 __ mova xmm0, xm6 vpdpbusd xmm0, xmm2, xm7 mova xmm1, xm6 vpdpbusd xmm1, xmm2, xm8 sub tlq, 2 vpdpbusd xmm0, xmm3, xm9 vpdpbusd xmm1, xmm3, xm10 packssdw xmm0, xmm1 .w4: psraw xmm0, 4 ; a0 b0 packuswb xmm0, xmm0 movd [dstq+strideq*0], xmm0 pshufd xmm2, xmm0, q1111 ; b0 b0 b0 b0 movd [dstq+strideq*1], xmm2 lea dstq, [dstq+strideq*2] sub hd, 2 jg .w4_loop RET %endif ; ARCH_X86_64 av-scenechange-0.14.1/src/asm/x86/ipred_sse.asm000064400000000000000000005567721046102023000172570ustar 00000000000000; Copyright © 2018-2021, VideoLAN and dav1d authors ; Copyright © 2018, Two Orioles, LLC ; All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions are met: ; ; 1. Redistributions of source code must retain the above copyright notice, this ; list of conditions and the following disclaimer. ; ; 2. Redistributions in binary form must reproduce the above copyright notice, ; this list of conditions and the following disclaimer in the documentation ; and/or other materials provided with the distribution. ; ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
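; SSSE3 (128-bit) versions of the 8bpc intra predictors: DC/H/V, paeth,
; smooth/smooth_v/smooth_h, the directional z1/z2/z3 modes, cfl, palette and
; filter prediction.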
%include "config.asm" %include "src/asm/x86/x86inc.asm" SECTION_RODATA 16 %macro SMOOTH_WEIGHT_TABLE 1-* %rep %0 db %1-128, 127-%1 %rotate 1 %endrep %endmacro ; sm_weights[], but modified to precalculate x and 256-x with offsets to ; enable efficient use of pmaddubsw (which requires signed values) smooth_weights: SMOOTH_WEIGHT_TABLE \ 0, 0, 255, 128, 255, 149, 85, 64, \ 255, 197, 146, 105, 73, 50, 37, 32, \ 255, 225, 196, 170, 145, 123, 102, 84, \ 68, 54, 43, 33, 26, 20, 17, 16, \ 255, 240, 225, 210, 196, 182, 169, 157, \ 145, 133, 122, 111, 101, 92, 83, 74, \ 66, 59, 52, 45, 39, 34, 29, 25, \ 21, 17, 14, 12, 10, 9, 8, 8, \ 255, 248, 240, 233, 225, 218, 210, 203, \ 196, 189, 182, 176, 169, 163, 156, 150, \ 144, 138, 133, 127, 121, 116, 111, 106, \ 101, 96, 91, 86, 82, 77, 73, 69, \ 65, 61, 57, 54, 50, 47, 44, 41, \ 38, 35, 32, 29, 27, 25, 22, 20, \ 18, 16, 15, 13, 12, 10, 9, 8, \ 7, 6, 6, 5, 5, 4, 4, 4 ipred_v_shuf: db 0, 1, 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7 ipred_h_shuf: db 3, 3, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0 ipred_paeth_shuf: db 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0 z_upsample1: db 1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7 z_upsample2: db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 8, 8, 8 z_transpose4: db 8, 12, 0, 4, 9, 13, 1, 5, 10, 14, 2, 6, 11, 15, 3, 7 z3_shuf: db 1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7 z3_shuf_h4: db 4, 3, 3, 2, 2, 1, 1, 0, 12, 11, 11, 10, 10, 9, 9, 8 filter_shuf1: db 3, 4, 3, 4, 5, 6, 5, 6, 7, 2, 7, 2, 1, -1, 1, -1 filter_shuf2: db 3, 4, 3, 4, 5, 6, 5, 6, 7, 11, 7, 11, 15, -1, 15, -1 z_filter_wh4: db 7, 7, 19, 7, z_filter_wh8: db 19, 19, 11, 19, 11, 15, 15, 15, 23, 23, 23, 23, 39, 39, 39, 39 pd_32768: dd 32768 z3_filter_k_tail: db 64, 0, 64, 0, 64, 0, 56, 8 z1_shuf_w4: db 0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12 pb_0to15: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 pb_15to0: db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 z_base_inc: dw 0*64, 1*64, 2*64, 3*64, 4*64, 5*64, 6*64, 7*64 z3_base_inc: dw 7*64, 6*64, 5*64, 4*64, 3*64, 2*64, 1*64, 0*64 z_filter_wh16: db 19, 19, 19, 23, 23, 23, 31, 31, 31, 47, 47, 47, 79, 79, 79, -1 z_filter_t_w48: db 55,127, 7,127, 15, 31, 39, 31,127, 39,127, 39, 7, 15, 31, 15 db 39, 63, 3, 63, 3, 3, 19, 3, 47, 19, 47, 19, 3, 3, 3, 3 z_filter_t_w16: db 15, 31, 7, 15, 31, 7, 3, 31, 3, 3, 3, 3, 3, 3, 0, 0 z_filter_s: db 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7 db 7, 8, 8, 9, 9, 10, 10, 11 z_filter_k_tail: db 0, 64, 0, 64, 8, 56, 0, 64 z2_h_shuf: db 7, 6, 15, 14, 6, 5, 14, 13, 5, 4, 13, 12, 4, 3, 12, 11 z2_upsample: db 7, 6, 15, 14, 5, 4, 13, 12, 3, 2, 11, 10, 1, 0, 9, 8 z2_dy_offset: dw 88*64, 88*64, 87*64, 87*64 pw_m1to4: dw -1, -2, -3, -4 z_filter_k: times 4 db 0, 16 times 4 db 0, 20 times 4 db 8, 16 times 4 db 32, 16 times 4 db 24, 20 times 4 db 16, 16 times 4 db 0, 0 times 4 db 0, 0 pw_8: times 8 db 8, 0 pb_3: times 16 db 3 pb_16: times 16 db 16 pw_62: times 8 dw 62 pw_64: times 8 dw 64 pw_256: times 8 dw 256 pw_512: times 8 dw 512 pw_m256: times 8 dw -256 pb_2: times 8 db 2 pb_4: times 8 db 4 pb_8: times 8 db 8 pb_128: times 8 db 128 pb_m16: times 8 db -16 pw_128: times 4 dw 128 pw_255: times 4 dw 255 pb_36_m4: times 4 db 36, -4 pb_127_m127: times 4 db 127, -127 %macro JMP_TABLE 3-* %xdefine %1_%2_table (%%table - 2*4) %xdefine %%base mangle(private_prefix %+ _%1_8bpc_%2) %%table: %rep %0 - 2 dd %%base %+ .%3 - (%%table - 2*4) %rotate 1 %endrep %endmacro %define ipred_dc_splat_ssse3_table (ipred_dc_ssse3_table + 10*4) %define ipred_cfl_splat_ssse3_table 
(ipred_cfl_ssse3_table + 8*4) JMP_TABLE ipred_h, ssse3, w4, w8, w16, w32, w64 JMP_TABLE ipred_dc, ssse3, h4, h8, h16, h32, h64, w4, w8, w16, w32, w64, \ s4-10*4, s8-10*4, s16-10*4, s32-10*4, s64-10*4 JMP_TABLE ipred_dc_left, ssse3, h4, h8, h16, h32, h64 JMP_TABLE ipred_smooth, ssse3, w4, w8, w16, w32, w64 JMP_TABLE ipred_smooth_v, ssse3, w4, w8, w16, w32, w64 JMP_TABLE ipred_smooth_h, ssse3, w4, w8, w16, w32, w64 JMP_TABLE ipred_paeth, ssse3, w4, w8, w16, w32, w64 JMP_TABLE ipred_z1, ssse3, w4, w8, w16, w32, w64 JMP_TABLE ipred_z2, ssse3, w4, w8, w16, w32, w64 JMP_TABLE ipred_z3, ssse3, h4, h8, h16, h32, h64 JMP_TABLE pal_pred, ssse3, w4, w8, w16, w32, w64 JMP_TABLE ipred_cfl, ssse3, h4, h8, h16, h32, w4, w8, w16, w32, \ s4-8*4, s8-8*4, s16-8*4, s32-8*4 JMP_TABLE ipred_cfl_left, ssse3, h4, h8, h16, h32 JMP_TABLE ipred_filter, ssse3, w4, w8, w16, w32 cextern dr_intra_derivative cextern filter_intra_taps SECTION .text ;--------------------------------------------------------------------------------------- ;int dav1d_ipred_h_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft, ; const int width, const int height, const int a); ;--------------------------------------------------------------------------------------- %macro IPRED_SET 3 ; width, stride, stride size pshuflw_imm8 pshuflw m1, m0, %3 ; extend 8 byte for 2 pos punpcklqdq m1, m1 mova [dstq + %2], m1 %if %1 > 16 mova [dstq + 16 + %2], m1 %endif %if %1 > 32 mova [dstq + 32 + %2], m1 mova [dstq + 48 + %2], m1 %endif %endmacro %macro IPRED_H 1 ; width sub tlq, 4 movd m0, [tlq] ; get 4 bytes of topleft data punpcklbw m0, m0 ; extend 2 byte %if %1 == 4 pshuflw m1, m0, q2233 movd [dstq+strideq*0], m1 psrlq m1, 32 movd [dstq+strideq*1], m1 pshuflw m0, m0, q0011 movd [dstq+strideq*2], m0 psrlq m0, 32 movd [dstq+stride3q ], m0 %elif %1 == 8 punpcklwd m0, m0 punpckhdq m1, m0, m0 punpckldq m0, m0 movq [dstq+strideq*1], m1 movhps [dstq+strideq*0], m1 movq [dstq+stride3q ], m0 movhps [dstq+strideq*2], m0 %else IPRED_SET %1, 0, q3333 IPRED_SET %1, strideq, q2222 IPRED_SET %1, strideq*2, q1111 IPRED_SET %1, stride3q, q0000 %endif lea dstq, [dstq+strideq*4] sub hd, 4 jg .w%1 RET %endmacro INIT_XMM ssse3 cglobal ipred_h_8bpc, 3, 6, 2, dst, stride, tl, w, h, stride3 LEA r5, ipred_h_ssse3_table tzcnt wd, wm movifnidn hd, hm movsxd wq, [r5+wq*4] add wq, r5 lea stride3q, [strideq*3] jmp wq .w4: IPRED_H 4 .w8: IPRED_H 8 .w16: IPRED_H 16 .w32: IPRED_H 32 .w64: IPRED_H 64 ;--------------------------------------------------------------------------------------- ;int dav1d_ipred_v_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft, ; const int width, const int height, const int a); ;--------------------------------------------------------------------------------------- cglobal ipred_v_8bpc, 3, 7, 6, dst, stride, tl, w, h, stride3 LEA r5, ipred_dc_splat_ssse3_table tzcnt wd, wm movu m0, [tlq+ 1] movu m1, [tlq+17] movu m2, [tlq+33] movu m3, [tlq+49] movifnidn hd, hm movsxd wq, [r5+wq*4] add wq, r5 lea stride3q, [strideq*3] jmp wq ;--------------------------------------------------------------------------------------- ;int dav1d_ipred_dc_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft, ; const int width, const int height, const int a); ;--------------------------------------------------------------------------------------- cglobal ipred_dc_8bpc, 3, 7, 6, dst, stride, tl, w, h, stride3 movifnidn hd, hm movifnidn wd, wm tzcnt r6d, hd lea r5d, [wq+hq] movd m4, r5d tzcnt r5d, r5d movd m5, r5d LEA r5, 
ipred_dc_ssse3_table tzcnt wd, wd movsxd r6, [r5+r6*4] movsxd wq, [r5+wq*4+20] pcmpeqd m3, m3 psrlw m4, 1 ; dc = (width + height) >> 1; add r6, r5 add wq, r5 lea stride3q, [strideq*3] jmp r6 .h4: movd m0, [tlq-4] pmaddubsw m0, m3 jmp wq .w4: movd m1, [tlq+1] pmaddubsw m1, m3 psubw m0, m4 paddw m0, m1 pmaddwd m0, m3 cmp hd, 4 jg .w4_mul psrlw m0, 3 ; dc >>= ctz(width + height); jmp .w4_end .w4_mul: punpckhqdq m1, m0, m0 paddw m0, m1 psrlq m1, m0, 32 paddw m0, m1 psrlw m0, 2 mov r6d, 0x5556 mov r2d, 0x3334 test hd, 8 cmovz r6d, r2d movd m5, r6d pmulhuw m0, m5 .w4_end: pxor m1, m1 pshufb m0, m1 .s4: movd [dstq+strideq*0], m0 movd [dstq+strideq*1], m0 movd [dstq+strideq*2], m0 movd [dstq+stride3q ], m0 lea dstq, [dstq+strideq*4] sub hd, 4 jg .s4 RET ALIGN function_align .h8: movq m0, [tlq-8] pmaddubsw m0, m3 jmp wq .w8: movq m1, [tlq+1] pmaddubsw m1, m3 psubw m4, m0 punpckhqdq m0, m0 psubw m0, m4 paddw m0, m1 pshuflw m1, m0, q1032 ; psrlq m1, m0, 32 paddw m0, m1 pmaddwd m0, m3 psrlw m0, m5 cmp hd, 8 je .w8_end mov r6d, 0x5556 mov r2d, 0x3334 cmp hd, 32 cmovz r6d, r2d movd m1, r6d pmulhuw m0, m1 .w8_end: pxor m1, m1 pshufb m0, m1 .s8: movq [dstq+strideq*0], m0 movq [dstq+strideq*1], m0 movq [dstq+strideq*2], m0 movq [dstq+stride3q ], m0 lea dstq, [dstq+strideq*4] sub hd, 4 jg .s8 RET ALIGN function_align .h16: mova m0, [tlq-16] pmaddubsw m0, m3 jmp wq .w16: movu m1, [tlq+1] pmaddubsw m1, m3 paddw m0, m1 psubw m4, m0 punpckhqdq m0, m0 psubw m0, m4 pshuflw m1, m0, q1032 ; psrlq m1, m0, 32 paddw m0, m1 pmaddwd m0, m3 psrlw m0, m5 cmp hd, 16 je .w16_end mov r6d, 0x5556 mov r2d, 0x3334 test hd, 8|32 cmovz r6d, r2d movd m1, r6d pmulhuw m0, m1 .w16_end: pxor m1, m1 pshufb m0, m1 .s16: mova [dstq+strideq*0], m0 mova [dstq+strideq*1], m0 mova [dstq+strideq*2], m0 mova [dstq+stride3q ], m0 lea dstq, [dstq+strideq*4] sub hd, 4 jg .s16 RET ALIGN function_align .h32: mova m0, [tlq-32] pmaddubsw m0, m3 mova m2, [tlq-16] pmaddubsw m2, m3 paddw m0, m2 jmp wq .w32: movu m1, [tlq+1] pmaddubsw m1, m3 movu m2, [tlq+17] pmaddubsw m2, m3 paddw m1, m2 paddw m0, m1 psubw m4, m0 punpckhqdq m0, m0 psubw m0, m4 pshuflw m1, m0, q1032 ; psrlq m1, m0, 32 paddw m0, m1 pmaddwd m0, m3 psrlw m0, m5 cmp hd, 32 je .w32_end lea r2d, [hq*2] mov r6d, 0x5556 mov r2d, 0x3334 test hd, 64|16 cmovz r6d, r2d movd m1, r6d pmulhuw m0, m1 .w32_end: pxor m1, m1 pshufb m0, m1 mova m1, m0 .s32: mova [dstq], m0 mova [dstq+16], m1 mova [dstq+strideq], m0 mova [dstq+strideq+16], m1 mova [dstq+strideq*2], m0 mova [dstq+strideq*2+16], m1 mova [dstq+stride3q], m0 mova [dstq+stride3q+16], m1 lea dstq, [dstq+strideq*4] sub hd, 4 jg .s32 RET ALIGN function_align .h64: mova m0, [tlq-64] mova m1, [tlq-48] pmaddubsw m0, m3 pmaddubsw m1, m3 paddw m0, m1 mova m1, [tlq-32] pmaddubsw m1, m3 paddw m0, m1 mova m1, [tlq-16] pmaddubsw m1, m3 paddw m0, m1 jmp wq .w64: movu m1, [tlq+ 1] movu m2, [tlq+17] pmaddubsw m1, m3 pmaddubsw m2, m3 paddw m1, m2 movu m2, [tlq+33] pmaddubsw m2, m3 paddw m1, m2 movu m2, [tlq+49] pmaddubsw m2, m3 paddw m1, m2 paddw m0, m1 psubw m4, m0 punpckhqdq m0, m0 psubw m0, m4 pshuflw m1, m0, q1032 ; psrlq m1, m0, 32 paddw m0, m1 pmaddwd m0, m3 psrlw m0, m5 cmp hd, 64 je .w64_end mov r6d, 0x5556 mov r2d, 0x3334 test hd, 32 cmovz r6d, r2d movd m1, r6d pmulhuw m0, m1 .w64_end: pxor m1, m1 pshufb m0, m1 mova m1, m0 mova m2, m0 mova m3, m0 .s64: mova [dstq], m0 mova [dstq+16], m1 mova [dstq+32], m2 mova [dstq+48], m3 mova [dstq+strideq], m0 mova [dstq+strideq+16], m1 mova [dstq+strideq+32], m2 mova [dstq+strideq+48], m3 lea dstq, [dstq+strideq*2] 
sub hd, 2 jg .s64 RET ;--------------------------------------------------------------------------------------- ;int dav1d_ipred_dc_left_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft, ; const int width, const int height, const int a); ;--------------------------------------------------------------------------------------- cglobal ipred_dc_left_8bpc, 3, 7, 6, dst, stride, tl, w, h, stride3 LEA r5, ipred_dc_left_ssse3_table mov hd, hm ; zero upper half tzcnt r6d, hd sub tlq, hq tzcnt wd, wm movu m0, [tlq] movd m3, [r5-ipred_dc_left_ssse3_table+pd_32768] movd m2, r6d psrld m3, m2 movsxd r6, [r5+r6*4] pcmpeqd m2, m2 pmaddubsw m0, m2 add r6, r5 add r5, ipred_dc_splat_ssse3_table-ipred_dc_left_ssse3_table movsxd wq, [r5+wq*4] add wq, r5 jmp r6 .h64: movu m1, [tlq+48] ; unaligned when jumping here from dc_top pmaddubsw m1, m2 paddw m0, m1 movu m1, [tlq+32] ; unaligned when jumping here from dc_top pmaddubsw m1, m2 paddw m0, m1 .h32: movu m1, [tlq+16] ; unaligned when jumping here from dc_top pmaddubsw m1, m2 paddw m0, m1 .h16: pshufd m1, m0, q3232 ; psrlq m1, m0, 16 paddw m0, m1 .h8: pshuflw m1, m0, q1032 ; psrlq m1, m0, 32 paddw m0, m1 .h4: pmaddwd m0, m2 pmulhrsw m0, m3 lea stride3q, [strideq*3] pxor m1, m1 pshufb m0, m1 mova m1, m0 mova m2, m0 mova m3, m0 jmp wq ;--------------------------------------------------------------------------------------- ;int dav1d_ipred_dc_128_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft, ; const int width, const int height, const int a); ;--------------------------------------------------------------------------------------- cglobal ipred_dc_128_8bpc, 2, 7, 6, dst, stride, tl, w, h, stride3 LEA r5, ipred_dc_splat_ssse3_table tzcnt wd, wm movifnidn hd, hm movsxd wq, [r5+wq*4] movddup m0, [r5-ipred_dc_splat_ssse3_table+pb_128] mova m1, m0 mova m2, m0 mova m3, m0 add wq, r5 lea stride3q, [strideq*3] jmp wq ;--------------------------------------------------------------------------------------- ;int dav1d_ipred_dc_top_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft, ; const int width, const int height, const int a); ;--------------------------------------------------------------------------------------- cglobal ipred_dc_top_8bpc, 3, 7, 6, dst, stride, tl, w, h LEA r5, ipred_dc_left_ssse3_table tzcnt wd, wm inc tlq movu m0, [tlq] movifnidn hd, hm movd m3, [r5-ipred_dc_left_ssse3_table+pd_32768] movd m2, wd psrld m3, m2 movsxd r6, [r5+wq*4] pcmpeqd m2, m2 pmaddubsw m0, m2 add r6, r5 add r5, ipred_dc_splat_ssse3_table-ipred_dc_left_ssse3_table movsxd wq, [r5+wq*4] add wq, r5 jmp r6 ;--------------------------------------------------------------------------------------- ;int dav1d_ipred_smooth_v_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft, ; const int width, const int height, const int a); ;--------------------------------------------------------------------------------------- %macro SMOOTH 6 ; src[1-2], mul[1-2], add[1-2] ; w * a = (w - 128) * a + 128 * a ; (256 - w) * b = (127 - w) * b + 129 * b ; => w * a + (256 - w) * b = [(w - 128) * a + (127 - w) * b] + [128 * a + 129 * b] pmaddubsw m6, m%3, m%1 pmaddubsw m0, m%4, m%2 ; (w - 128) * a + (127 - w) * b paddw m6, m%5 paddw m0, m%6 ; [(w - 128) * a + (127 - w) * b] + [128 * a + 129 * b + 128] psrlw m6, 8 psrlw m0, 8 packuswb m6, m0 %endmacro cglobal ipred_smooth_v_8bpc, 3, 7, 7, dst, stride, tl, w, h, weights %define base r6-ipred_smooth_v_ssse3_table LEA r6, ipred_smooth_v_ssse3_table tzcnt wd, wm mov hd, hm movsxd wq, [r6+wq*4] 
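; Worked example of the identity in the SMOOTH macro above, with arbitrary
; pixels a = 200 (top), b = 40 (bottom) and weight w = 149 (stored in
; smooth_weights as the signed byte pair 21, -22):
;   (w-128)*a + (127-w)*b =  21*200 - 22*40     =  3320
;   128*a + 129*b + 128   = 25600 + 5160 + 128  = 30888
;   sum = 34208 = 149*200 + 107*40 + 128, and 34208 >> 8 = 133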
movddup m0, [base+pb_127_m127] movddup m1, [base+pw_128] lea weightsq, [base+smooth_weights+hq*4] neg hq movd m5, [tlq+hq] pxor m2, m2 pshufb m5, m2 add wq, r6 jmp wq .w4: movd m2, [tlq+1] punpckldq m2, m2 punpcklbw m2, m5 ; top, bottom lea r3, [strideq*3] mova m4, [base+ipred_v_shuf] mova m5, m4 punpckldq m4, m4 punpckhdq m5, m5 pmaddubsw m3, m2, m0 ; m3: 127 * top - 127 * bottom paddw m1, m2 ; m1: 1 * top + 256 * bottom + 128, overflow is ok paddw m3, m1 ; m3: 128 * top + 129 * bottom + 128 .w4_loop: movu m1, [weightsq+hq*2] pshufb m0, m1, m4 ;m2, m3, m4 and m5 should be stable in loop pshufb m1, m5 SMOOTH 0, 1, 2, 2, 3, 3 movd [dstq+strideq*0], m6 pshuflw m1, m6, q1032 movd [dstq+strideq*1], m1 punpckhqdq m6, m6 movd [dstq+strideq*2], m6 psrlq m6, 32 movd [dstq+r3 ], m6 lea dstq, [dstq+strideq*4] add hq, 4 jl .w4_loop RET ALIGN function_align .w8: movq m2, [tlq+1] punpcklbw m2, m5 mova m5, [base+ipred_v_shuf] lea r3, [strideq*3] pshufd m4, m5, q0000 pshufd m5, m5, q1111 pmaddubsw m3, m2, m0 paddw m1, m2 paddw m3, m1 ; m3 is output for loop .w8_loop: movq m1, [weightsq+hq*2] pshufb m0, m1, m4 pshufb m1, m5 SMOOTH 0, 1, 2, 2, 3, 3 movq [dstq+strideq*0], m6 movhps [dstq+strideq*1], m6 lea dstq, [dstq+strideq*2] add hq, 2 jl .w8_loop RET ALIGN function_align .w16: movu m3, [tlq+1] punpcklbw m2, m3, m5 punpckhbw m3, m5 pmaddubsw m4, m2, m0 pmaddubsw m5, m3, m0 paddw m0, m1, m2 paddw m1, m3 paddw m4, m0 paddw m5, m1 ; m4 and m5 is output for loop .w16_loop: movd m1, [weightsq+hq*2] pshuflw m1, m1, q0000 punpcklqdq m1, m1 SMOOTH 1, 1, 2, 3, 4, 5 mova [dstq], m6 add dstq, strideq add hq, 1 jl .w16_loop RET ALIGN function_align .w32: %if WIN64 movaps [rsp+24], xmm7 %define xmm_regs_used 8 %endif mova m7, m5 .w32_loop_init: mov r3d, 2 .w32_loop: movddup m0, [base+pb_127_m127] movddup m1, [base+pw_128] movu m3, [tlq+1] punpcklbw m2, m3, m7 punpckhbw m3, m7 pmaddubsw m4, m2, m0 pmaddubsw m5, m3, m0 paddw m0, m1, m2 paddw m1, m3 paddw m4, m0 paddw m5, m1 movd m1, [weightsq+hq*2] pshuflw m1, m1, q0000 punpcklqdq m1, m1 SMOOTH 1, 1, 2, 3, 4, 5 mova [dstq], m6 add tlq, 16 add dstq, 16 dec r3d jg .w32_loop lea dstq, [dstq-32+strideq] sub tlq, 32 add hq, 1 jl .w32_loop_init RET ALIGN function_align .w64: %if WIN64 movaps [rsp+24], xmm7 %define xmm_regs_used 8 %endif mova m7, m5 .w64_loop_init: mov r3d, 4 .w64_loop: movddup m0, [base+pb_127_m127] movddup m1, [base+pw_128] movu m3, [tlq+1] punpcklbw m2, m3, m7 punpckhbw m3, m7 pmaddubsw m4, m2, m0 pmaddubsw m5, m3, m0 paddw m0, m1, m2 paddw m1, m3 paddw m4, m0 paddw m5, m1 movd m1, [weightsq+hq*2] pshuflw m1, m1, q0000 punpcklqdq m1, m1 SMOOTH 1, 1, 2, 3, 4, 5 mova [dstq], m6 add tlq, 16 add dstq, 16 dec r3d jg .w64_loop lea dstq, [dstq-64+strideq] sub tlq, 64 add hq, 1 jl .w64_loop_init RET ;--------------------------------------------------------------------------------------- ;int dav1d_ipred_smooth_h_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft, ; const int width, const int height, const int a); ;--------------------------------------------------------------------------------------- cglobal ipred_smooth_h_8bpc, 3, 7, 8, dst, stride, tl, w, h %define base r6-ipred_smooth_h_ssse3_table LEA r6, ipred_smooth_h_ssse3_table mov wd, wm movd m3, [tlq+wq] pxor m1, m1 pshufb m3, m1 ; right tzcnt wd, wd mov hd, hm movsxd wq, [r6+wq*4] movddup m4, [base+pb_127_m127] movddup m5, [base+pw_128] add wq, r6 jmp wq .w4: movddup m6, [base+smooth_weights+4*2] mova m7, [base+ipred_h_shuf] sub tlq, 4 sub tlq, hq lea r3, [strideq*3] .w4_loop: movd m2, 
[tlq+hq] ; left pshufb m2, m7 punpcklbw m1, m2, m3 ; left, right punpckhbw m2, m3 pmaddubsw m0, m1, m4 ; 127 * left - 127 * right paddw m0, m1 ; 128 * left + 129 * right pmaddubsw m1, m6 paddw m1, m5 paddw m0, m1 pmaddubsw m1, m2, m4 paddw m1, m2 pmaddubsw m2, m6 paddw m2, m5 paddw m1, m2 psrlw m0, 8 psrlw m1, 8 packuswb m0, m1 movd [dstq+strideq*0], m0 pshuflw m1, m0, q1032 movd [dstq+strideq*1], m1 punpckhqdq m0, m0 movd [dstq+strideq*2], m0 psrlq m0, 32 movd [dstq+r3 ], m0 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w4_loop RET ALIGN function_align .w8: mova m6, [base+smooth_weights+8*2] mova m7, [base+ipred_h_shuf] sub tlq, 4 sub tlq, hq punpckldq m7, m7 .w8_loop: movd m2, [tlq+hq] ; left pshufb m2, m7 punpcklbw m1, m2, m3 ; left, right punpckhbw m2, m3 pmaddubsw m0, m1, m4 ; 127 * left - 127 * right paddw m0, m1 ; 128 * left + 129 * right pmaddubsw m1, m6 paddw m1, m5 paddw m0, m1 pmaddubsw m1, m2, m4 paddw m1, m2 pmaddubsw m2, m6 paddw m2, m5 paddw m1, m2 psrlw m0, 8 psrlw m1, 8 packuswb m0, m1 movq [dstq+strideq*0], m0 movhps [dstq+strideq*1], m0 lea dstq, [dstq+strideq*2] sub hd, 2 jg .w8_loop RET ALIGN function_align .w16: mova m6, [base+smooth_weights+16*2] mova m7, [base+smooth_weights+16*3] sub tlq, 1 sub tlq, hq .w16_loop: pxor m1, m1 movd m2, [tlq+hq] ; left pshufb m2, m1 punpcklbw m1, m2, m3 ; left, right punpckhbw m2, m3 pmaddubsw m0, m1, m4 ; 127 * left - 127 * right paddw m0, m1 ; 128 * left + 129 * right pmaddubsw m1, m6 paddw m1, m5 paddw m0, m1 pmaddubsw m1, m2, m4 paddw m1, m2 pmaddubsw m2, m7 paddw m2, m5 paddw m1, m2 psrlw m0, 8 psrlw m1, 8 packuswb m0, m1 mova [dstq], m0 lea dstq, [dstq+strideq] sub hd, 1 jg .w16_loop RET ALIGN function_align .w32: sub tlq, 1 sub tlq, hq pxor m6, m6 .w32_loop_init: mov r5, 2 lea r3, [base+smooth_weights+16*4] .w32_loop: mova m7, [r3] add r3, 16 movd m2, [tlq+hq] ; left pshufb m2, m6 punpcklbw m1, m2, m3 ; left, right punpckhbw m2, m3 pmaddubsw m0, m1, m4 ; 127 * left - 127 * right paddw m0, m1 ; 128 * left + 129 * right pmaddubsw m1, m7 paddw m1, m5 paddw m0, m1 pmaddubsw m1, m2, m4 paddw m1, m2 mova m7, [r3] add r3, 16 pmaddubsw m2, m7 paddw m2, m5 paddw m1, m2 psrlw m0, 8 psrlw m1, 8 packuswb m0, m1 mova [dstq], m0 add dstq, 16 dec r5 jg .w32_loop lea dstq, [dstq-32+strideq] sub hd, 1 jg .w32_loop_init RET ALIGN function_align .w64: sub tlq, 1 sub tlq, hq pxor m6, m6 .w64_loop_init: mov r5, 4 lea r3, [base+smooth_weights+16*8] .w64_loop: mova m7, [r3] add r3, 16 movd m2, [tlq+hq] ; left pshufb m2, m6 punpcklbw m1, m2, m3 ; left, right punpckhbw m2, m3 pmaddubsw m0, m1, m4 ; 127 * left - 127 * right paddw m0, m1 ; 128 * left + 129 * right pmaddubsw m1, m7 paddw m1, m5 paddw m0, m1 pmaddubsw m1, m2, m4 paddw m1, m2 mova m7, [r3] add r3, 16 pmaddubsw m2, m7 paddw m2, m5 paddw m1, m2 psrlw m0, 8 psrlw m1, 8 packuswb m0, m1 mova [dstq], m0 add dstq, 16 dec r5 jg .w64_loop lea dstq, [dstq-64+strideq] sub hd, 1 jg .w64_loop_init RET ;--------------------------------------------------------------------------------------- ;int dav1d_ipred_smooth_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft, ; const int width, const int height, const int a); ;--------------------------------------------------------------------------------------- %macro SMOOTH_2D_END 7 ; src[1-2], mul[1-2], add[1-2], m3 pmaddubsw m6, m%3, m%1 mova m0, m6 pmaddubsw m6, m%4, m%2 mova m1, m6 %ifnum %5 paddw m0, m%5 %else paddw m0, %5 %endif %ifnum %6 paddw m1, m%6 %else paddw m1, %6 %endif %ifnum %7 %else mova m3, %7 %endif pavgw m0, m2 pavgw m1, m3 psrlw 
m0, 8 psrlw m1, 8 packuswb m0, m1 %endmacro %macro SMOOTH_OUTPUT_16B 12 ; m1, [buffer1, buffer2, buffer3, buffer4,] [w1, w2,] m3, m7, [m0, m4, m5] mova m1, [rsp+16*%1] ; top punpckhbw m6, m1, m0 ; top, bottom punpcklbw m1, m0 ; top, bottom pmaddubsw m2, m1, m5 mova [rsp+16*%2], m1 paddw m1, m3 ; 1 * top + 255 * bottom + 255 paddw m2, m1 ; 128 * top + 129 * bottom + 255 mova [rsp+16*%3], m2 pmaddubsw m2, m6, m5 mova [rsp+16*%4], m6 paddw m6, m3 ; 1 * top + 255 * bottom + 255 paddw m2, m6 ; 128 * top + 129 * bottom + 255 mova [rsp+16*%5], m2 movd m1, [tlq+hq] ; left pshufb m1, [base+pb_3] ; topleft[-(1 + y)] punpcklbw m1, m4 ; left, right pmaddubsw m2, m1, m5 ; 127 * left - 127 * right paddw m2, m1 ; 128 * left + 129 * right mova m3, m2 pmaddubsw m0, m1, %6 ; weights_hor = &dav1d_sm_weights[width]; pmaddubsw m1, %7 paddw m2, m3, m0 paddw m3, m1 movd m1, [v_weightsq] ; weights_ver = &dav1d_sm_weights[height]; mova m7, [rsp+16*%9] pshufb m1, m7 mova [rsp+16*%8], m3 mova m4, [rsp+16*%2] mova m5, [rsp+16*%3] mova m3, [rsp+16*%4] mova m7, [rsp+16*%5] SMOOTH_2D_END 1, 1, 4, 3, 5, 7, [rsp+16*%8] mova [dstq], m0 movddup m3, [base+pw_255] ; recovery mova m0, [rsp+16*%10] ; recovery mova m4, [rsp+16*%11] ; recovery mova m5, [rsp+16*%12] ; recovery %endmacro cglobal ipred_smooth_8bpc, 3, 7, 8, -13*16, dst, stride, tl, w, h, v_weights %define base r6-ipred_smooth_ssse3_table mov wd, wm mov hd, hm LEA r6, ipred_smooth_ssse3_table movd m4, [tlq+wq] ; right pxor m2, m2 pshufb m4, m2 tzcnt wd, wd mov r5, tlq sub r5, hq movsxd wq, [r6+wq*4] movddup m5, [base+pb_127_m127] movd m0, [r5] pshufb m0, m2 ; bottom movddup m3, [base+pw_255] add wq, r6 lea v_weightsq, [base+smooth_weights+hq*2] ; weights_ver = &dav1d_sm_weights[height] jmp wq .w4: mova m7, [base+ipred_v_shuf] movd m1, [tlq+1] ; left pshufd m1, m1, q0000 sub tlq, 4 lea r3, [strideq*3] sub tlq, hq punpcklbw m1, m0 ; top, bottom pshufd m6, m7, q1100 pshufd m7, m7, q3322 pmaddubsw m2, m1, m5 paddw m3, m1 ; 1 * top + 255 * bottom + 255 paddw m2, m3 ; 128 * top + 129 * bottom + 255 mova [rsp+16*0], m1 mova [rsp+16*1], m2 movq m1, [base+smooth_weights+4*2] ; weights_hor = &dav1d_sm_weights[width]; punpcklqdq m1, m1 mova [rsp+16*2], m1 mova [rsp+16*3], m4 mova [rsp+16*4], m6 mova [rsp+16*5], m5 .w4_loop: movd m1, [tlq+hq] ; left pshufb m1, [base+ipred_h_shuf] punpcklbw m0, m1, m4 ; left, right punpckhbw m1, m4 pmaddubsw m2, m0, m5 ; 127 * left - 127 * right pmaddubsw m3, m1, m5 paddw m2, m0 ; 128 * left + 129 * right paddw m3, m1 mova m4, [rsp+16*2] pmaddubsw m0, m4 pmaddubsw m1, m4 paddw m2, m0 paddw m3, m1 movq m1, [v_weightsq] ; weights_ver = &dav1d_sm_weights[height]; add v_weightsq, 8 pshufb m0, m1, m6 pshufb m1, m7 mova m4, [rsp+16*0] mova m5, [rsp+16*1] SMOOTH_2D_END 0, 1, 4, 4, 5, 5, 3 mova m4, [rsp+16*3] mova m6, [rsp+16*4] mova m5, [rsp+16*5] movd [dstq+strideq*0], m0 pshuflw m1, m0, q1032 movd [dstq+strideq*1], m1 punpckhqdq m0, m0 movd [dstq+strideq*2], m0 psrlq m0, 32 movd [dstq+r3 ], m0 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w4_loop RET ALIGN function_align .w8: mova m7, [base+ipred_v_shuf] movq m1, [tlq+1] ; left punpcklqdq m1, m1 sub tlq, 4 sub tlq, hq punpcklbw m1, m0 pshufd m6, m7, q0000 pshufd m7, m7, q1111 pmaddubsw m2, m1, m5 paddw m3, m1 paddw m2, m3 mova [rsp+16*0], m1 mova [rsp+16*1], m2 mova m1, [base+smooth_weights+8*2] ; weights_hor = &dav1d_sm_weights[width]; mova [rsp+16*2], m1 mova [rsp+16*3], m4 mova [rsp+16*4], m6 mova [rsp+16*5], m5 .w8_loop: movd m1, [tlq+hq] ; left pshufb m1, [base+ipred_h_shuf] pshufd m1, m1, q1100 
punpcklbw m0, m1, m4 punpckhbw m1, m4 pmaddubsw m2, m0, m5 pmaddubsw m3, m1, m5 paddw m2, m0 paddw m3, m1 mova m4, [rsp+16*2] pmaddubsw m0, m4 pmaddubsw m1, m4 paddw m2, m0 paddw m3, m1 movd m1, [v_weightsq] ; weights_ver = &dav1d_sm_weights[height]; add v_weightsq, 4 pshufb m0, m1, m6 pshufb m1, m7 mova m4, [rsp+16*0] mova m5, [rsp+16*1] SMOOTH_2D_END 0, 1, 4, 4, 5, 5, 3 mova m4, [rsp+16*3] mova m6, [rsp+16*4] mova m5, [rsp+16*5] movq [dstq+strideq*0], m0 movhps [dstq+strideq*1], m0 lea dstq, [dstq+strideq*2] sub hd, 2 jg .w8_loop RET ALIGN function_align .w16: mova m7, [base+ipred_v_shuf] movu m1, [tlq+1] ; left sub tlq, 4 sub tlq, hq punpckhbw m6, m1, m0 ; top, bottom punpcklbw m1, m0 ; top, bottom pshufd m7, m7, q0000 mova [rsp+16*2], m7 pmaddubsw m2, m6, m5 mova [rsp+16*5], m6 paddw m6, m3 ; 1 * top + 255 * bottom + 255 paddw m2, m6 ; 128 * top + 129 * bottom + 255 mova [rsp+16*6], m2 pmaddubsw m2, m1, m5 paddw m3, m1 ; 1 * top + 255 * bottom + 255 mova [rsp+16*0], m1 paddw m2, m3 ; 128 * top + 129 * bottom + 255 mova [rsp+16*1], m2 mova [rsp+16*3], m4 mova [rsp+16*4], m5 .w16_loop: movd m1, [tlq+hq] ; left pshufb m1, [base+pb_3] ; topleft[-(1 + y)] punpcklbw m1, m4 ; left, right pmaddubsw m2, m1, m5 ; 127 * left - 127 * right paddw m2, m1 ; 128 * left + 129 * right mova m0, m1 mova m3, m2 pmaddubsw m0, [base+smooth_weights+16*2] ; weights_hor = &dav1d_sm_weights[width]; pmaddubsw m1, [base+smooth_weights+16*3] paddw m2, m0 paddw m3, m1 movd m1, [v_weightsq] ; weights_ver = &dav1d_sm_weights[height]; add v_weightsq, 2 mova m7, [rsp+16*2] pshufb m1, m7 mova [rsp+16*7], m3 mova m4, [rsp+16*0] mova m5, [rsp+16*1] mova m3, [rsp+16*5] mova m7, [rsp+16*6] SMOOTH_2D_END 1, 1, 4, 3, 5, 7, [rsp+16*7] mova m4, [rsp+16*3] mova m5, [rsp+16*4] mova [dstq], m0 lea dstq, [dstq+strideq] sub hd, 1 jg .w16_loop RET ALIGN function_align .w32: movu m1, [tlq+1] ; top topleft[1 + x] movu m2, [tlq+17] ; top mova [rsp+16*0], m1 mova [rsp+16*1], m2 sub tlq, 4 sub tlq, hq mova m7, [base+ipred_v_shuf] pshufd m7, m7, q0000 mova [rsp+16*2], m7 mova [rsp+16*3], m0 mova [rsp+16*4], m4 mova [rsp+16*5], m5 .w32_loop: SMOOTH_OUTPUT_16B 0, 6, 7, 8, 9, [base+smooth_weights+16*4], [base+smooth_weights+16*5], 10, 2, 3, 4, 5 add dstq, 16 SMOOTH_OUTPUT_16B 1, 6, 7, 8, 9, [base+smooth_weights+16*6], [base+smooth_weights+16*7], 10, 2, 3, 4, 5 lea dstq, [dstq-16+strideq] add v_weightsq, 2 sub hd, 1 jg .w32_loop RET ALIGN function_align .w64: movu m1, [tlq+1] ; top topleft[1 + x] movu m2, [tlq+17] ; top mova [rsp+16*0], m1 mova [rsp+16*1], m2 movu m1, [tlq+33] ; top movu m2, [tlq+49] ; top mova [rsp+16*11], m1 mova [rsp+16*12], m2 sub tlq, 4 sub tlq, hq mova m7, [base+ipred_v_shuf] pshufd m7, m7, q0000 mova [rsp+16*2], m7 mova [rsp+16*3], m0 mova [rsp+16*4], m4 mova [rsp+16*5], m5 .w64_loop: SMOOTH_OUTPUT_16B 0, 6, 7, 8, 9, [base+smooth_weights+16*8], [base+smooth_weights+16*9], 10, 2, 3, 4, 5 add dstq, 16 SMOOTH_OUTPUT_16B 1, 6, 7, 8, 9, [base+smooth_weights+16*10], [base+smooth_weights+16*11], 10, 2, 3, 4, 5 add dstq, 16 SMOOTH_OUTPUT_16B 11, 6, 7, 8, 9, [base+smooth_weights+16*12], [base+smooth_weights+16*13], 10, 2, 3, 4, 5 add dstq, 16 SMOOTH_OUTPUT_16B 12, 6, 7, 8, 9, [base+smooth_weights+16*14], [base+smooth_weights+16*15], 10, 2, 3, 4, 5 lea dstq, [dstq-48+strideq] add v_weightsq, 2 sub hd, 1 jg .w64_loop RET %if ARCH_X86_64 cglobal ipred_z1_8bpc, 3, 8, 11, 16*12, dst, stride, tl, w, h, angle, dx %define base r7-$$ lea r7, [$$] mova m8, [base+pw_62] mova m9, [base+pw_64] mova m10, [base+pw_512] %else cglobal 
ipred_z1_8bpc, 3, 7, 8, -16*13, dst, _, tl, w, h, angle, dx %define base r1-$$ %define m8 [base+pw_62] %define m9 [base+pw_64] %define m10 [base+pw_512] %define strideq r3 %define stridemp dword [rsp+16*12] mov stridemp, r1 LEA r1, $$ %endif tzcnt wd, wm movifnidn angled, anglem movifnidn hd, hm inc tlq movsxd wq, [base+ipred_z1_ssse3_table+wq*4] mov dxd, angled and dxd, 0x7e add angled, 165 ; ~90 lea wq, [base+wq+ipred_z1_ssse3_table] movzx dxd, word [base+dr_intra_derivative+dxq] xor angled, 0x4ff ; d = 90 - angle jmp wq .w4: lea r3d, [angleq+88] test r3d, 0x480 jnz .w4_no_upsample ; !enable_intra_edge_filter || angle >= 40 sar r3d, 9 add r3d, hd cmp r3d, 8 jg .w4_no_upsample ; h > 8 || (w == h && is_sm) mova m1, [tlq-1] pshufb m0, m1, [base+z_upsample1] pshufb m1, [base+z_upsample2] movddup m2, [base+pb_36_m4] add dxd, dxd pmaddubsw m0, m2 pshufd m7, m1, q3333 movd [rsp+16], m7 ; top[max_base_x] pmaddubsw m1, m2 movd m6, dxd mov r5d, dxd ; xpos pshufb m6, [base+pw_256] paddw m1, m0 movq m0, [tlq] pmulhrsw m1, m10 paddw m7, m6, m6 punpcklqdq m6, m7 ; xpos0 xpos1 packuswb m1, m1 punpcklbw m0, m1 movifnidn strideq, stridemp mova [rsp], m0 .w4_upsample_loop: lea r2d, [r5+dxq] shr r5d, 6 ; base0 movq m0, [rsp+r5] lea r5d, [r2+dxq] shr r2d, 6 ; base1 movhps m0, [rsp+r2] pand m2, m8, m6 ; frac psubw m1, m9, m2 ; 64-frac psllw m2, 8 por m1, m2 ; 64-frac, frac pmaddubsw m0, m1 paddw m6, m7 ; xpos += dx pmulhrsw m0, m10 packuswb m0, m0 movd [dstq+strideq*0], m0 pshuflw m0, m0, q1032 movd [dstq+strideq*1], m0 lea dstq, [dstq+strideq*2] sub hd, 2 jg .w4_upsample_loop RET .w4_no_upsample: mov r3d, 7 ; max_base test angled, 0x400 ; !enable_intra_edge_filter jnz .w4_main lea r3d, [hq+3] movd m0, r3d movd m2, angled shr angled, 8 ; is_sm << 1 pxor m1, m1 pshufb m0, m1 pshufb m2, m1 pcmpeqb m1, m0, [base+z_filter_wh4] pand m1, m2 pcmpgtb m1, [base+z_filter_t_w48+angleq*8] pmovmskb r5d, m1 mov r3d, 7 test r5d, r5d jz .w4_main ; filter_strength == 0 mova m3, [tlq-1] imul r5d, 0x55555555 movu m7, [base+z_filter_s+8] shr r5d, 30 ; filter_strength movddup m0, [base+pb_8] pminub m7, m0 pshufb m0, m3, [base+z_filter_s] movddup m4, [base+z_filter_k-8+r5*8+24*0] pshufb m3, m7 movddup m5, [base+z_filter_k-8+r5*8+24*1] shufps m2, m0, m3, q2121 movddup m6, [base+z_filter_k-8+r5*8+24*2] pmaddubsw m0, m4 pmaddubsw m1, m2, m4 pmaddubsw m2, m5 paddd m5, m6 pmaddubsw m4, m3, m5 pmaddubsw m3, m6 paddw m0, m2 paddw m1, m4 paddw m0, m3 pshufd m1, m1, q3333 pmulhrsw m0, m10 pmulhrsw m1, m10 mov r5d, 9 mov tlq, rsp cmp hd, 4 cmovne r3d, r5d packuswb m0, m1 mova [tlq], m0 .w4_main: add tlq, r3 movd m5, dxd movddup m0, [base+z_base_inc] ; base_inc << 6 movd m7, [tlq] ; top[max_base_x] shl r3d, 6 movd m4, r3d pshufb m5, [base+pw_256] mov r5d, dxd ; xpos pshufb m7, [base+pw_m256] sub r5, r3 pshufb m4, [base+pw_256] mova m3, [base+z1_shuf_w4] paddw m6, m5, m5 psubw m4, m0 ; max_base_x punpcklqdq m5, m6 ; xpos0 xpos1 .w4_loop: lea r3, [r5+dxq] sar r5, 6 ; base0 movq m0, [tlq+r5] lea r5, [r3+dxq] sar r3, 6 ; base1 movhps m0, [tlq+r3] pand m2, m8, m5 ; frac psubw m1, m9, m2 ; 64-frac psllw m2, 8 pshufb m0, m3 por m1, m2 ; 64-frac, frac pmaddubsw m0, m1 movifnidn strideq, stridemp pcmpgtw m1, m4, m5 ; base < max_base_x pmulhrsw m0, m10 paddw m5, m6 ; xpos += dx pand m0, m1 pandn m1, m7 por m0, m1 packuswb m0, m0 movd [dstq+strideq*0], m0 pshuflw m0, m0, q1032 movd [dstq+strideq*1], m0 sub hd, 2 jz .w4_end lea dstq, [dstq+strideq*2] test r5d, r5d jl .w4_loop packuswb m7, m7 .w4_end_loop: movd [dstq+strideq*0], m7 movd 
[dstq+strideq*1], m7 lea dstq, [dstq+strideq*2] sub hd, 2 jg .w4_end_loop .w4_end: RET .w8: lea r3d, [angleq+88] and r3d, ~0x7f or r3d, hd cmp r3d, 8 ja .w8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || h > 8 mova m5, [base+z_upsample1] movu m3, [base+z_filter_s+6] movd m4, hd mova m0, [tlq-1] movu m1, [tlq+7] pxor m7, m7 pshufb m4, m7 movddup m7, [base+pb_36_m4] pminub m4, m3 add dxd, dxd pshufb m2, m0, m5 pmaddubsw m2, m7 pshufb m0, m3 pmaddubsw m0, m7 movd m6, dxd pshufb m3, m1, m5 pmaddubsw m3, m7 pshufb m1, m4 pmaddubsw m1, m7 pshufb m6, [base+pw_256] mov r5d, dxd paddw m2, m0 paddw m7, m6, m6 paddw m3, m1 punpcklqdq m6, m7 ; xpos0 xpos1 movu m1, [tlq] pmulhrsw m2, m10 pmulhrsw m3, m10 packuswb m2, m3 punpcklbw m0, m1, m2 punpckhbw m1, m2 movifnidn strideq, stridemp mova [rsp+16*0], m0 mova [rsp+16*1], m1 .w8_upsample_loop: lea r2d, [r5+dxq] shr r5d, 6 ; base0 movu m0, [rsp+r5] lea r5d, [r2+dxq] shr r2d, 6 ; base1 movu m1, [rsp+r2] pand m2, m8, m6 psubw m3, m9, m2 psllw m2, 8 por m3, m2 punpcklqdq m2, m3, m3 ; frac0 pmaddubsw m0, m2 punpckhqdq m3, m3 ; frac1 pmaddubsw m1, m3 paddw m6, m7 pmulhrsw m0, m10 pmulhrsw m1, m10 packuswb m0, m1 movq [dstq+strideq*0], m0 movhps [dstq+strideq*1], m0 lea dstq, [dstq+strideq*2] sub hd, 2 jg .w8_upsample_loop RET .w8_no_upsample: lea r3d, [hq+7] movd m0, r3d and r3d, 7 or r3d, 8 ; imin(h+7, 15) test angled, 0x400 jnz .w8_main movd m2, angled shr angled, 8 ; is_sm << 1 pxor m1, m1 pshufb m0, m1 pshufb m2, m1 movu m1, [base+z_filter_wh8] psrldq m3, [base+z_filter_t_w48+angleq*8], 4 pcmpeqb m1, m0 pand m1, m2 pcmpgtb m1, m3 pmovmskb r5d, m1 test r5d, r5d jz .w8_main ; filter_strength == 0 movd m3, [tlq-1] movu m0, [tlq+16*0] imul r5d, 0x55555555 movu m1, [tlq+16*1] shr r5d, 30 ; filter_strength movd m2, [tlq+r3] lea tlq, [rsp+16*4] sub r5, 3 mova [tlq-16*1], m0 pxor m7, m7 mova [tlq+16*0], m1 pshufb m3, m7 pshufb m2, m7 mova [tlq-16*2], m3 movq [tlq+r3-15], m2 call .filter_edge sar r5d, 1 add r5d, 17 cmp hd, 8 cmova r3d, r5d .w8_main: add tlq, r3 movd m5, dxd movd m7, [tlq] shl r3d, 6 movu m3, [base+z_filter_s+2] movd m4, r3d pshufb m5, [base+pw_256] mov r5d, dxd pshufb m7, [base+pw_m256] sub r5, r3 pshufb m4, [base+pw_256] psubw m4, [base+z_base_inc] mova m6, m5 .w8_loop: mov r3, r5 sar r3, 6 movu m0, [tlq+r3] pand m1, m8, m5 psubw m2, m9, m1 psllw m1, 8 pshufb m0, m3 por m1, m2 pmaddubsw m0, m1 pcmpgtw m1, m4, m5 paddw m5, m6 pmulhrsw m0, m10 pand m0, m1 pandn m1, m7 por m0, m1 packuswb m0, m0 movq [dstq], m0 dec hd jz .w8_end movifnidn strideq, stridemp add dstq, strideq add r5, dxq jl .w8_loop packuswb m7, m7 .w8_end_loop: movq [dstq], m7 add dstq, strideq dec hd jg .w8_end_loop .w8_end: RET .w16: lea r3d, [hq+15] movd m0, r3d and r3d, 15 or r3d, 16 ; imin(h+15, 31) test angled, 0x400 jnz .w16_main movd m2, angled shr angled, 8 ; is_sm << 1 pxor m1, m1 pshufb m0, m1 pshufb m2, m1 movq m3, [base+z_filter_t_w16+angleq*4] pcmpeqb m0, [base+z_filter_wh16] pand m0, m2 pcmpgtb m0, m3 pmovmskb r5d, m0 test r5d, r5d jz .w16_main ; filter_strength == 0 movd m4, [tlq-1] movu m0, [tlq+16*0] imul r5d, 0x24924924 movu m1, [tlq+16*1] shr r5d, 30 movd m2, [tlq+30] adc r5, -4 ; filter_strength-3 movd m3, [tlq+r3] lea tlq, [rsp+16*4] mova [tlq-16*1], m0 pxor m7, m7 mova [tlq+16*0], m1 pshufb m4, m7 movd [rsp], m2 pshufb m3, m7 mova [tlq-16*2], m4 movd [tlq+r3-16], m3 call .filter_edge cmp hd, 16 jle .w16_main pshuflw m0, [rsp], q0000 sar r5, 1 movd m1, [base+z_filter_k_tail+4+r5*4] lea r3d, [r5+33] pmaddubsw m0, m1 %if ARCH_X86_64 pmulhrsw m0, 
m10 %else pmulhrsw m0, m4 %endif packuswb m0, m0 movd [tlq+32], m0 .w16_main: add tlq, r3 movd m5, dxd movd m7, [tlq] movd m4, r3d shl r3d, 6 pshufb m5, [base+pw_256] pxor m6, m6 pshufb m7, m6 mov r5d, dxd pshufb m4, m6 sub r5, r3 psubb m4, [base+pb_0to15] mova m6, m5 .w16_loop: mov r3, r5 sar r3, 6 movu m1, [tlq+r3+0] pand m0, m8, m5 movu m2, [tlq+r3+1] psubw m3, m9, m0 psllw m0, 8 por m3, m0 punpcklbw m0, m1, m2 pmaddubsw m0, m3 punpckhbw m1, m2 pmaddubsw m1, m3 psrlw m3, m5, 6 packsswb m3, m3 pmulhrsw m0, m10 pmulhrsw m1, m10 paddw m5, m6 pcmpgtb m2, m4, m3 packuswb m0, m1 pand m0, m2 pandn m2, m7 por m0, m2 mova [dstq], m0 dec hd jz .w16_end movifnidn strideq, stridemp add dstq, strideq add r5, dxq jl .w16_loop .w16_end_loop: mova [dstq], m7 add dstq, strideq dec hd jg .w16_end_loop .w16_end: RET .w32: lea r3d, [hq+31] and r3d, 31 or r3d, 32 ; imin(h+31, 63) test angled, 0x400 ; !enable_intra_edge_filter jnz .w32_main movd m6, [tlq-1] movu m0, [tlq+16*0] movu m1, [tlq+16*1] movu m2, [tlq+16*2] movu m3, [tlq+16*3] movd m4, [tlq+62] movd m5, [tlq+r3] lea tlq, [rsp+16*6] mova [tlq-16*3], m0 pxor m7, m7 mova [tlq-16*2], m1 pshufb m6, m7 mova [tlq-16*1], m2 xor r5d, r5d ; filter_strength = 3 mova [tlq+16*0], m3 movd [rsp], m4 pshufb m5, m7 mova [tlq-16*4], m6 movd [tlq+r3-48], m5 call .filter_edge sub tlq, 16*2 call .filter_edge cmp hd, 32 jle .w32_main pshuflw m0, [rsp], q0000 movd m1, [base+z_filter_k_tail+4] add r3d, 2 pmaddubsw m0, m1 %if ARCH_X86_64 pmulhrsw m0, m10 %else pmulhrsw m0, m4 %endif packuswb m0, m0 movd [tlq+64], m0 .w32_main: add tlq, r3 movd m0, r3d movd m7, [tlq] shl r3d, 6 movd m5, dxd pxor m6, m6 mov r5d, dxd pshufb m0, m6 pshufb m5, [base+pw_256] sub r5, r3 pshufb m7, m6 psubb m0, [base+pb_0to15] movddup m1, [base+pb_m16] mova [rsp+16*0], m0 paddb m0, m1 mova [rsp+16*1], m0 mova m6, m5 .w32_loop: mov r3, r5 sar r3, 6 movu m1, [tlq+r3+16*0+0] pand m0, m8, m5 movu m2, [tlq+r3+16*0+1] psubw m3, m9, m0 psllw m0, 8 por m3, m0 punpcklbw m0, m1, m2 pmaddubsw m0, m3 punpckhbw m1, m2 pmaddubsw m1, m3 psrlw m4, m5, 6 pmulhrsw m0, m10 pmulhrsw m1, m10 packsswb m4, m4 pcmpgtb m2, [rsp+16*0], m4 packuswb m0, m1 pand m0, m2 pandn m2, m7 por m0, m2 movu m1, [tlq+r3+16*1+0] movu m2, [tlq+r3+16*1+1] mova [dstq+16*0], m0 punpcklbw m0, m1, m2 pmaddubsw m0, m3 punpckhbw m1, m2 pmaddubsw m1, m3 paddw m5, m6 pmulhrsw m0, m10 pmulhrsw m1, m10 pcmpgtb m2, [rsp+16*1], m4 packuswb m0, m1 pand m0, m2 pandn m2, m7 por m0, m2 mova [dstq+16*1], m0 dec hd jz .w32_end movifnidn strideq, stridemp add dstq, strideq add r5, dxq jl .w32_loop .w32_end_loop: mova [dstq+16*0], m7 mova [dstq+16*1], m7 add dstq, strideq dec hd jg .w32_end_loop .w32_end: RET .w64: lea r3d, [hq+63] test angled, 0x400 ; !enable_intra_edge_filter jnz .w64_main movd m4, [tlq-1] movu m0, [tlq+16*0] movu m1, [tlq+16*1] movu m2, [tlq+16*2] movu m3, [tlq+16*3] mova [rsp+16*3], m0 pxor m7, m7 mova [rsp+16*4], m1 pshufb m4, m7 mova [rsp+16*5], m2 mova [rsp+16*6], m3 mova [rsp+16*2], m4 movu m0, [tlq+16*4] movu m1, [tlq+16*5] movu m2, [tlq+16*6] movu m3, [tlq+16*7] movd m4, [tlq+r3] lea tlq, [rsp+16*10] mova [tlq-16*3], m0 xor r5d, r5d ; filter_strength = 3 mova [tlq-16*2], m1 pshufb m4, m7 mova [tlq-16*1], m2 mova [tlq+16*0], m3 movd [tlq+r3-16*7], m4 cmp hd, 64 jl .w64_filter96 ; skip one call if the last 32 bytes aren't used call .filter_edge .w64_filter96: sub tlq, 16*2 call .filter_edge sub tlq, 16*2 call .filter_edge sub tlq, 16*2 call .filter_edge .w64_main: add tlq, r3 movd m0, r3d movd m7, [tlq] shl r3d, 6 movd m5, dxd pxor m6, 
m6 mov r5d, dxd pshufb m0, m6 sub r5, r3 pshufb m5, [base+pw_256] pshufb m7, m6 psubb m0, [base+pb_0to15] movddup m1, [base+pb_m16] mova [rsp+16*0], m0 paddb m0, m1 mova [rsp+16*1], m0 paddb m0, m1 mova [rsp+16*2], m0 paddb m0, m1 mova [rsp+16*3], m0 mova m6, m5 .w64_loop: mov r3, r5 sar r3, 6 movu m1, [tlq+r3+16*0+0] pand m0, m8, m5 movu m2, [tlq+r3+16*0+1] psubw m3, m9, m0 psllw m0, 8 por m3, m0 punpcklbw m0, m1, m2 pmaddubsw m0, m3 punpckhbw m1, m2 pmaddubsw m1, m3 psrlw m4, m5, 6 pmulhrsw m0, m10 pmulhrsw m1, m10 packsswb m4, m4 pcmpgtb m2, [rsp+16*0], m4 packuswb m0, m1 pand m0, m2 pandn m2, m7 por m0, m2 movu m1, [tlq+r3+16*1+0] movu m2, [tlq+r3+16*1+1] mova [dstq+16*0], m0 punpcklbw m0, m1, m2 pmaddubsw m0, m3 punpckhbw m1, m2 pmaddubsw m1, m3 pmulhrsw m0, m10 pmulhrsw m1, m10 pcmpgtb m2, [rsp+16*1], m4 packuswb m0, m1 pand m0, m2 pandn m2, m7 por m0, m2 movu m1, [tlq+r3+16*2+0] movu m2, [tlq+r3+16*2+1] mova [dstq+16*1], m0 punpcklbw m0, m1, m2 pmaddubsw m0, m3 punpckhbw m1, m2 pmaddubsw m1, m3 pmulhrsw m0, m10 pmulhrsw m1, m10 pcmpgtb m2, [rsp+16*2], m4 packuswb m0, m1 pand m0, m2 pandn m2, m7 por m0, m2 movu m1, [tlq+r3+16*3+0] movu m2, [tlq+r3+16*3+1] mova [dstq+16*2], m0 punpcklbw m0, m1, m2 pmaddubsw m0, m3 punpckhbw m1, m2 pmaddubsw m1, m3 paddw m5, m6 pmulhrsw m0, m10 pmulhrsw m1, m10 pcmpgtb m2, [rsp+16*3], m4 packuswb m0, m1 pand m0, m2 pandn m2, m7 por m0, m2 mova [dstq+16*3], m0 dec hd jz .w64_end movifnidn strideq, stridemp add dstq, strideq add r5, dxq jl .w64_loop .w64_end_loop: mova [dstq+16*0], m7 mova [dstq+16*1], m7 mova [dstq+16*2], m7 mova [dstq+16*3], m7 add dstq, strideq dec hd jg .w64_end_loop .w64_end: RET ALIGN function_align .filter_edge: ; 32 pixels/iteration movddup m7, [base+z_filter_k+8*2+r5*8+24*0] movu m2, [tlq-18] movu m1, [tlq-17] movu m3, [tlq- 2] movu m4, [tlq- 1] punpcklbw m0, m2, m1 pmaddubsw m0, m7 punpckhbw m2, m1 pmaddubsw m2, m7 punpcklbw m1, m3, m4 pmaddubsw m1, m7 punpckhbw m3, m4 pmaddubsw m3, m7 movddup m7, [base+z_filter_k+8*2+r5*8+24*1] mova m5, [tlq-16] movu m6, [tlq-15] punpcklbw m4, m5, m6 pmaddubsw m4, m7 punpckhbw m5, m6 pmaddubsw m5, m7 paddw m0, m4 paddw m2, m5 mova m5, [tlq+ 0] movu m6, [tlq+ 1] punpcklbw m4, m5, m6 pmaddubsw m4, m7 punpckhbw m5, m6 pmaddubsw m5, m7 paddw m1, m4 paddw m3, m5 test r5d, r5d jnz .filter_end ; 3-tap movddup m7, [base+z_filter_k+8*8] movu m5, [tlq-14] movu m6, [tlq+ 2] punpcklbw m4, m5, m5 pmaddubsw m4, m7 punpckhbw m5, m5 pmaddubsw m5, m7 paddw m0, m4 paddw m2, m5 punpcklbw m5, m6, m6 pmaddubsw m5, m7 punpckhbw m6, m6 pmaddubsw m6, m7 paddw m1, m5 paddw m3, m6 .filter_end: %if ARCH_X86_64 REPX {pmulhrsw x, m10}, m0, m2, m1, m3 %else mova m4, m10 REPX {pmulhrsw x, m4 }, m0, m2, m1, m3 %endif packuswb m0, m2 packuswb m1, m3 mova [tlq+16*0], m0 mova [tlq+16*1], m1 ret %if ARCH_X86_64 cglobal ipred_z2_8bpc, 4, 12, 13, 16*16, dst, stride, tl, w, h, angle, dx, _, dy %define base r7-$$ %define maxwm r6m %define maxhm r7m lea r7, [$$] mov hd, hm mova m8, [base+pw_62] mova m9, [base+pw_64] lea r9d, [wq-4] mova m10, [base+pw_512] shl r9d, 6 mova m11, [base+z1_shuf_w4] or r9d, hd mova m12, [base+z2_h_shuf] %else cglobal ipred_z2_8bpc, 4, 7, 8, -16*20, dst, _, tl, w, h, angle, dx %define base r1-$$ %define m8 [base+pw_62] %define m9 [base+pw_64] %define m10 [base+pw_512] %define m11 [rsp+16*16] %define m12 [rsp+16*17] %define r9b byte [rsp+16*18+4*0] %define r9d dword [rsp+16*18+4*0] %define r10d dword [rsp+16*18+4*1] %define r11d dword [rsp+16*18+4*2] %define maxwm [rsp+16*18+4*3] %define maxhm 
[rsp+16*19+4*0] %define stridemp [rsp+16*19+4*1] %define strideq r3 %define dyd r4 %define dyq r4 mov stridemp, r1 mov r1d, r6m mov r4d, r7m mov maxwm, r1d mov maxhm, r4d LEA r1, $$ lea hd, [wq-4] mova m0, [base+z1_shuf_w4] shl hd, 6 mova m1, [base+z2_h_shuf] or hd, hm mova m11, m0 mov r9d, hd mova m12, m1 %endif tzcnt wd, wd movifnidn angled, anglem movsxd wq, [base+ipred_z2_ssse3_table+wq*4] %if ARCH_X86_64 movzx dxd, angleb %else movzx dxd, byte anglem %endif xor angled, 0x400 mova m0, [tlq-16*4] mov dyd, dxd mova m1, [tlq-16*3] neg dxq mova m2, [tlq-16*2] and dyd, ~1 mova m3, [tlq-16*1] and dxq, ~1 movd m4, [tlq] movu m5, [tlq+16*0+1] movu m6, [tlq+16*1+1] movzx dyd, word [base+dr_intra_derivative+dyq-90] ; angle - 90 movzx dxd, word [base+dr_intra_derivative+dxq+180] ; 180 - angle mova [rsp+16*2], m0 pxor m7, m7 mova [rsp+16*3], m1 pshufb m4, m7 mova [rsp+16*4], m2 lea wq, [base+ipred_z2_ssse3_table+wq] mova [rsp+16*5], m3 neg dxd mova [rsp+16*6], m4 or dyd, 4<<16 mova [rsp+16*7], m4 mova [rsp+16*8], m5 mova [rsp+16*9], m6 movq m0, [base+z_base_inc+2] movsldup m1, [base+z2_dy_offset] movq m2, [base+pw_256] ; 4<<6 movq [rsp+16*14+8*0], m0 movq [rsp+16*15+8*0], m1 movq [rsp+16*15+8*1], m2 %if ARCH_X86_64 lea r10d, [dxq+(128<<6)] ; xpos %else mov [rsp+16*7+4*1], dyd lea r4d, [dxq+(128<<6)] mov r10d, r4d movzx hd, r9b %endif mov r11d, (128-4)<<6 jmp wq .w4: test angled, 0x400 jnz .w4_main movd m5, [tlq+4] lea r3d, [hq+2] add angled, 1022 pshufb m5, m7 shl r3d, 6 movd [rsp+16*8+4], m5 test r3d, angled jnz .w4_no_upsample_above ; angle >= 130 || h > 8 || (is_sm && h == 8) call .upsample_above sub angled, 1075 ; angle - 53 lea r3d, [hq+3] xor angled, 0x7f ; 180 - angle movd m0, r3d movd m6, angled shr angled, 8 ; is_sm << 1 pshufb m0, m7 pshufb m6, m7 pcmpeqb m0, [base+z_filter_wh4] pand m6, m0 pcmpgtb m6, [base+z_filter_t_w48+angleq*8] jmp .w8_filter_left .upsample_above: ; w4/w8 movq m3, [rsp+gprsize+16*8-2] movq m1, [rsp+gprsize+16*8-1] movq m0, [rsp+gprsize+16*8+0] movq m4, [rsp+gprsize+16*8+1] movddup m5, [base+pb_36_m4] punpcklbw m1, m3 punpcklbw m2, m0, m4 pmaddubsw m1, m5 pmaddubsw m2, m5 %if ARCH_X86_64 mova m11, [base+pb_0to15] lea r10d, [r10+dxq+(1<<6)] mov r11d, (128-7)<<6 %else mova m3, [base+pb_0to15] mov r3d, [rsp+gprsize+16*18+4*1] mov dword [rsp+gprsize+16*18+4*2], (128-7)<<6 lea r3d, [r3+dxq+(1<<6)] mov [rsp+gprsize+16*18+4*1], r3d mova [rsp+gprsize+16*16], m3 %endif add dxd, dxd paddw m1, m2 pmulhrsw m1, m10 movq m2, [rsp+gprsize+16*14] paddw m2, m2 movq [rsp+gprsize+16*14], m2 packuswb m1, m1 punpcklbw m1, m0 mova [rsp+gprsize+16*8], m1 ret .w4_no_upsample_above: lea r3d, [hq+3] mov [rsp], angled sub angled, 1112 ; angle - 90 movd m0, r3d mov r3d, 90 movd m1, angled sub r3d, angled ; 180 - angle shr angled, 8 ; is_sm << 1 movu m3, [base+z_filter_wh4] mova m4, [base+z_filter_t_w48+angleq*8] call .w8_filter_top mov angled, [rsp] lea r3d, [hq+2] sub angled, 139 shl r3d, 6 test r3d, angled jnz .w8_filter_left ; angle <= 140 || h > 8 || (is_sm && h == 8) .upsample_left: ; w4/w8 neg hq movd m0, [tlq+hq] pshufb m0, m7 movd [rsp+16*6+hq-4], m0 movq m3, [rsp+16*5+7] movq m0, [rsp+16*5+8] movq m2, [rsp+16*5+9] movq m4, [rsp+16*5+10] movddup m5, [base+pb_36_m4] punpcklbw m1, m0, m3 punpcklbw m2, m4 pmaddubsw m1, m5 pmaddubsw m2, m5 movshdup m3, [base+z2_dy_offset] %if ARCH_X86_64 mova m12, [base+z2_upsample] add dyd, dyd %else mova m4, [base+z2_upsample] shl dword [rsp+16*7+4*1], 1 mova m12, m4 %endif paddw m1, m2 pmulhrsw m1, m10 movq [rsp+16*15], m3 packuswb m1, m1 punpcklbw m0, 
m1 mova [rsp+16*5], m0 .w4_main: movd m6, dxd %if ARCH_X86_64 movd m3, dyd %else movd m3, [rsp+16*7+4*1] %endif movddup m0, [rsp+16*14+8*0] pshufb m6, [base+pw_256] paddw m7, m6, m6 movq m5, [base+pw_m1to4] pshuflw m4, m3, q0000 punpcklqdq m6, m7 pmullw m4, m5 pshuflw m3, m3, q1111 paddw m6, m0 mov r2d, r10d pshuflw m0, m4, q3333 psubw m4, [rsp+16*15] movq [rsp+16*6+8*1], m3 movq [rsp+8*1], m0 ; dy*4 mov r5, dstq .w4_loop0: mova [rsp+16*12], m6 movq [rsp+8*0], m4 pand m0, m4, m8 psraw m4, 6 psubw m1, m9, m0 psllw m0, 8 por m0, m1 ; 64-frac_y, frac_y movq [rsp+8*3], m0 pabsw m4, m4 movq [rsp+8*2], m4 movzx hd, r9b .w4_loop: lea r3d, [r2+dxq] shr r2d, 6 ; base_x0 movq m0, [rsp+r2] lea r2d, [r3+dxq] shr r3d, 6 ; base_x1 movhps m0, [rsp+r3] lea r3d, [r2+dxq] shr r2d, 6 ; base_x2 movq m1, [rsp+r2] lea r2d, [r3+dxq] shr r3d, 6 ; base_x3 movhps m1, [rsp+r3] pand m2, m8, m6 paddsw m5, m6, m7 psubw m3, m9, m2 psllw m2, 8 pshufb m0, m11 por m2, m3 pmaddubsw m0, m2 pand m2, m8, m5 psubw m3, m9, m2 psllw m2, 8 pshufb m1, m11 por m2, m3 pmaddubsw m1, m2 cmp r3d, 127 ; topleft jge .w4_toponly movzx r3d, byte [rsp+8*2+0] ; base_y0 movq m3, [rsp+r3] movzx r3d, byte [rsp+8*2+2] ; base_y1 movhps m3, [rsp+r3] movzx r3d, byte [rsp+8*2+4] ; base_y2 movq m4, [rsp+r3] movzx r3d, byte [rsp+8*2+6] ; base_y3 movhps m4, [rsp+r3] pshufb m3, m12 pshufb m4, m12 punpckldq m2, m3, m4 punpckhdq m3, m4 movddup m4, [rsp+8*3] pmaddubsw m2, m4 pmaddubsw m3, m4 psraw m6, 15 ; base_x < topleft pand m2, m6 pandn m6, m0 por m0, m2, m6 psraw m6, m5, 15 pand m3, m6 pandn m6, m1 por m1, m3, m6 .w4_toponly: pmulhrsw m0, m10 pmulhrsw m1, m10 movifnidn strideq, stridemp packuswb m0, m1 movd [dstq+strideq*0], m0 pshuflw m1, m0, q1032 movd [dstq+strideq*1], m1 lea dstq, [dstq+strideq*2] punpckhqdq m0, m0 movd [dstq+strideq*0], m0 psrlq m0, 32 movd [dstq+strideq*1], m0 sub hd, 4 jz .w4_end movq m4, [rsp+8*2] movq m3, [rsp+16*6+8*1] paddw m6, m5, m7 ; xpos += dx psubw m4, m3 movq [rsp+8*2], m4 lea dstq, [dstq+strideq*2] cmp r2d, r11d jge .w4_loop movddup m5, [rsp+8*3] .w4_leftonly_loop: movzx r2d, byte [rsp+8*2+0] ; base_y0 movq m1, [rsp+r2] movzx r2d, byte [rsp+8*2+2] ; base_y1 movhps m1, [rsp+r2] movzx r2d, byte [rsp+8*2+4] ; base_y2 movq m2, [rsp+r2] movzx r2d, byte [rsp+8*2+6] ; base_y3 movhps m2, [rsp+r2] psubw m4, m3 pshufb m1, m12 pshufb m2, m12 movq [rsp+8*2], m4 punpckldq m0, m1, m2 punpckhdq m1, m2 pmaddubsw m0, m5 pmaddubsw m1, m5 pmulhrsw m0, m10 pmulhrsw m1, m10 packuswb m0, m1 movd [dstq+strideq*0], m0 pshuflw m1, m0, q1032 movd [dstq+strideq*1], m1 lea dstq, [dstq+strideq*2] punpckhqdq m0, m0 movd [dstq+strideq*0], m0 psrlq m0, 32 movd [dstq+strideq*1], m0 lea dstq, [dstq+strideq*2] sub hd, 4 jg .w4_leftonly_loop .w4_end: sub r9d, 1<<8 jl .w4_ret movq m4, [rsp+8*1] add r5, 4 mov dstq, r5 paddw m4, [rsp+8*0] ; base_y += 4*dy movzx r2d, word [rsp+16*15+8*1] movddup m6, [rsp+16*15+8*1] paddw m6, [rsp+16*12] ; base_x += (4 << upsample_above) add r2d, r10d mov r10d, r2d jmp .w4_loop0 .w4_ret: RET .w8: test angled, 0x400 jnz .w4_main movd m5, [tlq+8] lea r3d, [angleq+126] pshufb m5, m7 %if ARCH_X86_64 mov r3b, hb %else xor r3b, r3b or r3d, hd %endif movd [rsp+16*8+8], m5 cmp r3d, 8 ja .w8_no_upsample_above ; angle >= 130 || h > 8 || is_sm call .upsample_above sub angled, 53 lea r3d, [hq+7] xor angled, 0x7f ; 180 - angle movu m1, [base+z_filter_wh8] movd m0, r3d movd m6, angled shr angled, 8 ; is_sm << 1 psrldq m2, [base+z_filter_t_w48+angleq*8], 4 pshufb m0, m7 pshufb m6, m7 pcmpeqb m0, m1 pand m6, m0 pcmpgtb m6, m2 %if 
ARCH_X86_64 movq [rsp+16*15+8*1], m10 ; 8<<6 %else movq m0, m10 movq [rsp+16*15+8*1], m0 %endif jmp .w8_filter_left .w8_no_upsample_above: lea r3d, [hq+7] mov [rsp], angled sub angled, 90 movd m0, r3d mov r3d, 90 movd m1, angled sub r3d, angled ; 180 - angle shr angled, 8 ; is_sm << 1 movu m3, [base+z_filter_wh8] psrldq m4, [base+z_filter_t_w48+angleq*8], 4 call .w8_filter_top mov r3d, [rsp] sub r3d, 141 %if ARCH_X86_64 mov r3b, hb %else xor r3b, r3b or r3d, hd %endif cmp r3d, 8 jbe .upsample_left ; angle > 140 && h <= 8 && !is_sm .w8_filter_left: pmovmskb r5d, m6 test r5d, r5d jz .w4_main imul r5d, 0x55555555 mov r3, tlq shr r5d, 30 sub r5, 3 ; filter_strength-3 jmp .filter_left .w8_filter_top: movd m6, r3d REPX {pshufb x, m7}, m0, m1, m6 pcmpeqb m0, m3 pand m1, m0 pand m6, m0 pcmpgtb m1, m4 pcmpgtb m6, m4 pmovmskb r5d, m1 test r5d, r5d jz .w8_filter_top_end ; filter_strength == 0 imul r5d, 0x55555555 movq m0, [rsp+gprsize+16*8-2] shr r5d, 30 movq m1, [rsp+gprsize+16*8-1] sub r5, 3 ; filter_strength-3 movddup m7, [base+z_filter_k+8*2+r5*8+24*0] punpcklbw m0, m1 pmaddubsw m0, m7 movq m1, [rsp+gprsize+16*8+0] movq m2, [rsp+gprsize+16*8+1] movddup m7, [base+z_filter_k+8*2+r5*8+24*1] punpcklbw m1, m2 pmaddubsw m1, m7 movq m2, [rsp+gprsize+16*8+2] movddup m7, [base+z_filter_k+8*2+r5*8+24*2] punpcklbw m2, m2 pmaddubsw m2, m7 paddw m0, m1 paddw m0, m2 %if ARCH_X86_64 mov r3d, r7m ; maxw, offset due to call %else mov r3d, [rsp+gprsize+16*18+4*3] %endif pmulhrsw m0, m10 pmulhrsw m1, m10 packuswb m0, m1 movq [rsp+gprsize+16*8], m0 cmp r3d, 8 jge .w8_filter_top_end movq m0, [tlq+r3+1] movq [rsp+gprsize+r3+16*8], m0 .w8_filter_top_end: ret .w16: test angled, 0x400 jnz .w4_main lea r3d, [hq+15] sub angled, 90 movd m0, r3d mov r3d, 90 movd m1, angled sub r3d, angled ; 180 - angle shr angled, 8 ; is_sm << 1 movd m6, r3d REPX {pshufb x, m7}, m0, m1, m6 movq m3, [base+z_filter_t_w16+angleq*4] pcmpeqb m0, [base+z_filter_wh16] pand m1, m0 pand m6, m0 pcmpgtb m1, m3 pcmpgtb m6, m3 pmovmskb r5d, m1 mov r3, tlq test r5d, r5d jz .w16_filter_left ; filter_strength == 0 imul r5d, 0x24924924 pshufb m5, [base+z_filter_t_w16] ; tlq[16] shr r5d, 30 adc r5, -4 ; filter_strength-3 movd [rsp+16*9], m5 movddup m7, [base+z_filter_k+8*2+r5*8+24*0] movu m1, [rsp+16*8-2] movu m2, [rsp+16*8-1] punpcklbw m0, m1, m2 pmaddubsw m0, m7 punpckhbw m1, m2 pmaddubsw m1, m7 movddup m7, [base+z_filter_k+8*2+r5*8+24*1] mova m3, [rsp+16*8+0] movu m4, [rsp+16*8+1] punpcklbw m2, m3, m4 pmaddubsw m2, m7 punpckhbw m3, m4 pmaddubsw m3, m7 paddw m0, m2 paddw m1, m3 test r5d, r5d jnz .w16_filter_end ; 3-tap movddup m7, [base+z_filter_k+8*8] movu m3, [rsp+16*8+2] punpcklbw m2, m3, m3 pmaddubsw m2, m7 punpckhbw m3, m3 pmaddubsw m3, m7 paddw m0, m2 paddw m1, m3 .w16_filter_end: mov r2d, maxwm pmulhrsw m0, m10 pmulhrsw m1, m10 packuswb m0, m1 mova [rsp+16*8], m0 cmp r2d, 16 jge .w16_filter_left movu m0, [r3+r2+1] movu [rsp+r2+16*8], m0 .w16_filter_left: pmovmskb r5d, m6 test r5d, r5d jz .w4_main imul r5d, 0x24924924 shr r5d, 30 adc r5, -4 ; filter_strength-3 jmp .filter_left .w32: test angled, 0x400 jnz .w4_main pshufb m6, [base+z_filter_t_w16] ; tlq[32] mov r3, tlq lea tlq, [rsp+16*9] movd [tlq+16*1], m6 xor r5d, r5d ; filter_strength = 3 call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge mova m0, [tlq+16*0] mova m1, [tlq+16*1] mov r2d, maxwm mova [rsp+16*8], m0 mova [rsp+16*9], m1 cmp r2d, 32 jge .filter_left movu m0, [r3+r2+16*0+1] movu m1, [r3+r2+16*1+1] movu [rsp+r2+16*8], m0 movu [rsp+r2+16*9], m1 jmp .filter_left .w64: movu m0, 
[tlq+16*2+1] movu m1, [tlq+16*3+1] mova [rsp+16*10], m0 mova [rsp+16*11], m1 test angled, 0x400 jnz .w4_main pshufb m1, [base+z_filter_t_w16] ; tlq[64] mov r3, tlq lea tlq, [rsp+16*11] movd [tlq+16*1], m1 xor r5d, r5d ; filter_strength = 3 call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge sub tlq, 16*2 call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge mova m0, [tlq+16*0] mova m1, [tlq+16*1] mova m2, [tlq+16*2] mova m3, [tlq+16*3] mov r2d, maxwm mova [rsp+16* 8], m0 mova [rsp+16* 9], m1 mova [rsp+16*10], m2 mova [rsp+16*11], m3 cmp r2d, 64 jge .filter_left movu m0, [r3+r2+16*0+1] movu m1, [r3+r2+16*1+1] movu [rsp+r2+16* 8], m0 movu [rsp+r2+16* 9], m1 cmp r2d, 32 jge .filter_left movu m0, [r3+r2+16*2+1] movu m1, [r3+r2+16*3+1] movu [rsp+r2+16*10], m0 movu [rsp+r2+16*11], m1 .filter_left: neg hq movd m0, [r3+hq] pxor m1, m1 pshufb m0, m1 movd [rsp+16*6+hq-4], m0 lea tlq, [rsp+16*5] call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge cmp hd, -32 jge .filter_left_end sub tlq, 16*2 call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge mova m0, [tlq+16*0] mova m1, [tlq+16*1] mova [rsp+16*2], m0 mova [rsp+16*3], m1 .filter_left_end: mov r2d, maxhm mova m0, [rsp+16*5] mova m1, [rsp+16*6] mova m2, [rsp+16*7] neg r2 mova [rsp+16*4], m0 mova [rsp+16*5], m1 mova [rsp+16*6], m2 cmp r2d, hd jle .w4_main movu m0, [r3+r2-16*2] movu m1, [r3+r2-16*1] movu [rsp+r2+16*4], m0 movu [rsp+r2+16*5], m1 cmp r2d, -32 jle .w4_main movu m0, [r3+r2-16*4] movu m1, [r3+r2-16*3] movu [rsp+r2+16*2], m0 movu [rsp+r2+16*3], m1 jmp .w4_main %if ARCH_X86_64 cglobal ipred_z3_8bpc, 4, 9, 11, 16*10, dst, stride, tl, w, h, angle, dy, _, org_w %define base r7-$$ lea r7, [$$] mova m8, [base+pw_62] mova m9, [base+pw_64] mova m10, [base+pw_512] mov org_wd, wd %else cglobal ipred_z3_8bpc, 4, 7, 8, -16*10, dst, stride, tl, w, h, angle, dy %define base r1-$$ %define m8 [base+pw_62] %define m9 [base+pw_64] %define m10 [base+pw_512] %define org_wd r5 %define org_wq r5 mov [dstq+strideq*0], strideq mov [dstq+strideq*1], wd LEA r1, $$ %endif tzcnt hd, hm movifnidn angled, anglem dec tlq movsxd hq, [base+ipred_z3_ssse3_table+hq*4] sub angled, 180 mov dyd, angled neg dyd xor angled, 0x400 or dyq, ~0x7e lea hq, [base+ipred_z3_ssse3_table+hq] movzx dyd, word [base+dr_intra_derivative+45*2-1+dyq] jmp hq .h4: lea r4d, [angleq+88] test r4d, 0x480 jnz .h4_no_upsample ; !enable_intra_edge_filter || angle >= 40 sar r4d, 9 add r4d, wd cmp r4d, 8 jg .h4_no_upsample ; w > 8 || (w == 8 && is_sm) movu m3, [tlq-7] movu m1, [base+z_upsample1-4] movu m4, [base+z_filter_s+2] pshufb m0, m3, m1 pxor m1, m1 pshufb m2, m3, m1 pshufb m1, m3, m4 mova [rsp+16], m2 ; top[max_base_y] movddup m2, [base+pb_36_m4] add dyd, dyd pmaddubsw m0, m2 pmaddubsw m1, m2 movd m5, dyd mov r5d, dyd pshufb m5, [base+pw_256] paddw m0, m1 pmulhrsw m0, m10 shl wd, 2 mov tlq, rsp sub rsp, wq packuswb m0, m0 punpcklbw m0, m3 paddw m6, m5, m5 punpcklqdq m5, m6 pshufb m0, [base+pb_15to0] mova [tlq], m0 .h4_upsample_loop: lea r4d, [r5+dyq] shr r5d, 6 movq m0, [tlq+r5] lea r5d, [r4+dyq] shr r4d, 6 movhps m0, [tlq+r4] pand m2, m8, m5 psubw m1, m9, m2 psllw m2, 8 por m1, m2 pmaddubsw m0, m1 paddw m5, m6 pmulhrsw m0, m10 packuswb m0, m0 movq [rsp+wq-8], m0 sub wd, 8 jg .h4_upsample_loop jmp .h4_transpose .h4_no_upsample: mov r4d, 7 test angled, 0x400 ; !enable_intra_edge_filter jnz .h4_main lea r4d, [wq+3] movd m0, r4d movd m2, angled shr angled, 8 ; is_sm << 1 pxor m1, m1 pshufb m0, m1 pshufb m2, m1 pcmpeqb m1, m0, [base+z_filter_wh4] pand m1, m2 
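; note: the pcmpgtb/pmovmskb just below test the size/angle thresholds for this block;
; a zero mask means the edge is left unfiltered (jump to .h4_main), otherwise the
; following imul/shr pair reduces the mask to filter_strength, as the inline
; "; filter_strength" comments indicate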
pcmpgtb m1, [base+z_filter_t_w48+angleq*8] pmovmskb r5d, m1 mov r4d, 7 test r5d, r5d jz .h4_main ; filter_strength == 0 movu m2, [tlq-7] imul r5d, 0x55555555 movu m3, [base+z_filter_s-2] shr r5d, 30 ; filter_strength mova m4, [base+z_upsample2] movddup m5, [base+z_filter_k-8+r5*8+24*0] movddup m6, [base+z_filter_k-8+r5*8+24*1] movddup m7, [base+z_filter_k-8+r5*8+24*2] pshufb m0, m2, m3 shufps m3, m4, q2121 pmaddubsw m1, m0, m5 pmaddubsw m0, m6 pshufb m5, m2, m3 pmaddubsw m3, m5, m6 pmaddubsw m5, m7 pshufb m2, m4 pmaddubsw m2, m7 paddw m0, m1 paddw m1, m3 paddw m0, m5 paddw m1, m2 pmulhrsw m0, m10 pmulhrsw m1, m10 lea r2d, [r4+2] cmp wd, 4 cmovne r4d, r2d pshufd m0, m0, q0000 lea tlq, [rsp+15] packuswb m0, m1 mova [rsp], m0 .h4_main: movd m5, dyd movddup m0, [base+z_base_inc] ; base_inc << 6 sub tlq, r4 shl r4d, 6 movd m7, [tlq] movd m4, r4d pshufb m5, [base+pw_256] neg dyq pshufb m7, [base+pw_m256] mova m3, [base+z3_shuf_h4] lea r5, [dyq+r4+63] ; ypos pshufb m4, [base+pw_256] psubw m4, m0 ; max_base_y shl wd, 2 paddw m6, m5, m5 sub rsp, wq punpcklqdq m5, m6 .h4_loop: lea r4, [r5+dyq] sar r5, 6 movq m0, [tlq+r5-4] lea r5, [r4+dyq] sar r4, 6 movhps m0, [tlq+r4-4] pand m2, m8, m5 psubw m1, m9, m2 psllw m2, 8 pshufb m0, m3 por m1, m2 pmaddubsw m0, m1 pcmpgtw m1, m4, m5 paddw m5, m6 pmulhrsw m0, m10 pand m0, m1 pandn m1, m7 por m0, m1 packuswb m0, m0 movq [rsp+wq-8], m0 sub wd, 8 jz .h4_transpose test r5d, r5d jg .h4_loop packuswb m7, m7 .h4_end_loop: movq [rsp+wq-8], m7 sub wd, 8 jg .h4_end_loop .h4_transpose: mova m1, [base+z_transpose4] %if ARCH_X86_32 mov strideq, [dstq] mov org_wd, [dstq+strideq] %endif lea r2, [strideq*3] lea dstq, [dstq+org_wq-4] .h4_transpose_loop: mova m0, [rsp] add rsp, 16 pshufb m0, m1 movd [dstq+strideq*0], m0 pshuflw m2, m0, q1032 movd [dstq+strideq*1], m2 punpckhqdq m0, m0 movd [dstq+strideq*2], m0 psrlq m0, 32 movd [dstq+r2 ], m0 sub dstq, 4 sub org_wd, 4 jg .h4_transpose_loop RET .h8: lea r4d, [angleq+88] and r4d, ~0x7f or r4d, wd cmp r4d, 8 ja .h8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || w > 8 mova m4, [tlq-15] and r4d, 4 movu m3, [tlq- 9] movd m1, r4d movu m2, [base+z_filter_s+2] pxor m0, m0 movu m5, [base+z_filter_s+6] movddup m7, [base+pb_36_m4] pshufb m1, m0 ; w & 4 movu m0, [base+z_upsample1-4] pmaxub m1, m0 ; clip 4x8 add dyd, dyd pshufb m0, m4, m1 pmaddubsw m0, m7 pshufb m1, m4, m2 pmaddubsw m1, m7 pshufb m2, m3, [base+z_upsample1] pmaddubsw m2, m7 pshufb m3, m5 pmaddubsw m3, m7 movd m5, dyd neg dyq paddw m1, m0 paddw m2, m3 pmulhrsw m1, m10 pmulhrsw m2, m10 shl wd, 3 lea tlq, [rsp+16] pshufb m5, [base+pw_256] sub rsp, wq packuswb m1, m2 lea r5, [dyq+63] punpcklbw m0, m1, m4 punpckhbw m1, m4 mova [tlq-16*1], m0 mova [tlq-16*0], m1 paddw m6, m5, m5 punpcklqdq m5, m6 .h8_upsample_loop: lea r4, [r5+dyq] sar r5, 6 movu m0, [tlq+r5] lea r5, [r4+dyq] sar r4, 6 movu m1, [tlq+r4] pand m3, m8, m5 psubw m2, m9, m3 psllw m2, 8 por m3, m2 pshufd m2, m3, q1010 pmaddubsw m0, m2 punpckhqdq m3, m3 pmaddubsw m1, m3 paddw m5, m6 pmulhrsw m0, m10 pmulhrsw m1, m10 packuswb m1, m0 mova [rsp+wq-16], m1 sub wd, 16 jg .h8_upsample_loop jmp .h8_transpose .h8_no_upsample: lea r4d, [wq+7] movd m0, r4d and r4d, 7 or r4d, 8 ; imin(w+7, 15) test angled, 0x400 jnz .h8_main movd m2, angled shr angled, 8 ; is_sm << 1 pxor m1, m1 pshufb m0, m1 pshufb m2, m1 movu m1, [base+z_filter_wh8] psrldq m3, [base+z_filter_t_w48+angleq*8], 4 pcmpeqb m1, m0 pand m1, m2 pcmpgtb m1, m3 pmovmskb r5d, m1 test r5d, r5d jz .h8_main ; filter_strength == 0 mova m0, [tlq-15] imul r5d, 
0x55555555 movd m1, [tlq+1] neg r4 movd m2, [tlq+r4] shr r5d, 30 pxor m7, m7 lea tlq, [rsp+16*2] sub r5, 3 ; filter_strength-3 mova [tlq+16*0], m0 pshufb m1, m7 mova [tlq+16*1], m1 pshufb m2, m7 movq [tlq+r4+8], m2 neg r4d call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge sar r5d, 1 add tlq, 31 add r5d, 17 cmp wd, 8 cmova r4d, r5d .h8_main: movd m5, dyd sub tlq, r4 shl r4d, 6 movd m7, [tlq] movd m4, r4d pshufb m5, [base+pw_256] neg dyq pshufb m7, [base+pw_m256] mova m3, [base+z3_shuf] lea r5, [dyq+r4+63] pshufb m4, [base+pw_256] psubw m4, [base+z3_base_inc] shl wd, 3 mova m6, m5 sub rsp, wq .h8_loop: mov r4, r5 sar r4, 6 movu m0, [tlq+r4-8] pand m2, m8, m5 psubw m1, m9, m2 psllw m2, 8 pshufb m0, m3 por m1, m2 pmaddubsw m0, m1 pcmpgtw m1, m4, m5 paddw m5, m6 pmulhrsw m0, m10 pand m0, m1 pandn m1, m7 por m0, m1 packuswb m0, m0 movq [rsp+wq-8], m0 sub wd, 8 jz .h8_transpose add r5, dyq jg .h8_loop packuswb m7, m7 .h8_end_loop: movq [rsp+wq-8], m7 sub wd, 8 jg .h8_end_loop .h8_transpose: %if ARCH_X86_32 mov strideq, [dstq] mov org_wd, [dstq+strideq] %endif or r3d, 8 cmp org_wd, 4 %if ARCH_X86_64 jne .end_transpose_main %else jne .end_transpose_loop %endif mova m1, [rsp+16*1] mova m0, [rsp+16*0] lea r2, [strideq*3] add rsp, 16*2 punpcklbw m2, m1, m0 punpckhbw m1, m0 punpckhbw m0, m1, m2 punpcklbw m1, m2 .write_4x8_end: call .write_4x8 RET .write_4x8: movd [dstq+r2 ], m0 pshuflw m4, m0, q1032 movd [dstq+strideq*2], m4 punpckhqdq m0, m0 movd [dstq+strideq*1], m0 psrlq m0, 32 movd [dstq+strideq*0], m0 lea dstq, [dstq+strideq*4] movd [dstq+r2 ], m1 pshuflw m4, m1, q1032 movd [dstq+strideq*2], m4 punpckhqdq m1, m1 movd [dstq+strideq*1], m1 psrlq m1, 32 movd [dstq+strideq*0], m1 ret .h16: lea r4d, [wq+15] movd m0, r4d and r4d, 15 or r4d, 16 ; imin(w+15, 31) test angled, 0x400 jnz .h16_main movd m2, angled shr angled, 8 ; is_sm << 1 pxor m1, m1 pshufb m0, m1 pshufb m2, m1 movq m3, [base+z_filter_t_w16+angleq*4] pcmpeqb m1, m0, [base+z_filter_wh16] pand m1, m2 pcmpgtb m1, m3 pmovmskb r5d, m1 test r5d, r5d jz .h16_main ; filter_strength == 0 mova m0, [tlq-16*2+1] imul r5d, 0x24924924 mova m1, [tlq-16*1+1] neg r4 movd m2, [tlq-16*0+1] shr r5d, 30 movd m3, [tlq+r4] adc r5, -4 ; filter_strength-3 pxor m7, m7 lea tlq, [rsp+16*2] mova [tlq-16*1], m0 pshufb m2, m7 mova [tlq+16*0], m1 pshufb m3, m7 mova [tlq+16*1], m2 movq [tlq+r4+8], m3 neg r4d call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge add tlq, 31 cmp wd, 16 jle .h16_main pshuflw m0, [tlq-47], q0000 sar r5, 1 movq m1, [base+z3_filter_k_tail+r5*4] lea r4d, [r5+33] pmaddubsw m0, m1 %if ARCH_X86_64 pmulhrsw m0, m10 %else pmulhrsw m0, m4 %endif packuswb m0, m0 movd [tlq-35], m0 .h16_main: movd m5, dyd sub tlq, r4 movd m4, r4d shl r4d, 6 movd m7, [tlq] pxor m6, m6 pshufb m5, [base+pw_256] neg dyq pshufb m7, m6 mova m3, [base+z3_shuf] lea r5, [dyq+r4+63] pshufb m4, m6 psubb m4, [base+pb_15to0] shl wd, 4 mova m6, m5 sub rsp, wq .h16_loop: mov r4, r5 pand m2, m8, m5 sar r4, 6 psubw m1, m9, m2 psllw m2, 8 movu m0, [tlq+r4-8*2] por m2, m1 movu m1, [tlq+r4-8*1] pshufb m0, m3 pmaddubsw m0, m2 pshufb m1, m3 pmaddubsw m1, m2 psrlw m2, m5, 6 paddw m5, m6 pmulhrsw m0, m10 pmulhrsw m1, m10 packsswb m2, m2 packuswb m0, m1 pcmpgtb m1, m4, m2 pand m0, m1 pandn m1, m7 por m0, m1 mova [rsp+wq-16], m0 sub wd, 16 jz .h16_transpose add r5, dyq jg .h16_loop .h16_end_loop: mova [rsp+wq-16], m7 sub wd, 16 jg .h16_end_loop .h16_transpose: %if ARCH_X86_32 mov strideq, [dstq] mov org_wd, [dstq+strideq] %endif or r3d, 16 cmp org_wd, 4 %if ARCH_X86_64 jne 
.end_transpose_main %else jne .end_transpose_loop %endif .h16_transpose_w4: mova m2, [rsp+16*3] mova m4, [rsp+16*2] mova m3, [rsp+16*1] mova m0, [rsp+16*0] lea r2, [strideq*3] add rsp, 16*4 punpckhbw m1, m2, m4 punpcklbw m2, m4 punpckhbw m4, m3, m0 punpcklbw m3, m0 punpckhwd m0, m1, m4 punpcklwd m1, m4 call .write_4x8 lea dstq, [dstq+strideq*4] punpckhwd m0, m2, m3 punpcklwd m1, m2, m3 jmp .write_4x8_end .h32: lea r4d, [wq+31] and r4d, 31 or r4d, 32 ; imin(w+31, 63) test angled, 0x400 ; !enable_intra_edge_filter jnz .h32_main mova m0, [tlq-16*4+1] mova m1, [tlq-16*3+1] mova m2, [tlq-16*2+1] mova m3, [tlq-16*1+1] movd m4, [tlq-16*0+1] neg r4 movd m5, [tlq+r4] pxor m7, m7 lea tlq, [rsp+16*4] mova [tlq-16*3], m0 mova [tlq-16*2], m1 xor r5d, r5d ; filter_strength = 3 mova [tlq-16*1], m2 pshufb m4, m7 mova [tlq+16*0], m3 pshufb m5, m7 mova [tlq+16*1], m4 movq [tlq+r4+8], m5 neg r4d call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge sub tlq, 16*2 call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge add tlq, 63 cmp wd, 32 jle .h32_main pshuflw m0, [tlq-79], q0000 movq m1, [base+z3_filter_k_tail] add r4d, 2 pmaddubsw m0, m1 %if ARCH_X86_64 pmulhrsw m0, m10 %else pmulhrsw m0, m4 %endif packuswb m0, m0 movd [tlq-67], m0 .h32_main: movd m5, dyd sub tlq, r4 movd m4, r4d shl r4d, 6 movd m7, [tlq] pxor m6, m6 pshufb m5, [base+pw_256] neg dyq pshufb m7, m6 mova m3, [base+z3_shuf] lea r5, [dyq+r4+63] pshufb m4, m6 psubb m4, [base+pb_15to0] mova m6, m5 .h32_loop: mov r4, r5 pand m2, m8, m5 sar r4, 6 psubw m1, m9, m2 psllw m2, 8 movu m0, [tlq+r4-8*4] por m2, m1 movu m1, [tlq+r4-8*3] pshufb m0, m3 pmaddubsw m0, m2 pshufb m1, m3 pmaddubsw m1, m2 pmulhrsw m0, m10 pmulhrsw m1, m10 sub rsp, 32 packuswb m0, m1 mova [rsp+16*0], m0 movu m0, [tlq+r4-8*2] movu m1, [tlq+r4-8*1] pshufb m0, m3 pshufb m1, m3 pmaddubsw m0, m2 pmaddubsw m1, m2 pmulhrsw m0, m10 pmulhrsw m1, m10 psrlw m2, m5, 6 paddw m5, m6 packsswb m2, m2 packuswb m0, m1 pcmpgtb m1, m4, m2 paddsb m2, [base+pb_16] pand m0, m1 pandn m1, m7 por m0, m1 pcmpgtb m1, m4, m2 mova [rsp+16*1], m0 pand m0, m1, [rsp+16*0] pandn m1, m7 por m0, m1 mova [rsp+16*0], m0 dec wd jz .h32_transpose add r5, dyq jg .h32_loop .h32_end_loop: sub rsp, 32 mova [rsp+16*1], m7 mova [rsp+16*0], m7 dec wd jg .h32_end_loop .h32_transpose: or r3d, 32 jmp .end_transpose_main .h64: lea r4d, [wq+63] test angled, 0x400 ; !enable_intra_edge_filter jnz .h64_main mova m0, [tlq-16*8+1] mova m1, [tlq-16*7+1] mova m2, [tlq-16*6+1] mova m3, [tlq-16*5+1] mova [rsp+16*1], m0 mova [rsp+16*2], m1 mova [rsp+16*3], m2 mova [rsp+16*4], m3 mova m0, [tlq-16*4+1] mova m1, [tlq-16*3+1] mova m2, [tlq-16*2+1] mova m3, [tlq-16*1+1] movd m4, [tlq-16*0+1] neg r4 movd m5, [tlq+r4] pxor m7, m7 lea tlq, [rsp+16*8] mova [tlq-16*3], m0 mova [tlq-16*2], m1 xor r5d, r5d ; filter_strength = 3 mova [tlq-16*1], m2 pshufb m4, m7 mova [tlq+16*0], m3 pshufb m5, m7 mova [tlq+16*1], m4 movq [tlq+r4+8], m5 neg r4d call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge sub tlq, 16*2 call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge sub tlq, 16*2 call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge sub tlq, 16*2 cmp wd, 64 jl .h64_filter96 ; skip one call if the last 32 bytes aren't used call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge .h64_filter96: add tlq, 127 .h64_main: movd m5, dyd sub tlq, r4 movd m4, r4d shl r4d, 6 movd m7, [tlq] pxor m6, m6 pshufb m5, [base+pw_256] neg dyq pshufb m7, m6 mova m3, [base+z3_shuf] lea r5, [dyq+r4+63] pshufb m4, m6 psubb m4, 
[base+pb_15to0] mova m6, m5 .h64_loop: mov r4, r5 pand m2, m8, m5 sar r4, 6 psubw m1, m9, m2 psllw m2, 8 movu m0, [tlq+r4-8*8] por m2, m1 movu m1, [tlq+r4-8*7] pshufb m0, m3 pmaddubsw m0, m2 pshufb m1, m3 pmaddubsw m1, m2 pmulhrsw m0, m10 pmulhrsw m1, m10 sub rsp, 64 packuswb m0, m1 mova [rsp+16*0], m0 movu m0, [tlq+r4-8*6] movu m1, [tlq+r4-8*5] pshufb m0, m3 pshufb m1, m3 pmaddubsw m0, m2 pmaddubsw m1, m2 pmulhrsw m0, m10 pmulhrsw m1, m10 packuswb m0, m1 mova [rsp+16*1], m0 movu m0, [tlq+r4-8*4] movu m1, [tlq+r4-8*3] pshufb m0, m3 pshufb m1, m3 pmaddubsw m0, m2 pmaddubsw m1, m2 pmulhrsw m0, m10 pmulhrsw m1, m10 packuswb m0, m1 mova [rsp+16*2], m0 movu m0, [tlq+r4-8*2] movu m1, [tlq+r4-8*1] pshufb m0, m3 pshufb m1, m3 pmaddubsw m0, m2 pmaddubsw m1, m2 pmulhrsw m0, m10 pmulhrsw m1, m10 psrlw m2, m5, 6 paddw m5, m6 packsswb m2, m2 packuswb m0, m1 pcmpgtb m1, m4, m2 paddsb m2, [base+pb_16] pand m0, m1 pandn m1, m7 por m0, m1 pcmpgtb m1, m4, m2 paddsb m2, [base+pb_16] mova [rsp+16*3], m0 pand m0, m1, [rsp+16*2] pandn m1, m7 por m0, m1 pcmpgtb m1, m4, m2 paddsb m2, [base+pb_16] mova [rsp+16*2], m0 pand m0, m1, [rsp+16*1] pandn m1, m7 por m0, m1 pcmpgtb m1, m4, m2 mova [rsp+16*1], m0 pand m0, m1, [rsp+16*0] pandn m1, m7 por m0, m1 mova [rsp+16*0], m0 dec wd jz .h64_transpose add r5, dyq jg .h64_loop .h64_end_loop: sub rsp, 64 mova [rsp+16*3], m7 mova [rsp+16*2], m7 mova [rsp+16*1], m7 mova [rsp+16*0], m7 dec wd jg .h64_end_loop .h64_transpose: or r3d, 64 .end_transpose_main: %if ARCH_X86_64 lea r5, [r3*3] lea r7, [strideq*3] %else mov strideq, [dstq] mov org_wd, [dstq+strideq] %endif .end_transpose_loop: lea r4, [rsp+r3-8] lea r6, [dstq+org_wq-8] .end_transpose_loop_y: movq m0, [r4+r3*1] movq m4, [r4+r3*0] %if ARCH_X86_64 movq m1, [r4+r5 ] movq m5, [r4+r3*2] lea r2, [r4+r3*4] %else lea r2, [r4+r3*2] movq m1, [r2+r3*1] movq m5, [r2+r3*0] lea r2, [r2+r3*2] %endif movq m2, [r2+r3*1] movq m6, [r2+r3*0] %if ARCH_X86_64 movq m3, [r2+r5 ] movq m7, [r2+r3*2] %else lea r2, [r2+r3*2] movq m3, [r2+r3*1] movq m7, [r2+r3*0] %endif sub r4, 8 punpcklbw m0, m4 punpcklbw m1, m5 punpcklbw m2, m6 punpcklbw m3, m7 punpckhwd m4, m1, m0 punpcklwd m1, m0 punpckhwd m0, m3, m2 punpcklwd m3, m2 punpckhdq m2, m3, m1 punpckldq m3, m1 punpckldq m1, m0, m4 punpckhdq m0, m4 movhps [r6+strideq*0], m0 movq [r6+strideq*1], m0 %if ARCH_X86_64 movhps [r6+strideq*2], m1 movq [r6+r7 ], m1 lea r6, [r6+strideq*4] %else lea r6, [r6+strideq*2] movhps [r6+strideq*0], m1 movq [r6+strideq*1], m1 lea r6, [r6+strideq*2] %endif movhps [r6+strideq*0], m2 movq [r6+strideq*1], m2 %if ARCH_X86_64 movhps [r6+strideq*2], m3 movq [r6+r7 ], m3 lea r6, [r6+strideq*4] %else lea r6, [r6+strideq*2] movhps [r6+strideq*0], m3 movq [r6+strideq*1], m3 lea r6, [r6+strideq*2] %endif cmp r4, rsp jae .end_transpose_loop_y lea rsp, [rsp+r3*8] sub org_wd, 8 jg .end_transpose_loop RET ;--------------------------------------------------------------------------------------- ;int dav1d_pal_pred_ssse3(pixel *dst, const ptrdiff_t stride, const uint16_t *const pal, ; const uint8_t *idx, const int w, const int h); ;--------------------------------------------------------------------------------------- cglobal pal_pred_8bpc, 4, 6, 5, dst, stride, pal, idx, w, h mova m4, [palq] LEA r2, pal_pred_ssse3_table tzcnt wd, wm movifnidn hd, hm movsxd wq, [r2+wq*4] packuswb m4, m4 add wq, r2 lea r2, [strideq*3] jmp wq .w4: pshufb m0, m4, [idxq] add idxq, 16 movd [dstq ], m0 pshuflw m1, m0, q1032 movd [dstq+strideq ], m1 punpckhqdq m0, m0 movd [dstq+strideq*2], m0 psrlq m0, 32 movd 
[dstq+r2 ], m0 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w4 RET ALIGN function_align .w8: pshufb m0, m4, [idxq] pshufb m1, m4, [idxq+16] add idxq, 32 movq [dstq ], m0 movhps [dstq+strideq ], m0 movq [dstq+strideq*2], m1 movhps [dstq+r2 ], m1 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w8 RET ALIGN function_align .w16: pshufb m0, m4, [idxq] pshufb m1, m4, [idxq+16] pshufb m2, m4, [idxq+32] pshufb m3, m4, [idxq+48] add idxq, 64 mova [dstq ], m0 mova [dstq+strideq ], m1 mova [dstq+strideq*2], m2 mova [dstq+r2 ], m3 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w16 RET ALIGN function_align .w32: pshufb m0, m4, [idxq] pshufb m1, m4, [idxq+16] pshufb m2, m4, [idxq+32] pshufb m3, m4, [idxq+48] add idxq, 64 mova [dstq ], m0 mova [dstq+16 ], m1 mova [dstq+strideq ], m2 mova [dstq+strideq+16], m3 lea dstq, [dstq+strideq*2] sub hd, 2 jg .w32 RET ALIGN function_align .w64: pshufb m0, m4, [idxq] pshufb m1, m4, [idxq+16] pshufb m2, m4, [idxq+32] pshufb m3, m4, [idxq+48] add idxq, 64 mova [dstq ], m0 mova [dstq+16], m1 mova [dstq+32], m2 mova [dstq+48], m3 add dstq, strideq sub hd, 1 jg .w64 RET ;--------------------------------------------------------------------------------------- ;void dav1d_ipred_cfl_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft, ; const int width, const int height, const int16_t *ac, const int alpha); ;--------------------------------------------------------------------------------------- %macro IPRED_CFL 1 ; ac in, unpacked pixels out psignw m3, m%1, m1 pabsw m%1, m%1 pmulhrsw m%1, m2 psignw m%1, m3 paddw m%1, m0 %endmacro %if UNIX64 DECLARE_REG_TMP 7 %else DECLARE_REG_TMP 5 %endif cglobal ipred_cfl_8bpc, 3, 7, 6, dst, stride, tl, w, h, ac, alpha movifnidn wd, wm movifnidn hd, hm tzcnt r6d, hd lea t0d, [wq+hq] movd m4, t0d tzcnt t0d, t0d movd m5, t0d LEA t0, ipred_cfl_ssse3_table tzcnt wd, wd movsxd r6, [t0+r6*4] movsxd wq, [t0+wq*4+16] pcmpeqd m3, m3 psrlw m4, 1 add r6, t0 add wq, t0 movifnidn acq, acmp jmp r6 .h4: movd m0, [tlq-4] pmaddubsw m0, m3 jmp wq .w4: movd m1, [tlq+1] pmaddubsw m1, m3 psubw m0, m4 paddw m0, m1 pmaddwd m0, m3 cmp hd, 4 jg .w4_mul psrlw m0, 3 ; dc >>= ctz(width + height); jmp .w4_end .w4_mul: punpckhqdq m1, m0, m0 paddw m0, m1 pshuflw m1, m0, q1032 ; psrlq m1, m0, 32 paddw m0, m1 psrlw m0, 2 mov r6d, 0x5556 mov r2d, 0x3334 test hd, 8 cmovz r6d, r2d movd m5, r6d pmulhuw m0, m5 .w4_end: pshuflw m0, m0, q0000 punpcklqdq m0, m0 .s4: movd m1, alpham pshuflw m1, m1, q0000 punpcklqdq m1, m1 lea r6, [strideq*3] pabsw m2, m1 psllw m2, 9 .s4_loop: mova m4, [acq] mova m5, [acq+16] IPRED_CFL 4 IPRED_CFL 5 packuswb m4, m5 movd [dstq+strideq*0], m4 pshuflw m4, m4, q1032 movd [dstq+strideq*1], m4 punpckhqdq m4, m4 movd [dstq+strideq*2], m4 psrlq m4, 32 movd [dstq+r6 ], m4 lea dstq, [dstq+strideq*4] add acq, 32 sub hd, 4 jg .s4_loop RET ALIGN function_align .h8: movq m0, [tlq-8] pmaddubsw m0, m3 jmp wq .w8: movq m1, [tlq+1] pmaddubsw m1, m3 psubw m4, m0 punpckhqdq m0, m0 psubw m0, m4 paddw m0, m1 pshuflw m1, m0, q1032 ; psrlq m1, m0, 32 paddw m0, m1 pmaddwd m0, m3 psrlw m0, m5 cmp hd, 8 je .w8_end mov r6d, 0x5556 mov r2d, 0x3334 cmp hd, 32 cmovz r6d, r2d movd m1, r6d pmulhuw m0, m1 .w8_end: pshuflw m0, m0, q0000 punpcklqdq m0, m0 .s8: movd m1, alpham pshuflw m1, m1, q0000 punpcklqdq m1, m1 lea r6, [strideq*3] pabsw m2, m1 psllw m2, 9 .s8_loop: mova m4, [acq] mova m5, [acq+16] IPRED_CFL 4 IPRED_CFL 5 packuswb m4, m5 movq [dstq ], m4 movhps [dstq+strideq ], m4 mova m4, [acq+32] mova m5, [acq+48] IPRED_CFL 4 IPRED_CFL 5 packuswb m4, m5 movq [dstq+strideq*2], m4 
movhps [dstq+r6 ], m4 lea dstq, [dstq+strideq*4] add acq, 64 sub hd, 4 jg .s8_loop RET ALIGN function_align .h16: mova m0, [tlq-16] pmaddubsw m0, m3 jmp wq .w16: movu m1, [tlq+1] pmaddubsw m1, m3 paddw m0, m1 psubw m4, m0 punpckhqdq m0, m0 psubw m0, m4 pshuflw m1, m0, q1032 ; psrlq m1, m0, 32 paddw m0, m1 pmaddwd m0, m3 psrlw m0, m5 cmp hd, 16 je .w16_end mov r6d, 0x5556 mov r2d, 0x3334 test hd, 8|32 cmovz r6d, r2d movd m1, r6d pmulhuw m0, m1 .w16_end: pshuflw m0, m0, q0000 punpcklqdq m0, m0 .s16: movd m1, alpham pshuflw m1, m1, q0000 punpcklqdq m1, m1 pabsw m2, m1 psllw m2, 9 .s16_loop: mova m4, [acq] mova m5, [acq+16] IPRED_CFL 4 IPRED_CFL 5 packuswb m4, m5 mova [dstq], m4 mova m4, [acq+32] mova m5, [acq+48] IPRED_CFL 4 IPRED_CFL 5 packuswb m4, m5 mova [dstq+strideq], m4 lea dstq, [dstq+strideq*2] add acq, 64 sub hd, 2 jg .s16_loop RET ALIGN function_align .h32: mova m0, [tlq-32] pmaddubsw m0, m3 mova m2, [tlq-16] pmaddubsw m2, m3 paddw m0, m2 jmp wq .w32: movu m1, [tlq+1] pmaddubsw m1, m3 movu m2, [tlq+17] pmaddubsw m2, m3 paddw m1, m2 paddw m0, m1 psubw m4, m0 punpckhqdq m0, m0 psubw m0, m4 pshuflw m1, m0, q1032 ; psrlq m1, m0, 32 paddw m0, m1 pmaddwd m0, m3 psrlw m0, m5 cmp hd, 32 je .w32_end lea r2d, [hq*2] mov r6d, 0x5556 mov r2d, 0x3334 test hd, 64|16 cmovz r6d, r2d movd m1, r6d pmulhuw m0, m1 .w32_end: pshuflw m0, m0, q0000 punpcklqdq m0, m0 .s32: movd m1, alpham pshuflw m1, m1, q0000 punpcklqdq m1, m1 pabsw m2, m1 psllw m2, 9 .s32_loop: mova m4, [acq] mova m5, [acq+16] IPRED_CFL 4 IPRED_CFL 5 packuswb m4, m5 mova [dstq], m4 mova m4, [acq+32] mova m5, [acq+48] IPRED_CFL 4 IPRED_CFL 5 packuswb m4, m5 mova [dstq+16], m4 add dstq, strideq add acq, 64 dec hd jg .s32_loop RET ;--------------------------------------------------------------------------------------- ;void dav1d_ipred_cfl_left_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft, ; const int width, const int height, const int16_t *ac, const int alpha); ;--------------------------------------------------------------------------------------- cglobal ipred_cfl_left_8bpc, 3, 7, 6, dst, stride, tl, w, h, ac, alpha mov hd, hm ; zero upper half tzcnt r6d, hd sub tlq, hq tzcnt wd, wm movu m0, [tlq] mov t0d, 0x8000 movd m3, t0d movd m2, r6d psrld m3, m2 LEA t0, ipred_cfl_left_ssse3_table movsxd r6, [t0+r6*4] pcmpeqd m2, m2 pmaddubsw m0, m2 add r6, t0 add t0, ipred_cfl_splat_ssse3_table-ipred_cfl_left_ssse3_table movsxd wq, [t0+wq*4] add wq, t0 movifnidn acq, acmp jmp r6 .h32: movu m1, [tlq+16] ; unaligned when jumping here from dc_top pmaddubsw m1, m2 paddw m0, m1 .h16: pshufd m1, m0, q3232 ; psrlq m1, m0, 16 paddw m0, m1 .h8: pshuflw m1, m0, q1032 ; psrlq m1, m0, 32 paddw m0, m1 .h4: pmaddwd m0, m2 pmulhrsw m0, m3 pshuflw m0, m0, q0000 punpcklqdq m0, m0 jmp wq ;--------------------------------------------------------------------------------------- ;void dav1d_ipred_cfl_top_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft, ; const int width, const int height, const int16_t *ac, const int alpha); ;--------------------------------------------------------------------------------------- cglobal ipred_cfl_top_8bpc, 3, 7, 6, dst, stride, tl, w, h, ac, alpha LEA t0, ipred_cfl_left_ssse3_table tzcnt wd, wm inc tlq movu m0, [tlq] movifnidn hd, hm mov r6d, 0x8000 movd m3, r6d movd m2, wd psrld m3, m2 movsxd r6, [t0+wq*4] pcmpeqd m2, m2 pmaddubsw m0, m2 add r6, t0 add t0, ipred_cfl_splat_ssse3_table-ipred_cfl_left_ssse3_table movsxd wq, [t0+wq*4] add wq, t0 movifnidn acq, acmp jmp r6 
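; Rough C-style model of the per-pixel chroma-from-luma step that the IPRED_CFL
; macro above implements and that every ipred_cfl* entry point funnels into
; (orientation aid only, 8bpc output assumed; not a literal transcription):
;   diff   = alpha * ac[x]                       ; ac[] is the mean-subtracted luma
;   dst[x] = clip_pixel(dc + sign(diff) * ((abs(diff) + 32) >> 6))
; where dc is the rounded average of the selected edge pixels (top, left, both,
; or the constant 128 for the _128 variant below).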
;--------------------------------------------------------------------------------------- ;void dav1d_ipred_cfl_128_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft, ; const int width, const int height, const int16_t *ac, const int alpha); ;--------------------------------------------------------------------------------------- cglobal ipred_cfl_128_8bpc, 3, 7, 6, dst, stride, tl, w, h, ac, alpha tzcnt wd, wm movifnidn hd, hm LEA r6, ipred_cfl_splat_ssse3_table movsxd wq, [r6+wq*4] movddup m0, [r6-ipred_cfl_splat_ssse3_table+pw_128] add wq, r6 movifnidn acq, acmp jmp wq %macro RELOAD_ACQ_32 1 mov acq, ac_bakq ; restore acq %endmacro %if ARCH_X86_64 cglobal ipred_cfl_ac_420_8bpc, 4, 8, 7, ac, y, stride, wpad, hpad, w, h, ac_bak DECLARE_REG_TMP 7 movddup m2, [pb_2] %else cglobal ipred_cfl_ac_420_8bpc, 4, 7, 7, ac, y, stride, wpad, hpad, w, h DECLARE_REG_TMP 4 %define ac_bakq acmp mov t0d, 0x02020202 movd m2, t0d pshufd m2, m2, q0000 %endif movifnidn wd, wm mov t0d, hm mov hd, t0d imul t0d, wd movd m5, t0d movifnidn hpadd, hpadm %if ARCH_X86_64 mov ac_bakq, acq %endif shl hpadd, 2 sub hd, hpadd pxor m4, m4 cmp wd, 8 jg .w16 je .w8 ; fall-through %if ARCH_X86_64 DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h, ac_bak %else DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h %endif .w4: lea stride3q, [strideq*3] .w4_loop: movq m0, [yq] movq m1, [yq+strideq] movhps m0, [yq+strideq*2] movhps m1, [yq+stride3q] pmaddubsw m0, m2 pmaddubsw m1, m2 paddw m0, m1 mova [acq], m0 paddw m4, m0 lea yq, [yq+strideq*4] add acq, 16 sub hd, 2 jg .w4_loop test hpadd, hpadd jz .calc_avg_4_8 punpckhqdq m0, m0 .w4_hpad_loop: mova [acq], m0 paddw m4, m0 add acq, 16 sub hpadd, 2 jg .w4_hpad_loop jmp .calc_avg_4_8 .w8: lea stride3q, [strideq*3] test wpadd, wpadd jnz .w8_wpad .w8_loop: mova m0, [yq] mova m1, [yq+strideq] pmaddubsw m0, m2 pmaddubsw m1, m2 paddw m0, m1 mova [acq], m0 paddw m4, m0 mova m0, [yq+strideq*2] mova m1, [yq+stride3q] pmaddubsw m0, m2 pmaddubsw m1, m2 paddw m0, m1 mova [acq+16], m0 paddw m4, m0 lea yq, [yq+strideq*4] add acq, 32 sub hd, 2 jg .w8_loop test hpadd, hpadd jz .calc_avg_4_8 jmp .w8_hpad .w8_wpad: ; wpadd=1 movddup m0, [yq] movddup m1, [yq+strideq] pmaddubsw m0, m2 pmaddubsw m1, m2 paddw m0, m1 pshufhw m0, m0, q3333 mova [acq], m0 paddw m4, m0 lea yq, [yq+strideq*2] add acq, 16 sub hd, 1 jg .w8_wpad test hpadd, hpadd jz .calc_avg_4_8 .w8_hpad: mova [acq], m0 paddw m4, m0 add acq, 16 sub hpadd, 1 jg .w8_hpad jmp .calc_avg_4_8 .w16: test wpadd, wpadd jnz .w16_wpad .w16_loop: mova m0, [yq] mova m1, [yq+strideq] pmaddubsw m0, m2 pmaddubsw m1, m2 paddw m0, m1 mova [acq], m0 paddw m4, m0 mova m6, [yq+16] mova m1, [yq+strideq+16] pmaddubsw m6, m2 pmaddubsw m1, m2 paddw m6, m1 mova [acq+16], m6 paddw m4, m6 lea yq, [yq+strideq*2] add acq, 32 dec hd jg .w16_loop test hpadd, hpadd jz .calc_avg16 jmp .w16_hpad_loop .w16_wpad: cmp wpadd, 2 jl .w16_pad1 je .w16_pad2 .w16_pad3: movddup m0, [yq] movddup m1, [yq+strideq] pmaddubsw m0, m2 pmaddubsw m1, m2 paddw m0, m1 pshufhw m0, m0, q3333 mova [acq], m0 paddw m4, m0 mova m6, m0 punpckhqdq m6, m0, m0 mova [acq+16], m6 paddw m4, m6 lea yq, [yq+strideq*2] add acq, 32 dec hd jg .w16_pad3 jmp .w16_wpad_done .w16_pad2: mova m0, [yq] mova m1, [yq+strideq] pmaddubsw m0, m2 pmaddubsw m1, m2 paddw m0, m1 mova [acq], m0 paddw m4, m0 pshufhw m6, m0, q3333 punpckhqdq m6, m6 mova [acq+16], m6 paddw m4, m6 lea yq, [yq+strideq*2] add acq, 32 dec hd jg .w16_pad2 jmp .w16_wpad_done .w16_pad1: mova m0, [yq] mova m1, [yq+strideq] pmaddubsw m0, m2 
pmaddubsw m1, m2 paddw m0, m1 mova [acq], m0 paddw m4, m0 movddup m6, [yq+16] movddup m1, [yq+strideq+16] pmaddubsw m6, m2 pmaddubsw m1, m2 paddw m6, m1 pshufhw m6, m6, q3333 mova [acq+16], m6 paddw m4, m6 lea yq, [yq+strideq*2] add acq, 32 dec hd jg .w16_pad1 .w16_wpad_done: test hpadd, hpadd jz .calc_avg16 .w16_hpad_loop: mova [acq], m0 paddw m4, m0 mova [acq+16], m6 paddw m4, m6 add acq, 32 dec hpadd jg .w16_hpad_loop jmp .calc_avg16 %if ARCH_X86_64 DEFINE_ARGS ac, y, iptr, wpad, hpad, sz, h, ac_bak %else DEFINE_ARGS ac, y, iptr, wpad, hpad, sz, h %endif .calc_avg_4_8: psrlw m2, 9 pmaddwd m4, m2 jmp .calc_avg .calc_avg16: psrld m0, m4, 16 pslld m4, 16 psrld m4, 16 paddd m4, m0 .calc_avg: movd szd, m5 psrad m5, 1 tzcnt r1d, szd paddd m4, m5 movd m1, r1d pshufd m0, m4, q2301 paddd m0, m4 pshufd m4, m0, q1032 paddd m0, m4 psrad m0, m1 ; sum >>= log2sz; packssdw m0, m0 RELOAD_ACQ_32 acq .sub_loop: mova m1, [acq] psubw m1, m0 ; ac[x] -= sum; mova [acq], m1 add acq, 16 sub szd, 8 jg .sub_loop RET %if ARCH_X86_64 cglobal ipred_cfl_ac_422_8bpc, 4, 8, 7, ac, y, stride, wpad, hpad, w, h, ac_bak movddup m2, [pb_4] %else cglobal ipred_cfl_ac_422_8bpc, 4, 7, 7, ac, y, stride, wpad, hpad, w, h mov t0d, 0x04040404 movd m2, t0d pshufd m2, m2, q0000 %endif movifnidn wd, wm mov t0d, hm mov hd, t0d imul t0d, wd movd m6, t0d movifnidn hpadd, hpadm %if ARCH_X86_64 mov ac_bakq, acq %endif shl hpadd, 2 sub hd, hpadd pxor m4, m4 pxor m5, m5 cmp wd, 8 jg .w16 je .w8 ; fall-through %if ARCH_X86_64 DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h, ac_bak %else DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h %endif .w4: lea stride3q, [strideq*3] .w4_loop: movq m1, [yq] movhps m1, [yq+strideq] movq m0, [yq+strideq*2] movhps m0, [yq+stride3q] pmaddubsw m0, m2 pmaddubsw m1, m2 mova [acq], m1 mova [acq+16], m0 paddw m4, m0 paddw m5, m1 lea yq, [yq+strideq*4] add acq, 32 sub hd, 4 jg .w4_loop test hpadd, hpadd jz .calc_avg_4 punpckhqdq m0, m0 .w4_hpad_loop: mova [acq], m0 paddw m4, m0 add acq, 16 sub hpadd, 2 jg .w4_hpad_loop jmp .calc_avg_4 .w8: lea stride3q, [strideq*3] test wpadd, wpadd jnz .w8_wpad .w8_loop: mova m1, [yq] mova m0, [yq+strideq] pmaddubsw m0, m2 pmaddubsw m1, m2 mova [acq], m1 mova [acq+16], m0 paddw m4, m0 paddw m5, m1 mova m1, [yq+strideq*2] mova m0, [yq+stride3q] pmaddubsw m0, m2 pmaddubsw m1, m2 mova [acq+32], m1 mova [acq+48], m0 paddw m4, m0 paddw m5, m1 lea yq, [yq+strideq*4] add acq, 64 sub hd, 4 jg .w8_loop test hpadd, hpadd jz .calc_avg_8_16 jmp .w8_hpad .w8_wpad: movddup m1, [yq] pmaddubsw m1, m2 pshufhw m1, m1, q3333 mova [acq], m1 paddw m5, m1 movddup m0, [yq+strideq] pmaddubsw m0, m2 pshufhw m0, m0, q3333 mova [acq+16], m0 paddw m4, m0 lea yq, [yq+strideq*2] add acq, 32 sub hd, 2 jg .w8_wpad test hpadd, hpadd jz .calc_avg_8_16 .w8_hpad: mova [acq], m0 paddw m4, m0 mova [acq+16], m0 paddw m4, m0 add acq, 32 sub hpadd, 2 jg .w8_hpad jmp .calc_avg_8_16 .w16: test wpadd, wpadd jnz .w16_wpad .w16_loop: mova m1, [yq] mova m0, [yq+16] pmaddubsw m0, m2 pmaddubsw m1, m2 mova [acq], m1 mova [acq+16], m0 paddw m5, m0 paddw m5, m1 mova m1, [yq+strideq] mova m0, [yq+strideq+16] pmaddubsw m0, m2 pmaddubsw m1, m2 mova [acq+32], m1 mova [acq+48], m0 paddw m4, m0 paddw m4, m1 lea yq, [yq+strideq*2] add acq, 64 sub hd, 2 jg .w16_loop test hpadd, hpadd jz .calc_avg_8_16 jmp .w16_hpad_loop .w16_wpad: cmp wpadd, 2 jl .w16_pad1 je .w16_pad2 .w16_pad3: movddup m1, [yq] pmaddubsw m1, m2 pshufhw m1, m1, q3333 mova [acq], m1 paddw m5, m1 punpckhqdq m1, m1 mova [acq+16], m1 paddw m5, m1 movddup m1, 
[yq+strideq] pmaddubsw m1, m2 pshufhw m1, m1, q3333 mova [acq+32], m1 paddw m4, m1 punpckhqdq m0, m1, m1 mova [acq+48], m0 paddw m4, m0 lea yq, [yq+strideq*2] add acq, 64 sub hd, 2 jg .w16_pad3 jmp .w16_wpad_done .w16_pad2: mova m1, [yq] pmaddubsw m1, m2 mova [acq], m1 paddw m5, m1 pshufhw m1, m1, q3333 punpckhqdq m1, m1 mova [acq+16], m1 paddw m5, m1 mova m1, [yq+strideq] pmaddubsw m1, m2 mova [acq+32], m1 paddw m4, m1 mova m0, m1 pshufhw m0, m0, q3333 punpckhqdq m0, m0 mova [acq+48], m0 paddw m4, m0 lea yq, [yq+strideq*2] add acq, 64 sub hd, 2 jg .w16_pad2 jmp .w16_wpad_done .w16_pad1: mova m1, [yq] pmaddubsw m1, m2 mova [acq], m1 paddw m5, m1 movddup m0, [yq+16] pmaddubsw m0, m2 pshufhw m0, m0, q3333 mova [acq+16], m0 paddw m5, m0 mova m1, [yq+strideq] pmaddubsw m1, m2 mova [acq+32], m1 paddw m4, m1 movddup m0, [yq+strideq+16] pmaddubsw m0, m2 pshufhw m0, m0, q3333 mova [acq+48], m0 paddw m4, m0 lea yq, [yq+strideq*2] add acq, 64 sub hd, 2 jg .w16_pad1 .w16_wpad_done: test hpadd, hpadd jz .calc_avg_8_16 .w16_hpad_loop: mova [acq], m1 mova [acq+16], m0 paddw m4, m1 paddw m5, m0 mova [acq+32], m1 mova [acq+48], m0 paddw m4, m1 paddw m5, m0 add acq, 64 sub hpadd, 2 jg .w16_hpad_loop jmp .calc_avg_8_16 %if ARCH_X86_64 DEFINE_ARGS ac, y, iptr, wpad, hpad, sz, h, ac_bak %else DEFINE_ARGS ac, y, iptr, wpad, hpad, sz, h %endif .calc_avg_4: psrlw m2, 10 pmaddwd m5, m2 pmaddwd m0, m4, m2 jmp .calc_avg .calc_avg_8_16: mova m0, m5 psrld m5, 16 pslld m0, 16 psrld m0, 16 paddd m5, m0 mova m0, m4 psrld m0, 16 pslld m4, 16 psrld m4, 16 paddd m0, m4 .calc_avg: paddd m5, m0 movd szd, m6 psrad m6, 1 tzcnt r1d, szd ; const int log2sz = ctz(width) + ctz(height); paddd m5, m6 movd m1, r1d pshufd m0, m5, q2301 paddd m0, m5 pshufd m5, m0, q1032 paddd m0, m5 psrad m0, m1 ; sum >>= log2sz; packssdw m0, m0 RELOAD_ACQ_32 acq ; ac = ac_orig .sub_loop: mova m1, [acq] psubw m1, m0 mova [acq], m1 add acq, 16 sub szd, 8 jg .sub_loop RET %if ARCH_X86_64 cglobal ipred_cfl_ac_444_8bpc, 4, 8, 7, -4*16, ac, y, stride, wpad, hpad, w, h, ac_bak movddup m2, [pb_4] %else cglobal ipred_cfl_ac_444_8bpc, 4, 7, 7, -5*16, ac, y, stride, wpad, hpad, w, h %define ac_bakq [rsp+16*4] mov t0d, 0x04040404 movd m2, t0d pshufd m2, m2, q0000 %endif movifnidn wd, wm movifnidn hpadd, hpadm movd m0, hpadd mov t0d, hm mov hd, t0d imul t0d, wd movd m6, t0d movd hpadd, m0 mov ac_bakq, acq shl hpadd, 2 sub hd, hpadd pxor m5, m5 pxor m4, m4 cmp wd, 16 jg .w32 cmp wd, 8 jg .w16 je .w8 ; fall-through %if ARCH_X86_64 DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h, ac_bak %else DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h %endif .w4: lea stride3q, [strideq*3] .w4_loop: movd m1, [yq] movd m3, [yq+strideq] punpckldq m1, m3 punpcklbw m1, m1 movd m0, [yq+strideq*2] movd m3, [yq+stride3q] punpckldq m0, m3 punpcklbw m0, m0 pmaddubsw m1, m2 pmaddubsw m0, m2 mova [acq], m1 mova [acq+16], m0 paddw m5, m0 paddw m5, m1 lea yq, [yq+strideq*4] add acq, 32 sub hd, 4 jg .w4_loop test hpadd, hpadd jz .calc_avg_4 punpckhqdq m0, m0 .w4_hpad_loop: mova [acq], m0 paddw m5, m0 add acq, 16 sub hpadd, 2 jg .w4_hpad_loop .calc_avg_4: psrlw m2, 10 pmaddwd m5, m2 jmp .calc_avg .w8: lea stride3q, [strideq*3] test wpadd, wpadd jnz .w8_wpad .w8_loop: movq m1, [yq] punpcklbw m1, m1 pmaddubsw m1, m2 mova [acq], m1 paddw m5, m1 movq m0, [yq+strideq] punpcklbw m0, m0 pmaddubsw m0, m2 mova [acq+16], m0 paddw m5, m0 movq m1, [yq+strideq*2] punpcklbw m1, m1 pmaddubsw m1, m2 mova [acq+32], m1 paddw m4, m1 movq m0, [yq+stride3q] punpcklbw m0, m0 pmaddubsw m0, m2 mova [acq+48], m0 
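; m4/m5 keep a running sum of the scaled luma written to ac[]; .calc_avg later
; turns that sum into the block mean and .sub_loop subtracts it from every entry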
paddw m4, m0 lea yq, [yq+strideq*4] add acq, 64 sub hd, 4 jg .w8_loop test hpadd, hpadd jz .calc_avg_8_16 jmp .w8_hpad .w8_wpad: movd m1, [yq] punpcklbw m1, m1 punpcklqdq m1, m1 pmaddubsw m1, m2 pshufhw m1, m1, q3333 mova [acq], m1 paddw m5, m1 movd m0, [yq+strideq] punpcklbw m0, m0 punpcklqdq m0, m0 pmaddubsw m0, m2 pshufhw m0, m0, q3333 mova [acq+16], m0 paddw m4, m0 lea yq, [yq+strideq*2] add acq, 32 sub hd, 2 jg .w8_wpad test hpadd, hpadd jz .calc_avg_8_16 .w8_hpad: mova [acq], m0 paddw m5, m0 mova [acq+16], m0 paddw m4, m0 add acq, 32 sub hpadd, 2 jg .w8_hpad jmp .calc_avg_8_16 .w16: test wpadd, wpadd jnz .w16_wpad .w16_loop: mova m0, [yq] mova m1, m0 punpcklbw m1, m1 pmaddubsw m1, m2 mova [acq], m1 paddw m5, m1 punpckhbw m0, m0 pmaddubsw m0, m2 mova [acq+16], m0 paddw m5, m0 mova m0, [yq+strideq] mova m1, m0 punpcklbw m1, m1 pmaddubsw m1, m2 mova [acq+32], m1 paddw m4, m1 punpckhbw m0, m0 pmaddubsw m0, m2 mova [acq+48], m0 paddw m4, m0 lea yq, [yq+strideq*2] add acq, 64 sub hd, 2 jg .w16_loop test hpadd, hpadd jz .calc_avg_8_16 jmp .w16_hpad_loop .w16_wpad: cmp wpadd, 2 jl .w16_pad1 je .w16_pad2 .w16_pad3: movd m1, [yq] punpcklbw m1, m1 punpcklqdq m1, m1 pshufhw m1, m1, q3333 pmaddubsw m1, m2 mova [acq], m1 paddw m5, m1 punpckhqdq m1, m1 mova [acq+16], m1 paddw m5, m1 movd m1, [yq+strideq] punpcklbw m1, m1 punpcklqdq m1, m1 pshufhw m1, m1, q3333 pmaddubsw m1, m2 mova [acq+32], m1 paddw m4, m1 punpckhqdq m0, m1, m1 mova [acq+48], m0 paddw m4, m0 lea yq, [yq+strideq*2] add acq, 64 sub hd, 2 jg .w16_pad3 jmp .w16_wpad_done .w16_pad2: movq m1, [yq] punpcklbw m1, m1 pmaddubsw m1, m2 mova [acq], m1 paddw m5, m1 pshufhw m1, m1, q3333 punpckhqdq m1, m1 mova [acq+16], m1 paddw m5, m1 movq m1, [yq+strideq] punpcklbw m1, m1 pmaddubsw m1, m2 mova [acq+32], m1 paddw m4, m1 mova m0, m1 pshufhw m0, m0, q3333 punpckhqdq m0, m0 mova [acq+48], m0 paddw m4, m0 lea yq, [yq+strideq*2] add acq, 64 sub hd, 2 jg .w16_pad2 jmp .w16_wpad_done .w16_pad1: mova m0, [yq] mova m1, m0 punpcklbw m1, m1 pmaddubsw m1, m2 mova [acq], m1 paddw m5, m1 punpckhbw m0, m0 punpcklqdq m0, m0 pshufhw m0, m0, q3333 pmaddubsw m0, m2 mova [acq+16], m0 paddw m5, m0 mova m0, [yq+strideq] mova m1, m0 punpcklbw m1, m1 pmaddubsw m1, m2 mova [acq+32], m1 paddw m4, m1 punpckhbw m0, m0 punpcklqdq m0, m0 pshufhw m0, m0, q3333 pmaddubsw m0, m2 mova [acq+48], m0 paddw m4, m0 lea yq, [yq+strideq*2] add acq, 64 sub hd, 2 jg .w16_pad1 .w16_wpad_done: test hpadd, hpadd jz .calc_avg_8_16 .w16_hpad_loop: mova [acq], m1 mova [acq+16], m0 paddw m4, m1 paddw m5, m0 mova [acq+32], m1 mova [acq+48], m0 paddw m4, m1 paddw m5, m0 add acq, 64 sub hpadd, 2 jg .w16_hpad_loop .calc_avg_8_16: mova m0, m5 psrld m5, 16 pslld m0, 16 psrld m0, 16 paddd m5, m0 mova m0, m4 psrld m0, 16 pslld m4, 16 psrld m4, 16 paddd m0, m4 paddd m5, m0 jmp .calc_avg .w32: pxor m0, m0 mova [rsp ], m0 mova [rsp+16], m0 mova [rsp+32], m0 mova [rsp+48], m0 test wpadd, wpadd jnz .w32_wpad .w32_loop: mova m0, [yq] mova m1, m0 punpcklbw m1, m1 pmaddubsw m1, m2 mova [acq], m1 paddw m5, m1, [rsp] mova [rsp ], m5 punpckhbw m0, m0 pmaddubsw m0, m2 mova [acq+16], m0 paddw m5, m0, [rsp+16] mova [rsp+16], m5 mova m4, [yq+16] mova m3, m4 punpcklbw m3, m3 pmaddubsw m3, m2 mova [acq+32], m3 paddw m5, m3, [rsp+32] mova [rsp+32], m5 punpckhbw m4, m4 pmaddubsw m4, m2 mova [acq+48], m4 paddw m5, m4, [rsp+48] mova [rsp+48], m5 lea yq, [yq+strideq] add acq, 64 sub hd, 1 jg .w32_loop test hpadd, hpadd jz .calc_avg_32 jmp .w32_hpad_loop .w32_wpad: cmp wpadd, 2 jl .w32_pad1 je .w32_pad2 cmp wpadd, 4 jl 
.w32_pad3 je .w32_pad4 cmp wpadd, 6 jl .w32_pad5 je .w32_pad6 .w32_pad7: movd m1, [yq] punpcklbw m1, m1 punpcklqdq m1, m1 pshufhw m1, m1, q3333 pmaddubsw m1, m2 mova [acq], m1 paddw m5, m1, [rsp] mova [rsp ], m5 mova m0, m1 punpckhqdq m0, m0 mova [acq+16], m0 paddw m5, m0, [rsp+16] mova [rsp+16], m5 mova m3, m0 mova [acq+32], m3 paddw m5, m3, [rsp+32] mova [rsp+32], m5 mova m4, m3 mova [acq+48], m4 paddw m5, m4, [rsp+48] mova [rsp+48], m5 lea yq, [yq+strideq] add acq, 64 sub hd, 1 jg .w32_pad7 jmp .w32_wpad_done .w32_pad6: mova m0, [yq] mova m1, m0 punpcklbw m1, m1 pmaddubsw m1, m2 mova [acq], m1 paddw m5, m1, [rsp] mova [rsp ], m5 pshufhw m0, m1, q3333 punpckhqdq m0, m0 mova [acq+16], m0 paddw m5, m0, [rsp+16] mova [rsp+16], m5 mova m3, m0 mova [acq+32], m3 paddw m5, m3, [rsp+32] mova [rsp+32], m5 mova m4, m3 mova [acq+48], m4 paddw m5, m4, [rsp+48] mova [rsp+48], m5 lea yq, [yq+strideq] add acq, 64 sub hd, 1 jg .w32_pad6 jmp .w32_wpad_done .w32_pad5: mova m0, [yq] mova m1, m0 punpcklbw m1, m1 pmaddubsw m1, m2 mova [acq], m1 mova m5, [rsp] paddw m5, m1 mova [rsp ], m5 punpckhbw m0, m0 punpcklqdq m0, m0 pshufhw m0, m0, q3333 pmaddubsw m0, m2 mova [acq+16], m0 paddw m5, m0, [rsp+16] mova [rsp+16], m5 mova m3, m0 punpckhqdq m3, m3 mova [acq+32], m3 paddw m5, m3, [rsp+32] mova [rsp+32], m5 mova m4, m3 mova [acq+48], m4 paddw m5, m4, [rsp+48] mova [rsp+48], m5 lea yq, [yq+strideq] add acq, 64 sub hd, 1 jg .w32_pad5 jmp .w32_wpad_done .w32_pad4: mova m0, [yq] mova m1, m0 punpcklbw m1, m1 pmaddubsw m1, m2 mova [acq], m1 paddw m5, m1, [rsp] mova [rsp ], m5 punpckhbw m0, m0 pmaddubsw m0, m2 mova [acq+16], m0 paddw m5, m0, [rsp+16] mova [rsp+16], m5 mova m3, m0 pshufhw m3, m3, q3333 punpckhqdq m3, m3 mova [acq+32], m3 paddw m5, m3, [rsp+32] mova [rsp+32], m5 mova m4, m3 mova [acq+48], m4 paddw m5, m4, [rsp+48] mova [rsp+48], m5 lea yq, [yq+strideq] add acq, 64 sub hd, 1 jg .w32_pad4 jmp .w32_wpad_done .w32_pad3: mova m0, [yq] mova m1, m0 punpcklbw m1, m1 pmaddubsw m1, m2 mova [acq], m1 paddw m5, m1, [rsp] mova [rsp ], m5 punpckhbw m0, m0 pmaddubsw m0, m2 mova [acq+16], m0 paddw m5, m0, [rsp+16] mova [rsp+16], m5 movd m3, [yq+16] punpcklbw m3, m3 punpcklqdq m3, m3 pshufhw m3, m3, q3333 pmaddubsw m3, m2 mova [acq+32], m3 paddw m5, m3, [rsp+32] mova [rsp+32], m5 mova m4, m3 punpckhqdq m4, m4 mova [acq+48], m4 paddw m5, m4, [rsp+48] mova [rsp+48], m5 lea yq, [yq+strideq] add acq, 64 sub hd, 1 jg .w32_pad3 jmp .w32_wpad_done .w32_pad2: mova m0, [yq] mova m1, m0 punpcklbw m1, m1 pmaddubsw m1, m2 mova [acq], m1 paddw m5, m1, [rsp] mova [rsp ], m5 punpckhbw m0, m0 pmaddubsw m0, m2 mova [acq+16], m0 paddw m5, m0, [rsp+16] mova [rsp+16], m5 mova m3, [yq+16] punpcklbw m3, m3 pmaddubsw m3, m2 mova [acq+32], m3 paddw m5, m3, [rsp+32] mova [rsp+32], m5 pshufhw m4, m3, q3333 punpckhqdq m4, m4 mova [acq+48], m4 paddw m5, m4, [rsp+48] mova [rsp+48], m5 lea yq, [yq+strideq] add acq, 64 sub hd, 1 jg .w32_pad2 jmp .w32_wpad_done .w32_pad1: mova m0, [yq] mova m1, m0 punpcklbw m1, m1 pmaddubsw m1, m2 mova [acq], m1 paddw m5, m1, [rsp] mova [rsp ], m5 punpckhbw m0, m0 pmaddubsw m0, m2 mova [acq+16], m0 paddw m5, m0, [rsp+16] mova [rsp+16], m5 mova m4, [yq+16] mova m3, m4 punpcklbw m3, m3 pmaddubsw m3, m2 mova [acq+32], m3 paddw m5, m3, [rsp+32] mova [rsp+32], m5 punpckhbw m4, m4 punpcklqdq m4, m4 pshufhw m4, m4, q3333 pmaddubsw m4, m2 mova [acq+48], m4 paddw m5, m4, [rsp+48] mova [rsp+48], m5 lea yq, [yq+strideq] add acq, 64 sub hd, 1 jg .w32_pad1 .w32_wpad_done: test hpadd, hpadd jz .calc_avg_32 .w32_hpad_loop: mova 
[acq], m1 mova [acq+16], m0 paddw m5, m1, [rsp] mova [rsp ], m5 paddw m5, m0, [rsp+16] mova [rsp+16], m5 mova [acq+32], m3 mova [acq+48], m4 paddw m5, m3, [rsp+32] mova [rsp+32], m5 paddw m5, m4, [rsp+48] mova [rsp+48], m5 add acq, 64 sub hpadd, 1 jg .w32_hpad_loop %if ARCH_X86_64 DEFINE_ARGS ac, y, iptr, wpad, hpad, sz, h, ac_bak %else DEFINE_ARGS ac, y, iptr, wpad, hpad, sz, h %endif .calc_avg_32: mova m5, [rsp] mova m0, m5 psrld m5, 16 pslld m0, 16 psrld m0, 16 paddd m5, m0 mova m0, [rsp+16] mova m3, m0 psrld m0, 16 pslld m3, 16 psrld m3, 16 paddd m0, m3 paddd m5, m0 mova m0, [rsp+32] mova m3, m0 psrld m0, 16 pslld m3, 16 psrld m3, 16 paddd m0, m3 mova m1, [rsp+48] mova m3, m1 psrld m1, 16 pslld m3, 16 psrld m3, 16 paddd m1, m3 paddd m1, m0 paddd m5, m1 .calc_avg: movd szd, m6 psrad m6, 1 tzcnt r1d, szd ; const int log2sz = ctz(width) + ctz(height); paddd m5, m6 movd m1, r1d pshufd m0, m5, q2301 paddd m0, m5 pshufd m5, m0, q1032 paddd m0, m5 psrad m0, m1 ; sum >>= log2sz; packssdw m0, m0 RELOAD_ACQ_32 acq ; ac = ac_orig .sub_loop: mova m1, [acq] psubw m1, m0 mova [acq], m1 add acq, 16 sub szd, 8 jg .sub_loop RET ; %1 simd register that hold the mask and will hold the result ; %2 simd register that holds the "true" values ; %3 location of the "false" values (simd register/memory) %macro BLEND 3 ; mask, true, false pand %2, %1 pandn %1, %3 por %1, %2 %endmacro %macro PAETH 2 ; top, ldiff pavgb m1, m%1, m3 pxor m0, m%1, m3 pand m0, m4 psubusb m2, m5, m1 psubb m1, m0 psubusb m1, m5 por m1, m2 paddusb m1, m1 por m1, m0 ; min(tldiff, 255) psubusb m2, m5, m3 psubusb m0, m3, m5 por m2, m0 ; tdiff %ifnum %2 pminub m2, m%2 pcmpeqb m0, m%2, m2 ; ldiff <= tdiff %else mova m0, %2 pminub m2, m0 pcmpeqb m0, m2 %endif pminub m1, m2 pcmpeqb m1, m2 ; ldiff <= tldiff && tdiff <= tldiff mova m2, m3 BLEND m0, m2, m%1 BLEND m1, m0, m5 %endmacro cglobal ipred_paeth_8bpc, 3, 6, 8, -7*16, dst, stride, tl, w, h %define base r5-ipred_paeth_ssse3_table tzcnt wd, wm movifnidn hd, hm pxor m0, m0 movd m5, [tlq] pshufb m5, m0 LEA r5, ipred_paeth_ssse3_table movsxd wq, [r5+wq*4] movddup m4, [base+ipred_paeth_shuf] add wq, r5 jmp wq .w4: movd m6, [tlq+1] ; top pshufd m6, m6, q0000 lea r3, [strideq*3] psubusb m7, m5, m6 psubusb m0, m6, m5 por m7, m0 ; ldiff .w4_loop: sub tlq, 4 movd m3, [tlq] mova m1, [base+ipred_h_shuf] pshufb m3, m1 ; left PAETH 6, 7 movd [dstq ], m1 pshuflw m0, m1, q1032 movd [dstq+strideq ], m0 punpckhqdq m1, m1 movd [dstq+strideq*2], m1 psrlq m1, 32 movd [dstq+r3 ], m1 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w4_loop RET ALIGN function_align .w8: movddup m6, [tlq+1] psubusb m7, m5, m6 psubusb m0, m6, m5 por m7, m0 .w8_loop: sub tlq, 2 movd m3, [tlq] pshufb m3, [base+ipred_paeth_shuf] PAETH 6, 7 movq [dstq ], m1 movhps [dstq+strideq], m1 lea dstq, [dstq+strideq*2] sub hd, 2 jg .w8_loop RET ALIGN function_align .w16: movu m6, [tlq+1] psubusb m7, m5, m6 psubusb m0, m6, m5 por m7, m0 .w16_loop: sub tlq, 1 movd m3, [tlq] pxor m1, m1 pshufb m3, m1 PAETH 6, 7 mova [dstq], m1 add dstq, strideq sub hd, 1 jg .w16_loop RET ALIGN function_align .w32: movu m6, [tlq+1] psubusb m7, m5, m6 psubusb m0, m6, m5 por m7, m0 mova [rsp ], m6 mova [rsp+16], m7 movu m6, [tlq+17] psubusb m7, m5, m6 psubusb m0, m6, m5 por m7, m0 mova [rsp+32], m6 .w32_loop: dec tlq movd m3, [tlq] pxor m1, m1 pshufb m3, m1 mova m6, [rsp] PAETH 6, [rsp+16] mova [dstq ], m1 mova m6, [rsp+32] PAETH 6, 7 mova [dstq+16], m1 add dstq, strideq dec hd jg .w32_loop RET ALIGN function_align .w64: movu m6, [tlq+1] psubusb m7, m5, m6 psubusb m0, m6, m5 
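; ipred_paeth_8bpc: scalar model of what the PAETH macro selects, for
; reference:
;   base = left + top - topleft
;   out  = whichever of {left, top, topleft} is closest to base
; ldiff, tdiff and tldiff are the per-pixel distances |top - topleft|,
; |left - topleft| and |left + top - 2*topleft|; the saturating byte
; subtractions and BLEND masks pick the minimum without branching.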
por m7, m0 mova [rsp ], m6 mova [rsp+16], m7 movu m6, [tlq+17] psubusb m7, m5, m6 psubusb m0, m6, m5 por m7, m0 mova [rsp+32], m6 mova [rsp+48], m7 movu m6, [tlq+33] psubusb m7, m5, m6 psubusb m0, m6, m5 por m7, m0 mova [rsp+64], m6 mova [rsp+80], m7 movu m6, [tlq+49] psubusb m7, m5, m6 psubusb m0, m6, m5 por m7, m0 mova [rsp+96], m6 .w64_loop: dec tlq movd m3, [tlq] pxor m1, m1 pshufb m3, m1 mova m6, [rsp] PAETH 6, [rsp+16] mova [dstq ], m1 mova m6, [rsp+32] PAETH 6, [rsp+48] mova [dstq+16], m1 mova m6, [rsp+64] PAETH 6, [rsp+80] mova [dstq+32], m1 mova m6, [rsp+96] PAETH 6, 7 mova [dstq+48], m1 add dstq, strideq dec hd jg .w64_loop RET %macro FILTER 4 ;dst, src, tmp, shuf %ifnum %4 pshufb m%2, m%4 %else pshufb m%2, %4 %endif pshufd m%1, m%2, q0000 ;p0 p1 pmaddubsw m%1, m2 pshufd m%3, m%2, q1111 ;p2 p3 pmaddubsw m%3, m3 paddw m%1, [base+pw_8] paddw m%1, m%3 pshufd m%3, m%2, q2222 ;p4 p5 pmaddubsw m%3, m4 paddw m%1, m%3 pshufd m%3, m%2, q3333 ;p6 __ pmaddubsw m%3, m5 paddw m%1, m%3 psraw m%1, 4 packuswb m%1, m%1 %endmacro cglobal ipred_filter_8bpc, 3, 7, 8, dst, stride, tl, w, h, filter %define base r6-$$ LEA r6, $$ tzcnt wd, wm %ifidn filterd, filterm movzx filterd, filterb %else movzx filterd, byte filterm %endif shl filterd, 6 lea filterq, [base+filter_intra_taps+filterq] movq m0, [tlq-3] ;_ 6 5 0 1 2 3 4 movsxd wq, [base+ipred_filter_ssse3_table+wq*4] mova m2, [filterq+16*0] mova m3, [filterq+16*1] mova m4, [filterq+16*2] mova m5, [filterq+16*3] lea wq, [base+ipred_filter_ssse3_table+wq] mov hd, hm jmp wq .w4: mova m1, [base+filter_shuf1] sub tlq, 3 sub tlq, hq jmp .w4_loop_start .w4_loop: movd m0, [tlq+hq] punpckldq m0, m6 lea dstq, [dstq+strideq*2] .w4_loop_start: FILTER 6, 0, 7, 1 movd [dstq+strideq*0], m6 pshuflw m6, m6, q1032 movd [dstq+strideq*1], m6 sub hd, 2 jg .w4_loop RET ALIGN function_align .w8: movq m6, [tlq+1] ;_ _ _ 0 1 2 3 4 sub tlq, 5 sub tlq, hq .w8_loop: FILTER 7, 0, 1, [base+filter_shuf1] punpcklqdq m6, m7 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6 FILTER 0, 6, 1, [base+filter_shuf2] punpckldq m6, m7, m0 movq [dstq+strideq*0], m6 punpckhqdq m6, m6 movq [dstq+strideq*1], m6 movd m0, [tlq+hq] ;_ 6 5 0 punpckldq m0, m6 ;_ 6 5 0 1 2 3 4 lea dstq, [dstq+strideq*2] sub hd, 2 jg .w8_loop RET ALIGN function_align .w16: movu m6, [tlq+1] ;top row sub tlq, 5 sub tlq, hq .w16_loop: FILTER 7, 0, 1, [base+filter_shuf1] punpcklqdq m0, m6, m7 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6 movd [dstq+strideq*0], m7 psrlq m7, 32 palignr m7, m6, 4 FILTER 6, 0, 1, [base+filter_shuf2] punpcklqdq m0, m7, m6 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6 movd [dstq+4+strideq*0], m6 psrlq m6, 32 palignr m6, m7, 4 FILTER 7, 0, 1, [base+filter_shuf2] punpcklqdq m0, m6, m7 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6 movd [dstq+8+strideq*0], m7 psrlq m7, 32 palignr m7, m6, 4 FILTER 6, 0, 1, [base+filter_shuf2] movd [dstq+12+strideq*0], m6 psrlq m6, 32 palignr m6, m7, 4 mova [dstq+strideq*1], m6 movd m0, [tlq+hq] ;_ 6 5 0 punpckldq m0, m6 ;_ 6 5 0 1 2 3 4 lea dstq, [dstq+strideq*2] sub hd, 2 jg .w16_loop RET ALIGN function_align .w32: movu m6, [tlq+1] ;top row lea filterq, [tlq+17] sub tlq, 5 sub tlq, hq .w32_loop: FILTER 7, 0, 1, [base+filter_shuf1] punpcklqdq m0, m6, m7 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6 movd [dstq+strideq*0], m7 psrlq m7, 32 palignr m7, m6, 4 FILTER 6, 0, 1, [base+filter_shuf2] punpcklqdq m0, m7, m6 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6 movd [dstq+4+strideq*0], m6 psrlq m6, 32 palignr m6, m7, 4 FILTER 7, 0, 1, [base+filter_shuf2] punpcklqdq m0, m6, m7 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6 movd [dstq+8+strideq*0], m7 psrlq m7, 32 
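; ipred_filter_8bpc implements AV1 recursive filter intra: the FILTER
; macro predicts a 4x2 block from its 7 neighbours (p0..p6) with one of
; the signed 8-bit tap sets in filter_intra_taps, rounding with +8 and
; an arithmetic shift right by 4 before packing back to 8-bit. Wider
; blocks step through 4x2 sub-blocks left to right, feeding each
; sub-block's output back in as the neighbours of the next one.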
palignr m7, m6, 4 FILTER 6, 0, 1, [base+filter_shuf2] movu m1, [filterq] punpckldq m0, m7, m1 ;_ _ _ 0 1 2 3 4 _ _ _ _ _ _ _ _ punpcklqdq m0, m6 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6 movd [dstq+12+strideq*0], m6 psrlq m6, 32 palignr m6, m7, 4 mova [dstq+strideq*1], m6 mova m6, m1 FILTER 7, 0, 6, [base+filter_shuf2] punpcklqdq m0, m1, m7 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6 movd [dstq+16+strideq*0], m7 psrlq m7, 32 palignr m7, m1, 4 FILTER 6, 0, 1, [base+filter_shuf2] punpcklqdq m0, m7, m6 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6 movd [dstq+20+strideq*0], m6 psrlq m6, 32 palignr m6, m7, 4 FILTER 7, 0, 1, [base+filter_shuf2] punpcklqdq m0, m6, m7 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6 movd [dstq+24+strideq*0], m7 psrlq m7, 32 palignr m7, m6, 4 FILTER 6, 0, 1, [base+filter_shuf2] movd [dstq+28+strideq*0], m6 psrlq m6, 32 palignr m6, m7, 4 mova [dstq+16+strideq*1], m6 mova m6, [dstq+strideq*1] movd m0, [tlq+hq] ;_ 6 5 0 punpckldq m0, m6 ;_ 6 5 0 1 2 3 4 lea filterq, [dstq+16+strideq*1] lea dstq, [dstq+strideq*2] sub hd, 2 jg .w32_loop RET av-scenechange-0.14.1/src/asm/x86/mc16_avx2.asm000064400000000000000000006007441046102023000167750ustar 00000000000000; Copyright © 2021, VideoLAN and dav1d authors ; Copyright © 2021, Two Orioles, LLC ; All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions are met: ; ; 1. Redistributions of source code must retain the above copyright notice, this ; list of conditions and the following disclaimer. ; ; 2. Redistributions in binary form must reproduce the above copyright notice, ; this list of conditions and the following disclaimer in the documentation ; and/or other materials provided with the distribution. ; ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
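; mc16_avx2.asm: AVX2 motion compensation for high bit depth (16 bpc)
; pixels, following dav1d's usual split:
;   put_*  filters straight into the destination picture, rounding back
;          to pixel range and, where the filter can overshoot, clamping
;          against the bitdepth's pixel_max;
;   prep_* emits signed 16-bit intermediates for compound prediction,
;          scaled to a common precision for 10- and 12-bit input and
;          biased by -8192 (see prep_mul/pw_8192 and the *_rnd
;          constants below).
; Per-width dispatch uses the jump-table macros below: tzcnt of the
; block width indexes a table of offsets that is added to a base label
; and jumped to.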
%include "config.asm" %include "src/asm/x86/x86inc.asm" %if ARCH_X86_64 SECTION_RODATA 64 ; dav1d_obmc_masks[] * -512 const obmc_masks_avx2 dw 0, 0, -9728, 0, -12800, -7168, -2560, 0 dw -14336, -11264, -8192, -5632, -3584, -1536, 0, 0 dw -15360, -13824, -12288, -10752, -9216, -7680, -6144, -5120 dw -4096, -3072, -2048, -1536, 0, 0, 0, 0 dw -15872, -14848, -14336, -13312, -12288, -11776, -10752, -10240 dw -9728, -8704, -8192, -7168, -6656, -6144, -5632, -4608 dw -4096, -3584, -3072, -2560, -2048, -2048, -1536, -1024 dw 0, 0, 0, 0, 0, 0, 0, 0 deint_shuf: dd 0, 4, 1, 5, 2, 6, 3, 7 subpel_h_shufA: db 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9 subpel_h_shufB: db 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13 subpel_h_shuf2: db 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9 subpel_s_shuf2: db 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7 subpel_s_shuf8: db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 rescale_mul: dd 0, 1, 2, 3, 4, 5, 6, 7 rescale_mul2: dd 0, 1, 4, 5, 2, 3, 6, 7 resize_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7 db 8, 9, 10, 11, 12, 13, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15 blend_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3 wswap: db 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13 bdct_lb_q: times 8 db 0 times 8 db 4 times 8 db 8 times 8 db 12 prep_mul: dw 16, 16, 4, 4 put_bilin_h_rnd: dw 8, 8, 10, 10 put_8tap_h_rnd: dd 34, 40 s_8tap_h_rnd: dd 2, 8 s_8tap_h_sh: dd 2, 4 put_s_8tap_v_rnd: dd 512, 128 put_s_8tap_v_sh: dd 10, 8 prep_8tap_1d_rnd: dd 8 - (8192 << 4) prep_8tap_2d_rnd: dd 32 - (8192 << 5) warp8x8t_rnd: dd 16384 - (8192 << 15) warp8x8_shift: dd 5, 3 warp8x8_rnd: dw 4096, 4096, 16384, 16384 bidir_rnd: dw -16400, -16400, -16388, -16388 bidir_mul: dw 2048, 2048, 8192, 8192 %define pw_16 prep_mul %define pd_512 put_s_8tap_v_rnd pw_2: times 2 dw 2 pw_64: times 2 dw 64 pw_2048: times 2 dw 2048 pw_8192: times 2 dw 8192 pw_27615: times 2 dw 27615 pw_32766: times 2 dw 32766 pw_m512: times 2 dw -512 pd_32: dd 32 pd_63: dd 63 pd_64: dd 64 pd_32768: dd 32768 pd_65538: dd 65538 pd_m524256: dd -524256 ; -8192 << 6 + 32 pd_0x3ff: dd 0x3ff pq_0x40000000: dq 0x40000000 dd 0 %macro BIDIR_JMP_TABLE 2-* %xdefine %1_%2_table (%%table - 2*%3) %xdefine %%base %1_%2_table %xdefine %%prefix mangle(private_prefix %+ _%1_16bpc_%2) %%table: %rep %0 - 2 dd %%prefix %+ .w%3 - %%base %rotate 1 %endrep %endmacro BIDIR_JMP_TABLE avg, avx2, 4, 8, 16, 32, 64, 128 BIDIR_JMP_TABLE w_avg, avx2, 4, 8, 16, 32, 64, 128 BIDIR_JMP_TABLE mask, avx2, 4, 8, 16, 32, 64, 128 BIDIR_JMP_TABLE w_mask_420, avx2, 4, 8, 16, 32, 64, 128 BIDIR_JMP_TABLE w_mask_422, avx2, 4, 8, 16, 32, 64, 128 BIDIR_JMP_TABLE w_mask_444, avx2, 4, 8, 16, 32, 64, 128 BIDIR_JMP_TABLE blend, avx2, 4, 8, 16, 32 BIDIR_JMP_TABLE blend_v, avx2, 2, 4, 8, 16, 32 BIDIR_JMP_TABLE blend_h, avx2, 2, 4, 8, 16, 32, 64, 128 %macro BASE_JMP_TABLE 3-* %xdefine %1_%2_table (%%table - %3) %xdefine %%base %1_%2 %%table: %rep %0 - 2 dw %%base %+ _w%3 - %%base %rotate 1 %endrep %endmacro %xdefine put_avx2 mangle(private_prefix %+ _put_bilin_16bpc_avx2.put) %xdefine prep_avx2 mangle(private_prefix %+ _prep_bilin_16bpc_avx2.prep) BASE_JMP_TABLE put, avx2, 2, 4, 8, 16, 32, 64, 128 BASE_JMP_TABLE prep, avx2, 4, 8, 16, 32, 64, 128 %macro HV_JMP_TABLE 5-* %xdefine %%prefix mangle(private_prefix %+ _%1_%2_16bpc_%3) %xdefine %%base %1_%3 %assign %%types %4 %if %%types & 1 %xdefine %1_%2_h_%3_table (%%h - %5) %%h: %rep %0 - 4 dw %%prefix %+ .h_w%5 - %%base %rotate 1 %endrep %rotate 4 %endif %if %%types & 2 %xdefine 
%1_%2_v_%3_table (%%v - %5) %%v: %rep %0 - 4 dw %%prefix %+ .v_w%5 - %%base %rotate 1 %endrep %rotate 4 %endif %if %%types & 4 %xdefine %1_%2_hv_%3_table (%%hv - %5) %%hv: %rep %0 - 4 dw %%prefix %+ .hv_w%5 - %%base %rotate 1 %endrep %endif %endmacro HV_JMP_TABLE put, bilin, avx2, 7, 2, 4, 8, 16, 32, 64, 128 HV_JMP_TABLE prep, bilin, avx2, 7, 4, 8, 16, 32, 64, 128 %macro SCALED_JMP_TABLE 2-* %xdefine %1_%2_table (%%table - %3) %xdefine %%base mangle(private_prefix %+ _%1_16bpc_%2) %%table: %rep %0 - 2 dw %%base %+ .w%3 - %%base %rotate 1 %endrep %rotate 2 %%dy_1024: %xdefine %1_%2_dy1_table (%%dy_1024 - %3) %rep %0 - 2 dw %%base %+ .dy1_w%3 - %%base %rotate 1 %endrep %rotate 2 %%dy_2048: %xdefine %1_%2_dy2_table (%%dy_2048 - %3) %rep %0 - 2 dw %%base %+ .dy2_w%3 - %%base %rotate 1 %endrep %endmacro SCALED_JMP_TABLE put_8tap_scaled, avx2, 2, 4, 8, 16, 32, 64, 128 SCALED_JMP_TABLE prep_8tap_scaled, avx2, 4, 8, 16, 32, 64, 128 %define table_offset(type, fn) type %+ fn %+ SUFFIX %+ _table - type %+ SUFFIX cextern mc_subpel_filters %define subpel_filters (mangle(private_prefix %+ _mc_subpel_filters)-8) cextern mc_warp_filter cextern resize_filter SECTION .text INIT_XMM avx2 cglobal put_bilin_16bpc, 4, 8, 0, dst, ds, src, ss, w, h, mxy mov mxyd, r6m ; mx lea r7, [put_avx2] %if UNIX64 DECLARE_REG_TMP 8 %define org_w r8d mov r8d, wd %else DECLARE_REG_TMP 7 %define org_w wm %endif tzcnt wd, wm movifnidn hd, hm test mxyd, mxyd jnz .h mov mxyd, r7m ; my test mxyd, mxyd jnz .v .put: movzx wd, word [r7+wq*2+table_offset(put,)] add wq, r7 jmp wq .put_w2: mov r6d, [srcq+ssq*0] mov r7d, [srcq+ssq*1] lea srcq, [srcq+ssq*2] mov [dstq+dsq*0], r6d mov [dstq+dsq*1], r7d lea dstq, [dstq+dsq*2] sub hd, 2 jg .put_w2 RET .put_w4: mov r6, [srcq+ssq*0] mov r7, [srcq+ssq*1] lea srcq, [srcq+ssq*2] mov [dstq+dsq*0], r6 mov [dstq+dsq*1], r7 lea dstq, [dstq+dsq*2] sub hd, 2 jg .put_w4 RET .put_w8: movu m0, [srcq+ssq*0] movu m1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] mova [dstq+dsq*0], m0 mova [dstq+dsq*1], m1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .put_w8 RET INIT_YMM avx2 .put_w16: movu m0, [srcq+ssq*0] movu m1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] mova [dstq+dsq*0], m0 mova [dstq+dsq*1], m1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .put_w16 RET .put_w32: movu m0, [srcq+ssq*0+32*0] movu m1, [srcq+ssq*0+32*1] movu m2, [srcq+ssq*1+32*0] movu m3, [srcq+ssq*1+32*1] lea srcq, [srcq+ssq*2] mova [dstq+dsq*0+32*0], m0 mova [dstq+dsq*0+32*1], m1 mova [dstq+dsq*1+32*0], m2 mova [dstq+dsq*1+32*1], m3 lea dstq, [dstq+dsq*2] sub hd, 2 jg .put_w32 RET .put_w64: movu m0, [srcq+32*0] movu m1, [srcq+32*1] movu m2, [srcq+32*2] movu m3, [srcq+32*3] add srcq, ssq mova [dstq+32*0], m0 mova [dstq+32*1], m1 mova [dstq+32*2], m2 mova [dstq+32*3], m3 add dstq, dsq dec hd jg .put_w64 RET .put_w128: movu m0, [srcq+32*0] movu m1, [srcq+32*1] movu m2, [srcq+32*2] movu m3, [srcq+32*3] mova [dstq+32*0], m0 mova [dstq+32*1], m1 mova [dstq+32*2], m2 mova [dstq+32*3], m3 movu m0, [srcq+32*4] movu m1, [srcq+32*5] movu m2, [srcq+32*6] movu m3, [srcq+32*7] add srcq, ssq mova [dstq+32*4], m0 mova [dstq+32*5], m1 mova [dstq+32*6], m2 mova [dstq+32*7], m3 add dstq, dsq dec hd jg .put_w128 RET .h: movd xm5, mxyd mov mxyd, r7m ; my vpbroadcastd m4, [pw_16] vpbroadcastw m5, xm5 psubw m4, m5 test mxyd, mxyd jnz .hv ; 12-bit is rounded twice so we can't use the same pmulhrsw approach as .v movzx wd, word [r7+wq*2+table_offset(put, _bilin_h)] mov r6d, r8m ; bitdepth_max add wq, r7 shr r6d, 11 vpbroadcastd m3, [r7-put_avx2+put_bilin_h_rnd+r6*4] jmp wq .h_w2: movq xm1, 
[srcq+ssq*0] movhps xm1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] pmullw xm0, xm4, xm1 psrlq xm1, 16 pmullw xm1, xm5 paddw xm0, xm3 paddw xm0, xm1 psrlw xm0, 4 movd [dstq+dsq*0], xm0 pextrd [dstq+dsq*1], xm0, 2 lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w2 RET .h_w4: movq xm0, [srcq+ssq*0] movhps xm0, [srcq+ssq*1] movq xm1, [srcq+ssq*0+2] movhps xm1, [srcq+ssq*1+2] lea srcq, [srcq+ssq*2] pmullw xm0, xm4 pmullw xm1, xm5 paddw xm0, xm3 paddw xm0, xm1 psrlw xm0, 4 movq [dstq+dsq*0], xm0 movhps [dstq+dsq*1], xm0 lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w4 RET .h_w8: movu xm0, [srcq+ssq*0] vinserti128 m0, [srcq+ssq*1], 1 movu xm1, [srcq+ssq*0+2] vinserti128 m1, [srcq+ssq*1+2], 1 lea srcq, [srcq+ssq*2] pmullw m0, m4 pmullw m1, m5 paddw m0, m3 paddw m0, m1 psrlw m0, 4 mova [dstq+dsq*0], xm0 vextracti128 [dstq+dsq*1], m0, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w8 RET .h_w16: pmullw m0, m4, [srcq+ssq*0] pmullw m1, m5, [srcq+ssq*0+2] paddw m0, m3 paddw m0, m1 pmullw m1, m4, [srcq+ssq*1] pmullw m2, m5, [srcq+ssq*1+2] lea srcq, [srcq+ssq*2] paddw m1, m3 paddw m1, m2 psrlw m0, 4 psrlw m1, 4 mova [dstq+dsq*0], m0 mova [dstq+dsq*1], m1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w16 RET .h_w32: pmullw m0, m4, [srcq+32*0] pmullw m1, m5, [srcq+32*0+2] paddw m0, m3 paddw m0, m1 pmullw m1, m4, [srcq+32*1] pmullw m2, m5, [srcq+32*1+2] add srcq, ssq paddw m1, m3 paddw m1, m2 psrlw m0, 4 psrlw m1, 4 mova [dstq+32*0], m0 mova [dstq+32*1], m1 add dstq, dsq dec hd jg .h_w32 RET .h_w64: .h_w128: movifnidn t0d, org_w .h_w64_loop0: mov r6d, t0d .h_w64_loop: pmullw m0, m4, [srcq+r6*2-32*1] pmullw m1, m5, [srcq+r6*2-32*1+2] paddw m0, m3 paddw m0, m1 pmullw m1, m4, [srcq+r6*2-32*2] pmullw m2, m5, [srcq+r6*2-32*2+2] paddw m1, m3 paddw m1, m2 psrlw m0, 4 psrlw m1, 4 mova [dstq+r6*2-32*1], m0 mova [dstq+r6*2-32*2], m1 sub r6d, 32 jg .h_w64_loop add srcq, ssq add dstq, dsq dec hd jg .h_w64_loop0 RET .v: movzx wd, word [r7+wq*2+table_offset(put, _bilin_v)] shl mxyd, 11 movd xm5, mxyd add wq, r7 vpbroadcastw m5, xm5 jmp wq .v_w2: movd xm0, [srcq+ssq*0] .v_w2_loop: movd xm1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] punpckldq xm2, xm0, xm1 movd xm0, [srcq+ssq*0] punpckldq xm1, xm0 psubw xm1, xm2 pmulhrsw xm1, xm5 paddw xm1, xm2 movd [dstq+dsq*0], xm1 pextrd [dstq+dsq*1], xm1, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w2_loop RET .v_w4: movq xm0, [srcq+ssq*0] .v_w4_loop: movq xm1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] punpcklqdq xm2, xm0, xm1 movq xm0, [srcq+ssq*0] punpcklqdq xm1, xm0 psubw xm1, xm2 pmulhrsw xm1, xm5 paddw xm1, xm2 movq [dstq+dsq*0], xm1 movhps [dstq+dsq*1], xm1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w4_loop RET .v_w8: movu xm0, [srcq+ssq*0] .v_w8_loop: vbroadcasti128 m1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] vpblendd m2, m0, m1, 0xf0 vbroadcasti128 m0, [srcq+ssq*0] vpblendd m1, m0, 0xf0 psubw m1, m2 pmulhrsw m1, m5 paddw m1, m2 mova [dstq+dsq*0], xm1 vextracti128 [dstq+dsq*1], m1, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w8_loop RET .v_w32: movu m0, [srcq+ssq*0+32*0] movu m1, [srcq+ssq*0+32*1] .v_w32_loop: movu m2, [srcq+ssq*1+32*0] movu m3, [srcq+ssq*1+32*1] lea srcq, [srcq+ssq*2] psubw m4, m2, m0 pmulhrsw m4, m5 paddw m4, m0 movu m0, [srcq+ssq*0+32*0] mova [dstq+dsq*0+32*0], m4 psubw m4, m3, m1 pmulhrsw m4, m5 paddw m4, m1 movu m1, [srcq+ssq*0+32*1] mova [dstq+dsq*0+32*1], m4 psubw m4, m0, m2 pmulhrsw m4, m5 paddw m4, m2 mova [dstq+dsq*1+32*0], m4 psubw m4, m1, m3 pmulhrsw m4, m5 paddw m4, m3 mova [dstq+dsq*1+32*1], m4 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w32_loop RET .v_w16: .v_w64: .v_w128: movifnidn t0d, org_w add t0d, t0d mov 
r4, srcq lea r6d, [hq+t0*8-256] mov r7, dstq .v_w16_loop0: movu m0, [srcq+ssq*0] .v_w16_loop: movu m3, [srcq+ssq*1] lea srcq, [srcq+ssq*2] psubw m1, m3, m0 pmulhrsw m1, m5 paddw m1, m0 movu m0, [srcq+ssq*0] psubw m2, m0, m3 pmulhrsw m2, m5 paddw m2, m3 mova [dstq+dsq*0], m1 mova [dstq+dsq*1], m2 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w16_loop add r4, 32 add r7, 32 movzx hd, r6b mov srcq, r4 mov dstq, r7 sub r6d, 1<<8 jg .v_w16_loop0 RET .hv: movzx wd, word [r7+wq*2+table_offset(put, _bilin_hv)] WIN64_SPILL_XMM 8 shl mxyd, 11 vpbroadcastd m3, [pw_2] movd xm6, mxyd vpbroadcastd m7, [pw_8192] add wq, r7 vpbroadcastw m6, xm6 test dword r8m, 0x800 jnz .hv_12bpc psllw m4, 2 psllw m5, 2 vpbroadcastd m7, [pw_2048] .hv_12bpc: jmp wq .hv_w2: vpbroadcastq xm1, [srcq+ssq*0] pmullw xm0, xm4, xm1 psrlq xm1, 16 pmullw xm1, xm5 paddw xm0, xm3 paddw xm0, xm1 psrlw xm0, 2 .hv_w2_loop: movq xm2, [srcq+ssq*1] lea srcq, [srcq+ssq*2] movhps xm2, [srcq+ssq*0] pmullw xm1, xm4, xm2 psrlq xm2, 16 pmullw xm2, xm5 paddw xm1, xm3 paddw xm1, xm2 psrlw xm1, 2 ; 1 _ 2 _ shufpd xm2, xm0, xm1, 0x01 ; 0 _ 1 _ mova xm0, xm1 psubw xm1, xm2 paddw xm1, xm1 pmulhw xm1, xm6 paddw xm1, xm2 pmulhrsw xm1, xm7 movd [dstq+dsq*0], xm1 pextrd [dstq+dsq*1], xm1, 2 lea dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w2_loop RET .hv_w4: pmullw xm0, xm4, [srcq+ssq*0-8] pmullw xm1, xm5, [srcq+ssq*0-6] paddw xm0, xm3 paddw xm0, xm1 psrlw xm0, 2 .hv_w4_loop: movq xm1, [srcq+ssq*1] movq xm2, [srcq+ssq*1+2] lea srcq, [srcq+ssq*2] movhps xm1, [srcq+ssq*0] movhps xm2, [srcq+ssq*0+2] pmullw xm1, xm4 pmullw xm2, xm5 paddw xm1, xm3 paddw xm1, xm2 psrlw xm1, 2 ; 1 2 shufpd xm2, xm0, xm1, 0x01 ; 0 1 mova xm0, xm1 psubw xm1, xm2 paddw xm1, xm1 pmulhw xm1, xm6 paddw xm1, xm2 pmulhrsw xm1, xm7 movq [dstq+dsq*0], xm1 movhps [dstq+dsq*1], xm1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w4_loop RET .hv_w8: pmullw xm0, xm4, [srcq+ssq*0] pmullw xm1, xm5, [srcq+ssq*0+2] paddw xm0, xm3 paddw xm0, xm1 psrlw xm0, 2 vinserti128 m0, xm0, 1 .hv_w8_loop: movu xm1, [srcq+ssq*1] movu xm2, [srcq+ssq*1+2] lea srcq, [srcq+ssq*2] vinserti128 m1, [srcq+ssq*0], 1 vinserti128 m2, [srcq+ssq*0+2], 1 pmullw m1, m4 pmullw m2, m5 paddw m1, m3 paddw m1, m2 psrlw m1, 2 ; 1 2 vperm2i128 m2, m0, m1, 0x21 ; 0 1 mova m0, m1 psubw m1, m2 paddw m1, m1 pmulhw m1, m6 paddw m1, m2 pmulhrsw m1, m7 mova [dstq+dsq*0], xm1 vextracti128 [dstq+dsq*1], m1, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w8_loop RET .hv_w16: .hv_w32: .hv_w64: .hv_w128: %if UNIX64 lea r6d, [r8*2-32] %else mov r6d, wm lea r6d, [r6*2-32] %endif mov r4, srcq lea r6d, [hq+r6*8] mov r7, dstq .hv_w16_loop0: pmullw m0, m4, [srcq+ssq*0] pmullw m1, m5, [srcq+ssq*0+2] paddw m0, m3 paddw m0, m1 psrlw m0, 2 .hv_w16_loop: pmullw m1, m4, [srcq+ssq*1] pmullw m2, m5, [srcq+ssq*1+2] lea srcq, [srcq+ssq*2] paddw m1, m3 paddw m1, m2 psrlw m1, 2 psubw m2, m1, m0 paddw m2, m2 pmulhw m2, m6 paddw m2, m0 pmulhrsw m2, m7 mova [dstq+dsq*0], m2 pmullw m0, m4, [srcq+ssq*0] pmullw m2, m5, [srcq+ssq*0+2] paddw m0, m3 paddw m0, m2 psrlw m0, 2 psubw m2, m0, m1 paddw m2, m2 pmulhw m2, m6 paddw m2, m1 pmulhrsw m2, m7 mova [dstq+dsq*1], m2 lea dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w16_loop add r4, 32 add r7, 32 movzx hd, r6b mov srcq, r4 mov dstq, r7 sub r6d, 1<<8 jg .hv_w16_loop0 RET cglobal prep_bilin_16bpc, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 movifnidn mxyd, r5m ; mx lea r6, [prep_avx2] %if UNIX64 DECLARE_REG_TMP 7 %define org_w r7d %else DECLARE_REG_TMP 6 %define org_w r5m %endif mov org_w, wd tzcnt wd, wm movifnidn hd, hm test mxyd, mxyd jnz .h mov mxyd, r6m ; 
my test mxyd, mxyd jnz .v .prep: movzx wd, word [r6+wq*2+table_offset(prep,)] mov r5d, r7m ; bitdepth_max vpbroadcastd m5, [r6-prep_avx2+pw_8192] add wq, r6 shr r5d, 11 vpbroadcastd m4, [r6-prep_avx2+prep_mul+r5*4] lea stride3q, [strideq*3] jmp wq .prep_w4: movq xm0, [srcq+strideq*0] movhps xm0, [srcq+strideq*1] vpbroadcastq m1, [srcq+strideq*2] vpbroadcastq m2, [srcq+stride3q ] lea srcq, [srcq+strideq*4] vpblendd m0, m1, 0x30 vpblendd m0, m2, 0xc0 pmullw m0, m4 psubw m0, m5 mova [tmpq], m0 add tmpq, 32 sub hd, 4 jg .prep_w4 RET .prep_w8: movu xm0, [srcq+strideq*0] vinserti128 m0, [srcq+strideq*1], 1 movu xm1, [srcq+strideq*2] vinserti128 m1, [srcq+stride3q ], 1 lea srcq, [srcq+strideq*4] pmullw m0, m4 pmullw m1, m4 psubw m0, m5 psubw m1, m5 mova [tmpq+32*0], m0 mova [tmpq+32*1], m1 add tmpq, 32*2 sub hd, 4 jg .prep_w8 RET .prep_w16: pmullw m0, m4, [srcq+strideq*0] pmullw m1, m4, [srcq+strideq*1] pmullw m2, m4, [srcq+strideq*2] pmullw m3, m4, [srcq+stride3q ] lea srcq, [srcq+strideq*4] psubw m0, m5 psubw m1, m5 psubw m2, m5 psubw m3, m5 mova [tmpq+32*0], m0 mova [tmpq+32*1], m1 mova [tmpq+32*2], m2 mova [tmpq+32*3], m3 add tmpq, 32*4 sub hd, 4 jg .prep_w16 RET .prep_w32: pmullw m0, m4, [srcq+strideq*0+32*0] pmullw m1, m4, [srcq+strideq*0+32*1] pmullw m2, m4, [srcq+strideq*1+32*0] pmullw m3, m4, [srcq+strideq*1+32*1] lea srcq, [srcq+strideq*2] psubw m0, m5 psubw m1, m5 psubw m2, m5 psubw m3, m5 mova [tmpq+32*0], m0 mova [tmpq+32*1], m1 mova [tmpq+32*2], m2 mova [tmpq+32*3], m3 add tmpq, 32*4 sub hd, 2 jg .prep_w32 RET .prep_w64: pmullw m0, m4, [srcq+32*0] pmullw m1, m4, [srcq+32*1] pmullw m2, m4, [srcq+32*2] pmullw m3, m4, [srcq+32*3] add srcq, strideq psubw m0, m5 psubw m1, m5 psubw m2, m5 psubw m3, m5 mova [tmpq+32*0], m0 mova [tmpq+32*1], m1 mova [tmpq+32*2], m2 mova [tmpq+32*3], m3 add tmpq, 32*4 dec hd jg .prep_w64 RET .prep_w128: pmullw m0, m4, [srcq+32*0] pmullw m1, m4, [srcq+32*1] pmullw m2, m4, [srcq+32*2] pmullw m3, m4, [srcq+32*3] psubw m0, m5 psubw m1, m5 psubw m2, m5 psubw m3, m5 mova [tmpq+32*0], m0 mova [tmpq+32*1], m1 mova [tmpq+32*2], m2 mova [tmpq+32*3], m3 pmullw m0, m4, [srcq+32*4] pmullw m1, m4, [srcq+32*5] pmullw m2, m4, [srcq+32*6] pmullw m3, m4, [srcq+32*7] add tmpq, 32*8 add srcq, strideq psubw m0, m5 psubw m1, m5 psubw m2, m5 psubw m3, m5 mova [tmpq-32*4], m0 mova [tmpq-32*3], m1 mova [tmpq-32*2], m2 mova [tmpq-32*1], m3 dec hd jg .prep_w128 RET .h: movd xm5, mxyd mov mxyd, r6m ; my vpbroadcastd m4, [pw_16] vpbroadcastw m5, xm5 vpbroadcastd m3, [pw_32766] psubw m4, m5 test dword r7m, 0x800 jnz .h_12bpc psllw m4, 2 psllw m5, 2 .h_12bpc: test mxyd, mxyd jnz .hv movzx wd, word [r6+wq*2+table_offset(prep, _bilin_h)] add wq, r6 lea stride3q, [strideq*3] jmp wq .h_w4: movu xm1, [srcq+strideq*0] vinserti128 m1, [srcq+strideq*2], 1 movu xm2, [srcq+strideq*1] vinserti128 m2, [srcq+stride3q ], 1 lea srcq, [srcq+strideq*4] punpcklqdq m0, m1, m2 psrldq m1, 2 pslldq m2, 6 pmullw m0, m4 vpblendd m1, m2, 0xcc pmullw m1, m5 psubw m0, m3 paddw m0, m1 psraw m0, 2 mova [tmpq], m0 add tmpq, 32 sub hd, 4 jg .h_w4 RET .h_w8: movu xm0, [srcq+strideq*0] vinserti128 m0, [srcq+strideq*1], 1 movu xm1, [srcq+strideq*0+2] vinserti128 m1, [srcq+strideq*1+2], 1 lea srcq, [srcq+strideq*2] pmullw m0, m4 pmullw m1, m5 psubw m0, m3 paddw m0, m1 psraw m0, 2 mova [tmpq], m0 add tmpq, 32 sub hd, 2 jg .h_w8 RET .h_w16: pmullw m0, m4, [srcq+strideq*0] pmullw m1, m5, [srcq+strideq*0+2] psubw m0, m3 paddw m0, m1 pmullw m1, m4, [srcq+strideq*1] pmullw m2, m5, [srcq+strideq*1+2] lea srcq, [srcq+strideq*2] 
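; prep_bilin stores a biased intermediate: the weights (16-mx, mx) are
; pre-shifted for 10-bit input so 10- and 12-bit content reach the same
; intermediate precision, and subtracting pw_32766 before the >>2 is in
; effect the same -8192 bias applied by the unfiltered .prep_w* path
; (px * prep_mul - pw_8192).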
psubw m1, m3 paddw m1, m2 psraw m0, 2 psraw m1, 2 mova [tmpq+32*0], m0 mova [tmpq+32*1], m1 add tmpq, 32*2 sub hd, 2 jg .h_w16 RET .h_w32: .h_w64: .h_w128: movifnidn t0d, org_w .h_w32_loop0: mov r3d, t0d .h_w32_loop: pmullw m0, m4, [srcq+r3*2-32*1] pmullw m1, m5, [srcq+r3*2-32*1+2] psubw m0, m3 paddw m0, m1 pmullw m1, m4, [srcq+r3*2-32*2] pmullw m2, m5, [srcq+r3*2-32*2+2] psubw m1, m3 paddw m1, m2 psraw m0, 2 psraw m1, 2 mova [tmpq+r3*2-32*1], m0 mova [tmpq+r3*2-32*2], m1 sub r3d, 32 jg .h_w32_loop add srcq, strideq lea tmpq, [tmpq+t0*2] dec hd jg .h_w32_loop0 RET .v: movzx wd, word [r6+wq*2+table_offset(prep, _bilin_v)] movd xm5, mxyd vpbroadcastd m4, [pw_16] vpbroadcastw m5, xm5 vpbroadcastd m3, [pw_32766] add wq, r6 lea stride3q, [strideq*3] psubw m4, m5 test dword r7m, 0x800 jnz .v_12bpc psllw m4, 2 psllw m5, 2 .v_12bpc: jmp wq .v_w4: movq xm0, [srcq+strideq*0] .v_w4_loop: vpbroadcastq m2, [srcq+strideq*2] vpbroadcastq xm1, [srcq+strideq*1] vpblendd m2, m0, 0x03 ; 0 2 2 2 vpbroadcastq m0, [srcq+stride3q ] lea srcq, [srcq+strideq*4] vpblendd m1, m0, 0xf0 ; 1 1 3 3 vpbroadcastq m0, [srcq+strideq*0] vpblendd m1, m2, 0x33 ; 0 1 2 3 vpblendd m0, m2, 0x0c ; 4 2 4 4 punpckhqdq m2, m1, m0 ; 1 2 3 4 pmullw m1, m4 pmullw m2, m5 psubw m1, m3 paddw m1, m2 psraw m1, 2 mova [tmpq], m1 add tmpq, 32 sub hd, 4 jg .v_w4_loop RET .v_w8: movu xm0, [srcq+strideq*0] .v_w8_loop: vbroadcasti128 m2, [srcq+strideq*1] lea srcq, [srcq+strideq*2] vpblendd m1, m0, m2, 0xf0 ; 0 1 vbroadcasti128 m0, [srcq+strideq*0] vpblendd m2, m0, 0xf0 ; 1 2 pmullw m1, m4 pmullw m2, m5 psubw m1, m3 paddw m1, m2 psraw m1, 2 mova [tmpq], m1 add tmpq, 32 sub hd, 2 jg .v_w8_loop RET .v_w16: movu m0, [srcq+strideq*0] .v_w16_loop: movu m2, [srcq+strideq*1] lea srcq, [srcq+strideq*2] pmullw m0, m4 pmullw m1, m5, m2 psubw m0, m3 paddw m1, m0 movu m0, [srcq+strideq*0] psraw m1, 2 pmullw m2, m4 mova [tmpq+32*0], m1 pmullw m1, m5, m0 psubw m2, m3 paddw m1, m2 psraw m1, 2 mova [tmpq+32*1], m1 add tmpq, 32*2 sub hd, 2 jg .v_w16_loop RET .v_w32: .v_w64: .v_w128: %if WIN64 PUSH r7 %endif movifnidn r7d, org_w add r7d, r7d mov r3, srcq lea r6d, [hq+r7*8-256] mov r5, tmpq .v_w32_loop0: movu m0, [srcq+strideq*0] .v_w32_loop: movu m2, [srcq+strideq*1] lea srcq, [srcq+strideq*2] pmullw m0, m4 pmullw m1, m5, m2 psubw m0, m3 paddw m1, m0 movu m0, [srcq+strideq*0] psraw m1, 2 pmullw m2, m4 mova [tmpq+r7*0], m1 pmullw m1, m5, m0 psubw m2, m3 paddw m1, m2 psraw m1, 2 mova [tmpq+r7*1], m1 lea tmpq, [tmpq+r7*2] sub hd, 2 jg .v_w32_loop add r3, 32 add r5, 32 movzx hd, r6b mov srcq, r3 mov tmpq, r5 sub r6d, 1<<8 jg .v_w32_loop0 %if WIN64 POP r7 %endif RET .hv: WIN64_SPILL_XMM 7 movzx wd, word [r6+wq*2+table_offset(prep, _bilin_hv)] shl mxyd, 11 movd xm6, mxyd add wq, r6 lea stride3q, [strideq*3] vpbroadcastw m6, xm6 jmp wq .hv_w4: movu xm1, [srcq+strideq*0] %if WIN64 movaps [rsp+24], xmm7 %endif pmullw xm0, xm4, xm1 psrldq xm1, 2 pmullw xm1, xm5 psubw xm0, xm3 paddw xm0, xm1 psraw xm0, 2 vpbroadcastq m0, xm0 .hv_w4_loop: movu xm1, [srcq+strideq*1] vinserti128 m1, [srcq+stride3q ], 1 movu xm2, [srcq+strideq*2] lea srcq, [srcq+strideq*4] vinserti128 m2, [srcq+strideq*0], 1 punpcklqdq m7, m1, m2 psrldq m1, 2 pslldq m2, 6 pmullw m7, m4 vpblendd m1, m2, 0xcc pmullw m1, m5 psubw m7, m3 paddw m1, m7 psraw m1, 2 ; 1 2 3 4 vpblendd m0, m1, 0x3f vpermq m2, m0, q2103 ; 0 1 2 3 mova m0, m1 psubw m1, m2 pmulhrsw m1, m6 paddw m1, m2 mova [tmpq], m1 add tmpq, 32 sub hd, 4 jg .hv_w4_loop %if WIN64 movaps xmm7, [rsp+24] %endif RET .hv_w8: pmullw xm0, xm4, [srcq+strideq*0] 
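; .hv_*: separable bilinear. Each row is filtered horizontally first;
; the vertical step then interpolates between the previous and current
; filtered rows with pmulhrsw against my << 11, i.e. roughly
; row0 + (((row1 - row0) * my + 8) >> 4) with pmulhrsw's built-in
; rounding.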
pmullw xm1, xm5, [srcq+strideq*0+2] psubw xm0, xm3 paddw xm0, xm1 psraw xm0, 2 vinserti128 m0, xm0, 1 .hv_w8_loop: movu xm1, [srcq+strideq*1] movu xm2, [srcq+strideq*1+2] lea srcq, [srcq+strideq*2] vinserti128 m1, [srcq+strideq*0], 1 vinserti128 m2, [srcq+strideq*0+2], 1 pmullw m1, m4 pmullw m2, m5 psubw m1, m3 paddw m1, m2 psraw m1, 2 ; 1 2 vperm2i128 m2, m0, m1, 0x21 ; 0 1 mova m0, m1 psubw m1, m2 pmulhrsw m1, m6 paddw m1, m2 mova [tmpq], m1 add tmpq, 32 sub hd, 2 jg .hv_w8_loop RET .hv_w16: .hv_w32: .hv_w64: .hv_w128: %if WIN64 PUSH r7 %endif movifnidn r7d, org_w add r7d, r7d mov r3, srcq lea r6d, [hq+r7*8-256] mov r5, tmpq .hv_w16_loop0: pmullw m0, m4, [srcq] pmullw m1, m5, [srcq+2] psubw m0, m3 paddw m0, m1 psraw m0, 2 .hv_w16_loop: pmullw m1, m4, [srcq+strideq*1] pmullw m2, m5, [srcq+strideq*1+2] lea srcq, [srcq+strideq*2] psubw m1, m3 paddw m1, m2 psraw m1, 2 psubw m2, m1, m0 pmulhrsw m2, m6 paddw m2, m0 mova [tmpq+r7*0], m2 pmullw m0, m4, [srcq+strideq*0] pmullw m2, m5, [srcq+strideq*0+2] psubw m0, m3 paddw m0, m2 psraw m0, 2 psubw m2, m0, m1 pmulhrsw m2, m6 paddw m2, m1 mova [tmpq+r7*1], m2 lea tmpq, [tmpq+r7*2] sub hd, 2 jg .hv_w16_loop add r3, 32 add r5, 32 movzx hd, r6b mov srcq, r3 mov tmpq, r5 sub r6d, 1<<8 jg .hv_w16_loop0 %if WIN64 POP r7 %endif RET ; int8_t subpel_filters[5][15][8] %assign FILTER_REGULAR (0*15 << 16) | 3*15 %assign FILTER_SMOOTH (1*15 << 16) | 4*15 %assign FILTER_SHARP (2*15 << 16) | 3*15 %macro FN 4 ; prefix, type, type_h, type_v cglobal %1_%2_16bpc mov t0d, FILTER_%3 %ifidn %3, %4 mov t1d, t0d %else mov t1d, FILTER_%4 %endif %ifnidn %2, regular ; skip the jump in the last filter jmp mangle(private_prefix %+ _%1_16bpc %+ SUFFIX) %endif %endmacro %if WIN64 DECLARE_REG_TMP 4, 5 %else DECLARE_REG_TMP 7, 8 %endif %define PUT_8TAP_FN FN put_8tap, PUT_8TAP_FN sharp, SHARP, SHARP PUT_8TAP_FN sharp_smooth, SHARP, SMOOTH PUT_8TAP_FN smooth_sharp, SMOOTH, SHARP PUT_8TAP_FN smooth, SMOOTH, SMOOTH PUT_8TAP_FN sharp_regular, SHARP, REGULAR PUT_8TAP_FN regular_sharp, REGULAR, SHARP PUT_8TAP_FN smooth_regular, SMOOTH, REGULAR PUT_8TAP_FN regular_smooth, REGULAR, SMOOTH PUT_8TAP_FN regular, REGULAR, REGULAR cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my %define base r8-put_avx2 imul mxd, mxm, 0x010101 add mxd, t0d ; 8tap_h, mx, 4tap_h imul myd, mym, 0x010101 add myd, t1d ; 8tap_v, my, 4tap_v lea r8, [put_avx2] movifnidn wd, wm movifnidn hd, hm test mxd, 0xf00 jnz .h test myd, 0xf00 jnz .v tzcnt wd, wd movzx wd, word [r8+wq*2+table_offset(put,)] add wq, r8 %if WIN64 pop r8 %endif jmp wq .h_w2: movzx mxd, mxb sub srcq, 2 mova xm2, [subpel_h_shuf2] vpbroadcastd xm3, [base+subpel_filters+mxq*8+2] pmovsxbw xm3, xm3 .h_w2_loop: movu xm0, [srcq+ssq*0] movu xm1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] pshufb xm0, xm2 pshufb xm1, xm2 pmaddwd xm0, xm3 pmaddwd xm1, xm3 phaddd xm0, xm1 paddd xm0, xm4 psrad xm0, 6 packusdw xm0, xm0 pminsw xm0, xm5 movd [dstq+dsq*0], xm0 pextrd [dstq+dsq*1], xm0, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w2_loop RET .h_w4: movzx mxd, mxb sub srcq, 2 pmovsxbw xm3, [base+subpel_filters+mxq*8] WIN64_SPILL_XMM 8 vbroadcasti128 m6, [subpel_h_shufA] vbroadcasti128 m7, [subpel_h_shufB] pshufd xm3, xm3, q2211 vpbroadcastq m2, xm3 vpermq m3, m3, q1111 .h_w4_loop: movu xm1, [srcq+ssq*0] vinserti128 m1, [srcq+ssq*1], 1 lea srcq, [srcq+ssq*2] pshufb m0, m1, m6 ; 0 1 1 2 2 3 3 4 pshufb m1, m7 ; 2 3 3 4 4 5 5 6 pmaddwd m0, m2 pmaddwd m1, m3 paddd m0, m4 paddd m0, m1 psrad m0, 6 vextracti128 xm1, m0, 1 packusdw xm0, xm1 pminsw xm0, xm5 movq [dstq+dsq*0], 
xm0 movhps [dstq+dsq*1], xm0 lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w4_loop RET .h: test myd, 0xf00 jnz .hv mov r7d, r8m vpbroadcastw m5, r8m shr r7d, 11 vpbroadcastd m4, [base+put_8tap_h_rnd+r7*4] cmp wd, 4 je .h_w4 jl .h_w2 %assign stack_offset stack_offset - stack_size_padded WIN64_SPILL_XMM 13 shr mxd, 16 sub srcq, 6 vpbroadcastq m0, [base+subpel_filters+mxq*8] vbroadcasti128 m6, [subpel_h_shufA] vbroadcasti128 m7, [subpel_h_shufB] punpcklbw m0, m0 psraw m0, 8 ; sign-extend pshufd m8, m0, q0000 pshufd m9, m0, q1111 pshufd m10, m0, q2222 pshufd m11, m0, q3333 cmp wd, 8 jg .h_w16 .h_w8: %macro PUT_8TAP_H 5 ; dst/src+0, src+8, src+16, tmp[1-2] pshufb m%4, m%1, m7 ; 2 3 3 4 4 5 5 6 pshufb m%1, m6 ; 0 1 1 2 2 3 3 4 pmaddwd m%5, m9, m%4 ; abcd1 pmaddwd m%1, m8 ; abcd0 pshufb m%2, m7 ; 6 7 7 8 8 9 9 a shufpd m%4, m%2, 0x05 ; 4 5 5 6 6 7 7 8 paddd m%5, m4 paddd m%1, m%5 pmaddwd m%5, m11, m%2 ; abcd3 paddd m%1, m%5 pmaddwd m%5, m10, m%4 ; abcd2 pshufb m%3, m7 ; a b b c c d d e pmaddwd m%4, m8 ; efgh0 paddd m%1, m%5 pmaddwd m%5, m9, m%2 ; efgh1 shufpd m%2, m%3, 0x05 ; 8 9 9 a a b b c pmaddwd m%3, m11 ; efgh3 pmaddwd m%2, m10 ; efgh2 paddd m%4, m4 paddd m%4, m%5 paddd m%3, m%4 paddd m%2, m%3 psrad m%1, 6 psrad m%2, 6 packusdw m%1, m%2 pminsw m%1, m5 %endmacro movu xm0, [srcq+ssq*0+ 0] vinserti128 m0, [srcq+ssq*1+ 0], 1 movu xm2, [srcq+ssq*0+16] vinserti128 m2, [srcq+ssq*1+16], 1 lea srcq, [srcq+ssq*2] shufpd m1, m0, m2, 0x05 PUT_8TAP_H 0, 1, 2, 3, 12 mova [dstq+dsq*0], xm0 vextracti128 [dstq+dsq*1], m0, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w8 RET .h_w16: mov r6d, wd .h_w16_loop: movu m0, [srcq+r6*2-32] movu m1, [srcq+r6*2-24] movu m2, [srcq+r6*2-16] PUT_8TAP_H 0, 1, 2, 3, 12 mova [dstq+r6*2-32], m0 sub r6d, 16 jg .h_w16_loop add srcq, ssq add dstq, dsq dec hd jg .h_w16 RET .v: movzx mxd, myb shr myd, 16 cmp hd, 4 cmovle myd, mxd vpbroadcastq m0, [base+subpel_filters+myq*8] %assign stack_offset stack_offset - stack_size_padded WIN64_SPILL_XMM 15 vpbroadcastd m6, [pd_32] vpbroadcastw m7, r8m lea r6, [ssq*3] sub srcq, r6 punpcklbw m0, m0 psraw m0, 8 ; sign-extend pshufd m8, m0, q0000 pshufd m9, m0, q1111 pshufd m10, m0, q2222 pshufd m11, m0, q3333 cmp wd, 4 jg .v_w8 je .v_w4 .v_w2: movd xm2, [srcq+ssq*0] pinsrd xm2, [srcq+ssq*1], 1 pinsrd xm2, [srcq+ssq*2], 2 pinsrd xm2, [srcq+r6 ], 3 ; 0 1 2 3 lea srcq, [srcq+ssq*4] movd xm3, [srcq+ssq*0] vpbroadcastd xm1, [srcq+ssq*1] vpbroadcastd xm0, [srcq+ssq*2] add srcq, r6 vpblendd xm3, xm1, 0x02 ; 4 5 vpblendd xm1, xm0, 0x02 ; 5 6 palignr xm4, xm3, xm2, 4 ; 1 2 3 4 punpcklwd xm3, xm1 ; 45 56 punpcklwd xm1, xm2, xm4 ; 01 12 punpckhwd xm2, xm4 ; 23 34 .v_w2_loop: vpbroadcastd xm4, [srcq+ssq*0] pmaddwd xm5, xm8, xm1 ; a0 b0 mova xm1, xm2 pmaddwd xm2, xm9 ; a1 b1 paddd xm5, xm6 paddd xm5, xm2 mova xm2, xm3 pmaddwd xm3, xm10 ; a2 b2 paddd xm5, xm3 vpblendd xm3, xm0, xm4, 0x02 ; 6 7 vpbroadcastd xm0, [srcq+ssq*1] lea srcq, [srcq+ssq*2] vpblendd xm4, xm0, 0x02 ; 7 8 punpcklwd xm3, xm4 ; 67 78 pmaddwd xm4, xm11, xm3 ; a3 b3 paddd xm5, xm4 psrad xm5, 6 packusdw xm5, xm5 pminsw xm5, xm7 movd [dstq+dsq*0], xm5 pextrd [dstq+dsq*1], xm5, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w2_loop RET .v_w4: movq xm1, [srcq+ssq*0] vpbroadcastq m0, [srcq+ssq*1] vpbroadcastq m2, [srcq+ssq*2] vpbroadcastq m4, [srcq+r6 ] lea srcq, [srcq+ssq*4] vpbroadcastq m3, [srcq+ssq*0] vpbroadcastq m5, [srcq+ssq*1] vpblendd m1, m0, 0x30 vpblendd m0, m2, 0x30 punpcklwd m1, m0 ; 01 12 vpbroadcastq m0, [srcq+ssq*2] add srcq, r6 vpblendd m2, m4, 0x30 vpblendd m4, m3, 0x30 punpcklwd m2, m4 ; 23 34 
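; .v_*: vertical 8-tap. Source rows are interleaved into word pairs
; (01 12, 23 34, 45 56, 67 78) so each pmaddwd applies two adjacent
; taps to two output rows at once; the four partial sums accumulate in
; 32 bits, get rounded (here via the pd_32 in m6), shifted back down,
; packed and clamped against the pixel_max broadcast in m7.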
vpblendd m3, m5, 0x30 vpblendd m5, m0, 0x30 punpcklwd m3, m5 ; 45 56 .v_w4_loop: vpbroadcastq m4, [srcq+ssq*0] pmaddwd m5, m8, m1 ; a0 b0 mova m1, m2 pmaddwd m2, m9 ; a1 b1 paddd m5, m6 paddd m5, m2 mova m2, m3 pmaddwd m3, m10 ; a2 b2 paddd m5, m3 vpblendd m3, m0, m4, 0x30 vpbroadcastq m0, [srcq+ssq*1] lea srcq, [srcq+ssq*2] vpblendd m4, m0, 0x30 punpcklwd m3, m4 ; 67 78 pmaddwd m4, m11, m3 ; a3 b3 paddd m5, m4 psrad m5, 6 vextracti128 xm4, m5, 1 packusdw xm5, xm4 pminsw xm5, xm7 movq [dstq+dsq*0], xm5 movhps [dstq+dsq*1], xm5 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w4_loop RET .v_w8: shl wd, 5 mov r7, srcq mov r8, dstq lea wd, [hq+wq-256] .v_w8_loop0: vbroadcasti128 m4, [srcq+ssq*0] vbroadcasti128 m5, [srcq+ssq*1] vbroadcasti128 m0, [srcq+r6 ] vbroadcasti128 m6, [srcq+ssq*2] lea srcq, [srcq+ssq*4] vbroadcasti128 m1, [srcq+ssq*0] vbroadcasti128 m2, [srcq+ssq*1] vbroadcasti128 m3, [srcq+ssq*2] add srcq, r6 shufpd m4, m0, 0x0c shufpd m5, m1, 0x0c punpcklwd m1, m4, m5 ; 01 punpckhwd m4, m5 ; 34 shufpd m6, m2, 0x0c punpcklwd m2, m5, m6 ; 12 punpckhwd m5, m6 ; 45 shufpd m0, m3, 0x0c punpcklwd m3, m6, m0 ; 23 punpckhwd m6, m0 ; 56 .v_w8_loop: vbroadcasti128 m14, [srcq+ssq*0] pmaddwd m12, m8, m1 ; a0 pmaddwd m13, m8, m2 ; b0 mova m1, m3 mova m2, m4 pmaddwd m3, m9 ; a1 pmaddwd m4, m9 ; b1 paddd m12, m3 paddd m13, m4 mova m3, m5 mova m4, m6 pmaddwd m5, m10 ; a2 pmaddwd m6, m10 ; b2 paddd m12, m5 vbroadcasti128 m5, [srcq+ssq*1] lea srcq, [srcq+ssq*2] paddd m13, m6 shufpd m6, m0, m14, 0x0d shufpd m0, m14, m5, 0x0c punpcklwd m5, m6, m0 ; 67 punpckhwd m6, m0 ; 78 pmaddwd m14, m11, m5 ; a3 paddd m12, m14 pmaddwd m14, m11, m6 ; b3 paddd m13, m14 psrad m12, 5 psrad m13, 5 packusdw m12, m13 pxor m13, m13 pavgw m12, m13 pminsw m12, m7 vpermq m12, m12, q3120 mova [dstq+dsq*0], xm12 vextracti128 [dstq+dsq*1], m12, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w8_loop add r7, 16 add r8, 16 movzx hd, wb mov srcq, r7 mov dstq, r8 sub wd, 1<<8 jg .v_w8_loop0 RET .hv: %assign stack_offset stack_offset - stack_size_padded WIN64_SPILL_XMM 16 vpbroadcastw m15, r8m cmp wd, 4 jg .hv_w8 movzx mxd, mxb vpbroadcastd m0, [base+subpel_filters+mxq*8+2] movzx mxd, myb shr myd, 16 cmp hd, 4 cmovle myd, mxd vpbroadcastq m1, [base+subpel_filters+myq*8] vpbroadcastd m6, [pd_512] lea r6, [ssq*3] sub srcq, 2 sub srcq, r6 pxor m7, m7 punpcklbw m7, m0 punpcklbw m1, m1 psraw m1, 8 ; sign-extend test dword r8m, 0x800 jz .hv_10bit psraw m7, 2 psllw m1, 2 .hv_10bit: pshufd m11, m1, q0000 pshufd m12, m1, q1111 pshufd m13, m1, q2222 pshufd m14, m1, q3333 cmp wd, 4 je .hv_w4 vbroadcasti128 m9, [subpel_h_shuf2] vbroadcasti128 m1, [srcq+r6 ] ; 3 3 movu xm3, [srcq+ssq*2] movu xm0, [srcq+ssq*0] movu xm2, [srcq+ssq*1] lea srcq, [srcq+ssq*4] vinserti128 m3, [srcq+ssq*0], 1 ; 2 4 vinserti128 m0, [srcq+ssq*1], 1 ; 0 5 vinserti128 m2, [srcq+ssq*2], 1 ; 1 6 add srcq, r6 pshufb m1, m9 pshufb m3, m9 pshufb m0, m9 pshufb m2, m9 pmaddwd m1, m7 pmaddwd m3, m7 pmaddwd m0, m7 pmaddwd m2, m7 phaddd m1, m3 phaddd m0, m2 paddd m1, m6 paddd m0, m6 psrad m1, 10 psrad m0, 10 packssdw m1, m0 ; 3 2 0 1 vextracti128 xm0, m1, 1 ; 3 4 5 6 pshufd xm2, xm1, q1301 ; 2 3 1 2 pshufd xm3, xm0, q2121 ; 4 5 4 5 punpckhwd xm1, xm2 ; 01 12 punpcklwd xm2, xm0 ; 23 34 punpckhwd xm3, xm0 ; 45 56 .hv_w2_loop: movu xm4, [srcq+ssq*0] movu xm5, [srcq+ssq*1] lea srcq, [srcq+ssq*2] pshufb xm4, xm9 pshufb xm5, xm9 pmaddwd xm4, xm7 pmaddwd xm5, xm7 phaddd xm4, xm5 pmaddwd xm5, xm11, xm1 ; a0 b0 mova xm1, xm2 pmaddwd xm2, xm12 ; a1 b1 paddd xm5, xm2 mova xm2, xm3 pmaddwd xm3, xm13 ; a2 b2 paddd 
xm5, xm3 paddd xm4, xm6 psrad xm4, 10 packssdw xm4, xm4 palignr xm3, xm4, xm0, 12 mova xm0, xm4 punpcklwd xm3, xm0 ; 67 78 pmaddwd xm4, xm14, xm3 ; a3 b3 paddd xm5, xm6 paddd xm5, xm4 psrad xm5, 10 packusdw xm5, xm5 pminsw xm5, xm15 movd [dstq+dsq*0], xm5 pextrd [dstq+dsq*1], xm5, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w2_loop RET .hv_w4: vbroadcasti128 m9, [subpel_h_shufA] vbroadcasti128 m10, [subpel_h_shufB] pshufd m8, m7, q1111 pshufd m7, m7, q0000 movu xm1, [srcq+ssq*0] vinserti128 m1, [srcq+ssq*1], 1 ; 0 1 vbroadcasti128 m0, [srcq+r6 ] vinserti128 m2, m0, [srcq+ssq*2], 0 ; 2 3 lea srcq, [srcq+ssq*4] vinserti128 m0, [srcq+ssq*0], 1 ; 3 4 movu xm3, [srcq+ssq*1] vinserti128 m3, [srcq+ssq*2], 1 ; 5 6 add srcq, r6 pshufb m4, m1, m9 pshufb m1, m10 pmaddwd m4, m7 pmaddwd m1, m8 pshufb m5, m2, m9 pshufb m2, m10 pmaddwd m5, m7 pmaddwd m2, m8 paddd m4, m6 paddd m1, m4 pshufb m4, m0, m9 pshufb m0, m10 pmaddwd m4, m7 pmaddwd m0, m8 paddd m5, m6 paddd m2, m5 pshufb m5, m3, m9 pshufb m3, m10 pmaddwd m5, m7 pmaddwd m3, m8 paddd m4, m6 paddd m4, m0 paddd m5, m6 paddd m5, m3 vperm2i128 m0, m1, m2, 0x21 psrld m1, 10 psrld m2, 10 vperm2i128 m3, m4, m5, 0x21 pslld m4, 6 pslld m5, 6 pblendw m2, m4, 0xaa ; 23 34 pslld m0, 6 pblendw m1, m0, 0xaa ; 01 12 psrld m3, 10 pblendw m3, m5, 0xaa ; 45 56 psrad m0, m5, 16 .hv_w4_loop: movu xm4, [srcq+ssq*0] vinserti128 m4, [srcq+ssq*1], 1 lea srcq, [srcq+ssq*2] pmaddwd m5, m11, m1 ; a0 b0 mova m1, m2 pmaddwd m2, m12 ; a1 b1 paddd m5, m6 paddd m5, m2 mova m2, m3 pmaddwd m3, m13 ; a2 b2 paddd m5, m3 pshufb m3, m4, m9 pshufb m4, m10 pmaddwd m3, m7 pmaddwd m4, m8 paddd m3, m6 paddd m4, m3 psrad m4, 10 packssdw m0, m4 ; _ 7 6 8 vpermq m3, m0, q1122 ; _ 6 _ 7 punpckhwd m3, m0 ; 67 78 mova m0, m4 pmaddwd m4, m14, m3 ; a3 b3 paddd m4, m5 psrad m4, 10 vextracti128 xm5, m4, 1 packusdw xm4, xm5 pminsw xm4, xm15 movq [dstq+dsq*0], xm4 movhps [dstq+dsq*1], xm4 lea dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w4_loop RET .hv_w8: shr mxd, 16 vpbroadcastq m2, [base+subpel_filters+mxq*8] movzx mxd, myb shr myd, 16 cmp hd, 4 cmovle myd, mxd pmovsxbw xm1, [base+subpel_filters+myq*8] shl wd, 5 lea r6, [ssq*3] sub srcq, 6 sub srcq, r6 pxor m0, m0 punpcklbw m0, m2 mov r7, srcq mov r8, dstq lea wd, [hq+wq-256] test dword r8m, 0x800 jz .hv_w8_10bit psraw m0, 2 psllw xm1, 2 .hv_w8_10bit: pshufd m11, m0, q0000 pshufd m12, m0, q1111 pshufd m13, m0, q2222 pshufd m14, m0, q3333 %if WIN64 %define v_mul (rsp+stack_offset+40) ; r4m %else %define v_mul (rsp-24) ; red zone %endif mova [v_mul], xm1 .hv_w8_loop0: %macro PUT_8TAP_HV_H 3 ; dst/src+0, src+8, src+16 pshufb m2, m%1, m9 ; 2 3 3 4 4 5 5 6 pshufb m%1, m8 ; 0 1 1 2 2 3 3 4 pmaddwd m3, m12, m2 pmaddwd m%1, m11 pshufb m%2, m9 ; 6 7 7 8 8 9 9 a shufpd m2, m%2, 0x05 ; 4 5 5 6 6 7 7 8 paddd m3, m10 paddd m%1, m3 pmaddwd m3, m14, m%2 paddd m%1, m3 pmaddwd m3, m13, m2 pshufb m%3, m9 ; a b b c c d d e pmaddwd m2, m11 paddd m%1, m3 pmaddwd m3, m12, m%2 shufpd m%2, m%3, 0x05 ; 8 9 9 a a b b c pmaddwd m%3, m14 pmaddwd m%2, m13 paddd m2, m10 paddd m2, m3 paddd m%3, m2 paddd m%2, m%3 psrad m%1, 10 psrad m%2, 10 packssdw m%1, m%2 %endmacro movu xm4, [srcq+r6 *1+ 0] vbroadcasti128 m8, [subpel_h_shufA] movu xm6, [srcq+r6 *1+ 8] vbroadcasti128 m9, [subpel_h_shufB] movu xm0, [srcq+r6 *1+16] vpbroadcastd m10, [pd_512] movu xm5, [srcq+ssq*0+ 0] vinserti128 m5, [srcq+ssq*4+ 0], 1 movu xm1, [srcq+ssq*0+16] vinserti128 m1, [srcq+ssq*4+16], 1 shufpd m7, m5, m1, 0x05 INIT_XMM avx2 PUT_8TAP_HV_H 4, 6, 0 ; 3 INIT_YMM avx2 PUT_8TAP_HV_H 5, 7, 1 ; 0 4 movu xm0, [srcq+ssq*2+ 0] 
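; .hv_w8: separable 8-tap. Seven rows are filtered horizontally at
; extended intermediate precision (PUT_8TAP_HV_H) and interleaved into
; row pairs; each loop iteration filters two fresh rows horizontally,
; applies the four vertical tap pairs kept in v_mul with pmaddwd, then
; rounds, packs and clamps against pixel_max in m15.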
vinserti128 m0, [srcq+r6 *2+ 0], 1 movu xm1, [srcq+ssq*2+16] vinserti128 m1, [srcq+r6 *2+16], 1 shufpd m7, m0, m1, 0x05 PUT_8TAP_HV_H 0, 7, 1 ; 2 6 movu xm6, [srcq+ssq*1+ 0] movu xm1, [srcq+ssq*1+16] lea srcq, [srcq+ssq*4] vinserti128 m6, [srcq+ssq*1+ 0], 1 vinserti128 m1, [srcq+ssq*1+16], 1 add srcq, r6 shufpd m7, m6, m1, 0x05 PUT_8TAP_HV_H 6, 7, 1 ; 1 5 vpermq m4, m4, q1100 vpermq m5, m5, q3120 vpermq m6, m6, q3120 vpermq m7, m0, q3120 punpcklwd m3, m7, m4 ; 23 punpckhwd m4, m5 ; 34 punpcklwd m1, m5, m6 ; 01 punpckhwd m5, m6 ; 45 punpcklwd m2, m6, m7 ; 12 punpckhwd m6, m7 ; 56 .hv_w8_loop: vpbroadcastd m9, [v_mul+4*0] vpbroadcastd m7, [v_mul+4*1] vpbroadcastd m10, [v_mul+4*2] pmaddwd m8, m9, m1 ; a0 pmaddwd m9, m2 ; b0 mova m1, m3 mova m2, m4 pmaddwd m3, m7 ; a1 pmaddwd m4, m7 ; b1 paddd m8, m3 paddd m9, m4 mova m3, m5 mova m4, m6 pmaddwd m5, m10 ; a2 pmaddwd m6, m10 ; b2 paddd m8, m5 paddd m9, m6 movu xm5, [srcq+ssq*0] vinserti128 m5, [srcq+ssq*1], 1 vbroadcasti128 m7, [subpel_h_shufA] vbroadcasti128 m10, [subpel_h_shufB] movu xm6, [srcq+ssq*0+16] vinserti128 m6, [srcq+ssq*1+16], 1 vextracti128 [dstq], m0, 1 pshufb m0, m5, m7 ; 01 pshufb m5, m10 ; 23 pmaddwd m0, m11 pmaddwd m5, m12 paddd m0, m5 pshufb m5, m6, m7 ; 89 pshufb m6, m10 ; ab pmaddwd m5, m13 pmaddwd m6, m14 paddd m6, m5 movu xm5, [srcq+ssq*0+8] vinserti128 m5, [srcq+ssq*1+8], 1 lea srcq, [srcq+ssq*2] pshufb m7, m5, m7 pshufb m5, m10 pmaddwd m10, m13, m7 pmaddwd m7, m11 paddd m0, m10 vpbroadcastd m10, [pd_512] paddd m6, m7 pmaddwd m7, m14, m5 pmaddwd m5, m12 paddd m0, m7 paddd m5, m6 vbroadcasti128 m6, [dstq] paddd m8, m10 paddd m9, m10 paddd m0, m10 paddd m5, m10 vpbroadcastd m10, [v_mul+4*3] psrad m0, 10 psrad m5, 10 packssdw m0, m5 vpermq m7, m0, q3120 ; 7 8 shufpd m6, m7, 0x04 ; 6 7 punpcklwd m5, m6, m7 ; 67 punpckhwd m6, m7 ; 78 pmaddwd m7, m10, m5 ; a3 pmaddwd m10, m6 ; b3 paddd m7, m8 paddd m9, m10 psrad m7, 10 psrad m9, 10 packusdw m7, m9 pminsw m7, m15 vpermq m7, m7, q3120 mova [dstq+dsq*0], xm7 vextracti128 [dstq+dsq*1], m7, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w8_loop add r7, 16 add r8, 16 movzx hd, wb mov srcq, r7 mov dstq, r8 sub wd, 1<<8 jg .hv_w8_loop0 RET %if WIN64 DECLARE_REG_TMP 6, 4 %else DECLARE_REG_TMP 6, 7 %endif %define PREP_8TAP_FN FN prep_8tap, PREP_8TAP_FN sharp, SHARP, SHARP PREP_8TAP_FN sharp_smooth, SHARP, SMOOTH PREP_8TAP_FN smooth_sharp, SMOOTH, SHARP PREP_8TAP_FN smooth, SMOOTH, SMOOTH PREP_8TAP_FN sharp_regular, SHARP, REGULAR PREP_8TAP_FN regular_sharp, REGULAR, SHARP PREP_8TAP_FN smooth_regular, SMOOTH, REGULAR PREP_8TAP_FN regular_smooth, REGULAR, SMOOTH PREP_8TAP_FN regular, REGULAR, REGULAR cglobal prep_8tap_16bpc, 4, 8, 0, tmp, src, stride, w, h, mx, my %define base r7-prep_avx2 imul mxd, mxm, 0x010101 add mxd, t0d ; 8tap_h, mx, 4tap_h imul myd, mym, 0x010101 add myd, t1d ; 8tap_v, my, 4tap_v lea r7, [prep_avx2] movifnidn hd, hm test mxd, 0xf00 jnz .h test myd, 0xf00 jnz .v tzcnt wd, wd mov r6d, r7m ; bitdepth_max movzx wd, word [r7+wq*2+table_offset(prep,)] vpbroadcastd m5, [r7-prep_avx2+pw_8192] shr r6d, 11 add wq, r7 vpbroadcastd m4, [base+prep_mul+r6*4] lea r6, [strideq*3] %if WIN64 pop r7 %endif jmp wq .h_w4: movzx mxd, mxb sub srcq, 2 pmovsxbw xm0, [base+subpel_filters+mxq*8] vbroadcasti128 m3, [subpel_h_shufA] vbroadcasti128 m4, [subpel_h_shufB] WIN64_SPILL_XMM 8 pshufd xm0, xm0, q2211 test dword r7m, 0x800 jnz .h_w4_12bpc psllw xm0, 2 .h_w4_12bpc: vpbroadcastq m6, xm0 vpermq m7, m0, q1111 .h_w4_loop: movu xm1, [srcq+strideq*0] vinserti128 m1, [srcq+strideq*2], 1 movu xm2, 
[srcq+strideq*1] vinserti128 m2, [srcq+r6 ], 1 lea srcq, [srcq+strideq*4] pshufb m0, m1, m3 ; 0 1 1 2 2 3 3 4 pshufb m1, m4 ; 2 3 3 4 4 5 5 6 pmaddwd m0, m6 pmaddwd m1, m7 paddd m0, m5 paddd m0, m1 pshufb m1, m2, m3 pshufb m2, m4 pmaddwd m1, m6 pmaddwd m2, m7 paddd m1, m5 paddd m1, m2 psrad m0, 4 psrad m1, 4 packssdw m0, m1 mova [tmpq], m0 add tmpq, 32 sub hd, 4 jg .h_w4_loop RET .h: test myd, 0xf00 jnz .hv vpbroadcastd m5, [prep_8tap_1d_rnd] ; 8 - (8192 << 4) lea r6, [strideq*3] cmp wd, 4 je .h_w4 shr mxd, 16 sub srcq, 6 vpbroadcastq m0, [base+subpel_filters+mxq*8] %assign stack_offset stack_offset - stack_size_padded WIN64_SPILL_XMM 12 vbroadcasti128 m6, [subpel_h_shufA] vbroadcasti128 m7, [subpel_h_shufB] punpcklbw m0, m0 psraw m0, 8 ; sign-extend test dword r7m, 0x800 jnz .h_12bpc psllw m0, 2 .h_12bpc: pshufd m8, m0, q0000 pshufd m9, m0, q1111 pshufd m10, m0, q2222 pshufd m11, m0, q3333 cmp wd, 8 jg .h_w16 .h_w8: %macro PREP_8TAP_H 5 ; dst/src+0, src+8, src+16, tmp[1-2] pshufb m%4, m%1, m7 ; 2 3 3 4 4 5 5 6 pshufb m%1, m6 ; 0 1 1 2 2 3 3 4 pmaddwd m%5, m9, m%4 ; abcd1 pmaddwd m%1, m8 ; abcd0 pshufb m%2, m7 ; 6 7 7 8 8 9 9 a shufpd m%4, m%2, 0x05 ; 4 5 5 6 6 7 7 8 paddd m%5, m5 paddd m%1, m%5 pmaddwd m%5, m11, m%2 ; abcd3 paddd m%1, m%5 pmaddwd m%5, m10, m%4 ; abcd2 pshufb m%3, m7 ; a b b c c d d e pmaddwd m%4, m8 ; efgh0 paddd m%1, m%5 pmaddwd m%5, m9, m%2 ; efgh1 shufpd m%2, m%3, 0x05 ; 8 9 9 a a b b c pmaddwd m%3, m11 ; efgh3 pmaddwd m%2, m10 ; efgh2 paddd m%4, m5 paddd m%4, m%5 paddd m%3, m%4 paddd m%2, m%3 psrad m%1, 4 psrad m%2, 4 packssdw m%1, m%2 %endmacro movu xm0, [srcq+strideq*0+ 0] vinserti128 m0, [srcq+strideq*1+ 0], 1 movu xm2, [srcq+strideq*0+16] vinserti128 m2, [srcq+strideq*1+16], 1 lea srcq, [srcq+strideq*2] shufpd m1, m0, m2, 0x05 PREP_8TAP_H 0, 1, 2, 3, 4 mova [tmpq], m0 add tmpq, 32 sub hd, 2 jg .h_w8 RET .h_w16: add wd, wd .h_w16_loop0: mov r6d, wd .h_w16_loop: movu m0, [srcq+r6-32] movu m1, [srcq+r6-24] movu m2, [srcq+r6-16] PREP_8TAP_H 0, 1, 2, 3, 4 mova [tmpq+r6-32], m0 sub r6d, 32 jg .h_w16_loop add srcq, strideq add tmpq, wq dec hd jg .h_w16_loop0 RET .v: movzx mxd, myb shr myd, 16 cmp hd, 4 cmovle myd, mxd vpbroadcastq m0, [base+subpel_filters+myq*8] %assign stack_offset stack_offset - stack_size_padded WIN64_SPILL_XMM 15 vpbroadcastd m7, [prep_8tap_1d_rnd] lea r6, [strideq*3] sub srcq, r6 punpcklbw m0, m0 psraw m0, 8 ; sign-extend test dword r7m, 0x800 jnz .v_12bpc psllw m0, 2 .v_12bpc: pshufd m8, m0, q0000 pshufd m9, m0, q1111 pshufd m10, m0, q2222 pshufd m11, m0, q3333 cmp wd, 4 jg .v_w8 .v_w4: movq xm1, [srcq+strideq*0] vpbroadcastq m0, [srcq+strideq*1] vpbroadcastq m2, [srcq+strideq*2] vpbroadcastq m4, [srcq+r6 ] lea srcq, [srcq+strideq*4] vpbroadcastq m3, [srcq+strideq*0] vpbroadcastq m5, [srcq+strideq*1] vpblendd m1, m0, 0x30 vpblendd m0, m2, 0x30 punpcklwd m1, m0 ; 01 12 vpbroadcastq m0, [srcq+strideq*2] add srcq, r6 vpblendd m2, m4, 0x30 vpblendd m4, m3, 0x30 punpcklwd m2, m4 ; 23 34 vpblendd m3, m5, 0x30 vpblendd m5, m0, 0x30 punpcklwd m3, m5 ; 45 56 .v_w4_loop: vpbroadcastq m4, [srcq+strideq*0] pmaddwd m5, m8, m1 ; a0 b0 mova m1, m2 pmaddwd m2, m9 ; a1 b1 paddd m5, m7 paddd m5, m2 mova m2, m3 pmaddwd m3, m10 ; a2 b2 paddd m5, m3 vpblendd m3, m0, m4, 0x30 vpbroadcastq m0, [srcq+strideq*1] lea srcq, [srcq+strideq*2] vpblendd m4, m0, 0x30 punpcklwd m3, m4 ; 67 78 pmaddwd m4, m11, m3 ; a3 b3 paddd m5, m4 psrad m5, 4 vextracti128 xm4, m5, 1 packssdw xm5, xm4 mova [tmpq], xm5 add tmpq, 16 sub hd, 2 jg .v_w4_loop RET .v_w8: %if WIN64 push r8 %endif mov 
r8d, wd shl wd, 5 mov r5, srcq mov r7, tmpq lea wd, [hq+wq-256] .v_w8_loop0: vbroadcasti128 m4, [srcq+strideq*0] vbroadcasti128 m5, [srcq+strideq*1] vbroadcasti128 m0, [srcq+r6 ] vbroadcasti128 m6, [srcq+strideq*2] lea srcq, [srcq+strideq*4] vbroadcasti128 m1, [srcq+strideq*0] vbroadcasti128 m2, [srcq+strideq*1] vbroadcasti128 m3, [srcq+strideq*2] add srcq, r6 shufpd m4, m0, 0x0c shufpd m5, m1, 0x0c punpcklwd m1, m4, m5 ; 01 punpckhwd m4, m5 ; 34 shufpd m6, m2, 0x0c punpcklwd m2, m5, m6 ; 12 punpckhwd m5, m6 ; 45 shufpd m0, m3, 0x0c punpcklwd m3, m6, m0 ; 23 punpckhwd m6, m0 ; 56 .v_w8_loop: vbroadcasti128 m14, [srcq+strideq*0] pmaddwd m12, m8, m1 ; a0 pmaddwd m13, m8, m2 ; b0 mova m1, m3 mova m2, m4 pmaddwd m3, m9 ; a1 pmaddwd m4, m9 ; b1 paddd m12, m7 paddd m13, m7 paddd m12, m3 paddd m13, m4 mova m3, m5 mova m4, m6 pmaddwd m5, m10 ; a2 pmaddwd m6, m10 ; b2 paddd m12, m5 vbroadcasti128 m5, [srcq+strideq*1] lea srcq, [srcq+strideq*2] paddd m13, m6 shufpd m6, m0, m14, 0x0d shufpd m0, m14, m5, 0x0c punpcklwd m5, m6, m0 ; 67 punpckhwd m6, m0 ; 78 pmaddwd m14, m11, m5 ; a3 paddd m12, m14 pmaddwd m14, m11, m6 ; b3 paddd m13, m14 psrad m12, 4 psrad m13, 4 packssdw m12, m13 vpermq m12, m12, q3120 mova [tmpq+r8*0], xm12 vextracti128 [tmpq+r8*2], m12, 1 lea tmpq, [tmpq+r8*4] sub hd, 2 jg .v_w8_loop add r5, 16 add r7, 16 movzx hd, wb mov srcq, r5 mov tmpq, r7 sub wd, 1<<8 jg .v_w8_loop0 %if WIN64 pop r8 %endif RET .hv: %assign stack_offset stack_offset - stack_size_padded WIN64_SPILL_XMM 16 vpbroadcastd m15, [prep_8tap_2d_rnd] cmp wd, 4 jg .hv_w8 movzx mxd, mxb vpbroadcastd m0, [base+subpel_filters+mxq*8+2] movzx mxd, myb shr myd, 16 cmp hd, 4 cmovle myd, mxd vpbroadcastq m1, [base+subpel_filters+myq*8] lea r6, [strideq*3] sub srcq, 2 sub srcq, r6 pxor m7, m7 punpcklbw m7, m0 punpcklbw m1, m1 psraw m7, 4 psraw m1, 8 test dword r7m, 0x800 jz .hv_w4_10bit psraw m7, 2 .hv_w4_10bit: pshufd m11, m1, q0000 pshufd m12, m1, q1111 pshufd m13, m1, q2222 pshufd m14, m1, q3333 .hv_w4: vbroadcasti128 m9, [subpel_h_shufA] vbroadcasti128 m10, [subpel_h_shufB] pshufd m8, m7, q1111 pshufd m7, m7, q0000 movu xm1, [srcq+strideq*0] vinserti128 m1, [srcq+strideq*1], 1 ; 0 1 vbroadcasti128 m0, [srcq+r6 ] vinserti128 m2, m0, [srcq+strideq*2], 0 ; 2 3 lea srcq, [srcq+strideq*4] vinserti128 m0, [srcq+strideq*0], 1 ; 3 4 movu xm3, [srcq+strideq*1] vinserti128 m3, [srcq+strideq*2], 1 ; 5 6 add srcq, r6 pshufb m4, m1, m9 pshufb m1, m10 pmaddwd m4, m7 pmaddwd m1, m8 pshufb m5, m2, m9 pshufb m2, m10 pmaddwd m5, m7 pmaddwd m2, m8 paddd m4, m15 paddd m1, m4 pshufb m4, m0, m9 pshufb m0, m10 pmaddwd m4, m7 pmaddwd m0, m8 paddd m5, m15 paddd m2, m5 pshufb m5, m3, m9 pshufb m3, m10 pmaddwd m5, m7 pmaddwd m3, m8 paddd m4, m15 paddd m4, m0 paddd m5, m15 paddd m5, m3 vperm2i128 m0, m1, m2, 0x21 psrld m1, 6 psrld m2, 6 vperm2i128 m3, m4, m5, 0x21 pslld m4, 10 pslld m5, 10 pblendw m2, m4, 0xaa ; 23 34 pslld m0, 10 pblendw m1, m0, 0xaa ; 01 12 psrld m3, 6 pblendw m3, m5, 0xaa ; 45 56 psrad m0, m5, 16 .hv_w4_loop: movu xm4, [srcq+strideq*0] vinserti128 m4, [srcq+strideq*1], 1 lea srcq, [srcq+strideq*2] pmaddwd m5, m11, m1 ; a0 b0 mova m1, m2 pmaddwd m2, m12 ; a1 b1 paddd m5, m15 paddd m5, m2 mova m2, m3 pmaddwd m3, m13 ; a2 b2 paddd m5, m3 pshufb m3, m4, m9 pshufb m4, m10 pmaddwd m3, m7 pmaddwd m4, m8 paddd m3, m15 paddd m4, m3 psrad m4, 6 packssdw m0, m4 ; _ 7 6 8 vpermq m3, m0, q1122 ; _ 6 _ 7 punpckhwd m3, m0 ; 67 78 mova m0, m4 pmaddwd m4, m14, m3 ; a3 b3 paddd m4, m5 psrad m4, 6 vextracti128 xm5, m4, 1 packssdw xm4, xm5 mova [tmpq], 
xm4 add tmpq, 16 sub hd, 2 jg .hv_w4_loop RET .hv_w8: shr mxd, 16 vpbroadcastq m2, [base+subpel_filters+mxq*8] movzx mxd, myb shr myd, 16 cmp hd, 4 cmovle myd, mxd pmovsxbw xm1, [base+subpel_filters+myq*8] %if WIN64 PUSH r8 %endif mov r8d, wd shl wd, 5 lea r6, [strideq*3] sub srcq, 6 sub srcq, r6 mov r5, srcq mov r7, tmpq lea wd, [hq+wq-256] pxor m0, m0 punpcklbw m0, m2 mova [v_mul], xm1 psraw m0, 4 test dword r7m, 0x800 jz .hv_w8_10bit psraw m0, 2 .hv_w8_10bit: pshufd m11, m0, q0000 pshufd m12, m0, q1111 pshufd m13, m0, q2222 pshufd m14, m0, q3333 .hv_w8_loop0: %macro PREP_8TAP_HV_H 3 ; dst/src+0, src+8, src+16 pshufb m2, m%1, m9 ; 2 3 3 4 4 5 5 6 pshufb m%1, m8 ; 0 1 1 2 2 3 3 4 pmaddwd m3, m12, m2 pmaddwd m%1, m11 pshufb m%2, m9 ; 6 7 7 8 8 9 9 a shufpd m2, m%2, 0x05 ; 4 5 5 6 6 7 7 8 paddd m3, m15 paddd m%1, m3 pmaddwd m3, m14, m%2 paddd m%1, m3 pmaddwd m3, m13, m2 pshufb m%3, m9 ; a b b c c d d e pmaddwd m2, m11 paddd m%1, m3 pmaddwd m3, m12, m%2 shufpd m%2, m%3, 0x05 ; 8 9 9 a a b b c pmaddwd m%3, m14 pmaddwd m%2, m13 paddd m2, m15 paddd m2, m3 paddd m2, m%3 paddd m2, m%2 psrad m%1, 6 psrad m2, 6 packssdw m%1, m2 %endmacro movu xm4, [srcq+r6 + 0] vbroadcasti128 m8, [subpel_h_shufA] movu xm6, [srcq+r6 + 8] vbroadcasti128 m9, [subpel_h_shufB] movu xm0, [srcq+r6 +16] movu xm5, [srcq+strideq*0+ 0] vinserti128 m5, [srcq+strideq*4+ 0], 1 movu xm1, [srcq+strideq*0+16] vinserti128 m1, [srcq+strideq*4+16], 1 shufpd m7, m5, m1, 0x05 INIT_XMM avx2 PREP_8TAP_HV_H 4, 6, 0 ; 3 INIT_YMM avx2 PREP_8TAP_HV_H 5, 7, 1 ; 0 4 movu xm0, [srcq+strideq*2+ 0] vinserti128 m0, [srcq+r6 *2+ 0], 1 movu xm1, [srcq+strideq*2+16] vinserti128 m1, [srcq+r6 *2+16], 1 shufpd m7, m0, m1, 0x05 PREP_8TAP_HV_H 0, 7, 1 ; 2 6 movu xm6, [srcq+strideq*1+ 0] movu xm1, [srcq+strideq*1+16] lea srcq, [srcq+strideq*4] vinserti128 m6, [srcq+strideq*1+ 0], 1 vinserti128 m1, [srcq+strideq*1+16], 1 add srcq, r6 shufpd m7, m6, m1, 0x05 PREP_8TAP_HV_H 6, 7, 1 ; 1 5 vpermq m4, m4, q1100 vpermq m5, m5, q3120 vpermq m6, m6, q3120 vpermq m7, m0, q3120 punpcklwd m3, m7, m4 ; 23 punpckhwd m4, m5 ; 34 punpcklwd m1, m5, m6 ; 01 punpckhwd m5, m6 ; 45 punpcklwd m2, m6, m7 ; 12 punpckhwd m6, m7 ; 56 .hv_w8_loop: vpbroadcastd m9, [v_mul+4*0] vpbroadcastd m7, [v_mul+4*1] vpbroadcastd m10, [v_mul+4*2] pmaddwd m8, m9, m1 ; a0 pmaddwd m9, m2 ; b0 mova m1, m3 mova m2, m4 pmaddwd m3, m7 ; a1 pmaddwd m4, m7 ; b1 paddd m8, m15 paddd m9, m15 paddd m8, m3 paddd m9, m4 mova m3, m5 mova m4, m6 pmaddwd m5, m10 ; a2 pmaddwd m6, m10 ; b2 paddd m8, m5 paddd m9, m6 movu xm5, [srcq+strideq*0] vinserti128 m5, [srcq+strideq*1], 1 vbroadcasti128 m7, [subpel_h_shufA] vbroadcasti128 m10, [subpel_h_shufB] movu xm6, [srcq+strideq*0+16] vinserti128 m6, [srcq+strideq*1+16], 1 vextracti128 [tmpq], m0, 1 pshufb m0, m5, m7 ; 01 pshufb m5, m10 ; 23 pmaddwd m0, m11 pmaddwd m5, m12 paddd m0, m15 paddd m0, m5 pshufb m5, m6, m7 ; 89 pshufb m6, m10 ; ab pmaddwd m5, m13 pmaddwd m6, m14 paddd m5, m15 paddd m6, m5 movu xm5, [srcq+strideq*0+8] vinserti128 m5, [srcq+strideq*1+8], 1 lea srcq, [srcq+strideq*2] pshufb m7, m5, m7 pshufb m5, m10 pmaddwd m10, m13, m7 pmaddwd m7, m11 paddd m0, m10 paddd m6, m7 pmaddwd m7, m14, m5 pmaddwd m5, m12 paddd m0, m7 paddd m5, m6 vbroadcasti128 m6, [tmpq] vpbroadcastd m10, [v_mul+4*3] psrad m0, 6 psrad m5, 6 packssdw m0, m5 vpermq m7, m0, q3120 ; 7 8 shufpd m6, m7, 0x04 ; 6 7 punpcklwd m5, m6, m7 ; 67 punpckhwd m6, m7 ; 78 pmaddwd m7, m10, m5 ; a3 pmaddwd m10, m6 ; b3 paddd m7, m8 paddd m9, m10 psrad m7, 6 psrad m9, 6 packssdw m7, m9 vpermq m7, m7, q3120 
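; the low and high 128-bit lanes of m7 hold the two filtered rows; store them to
; consecutive rows of the intermediate buffer (one row is r8*2 bytes wide)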
mova [tmpq+r8*0], xm7 vextracti128 [tmpq+r8*2], m7, 1 lea tmpq, [tmpq+r8*4] sub hd, 2 jg .hv_w8_loop add r5, 16 add r7, 16 movzx hd, wb mov srcq, r5 mov tmpq, r7 sub wd, 1<<8 jg .hv_w8_loop0 %if WIN64 POP r8 %endif RET %macro movifprep 2 %if isprep mov %1, %2 %endif %endmacro %macro REMAP_REG 2 %xdefine r%1 r%2 %xdefine r%1q r%2q %xdefine r%1d r%2d %endmacro %macro MCT_8TAP_SCALED_REMAP_REGS_TO_PREV 0 %if isprep %xdefine r14_save r14 %assign %%i 14 %rep 14 %assign %%j %%i-1 REMAP_REG %%i, %%j %assign %%i %%i-1 %endrep %endif %endmacro %macro MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT 0 %if isprep %assign %%i 1 %rep 13 %assign %%j %%i+1 REMAP_REG %%i, %%j %assign %%i %%i+1 %endrep %xdefine r14 r14_save %undef r14_save %endif %endmacro %macro MC_8TAP_SCALED_RET 0-1 1 ; leave_mapping_unchanged MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT RET %if %1 MCT_8TAP_SCALED_REMAP_REGS_TO_PREV %endif %endmacro %macro MC_8TAP_SCALED_H 8-9 0 ; dst, tmp[0-6], load_hrnd movu xm%1, [srcq+ r4*2] movu xm%2, [srcq+ r6*2] movu xm%3, [srcq+ r7*2] movu xm%4, [srcq+ r9*2] vinserti128 m%1, [srcq+r10*2], 1 vinserti128 m%2, [srcq+r11*2], 1 vinserti128 m%3, [srcq+r13*2], 1 vinserti128 m%4, [srcq+ rX*2], 1 add srcq, ssq movu xm%5, [srcq+ r4*2] movu xm%6, [srcq+ r6*2] movu xm%7, [srcq+ r7*2] movu xm%8, [srcq+ r9*2] vinserti128 m%5, [srcq+r10*2], 1 vinserti128 m%6, [srcq+r11*2], 1 vinserti128 m%7, [srcq+r13*2], 1 vinserti128 m%8, [srcq+ rX*2], 1 add srcq, ssq pmaddwd m%1, m12 pmaddwd m%2, m13 pmaddwd m%3, m14 pmaddwd m%4, m15 pmaddwd m%5, m12 pmaddwd m%6, m13 pmaddwd m%7, m14 pmaddwd m%8, m15 phaddd m%1, m%2 %if %9 mova m10, [rsp+0x00] %endif phaddd m%3, m%4 phaddd m%5, m%6 phaddd m%7, m%8 phaddd m%1, m%3 phaddd m%5, m%7 paddd m%1, m10 paddd m%5, m10 psrad m%1, xm11 psrad m%5, xm11 packssdw m%1, m%5 %endmacro %macro MC_8TAP_SCALED 1 %ifidn %1, put %assign isput 1 %assign isprep 0 cglobal put_8tap_scaled_16bpc, 4, 14, 16, 0xe0, dst, ds, src, ss, w, h, mx, my, dx, dy, pxmax %xdefine base_reg r12 mov r7d, pxmaxm %else %assign isput 0 %assign isprep 1 cglobal prep_8tap_scaled_16bpc, 4, 14, 16, 0xe0, tmp, src, ss, w, h, mx, my, dx, dy, pxmax %define tmp_stridem qword [rsp+0xd0] %xdefine base_reg r11 %endif lea base_reg, [%1_8tap_scaled_16bpc_avx2] %define base base_reg-%1_8tap_scaled_16bpc_avx2 tzcnt wd, wm vpbroadcastd m8, dxm %if isprep && UNIX64 movd xm10, mxd vpbroadcastd m10, xm10 mov r5d, t0d DECLARE_REG_TMP 5, 7 mov r6d, pxmaxm %else vpbroadcastd m10, mxm %if isput vpbroadcastw m11, pxmaxm %else mov r6d, pxmaxm %endif %endif mov dyd, dym %if isput %if WIN64 mov r8d, hm DEFINE_ARGS dst, ds, src, ss, w, _, _, my, h, dy, ss3 %define hm r5m %define dxm r8m %else DEFINE_ARGS dst, ds, src, ss, w, h, _, my, dx, dy, ss3 %define hm r6m %endif %define dsm [rsp+0x98] %define rX r1 %define rXd r1d %else ; prep %if WIN64 mov r7d, hm DEFINE_ARGS tmp, src, ss, w, _, _, my, h, dy, ss3 %define hm r4m %define dxm r7m %else DEFINE_ARGS tmp, src, ss, w, h, _, my, dx, dy, ss3 %define hm [rsp+0x98] %endif MCT_8TAP_SCALED_REMAP_REGS_TO_PREV %define rX r14 %define rXd r14d %endif shr r7d, 11 vpbroadcastd m6, [base+pd_0x3ff] vpbroadcastd m12, [base+s_8tap_h_rnd+r7*4] movd xm7, [base+s_8tap_h_sh+r7*4] %if isput vpbroadcastd m13, [base+put_s_8tap_v_rnd+r7*4] pinsrd xm7, [base+put_s_8tap_v_sh+r7*4], 2 %else vpbroadcastd m13, [base+pd_m524256] %endif pxor m9, m9 lea ss3q, [ssq*3] movzx r7d, t1b shr t1d, 16 cmp hd, 6 cmovs t1d, r7d sub srcq, ss3q cmp dyd, 1024 je .dy1 cmp dyd, 2048 je .dy2 movzx wd, word [base+%1_8tap_scaled_avx2_table+wq*2] add wq, base_reg 
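; wq now holds the absolute address of the width-specific branch
; (the dispatch table stores 16-bit offsets relative to the base label)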
jmp wq %if isput .w2: mov myd, mym movzx t0d, t0b sub srcq, 2 movd xm15, t0d punpckldq m8, m9, m8 paddd m10, m8 ; mx+dx*[0,1] vpbroadcastd xm14, [base+pq_0x40000000+2] vpbroadcastd xm15, xm15 pand xm8, xm10, xm6 psrld xm8, 6 paddd xm15, xm8 movd r4d, xm15 pextrd r6d, xm15, 1 vbroadcasti128 m5, [base+bdct_lb_q] vbroadcasti128 m6, [base+subpel_s_shuf2] vpbroadcastd xm15, [base+subpel_filters+r4*8+2] vpbroadcastd xm4, [base+subpel_filters+r6*8+2] pcmpeqd xm8, xm9 psrld m10, 10 paddd m10, m10 movu xm0, [srcq+ssq*0] movu xm1, [srcq+ssq*1] movu xm2, [srcq+ssq*2] movu xm3, [srcq+ss3q ] lea srcq, [srcq+ssq*4] pshufb m10, m5 paddb m10, m6 vpblendd xm15, xm4, 0xa pblendvb xm15, xm14, xm8 pmovsxbw m15, xm15 vinserti128 m0, [srcq+ssq*0], 1 ; 0 4 vinserti128 m1, [srcq+ssq*1], 1 ; 1 5 vinserti128 m2, [srcq+ssq*2], 1 ; 2 6 vinserti128 m3, [srcq+ss3q ], 1 ; 3 7 lea srcq, [srcq+ssq*4] REPX {pshufb x, m10}, m0, m1, m2, m3 REPX {pmaddwd x, m15}, m0, m1, m2, m3 phaddd m0, m1 phaddd m2, m3 paddd m0, m12 paddd m2, m12 psrad m0, xm7 psrad m2, xm7 packssdw m0, m2 ; 0 1 2 3 4 5 6 7 vextracti128 xm1, m0, 1 palignr xm2, xm1, xm0, 4 ; 1 2 3 4 punpcklwd xm3, xm0, xm2 ; 01 12 punpckhwd xm0, xm2 ; 23 34 pshufd xm4, xm1, q0321 ; 5 6 7 _ punpcklwd xm2, xm1, xm4 ; 45 56 punpckhwd xm4, xm1, xm4 ; 67 __ .w2_loop: and myd, 0x3ff mov r6d, 64 << 24 mov r4d, myd shr r4d, 6 lea r4d, [t1+r4] cmovnz r6q, [base+subpel_filters+r4*8] movq xm14, r6q pmovsxbw xm14, xm14 pshufd xm8, xm14, q0000 pshufd xm9, xm14, q1111 pmaddwd xm5, xm3, xm8 pmaddwd xm6, xm0, xm9 pshufd xm8, xm14, q2222 pshufd xm14, xm14, q3333 paddd xm5, xm6 pmaddwd xm6, xm2, xm8 pmaddwd xm8, xm4, xm14 psrldq xm9, xm7, 8 paddd xm5, xm6 paddd xm5, xm13 paddd xm5, xm8 psrad xm5, xm9 packusdw xm5, xm5 pminsw xm5, xm11 movd [dstq], xm5 add dstq, dsq dec hd jz .ret add myd, dyd test myd, ~0x3ff jz .w2_loop movu xm5, [srcq] test myd, 0x400 jz .w2_skip_line add srcq, ssq shufps xm3, xm0, q1032 ; 01 12 shufps xm0, xm2, q1032 ; 23 34 shufps xm2, xm4, q1032 ; 45 56 pshufb xm5, xm10 pmaddwd xm5, xm15 phaddd xm5, xm5 paddd xm5, xm12 psrad xm5, xm7 packssdw xm5, xm5 palignr xm1, xm5, xm1, 12 punpcklqdq xm1, xm1 ; 6 7 6 7 punpcklwd xm4, xm1, xm5 ; 67 __ jmp .w2_loop .w2_skip_line: movu xm6, [srcq+ssq*1] lea srcq, [srcq+ssq*2] mova xm3, xm0 ; 01 12 mova xm0, xm2 ; 23 34 pshufb xm5, xm10 pshufb xm6, xm10 pmaddwd xm5, xm15 pmaddwd xm6, xm15 phaddd xm5, xm6 paddd xm5, xm12 psrad xm5, xm7 packssdw xm5, xm5 ; 6 7 6 7 palignr xm1, xm5, xm1, 8 ; 4 5 6 7 pshufd xm5, xm1, q0321 ; 5 6 7 _ punpcklwd xm2, xm1, xm5 ; 45 56 punpckhwd xm4, xm1, xm5 ; 67 __ jmp .w2_loop %endif .w4: mov myd, mym mova [rsp+0x00], m12 %if isput mova [rsp+0x20], xm13 %else SWAP m11, m13 %endif mova [rsp+0x30], xm7 vbroadcasti128 m7, [base+rescale_mul] movzx t0d, t0b sub srcq, 2 movd xm15, t0d pmaddwd m8, m7 vpbroadcastq m2, [base+pq_0x40000000+1] vpbroadcastd xm15, xm15 SWAP m13, m10 paddd m13, m8 ; mx+dx*[0-3] pand m6, m13 psrld m6, 6 paddd xm15, xm6 movd r4d, xm15 pextrd r6d, xm15, 1 pextrd r11d, xm15, 2 pextrd r13d, xm15, 3 vbroadcasti128 m5, [base+bdct_lb_q+ 0] vbroadcasti128 m1, [base+bdct_lb_q+16] vbroadcasti128 m0, [base+subpel_s_shuf2] vpbroadcastd xm14, [base+subpel_filters+r4*8+2] vpbroadcastd xm7, [base+subpel_filters+r6*8+2] vpbroadcastd xm15, [base+subpel_filters+r11*8+2] vpbroadcastd xm8, [base+subpel_filters+r13*8+2] pcmpeqd m6, m9 punpckldq m10, m6, m6 punpckhdq m6, m6 psrld m13, 10 paddd m13, m13 vpblendd xm14, xm7, 0xa vpblendd xm15, xm8, 0xa pmovsxbw m14, xm14 pmovsxbw m15, xm15 pblendvb m14, m2, m10 
pblendvb m15, m2, m6 pextrd r4, xm13, 2 pshufb m12, m13, m5 pshufb m13, m1 lea r6, [r4+ssq*1] lea r11, [r4+ssq*2] lea r13, [r4+ss3q ] movu xm7, [srcq+ssq*0] movu xm9, [srcq+ssq*1] movu xm8, [srcq+ssq*2] movu xm10, [srcq+ss3q ] movu xm1, [srcq+r4 ] movu xm3, [srcq+r6 ] movu xm2, [srcq+r11 ] movu xm4, [srcq+r13 ] lea srcq, [srcq+ssq*4] vinserti128 m7, [srcq+ssq*0], 1 vinserti128 m9, [srcq+ssq*1], 1 vinserti128 m8, [srcq+ssq*2], 1 vinserti128 m10, [srcq+ss3q ], 1 vinserti128 m1, [srcq+r4 ], 1 vinserti128 m3, [srcq+r6 ], 1 vinserti128 m2, [srcq+r11 ], 1 vinserti128 m4, [srcq+r13 ], 1 lea srcq, [srcq+ssq*4] vpbroadcastb m5, xm13 psubb m13, m5 paddb m12, m0 paddb m13, m0 REPX {pshufb x, m12}, m7, m9, m8, m10 REPX {pmaddwd x, m14}, m7, m9, m8, m10 REPX {pshufb x, m13}, m1, m2, m3, m4 REPX {pmaddwd x, m15}, m1, m2, m3, m4 mova m5, [rsp+0x00] movd xm6, [rsp+0x30] phaddd m7, m1 phaddd m9, m3 phaddd m8, m2 phaddd m10, m4 REPX {paddd x, m5}, m7, m9, m8, m10 REPX {psrad x, xm6}, m7, m9, m8, m10 packssdw m7, m9 ; 0 1 4 5 packssdw m8, m10 ; 2 3 6 7 vextracti128 xm9, m7, 1 ; 4 5 vextracti128 xm3, m8, 1 ; 6 7 shufps xm4, xm7, xm8, q1032 ; 1 2 shufps xm5, xm8, xm9, q1032 ; 3 4 shufps xm6, xm9, xm3, q1032 ; 5 6 psrldq xm10, xm3, 8 ; 7 _ punpcklwd xm0, xm7, xm4 ; 01 punpckhwd xm7, xm4 ; 12 punpcklwd xm1, xm8, xm5 ; 23 punpckhwd xm8, xm5 ; 34 punpcklwd xm2, xm9, xm6 ; 45 punpckhwd xm9, xm6 ; 56 punpcklwd xm3, xm10 ; 67 mova [rsp+0x40], xm7 mova [rsp+0x50], xm8 mova [rsp+0x60], xm9 .w4_loop: and myd, 0x3ff mov r11d, 64 << 24 mov r13d, myd shr r13d, 6 lea r13d, [t1+r13] cmovnz r11q, [base+subpel_filters+r13*8] movq xm9, r11q pmovsxbw xm9, xm9 pshufd xm7, xm9, q0000 pshufd xm8, xm9, q1111 pmaddwd xm4, xm0, xm7 pmaddwd xm5, xm1, xm8 pshufd xm7, xm9, q2222 pshufd xm9, xm9, q3333 pmaddwd xm6, xm2, xm7 pmaddwd xm8, xm3, xm9 %if isput mova xm7, [rsp+0x20] movd xm9, [rsp+0x38] %else SWAP m7, m11 %endif paddd xm4, xm5 paddd xm6, xm8 paddd xm4, xm6 paddd xm4, xm7 %if isput psrad xm4, xm9 packusdw xm4, xm4 pminuw xm4, xm11 movq [dstq], xm4 add dstq, dsq %else SWAP m11, m7 psrad xm4, 6 packssdw xm4, xm4 movq [tmpq], xm4 add tmpq, 8 %endif dec hd jz .ret add myd, dyd test myd, ~0x3ff jz .w4_loop mova xm8, [rsp+0x00] movd xm9, [rsp+0x30] movu xm4, [srcq] movu xm5, [srcq+r4] test myd, 0x400 jz .w4_skip_line mova xm0, [rsp+0x40] mova [rsp+0x40], xm1 mova xm1, [rsp+0x50] mova [rsp+0x50], xm2 mova xm2, [rsp+0x60] mova [rsp+0x60], xm3 pshufb xm4, xm12 pshufb xm5, xm13 pmaddwd xm4, xm14 pmaddwd xm5, xm15 phaddd xm4, xm5 paddd xm4, xm8 psrad xm4, xm9 packssdw xm4, xm4 punpcklwd xm3, xm10, xm4 mova xm10, xm4 add srcq, ssq jmp .w4_loop .w4_skip_line: movu xm6, [srcq+ssq*1] movu xm7, [srcq+r6] movu m0, [rsp+0x50] pshufb xm4, xm12 pshufb xm6, xm12 pshufb xm5, xm13 pshufb xm7, xm13 pmaddwd xm4, xm14 pmaddwd xm6, xm14 pmaddwd xm5, xm15 pmaddwd xm7, xm15 mova [rsp+0x40], m0 phaddd xm4, xm5 phaddd xm6, xm7 paddd xm4, xm8 paddd xm6, xm8 psrad xm4, xm9 psrad xm6, xm9 packssdw xm4, xm6 punpcklwd xm9, xm10, xm4 mova [rsp+0x60], xm9 psrldq xm10, xm4, 8 mova xm0, xm1 mova xm1, xm2 mova xm2, xm3 punpcklwd xm3, xm4, xm10 lea srcq, [srcq+ssq*2] jmp .w4_loop SWAP m10, m13 %if isprep SWAP m13, m11 %endif .w8: mov dword [rsp+0x80], 1 movifprep tmp_stridem, 16 jmp .w_start .w16: mov dword [rsp+0x80], 2 movifprep tmp_stridem, 32 jmp .w_start .w32: mov dword [rsp+0x80], 4 movifprep tmp_stridem, 64 jmp .w_start .w64: mov dword [rsp+0x80], 8 movifprep tmp_stridem, 128 jmp .w_start .w128: mov dword [rsp+0x80], 16 movifprep tmp_stridem, 256 .w_start: SWAP 
m10, m12, m1 SWAP m11, m7 ; m1=mx, m7=pxmax, m10=h_rnd, m11=h_sh, m12=free %if isput movifnidn dsm, dsq mova [rsp+0xb0], xm7 %endif mova [rsp+0x00], m10 mova [rsp+0x20], m13 shr t0d, 16 sub srcq, 6 pmaddwd m8, [base+rescale_mul2] movd xm15, t0d mov [rsp+0x84], t0d mov [rsp+0x88], srcq mov [rsp+0x90], r0q ; dstq / tmpq %if UNIX64 mov hm, hd %endif shl dword dxm, 3 ; dx*8 vpbroadcastd m15, xm15 paddd m1, m8 ; mx+dx*[0-7] jmp .hloop .hloop_prep: dec dword [rsp+0x80] jz .ret add qword [rsp+0x90], 16 mov hd, hm vpbroadcastd m8, dxm vpbroadcastd m6, [base+pd_0x3ff] paddd m1, m8, [rsp+0x40] vpbroadcastd m15, [rsp+0x84] pxor m9, m9 mov srcq, [rsp+0x88] mov r0q, [rsp+0x90] ; dstq / tmpq .hloop: vpbroadcastq xm2, [base+pq_0x40000000] pand m5, m1, m6 psrld m5, 6 paddd m15, m5 pcmpeqd m5, m9 vextracti128 xm7, m15, 1 movq r6, xm15 pextrq r9, xm15, 1 movq r11, xm7 pextrq rX, xm7, 1 mov r4d, r6d shr r6, 32 mov r7d, r9d shr r9, 32 mov r10d, r11d shr r11, 32 mov r13d, rXd shr rX, 32 mova [rsp+0x40], m1 movq xm12, [base+subpel_filters+ r4*8] movq xm13, [base+subpel_filters+ r6*8] movhps xm12, [base+subpel_filters+ r7*8] movhps xm13, [base+subpel_filters+ r9*8] movq xm14, [base+subpel_filters+r10*8] movq xm15, [base+subpel_filters+r11*8] movhps xm14, [base+subpel_filters+r13*8] movhps xm15, [base+subpel_filters+ rX*8] psrld m1, 10 vextracti128 xm7, m1, 1 vextracti128 xm6, m5, 1 movq [rsp+0xa0], xm1 movq [rsp+0xa8], xm7 movq r6, xm1 pextrq r11, xm1, 1 movq r9, xm7 pextrq rX, xm7, 1 mov r4d, r6d shr r6, 32 mov r10d, r11d shr r11, 32 mov r7d, r9d shr r9, 32 mov r13d, rXd shr rX, 32 pshufd xm4, xm5, q2200 pshufd xm5, xm5, q3311 pshufd xm7, xm6, q2200 pshufd xm6, xm6, q3311 pblendvb xm12, xm2, xm4 pblendvb xm13, xm2, xm5 pblendvb xm14, xm2, xm7 pblendvb xm15, xm2, xm6 pmovsxbw m12, xm12 pmovsxbw m13, xm13 pmovsxbw m14, xm14 pmovsxbw m15, xm15 MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7 ; 0a 1a 0b 1b mova [rsp+0x60], m0 MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8 ; 2a 3a 2b 3b MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9 ; 4a 5a 4b 5b MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 0 ; 6a 7a 6b 7b mova m0, [rsp+0x60] vbroadcasti128 m9, [base+subpel_s_shuf8] mov myd, mym mov dyd, dym pshufb m0, m9 ; 01a 01b pshufb m1, m9 ; 23a 23b pshufb m2, m9 ; 45a 45b pshufb m3, m9 ; 67a 67b .vloop: and myd, 0x3ff mov r6d, 64 << 24 mov r4d, myd shr r4d, 6 lea r4d, [t1+r4] cmovnz r6q, [base+subpel_filters+r4*8] movq xm9, r6q punpcklqdq xm9, xm9 pmovsxbw m9, xm9 pshufd m8, m9, q0000 pshufd m7, m9, q1111 pmaddwd m4, m0, m8 pmaddwd m5, m1, m7 pshufd m8, m9, q2222 pshufd m9, m9, q3333 pmaddwd m6, m2, m8 pmaddwd m7, m3, m9 %if isput psrldq xm8, xm11, 8 %endif paddd m4, [rsp+0x20] paddd m6, m7 paddd m4, m5 paddd m4, m6 %if isput psrad m4, xm8 vextracti128 xm5, m4, 1 packusdw xm4, xm5 pminsw xm4, [rsp+0xb0] mova [dstq], xm4 add dstq, dsm %else psrad m4, 6 vextracti128 xm5, m4, 1 packssdw xm4, xm5 mova [tmpq], xm4 add tmpq, tmp_stridem %endif dec hd jz .hloop_prep add myd, dyd test myd, ~0x3ff jz .vloop test myd, 0x400 mov [rsp+0x60], myd mov r4d, [rsp+0xa0] mov r6d, [rsp+0xa4] mov r7d, [rsp+0xa8] mov r9d, [rsp+0xac] jz .skip_line vbroadcasti128 m9, [base+wswap] movu xm4, [srcq+ r4*2] movu xm5, [srcq+ r6*2] movu xm6, [srcq+ r7*2] movu xm7, [srcq+ r9*2] vinserti128 m4, [srcq+r10*2], 1 vinserti128 m5, [srcq+r11*2], 1 vinserti128 m6, [srcq+r13*2], 1 vinserti128 m7, [srcq+ rX*2], 1 add srcq, ssq mov myd, [rsp+0x60] mov dyd, dym pshufb m0, m9 pshufb m1, m9 pshufb m2, m9 pshufb m3, m9 pmaddwd m4, m12 pmaddwd m5, m13 pmaddwd m6, m14 pmaddwd m7, m15 phaddd m4, m5 
phaddd m6, m7 phaddd m4, m6 paddd m4, m10 psrad m4, xm11 pslld m4, 16 pblendw m0, m1, 0xaa pblendw m1, m2, 0xaa pblendw m2, m3, 0xaa pblendw m3, m4, 0xaa jmp .vloop .skip_line: mova m0, m1 mova m1, m2 mova m2, m3 MC_8TAP_SCALED_H 3, 10, 4, 5, 6, 7, 8, 9, 1 vbroadcasti128 m9, [base+subpel_s_shuf8] mov myd, [rsp+0x60] mov dyd, dym pshufb m3, m9 jmp .vloop SWAP m1, m12, m10 SWAP m7, m11 .dy1: movzx wd, word [base+%1_8tap_scaled_avx2_dy1_table+wq*2] add wq, base_reg jmp wq %if isput .dy1_w2: mov myd, mym movzx t0d, t0b sub srcq, 2 movd xm15, t0d punpckldq m8, m9, m8 paddd m10, m8 ; mx+dx*[0-1] vpbroadcastd xm14, [base+pq_0x40000000+2] vpbroadcastd xm15, xm15 pand xm8, xm10, xm6 psrld xm8, 6 paddd xm15, xm8 movd r4d, xm15 pextrd r6d, xm15, 1 vbroadcasti128 m5, [base+bdct_lb_q] vbroadcasti128 m6, [base+subpel_s_shuf2] vpbroadcastd m15, [base+subpel_filters+r4*8+2] vpbroadcastd m4, [base+subpel_filters+r6*8+2] pcmpeqd xm8, xm9 psrld m10, 10 paddd m10, m10 movu xm0, [srcq+ssq*0] movu xm1, [srcq+ssq*1] movu xm2, [srcq+ssq*2] movu xm3, [srcq+ss3q ] lea srcq, [srcq+ssq*4] shr myd, 6 mov r4d, 64 << 24 lea myd, [t1+myq] cmovnz r4q, [base+subpel_filters+myq*8] pshufb m10, m5 paddb m10, m6 vpblendd xm15, xm4, 0xa pblendvb xm15, xm14, xm8 pmovsxbw m15, xm15 vinserti128 m0, [srcq+ssq*0], 1 vinserti128 m1, [srcq+ssq*1], 1 vinserti128 m2, [srcq+ssq*2], 1 add srcq, ss3q movq xm6, r4q pmovsxbw xm6, xm6 pshufd xm8, xm6, q0000 pshufd xm9, xm6, q1111 pshufd xm14, xm6, q2222 pshufd xm6, xm6, q3333 REPX {pshufb x, m10}, m0, m1, m2 pshufb xm3, xm10 REPX {pmaddwd x, m15}, m0, m1, m2 pmaddwd xm3, xm15 phaddd m0, m1 phaddd m2, m3 paddd m0, m12 paddd m2, m12 psrad m0, xm7 psrad m2, xm7 packssdw m0, m2 vextracti128 xm1, m0, 1 palignr xm2, xm1, xm0, 4 pshufd xm4, xm1, q2121 punpcklwd xm3, xm0, xm2 ; 01 12 punpckhwd xm0, xm2 ; 23 34 punpcklwd xm2, xm1, xm4 ; 45 56 .dy1_w2_loop: movu xm1, [srcq+ssq*0] movu xm5, [srcq+ssq*1] lea srcq, [srcq+ssq*2] pshufb xm1, xm10 pshufb xm5, xm10 pmaddwd xm1, xm15 pmaddwd xm5, xm15 phaddd xm1, xm5 pmaddwd xm5, xm3, xm8 mova xm3, xm0 pmaddwd xm0, xm9 paddd xm1, xm12 psrad xm1, xm7 packssdw xm1, xm1 paddd xm5, xm0 mova xm0, xm2 pmaddwd xm2, xm14 paddd xm5, xm2 palignr xm2, xm1, xm4, 12 punpcklwd xm2, xm1 ; 67 78 pmaddwd xm4, xm2, xm6 paddd xm5, xm13 paddd xm5, xm4 mova xm4, xm1 psrldq xm1, xm7, 8 psrad xm5, xm1 packusdw xm5, xm5 pminsw xm5, xm11 movd [dstq+dsq*0], xm5 pextrd [dstq+dsq*1], xm5, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .dy1_w2_loop RET %endif .dy1_w4: mov myd, mym %if isput mova [rsp+0x50], xm11 %endif mova [rsp+0x00], m12 mova [rsp+0x20], m13 mova [rsp+0x40], xm7 vbroadcasti128 m7, [base+rescale_mul] movzx t0d, t0b sub srcq, 2 movd xm15, t0d pmaddwd m8, m7 vpbroadcastq m2, [base+pq_0x40000000+1] vpbroadcastd xm15, xm15 SWAP m13, m10 paddd m13, m8 ; mx+dx*[0-3] pand m6, m13 psrld m6, 6 paddd xm15, xm6 movd r4d, xm15 pextrd r6d, xm15, 1 pextrd r11d, xm15, 2 pextrd r13d, xm15, 3 vbroadcasti128 m5, [base+bdct_lb_q+ 0] vbroadcasti128 m1, [base+bdct_lb_q+16] vbroadcasti128 m4, [base+subpel_s_shuf2] vpbroadcastd xm14, [base+subpel_filters+r4*8+2] vpbroadcastd xm7, [base+subpel_filters+r6*8+2] vpbroadcastd xm15, [base+subpel_filters+r11*8+2] vpbroadcastd xm8, [base+subpel_filters+r13*8+2] pcmpeqd m6, m9 punpckldq m10, m6, m6 punpckhdq m6, m6 psrld m13, 10 paddd m13, m13 vpblendd xm14, xm7, 0xa vpblendd xm15, xm8, 0xa pmovsxbw m14, xm14 pmovsxbw m15, xm15 pblendvb m14, m2, m10 pblendvb m15, m2, m6 pextrd r4, xm13, 2 pshufb m12, m13, m5 pshufb m13, m1 lea r6, [r4+ssq*2] lea r11, [r4+ssq*1] 
lea r13, [r4+ss3q ] movu xm0, [srcq+ssq*0] movu xm7, [srcq+r4 ] movu xm1, [srcq+ssq*2] movu xm8, [srcq+r6 ] vinserti128 m0, [srcq+ssq*1], 1 ; 0 1 vinserti128 m7, [srcq+r11 ], 1 vinserti128 m1, [srcq+ss3q ], 1 ; 2 3 vinserti128 m8, [srcq+r13 ], 1 lea srcq, [srcq+ssq*4] movu xm2, [srcq+ssq*0] movu xm9, [srcq+r4 ] movu xm3, [srcq+ssq*2] ; 6 _ movu xm10, [srcq+r6 ] vinserti128 m2, [srcq+ssq*1], 1 ; 4 5 vinserti128 m9, [srcq+r11 ], 1 lea srcq, [srcq+ss3q ] vpbroadcastb m5, xm13 psubb m13, m5 paddb m12, m4 paddb m13, m4 mova m5, [rsp+0x00] movd xm6, [rsp+0x40] pshufb m0, m12 pshufb m1, m12 pmaddwd m0, m14 pmaddwd m1, m14 pshufb m7, m13 pshufb m8, m13 pmaddwd m7, m15 pmaddwd m8, m15 pshufb m2, m12 pshufb xm3, xm12 pmaddwd m2, m14 pmaddwd xm3, xm14 pshufb m9, m13 pshufb xm10, xm13 pmaddwd m9, m15 pmaddwd xm10, xm15 phaddd m0, m7 phaddd m1, m8 phaddd m2, m9 phaddd xm3, xm10 paddd m0, m5 paddd m1, m5 paddd m2, m5 paddd xm3, xm5 psrad m0, xm6 psrad m1, xm6 psrad m2, xm6 psrad xm3, xm6 vperm2i128 m4, m0, m1, 0x21 ; 1 2 vperm2i128 m5, m1, m2, 0x21 ; 3 4 vperm2i128 m6, m2, m3, 0x21 ; 5 6 shr myd, 6 mov r13d, 64 << 24 lea myd, [t1+myq] cmovnz r13q, [base+subpel_filters+myq*8] pslld m4, 16 pslld m5, 16 pslld m6, 16 pblendw m0, m4, 0xaa ; 01 12 pblendw m1, m5, 0xaa ; 23 34 pblendw m2, m6, 0xaa ; 45 56 movq xm10, r13q punpcklqdq xm10, xm10 pmovsxbw m10, xm10 pshufd m7, m10, q0000 pshufd m8, m10, q1111 pshufd m9, m10, q2222 pshufd m10, m10, q3333 .dy1_w4_loop: movu xm11, [srcq+ssq*0] movu xm6, [srcq+r4 ] vinserti128 m11, [srcq+ssq*1], 1 vinserti128 m6, [srcq+r11 ], 1 lea srcq, [srcq+ssq*2] pmaddwd m4, m0, m7 pmaddwd m5, m1, m8 pshufb m11, m12 pshufb m6, m13 pmaddwd m11, m14 pmaddwd m6, m15 paddd m4, [rsp+0x20] phaddd m11, m6 pmaddwd m6, m2, m9 paddd m11, [rsp+0x00] psrad m11, [rsp+0x40] mova m0, m1 mova m1, m2 paddd m5, m6 paddd m4, m5 vinserti128 m2, m3, xm11, 1 pslld m3, m11, 16 pblendw m2, m3, 0xaa ; 67 78 pmaddwd m5, m2, m10 vextracti128 xm3, m11, 1 paddd m4, m5 %if isput psrad m4, [rsp+0x48] vextracti128 xm5, m4, 1 packusdw xm4, xm5 pminsw xm4, [rsp+0x50] movq [dstq+dsq*0], xm4 movhps [dstq+dsq*1], xm4 lea dstq, [dstq+dsq*2] %else psrad m4, 6 vextracti128 xm5, m4, 1 packssdw xm4, xm5 mova [tmpq], xm4 add tmpq, 16 %endif sub hd, 2 jg .dy1_w4_loop MC_8TAP_SCALED_RET SWAP m10, m13 .dy1_w8: mov dword [rsp+0xa0], 1 movifprep tmp_stridem, 16 jmp .dy1_w_start .dy1_w16: mov dword [rsp+0xa0], 2 movifprep tmp_stridem, 32 jmp .dy1_w_start .dy1_w32: mov dword [rsp+0xa0], 4 movifprep tmp_stridem, 64 jmp .dy1_w_start .dy1_w64: mov dword [rsp+0xa0], 8 movifprep tmp_stridem, 128 jmp .dy1_w_start .dy1_w128: mov dword [rsp+0xa0], 16 movifprep tmp_stridem, 256 .dy1_w_start: SWAP m10, m12, m1 SWAP m11, m7 ; m1=mx, m7=pxmax, m10=h_rnd, m11=h_sh, m12=free mov myd, mym %if isput %define dsm [rsp+0xb8] movifnidn dsm, dsq mova [rsp+0xc0], xm7 %else %if UNIX64 %define hm [rsp+0xb8] %endif %endif mova [rsp+0x00], m10 mova [rsp+0x20], m13 mova [rsp+0x40], xm11 shr t0d, 16 sub srcq, 6 shr myd, 6 mov r4d, 64 << 24 lea myd, [t1+myq] cmovnz r4q, [base+subpel_filters+myq*8] pmaddwd m8, [base+rescale_mul2] movd xm15, t0d mov [rsp+0xa4], t0d mov [rsp+0xa8], srcq mov [rsp+0xb0], r0q ; dstq / tmpq %if UNIX64 mov hm, hd %endif shl dword dxm, 3 ; dx*8 vpbroadcastd m15, xm15 paddd m1, m8 ; mx+dx*[0-7] movq xm0, r4q pmovsxbw xm0, xm0 mova [rsp+0x50], xm0 jmp .dy1_hloop .dy1_hloop_prep: dec dword [rsp+0xa0] jz .ret add qword [rsp+0xb0], 16 mov hd, hm vpbroadcastd m8, dxm vpbroadcastd m6, [base+pd_0x3ff] paddd m1, m8, [rsp+0x60] vpbroadcastd 
m15, [rsp+0xa4] pxor m9, m9 mov srcq, [rsp+0xa8] mov r0q, [rsp+0xb0] ; dstq / tmpq mova m10, [rsp+0x00] mova xm11, [rsp+0x40] .dy1_hloop: vpbroadcastq xm2, [base+pq_0x40000000] pand m5, m1, m6 psrld m5, 6 paddd m15, m5 pcmpeqd m5, m9 vextracti128 xm7, m15, 1 movq r6, xm15 pextrq r9, xm15, 1 movq r11, xm7 pextrq rX, xm7, 1 mov r4d, r6d shr r6, 32 mov r7d, r9d shr r9, 32 mov r10d, r11d shr r11, 32 mov r13d, rXd shr rX, 32 mova [rsp+0x60], m1 movq xm12, [base+subpel_filters+ r4*8] movq xm13, [base+subpel_filters+ r6*8] movhps xm12, [base+subpel_filters+ r7*8] movhps xm13, [base+subpel_filters+ r9*8] movq xm14, [base+subpel_filters+r10*8] movq xm15, [base+subpel_filters+r11*8] movhps xm14, [base+subpel_filters+r13*8] movhps xm15, [base+subpel_filters+ rX*8] psrld m1, 10 vextracti128 xm7, m1, 1 vextracti128 xm6, m5, 1 movq r6, xm1 pextrq r11, xm1, 1 movq r9, xm7 pextrq rX, xm7, 1 mov r4d, r6d shr r6, 32 mov r10d, r11d shr r11, 32 mov r7d, r9d shr r9, 32 mov r13d, rXd shr rX, 32 pshufd xm4, xm5, q2200 pshufd xm5, xm5, q3311 pshufd xm7, xm6, q2200 pshufd xm6, xm6, q3311 pblendvb xm12, xm2, xm4 pblendvb xm13, xm2, xm5 pblendvb xm14, xm2, xm7 pblendvb xm15, xm2, xm6 pmovsxbw m12, xm12 pmovsxbw m13, xm13 pmovsxbw m14, xm14 pmovsxbw m15, xm15 MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7 ; 0a 1a 0b 1b mova [rsp+0x80], m0 MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8 ; 2a 3a 2b 3b MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9 ; 4a 5a 4b 5b MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 0 ; 6a 7a 6b 7b mova m0, [rsp+0x80] vbroadcasti128 m7, [base+subpel_s_shuf8] vpbroadcastd m8, [rsp+0x50] vpbroadcastd m9, [rsp+0x54] vpbroadcastd m10, [rsp+0x58] vpbroadcastd m11, [rsp+0x5c] pshufb m0, m7 ; 01a 01b pshufb m1, m7 ; 23a 23b pshufb m2, m7 ; 45a 45b pshufb m3, m7 ; 67a 67b .dy1_vloop: pmaddwd m4, m0, m8 pmaddwd m5, m1, m9 pmaddwd m6, m2, m10 pmaddwd m7, m3, m11 paddd m4, [rsp+0x20] paddd m6, m7 paddd m4, m5 paddd m4, m6 %if isput psrad m4, [rsp+0x48] vextracti128 xm5, m4, 1 packusdw xm4, xm5 pminsw xm4, [rsp+0xc0] mova [dstq], xm4 add dstq, dsm %else psrad m4, 6 vextracti128 xm5, m4, 1 packssdw xm4, xm5 mova [tmpq], xm4 add tmpq, tmp_stridem %endif dec hd jz .dy1_hloop_prep vbroadcasti128 m7, [base+wswap] pshufb m0, m7 pshufb m1, m7 pshufb m2, m7 pshufb m3, m7 movu xm4, [srcq+ r4*2] movu xm5, [srcq+ r6*2] movu xm6, [srcq+ r7*2] movu xm7, [srcq+ r9*2] vinserti128 m4, [srcq+r10*2], 1 vinserti128 m5, [srcq+r11*2], 1 vinserti128 m6, [srcq+r13*2], 1 vinserti128 m7, [srcq+ rX*2], 1 add srcq, ssq pmaddwd m4, m12 pmaddwd m5, m13 pmaddwd m6, m14 pmaddwd m7, m15 phaddd m4, m5 phaddd m6, m7 phaddd m4, m6 paddd m4, [rsp+0x00] psrad m4, [rsp+0x40] pslld m4, 16 pblendw m0, m1, 0xaa pblendw m1, m2, 0xaa pblendw m2, m3, 0xaa pblendw m3, m4, 0xaa jmp .dy1_vloop SWAP m1, m12, m10 SWAP m7, m11 .dy2: movzx wd, word [base+%1_8tap_scaled_avx2_dy2_table+wq*2] add wq, base_reg jmp wq %if isput .dy2_w2: mov myd, mym movzx t0d, t0b sub srcq, 2 movd xm15, t0d punpckldq m8, m9, m8 paddd m10, m8 ; mx+dx*[0-1] vpbroadcastd xm14, [base+pq_0x40000000+2] vpbroadcastd xm15, xm15 pand xm8, xm10, xm6 psrld xm8, 6 paddd xm15, xm8 movd r4d, xm15 pextrd r6d, xm15, 1 vbroadcasti128 m5, [base+bdct_lb_q] vbroadcasti128 m6, [base+subpel_s_shuf2] vpbroadcastd xm15, [base+subpel_filters+r4*8+2] vpbroadcastd xm4, [base+subpel_filters+r6*8+2] pcmpeqd xm8, xm9 psrld m10, 10 paddd m10, m10 movu xm0, [srcq+ssq*0] movu xm1, [srcq+ssq*2] movu xm2, [srcq+ssq*4] pshufb m10, m5 paddb m10, m6 vpblendd xm15, xm4, 0xa pblendvb xm15, xm14, xm8 pmovsxbw m15, xm15 vinserti128 m0, [srcq+ssq*1], 
1 ; 0 1 vinserti128 m1, [srcq+ss3q ], 1 ; 2 3 lea srcq, [srcq+ssq*4] vinserti128 m2, [srcq+ssq*1], 1 ; 4 5 lea srcq, [srcq+ssq*2] shr myd, 6 mov r4d, 64 << 24 lea myd, [t1+myq] cmovnz r4q, [base+subpel_filters+myq*8] pshufb m0, m10 pshufb m1, m10 pshufb m2, m10 pmaddwd m0, m15 pmaddwd m1, m15 pmaddwd m2, m15 movq xm6, r4q pmovsxbw xm6, xm6 phaddd m0, m1 phaddd m1, m2 paddd m0, m12 paddd m1, m12 psrad m0, xm7 psrad m1, xm7 packssdw m0, m1 ; 0 2 2 4 1 3 3 5 vextracti128 xm1, m0, 1 pshufd xm8, xm6, q0000 pshufd xm9, xm6, q1111 pshufd xm14, xm6, q2222 pshufd xm6, xm6, q3333 punpcklwd xm2, xm0, xm1 ; 01 23 punpckhwd xm1, xm0, xm1 ; 23 45 .dy2_w2_loop: movu xm3, [srcq+ssq*0] movu xm5, [srcq+ssq*2] vinserti128 m3, [srcq+ssq*1], 1 ; 6 7 vinserti128 m5, [srcq+ss3q ], 1 ; 8 9 lea srcq, [srcq+ssq*4] pmaddwd xm4, xm2, xm8 pmaddwd xm1, xm9 pshufb m3, m10 pshufb m5, m10 pmaddwd m3, m15 pmaddwd m5, m15 phaddd m3, m5 paddd xm4, xm1 paddd m3, m12 psrad m3, xm7 packssdw m3, m3 pshufd m3, m3, q2100 palignr m0, m3, m0, 12 ; 4 6 6 8 5 7 7 9 vextracti128 xm1, m0, 1 punpcklwd xm2, xm0, xm1 ; 45 67 punpckhwd xm1, xm0, xm1 ; 67 89 pmaddwd xm3, xm2, xm14 pmaddwd xm5, xm1, xm6 paddd xm4, xm13 paddd xm4, xm3 psrldq xm3, xm7, 8 paddd xm4, xm5 psrad xm4, xm3 packusdw xm4, xm4 pminsw xm4, xm11 movd [dstq+dsq*0], xm4 pextrd [dstq+dsq*1], xm4, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .dy2_w2_loop RET %endif .dy2_w4: mov myd, mym %if isput mova [rsp+0x50], xm11 %endif mova [rsp+0x00], m12 mova [rsp+0x20], m13 mova [rsp+0x40], xm7 vbroadcasti128 m7, [base+rescale_mul] movzx t0d, t0b sub srcq, 2 movd xm15, t0d pmaddwd m8, m7 vpbroadcastq m2, [base+pq_0x40000000+1] vpbroadcastd xm15, xm15 SWAP m13, m10 paddd m13, m8 ; mx+dx*[0-3] pand m6, m13 psrld m6, 6 paddd xm15, xm6 movd r4d, xm15 pextrd r6d, xm15, 1 pextrd r11d, xm15, 2 pextrd r13d, xm15, 3 vbroadcasti128 m5, [base+bdct_lb_q+ 0] vbroadcasti128 m1, [base+bdct_lb_q+16] vbroadcasti128 m4, [base+subpel_s_shuf2] vpbroadcastd xm14, [base+subpel_filters+r4*8+2] vpbroadcastd xm7, [base+subpel_filters+r6*8+2] vpbroadcastd xm15, [base+subpel_filters+r11*8+2] vpbroadcastd xm8, [base+subpel_filters+r13*8+2] shr myd, 6 mov r13d, 64 << 24 lea myd, [t1+myq] cmovnz r13q, [base+subpel_filters+myq*8] pcmpeqd m6, m9 punpckldq m11, m6, m6 punpckhdq m6, m6 psrld m13, 10 paddd m13, m13 vpblendd xm14, xm7, 0xa vpblendd xm15, xm8, 0xa pmovsxbw m14, xm14 pmovsxbw m15, xm15 movq xm10, r13q pblendvb m14, m2, m11 pblendvb m15, m2, m6 pextrd r4, xm13, 2 pshufb m12, m13, m5 pshufb m13, m1 lea r6, [r4+ssq*1] lea r11, [r4+ssq*2] lea r13, [r4+ss3q ] movu xm0, [srcq+ssq*0] movu xm7, [srcq+r4 ] movu xm1, [srcq+ssq*1] movu xm8, [srcq+r6 ] vinserti128 m0, [srcq+ssq*2], 1 ; 0 2 vinserti128 m7, [srcq+r11 ], 1 vinserti128 m1, [srcq+ss3q ], 1 ; 1 3 vinserti128 m8, [srcq+r13 ], 1 lea srcq, [srcq+ssq*4] movu xm2, [srcq+ssq*0] movu xm9, [srcq+r4 ] vinserti128 m2, [srcq+ssq*1], 1 ; 4 5 vinserti128 m9, [srcq+r6 ], 1 lea srcq, [srcq+ssq*2] vpbroadcastb m5, xm13 psubb m13, m5 paddb m12, m4 paddb m13, m4 mova m5, [rsp+0x00] movd xm6, [rsp+0x40] pshufb m0, m12 pshufb m1, m12 pshufb m2, m12 pmaddwd m0, m14 pmaddwd m1, m14 pmaddwd m2, m14 pshufb m7, m13 pshufb m8, m13 pshufb m9, m13 pmaddwd m7, m15 pmaddwd m8, m15 pmaddwd m9, m15 punpcklqdq xm10, xm10 pmovsxbw m10, xm10 phaddd m0, m7 phaddd m1, m8 phaddd m2, m9 paddd m0, m5 paddd m1, m5 paddd m2, m5 psrad m0, xm6 psrad m1, xm6 psrad m2, xm6 vperm2i128 m3, m0, m2, 0x21 ; 2 4 vperm2i128 m2, m1, 0x13 ; 3 5 pshufd m7, m10, q0000 pshufd m8, m10, q1111 pshufd m9, m10, q2222 pshufd 
m10, m10, q3333 packssdw m0, m3 ; 0 2 2 4 packssdw m1, m2 ; 1 3 3 5 punpckhwd m2, m0, m1 ; 23 45 punpcklwd m0, m1 ; 01 23 .dy2_w4_loop: movu xm1, [srcq+ssq*0] movu xm6, [srcq+r4 ] movu xm3, [srcq+ssq*1] movu xm11, [srcq+r6 ] vinserti128 m1, [srcq+ssq*2], 1 ; 6 8 vinserti128 m6, [srcq+r11 ], 1 vinserti128 m3, [srcq+ss3q ], 1 ; 7 9 vinserti128 m11, [srcq+r13 ], 1 lea srcq, [srcq+ssq*4] pmaddwd m4, m0, m7 pmaddwd m5, m2, m8 pshufb m1, m12 pshufb m3, m12 pmaddwd m1, m14 pmaddwd m3, m14 mova m0, [rsp+0x00] pshufb m6, m13 pshufb m11, m13 pmaddwd m6, m15 pmaddwd m11, m15 paddd m4, m5 movd xm5, [rsp+0x40] phaddd m1, m6 phaddd m3, m11 paddd m1, m0 paddd m3, m0 psrad m1, xm5 psrad m3, xm5 pslld m3, 16 pblendw m1, m3, 0xaa ; 67 89 vperm2i128 m0, m2, m1, 0x21 ; 45 67 paddd m4, [rsp+0x20] mova m2, m1 pmaddwd m5, m0, m9 pmaddwd m6, m2, m10 paddd m4, m5 paddd m4, m6 %if isput psrad m4, [rsp+0x48] vextracti128 xm5, m4, 1 packusdw xm4, xm5 pminsw xm4, [rsp+0x50] movq [dstq+dsq*0], xm4 movhps [dstq+dsq*1], xm4 lea dstq, [dstq+dsq*2] %else psrad m4, 6 vextracti128 xm5, m4, 1 packssdw xm4, xm5 mova [tmpq], xm4 add tmpq, 16 %endif sub hd, 2 jg .dy2_w4_loop MC_8TAP_SCALED_RET SWAP m10, m13 .dy2_w8: mov dword [rsp+0xa0], 1 movifprep tmp_stridem, 16 jmp .dy2_w_start .dy2_w16: mov dword [rsp+0xa0], 2 movifprep tmp_stridem, 32 jmp .dy2_w_start .dy2_w32: mov dword [rsp+0xa0], 4 movifprep tmp_stridem, 64 jmp .dy2_w_start .dy2_w64: mov dword [rsp+0xa0], 8 movifprep tmp_stridem, 128 jmp .dy2_w_start .dy2_w128: mov dword [rsp+0xa0], 16 movifprep tmp_stridem, 256 .dy2_w_start: SWAP m10, m12, m1 SWAP m11, m7 ; m1=mx, m7=pxmax, m10=h_rnd, m11=h_sh, m12=free mov myd, mym %if isput movifnidn dsm, dsq mova [rsp+0xc0], xm7 %endif mova [rsp+0x00], m10 mova [rsp+0x20], m13 mova [rsp+0x40], xm11 shr t0d, 16 sub srcq, 6 shr myd, 6 mov r4d, 64 << 24 lea myd, [t1+myq] cmovnz r4q, [base+subpel_filters+myq*8] pmaddwd m8, [base+rescale_mul2] movd xm15, t0d mov [rsp+0xa4], t0d mov [rsp+0xa8], srcq mov [rsp+0xb0], r0q ; dstq / tmpq %if UNIX64 mov hm, hd %endif shl dword dxm, 3 ; dx*8 vpbroadcastd m15, xm15 paddd m1, m8 ; mx+dx*[0-7] movq xm0, r4q pmovsxbw xm0, xm0 mova [rsp+0x50], xm0 jmp .dy2_hloop .dy2_hloop_prep: dec dword [rsp+0xa0] jz .ret add qword [rsp+0xb0], 16 mov hd, hm vpbroadcastd m8, dxm vpbroadcastd m6, [base+pd_0x3ff] paddd m1, m8, [rsp+0x60] vpbroadcastd m15, [rsp+0xa4] pxor m9, m9 mov srcq, [rsp+0xa8] mov r0q, [rsp+0xb0] ; dstq / tmpq mova m10, [rsp+0x00] mova xm11, [rsp+0x40] .dy2_hloop: vpbroadcastq xm2, [base+pq_0x40000000] pand m5, m1, m6 psrld m5, 6 paddd m15, m5 pcmpeqd m5, m9 vextracti128 xm7, m15, 1 movq r6, xm15 pextrq r9, xm15, 1 movq r11, xm7 pextrq rX, xm7, 1 mov r4d, r6d shr r6, 32 mov r7d, r9d shr r9, 32 mov r10d, r11d shr r11, 32 mov r13d, rXd shr rX, 32 mova [rsp+0x60], m1 movq xm12, [base+subpel_filters+ r4*8] movq xm13, [base+subpel_filters+ r6*8] movhps xm12, [base+subpel_filters+ r7*8] movhps xm13, [base+subpel_filters+ r9*8] movq xm14, [base+subpel_filters+r10*8] movq xm15, [base+subpel_filters+r11*8] movhps xm14, [base+subpel_filters+r13*8] movhps xm15, [base+subpel_filters+ rX*8] psrld m1, 10 vextracti128 xm7, m1, 1 vextracti128 xm6, m5, 1 movq r6, xm1 pextrq r11, xm1, 1 movq r9, xm7 pextrq rX, xm7, 1 mov r4d, r6d shr r6, 32 mov r10d, r11d shr r11, 32 mov r7d, r9d shr r9, 32 mov r13d, rXd shr rX, 32 pshufd xm4, xm5, q2200 pshufd xm5, xm5, q3311 pshufd xm7, xm6, q2200 pshufd xm6, xm6, q3311 pblendvb xm12, xm2, xm4 pblendvb xm13, xm2, xm5 pblendvb xm14, xm2, xm7 pblendvb xm15, xm2, xm6 pmovsxbw m12, 
xm12 pmovsxbw m13, xm13 pmovsxbw m14, xm14 pmovsxbw m15, xm15 MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7 ; 0a 1a 0b 1b mova [rsp+0x80], m0 MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8 ; 2a 3a 2b 3b MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9 ; 4a 5a 4b 5b MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 0 ; 6a 7a 6b 7b mova m0, [rsp+0x80] vbroadcasti128 m7, [base+subpel_s_shuf8] vpbroadcastd m8, [rsp+0x50] vpbroadcastd m9, [rsp+0x54] vpbroadcastd m10, [rsp+0x58] vpbroadcastd m11, [rsp+0x5c] pshufb m0, m7 ; 01a 01b pshufb m1, m7 ; 23a 23b pshufb m2, m7 ; 45a 45b pshufb m3, m7 ; 67a 67b .dy2_vloop: pmaddwd m4, m0, m8 pmaddwd m5, m1, m9 pmaddwd m6, m2, m10 pmaddwd m7, m3, m11 paddd m4, [rsp+0x20] paddd m6, m7 paddd m4, m5 paddd m4, m6 %if isput psrad m4, [rsp+0x48] vextracti128 xm5, m4, 1 packusdw xm4, xm5 pminsw xm4, [rsp+0xc0] mova [dstq], xm4 add dstq, dsm %else psrad m4, 6 vextracti128 xm5, m4, 1 packssdw xm4, xm5 mova [tmpq], xm4 add tmpq, tmp_stridem %endif dec hd jz .dy2_hloop_prep mova m0, m1 mova m1, m2 mova m2, m3 movu xm3, [srcq+ r4*2] movu xm4, [srcq+ r6*2] movu xm5, [srcq+ r7*2] movu xm6, [srcq+ r9*2] vinserti128 m3, [srcq+r10*2], 1 vinserti128 m4, [srcq+r11*2], 1 vinserti128 m5, [srcq+r13*2], 1 vinserti128 m6, [srcq+ rX*2], 1 add srcq, ssq pmaddwd m3, m12 pmaddwd m4, m13 pmaddwd m5, m14 pmaddwd m6, m15 phaddd m3, m4 phaddd m5, m6 phaddd m3, m5 movu xm4, [srcq+ r4*2] movu xm5, [srcq+ r6*2] movu xm6, [srcq+ r7*2] movu xm7, [srcq+ r9*2] vinserti128 m4, [srcq+r10*2], 1 vinserti128 m5, [srcq+r11*2], 1 vinserti128 m6, [srcq+r13*2], 1 vinserti128 m7, [srcq+ rX*2], 1 add srcq, ssq pmaddwd m4, m12 pmaddwd m5, m13 pmaddwd m6, m14 pmaddwd m7, m15 phaddd m4, m5 phaddd m6, m7 mova m5, [rsp+0x00] movd xm7, [rsp+0x40] phaddd m4, m6 paddd m3, m5 paddd m4, m5 psrad m3, xm7 psrad m4, xm7 pslld m4, 16 pblendw m3, m4, 0xaa jmp .dy2_vloop .ret: MC_8TAP_SCALED_RET 0 %undef isput %undef isprep %endmacro %macro BILIN_SCALED_FN 1 cglobal %1_bilin_scaled_16bpc mov t0d, (5*15 << 16) | 5*15 mov t1d, t0d jmp mangle(private_prefix %+ _%1_8tap_scaled_16bpc %+ SUFFIX) %endmacro %if WIN64 DECLARE_REG_TMP 6, 5 %else DECLARE_REG_TMP 6, 8 %endif %define PUT_8TAP_SCALED_FN FN put_8tap_scaled, BILIN_SCALED_FN put PUT_8TAP_SCALED_FN sharp, SHARP, SHARP PUT_8TAP_SCALED_FN sharp_smooth, SHARP, SMOOTH PUT_8TAP_SCALED_FN smooth_sharp, SMOOTH, SHARP PUT_8TAP_SCALED_FN smooth, SMOOTH, SMOOTH PUT_8TAP_SCALED_FN sharp_regular, SHARP, REGULAR PUT_8TAP_SCALED_FN regular_sharp, REGULAR, SHARP PUT_8TAP_SCALED_FN smooth_regular, SMOOTH, REGULAR PUT_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH PUT_8TAP_SCALED_FN regular, REGULAR, REGULAR MC_8TAP_SCALED put %if WIN64 DECLARE_REG_TMP 5, 4 %else DECLARE_REG_TMP 6, 7 %endif %define PREP_8TAP_SCALED_FN FN prep_8tap_scaled, BILIN_SCALED_FN prep PREP_8TAP_SCALED_FN sharp, SHARP, SHARP PREP_8TAP_SCALED_FN sharp_smooth, SHARP, SMOOTH PREP_8TAP_SCALED_FN smooth_sharp, SMOOTH, SHARP PREP_8TAP_SCALED_FN smooth, SMOOTH, SMOOTH PREP_8TAP_SCALED_FN sharp_regular, SHARP, REGULAR PREP_8TAP_SCALED_FN regular_sharp, REGULAR, SHARP PREP_8TAP_SCALED_FN smooth_regular, SMOOTH, REGULAR PREP_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH PREP_8TAP_SCALED_FN regular, REGULAR, REGULAR MC_8TAP_SCALED prep %macro WARP_V 5 ; dst, 01, 23, 45, 67 lea tmp1d, [myq+deltaq*4] lea tmp2d, [myq+deltaq*1] shr myd, 10 shr tmp1d, 10 movq xm8, [filterq+myq *8] vinserti128 m8, [filterq+tmp1q*8], 1 ; a e lea tmp1d, [tmp2q+deltaq*4] lea myd, [tmp2q+deltaq*1] shr tmp2d, 10 shr tmp1d, 10 movq xm0, [filterq+tmp2q*8] vinserti128 m0, 
[filterq+tmp1q*8], 1 ; b f lea tmp1d, [myq+deltaq*4] lea tmp2d, [myq+deltaq*1] shr myd, 10 shr tmp1d, 10 movq xm9, [filterq+myq *8] vinserti128 m9, [filterq+tmp1q*8], 1 ; c g lea tmp1d, [tmp2q+deltaq*4] lea myd, [tmp2q+gammaq] ; my += gamma punpcklwd m8, m0 shr tmp2d, 10 shr tmp1d, 10 movq xm0, [filterq+tmp2q*8] vinserti128 m0, [filterq+tmp1q*8], 1 ; d h punpcklwd m0, m9, m0 punpckldq m9, m8, m0 punpckhdq m0, m8, m0 punpcklbw m8, m11, m9 ; a0 a1 b0 b1 c0 c1 d0 d1 << 8 punpckhbw m9, m11, m9 ; a2 a3 b2 b3 c2 c3 d2 d3 << 8 pmaddwd m%2, m8 pmaddwd m9, m%3 punpcklbw m8, m11, m0 ; a4 a5 b4 b5 c4 c5 d4 d5 << 8 punpckhbw m0, m11, m0 ; a6 a7 b6 b7 c6 c7 d6 d7 << 8 pmaddwd m8, m%4 pmaddwd m0, m%5 paddd m9, m%2 mova m%2, m%3 paddd m0, m8 mova m%3, m%4 mova m%4, m%5 paddd m%1, m0, m9 %endmacro cglobal warp_affine_8x8t_16bpc, 4, 14, 16, tmp, ts mov r6d, r7m lea r9, [$$] shr r6d, 11 vpbroadcastd m13, [r9-$$+warp8x8_shift+r6*4] vpbroadcastd m14, [warp8x8t_rnd] call mangle(private_prefix %+ _warp_affine_8x8_16bpc_avx2).main jmp .start .loop: call mangle(private_prefix %+ _warp_affine_8x8_16bpc_avx2).main2 lea tmpq, [tmpq+tsq*4] .start: paddd m7, m14 paddd m0, m14 psrad m7, 15 psrad m0, 15 packssdw m7, m0 vpermq m7, m7, q3120 mova [tmpq+tsq*0], xm7 vextracti128 [tmpq+tsq*2], m7, 1 dec r4d jg .loop .end: RET cglobal warp_affine_8x8_16bpc, 4, 14, 16, dst, ds, src, ss, abcd, mx, tmp2, \ alpha, beta, filter, tmp1, delta, \ my, gamma mov r6d, r7m lea filterq, [$$] shr r6d, 11 vpbroadcastd m13, [filterq-$$+warp8x8_shift+r6*4] vpbroadcastd m14, [filterq-$$+warp8x8_rnd +r6*4] vpbroadcastw m15, r7m ; pixel_max call .main jmp .start .loop: call .main2 lea dstq, [dstq+dsq*2] .start: psrad m7, 16 psrad m0, 16 packusdw m7, m0 pmulhrsw m7, m14 pminsw m7, m15 vpermq m7, m7, q3120 mova [dstq+dsq*0], xm7 vextracti128 [dstq+dsq*1], m7, 1 dec r4d jg .loop .end: RET ALIGN function_align .main: ; Stack args offset by one (r4m -> r5m etc.) 
due to call %if WIN64 mov abcdq, r5m mov mxd, r6m %endif movsx alphad, word [abcdq+2*0] movsx betad, word [abcdq+2*1] vpbroadcastd m12, [pd_32768] pxor m11, m11 add filterq, mc_warp_filter-$$ lea tmp1q, [ssq*3] add mxd, 512+(64<<10) lea tmp2d, [alphaq*3] sub srcq, tmp1q ; src -= src_stride*3 sub betad, tmp2d ; beta -= alpha*3 mov myd, r7m call .h psrld m1, m0, 16 call .h pblendw m1, m0, 0xaa ; 01 psrld m2, m0, 16 call .h pblendw m2, m0, 0xaa ; 12 psrld m3, m0, 16 call .h pblendw m3, m0, 0xaa ; 23 psrld m4, m0, 16 call .h pblendw m4, m0, 0xaa ; 34 psrld m5, m0, 16 call .h pblendw m5, m0, 0xaa ; 45 psrld m6, m0, 16 call .h pblendw m6, m0, 0xaa ; 56 movsx deltad, word [abcdq+2*2] movsx gammad, word [abcdq+2*3] add myd, 512+(64<<10) mov r4d, 4 lea tmp1d, [deltaq*3] sub gammad, tmp1d ; gamma -= delta*3 .main2: call .h psrld m7, m6, 16 pblendw m7, m0, 0xaa ; 67 WARP_V 7, 1, 3, 5, 7 call .h psrld m10, m5, 16 pblendw m10, m0, 0xaa ; 78 WARP_V 0, 2, 4, 6, 10 ret ALIGN function_align .h: lea tmp1d, [mxq+alphaq*4] lea tmp2d, [mxq+alphaq*1] movu xm10, [srcq-6] vinserti128 m10, [srcq+2], 1 shr mxd, 10 ; 0 shr tmp1d, 10 ; 4 movq xm0, [filterq+mxq *8] vinserti128 m0, [filterq+tmp1q*8], 1 lea tmp1d, [tmp2q+alphaq*4] lea mxd, [tmp2q+alphaq*1] movu xm8, [srcq-4] vinserti128 m8, [srcq+4], 1 shr tmp2d, 10 ; 1 shr tmp1d, 10 ; 5 movq xm9, [filterq+tmp2q*8] vinserti128 m9, [filterq+tmp1q*8], 1 lea tmp1d, [mxq+alphaq*4] lea tmp2d, [mxq+alphaq*1] shr mxd, 10 ; 2 shr tmp1d, 10 ; 6 punpcklbw m0, m11, m0 pmaddwd m0, m10 movu xm10, [srcq-2] vinserti128 m10, [srcq+6], 1 punpcklbw m9, m11, m9 pmaddwd m9, m8 movq xm8, [filterq+mxq *8] vinserti128 m8, [filterq+tmp1q*8], 1 lea tmp1d, [tmp2q+alphaq*4] lea mxd, [tmp2q+betaq] ; mx += beta phaddd m0, m9 ; 0 1 4 5 movu xm9, [srcq+0] vinserti128 m9, [srcq+8], 1 shr tmp2d, 10 ; 3 shr tmp1d, 10 ; 7 punpcklbw m8, m11, m8 pmaddwd m8, m10 movq xm10, [filterq+tmp2q*8] vinserti128 m10, [filterq+tmp1q*8], 1 punpcklbw m10, m11, m10 pmaddwd m9, m10 add srcq, ssq phaddd m8, m9 ; 2 3 6 7 phaddd m0, m8 ; 0 1 2 3 4 5 6 7 vpsllvd m0, m13 paddd m0, m12 ; rounded 14-bit result in upper 16 bits of dword ret %macro BIDIR_FN 0 call .main lea stride3q, [strideq*3] jmp wq .w4: movq [dstq ], xm0 movhps [dstq+strideq*1], xm0 vextracti128 xm0, m0, 1 movq [dstq+strideq*2], xm0 movhps [dstq+stride3q ], xm0 cmp hd, 4 je .ret lea dstq, [dstq+strideq*4] movq [dstq ], xm1 movhps [dstq+strideq*1], xm1 vextracti128 xm1, m1, 1 movq [dstq+strideq*2], xm1 movhps [dstq+stride3q ], xm1 cmp hd, 8 je .ret lea dstq, [dstq+strideq*4] movq [dstq ], xm2 movhps [dstq+strideq*1], xm2 vextracti128 xm2, m2, 1 movq [dstq+strideq*2], xm2 movhps [dstq+stride3q ], xm2 lea dstq, [dstq+strideq*4] movq [dstq ], xm3 movhps [dstq+strideq*1], xm3 vextracti128 xm3, m3, 1 movq [dstq+strideq*2], xm3 movhps [dstq+stride3q ], xm3 .ret: RET .w8: mova [dstq+strideq*0], xm0 vextracti128 [dstq+strideq*1], m0, 1 mova [dstq+strideq*2], xm1 vextracti128 [dstq+stride3q ], m1, 1 cmp hd, 4 jne .w8_loop_start RET .w8_loop: call .main lea dstq, [dstq+strideq*4] mova [dstq+strideq*0], xm0 vextracti128 [dstq+strideq*1], m0, 1 mova [dstq+strideq*2], xm1 vextracti128 [dstq+stride3q ], m1, 1 .w8_loop_start: lea dstq, [dstq+strideq*4] mova [dstq+strideq*0], xm2 vextracti128 [dstq+strideq*1], m2, 1 mova [dstq+strideq*2], xm3 vextracti128 [dstq+stride3q ], m3, 1 sub hd, 8 jg .w8_loop RET .w16_loop: call .main lea dstq, [dstq+strideq*4] .w16: mova [dstq+strideq*0], m0 mova [dstq+strideq*1], m1 mova [dstq+strideq*2], m2 mova [dstq+stride3q ], m3 sub hd, 4 jg 
.w16_loop RET .w32_loop: call .main lea dstq, [dstq+strideq*2] .w32: mova [dstq+strideq*0+32*0], m0 mova [dstq+strideq*0+32*1], m1 mova [dstq+strideq*1+32*0], m2 mova [dstq+strideq*1+32*1], m3 sub hd, 2 jg .w32_loop RET .w64_loop: call .main add dstq, strideq .w64: mova [dstq+32*0], m0 mova [dstq+32*1], m1 mova [dstq+32*2], m2 mova [dstq+32*3], m3 dec hd jg .w64_loop RET .w128_loop: call .main add dstq, strideq .w128: mova [dstq+32*0], m0 mova [dstq+32*1], m1 mova [dstq+32*2], m2 mova [dstq+32*3], m3 call .main mova [dstq+32*4], m0 mova [dstq+32*5], m1 mova [dstq+32*6], m2 mova [dstq+32*7], m3 dec hd jg .w128_loop RET %endmacro %if WIN64 DECLARE_REG_TMP 5 %else DECLARE_REG_TMP 7 %endif cglobal avg_16bpc, 4, 7, 6, dst, stride, tmp1, tmp2, w, h, stride3 %define base r6-avg_avx2_table lea r6, [avg_avx2_table] tzcnt wd, wm mov t0d, r6m ; pixel_max movsxd wq, [r6+wq*4] shr t0d, 11 vpbroadcastd m4, [base+bidir_rnd+t0*4] vpbroadcastd m5, [base+bidir_mul+t0*4] movifnidn hd, hm add wq, r6 BIDIR_FN ALIGN function_align .main: mova m0, [tmp1q+32*0] paddsw m0, [tmp2q+32*0] mova m1, [tmp1q+32*1] paddsw m1, [tmp2q+32*1] mova m2, [tmp1q+32*2] paddsw m2, [tmp2q+32*2] mova m3, [tmp1q+32*3] paddsw m3, [tmp2q+32*3] add tmp1q, 32*4 add tmp2q, 32*4 pmaxsw m0, m4 pmaxsw m1, m4 pmaxsw m2, m4 pmaxsw m3, m4 psubsw m0, m4 psubsw m1, m4 psubsw m2, m4 psubsw m3, m4 pmulhw m0, m5 pmulhw m1, m5 pmulhw m2, m5 pmulhw m3, m5 ret cglobal w_avg_16bpc, 4, 7, 9, dst, stride, tmp1, tmp2, w, h, stride3 lea r6, [w_avg_avx2_table] tzcnt wd, wm mov t0d, r6m ; weight vpbroadcastw m8, r7m ; pixel_max vpbroadcastd m7, [r6-w_avg_avx2_table+pd_65538] movsxd wq, [r6+wq*4] paddw m7, m8 add wq, r6 lea r6d, [t0-16] shl t0d, 16 sub t0d, r6d ; 16-weight, weight pslld m7, 7 rorx r6d, t0d, 30 ; << 2 test dword r7m, 0x800 cmovz r6d, t0d movifnidn hd, hm movd xm6, r6d vpbroadcastd m6, xm6 BIDIR_FN ALIGN function_align .main: mova m4, [tmp1q+32*0] mova m0, [tmp2q+32*0] punpckhwd m5, m0, m4 punpcklwd m0, m4 mova m4, [tmp1q+32*1] mova m1, [tmp2q+32*1] pmaddwd m5, m6 pmaddwd m0, m6 paddd m5, m7 paddd m0, m7 psrad m5, 8 psrad m0, 8 packusdw m0, m5 punpckhwd m5, m1, m4 punpcklwd m1, m4 mova m4, [tmp1q+32*2] mova m2, [tmp2q+32*2] pmaddwd m5, m6 pmaddwd m1, m6 paddd m5, m7 paddd m1, m7 psrad m5, 8 psrad m1, 8 packusdw m1, m5 punpckhwd m5, m2, m4 punpcklwd m2, m4 mova m4, [tmp1q+32*3] mova m3, [tmp2q+32*3] add tmp1q, 32*4 add tmp2q, 32*4 pmaddwd m5, m6 pmaddwd m2, m6 paddd m5, m7 paddd m2, m7 psrad m5, 8 psrad m2, 8 packusdw m2, m5 punpckhwd m5, m3, m4 punpcklwd m3, m4 pmaddwd m5, m6 pmaddwd m3, m6 paddd m5, m7 paddd m3, m7 psrad m5, 8 psrad m3, 8 packusdw m3, m5 pminsw m0, m8 pminsw m1, m8 pminsw m2, m8 pminsw m3, m8 ret cglobal mask_16bpc, 4, 8, 11, dst, stride, tmp1, tmp2, w, h, mask, stride3 %define base r7-mask_avx2_table lea r7, [mask_avx2_table] tzcnt wd, wm mov r6d, r7m ; pixel_max movifnidn hd, hm shr r6d, 11 movsxd wq, [r7+wq*4] vpbroadcastd m8, [base+pw_64] vpbroadcastd m9, [base+bidir_rnd+r6*4] vpbroadcastd m10, [base+bidir_mul+r6*4] mov maskq, maskmp add wq, r7 BIDIR_FN ALIGN function_align .main: %macro MASK 1 pmovzxbw m5, [maskq+16*%1] mova m%1, [tmp1q+32*%1] mova m6, [tmp2q+32*%1] punpckhwd m4, m%1, m6 punpcklwd m%1, m6 psubw m7, m8, m5 punpckhwd m6, m5, m7 ; m, 64-m punpcklwd m5, m7 pmaddwd m4, m6 ; tmp1 * m + tmp2 * (64-m) pmaddwd m%1, m5 psrad m4, 5 psrad m%1, 5 packssdw m%1, m4 pmaxsw m%1, m9 psubsw m%1, m9 pmulhw m%1, m10 %endmacro MASK 0 MASK 1 MASK 2 MASK 3 add maskq, 16*4 add tmp1q, 32*4 add tmp2q, 32*4 ret cglobal 
w_mask_420_16bpc, 4, 8, 16, dst, stride, tmp1, tmp2, w, h, mask, stride3 %define base r7-w_mask_420_avx2_table lea r7, [w_mask_420_avx2_table] tzcnt wd, wm mov r6d, r8m ; pixel_max movd xm0, r7m ; sign movifnidn hd, hm shr r6d, 11 movsxd wq, [r7+wq*4] vpbroadcastd m10, [base+pw_27615] ; ((64 - 38) << 10) + 1023 - 32 vpbroadcastd m11, [base+pw_64] vpbroadcastd m12, [base+bidir_rnd+r6*4] vpbroadcastd m13, [base+bidir_mul+r6*4] movd xm14, [base+pw_2] mov maskq, maskmp psubw xm14, xm0 vpbroadcastw m14, xm14 add wq, r7 call .main lea stride3q, [strideq*3] jmp wq .w4: phaddd m4, m5 paddw m4, m14 psrlw m4, 2 packuswb m4, m4 vextracti128 xm5, m4, 1 punpcklwd xm4, xm5 movq [dstq+strideq*0], xm0 movhps [dstq+strideq*1], xm0 vextracti128 xm0, m0, 1 movq [dstq+strideq*2], xm0 movhps [dstq+stride3q ], xm0 mova [maskq], xm4 cmp hd, 8 jl .w4_end lea dstq, [dstq+strideq*4] movq [dstq+strideq*0], xm1 movhps [dstq+strideq*1], xm1 vextracti128 xm1, m1, 1 movq [dstq+strideq*2], xm1 movhps [dstq+stride3q ], xm1 je .w4_end lea dstq, [dstq+strideq*4] movq [dstq+strideq*0], xm2 movhps [dstq+strideq*1], xm2 vextracti128 xm2, m2, 1 movq [dstq+strideq*2], xm2 movhps [dstq+stride3q ], xm2 lea dstq, [dstq+strideq*4] movq [dstq+strideq*0], xm3 movhps [dstq+strideq*1], xm3 vextracti128 xm3, m3, 1 movq [dstq+strideq*2], xm3 movhps [dstq+stride3q ], xm3 .w4_end: RET .w8_loop: call .main lea dstq, [dstq+strideq*4] add maskq, 16 .w8: vperm2i128 m6, m4, m5, 0x21 vpblendd m4, m5, 0xf0 paddw m4, m14 paddw m4, m6 psrlw m4, 2 vextracti128 xm5, m4, 1 packuswb xm4, xm5 mova [dstq+strideq*0], xm0 vextracti128 [dstq+strideq*1], m0, 1 mova [dstq+strideq*2], xm1 vextracti128 [dstq+stride3q ], m1, 1 mova [maskq], xm4 sub hd, 8 jl .w8_end lea dstq, [dstq+strideq*4] mova [dstq+strideq*0], xm2 vextracti128 [dstq+strideq*1], m2, 1 mova [dstq+strideq*2], xm3 vextracti128 [dstq+stride3q ], m3, 1 jg .w8_loop .w8_end: RET .w16_loop: call .main lea dstq, [dstq+strideq*4] add maskq, 16 .w16: punpcklqdq m6, m4, m5 punpckhqdq m4, m5 paddw m6, m14 paddw m4, m6 psrlw m4, 2 vextracti128 xm5, m4, 1 packuswb xm4, xm5 pshufd xm4, xm4, q3120 mova [dstq+strideq*0], m0 mova [dstq+strideq*1], m1 mova [dstq+strideq*2], m2 mova [dstq+stride3q ], m3 mova [maskq], xm4 sub hd, 4 jg .w16_loop RET .w32_loop: call .main lea dstq, [dstq+strideq*4] add maskq, 32 .w32: paddw m4, m14 paddw m4, m5 psrlw m15, m4, 2 mova [dstq+strideq*0+32*0], m0 mova [dstq+strideq*0+32*1], m1 mova [dstq+strideq*1+32*0], m2 mova [dstq+strideq*1+32*1], m3 call .main mova m6, [deint_shuf] paddw m4, m14 paddw m4, m5 psrlw m4, 2 packuswb m15, m4 vpermd m4, m6, m15 mova [dstq+strideq*2+32*0], m0 mova [dstq+strideq*2+32*1], m1 mova [dstq+stride3q +32*0], m2 mova [dstq+stride3q +32*1], m3 mova [maskq], m4 sub hd, 4 jg .w32_loop RET .w64_loop: call .main lea dstq, [dstq+strideq*2] add maskq, 32 .w64: paddw m4, m14 paddw m15, m14, m5 mova [dstq+strideq*0+32*0], m0 mova [dstq+strideq*0+32*1], m1 mova [dstq+strideq*0+32*2], m2 mova [dstq+strideq*0+32*3], m3 mova [maskq], m4 ; no available registers call .main paddw m4, [maskq] mova m6, [deint_shuf] paddw m5, m15 psrlw m4, 2 psrlw m5, 2 packuswb m4, m5 ; 0 2 4 6 1 3 5 7 vpermd m4, m6, m4 mova [dstq+strideq*1+32*0], m0 mova [dstq+strideq*1+32*1], m1 mova [dstq+strideq*1+32*2], m2 mova [dstq+strideq*1+32*3], m3 mova [maskq], m4 sub hd, 2 jg .w64_loop RET .w128_loop: call .main lea dstq, [dstq+strideq*2] add maskq, 64 .w128: paddw m4, m14 paddw m5, m14 mova [dstq+strideq*0+32*0], m0 mova [dstq+strideq*0+32*1], m1 mova [dstq+strideq*0+32*2], m2 mova 
[dstq+strideq*0+32*3], m3 mova [maskq+32*0], m4 mova [dstq+strideq], m5 call .main paddw m4, m14 paddw m15, m14, m5 mova [dstq+strideq*0+32*4], m0 mova [dstq+strideq*0+32*5], m1 mova [dstq+strideq*0+32*6], m2 mova [dstq+strideq*0+32*7], m3 mova [maskq+32*1], m4 call .main paddw m4, [maskq+32*0] paddw m5, [dstq+strideq] mova m6, [deint_shuf] psrlw m4, 2 psrlw m5, 2 packuswb m4, m5 vpermd m4, m6, m4 mova [dstq+strideq*1+32*0], m0 mova [dstq+strideq*1+32*1], m1 mova [dstq+strideq*1+32*2], m2 mova [dstq+strideq*1+32*3], m3 mova [maskq+32*0], m4 call .main paddw m4, [maskq+32*1] mova m6, [deint_shuf] paddw m5, m15 psrlw m4, 2 psrlw m5, 2 packuswb m4, m5 vpermd m4, m6, m4 mova [dstq+strideq*1+32*4], m0 mova [dstq+strideq*1+32*5], m1 mova [dstq+strideq*1+32*6], m2 mova [dstq+strideq*1+32*7], m3 mova [maskq+32*1], m4 sub hd, 2 jg .w128_loop RET ALIGN function_align .main: %macro W_MASK 2-6 11, 12, 13 ; dst/src1, mask/src2, pw_64, rnd, mul mova m%1, [tmp1q+32*%1] mova m%2, [tmp2q+32*%1] punpcklwd m8, m%2, m%1 punpckhwd m9, m%2, m%1 psubsw m%1, m%2 pabsw m%1, m%1 psubusw m7, m10, m%1 psrlw m7, 10 ; 64-m psubw m%2, m%3, m7 ; m punpcklwd m%1, m7, m%2 punpckhwd m7, m%2 pmaddwd m%1, m8 pmaddwd m7, m9 psrad m%1, 5 psrad m7, 5 packssdw m%1, m7 pmaxsw m%1, m%4 psubsw m%1, m%4 pmulhw m%1, m%5 %endmacro W_MASK 0, 4 W_MASK 1, 5 phaddw m4, m5 W_MASK 2, 5 W_MASK 3, 6 phaddw m5, m6 add tmp1q, 32*4 add tmp2q, 32*4 ret cglobal w_mask_422_16bpc, 4, 8, 16, dst, stride, tmp1, tmp2, w, h, mask, stride3 %define base r7-w_mask_422_avx2_table lea r7, [w_mask_422_avx2_table] tzcnt wd, wm mov r6d, r8m ; pixel_max vpbroadcastb m14, r7m ; sign movifnidn hd, hm shr r6d, 11 movsxd wq, [r7+wq*4] vpbroadcastd m10, [base+pw_27615] vpbroadcastd m11, [base+pw_64] vpbroadcastd m12, [base+bidir_rnd+r6*4] vpbroadcastd m13, [base+bidir_mul+r6*4] mova m15, [base+deint_shuf] mov maskq, maskmp add wq, r7 call .main lea stride3q, [strideq*3] jmp wq .w4: movq [dstq+strideq*0], xm0 movhps [dstq+strideq*1], xm0 vextracti128 xm0, m0, 1 movq [dstq+strideq*2], xm0 movhps [dstq+stride3q ], xm0 cmp hd, 8 jl .w4_end lea dstq, [dstq+strideq*4] movq [dstq+strideq*0], xm1 movhps [dstq+strideq*1], xm1 vextracti128 xm1, m1, 1 movq [dstq+strideq*2], xm1 movhps [dstq+stride3q ], xm1 je .w4_end lea dstq, [dstq+strideq*4] movq [dstq+strideq*0], xm2 movhps [dstq+strideq*1], xm2 vextracti128 xm2, m2, 1 movq [dstq+strideq*2], xm2 movhps [dstq+stride3q ], xm2 lea dstq, [dstq+strideq*4] movq [dstq+strideq*0], xm3 movhps [dstq+strideq*1], xm3 vextracti128 xm3, m3, 1 movq [dstq+strideq*2], xm3 movhps [dstq+stride3q ], xm3 .w4_end: RET .w8_loop: call .main lea dstq, [dstq+strideq*4] .w8: mova [dstq+strideq*0], xm0 vextracti128 [dstq+strideq*1], m0, 1 mova [dstq+strideq*2], xm1 vextracti128 [dstq+stride3q ], m1, 1 sub hd, 8 jl .w8_end lea dstq, [dstq+strideq*4] mova [dstq+strideq*0], xm2 vextracti128 [dstq+strideq*1], m2, 1 mova [dstq+strideq*2], xm3 vextracti128 [dstq+stride3q ], m3, 1 jg .w8_loop .w8_end: RET .w16_loop: call .main lea dstq, [dstq+strideq*4] .w16: mova [dstq+strideq*0], m0 mova [dstq+strideq*1], m1 mova [dstq+strideq*2], m2 mova [dstq+stride3q ], m3 sub hd, 4 jg .w16_loop RET .w32_loop: call .main lea dstq, [dstq+strideq*2] .w32: mova [dstq+strideq*0+32*0], m0 mova [dstq+strideq*0+32*1], m1 mova [dstq+strideq*1+32*0], m2 mova [dstq+strideq*1+32*1], m3 sub hd, 2 jg .w32_loop RET .w64_loop: call .main add dstq, strideq .w64: mova [dstq+32*0], m0 mova [dstq+32*1], m1 mova [dstq+32*2], m2 mova [dstq+32*3], m3 dec hd jg .w64_loop RET .w128_loop: call 
.main add dstq, strideq .w128: mova [dstq+32*0], m0 mova [dstq+32*1], m1 mova [dstq+32*2], m2 mova [dstq+32*3], m3 call .main mova [dstq+32*4], m0 mova [dstq+32*5], m1 mova [dstq+32*6], m2 mova [dstq+32*7], m3 dec hd jg .w128_loop RET ALIGN function_align .main: W_MASK 0, 4 W_MASK 1, 5 phaddw m4, m5 W_MASK 2, 5 W_MASK 3, 6 phaddw m5, m6 add tmp1q, 32*4 add tmp2q, 32*4 packuswb m4, m5 pxor m5, m5 psubb m4, m14 pavgb m4, m5 vpermd m4, m15, m4 mova [maskq], m4 add maskq, 32 ret cglobal w_mask_444_16bpc, 4, 8, 11, dst, stride, tmp1, tmp2, w, h, mask, stride3 %define base r7-w_mask_444_avx2_table lea r7, [w_mask_444_avx2_table] tzcnt wd, wm mov r6d, r8m ; pixel_max movifnidn hd, hm shr r6d, 11 movsxd wq, [r7+wq*4] vpbroadcastd m10, [base+pw_27615] vpbroadcastd m4, [base+pw_64] vpbroadcastd m5, [base+bidir_rnd+r6*4] vpbroadcastd m6, [base+bidir_mul+r6*4] mov maskq, maskmp add wq, r7 call .main lea stride3q, [strideq*3] jmp wq .w4: movq [dstq+strideq*0], xm0 movhps [dstq+strideq*1], xm0 vextracti128 xm0, m0, 1 movq [dstq+strideq*2], xm0 movhps [dstq+stride3q ], xm0 cmp hd, 8 jl .w4_end lea dstq, [dstq+strideq*4] movq [dstq+strideq*0], xm1 movhps [dstq+strideq*1], xm1 vextracti128 xm1, m1, 1 movq [dstq+strideq*2], xm1 movhps [dstq+stride3q ], xm1 je .w4_end call .main lea dstq, [dstq+strideq*4] movq [dstq+strideq*0], xm0 movhps [dstq+strideq*1], xm0 vextracti128 xm0, m0, 1 movq [dstq+strideq*2], xm0 movhps [dstq+stride3q ], xm0 lea dstq, [dstq+strideq*4] movq [dstq+strideq*0], xm1 movhps [dstq+strideq*1], xm1 vextracti128 xm1, m1, 1 movq [dstq+strideq*2], xm1 movhps [dstq+stride3q ], xm1 .w4_end: RET .w8_loop: call .main lea dstq, [dstq+strideq*4] .w8: mova [dstq+strideq*0], xm0 vextracti128 [dstq+strideq*1], m0, 1 mova [dstq+strideq*2], xm1 vextracti128 [dstq+stride3q ], m1, 1 sub hd, 4 jg .w8_loop .w8_end: RET .w16_loop: call .main lea dstq, [dstq+strideq*2] .w16: mova [dstq+strideq*0], m0 mova [dstq+strideq*1], m1 sub hd, 2 jg .w16_loop RET .w32_loop: call .main add dstq, strideq .w32: mova [dstq+32*0], m0 mova [dstq+32*1], m1 dec hd jg .w32_loop RET .w64_loop: call .main add dstq, strideq .w64: mova [dstq+32*0], m0 mova [dstq+32*1], m1 call .main mova [dstq+32*2], m0 mova [dstq+32*3], m1 dec hd jg .w64_loop RET .w128_loop: call .main add dstq, strideq .w128: mova [dstq+32*0], m0 mova [dstq+32*1], m1 call .main mova [dstq+32*2], m0 mova [dstq+32*3], m1 call .main mova [dstq+32*4], m0 mova [dstq+32*5], m1 call .main mova [dstq+32*6], m0 mova [dstq+32*7], m1 dec hd jg .w128_loop RET ALIGN function_align .main: W_MASK 0, 2, 4, 5, 6 W_MASK 1, 3, 4, 5, 6 packuswb m2, m3 vpermq m2, m2, q3120 add tmp1q, 32*2 add tmp2q, 32*2 mova [maskq], m2 add maskq, 32 ret ; (a * (64 - m) + b * m + 32) >> 6 ; = (((b - a) * m + 32) >> 6) + a ; = (((b - a) * (m << 9) + 16384) >> 15) + a ; except m << 9 overflows int16_t when m == 64 (which is possible), ; but if we negate m it works out (-64 << 9 == -32768). 
; = (((a - b) * (m * -512) + 16384) >> 15) + a cglobal blend_16bpc, 3, 7, 7, dst, ds, tmp, w, h, mask %define base r6-blend_avx2_table lea r6, [blend_avx2_table] tzcnt wd, wm movifnidn hd, hm movsxd wq, [r6+wq*4] movifnidn maskq, maskmp vpbroadcastd m6, [base+pw_m512] add wq, r6 lea r6, [dsq*3] jmp wq .w4: pmovzxbw m3, [maskq] movq xm0, [dstq+dsq*0] movhps xm0, [dstq+dsq*1] vpbroadcastq m1, [dstq+dsq*2] vpbroadcastq m2, [dstq+r6 ] vpblendd m0, m1, 0x30 vpblendd m0, m2, 0xc0 psubw m1, m0, [tmpq] add maskq, 16 add tmpq, 32 pmullw m3, m6 pmulhrsw m1, m3 paddw m0, m1 vextracti128 xm1, m0, 1 movq [dstq+dsq*0], xm0 movhps [dstq+dsq*1], xm0 movq [dstq+dsq*2], xm1 movhps [dstq+r6 ], xm1 lea dstq, [dstq+dsq*4] sub hd, 4 jg .w4 RET .w8: pmovzxbw m4, [maskq+16*0] pmovzxbw m5, [maskq+16*1] mova xm0, [dstq+dsq*0] vinserti128 m0, [dstq+dsq*1], 1 mova xm1, [dstq+dsq*2] vinserti128 m1, [dstq+r6 ], 1 psubw m2, m0, [tmpq+32*0] psubw m3, m1, [tmpq+32*1] add maskq, 16*2 add tmpq, 32*2 pmullw m4, m6 pmullw m5, m6 pmulhrsw m2, m4 pmulhrsw m3, m5 paddw m0, m2 paddw m1, m3 mova [dstq+dsq*0], xm0 vextracti128 [dstq+dsq*1], m0, 1 mova [dstq+dsq*2], xm1 vextracti128 [dstq+r6 ], m1, 1 lea dstq, [dstq+dsq*4] sub hd, 4 jg .w8 RET .w16: pmovzxbw m4, [maskq+16*0] pmovzxbw m5, [maskq+16*1] mova m0, [dstq+dsq*0] psubw m2, m0, [tmpq+ 32*0] mova m1, [dstq+dsq*1] psubw m3, m1, [tmpq+ 32*1] add maskq, 16*2 add tmpq, 32*2 pmullw m4, m6 pmullw m5, m6 pmulhrsw m2, m4 pmulhrsw m3, m5 paddw m0, m2 paddw m1, m3 mova [dstq+dsq*0], m0 mova [dstq+dsq*1], m1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .w16 RET .w32: pmovzxbw m4, [maskq+16*0] pmovzxbw m5, [maskq+16*1] mova m0, [dstq+32*0] psubw m2, m0, [tmpq+32*0] mova m1, [dstq+32*1] psubw m3, m1, [tmpq+32*1] add maskq, 16*2 add tmpq, 32*2 pmullw m4, m6 pmullw m5, m6 pmulhrsw m2, m4 pmulhrsw m3, m5 paddw m0, m2 paddw m1, m3 mova [dstq+32*0], m0 mova [dstq+32*1], m1 add dstq, dsq dec hd jg .w32 RET INIT_XMM avx2 cglobal blend_v_16bpc, 3, 6, 6, dst, ds, tmp, w, h %define base r5-blend_v_avx2_table lea r5, [blend_v_avx2_table] tzcnt wd, wm movifnidn hd, hm movsxd wq, [r5+wq*4] add wq, r5 jmp wq .w2: vpbroadcastd m2, [base+obmc_masks_avx2+2*2] .w2_loop: movd m0, [dstq+dsq*0] pinsrd m0, [dstq+dsq*1], 1 movq m1, [tmpq] add tmpq, 4*2 psubw m1, m0, m1 pmulhrsw m1, m2 paddw m0, m1 movd [dstq+dsq*0], m0 pextrd [dstq+dsq*1], m0, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .w2_loop RET .w4: vpbroadcastq m2, [base+obmc_masks_avx2+4*2] .w4_loop: movq m0, [dstq+dsq*0] movhps m0, [dstq+dsq*1] psubw m1, m0, [tmpq] add tmpq, 8*2 pmulhrsw m1, m2 paddw m0, m1 movq [dstq+dsq*0], m0 movhps [dstq+dsq*1], m0 lea dstq, [dstq+dsq*2] sub hd, 2 jg .w4_loop RET INIT_YMM avx2 .w8: vbroadcasti128 m2, [base+obmc_masks_avx2+8*2] .w8_loop: mova xm0, [dstq+dsq*0] vinserti128 m0, [dstq+dsq*1], 1 psubw m1, m0, [tmpq] add tmpq, 16*2 pmulhrsw m1, m2 paddw m0, m1 mova [dstq+dsq*0], xm0 vextracti128 [dstq+dsq*1], m0, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .w8_loop RET .w16: mova m4, [base+obmc_masks_avx2+16*2] .w16_loop: mova m0, [dstq+dsq*0] psubw m2, m0, [tmpq+ 32*0] mova m1, [dstq+dsq*1] psubw m3, m1, [tmpq+ 32*1] add tmpq, 32*2 pmulhrsw m2, m4 pmulhrsw m3, m4 paddw m0, m2 paddw m1, m3 mova [dstq+dsq*0], m0 mova [dstq+dsq*1], m1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .w16_loop RET .w32: %if WIN64 movaps [rsp+ 8], xmm6 movaps [rsp+24], xmm7 %endif mova m6, [base+obmc_masks_avx2+32*2] vbroadcasti128 m7, [base+obmc_masks_avx2+32*3] .w32_loop: mova m0, [dstq+dsq*0+32*0] psubw m3, m0, [tmpq +32*0] mova xm2, [dstq+dsq*0+32*1] mova xm5, [tmpq 
+32*1] mova m1, [dstq+dsq*1+32*0] psubw m4, m1, [tmpq +32*2] vinserti128 m2, [dstq+dsq*1+32*1], 1 vinserti128 m5, [tmpq +32*3], 1 add tmpq, 32*4 psubw m5, m2, m5 pmulhrsw m3, m6 pmulhrsw m4, m6 pmulhrsw m5, m7 paddw m0, m3 paddw m1, m4 paddw m2, m5 mova [dstq+dsq*0+32*0], m0 mova [dstq+dsq*1+32*0], m1 mova [dstq+dsq*0+32*1], xm2 vextracti128 [dstq+dsq*1+32*1], m2, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .w32_loop %if WIN64 movaps xmm6, [rsp+ 8] movaps xmm7, [rsp+24] %endif RET %macro BLEND_H_ROW 2-3 0; dst_off, tmp_off, inc_tmp mova m0, [dstq+32*(%1+0)] psubw m2, m0, [tmpq+32*(%2+0)] mova m1, [dstq+32*(%1+1)] psubw m3, m1, [tmpq+32*(%2+1)] %if %3 add tmpq, 32*%3 %endif pmulhrsw m2, m4 pmulhrsw m3, m4 paddw m0, m2 paddw m1, m3 mova [dstq+32*(%1+0)], m0 mova [dstq+32*(%1+1)], m1 %endmacro INIT_XMM avx2 cglobal blend_h_16bpc, 3, 6, 6, dst, ds, tmp, w, h, mask %define base r5-blend_h_avx2_table lea r5, [blend_h_avx2_table] tzcnt wd, wm mov hd, hm movsxd wq, [r5+wq*4] add wq, r5 lea maskq, [base+obmc_masks_avx2+hq*2] lea hd, [hq*3] shr hd, 2 ; h * 3/4 lea maskq, [maskq+hq*2] neg hq jmp wq .w2: movd m0, [dstq+dsq*0] pinsrd m0, [dstq+dsq*1], 1 movd m2, [maskq+hq*2] movq m1, [tmpq] add tmpq, 4*2 punpcklwd m2, m2 psubw m1, m0, m1 pmulhrsw m1, m2 paddw m0, m1 movd [dstq+dsq*0], m0 pextrd [dstq+dsq*1], m0, 1 lea dstq, [dstq+dsq*2] add hq, 2 jl .w2 RET .w4: mova m3, [blend_shuf] .w4_loop: movq m0, [dstq+dsq*0] movhps m0, [dstq+dsq*1] movd m2, [maskq+hq*2] psubw m1, m0, [tmpq] add tmpq, 8*2 pshufb m2, m3 pmulhrsw m1, m2 paddw m0, m1 movq [dstq+dsq*0], m0 movhps [dstq+dsq*1], m0 lea dstq, [dstq+dsq*2] add hq, 2 jl .w4_loop RET INIT_YMM avx2 .w8: vbroadcasti128 m3, [blend_shuf] shufpd m3, m3, 0x0c .w8_loop: mova xm0, [dstq+dsq*0] vinserti128 m0, [dstq+dsq*1], 1 vpbroadcastd m2, [maskq+hq*2] psubw m1, m0, [tmpq] add tmpq, 16*2 pshufb m2, m3 pmulhrsw m1, m2 paddw m0, m1 mova [dstq+dsq*0], xm0 vextracti128 [dstq+dsq*1], m0, 1 lea dstq, [dstq+dsq*2] add hq, 2 jl .w8_loop RET .w16: vpbroadcastw m4, [maskq+hq*2] vpbroadcastw m5, [maskq+hq*2+2] mova m0, [dstq+dsq*0] psubw m2, m0, [tmpq+ 32*0] mova m1, [dstq+dsq*1] psubw m3, m1, [tmpq+ 32*1] add tmpq, 32*2 pmulhrsw m2, m4 pmulhrsw m3, m5 paddw m0, m2 paddw m1, m3 mova [dstq+dsq*0], m0 mova [dstq+dsq*1], m1 lea dstq, [dstq+dsq*2] add hq, 2 jl .w16 RET .w32: vpbroadcastw m4, [maskq+hq*2] BLEND_H_ROW 0, 0, 2 add dstq, dsq inc hq jl .w32 RET .w64: vpbroadcastw m4, [maskq+hq*2] BLEND_H_ROW 0, 0 BLEND_H_ROW 2, 2, 4 add dstq, dsq inc hq jl .w64 RET .w128: vpbroadcastw m4, [maskq+hq*2] BLEND_H_ROW 0, 0 BLEND_H_ROW 2, 2, 8 BLEND_H_ROW 4, -4 BLEND_H_ROW 6, -2 add dstq, dsq inc hq jl .w128 RET cglobal emu_edge_16bpc, 10, 13, 1, bw, bh, iw, ih, x, y, dst, dstride, src, sstride, \ bottomext, rightext ; we assume that the buffer (stride) is larger than width, so we can ; safely overwrite by a few bytes ; ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride) xor r12d, r12d lea r10, [ihq-1] cmp yq, ihq cmovs r10, yq test yq, yq cmovs r10, r12 imul r10, sstrideq add srcq, r10 ; ref += iclip(x, 0, iw - 1) lea r10, [iwq-1] cmp xq, iwq cmovs r10, xq test xq, xq cmovs r10, r12 lea srcq, [srcq+r10*2] ; bottom_ext = iclip(y + bh - ih, 0, bh - 1) lea bottomextq, [yq+bhq] sub bottomextq, ihq lea r3, [bhq-1] cmovs bottomextq, r12 DEFINE_ARGS bw, bh, iw, ih, x, topext, dst, dstride, src, sstride, \ bottomext, rightext ; top_ext = iclip(-y, 0, bh - 1) neg topextq cmovs topextq, r12 cmp bottomextq, bhq cmovns bottomextq, r3 cmp topextq, bhq cmovg topextq, r3 ; right_ext = iclip(x + bw - iw, 0, bw 
- 1) lea rightextq, [xq+bwq] sub rightextq, iwq lea r2, [bwq-1] cmovs rightextq, r12 DEFINE_ARGS bw, bh, iw, ih, leftext, topext, dst, dstride, src, sstride, \ bottomext, rightext ; left_ext = iclip(-x, 0, bw - 1) neg leftextq cmovs leftextq, r12 cmp rightextq, bwq cmovns rightextq, r2 cmp leftextq, bwq cmovns leftextq, r2 DEFINE_ARGS bw, centerh, centerw, dummy, leftext, topext, \ dst, dstride, src, sstride, bottomext, rightext ; center_h = bh - top_ext - bottom_ext lea r3, [bottomextq+topextq] sub centerhq, r3 ; blk += top_ext * PXSTRIDE(dst_stride) mov r2, topextq imul r2, dstrideq add dstq, r2 mov r9m, dstq ; center_w = bw - left_ext - right_ext mov centerwq, bwq lea r3, [rightextq+leftextq] sub centerwq, r3 %macro v_loop 3 ; need_left_ext, need_right_ext, suffix .v_loop_%3: %if %1 ; left extension xor r3, r3 vpbroadcastw m0, [srcq] .left_loop_%3: mova [dstq+r3*2], m0 add r3, 16 cmp r3, leftextq jl .left_loop_%3 ; body lea r12, [dstq+leftextq*2] %endif xor r3, r3 .body_loop_%3: movu m0, [srcq+r3*2] %if %1 movu [r12+r3*2], m0 %else movu [dstq+r3*2], m0 %endif add r3, 16 cmp r3, centerwq jl .body_loop_%3 %if %2 ; right extension %if %1 lea r12, [r12+centerwq*2] %else lea r12, [dstq+centerwq*2] %endif xor r3, r3 vpbroadcastw m0, [srcq+centerwq*2-2] .right_loop_%3: movu [r12+r3*2], m0 add r3, 16 cmp r3, rightextq jl .right_loop_%3 %endif add dstq, dstrideq add srcq, sstrideq dec centerhq jg .v_loop_%3 %endmacro test leftextq, leftextq jnz .need_left_ext test rightextq, rightextq jnz .need_right_ext v_loop 0, 0, 0 jmp .body_done .need_left_ext: test rightextq, rightextq jnz .need_left_right_ext v_loop 1, 0, 1 jmp .body_done .need_left_right_ext: v_loop 1, 1, 2 jmp .body_done .need_right_ext: v_loop 0, 1, 3 .body_done: ; bottom edge extension test bottomextq, bottomextq jz .top mov srcq, dstq sub srcq, dstrideq xor r1, r1 .bottom_x_loop: mova m0, [srcq+r1*2] lea r3, [dstq+r1*2] mov r4, bottomextq .bottom_y_loop: mova [r3], m0 add r3, dstrideq dec r4 jg .bottom_y_loop add r1, 16 cmp r1, bwq jl .bottom_x_loop .top: ; top edge extension test topextq, topextq jz .end mov srcq, r9m mov dstq, dstm xor r1, r1 .top_x_loop: mova m0, [srcq+r1*2] lea r3, [dstq+r1*2] mov r4, topextq .top_y_loop: mova [r3], m0 add r3, dstrideq dec r4 jg .top_y_loop add r1, 16 cmp r1, bwq jl .top_x_loop .end: RET cglobal resize_16bpc, 6, 12, 16, dst, dst_stride, src, src_stride, \ dst_w, h, src_w, dx, mx0, pxmax sub dword mx0m, 4<<14 sub dword src_wm, 8 vpbroadcastd m5, dxm vpbroadcastd m8, mx0m vpbroadcastd m6, src_wm DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, h, x, _, _, pxmax LEA r7, $$ %define base r7-$$ vpbroadcastd m3, [base+pd_64] vpbroadcastw xm7, pxmaxm pmaddwd m2, m5, [base+rescale_mul] ; dx*[0,1,2,3,4,5,6,7] pslld m5, 3 ; dx*8 pslld m6, 14 paddd m8, m2 ; mx+[0..7]*dx .loop_y: xor xd, xd mova m4, m8 ; per-line working version of mx .loop_x: vpbroadcastd m10, [base+pd_63] pxor m2, m2 pmaxsd m0, m4, m2 psrad m9, m4, 8 ; filter offset (unmasked) pminsd m0, m6 ; iclip(mx, 0, src_w-8) psubd m1, m4, m0 ; pshufb offset psrad m0, 14 ; clipped src_x offset psrad m1, 14 ; pshufb edge_emu offset pand m9, m10 ; filter offset (masked) ; load source pixels movd r8d, xm0 pextrd r9d, xm0, 1 pextrd r10d, xm0, 2 pextrd r11d, xm0, 3 vextracti128 xm0, m0, 1 movu xm10, [srcq+r8*2] movu xm11, [srcq+r9*2] movu xm12, [srcq+r10*2] movu xm13, [srcq+r11*2] movd r8d, xm0 pextrd r9d, xm0, 1 pextrd r10d, xm0, 2 pextrd r11d, xm0, 3 vinserti128 m10, [srcq+r8*2], 1 vinserti128 m11, [srcq+r9*2], 1 vinserti128 m12, [srcq+r10*2], 1 
vinserti128 m13, [srcq+r11*2], 1 ptest m1, m1 jz .filter movq r9, xm1 pextrq r11, xm1, 1 movsxd r8, r9d sar r9, 32 movsxd r10, r11d sar r11, 32 vextracti128 xm1, m1, 1 movu xm14, [base+resize_shuf+8+r8*2] movu xm15, [base+resize_shuf+8+r9*2] movu xm0, [base+resize_shuf+8+r10*2] movu xm2, [base+resize_shuf+8+r11*2] movq r9, xm1 pextrq r11, xm1, 1 movsxd r8, r9d sar r9, 32 movsxd r10, r11d sar r11, 32 vinserti128 m14, [base+resize_shuf+8+r8*2], 1 vinserti128 m15, [base+resize_shuf+8+r9*2], 1 vinserti128 m0, [base+resize_shuf+8+r10*2], 1 vinserti128 m2, [base+resize_shuf+8+r11*2], 1 pshufb m10, m14 pshufb m11, m15 pshufb m12, m0 pshufb m13, m2 .filter: movd r8d, xm9 pextrd r9d, xm9, 1 pextrd r10d, xm9, 2 pextrd r11d, xm9, 3 vextracti128 xm9, m9, 1 movq xm14, [base+resize_filter+r8*8] movq xm15, [base+resize_filter+r9*8] movq xm0, [base+resize_filter+r10*8] movq xm2, [base+resize_filter+r11*8] movd r8d, xm9 pextrd r9d, xm9, 1 pextrd r10d, xm9, 2 pextrd r11d, xm9, 3 movhps xm14, [base+resize_filter+r8*8] movhps xm15, [base+resize_filter+r9*8] movhps xm0, [base+resize_filter+r10*8] movhps xm2, [base+resize_filter+r11*8] pmovsxbw m14, xm14 pmovsxbw m15, xm15 pmovsxbw m0, xm0 pmovsxbw m2, xm2 pmaddwd m10, m14 pmaddwd m11, m15 pmaddwd m12, m0 pmaddwd m13, m2 phaddd m10, m11 phaddd m12, m13 phaddd m10, m12 psubd m10, m3, m10 psrad m10, 7 vextracti128 xm0, m10, 1 packusdw xm10, xm0 pminsw xm10, xm7 mova [dstq+xq*2], xm10 paddd m4, m5 add xd, 8 cmp xd, dst_wd jl .loop_x add dstq, dst_strideq add srcq, src_strideq dec hd jg .loop_y RET %endif ; ARCH_X86_64 av-scenechange-0.14.1/src/asm/x86/mc16_avx512.asm000064400000000000000000005245531046102023000171460ustar 00000000000000; Copyright © 2020, VideoLAN and dav1d authors ; Copyright © 2020, Two Orioles, LLC ; All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions are met: ; ; 1. Redistributions of source code must retain the above copyright notice, this ; list of conditions and the following disclaimer. ; ; 2. Redistributions in binary form must reproduce the above copyright notice, ; this list of conditions and the following disclaimer in the documentation ; and/or other materials provided with the distribution. ; ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
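; AVX-512 (avx512icl) versions of the 16bpc motion compensation kernels; the
; jump tables below cover put/prep for bilin and 8tap along with the bidir
; avg, w_avg, mask, w_mask_420/422/444 and blend/blend_v/blend_h functions.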
%include "config.asm" %include "src/asm/x86/x86inc.asm" %if ARCH_X86_64 SECTION_RODATA 64 spel_h_shufA: db 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9 db 32, 33, 34, 35, 34, 35, 36, 37, 36, 37, 38, 39, 38, 39, 40, 41 spel_h_shufC: db 8, 9, 10, 11, 10, 11, 12, 13, 12, 13, 14, 15, 14, 15, 16, 17 db 40, 41, 42, 43, 42, 43, 44, 45, 44, 45, 46, 47, 46, 47, 48, 49 db 16, 17, 18, 19, 18, 19, 20, 21, 20, 21, 22, 23, 22, 23, 24, 25 db 48, 49, 50, 51, 50, 51, 52, 53, 52, 53, 54, 55, 54, 55, 56, 57 spel_h_shufB: db 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13 db 36, 37, 38, 39, 38, 39, 40, 41, 40, 41, 42, 43, 42, 43, 44, 45 spel_h_shufD: db 12, 13, 14, 15, 14, 15, 16, 17, 16, 17, 18, 19, 18, 19, 20, 21 db 44, 45, 46, 47, 46, 47, 48, 49, 48, 49, 50, 51, 50, 51, 52, 53 db 20, 21, 22, 23, 22, 23, 24, 25, 24, 25, 26, 27, 26, 27, 28, 29 db 52, 53, 54, 55, 54, 55, 56, 57, 56, 57, 58, 59, 58, 59, 60, 61 spel_v_shuf8: db 0, 1, 16, 17, 2, 3, 18, 19, 4, 5, 20, 21, 6, 7, 22, 23 db 16, 17, 32, 33, 18, 19, 34, 35, 20, 21, 36, 37, 22, 23, 38, 39 db 8, 9, 24, 25, 10, 11, 26, 27, 12, 13, 28, 29, 14, 15, 30, 31 db 24, 25, 40, 41, 26, 27, 42, 43, 28, 29, 44, 45, 30, 31, 46, 47 spel_v_shuf16: db 0, 1, 32, 33, 2, 3, 34, 35, 4, 5, 36, 37, 6, 7, 38, 39 db 8, 9, 40, 41, 10, 11, 42, 43, 12, 13, 44, 45, 14, 15, 46, 47 db 16, 17, 48, 49, 18, 19, 50, 51, 20, 21, 52, 53, 22, 23, 54, 55 db 24, 25, 56, 57, 26, 27, 58, 59, 28, 29, 60, 61, 30, 31, 62, 63 prep_endA: db 1, 2, 5, 6, 9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 30 db 33, 34, 37, 38, 41, 42, 45, 46, 49, 50, 53, 54, 57, 58, 61, 62 db 65, 66, 69, 70, 73, 74, 77, 78, 81, 82, 85, 86, 89, 90, 93, 94 db 97, 98,101,102,105,106,109,110,113,114,117,118,121,122,125,126 prep_endB: db 1, 2, 5, 6, 9, 10, 13, 14, 33, 34, 37, 38, 41, 42, 45, 46 db 17, 18, 21, 22, 25, 26, 29, 30, 49, 50, 53, 54, 57, 58, 61, 62 db 65, 66, 69, 70, 73, 74, 77, 78, 97, 98,101,102,105,106,109,110 db 81, 82, 85, 86, 89, 90, 93, 94,113,114,117,118,121,122,125,126 prep_endC: db 1, 2, 5, 6, 9, 10, 13, 14, 65, 66, 69, 70, 73, 74, 77, 78 db 17, 18, 21, 22, 25, 26, 29, 30, 81, 82, 85, 86, 89, 90, 93, 94 db 33, 34, 37, 38, 41, 42, 45, 46, 97, 98,101,102,105,106,109,110 db 49, 50, 53, 54, 57, 58, 61, 62,113,114,117,118,121,122,125,126 spel_shuf4a: db 1, 2, 17, 18, 5, 6, 21, 22, 9, 10, 25, 26, 13, 14, 29, 30 db 17, 18, 33, 34, 21, 22, 37, 38, 25, 26, 41, 42, 29, 30, 45, 46 spel_shuf4b: db 18, 19, 33, 34, 22, 23, 37, 38, 26, 27, 41, 42, 30, 31, 45, 46 db 33, 34, 49, 50, 37, 38, 53, 54, 41, 42, 57, 58, 45, 46, 61, 62 spel_shuf8a: db 1, 2, 17, 18, 5, 6, 21, 22, 9, 10, 25, 26, 13, 14, 29, 30 db 17, 18, 65, 66, 21, 22, 69, 70, 25, 26, 73, 74, 29, 30, 77, 78 db 33, 34, 49, 50, 37, 38, 53, 54, 41, 42, 57, 58, 45, 46, 61, 62 db 49, 50, 97, 98, 53, 54,101,102, 57, 58,105,106, 61, 62,109,110 spel_shuf8b: db 18, 19, 65, 66, 22, 23, 69, 70, 26, 27, 73, 74, 30, 31, 77, 78 db 65, 66, 81, 82, 69, 70, 85, 86, 73, 74, 89, 90, 77, 78, 93, 94 db 50, 51, 97, 98, 54, 55,101,102, 58, 59,105,106, 62, 63,109,110 db 97, 98,113,114,101,102,117,118,105,106,121,122,109,110,125,126 spel_shuf16: db 1, 2, 33, 34, 5, 6, 37, 38, 9, 10, 41, 42, 13, 14, 45, 46 db 17, 18, 49, 50, 21, 22, 53, 54, 25, 26, 57, 58, 29, 30, 61, 62 db 65, 66, 97, 98, 69, 70,101,102, 73, 74,105,106, 77, 78,109,110 db 81, 82,113,114, 85, 86,117,118, 89, 90,121,122, 93, 94,125,126 spel_shuf32: db 1, 2, 65, 66, 5, 6, 69, 70, 9, 10, 73, 74, 13, 14, 77, 78 db 17, 18, 81, 82, 21, 22, 85, 86, 25, 26, 89, 90, 29, 30, 93, 94 db 33, 34, 97, 98, 37, 38,101,102, 41, 42,105,106, 
45, 46,109,110 db 49, 50,113,114, 53, 54,117,118, 57, 58,121,122, 61, 62,125,126 spel_h_shuf2b: db 1, 2, 17, 18, 5, 6, 21, 22, 17, 18, 33, 34, 21, 22, 37, 38 db 33, 34, 49, 50, 37, 38, 53, 54, 49, 50, 9, 10, 53, 54, 13, 14 db 9, 10, 25, 26, 13, 14, 29, 30, 25, 26, 41, 42, 29, 30, 45, 46 spel_shuf2: db 10, 11, 17, 18, 14, 15, 21, 22, 17, 18, 25, 26, 21, 22, 29, 30 spel_h_shuf2a: db 0, 1, 2, 3, 2, 3, 4, 5, 16, 17, 18, 19, 18, 19, 20, 21 db 4, 5, 6, 7, 6, 7, 8, 9, 20, 21, 22, 23, 22, 23, 24, 25 w_mask_end42x: db 1, 5, 9, 13, 17, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57, 61 db 65, 69, 73, 77, 81, 85, 89, 93, 97,101,105,109,113,117,121,125 w_mask_end444: db 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 db 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62 db 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88, 90, 92, 94 db 96, 98,100,102,104,106,108,110,112,114,116,118,120,122,124,126 w_mask_shuf4: db 0, 2, 8, 10, 4, 6, 12, 14, 16, 18, 24, 26, 20, 22, 28, 30 db 32, 34, 40, 42, 36, 38, 44, 46, 48, 50, 56, 58, 52, 54, 60, 62 db 64, 66, 72, 74, 68, 70, 76, 78, 80, 82, 88, 90, 84, 86, 92, 94 db 96, 98,104,106,100,102,108,110,112,114,120,122,116,118,124,126 w_mask_shuf8: db 0, 2, 16, 18, 4, 6, 20, 22, 8, 10, 24, 26, 12, 14, 28, 30 db 32, 34, 48, 50, 36, 38, 52, 54, 40, 42, 56, 58, 44, 46, 60, 62 db 64, 66, 80, 82, 68, 70, 84, 86, 72, 74, 88, 90, 76, 78, 92, 94 db 96, 98,112,114,100,102,116,118,104,106,120,122,108,110,124,126 w_mask_shuf16: db 0, 2, 32, 34, 4, 6, 36, 38, 8, 10, 40, 42, 12, 14, 44, 46 db 16, 18, 48, 50, 20, 22, 52, 54, 24, 26, 56, 58, 28, 30, 60, 62 db 64, 66, 96, 98, 68, 70,100,102, 72, 74,104,106, 76, 78,108,110 db 80, 82,112,114, 84, 86,116,118, 88, 90,120,122, 92, 94,124,126 warp8x8_permA: db 0, 1, 2, 3, 32, 33, 34, 35, 2, 3, 4, 5, 34, 35, 36, 37 db 4, 5, 6, 7, 36, 37, 38, 39, 6, 7, 8, 9, 38, 39, 40, 41 db 8, 9, 10, 11, 40, 41, 42, 43, 10, 11, 12, 13, 42, 43, 44, 45 db 12, 13, 14, 15, 44, 45, 46, 47, 14, 15, 16, 17, 46, 47, 48, 49 warp8x8_permB: db 12, 13, 14, 15, 44, 45, 46, 47, 14, 15, 16, 17, 46, 47, 48, 49 db 16, 17, 18, 19, 48, 49, 50, 51, 18, 19, 20, 21, 50, 51, 52, 53 db 20, 21, 22, 23, 52, 53, 54, 55, 22, 23, 24, 25, 54, 55, 56, 57 db 24, 25, 26, 27, 56, 57, 58, 59, 26, 27, 28, 29, 58, 59, 60, 61 warp8x8_end: db 0, 1, 4, 5, 16, 17, 20, 21, 32, 33, 36, 37, 48, 49, 52, 53 db 2, 3, 6, 7, 18, 19, 22, 23, 34, 35, 38, 39, 50, 51, 54, 55 db 8, 9, 12, 13, 24, 25, 28, 29, 40, 41, 44, 45, 56, 57, 60, 61 db 10, 11, 14, 15, 26, 27, 30, 31, 42, 43, 46, 47, 58, 59, 62, 63 deint_q_shuf: ;dq 0, 2, 4, 6, 1, 3, 5, 7 pd_0to7: dd 0, 1, 2, 3, 4, 5, 6, 7 dd 1 pw_2048: times 2 dw 2048 dd 3 pw_8192: times 2 dw 8192 avg_shift: dw 5, 5, 3, 3 pw_27615: times 2 dw 27615 pw_32766: times 2 dw 32766 warp8x8_permC: db -1, 0, -1, 1, -1, 8, -1, 9, -1, 4, -1, 5, -1, 12, -1, 13 warp8x8_permD: db -1, 2, -1, 3, -1, 10, -1, 11, -1, 6, -1, 7, -1, 14, -1, 15 warp_shift_h: db 11, 19, 11, 19, 43, 51, 43, 51, 13, 21, 13, 21, 45, 53, 45, 53 blend_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3 resize_permA: dd 0, 4, 8, 12, 1, 5, 9, 13, 16, 20, 24, 28, 17, 21, 25, 29 resize_permB: dd 2, 6, 10, 14, 3, 7, 11, 15, 18, 22, 26, 30, 19, 23, 27, 31 resize_permC: dq 0, 1, 4, 5, 8, 9, 12, 13 resize_permD: dq 2, 3, 6, 7, 10, 11, 14, 15 resize_permE: dq 0, 2, 4, 6 resize_shufA: db -1, 0, -1, 1, -1, 4, -1, 5, -1, 8, -1, 9, -1, 12, -1, 13 resize_shufB: db -1, 2, -1, 3, -1, 6, -1, 7, -1, 10, -1, 11, -1, 14, -1, 15 rescale_mul: dd 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 resize_shuf: db 0, 
1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7 db 8, 9, 10, 11, 12, 13, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15 prep_hv_shift: dq 6, 4 put_bilin_h_rnd: dw 8, 8, 10, 10 prep_mul: dw 16, 16, 4, 4 put_8tap_h_rnd: dd 34, 40 prep_8tap_rnd: dd 128 - (8192 << 8) warp_8x8_rnd_h: dd 512, 2048 warp_8x8_rnd_v: dd 262144, 65536 warp_8x8t_rnd_v: dd 16384 - (8192 << 15) avg_round: dw -16400, -16400, -16388, -16388 w_avg_round: dd 128 + (8192 << 4), 32 + (8192 << 4) mask_round: dd 512 + (8192 << 6), 128 + (8192 << 6) w_mask_round: dd 128, 64 bidir_shift: dw 6, 6, 4, 4 pb_64: times 4 db 64 pw_m512: times 2 dw -512 pw_2: times 2 dw 2 pw_64: times 2 dw 64 pd_32: dd 32 pd_63: dd 63 pd_128: dd 128 pd_640: dd 640 pd_2176: dd 2176 pd_16384: dd 16384 pd_0_4: dd 0, 4 %define pw_16 prep_mul %define pd_512 warp_8x8_rnd_h %macro BASE_JMP_TABLE 3-* %xdefine %1_%2_table (%%table - %3) %xdefine %%base %1_%2 %%table: %rep %0 - 2 dw %%base %+ _w%3 - %%base %rotate 1 %endrep %endmacro %macro HV_JMP_TABLE 5-* %xdefine %%prefix mangle(private_prefix %+ _%1_%2_16bpc_%3) %xdefine %%base %1_%3 %assign %%types %4 %if %%types & 1 %xdefine %1_%2_h_%3_table (%%h - %5) %%h: %rep %0 - 4 dw %%prefix %+ .h_w%5 - %%base %rotate 1 %endrep %rotate 4 %endif %if %%types & 2 %xdefine %1_%2_v_%3_table (%%v - %5) %%v: %rep %0 - 4 dw %%prefix %+ .v_w%5 - %%base %rotate 1 %endrep %rotate 4 %endif %if %%types & 4 %xdefine %1_%2_hv_%3_table (%%hv - %5) %%hv: %rep %0 - 4 dw %%prefix %+ .hv_w%5 - %%base %rotate 1 %endrep %endif %endmacro %macro BIDIR_JMP_TABLE 2-* %xdefine %1_%2_table (%%table - 2*%3) %xdefine %%base %1_%2_table %xdefine %%prefix mangle(private_prefix %+ _%1_16bpc_%2) %%table: %rep %0 - 2 dd %%prefix %+ .w%3 - %%base %rotate 1 %endrep %endmacro %xdefine put_avx512icl mangle(private_prefix %+ _put_bilin_16bpc_avx512icl.put) %xdefine prep_avx512icl mangle(private_prefix %+ _prep_bilin_16bpc_avx512icl.prep) BIDIR_JMP_TABLE avg, avx512icl, 4, 8, 16, 32, 64, 128 BIDIR_JMP_TABLE w_avg, avx512icl, 4, 8, 16, 32, 64, 128 BIDIR_JMP_TABLE mask, avx512icl, 4, 8, 16, 32, 64, 128 BIDIR_JMP_TABLE w_mask_420, avx512icl, 4, 8, 16, 32, 64, 128 BIDIR_JMP_TABLE w_mask_422, avx512icl, 4, 8, 16, 32, 64, 128 BIDIR_JMP_TABLE w_mask_444, avx512icl, 4, 8, 16, 32, 64, 128 BIDIR_JMP_TABLE blend, avx512icl, 4, 8, 16, 32 BIDIR_JMP_TABLE blend_v, avx512icl, 2, 4, 8, 16, 32 BIDIR_JMP_TABLE blend_h, avx512icl, 2, 4, 8, 16, 32, 64, 128 BASE_JMP_TABLE put, avx512icl, 2, 4, 8, 16, 32, 64, 128 BASE_JMP_TABLE prep, avx512icl, 4, 8, 16, 32, 64, 128 HV_JMP_TABLE put, bilin, avx512icl, 7, 2, 4, 8, 16, 32, 64, 128 HV_JMP_TABLE prep, bilin, avx512icl, 7, 4, 8, 16, 32, 64, 128 HV_JMP_TABLE put, 8tap, avx512icl, 2, 2, 4, 8, 16, 32, 64, 128 HV_JMP_TABLE prep, 8tap, avx512icl, 2, 4, 8, 16, 32, 64, 128 %define table_offset(type, fn) type %+ fn %+ SUFFIX %+ _table - type %+ SUFFIX cextern mc_subpel_filters %define subpel_filters (mangle(private_prefix %+ _mc_subpel_filters)-8) cextern mc_warp_filter cextern obmc_masks_avx2 cextern resize_filter SECTION .text %if WIN64 DECLARE_REG_TMP 4 %else DECLARE_REG_TMP 8 %endif INIT_ZMM avx512icl cglobal put_bilin_16bpc, 4, 8, 13, dst, ds, src, ss, w, h, mxy mov mxyd, r6m ; mx lea r7, [put_avx512icl] tzcnt t0d, wm movifnidn hd, hm test mxyd, mxyd jnz .h mov mxyd, r7m ; my test mxyd, mxyd jnz .v .put: movzx t0d, word [r7+t0*2+table_offset(put,)] add t0, r7 jmp t0 .put_w2: mov r6d, [srcq+ssq*0] mov r7d, [srcq+ssq*1] lea srcq, [srcq+ssq*2] mov [dstq+dsq*0], r6d mov [dstq+dsq*1], r7d lea dstq, [dstq+dsq*2] sub hd, 2 jg .put_w2 RET .put_w4: mov r6, 
[srcq+ssq*0] mov r7, [srcq+ssq*1] lea srcq, [srcq+ssq*2] mov [dstq+dsq*0], r6 mov [dstq+dsq*1], r7 lea dstq, [dstq+dsq*2] sub hd, 2 jg .put_w4 RET .put_w8: movu xmm0, [srcq+ssq*0] movu xmm1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] mova [dstq+dsq*0], xmm0 mova [dstq+dsq*1], xmm1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .put_w8 RET .put_w16: movu ym0, [srcq+ssq*0] movu ym1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] mova [dstq+dsq*0], ym0 mova [dstq+dsq*1], ym1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .put_w16 RET .put_w32: movu m0, [srcq+ssq*0] movu m1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] mova [dstq+dsq*0], m0 mova [dstq+dsq*1], m1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .put_w32 RET .put_w64: movu m0, [srcq+ssq*0+64*0] movu m1, [srcq+ssq*0+64*1] movu m2, [srcq+ssq*1+64*0] movu m3, [srcq+ssq*1+64*1] lea srcq, [srcq+ssq*2] mova [dstq+dsq*0+64*0], m0 mova [dstq+dsq*0+64*1], m1 mova [dstq+dsq*1+64*0], m2 mova [dstq+dsq*1+64*1], m3 lea dstq, [dstq+dsq*2] sub hd, 2 jg .put_w64 RET .put_w128: movu m0, [srcq+64*0] movu m1, [srcq+64*1] movu m2, [srcq+64*2] movu m3, [srcq+64*3] add srcq, ssq mova [dstq+64*0], m0 mova [dstq+64*1], m1 mova [dstq+64*2], m2 mova [dstq+64*3], m3 add dstq, dsq dec hd jg .put_w128 RET .h: vpbroadcastw m5, mxyd mov mxyd, r7m ; my vpbroadcastd m4, [pw_16] psubw m4, m5 test mxyd, mxyd jnz .hv ; 12-bit is rounded twice so we can't use the same pmulhrsw approach as .v movzx t0d, word [r7+t0*2+table_offset(put, _bilin_h)] mov r6d, r8m ; bitdepth_max add t0, r7 shr r6d, 11 vpbroadcastd m6, [r7-put_avx512icl+put_bilin_h_rnd+r6*4] jmp t0 .h_w2: movq xmm1, [srcq+ssq*0] movhps xmm1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] pmullw xmm0, xmm1, xm4 psrlq xmm1, 16 pmullw xmm1, xm5 paddw xmm0, xm6 paddw xmm0, xmm1 psrlw xmm0, 4 movd [dstq+dsq*0], xmm0 pextrd [dstq+dsq*1], xmm0, 2 lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w2 RET .h_w4: movq xmm0, [srcq+ssq*0+0] movhps xmm0, [srcq+ssq*1+0] movq xmm1, [srcq+ssq*0+2] movhps xmm1, [srcq+ssq*1+2] lea srcq, [srcq+ssq*2] pmullw xmm0, xm4 pmullw xmm1, xm5 paddw xmm0, xm6 paddw xmm0, xmm1 psrlw xmm0, 4 movq [dstq+dsq*0], xmm0 movhps [dstq+dsq*1], xmm0 lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w4 RET .h_w8: movu xm0, [srcq+ssq*0+0] vinserti32x4 ym0, [srcq+ssq*1+0], 1 movu xm1, [srcq+ssq*0+2] vinserti32x4 ym1, [srcq+ssq*1+2], 1 lea srcq, [srcq+ssq*2] pmullw ym0, ym4 pmullw ym1, ym5 paddw ym0, ym6 paddw ym0, ym1 psrlw ym0, 4 mova [dstq+dsq*0], xm0 vextracti32x4 [dstq+dsq*1], ym0, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w8 RET .h_w16: movu ym0, [srcq+ssq*0+0] vinserti32x8 m0, [srcq+ssq*1+0], 1 movu ym1, [srcq+ssq*0+2] vinserti32x8 m1, [srcq+ssq*1+2], 1 lea srcq, [srcq+ssq*2] pmullw m0, m4 pmullw m1, m5 paddw m0, m6 paddw m0, m1 psrlw m0, 4 mova [dstq+dsq*0], ym0 vextracti32x8 [dstq+dsq*1], m0, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w16 RET .h_w32: pmullw m0, m4, [srcq+ssq*0+0] pmullw m2, m5, [srcq+ssq*0+2] pmullw m1, m4, [srcq+ssq*1+0] pmullw m3, m5, [srcq+ssq*1+2] lea srcq, [srcq+ssq*2] paddw m0, m6 paddw m1, m6 paddw m0, m2 paddw m1, m3 psrlw m0, 4 psrlw m1, 4 mova [dstq+dsq*0], m0 mova [dstq+dsq*1], m1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w32 RET .h_w64: pmullw m0, m4, [srcq+64*0+0] pmullw m2, m5, [srcq+64*0+2] pmullw m1, m4, [srcq+64*1+0] pmullw m3, m5, [srcq+64*1+2] add srcq, ssq paddw m0, m6 paddw m1, m6 paddw m0, m2 paddw m1, m3 psrlw m0, 4 psrlw m1, 4 mova [dstq+64*0], m0 mova [dstq+64*1], m1 add dstq, dsq dec hd jg .h_w64 RET .h_w128: pmullw m0, m4, [srcq+64*0+0] pmullw m7, m5, [srcq+64*0+2] pmullw m1, m4, [srcq+64*1+0] pmullw m8, m5, [srcq+64*1+2] pmullw m2, m4, 
[srcq+64*2+0] pmullw m9, m5, [srcq+64*2+2] pmullw m3, m4, [srcq+64*3+0] pmullw m10, m5, [srcq+64*3+2] add srcq, ssq REPX {paddw x, m6}, m0, m1, m2, m3 paddw m0, m7 paddw m1, m8 paddw m2, m9 paddw m3, m10 REPX {psrlw x, 4}, m0, m1, m2, m3 mova [dstq+64*0], m0 mova [dstq+64*1], m1 mova [dstq+64*2], m2 mova [dstq+64*3], m3 add dstq, dsq dec hd jg .h_w128 RET .v: movzx t0d, word [r7+t0*2+table_offset(put, _bilin_v)] shl mxyd, 11 vpbroadcastw m8, mxyd add t0, r7 jmp t0 .v_w2: movd xmm0, [srcq+ssq*0] .v_w2_loop: movd xmm1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] punpckldq xmm2, xmm0, xmm1 movd xmm0, [srcq+ssq*0] punpckldq xmm1, xmm0 psubw xmm1, xmm2 pmulhrsw xmm1, xm8 paddw xmm1, xmm2 movd [dstq+dsq*0], xmm1 pextrd [dstq+dsq*1], xmm1, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w2_loop RET .v_w4: movq xmm0, [srcq+ssq*0] .v_w4_loop: movq xmm1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] punpcklqdq xmm2, xmm0, xmm1 movq xmm0, [srcq+ssq*0] punpcklqdq xmm1, xmm0 psubw xmm1, xmm2 pmulhrsw xmm1, xm8 paddw xmm1, xmm2 movq [dstq+dsq*0], xmm1 movhps [dstq+dsq*1], xmm1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w4_loop RET .v_w8: movu xmm0, [srcq+ssq*0] .v_w8_loop: vbroadcasti128 ymm1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] vpblendd ymm2, ymm0, ymm1, 0xf0 vbroadcasti128 ymm0, [srcq+ssq*0] vpblendd ymm1, ymm0, 0xf0 psubw ymm1, ymm2 pmulhrsw ymm1, ym8 paddw ymm1, ymm2 mova [dstq+dsq*0], xmm1 vextracti128 [dstq+dsq*1], ymm1, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w8_loop vzeroupper RET .v_w16: movu ym0, [srcq+ssq*0] .v_w16_loop: movu ym3, [srcq+ssq*1] lea srcq, [srcq+ssq*2] psubw ym1, ym3, ym0 pmulhrsw ym1, ym8 paddw ym1, ym0 movu ym0, [srcq+ssq*0] psubw ym2, ym0, ym3 pmulhrsw ym2, ym8 paddw ym2, ym3 mova [dstq+dsq*0], ym1 mova [dstq+dsq*1], ym2 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w16_loop RET .v_w32: movu m0, [srcq+ssq*0] .v_w32_loop: movu m3, [srcq+ssq*1] lea srcq, [srcq+ssq*2] psubw m1, m3, m0 pmulhrsw m1, m8 paddw m1, m0 movu m0, [srcq+ssq*0] psubw m2, m0, m3 pmulhrsw m2, m8 paddw m2, m3 mova [dstq+dsq*0], m1 mova [dstq+dsq*1], m2 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w32_loop RET .v_w64: movu m0, [srcq+ssq*0+64*0] movu m1, [srcq+ssq*0+64*1] .v_w64_loop: movu m2, [srcq+ssq*1+64*0] movu m3, [srcq+ssq*1+64*1] lea srcq, [srcq+ssq*2] psubw m4, m2, m0 pmulhrsw m4, m8 paddw m4, m0 movu m0, [srcq+ssq*0+64*0] psubw m5, m3, m1 pmulhrsw m5, m8 paddw m5, m1 movu m1, [srcq+ssq*0+64*1] psubw m6, m0, m2 pmulhrsw m6, m8 psubw m7, m1, m3 pmulhrsw m7, m8 mova [dstq+dsq*0+64*0], m4 mova [dstq+dsq*0+64*1], m5 paddw m6, m2 paddw m7, m3 mova [dstq+dsq*1+64*0], m6 mova [dstq+dsq*1+64*1], m7 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w64_loop RET .v_w128: movu m0, [srcq+ssq*0+64*0] movu m1, [srcq+ssq*0+64*1] movu m2, [srcq+ssq*0+64*2] movu m3, [srcq+ssq*0+64*3] .v_w128_loop: movu m4, [srcq+ssq*1+64*0] movu m5, [srcq+ssq*1+64*1] movu m6, [srcq+ssq*1+64*2] movu m7, [srcq+ssq*1+64*3] lea srcq, [srcq+ssq*2] psubw m9, m4, m0 pmulhrsw m9, m8 paddw m9, m0 movu m0, [srcq+ssq*0+64*0] psubw m10, m5, m1 pmulhrsw m10, m8 paddw m10, m1 movu m1, [srcq+ssq*0+64*1] psubw m11, m6, m2 pmulhrsw m11, m8 paddw m11, m2 movu m2, [srcq+ssq*0+64*2] psubw m12, m7, m3 pmulhrsw m12, m8 paddw m12, m3 movu m3, [srcq+ssq*0+64*3] mova [dstq+dsq*0+64*0], m9 psubw m9, m0, m4 pmulhrsw m9, m8 mova [dstq+dsq*0+64*1], m10 psubw m10, m1, m5 pmulhrsw m10, m8 mova [dstq+dsq*0+64*2], m11 psubw m11, m2, m6 pmulhrsw m11, m8 mova [dstq+dsq*0+64*3], m12 psubw m12, m3, m7 pmulhrsw m12, m8 paddw m9, m4 paddw m10, m5 mova [dstq+dsq*1+64*0], m9 mova [dstq+dsq*1+64*1], m10 paddw m11, m6 paddw 
m12, m7 mova [dstq+dsq*1+64*2], m11 mova [dstq+dsq*1+64*3], m12 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w128_loop RET .hv: movzx t0d, word [r7+t0*2+table_offset(put, _bilin_hv)] shl mxyd, 11 vpbroadcastd m6, [pw_2] vpbroadcastw m7, mxyd vpbroadcastd m8, [pw_8192] add t0, r7 test dword r8m, 0x800 jnz .hv_12bpc psllw m4, 2 psllw m5, 2 vpbroadcastd m8, [pw_2048] .hv_12bpc: jmp t0 .hv_w2: vpbroadcastq xmm1, [srcq+ssq*0] pmullw xmm0, xmm1, xm4 psrlq xmm1, 16 pmullw xmm1, xm5 paddw xmm0, xm6 paddw xmm0, xmm1 psrlw xmm0, 2 .hv_w2_loop: movq xmm2, [srcq+ssq*1] lea srcq, [srcq+ssq*2] movhps xmm2, [srcq+ssq*0] pmullw xmm1, xmm2, xm4 psrlq xmm2, 16 pmullw xmm2, xm5 paddw xmm1, xm6 paddw xmm1, xmm2 psrlw xmm1, 2 ; 1 _ 2 _ shufpd xmm2, xmm0, xmm1, 0x01 ; 0 _ 1 _ mova xmm0, xmm1 psubw xmm1, xmm2 paddw xmm1, xmm1 pmulhw xmm1, xm7 paddw xmm1, xmm2 pmulhrsw xmm1, xm8 movd [dstq+dsq*0], xmm1 pextrd [dstq+dsq*1], xmm1, 2 lea dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w2_loop RET .hv_w4: pmullw xmm0, xm4, [srcq+ssq*0-8] pmullw xmm1, xm5, [srcq+ssq*0-6] paddw xmm0, xm6 paddw xmm0, xmm1 psrlw xmm0, 2 .hv_w4_loop: movq xmm1, [srcq+ssq*1+0] movq xmm2, [srcq+ssq*1+2] lea srcq, [srcq+ssq*2] movhps xmm1, [srcq+ssq*0+0] movhps xmm2, [srcq+ssq*0+2] pmullw xmm1, xm4 pmullw xmm2, xm5 paddw xmm1, xm6 paddw xmm1, xmm2 psrlw xmm1, 2 ; 1 2 shufpd xmm2, xmm0, xmm1, 0x01 ; 0 1 mova xmm0, xmm1 psubw xmm1, xmm2 paddw xmm1, xmm1 pmulhw xmm1, xm7 paddw xmm1, xmm2 pmulhrsw xmm1, xm8 movq [dstq+dsq*0], xmm1 movhps [dstq+dsq*1], xmm1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w4_loop RET .hv_w8: pmullw xmm0, xm4, [srcq+ssq*0+0] pmullw xmm1, xm5, [srcq+ssq*0+2] paddw xmm0, xm6 paddw xmm0, xmm1 psrlw xmm0, 2 vinserti32x4 ym0, xmm0, 1 .hv_w8_loop: movu xm1, [srcq+ssq*1+0] movu xm2, [srcq+ssq*1+2] lea srcq, [srcq+ssq*2] vinserti32x4 ym1, [srcq+ssq*0+0], 1 vinserti32x4 ym2, [srcq+ssq*0+2], 1 pmullw ym1, ym4 pmullw ym2, ym5 paddw ym1, ym6 paddw ym1, ym2 psrlw ym1, 2 ; 1 2 vshufi32x4 ym2, ym0, ym1, 0x01 ; 0 1 mova ym0, ym1 psubw ym1, ym2 paddw ym1, ym1 pmulhw ym1, ym7 paddw ym1, ym2 pmulhrsw ym1, ym8 mova [dstq+dsq*0], xm1 vextracti32x4 [dstq+dsq*1], ym1, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w8_loop RET .hv_w16: pmullw ym0, ym4, [srcq+ssq*0+0] pmullw ym1, ym5, [srcq+ssq*0+2] paddw ym0, ym6 paddw ym0, ym1 psrlw ym0, 2 vinserti32x8 m0, ym0, 1 .hv_w16_loop: movu ym1, [srcq+ssq*1+0] movu ym2, [srcq+ssq*1+2] lea srcq, [srcq+ssq*2] vinserti32x8 m1, [srcq+ssq*0+0], 1 vinserti32x8 m2, [srcq+ssq*0+2], 1 pmullw m1, m4 pmullw m2, m5 paddw m1, m6 paddw m1, m2 psrlw m1, 2 ; 1 2 vshufi32x4 m2, m0, m1, q1032 ; 0 1 mova m0, m1 psubw m1, m2 paddw m1, m1 pmulhw m1, m7 paddw m1, m2 pmulhrsw m1, m8 mova [dstq+dsq*0], ym1 vextracti32x8 [dstq+dsq*1], m1, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w16_loop RET .hv_w32: .hv_w64: .hv_w128: movifnidn wd, wm lea r6d, [hq+wq*8-256] mov r4, srcq mov r7, dstq .hv_w32_loop0: pmullw m0, m4, [srcq+ssq*0+0] pmullw m1, m5, [srcq+ssq*0+2] paddw m0, m6 paddw m0, m1 psrlw m0, 2 .hv_w32_loop: pmullw m3, m4, [srcq+ssq*1+0] pmullw m1, m5, [srcq+ssq*1+2] lea srcq, [srcq+ssq*2] paddw m3, m6 paddw m3, m1 psrlw m3, 2 psubw m1, m3, m0 paddw m1, m1 pmulhw m1, m7 paddw m1, m0 pmullw m0, m4, [srcq+ssq*0+0] pmullw m2, m5, [srcq+ssq*0+2] paddw m0, m6 paddw m0, m2 psrlw m0, 2 psubw m2, m0, m3 paddw m2, m2 pmulhw m2, m7 paddw m2, m3 pmulhrsw m1, m8 pmulhrsw m2, m8 mova [dstq+dsq*0], m1 mova [dstq+dsq*1], m2 lea dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w32_loop add r4, 64 add r7, 64 movzx hd, r6b mov srcq, r4 mov dstq, r7 sub r6d, 1<<8 jg .hv_w32_loop0 
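; note on the .hv_w32/.hv_w64/.hv_w128 outer loop above: r6d packs the row
; count into its low byte (reloaded per column strip via movzx hd, r6b) and
; the number of additional 64-byte column strips into the upper bits
; (w*8-256); sub r6d, 1<<8 retires one strip per pass of .hv_w32_loop0.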
RET cglobal prep_bilin_16bpc, 3, 7, 16, tmp, src, stride, w, h, mxy, stride3 movifnidn mxyd, r5m ; mx lea r6, [prep_avx512icl] tzcnt wd, wm movifnidn hd, hm test mxyd, mxyd jnz .h mov mxyd, r6m ; my test mxyd, mxyd jnz .v .prep: movzx wd, word [r6+wq*2+table_offset(prep,)] mov r5d, r7m ; bitdepth_max vpbroadcastd m5, [r6-prep_avx512icl+pw_8192] add wq, r6 shr r5d, 11 vpbroadcastd m4, [r6-prep_avx512icl+prep_mul+r5*4] lea stride3q, [strideq*3] jmp wq .prep_w4: movq xmm0, [srcq+strideq*0] movhps xmm0, [srcq+strideq*1] vpbroadcastq ymm1, [srcq+strideq*2] vpbroadcastq ymm2, [srcq+stride3q ] lea srcq, [srcq+strideq*4] vpblendd ymm0, ymm1, 0x30 vpblendd ymm0, ymm2, 0xc0 pmullw ymm0, ym4 psubw ymm0, ym5 mova [tmpq], ymm0 add tmpq, 32 sub hd, 4 jg .prep_w4 vzeroupper RET .prep_w8: movu xm0, [srcq+strideq*0] vinserti32x4 ym0, [srcq+strideq*1], 1 vinserti32x4 m0, [srcq+strideq*2], 2 vinserti32x4 m0, [srcq+stride3q ], 3 lea srcq, [srcq+strideq*4] pmullw m0, m4 psubw m0, m5 mova [tmpq], m0 add tmpq, 64 sub hd, 4 jg .prep_w8 RET .prep_w16: movu ym0, [srcq+strideq*0] vinserti32x8 m0, [srcq+strideq*1], 1 movu ym1, [srcq+strideq*2] vinserti32x8 m1, [srcq+stride3q ], 1 lea srcq, [srcq+strideq*4] pmullw m0, m4 pmullw m1, m4 psubw m0, m5 psubw m1, m5 mova [tmpq+64*0], m0 mova [tmpq+64*1], m1 add tmpq, 64*2 sub hd, 4 jg .prep_w16 RET .prep_w32: pmullw m0, m4, [srcq+strideq*0] pmullw m1, m4, [srcq+strideq*1] pmullw m2, m4, [srcq+strideq*2] pmullw m3, m4, [srcq+stride3q ] lea srcq, [srcq+strideq*4] REPX {psubw x, m5}, m0, m1, m2, m3 mova [tmpq+64*0], m0 mova [tmpq+64*1], m1 mova [tmpq+64*2], m2 mova [tmpq+64*3], m3 add tmpq, 64*4 sub hd, 4 jg .prep_w32 RET .prep_w64: pmullw m0, m4, [srcq+strideq*0+64*0] pmullw m1, m4, [srcq+strideq*0+64*1] pmullw m2, m4, [srcq+strideq*1+64*0] pmullw m3, m4, [srcq+strideq*1+64*1] lea srcq, [srcq+strideq*2] REPX {psubw x, m5}, m0, m1, m2, m3 mova [tmpq+64*0], m0 mova [tmpq+64*1], m1 mova [tmpq+64*2], m2 mova [tmpq+64*3], m3 add tmpq, 64*4 sub hd, 2 jg .prep_w64 RET .prep_w128: pmullw m0, m4, [srcq+64*0] pmullw m1, m4, [srcq+64*1] pmullw m2, m4, [srcq+64*2] pmullw m3, m4, [srcq+64*3] add srcq, strideq REPX {psubw x, m5}, m0, m1, m2, m3 mova [tmpq+64*0], m0 mova [tmpq+64*1], m1 mova [tmpq+64*2], m2 mova [tmpq+64*3], m3 add tmpq, 64*4 dec hd jg .prep_w128 RET .h: vpbroadcastw m5, mxyd mov mxyd, r6m ; my vpbroadcastd m4, [pw_16] vpbroadcastd m6, [pw_32766] psubw m4, m5 test dword r7m, 0x800 jnz .h_12bpc psllw m4, 2 psllw m5, 2 .h_12bpc: test mxyd, mxyd jnz .hv movzx wd, word [r6+wq*2+table_offset(prep, _bilin_h)] add wq, r6 lea stride3q, [strideq*3] jmp wq .h_w4: movu xm1, [srcq+strideq*0] vinserti32x4 ym1, [srcq+strideq*2], 1 movu xm2, [srcq+strideq*1] vinserti32x4 ym2, [srcq+stride3q ], 1 lea srcq, [srcq+strideq*4] punpcklqdq ym0, ym1, ym2 psrldq ym1, 2 psrldq ym2, 2 pmullw ym0, ym4 punpcklqdq ym1, ym2 pmullw ym1, ym5 psubw ym0, ym6 paddw ym0, ym1 psraw ym0, 2 mova [tmpq], ym0 add tmpq, 32 sub hd, 4 jg .h_w4 RET .h_w8: movu xm0, [srcq+strideq*0+0] movu xm1, [srcq+strideq*0+2] vinserti32x4 ym0, [srcq+strideq*1+0], 1 vinserti32x4 ym1, [srcq+strideq*1+2], 1 vinserti32x4 m0, [srcq+strideq*2+0], 2 vinserti32x4 m1, [srcq+strideq*2+2], 2 vinserti32x4 m0, [srcq+stride3q +0], 3 vinserti32x4 m1, [srcq+stride3q +2], 3 lea srcq, [srcq+strideq*4] pmullw m0, m4 pmullw m1, m5 psubw m0, m6 paddw m0, m1 psraw m0, 2 mova [tmpq], m0 add tmpq, 64 sub hd, 4 jg .h_w8 RET .h_w16: movu ym0, [srcq+strideq*0+0] vinserti32x8 m0, [srcq+strideq*1+0], 1 movu ym1, [srcq+strideq*0+2] vinserti32x8 m1, 
[srcq+strideq*1+2], 1 lea srcq, [srcq+strideq*2] pmullw m0, m4 pmullw m1, m5 psubw m0, m6 paddw m0, m1 psraw m0, 2 mova [tmpq], m0 add tmpq, 64 sub hd, 2 jg .h_w16 RET .h_w32: pmullw m0, m4, [srcq+strideq*0+0] pmullw m2, m5, [srcq+strideq*0+2] pmullw m1, m4, [srcq+strideq*1+0] pmullw m3, m5, [srcq+strideq*1+2] lea srcq, [srcq+strideq*2] psubw m0, m6 psubw m1, m6 paddw m0, m2 paddw m1, m3 psraw m0, 2 psraw m1, 2 mova [tmpq+64*0], m0 mova [tmpq+64*1], m1 add tmpq, 64*2 sub hd, 2 jg .h_w32 RET .h_w64: pmullw m0, m4, [srcq+ 0] pmullw m2, m5, [srcq+ 2] pmullw m1, m4, [srcq+64] pmullw m3, m5, [srcq+66] add srcq, strideq psubw m0, m6 psubw m1, m6 paddw m0, m2 paddw m1, m3 psraw m0, 2 psraw m1, 2 mova [tmpq+64*0], m0 mova [tmpq+64*1], m1 add tmpq, 64*2 dec hd jg .h_w64 RET .h_w128: pmullw m0, m4, [srcq+ 0] pmullw m7, m5, [srcq+ 2] pmullw m1, m4, [srcq+ 64] pmullw m8, m5, [srcq+ 66] pmullw m2, m4, [srcq+128] pmullw m9, m5, [srcq+130] pmullw m3, m4, [srcq+192] pmullw m10, m5, [srcq+194] add srcq, strideq REPX {psubw x, m6}, m0, m1, m2, m3 paddw m0, m7 paddw m1, m8 paddw m2, m9 paddw m3, m10 REPX {psraw x, 2}, m0, m1, m2, m3 mova [tmpq+64*0], m0 mova [tmpq+64*1], m1 mova [tmpq+64*2], m2 mova [tmpq+64*3], m3 add tmpq, 64*4 dec hd jg .h_w128 RET .v: movzx wd, word [r6+wq*2+table_offset(prep, _bilin_v)] vpbroadcastw m9, mxyd vpbroadcastd m8, [pw_16] vpbroadcastd m10, [pw_32766] add wq, r6 lea stride3q, [strideq*3] psubw m8, m9 test dword r7m, 0x800 jnz .v_12bpc psllw m8, 2 psllw m9, 2 .v_12bpc: jmp wq .v_w4: movq xmm0, [srcq+strideq*0] .v_w4_loop: vpbroadcastq xmm2, [srcq+strideq*1] vpbroadcastq ymm1, [srcq+strideq*2] vpbroadcastq ymm3, [srcq+stride3q ] lea srcq, [srcq+strideq*4] vpblendd ymm2, ymm1, 0x30 vpblendd ymm2, ymm3, 0xc0 vpblendd ymm1, ymm2, ymm0, 0x03 ; 0 1 2 3 movq xmm0, [srcq+strideq*0] valignq ymm2, ymm0, ymm2, 1 ; 1 2 3 4 pmullw ymm1, ym8 pmullw ymm2, ym9 psubw ymm1, ym10 paddw ymm1, ymm2 psraw ymm1, 2 mova [tmpq], ymm1 add tmpq, 32 sub hd, 4 jg .v_w4_loop vzeroupper RET .v_w8: movu xm0, [srcq+strideq*0] .v_w8_loop: vinserti32x4 ym1, ym0, [srcq+strideq*1], 1 vinserti32x4 m1, [srcq+strideq*2], 2 vinserti32x4 m1, [srcq+stride3q ], 3 ; 0 1 2 3 lea srcq, [srcq+strideq*4] movu xm0, [srcq+strideq*0] valignq m2, m0, m1, 2 ; 1 2 3 4 pmullw m1, m8 pmullw m2, m9 psubw m1, m10 paddw m1, m2 psraw m1, 2 mova [tmpq], m1 add tmpq, 64 sub hd, 4 jg .v_w8_loop RET .v_w16: movu ym0, [srcq+strideq*0] .v_w16_loop: vinserti32x8 m1, m0, [srcq+strideq*1], 1 ; 0 1 movu ym3, [srcq+strideq*2] vinserti32x8 m2, m3, [srcq+stride3q ], 1 ; 2 3 lea srcq, [srcq+strideq*4] movu ym0, [srcq+strideq*0] vshufi32x4 m3, m1, m3, q1032 ; 1 2 vshufi32x4 m4, m2, m0, q1032 ; 3 4 pmullw m1, m8 pmullw m2, m8 pmullw m3, m9 pmullw m4, m9 psubw m1, m10 psubw m2, m10 paddw m1, m3 paddw m2, m4 psraw m1, 2 psraw m2, 2 mova [tmpq+64*0], m1 mova [tmpq+64*1], m2 add tmpq, 64*2 sub hd, 4 jg .v_w16_loop RET .v_w32: movu m0, [srcq+strideq*0] .v_w32_loop: movu m3, [srcq+strideq*1] lea srcq, [srcq+strideq*2] pmullw m1, m8, m0 movu m0, [srcq+strideq*0] pmullw m2, m8, m3 pmullw m3, m9 pmullw m4, m9, m0 psubw m1, m10 psubw m2, m10 paddw m1, m3 paddw m2, m4 psraw m1, 2 psraw m2, 2 mova [tmpq+64*0], m1 mova [tmpq+64*1], m2 add tmpq, 64*2 sub hd, 2 jg .v_w32_loop RET .v_w64: movu m0, [srcq+64*0] movu m1, [srcq+64*1] .v_w64_loop: add srcq, strideq pmullw m2, m8, m0 movu m0, [srcq+64*0] pmullw m3, m8, m1 movu m1, [srcq+64*1] pmullw m4, m9, m0 pmullw m5, m9, m1 psubw m2, m10 psubw m3, m10 paddw m2, m4 paddw m3, m5 psraw m2, 2 psraw m3, 2 mova [tmpq+64*0], m2 
mova [tmpq+64*1], m3 add tmpq, 64*2 dec hd jg .v_w64_loop RET .v_w128: movu m0, [srcq+64*0] movu m1, [srcq+64*1] movu m2, [srcq+64*2] movu m3, [srcq+64*3] .v_w128_loop: add srcq, strideq pmullw m4, m8, m0 movu m0, [srcq+64*0] pmullw m5, m8, m1 movu m1, [srcq+64*1] pmullw m6, m8, m2 movu m2, [srcq+64*2] pmullw m7, m8, m3 movu m3, [srcq+64*3] pmullw m11, m9, m0 pmullw m12, m9, m1 pmullw m13, m9, m2 pmullw m14, m9, m3 REPX {psubw x, m10}, m4, m5, m6, m7 paddw m4, m11 paddw m5, m12 paddw m6, m13 paddw m7, m14 REPX {psraw x, 2}, m4, m5, m6, m7 mova [tmpq+64*0], m4 mova [tmpq+64*1], m5 mova [tmpq+64*2], m6 mova [tmpq+64*3], m7 add tmpq, 64*4 dec hd jg .v_w128_loop RET .hv: movzx wd, word [r6+wq*2+table_offset(prep, _bilin_hv)] shl mxyd, 11 vpbroadcastw m7, mxyd add wq, r6 lea stride3q, [strideq*3] jmp wq .hv_w4: movq xmm0, [srcq+strideq*0+0] movq xmm1, [srcq+strideq*0+2] pmullw xmm0, xm4 pmullw xmm1, xm5 psubw xmm0, xm6 paddw xmm0, xmm1 psraw xmm0, 2 vpbroadcastq ym0, xmm0 .hv_w4_loop: movu xm1, [srcq+strideq*1] vinserti128 ym1, [srcq+stride3q ], 1 movu xm2, [srcq+strideq*2] lea srcq, [srcq+strideq*4] vinserti128 ym2, [srcq+strideq*0], 1 punpcklqdq ym3, ym1, ym2 psrldq ym1, 2 psrldq ym2, 2 pmullw ym3, ym4 punpcklqdq ym1, ym2 pmullw ym1, ym5 psubw ym3, ym6 paddw ym1, ym3 psraw ym1, 2 ; 1 2 3 4 valignq ym2, ym1, ym0, 3 ; 0 1 2 3 mova ym0, ym1 psubw ym1, ym2 pmulhrsw ym1, ym7 paddw ym1, ym2 mova [tmpq], ym1 add tmpq, 32 sub hd, 4 jg .hv_w4_loop RET .hv_w8: pmullw xm0, xm4, [srcq+strideq*0+0] pmullw xm1, xm5, [srcq+strideq*0+2] psubw xm0, xm6 paddw xm0, xm1 psraw xm0, 2 vinserti32x4 m0, xm0, 3 .hv_w8_loop: movu xm1, [srcq+strideq*1+0] movu xm2, [srcq+strideq*1+2] vinserti32x4 ym1, [srcq+strideq*2+0], 1 vinserti32x4 ym2, [srcq+strideq*2+2], 1 vinserti32x4 m1, [srcq+stride3q +0], 2 vinserti32x4 m2, [srcq+stride3q +2], 2 lea srcq, [srcq+strideq*4] vinserti32x4 m1, [srcq+strideq*0+0], 3 vinserti32x4 m2, [srcq+strideq*0+2], 3 pmullw m1, m4 pmullw m2, m5 psubw m1, m6 paddw m1, m2 psraw m1, 2 ; 1 2 3 4 valignq m2, m1, m0, 6 ; 0 1 2 3 mova m0, m1 psubw m1, m2 pmulhrsw m1, m7 paddw m1, m2 mova [tmpq], m1 add tmpq, 64 sub hd, 4 jg .hv_w8_loop RET .hv_w16: pmullw ym0, ym4, [srcq+strideq*0+0] pmullw ym1, ym5, [srcq+strideq*0+2] psubw ym0, ym6 paddw ym0, ym1 psraw ym0, 2 vinserti32x8 m0, ym0, 1 .hv_w16_loop: movu ym1, [srcq+strideq*1+0] movu ym2, [srcq+strideq*1+2] lea srcq, [srcq+strideq*2] vinserti32x8 m1, [srcq+strideq*0+0], 1 vinserti32x8 m2, [srcq+strideq*0+2], 1 pmullw m1, m4 pmullw m2, m5 psubw m1, m6 paddw m1, m2 psraw m1, 2 ; 1 2 vshufi32x4 m2, m0, m1, q1032 ; 0 1 mova m0, m1 psubw m1, m2 pmulhrsw m1, m7 paddw m1, m2 mova [tmpq], m1 add tmpq, 64 sub hd, 2 jg .hv_w16_loop RET .hv_w32: pmullw m0, m4, [srcq+strideq*0+0] pmullw m1, m5, [srcq+strideq*0+2] psubw m0, m6 paddw m0, m1 psraw m0, 2 .hv_w32_loop: pmullw m3, m4, [srcq+strideq*1+0] pmullw m1, m5, [srcq+strideq*1+2] lea srcq, [srcq+strideq*2] psubw m3, m6 paddw m3, m1 psraw m3, 2 psubw m1, m3, m0 pmulhrsw m1, m7 paddw m1, m0 pmullw m0, m4, [srcq+strideq*0+0] pmullw m2, m5, [srcq+strideq*0+2] psubw m0, m6 paddw m0, m2 psraw m0, 2 psubw m2, m0, m3 pmulhrsw m2, m7 paddw m2, m3 mova [tmpq+64*0], m1 mova [tmpq+64*1], m2 add tmpq, 64*2 sub hd, 2 jg .hv_w32_loop RET .hv_w64: pmullw m0, m4, [srcq+ 0] pmullw m2, m5, [srcq+ 2] pmullw m1, m4, [srcq+64] pmullw m3, m5, [srcq+66] psubw m0, m6 psubw m1, m6 paddw m0, m2 paddw m1, m3 psraw m0, 2 psraw m1, 2 .hv_w64_loop: add srcq, strideq pmullw m2, m4, [srcq+ 0] pmullw m8, m5, [srcq+ 2] pmullw m3, m4, [srcq+64] 
pmullw m9, m5, [srcq+66] psubw m2, m6 psubw m3, m6 paddw m2, m8 paddw m3, m9 psraw m2, 2 psraw m3, 2 psubw m8, m2, m0 psubw m9, m3, m1 pmulhrsw m8, m7 pmulhrsw m9, m7 paddw m8, m0 mova m0, m2 paddw m9, m1 mova m1, m3 mova [tmpq+64*0], m8 mova [tmpq+64*1], m9 add tmpq, 64*2 dec hd jg .hv_w64_loop RET .hv_w128: pmullw m0, m4, [srcq+ 0] pmullw m8, m5, [srcq+ 2] pmullw m1, m4, [srcq+ 64] pmullw m9, m5, [srcq+ 66] pmullw m2, m4, [srcq+128] pmullw m10, m5, [srcq+130] pmullw m3, m4, [srcq+192] pmullw m11, m5, [srcq+194] REPX {psubw x, m6}, m0, m1, m2, m3 paddw m0, m8 paddw m1, m9 paddw m2, m10 paddw m3, m11 REPX {psraw x, 2}, m0, m1, m2, m3 .hv_w128_loop: add srcq, strideq pmullw m8, m4, [srcq+ 0] pmullw m12, m5, [srcq+ 2] pmullw m9, m4, [srcq+ 64] pmullw m13, m5, [srcq+ 66] pmullw m10, m4, [srcq+128] pmullw m14, m5, [srcq+130] pmullw m11, m4, [srcq+192] pmullw m15, m5, [srcq+194] REPX {psubw x, m6}, m8, m9, m10, m11 paddw m8, m12 paddw m9, m13 paddw m10, m14 paddw m11, m15 REPX {psraw x, 2}, m8, m9, m10, m11 psubw m12, m8, m0 psubw m13, m9, m1 psubw m14, m10, m2 psubw m15, m11, m3 REPX {pmulhrsw x, m7}, m12, m13, m14, m15 paddw m12, m0 mova m0, m8 paddw m13, m1 mova m1, m9 mova [tmpq+64*0], m12 mova [tmpq+64*1], m13 paddw m14, m2 mova m2, m10 paddw m15, m3 mova m3, m11 mova [tmpq+64*2], m14 mova [tmpq+64*3], m15 add tmpq, 64*4 dec hd jg .hv_w128_loop RET ; int8_t subpel_filters[5][15][8] %assign FILTER_REGULAR (0*15 << 16) | 3*15 %assign FILTER_SMOOTH (1*15 << 16) | 4*15 %assign FILTER_SHARP (2*15 << 16) | 3*15 %macro MC_8TAP_FN 4 ; prefix, type, type_h, type_v cglobal %1_8tap_%2_16bpc mov t0d, FILTER_%3 %ifidn %3, %4 mov t1d, t0d %else mov t1d, FILTER_%4 %endif %ifnidn %2, regular ; skip the jump in the last filter jmp mangle(private_prefix %+ _%1_8tap_16bpc %+ SUFFIX) %endif %endmacro %if WIN64 DECLARE_REG_TMP 4, 5 %define buf rsp+stack_offset+8 ; shadow space %else DECLARE_REG_TMP 7, 8 %define buf rsp-40 ; red zone %endif MC_8TAP_FN put, sharp, SHARP, SHARP MC_8TAP_FN put, sharp_smooth, SHARP, SMOOTH MC_8TAP_FN put, smooth_sharp, SMOOTH, SHARP MC_8TAP_FN put, smooth, SMOOTH, SMOOTH MC_8TAP_FN put, sharp_regular, SHARP, REGULAR MC_8TAP_FN put, regular_sharp, REGULAR, SHARP MC_8TAP_FN put, smooth_regular, SMOOTH, REGULAR MC_8TAP_FN put, regular_smooth, REGULAR, SMOOTH MC_8TAP_FN put, regular, REGULAR, REGULAR cglobal put_8tap_16bpc, 4, 9, 16, dst, ds, src, ss, w, h, mx, my %define base r8-put_avx512icl imul mxd, mxm, 0x010101 add mxd, t0d ; 8tap_h, mx, 4tap_h imul myd, mym, 0x010101 add myd, t1d ; 8tap_v, my, 4tap_v lea r8, [put_avx512icl] movifnidn wd, wm movifnidn hd, hm test mxd, 0xf00 jnz .h test myd, 0xf00 jnz .v tzcnt wd, wd movzx wd, word [r8+wq*2+table_offset(put,)] add wq, r8 %if WIN64 pop r8 %endif jmp wq .h_w2: movzx mxd, mxb sub srcq, 2 mova ym2, [spel_h_shuf2a] pmovsxbw xmm4, [base+subpel_filters+mxq*8] pshufd xmm3, xmm4, q1111 pshufd xmm4, xmm4, q2222 .h_w2_loop: movu xm1, [srcq+ssq*0] vinserti32x4 ym1, [srcq+ssq*1], 1 lea srcq, [srcq+ssq*2] mova xmm0, xm8 vpermb ym1, ym2, ym1 vpdpwssd xmm0, xmm3, xm1 vextracti32x4 xm1, ym1, 1 vpdpwssd xmm0, xmm4, xm1 psrad xmm0, 6 packusdw xmm0, xmm0 pminsw xmm0, xm9 movd [dstq+dsq*0], xmm0 pextrd [dstq+dsq*1], xmm0, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w2_loop RET .h_w4: movzx mxd, mxb sub srcq, 2 pmovsxbw xmm0, [base+subpel_filters+mxq*8] vbroadcasti32x4 ym4, [spel_h_shufA] vbroadcasti32x4 ym5, [spel_h_shufB] pshufd xmm0, xmm0, q2211 vpbroadcastq ym6, xmm0 vpermq ym7, ymm0, q1111 .h_w4_loop: movu xm2, [srcq+ssq*0] vinserti32x4 ym2, 
[srcq+ssq*1], 1 lea srcq, [srcq+ssq*2] mova ym0, ym8 pshufb ym1, ym2, ym4 vpdpwssd ym0, ym6, ym1 pshufb ym2, ym5 vpdpwssd ym0, ym7, ym2 psrad ym0, 6 vextracti32x4 xm1, ym0, 1 packusdw xm0, xm1 pminsw xmm0, xm0, xm9 movq [dstq+dsq*0], xmm0 movhps [dstq+dsq*1], xmm0 lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w4_loop RET .h: test myd, 0xf00 jnz .hv mov r7d, r8m vpbroadcastw m9, r8m shr r7d, 11 vpbroadcastd m8, [base+put_8tap_h_rnd+r7*4] cmp wd, 4 je .h_w4 jl .h_w2 shr mxd, 16 sub srcq, 6 pmovsxbw xmm0, [base+subpel_filters+mxq*8] mova [buf], xmm0 vpbroadcastd m10, xmm0 vpbroadcastd m11, [buf+ 4] vpbroadcastd m12, [buf+ 8] vpbroadcastd m13, [buf+12] sub wd, 16 je .h_w16 jg .h_w32 .h_w8: mova m4, [spel_h_shufA] movu m5, [spel_h_shufB] movu m6, [spel_h_shufC] mova m7, [spel_h_shufD] .h_w8_loop: movu ym2, [srcq+ssq*0] vinserti32x8 m2, [srcq+ssq*1], 1 lea srcq, [srcq+ssq*2] mova m0, m8 vpermb m1, m4, m2 vpdpwssd m0, m10, m1 vpermb m1, m5, m2 vpdpwssd m0, m11, m1 vpermb m1, m6, m2 vpdpwssd m0, m12, m1 vpermb m1, m7, m2 vpdpwssd m0, m13, m1 psrad m0, 6 vextracti32x8 ym1, m0, 1 packusdw ym0, ym1 pminsw ym0, ym9 mova [dstq+dsq*0], xm0 vextracti32x4 [dstq+dsq*1], ym0, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w8_loop RET .h_w16: vbroadcasti32x4 m6, [spel_h_shufA] vbroadcasti32x4 m7, [spel_h_shufB] .h_w16_loop: movu ym2, [srcq+ssq*0+ 0] vinserti32x8 m2, [srcq+ssq*1+ 0], 1 movu ym3, [srcq+ssq*0+16] vinserti32x8 m3, [srcq+ssq*1+16], 1 lea srcq, [srcq+ssq*2] mova m0, m8 mova m1, m8 pshufb m4, m2, m6 vpdpwssd m0, m10, m4 ; a0 pshufb m4, m3, m6 vpdpwssd m1, m12, m4 ; b2 pshufb m4, m2, m7 vpdpwssd m0, m11, m4 ; a1 pshufb m4, m3, m7 vpdpwssd m1, m13, m4 ; b3 shufpd m2, m3, 0x55 pshufb m4, m2, m6 vpdpwssd m0, m12, m4 ; a2 vpdpwssd m1, m10, m4 ; b0 pshufb m2, m7 vpdpwssd m0, m13, m2 ; a3 vpdpwssd m1, m11, m2 ; b1 psrad m0, 6 psrad m1, 6 packusdw m0, m1 pminsw m0, m9 mova [dstq+dsq*0], ym0 vextracti32x8 [dstq+dsq*1], m0, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w16_loop RET .h_w32: lea srcq, [srcq+wq*2] vbroadcasti32x4 m6, [spel_h_shufA] lea dstq, [dstq+wq*2] vbroadcasti32x4 m7, [spel_h_shufB] neg wq .h_w32_loop0: mov r6, wq .h_w32_loop: movu m2, [srcq+r6*2+ 0] movu m3, [srcq+r6*2+ 8] mova m0, m8 mova m1, m8 pshufb m4, m2, m6 vpdpwssd m0, m10, m4 ; a0 pshufb m4, m3, m6 vpdpwssd m1, m10, m4 ; b0 vpdpwssd m0, m12, m4 ; a2 movu m4, [srcq+r6*2+16] pshufb m3, m7 vpdpwssd m1, m11, m3 ; b1 vpdpwssd m0, m13, m3 ; a3 pshufb m3, m4, m6 vpdpwssd m1, m12, m3 ; b2 pshufb m2, m7 vpdpwssd m0, m11, m2 ; a1 pshufb m4, m7 vpdpwssd m1, m13, m4 ; b3 psrad m0, 6 psrad m1, 6 packusdw m0, m1 pminsw m0, m9 mova [dstq+r6*2], m0 add r6, 32 jl .h_w32_loop add srcq, ssq add dstq, dsq dec hd jg .h_w32_loop0 RET .v: movzx mxd, myb shr myd, 16 cmp hd, 6 cmovs myd, mxd vpbroadcastd m10, [pd_32] pmovsxbw xmm0, [base+subpel_filters+myq*8] tzcnt r7d, wd vpbroadcastw m11, r8m lea r6, [ssq*3] movzx r7d, word [r8+r7*2+table_offset(put, _8tap_v)] sub srcq, r6 mova [rsp+stack_offset+8], xmm0 vpbroadcastd m12, xmm0 add r7, r8 vpbroadcastd m13, [rsp+stack_offset+12] vpbroadcastd m14, [rsp+stack_offset+16] vpbroadcastd m15, [rsp+stack_offset+20] jmp r7 .v_w2: movd xmm2, [srcq+ssq*0] pinsrd xmm2, [srcq+ssq*1], 1 pinsrd xmm2, [srcq+ssq*2], 2 add srcq, r6 pinsrd xmm2, [srcq+ssq*0], 3 ; 0 1 2 3 movd xmm3, [srcq+ssq*1] vpbroadcastd xmm1, [srcq+ssq*2] add srcq, r6 vpbroadcastd xmm0, [srcq+ssq*0] vpblendd xmm3, xmm1, 0x02 ; 4 5 vpblendd xmm1, xmm0, 0x02 ; 5 6 palignr xmm4, xmm3, xmm2, 4 ; 1 2 3 4 punpcklwd xmm3, xmm1 ; 45 56 punpcklwd xmm1, xmm2, xmm4 ; 01 12 
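; (notation: labels like "01 12" mean each dword lane packs two adjacent rows
;  as an int16 pair, so one vpdpwssd per coefficient pair applies two vertical
;  filter taps at once)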
punpckhwd xmm2, xmm4 ; 23 34 .v_w2_loop: vpbroadcastd xmm4, [srcq+ssq*1] lea srcq, [srcq+ssq*2] mova xmm5, xm10 vpdpwssd xmm5, xm12, xmm1 ; a0 b0 mova xmm1, xmm2 vpdpwssd xmm5, xm13, xmm2 ; a1 b1 mova xmm2, xmm3 vpdpwssd xmm5, xm14, xmm3 ; a2 b2 vpblendd xmm3, xmm0, xmm4, 0x02 ; 6 7 vpbroadcastd xmm0, [srcq+ssq*0] vpblendd xmm4, xmm0, 0x02 ; 7 8 punpcklwd xmm3, xmm4 ; 67 78 vpdpwssd xmm5, xm15, xmm3 ; a3 b3 psrad xmm5, 6 packusdw xmm5, xmm5 pminsw xmm5, xm11 movd [dstq+dsq*0], xmm5 pextrd [dstq+dsq*1], xmm5, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w2_loop RET .v_w4: movq xmm1, [srcq+ssq*0] vpbroadcastq ymm0, [srcq+ssq*1] vpbroadcastq ymm2, [srcq+ssq*2] add srcq, r6 vpbroadcastq ymm4, [srcq+ssq*0] vpbroadcastq ymm3, [srcq+ssq*1] vpbroadcastq ymm5, [srcq+ssq*2] add srcq, r6 vpblendd ymm1, ymm0, 0x30 vpblendd ymm0, ymm2, 0x30 punpcklwd ymm1, ymm0 ; 01 12 vpbroadcastq ymm0, [srcq+ssq*0] vpblendd ymm2, ymm4, 0x30 vpblendd ymm4, ymm3, 0x30 punpcklwd ymm2, ymm4 ; 23 34 vpblendd ymm3, ymm5, 0x30 vpblendd ymm5, ymm0, 0x30 punpcklwd ymm3, ymm5 ; 45 56 .v_w4_loop: vpbroadcastq ymm5, [srcq+ssq*1] lea srcq, [srcq+ssq*2] mova ymm4, ym10 vpdpwssd ymm4, ym12, ymm1 ; a0 b0 mova ymm1, ymm2 vpdpwssd ymm4, ym13, ymm2 ; a1 b1 mova ymm2, ymm3 vpdpwssd ymm4, ym14, ymm3 ; a2 b2 vpblendd ymm3, ymm0, ymm5, 0x30 vpbroadcastq ymm0, [srcq+ssq*0] vpblendd ymm5, ymm0, 0x30 punpcklwd ymm3, ymm5 ; 67 78 vpdpwssd ymm4, ym15, ymm3 ; a3 b3 psrad ymm4, 6 vextracti128 xmm5, ymm4, 1 packusdw xmm4, xmm5 pminsw xmm4, xm11 movq [dstq+dsq*0], xmm4 movhps [dstq+dsq*1], xmm4 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w4_loop vzeroupper RET .v_w8: vbroadcasti32x4 m2, [srcq+ssq*2] vinserti32x4 m1, m2, [srcq+ssq*0], 0 vinserti32x4 m1, [srcq+ssq*1], 1 ; 0 1 2 add srcq, r6 vinserti32x4 ym2, [srcq+ssq*0], 1 vinserti32x4 m2, [srcq+ssq*1], 2 ; 2 3 4 mova m6, [spel_v_shuf8] movu xm0, [srcq+ssq*1] vinserti32x4 ym0, [srcq+ssq*2], 1 add srcq, r6 vinserti32x4 m0, [srcq+ssq*0], 2 ; 4 5 6 vpermb m1, m6, m1 ; 01 12 vpermb m2, m6, m2 ; 23 34 vpermb m3, m6, m0 ; 45 56 .v_w8_loop: vinserti32x4 m0, [srcq+ssq*1], 3 lea srcq, [srcq+ssq*2] movu xm5, [srcq+ssq*0] mova m4, m10 vpdpwssd m4, m12, m1 ; a0 b0 mova m1, m2 vshufi32x4 m0, m5, q1032 ; 6 7 8 vpdpwssd m4, m13, m2 ; a1 b1 mova m2, m3 vpdpwssd m4, m14, m3 ; a2 b2 vpermb m3, m6, m0 ; 67 78 vpdpwssd m4, m15, m3 ; a3 b3 psrad m4, 6 vextracti32x8 ym5, m4, 1 packusdw ym4, ym5 pminsw ym4, ym11 mova [dstq+dsq*0], xm4 vextracti32x4 [dstq+dsq*1], ym4, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w8_loop RET .v_w16: vbroadcasti32x8 m1, [srcq+ssq*1] vinserti32x8 m0, m1, [srcq+ssq*0], 0 vinserti32x8 m1, [srcq+ssq*2], 1 mova m8, [spel_v_shuf16] add srcq, r6 movu ym3, [srcq+ssq*0] vinserti32x8 m3, [srcq+ssq*1], 1 movu ym5, [srcq+ssq*2] add srcq, r6 vinserti32x8 m5, [srcq+ssq*0], 1 vpermb m0, m8, m0 ; 01 vpermb m1, m8, m1 ; 12 vpermb m3, m8, m3 ; 34 vpermb m5, m8, m5 ; 56 mova m9, [deint_q_shuf] vpshrdd m2, m1, m3, 16 ; 23 vpshrdd m4, m3, m5, 16 ; 45 .v_w16_loop: mova m6, m10 mova m7, m10 vpdpwssd m6, m12, m0 ; a0 mova m0, m2 vpdpwssd m7, m12, m1 ; b0 mova m1, m3 vpdpwssd m6, m13, m2 ; a1 mova m2, m4 vpdpwssd m7, m13, m3 ; b1 mova m3, m5 vpdpwssd m6, m14, m4 ; a2 mova m4, m5 vpdpwssd m7, m14, m5 ; b2 movu ym5, [srcq+ssq*1] lea srcq, [srcq+ssq*2] vinserti32x8 m5, [srcq+ssq*0], 1 vpermb m5, m8, m5 ; 78 vpshrdd m4, m5, 16 ; 67 vpdpwssd m6, m15, m4 ; a3 vpdpwssd m7, m15, m5 ; b3 psrad m6, 6 psrad m7, 6 packusdw m6, m7 pminsw m6, m11 vpermq m6, m9, m6 mova [dstq+dsq*0], ym6 vextracti32x8 [dstq+dsq*1], m6, 1 lea dstq, [dstq+dsq*2] 
sub hd, 2 jg .v_w16_loop RET .v_w32: .v_w64: .v_w128: %if WIN64 movaps [rsp+stack_offset+8], xmm6 %endif lea wd, [hq+wq*8-256] mov r7, srcq mov r8, dstq .v_w32_loop0: movu m16, [srcq+ssq*0] movu m17, [srcq+ssq*1] movu m18, [srcq+ssq*2] add srcq, r6 movu m19, [srcq+ssq*0] movu m20, [srcq+ssq*1] movu m21, [srcq+ssq*2] add srcq, r6 movu m22, [srcq+ssq*0] punpcklwd m0, m16, m17 ; 01l punpckhwd m16, m17 ; 01h punpcklwd m1, m17, m18 ; 12l punpckhwd m17, m18 ; 12h punpcklwd m2, m18, m19 ; 23l punpckhwd m18, m19 ; 23h punpcklwd m3, m19, m20 ; 34l punpckhwd m19, m20 ; 34h punpcklwd m4, m20, m21 ; 45l punpckhwd m20, m21 ; 45h punpcklwd m5, m21, m22 ; 56l punpckhwd m21, m22 ; 56h .v_w32_loop: mova m6, m10 vpdpwssd m6, m12, m0 ; a0l mova m8, m10 vpdpwssd m8, m12, m16 ; a0h mova m7, m10 vpdpwssd m7, m12, m1 ; b0l mova m9, m10 vpdpwssd m9, m12, m17 ; b0h mova m0, m2 vpdpwssd m6, m13, m2 ; a1l mova m16, m18 vpdpwssd m8, m13, m18 ; a1h mova m1, m3 vpdpwssd m7, m13, m3 ; b1l mova m17, m19 vpdpwssd m9, m13, m19 ; b1h mova m2, m4 vpdpwssd m6, m14, m4 ; a2l mova m18, m20 vpdpwssd m8, m14, m20 ; a2h mova m3, m5 vpdpwssd m7, m14, m5 ; b2l mova m19, m21 vpdpwssd m9, m14, m21 ; b2h movu m21, [srcq+ssq*1] lea srcq, [srcq+ssq*2] punpcklwd m4, m22, m21 ; 67l punpckhwd m20, m22, m21 ; 67h movu m22, [srcq+ssq*0] vpdpwssd m6, m15, m4 ; a3l vpdpwssd m8, m15, m20 ; a3h punpcklwd m5, m21, m22 ; 78l punpckhwd m21, m22 ; 78h vpdpwssd m7, m15, m5 ; b3l vpdpwssd m9, m15, m21 ; b3h REPX {psrad x, 6}, m6, m8, m7, m9 packusdw m6, m8 packusdw m7, m9 pminsw m6, m11 pminsw m7, m11 mova [dstq+dsq*0], m6 mova [dstq+dsq*1], m7 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w32_loop add r7, 64 add r8, 64 movzx hd, wb mov srcq, r7 mov dstq, r8 sub wd, 1<<8 jg .v_w32_loop0 %if WIN64 movaps xmm6, [rsp+stack_offset+8] %endif vzeroupper RET .hv: vpbroadcastw m11, r8m cmp wd, 4 jg .hv_w8 movzx mxd, mxb pmovsxbw xmm0, [base+subpel_filters+mxq*8] movzx mxd, myb shr myd, 16 cmp hd, 6 cmovs myd, mxd pmovsxbw xmm1, [base+subpel_filters+myq*8] lea r6, [ssq*3] sub srcq, 2 sub srcq, r6 test dword r8m, 0x800 jnz .hv_12bit vpbroadcastd m10, [pd_2176] psllw xmm0, 6 jmp .hv_main .hv_12bit: vpbroadcastd m10, [pd_640] psllw xmm0, 4 psllw xmm1, 2 .hv_main: mova [buf+ 0], xmm0 mova [buf+16], xmm1 vpbroadcastd m8, [buf+ 4] vpbroadcastd m9, [buf+ 8] vpbroadcastd ym12, xmm1 vpbroadcastd ym13, [buf+20] vpbroadcastd ym14, [buf+24] vpbroadcastd ym15, [buf+28] movu xm4, [srcq+ssq*0] vinserti32x4 ym4, [srcq+ssq*1], 1 vinserti32x4 m4, [srcq+ssq*2], 2 add srcq, r6 vinserti32x4 m4, [srcq+ssq*0], 3 ; 0 1 2 3 movu xm0, [srcq+ssq*1] vinserti32x4 ym0, [srcq+ssq*2], 1 add srcq, r6 vinserti32x4 m0, [srcq+ssq*0], 2 ; 4 5 6 cmp wd, 4 je .hv_w4 vbroadcasti32x4 m2, [spel_h_shufA] mova m3, [spel_h_shuf2b] mova ym6, [spel_h_shuf2a] mova xm7, [spel_shuf2] mova m1, m10 pshufb m4, m2 pshufb m0, m2 punpcklqdq m2, m4, m0 vpdpwssd m1, m8, m2 ; 04 15 26 3_ punpckhqdq m4, m0 vpdpwssd m1, m9, m4 vpermb m1, m3, m1 ; 01 12 vextracti32x4 xm2, ym1, 1 ; 23 34 vextracti32x4 xm3, m1, 2 ; 45 56 .hv_w2_loop: movu xm5, [srcq+ssq*1] lea srcq, [srcq+ssq*2] vinserti32x4 ym5, [srcq+ssq*0], 1 mova xm4, xm10 vpermb ym5, ym6, ym5 pmaddwd xmm0, xm12, xm1 ; a0 b0 vpdpwssd xm4, xm8, xm5 vextracti32x4 xm5, ym5, 1 mova xm1, xm2 vpdpwssd xmm0, xm13, xm2 ; a1 b1 vpdpwssd xm4, xm9, xm5 ; 7 8 mova xm2, xm3 vpdpwssd xmm0, xm14, xm3 ; a2 b2 vpermt2b xm3, xm7, xm4 ; 67 78 vpdpwssd xmm0, xm15, xm3 ; a3 b3 psrad xmm0, 10 packusdw xmm0, xmm0 pminsw xmm0, xm11 movd [dstq+dsq*0], xmm0 pextrd [dstq+dsq*1], xmm0, 1 lea dstq, 
[dstq+dsq*2] sub hd, 2 jg .hv_w2_loop RET .hv_w4: vbroadcasti32x4 m19, [spel_h_shufA] vbroadcasti32x4 m20, [spel_h_shufB] mova ym6, [spel_shuf4a] mova ym7, [spel_shuf4b] mova m2, m10 mova m3, m10 pshufb m1, m4, m19 vpdpwssd m2, m8, m1 pshufb m1, m0, m19 vpdpwssd m3, m8, m1 pshufb m4, m20 vpdpwssd m2, m9, m4 pshufb m0, m20 vpdpwssd m3, m9, m0 vpermb m1, m6, m2 ; 01 12 vshufi32x4 m2, m3, q1032 vpermb m3, m6, m3 ; 45 56 vpermb m2, m6, m2 ; 23 34 .hv_w4_loop: movu xm18, [srcq+ssq*1] lea srcq, [srcq+ssq*2] vinserti128 ym18, [srcq+ssq*0], 1 mova ym4, ym10 pshufb ym17, ym18, ym19 pmaddwd ym16, ym12, ym1 ; a0 b0 vpdpwssd ym4, ym8, ym17 pshufb ym18, ym20 mova ym1, ym2 vpdpwssd ym16, ym13, ym2 ; a1 b1 vpdpwssd ym4, ym9, ym18 ; 7 8 mova ym2, ym3 vpdpwssd ym16, ym14, ym3 ; a2 b2 vpermt2b ym3, ym7, ym4 ; 67 78 vpdpwssd ym16, ym15, ym3 ; a3 b3 psrad ym16, 10 vextracti128 xm17, ym16, 1 packusdw xm16, xm17 pminsw xm16, xm11 movq [dstq+dsq*0], xm16 movhps [dstq+dsq*1], xm16 lea dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w4_loop vzeroupper RET .hv_w8: shr mxd, 16 pmovsxbw xmm0, [base+subpel_filters+mxq*8] movzx mxd, myb shr myd, 16 cmp hd, 6 cmovs myd, mxd pmovsxbw xmm1, [base+subpel_filters+myq*8] lea r6, [ssq*3] sub srcq, 6 sub srcq, r6 test dword r8m, 0x800 jnz .hv_w8_12bit vpbroadcastd m10, [pd_2176] psllw xmm0, 6 jmp .hv_w8_main .hv_w8_12bit: vpbroadcastd m10, [pd_640] psllw xmm0, 4 psllw xmm1, 2 .hv_w8_main: mova [buf+ 0], xmm0 mova [buf+16], xmm1 vpbroadcastd m12, xmm0 vpbroadcastd m13, [buf+ 4] vpbroadcastd m14, [buf+ 8] vpbroadcastd m15, [buf+12] vpbroadcastd m16, xmm1 vpbroadcastd m17, [buf+20] vpbroadcastd m18, [buf+24] vpbroadcastd m19, [buf+28] cmp wd, 16 je .hv_w16 jg .hv_w32 mova m5, [spel_h_shufA] movu ym0, [srcq+ssq*0] vinserti32x8 m0, [srcq+ssq*1], 1 ; 0 1 movu ym9, [srcq+ssq*2] add srcq, r6 vinserti32x8 m9, [srcq+ssq*0], 1 ; 2 3 movu ym20, [srcq+ssq*1] vinserti32x8 m20, [srcq+ssq*2], 1 ; 4 5 add srcq, r6 movu ym21, [srcq+ssq*0] ; 6 movu m6, [spel_h_shufB] movu m7, [spel_h_shufC] vpermb m8, m5, m0 mova m1, m10 vpdpwssd m1, m12, m8 ; a0 b0 vpermb m8, m5, m9 mova m2, m10 vpdpwssd m2, m12, m8 ; c0 d0 vpermb m8, m5, m20 mova m3, m10 vpdpwssd m3, m12, m8 ; e0 f0 vpermb m8, m5, m21 mova m4, m10 vpdpwssd m4, m12, m8 ; g0 vpermb m8, m6, m0 vpdpwssd m1, m13, m8 ; a1 b1 vpermb m8, m6, m9 vpdpwssd m2, m13, m8 ; c1 d1 vpermb m8, m6, m20 vpdpwssd m3, m13, m8 ; e1 f1 vpermb m8, m6, m21 vpdpwssd m4, m13, m8 ; g1 vpermb m8, m7, m0 vpdpwssd m1, m14, m8 ; a2 b2 vpermb m8, m7, m9 vpdpwssd m2, m14, m8 ; c2 d2 vpermb m8, m7, m20 vpdpwssd m3, m14, m8 ; e2 f2 vpermb m8, m7, m21 vpdpwssd m4, m14, m8 ; g2 mova m8, [spel_h_shufD] vpermb m0, m8, m0 vpdpwssd m1, m15, m0 ; a3 b3 mova m0, [spel_shuf8a] vpermb m9, m8, m9 vpdpwssd m2, m15, m9 ; c3 d3 mova m9, [spel_shuf8b] vpermb m20, m8, m20 vpdpwssd m3, m15, m20 ; e3 f3 vpermb m21, m8, m21 vpdpwssd m4, m15, m21 ; g3 vpermt2b m1, m0, m2 ; 01 12 vpermt2b m2, m0, m3 ; 23 34 vpermt2b m3, m0, m4 ; 45 56 .hv_w8_loop: movu ym0, [srcq+ssq*1] lea srcq, [srcq+ssq*2] vinserti32x8 m0, [srcq+ssq*0], 1 mova m4, m10 vpermb m21, m5, m0 vpdpwssd m4, m12, m21 ; h0 i0 vpermb m21, m6, m0 pmaddwd m20, m16, m1 ; A0 B0 vpdpwssd m4, m13, m21 ; h1 i1 vpermb m21, m7, m0 mova m1, m2 vpdpwssd m20, m17, m2 ; A1 B1 vpdpwssd m4, m14, m21 ; h2 i2 vpermb m21, m8, m0 mova m2, m3 vpdpwssd m20, m18, m3 ; A2 B2 vpdpwssd m4, m15, m21 ; h3 i3 vpermt2b m3, m9, m4 ; 67 78 vpdpwssd m20, m19, m3 ; A3 B3 psrad m20, 10 vextracti32x8 ym21, m20, 1 packusdw ym20, ym21 pminsw ym20, ym11 mova [dstq+dsq*0], xm20 vextracti128 
[dstq+dsq*1], ym20, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w8_loop vzeroupper RET .hv_w16: WIN64_SPILL_XMM 26 vbroadcasti32x8 m5, [srcq+ssq*0+ 8] vinserti32x8 m4, m5, [srcq+ssq*0+ 0], 0 vinserti32x8 m5, [srcq+ssq*0+16], 1 ; 0 movu ym6, [srcq+ssq*1+ 0] movu ym7, [srcq+ssq*1+16] vinserti32x8 m6, [srcq+ssq*2+ 0], 1 vinserti32x8 m7, [srcq+ssq*2+16], 1 ; 1 2 add srcq, r6 movu ym22, [srcq+ssq*0+ 0] movu ym23, [srcq+ssq*0+16] vinserti32x8 m22, [srcq+ssq*1+ 0], 1 vinserti32x8 m23, [srcq+ssq*1+16], 1 ; 3 4 movu ym24, [srcq+ssq*2+ 0] movu ym25, [srcq+ssq*2+16] add srcq, r6 vinserti32x8 m24, [srcq+ssq*0+ 0], 1 vinserti32x8 m25, [srcq+ssq*0+16], 1 ; 5 6 vbroadcasti32x4 m20, [spel_h_shufA] vbroadcasti32x4 m21, [spel_h_shufB] mova m9, [spel_shuf16] pshufb m0, m4, m20 mova m1, m10 vpdpwssd m1, m12, m0 ; a0 pshufb m0, m6, m20 mova m2, m10 vpdpwssd m2, m12, m0 ; b0 pshufb m0, m7, m20 mova m3, m10 vpdpwssd m3, m14, m0 ; c2 pshufb m0, m4, m21 vpdpwssd m1, m13, m0 ; a1 pshufb m0, m6, m21 vpdpwssd m2, m13, m0 ; b1 pshufb m0, m7, m21 vpdpwssd m3, m15, m0 ; c3 pshufb m0, m5, m20 vpdpwssd m1, m14, m0 ; a2 shufpd m6, m7, 0x55 pshufb m7, m6, m20 vpdpwssd m2, m14, m7 ; b2 vpdpwssd m3, m12, m7 ; c0 pshufb m5, m21 vpdpwssd m1, m15, m5 ; a3 pshufb m6, m21 vpdpwssd m2, m15, m6 ; b3 vpdpwssd m3, m13, m6 ; c1 pshufb m0, m22, m20 mova m4, m10 vpdpwssd m4, m12, m0 ; d0 pshufb m0, m23, m20 mova m5, m10 vpdpwssd m5, m14, m0 ; e2 pshufb m0, m24, m20 mova m6, m10 vpdpwssd m6, m12, m0 ; f0 pshufb m0, m25, m20 mova m7, m10 vpdpwssd m7, m14, m0 ; g2 pshufb m0, m22, m21 vpdpwssd m4, m13, m0 ; d1 pshufb m0, m23, m21 vpdpwssd m5, m15, m0 ; e3 pshufb m0, m24, m21 vpdpwssd m6, m13, m0 ; f1 pshufb m0, m25, m21 vpdpwssd m7, m15, m0 ; g3 shufpd m22, m23, 0x55 pshufb m23, m22, m20 vpdpwssd m4, m14, m23 ; d2 vpdpwssd m5, m12, m23 ; e0 shufpd m24, m25, 0x55 pshufb m25, m24, m20 vpdpwssd m6, m14, m25 ; f2 vpdpwssd m7, m12, m25 ; g0 pshufb m22, m21 vpdpwssd m4, m15, m22 ; d3 vpdpwssd m5, m13, m22 ; e1 pshufb m24, m21 vpdpwssd m6, m15, m24 ; f3 vpdpwssd m7, m13, m24 ; g1 pslldq m1, 1 vpermt2b m2, m9, m3 ; 12 vpermt2b m4, m9, m5 ; 34 vpermt2b m6, m9, m7 ; 56 vpshrdd m1, m2, 16 ; 01 vpshrdd m3, m2, m4, 16 ; 23 vpshrdd m5, m4, m6, 16 ; 45 .hv_w16_loop: movu ym24, [srcq+ssq*1+ 0] movu ym25, [srcq+ssq*1+16] lea srcq, [srcq+ssq*2] vinserti32x8 m24, [srcq+ssq*0+ 0], 1 vinserti32x8 m25, [srcq+ssq*0+16], 1 mova m7, m10 mova m8, m10 pshufb m0, m24, m20 vpdpwssd m7, m12, m0 ; h0 pshufb m0, m25, m20 vpdpwssd m8, m14, m0 ; i2 pmaddwd m22, m16, m1 ; A0 mova m1, m3 pmaddwd m23, m16, m2 ; B0 mova m2, m4 pshufb m0, m24, m21 vpdpwssd m7, m13, m0 ; h1 pshufb m0, m25, m21 vpdpwssd m8, m15, m0 ; i3 vpdpwssd m22, m17, m3 ; A1 mova m3, m5 vpdpwssd m23, m17, m4 ; B1 mova m4, m6 shufpd m24, m25, 0x55 pshufb m25, m24, m20 vpdpwssd m7, m14, m25 ; h2 vpdpwssd m8, m12, m25 ; i0 vpdpwssd m22, m18, m5 ; A2 vpdpwssd m23, m18, m6 ; B2 pshufb m24, m21 vpdpwssd m7, m15, m24 ; h3 vpdpwssd m8, m13, m24 ; i1 vpermt2b m7, m9, m8 ; 78 vpshrdd m5, m6, m7, 16 ; 67 vpdpwssd m22, m19, m5 ; A3 vpdpwssd m23, m19, m7 ; B3 mova m6, m7 psrad m22, 10 psrad m23, 10 vshufi32x4 m0, m22, m23, q3232 vinserti32x8 m22, ym23, 1 packusdw m22, m0 pminsw m22, m11 mova [dstq+dsq*0], ym22 vextracti32x8 [dstq+dsq*1], m22, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w16_loop RET .hv_w32: %assign stack_offset stack_offset - stack_size_padded WIN64_SPILL_XMM 32 vbroadcasti32x4 m20, [spel_h_shufA] vbroadcasti32x4 m21, [spel_h_shufB] mova m22, [spel_shuf32] lea wd, [hq+wq*8-256] mov r7, srcq mov r8, dstq 
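; w >= 32 is filtered in 32-pixel (64-byte) column strips: r7/r8 keep the strip base
; pointers, the block height lives in the low byte of wd and the remaining strip count
; in its upper bits (hq+wq*8-256); hd is restored from wb and wd decremented by 1<<8
; after each strip.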
.hv_w32_loop0: movu m6, [srcq+ssq*0+ 0] movu m7, [srcq+ssq*0+ 8] movu m8, [srcq+ssq*0+16] mova m0, m10 mova m23, m10 pshufb m9, m6, m20 vpdpwssd m0, m12, m9 ; a0l pshufb m9, m7, m20 vpdpwssd m23, m12, m9 ; a0h vpdpwssd m0, m14, m9 ; a2l pshufb m7, m21 vpdpwssd m23, m13, m7 ; a1h vpdpwssd m0, m15, m7 ; a3l pshufb m7, m8, m20 vpdpwssd m23, m14, m7 ; a2h pshufb m6, m21 vpdpwssd m0, m13, m6 ; a1l pshufb m8, m21 vpdpwssd m23, m15, m8 ; a3h %macro PUT_8TAP_HV_W32 5 ; dst_lo, dst_hi, stride_name, stride[1-2] movu m6, [srcq+%3*%4+ 0] movu m7, [srcq+%3*%4+ 8] movu m8, [srcq+%3*%4+16] %if %4 == 2 add srcq, r6 %endif movu m29, [srcq+%3*%5+ 0] movu m30, [srcq+%3*%5+ 8] movu m31, [srcq+%3*%5+16] %if %5 == 2 add srcq, r6 %endif mova m%1, m10 mova m9, m10 pshufb m%2, m6, m20 vpdpwssd m%1, m12, m%2 ; x0l pshufb m%2, m29, m20 vpdpwssd m9, m12, m%2 ; y0l pshufb m6, m21 vpdpwssd m%1, m13, m6 ; x1l pshufb m29, m21 vpdpwssd m9, m13, m29 ; y1l pshufb m6, m7, m20 mova m%2, m10 vpdpwssd m%2, m12, m6 ; x0h pshufb m29, m30, m20 vpdpwssd m%1, m14, m6 ; y2l mova m6, m10 vpdpwssd m6, m12, m29 ; x0h pshufb m7, m21 vpdpwssd m9, m14, m29 ; y2l pshufb m30, m21 vpdpwssd m%2, m13, m7 ; x1h vpdpwssd m%1, m15, m7 ; x3l pshufb m7, m8, m20 vpdpwssd m6, m13, m30 ; y1h vpdpwssd m9, m15, m30 ; y3l pshufb m30, m31, m20 vpdpwssd m%2, m14, m7 ; x2h pshufb m8, m21 vpdpwssd m6, m14, m30 ; y2h pshufb m31, m21 vpdpwssd m%2, m15, m8 ; x3h vpdpwssd m6, m15, m31 ; y3h %if %1 == 1 vpermt2b m0, m22, m%1 ; 01l vpermt2b m23, m22, m%2 ; 01h %endif vpermt2b m%1, m22, m9 ; xyl vpermt2b m%2, m22, m6 ; xyh %endmacro PUT_8TAP_HV_W32 1, 24, ssq, 1, 2 ; 12 PUT_8TAP_HV_W32 3, 26, ssq, 0, 1 ; 34 PUT_8TAP_HV_W32 5, 28, ssq, 2, 0 ; 56 vpshrdd m2, m1, m3, 16 ; 23l vpshrdd m25, m24, m26, 16 ; 23h vpshrdd m4, m3, m5, 16 ; 45l vpshrdd m27, m26, m28, 16 ; 45h .hv_w32_loop: movu m7, [srcq+ssq*1+ 0] movu m9, [srcq+ssq*2+ 0] movu m6, [srcq+ssq*1+ 8] movu m8, [srcq+ssq*2+ 8] mova m29, m10 mova m31, m10 pshufb m30, m7, m20 vpdpwssd m29, m12, m30 ; h0l pshufb m30, m9, m20 vpdpwssd m31, m12, m30 ; i0l pshufb m7, m21 vpdpwssd m29, m13, m7 ; h1l pshufb m9, m21 vpdpwssd m31, m13, m9 ; i1l pshufb m7, m6, m20 vpdpwssd m29, m14, m7 ; h2l pshufb m9, m8, m20 vpdpwssd m31, m14, m9 ; i2l pshufb m6, m21 vpdpwssd m29, m15, m6 ; h3l pshufb m8, m21 vpdpwssd m31, m15, m8 ; i3l mova m30, m10 vpdpwssd m30, m12, m7 ; h0h movu m7, [srcq+ssq*1+16] lea srcq, [srcq+ssq*2] vpermt2b m29, m22, m31 ; 78l mova m31, m10 vpdpwssd m31, m12, m9 ; i0h movu m9, [srcq+ssq*0+16] vpdpwssd m30, m13, m6 ; h1h pshufb m6, m7, m20 vpdpwssd m31, m13, m8 ; i1h pshufb m8, m9, m20 vpdpwssd m30, m14, m6 ; h2h pmaddwd m6, m16, m0 ; A0l pshufb m7, m21 vpdpwssd m31, m14, m8 ; i2h pmaddwd m8, m16, m23 ; A0h pshufb m9, m21 vpdpwssd m30, m15, m7 ; h3h pmaddwd m7, m16, m1 ; B0l vpdpwssd m31, m15, m9 ; i3h pmaddwd m9, m16, m24 ; B0h mova m0, m2 vpdpwssd m6, m17, m2 ; A1l mova m23, m25 vpdpwssd m8, m17, m25 ; A1h mova m1, m3 vpdpwssd m7, m17, m3 ; B1l mova m24, m26 vpdpwssd m9, m17, m26 ; B1h vpermt2b m30, m22, m31 ; 78h vpdpwssd m6, m18, m4 ; A2l mova m2, m4 vpdpwssd m8, m18, m27 ; A2h mova m25, m27 vpdpwssd m7, m18, m5 ; B2l mova m3, m5 vpdpwssd m9, m18, m28 ; B2h mova m26, m28 vpshrdd m4, m5, m29, 16 ; 67l vpdpwssd m6, m19, m4 ; A3l vpshrdd m27, m28, m30, 16 ; 67h vpdpwssd m8, m19, m27 ; A3h mova m5, m29 vpdpwssd m7, m19, m29 ; B3l mova m28, m30 vpdpwssd m9, m19, m30 ; B3h REPX {psrad x, 10}, m6, m8, m7, m9 packusdw m6, m8 packusdw m7, m9 pminsw m6, m11 pminsw m7, m11 mova [dstq+dsq*0], m6 mova [dstq+dsq*1], m7 lea 
dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w32_loop add r7, 64 add r8, 64 movzx hd, wb mov srcq, r7 mov dstq, r8 sub wd, 1<<8 jg .hv_w32_loop0 RET %if WIN64 DECLARE_REG_TMP 6, 4 %else DECLARE_REG_TMP 6, 7 %endif MC_8TAP_FN prep, sharp, SHARP, SHARP MC_8TAP_FN prep, sharp_smooth, SHARP, SMOOTH MC_8TAP_FN prep, smooth_sharp, SMOOTH, SHARP MC_8TAP_FN prep, smooth, SMOOTH, SMOOTH MC_8TAP_FN prep, sharp_regular, SHARP, REGULAR MC_8TAP_FN prep, regular_sharp, REGULAR, SHARP MC_8TAP_FN prep, smooth_regular, SMOOTH, REGULAR MC_8TAP_FN prep, regular_smooth, REGULAR, SMOOTH MC_8TAP_FN prep, regular, REGULAR, REGULAR cglobal prep_8tap_16bpc, 3, 8, 16, tmp, src, stride, w, h, mx, my, stride3 %define base r7-prep_avx512icl imul mxd, mxm, 0x010101 add mxd, t0d ; 8tap_h, mx, 4tap_h imul myd, mym, 0x010101 add myd, t1d ; 8tap_v, my, 4tap_v lea r7, [prep_avx512icl] mov wd, wm movifnidn hd, hm test mxd, 0xf00 jnz .h test myd, 0xf00 jnz .v tzcnt wd, wd mov r5d, r7m ; bitdepth_max vpbroadcastd m5, [pw_8192] movzx wd, word [r7+wq*2+table_offset(prep,)] shr r5d, 11 vpbroadcastd m4, [r7-prep_avx512icl+prep_mul+r5*4] add wq, r7 lea r6, [strideq*3] %if WIN64 pop r7 %endif jmp wq .h_w4: movzx mxd, mxb sub srcq, 2 pmovsxbw xmm0, [base+subpel_filters+mxq*8] mov r5d, r7m vbroadcasti32x4 m4, [spel_h_shufA] vbroadcasti32x4 m5, [spel_h_shufB] shr r5d, 11 mova ym9, [prep_endA] psllw xmm0, [base+prep_hv_shift+r5*8] mova [tmpq], xmm0 vpbroadcastd m6, [tmpq+4] vpbroadcastd m7, [tmpq+8] .h_w4_loop: movu xm2, [srcq+strideq*0] vinserti32x4 ym2, [srcq+strideq*1], 1 vinserti32x4 m2, [srcq+strideq*2], 2 vinserti32x4 m2, [srcq+r6 ], 3 lea srcq, [srcq+strideq*4] mova m0, m10 pshufb m1, m2, m4 vpdpwssd m0, m6, m1 pshufb m2, m5 vpdpwssd m0, m7, m2 vpermb m0, m9, m0 mova [tmpq], ym0 add tmpq, 32 sub hd, 4 jg .h_w4_loop RET .h: test myd, 0xf00 jnz .hv vpbroadcastd m10, [prep_8tap_rnd] lea r6, [strideq*3] cmp wd, 4 je .h_w4 shr mxd, 16 pmovsxbw xmm0, [base+subpel_filters+mxq*8] mov r5d, r7m sub srcq, 6 shr r5d, 11 psllw xmm0, [base+prep_hv_shift+r5*8] mova [tmpq], xmm0 vpbroadcastd m12, xmm0 vpbroadcastd m13, [tmpq+ 4] vpbroadcastd m14, [tmpq+ 8] vpbroadcastd m15, [tmpq+12] cmp wd, 16 je .h_w16 jg .h_w32 .h_w8: mova m6, [spel_h_shufA] movu m7, [spel_h_shufB] movu m8, [spel_h_shufC] mova m9, [spel_h_shufD] mova m11, [prep_endB] .h_w8_loop: movu ym4, [srcq+strideq*0] vinserti32x8 m4, [srcq+strideq*1], 1 movu ym5, [srcq+strideq*2] vinserti32x8 m5, [srcq+r6 ], 1 lea srcq, [srcq+strideq*4] mova m0, m10 mova m1, m10 vpermb m2, m6, m4 vpermb m3, m6, m5 vpdpwssd m0, m12, m2 vpdpwssd m1, m12, m3 vpermb m2, m7, m4 vpermb m3, m7, m5 vpdpwssd m0, m13, m2 vpdpwssd m1, m13, m3 vpermb m2, m8, m4 vpermb m3, m8, m5 vpdpwssd m0, m14, m2 vpdpwssd m1, m14, m3 vpermb m2, m9, m4 vpermb m3, m9, m5 vpdpwssd m0, m15, m2 vpdpwssd m1, m15, m3 vpermt2b m0, m11, m1 mova [tmpq], m0 add tmpq, 64 sub hd, 4 jg .h_w8_loop RET .h_w16: vbroadcasti32x4 m6, [spel_h_shufA] vbroadcasti32x4 m7, [spel_h_shufB] mova m11, [prep_endC] .h_w16_loop: movu ym2, [srcq+strideq*0+ 0] vinserti32x8 m2, [srcq+strideq*1+ 0], 1 movu ym3, [srcq+strideq*0+16] vinserti32x8 m3, [srcq+strideq*1+16], 1 lea srcq, [srcq+strideq*2] mova m0, m10 mova m1, m10 pshufb m4, m2, m6 vpdpwssd m0, m12, m4 ; a0 pshufb m4, m3, m6 vpdpwssd m1, m14, m4 ; b2 pshufb m4, m2, m7 vpdpwssd m0, m13, m4 ; a1 pshufb m4, m3, m7 vpdpwssd m1, m15, m4 ; b3 shufpd m2, m3, 0x55 pshufb m4, m2, m6 vpdpwssd m0, m14, m4 ; a2 vpdpwssd m1, m12, m4 ; b0 pshufb m2, m7 vpdpwssd m0, m15, m2 ; a3 vpdpwssd m1, m13, m2 ; b1 vpermt2b m0, m11, m1 mova 
[tmpq], m0 add tmpq, 64 sub hd, 2 jg .h_w16_loop RET .h_w32: vbroadcasti32x4 m6, [spel_h_shufA] lea srcq, [srcq+wq*2] vbroadcasti32x4 m7, [spel_h_shufB] neg wq mova m11, [prep_endC] .h_w32_loop0: mov r6, wq .h_w32_loop: movu m2, [srcq+r6*2+ 0] movu m3, [srcq+r6*2+ 8] mova m0, m10 mova m1, m10 pshufb m4, m2, m6 vpdpwssd m0, m12, m4 ; a0 pshufb m4, m3, m6 vpdpwssd m1, m12, m4 ; b0 vpdpwssd m0, m14, m4 ; a2 movu m4, [srcq+r6*2+16] pshufb m3, m7 vpdpwssd m1, m13, m3 ; b1 vpdpwssd m0, m15, m3 ; a3 pshufb m3, m4, m6 vpdpwssd m1, m14, m3 ; b2 pshufb m2, m7 vpdpwssd m0, m13, m2 ; a1 pshufb m4, m7 vpdpwssd m1, m15, m4 ; b3 vpermt2b m0, m11, m1 mova [tmpq], m0 add tmpq, 64 add r6, 32 jl .h_w32_loop add srcq, strideq dec hd jg .h_w32_loop0 RET .v: movzx mxd, myb shr myd, 16 cmp hd, 4 cmove myd, mxd mov r5d, r7m vpbroadcastd m10, [prep_8tap_rnd] pmovsxbw xmm0, [base+subpel_filters+myq*8] tzcnt r6d, wd shr r5d, 11 movzx r6d, word [r7+r6*2+table_offset(prep, _8tap_v)] psllw xmm0, [base+prep_hv_shift+r5*8] add r7, r6 lea r6, [strideq*3] sub srcq, r6 mova [tmpq], xmm0 vpbroadcastd m12, xmm0 vpbroadcastd m13, [tmpq+ 4] vpbroadcastd m14, [tmpq+ 8] vpbroadcastd m15, [tmpq+12] jmp r7 .v_w4: movq xmm1, [srcq+strideq*0] vpbroadcastq ymm0, [srcq+strideq*1] vpbroadcastq ymm2, [srcq+strideq*2] add srcq, r6 vpbroadcastq ymm4, [srcq+strideq*0] vpbroadcastq ymm3, [srcq+strideq*1] vpbroadcastq ymm5, [srcq+strideq*2] mova xm11, [prep_endA] add srcq, r6 vpblendd ymm1, ymm0, 0x30 vpblendd ymm0, ymm2, 0x30 punpcklwd ymm1, ymm0 ; 01 12 vpbroadcastq ymm0, [srcq+strideq*0] vpblendd ymm2, ymm4, 0x30 vpblendd ymm4, ymm3, 0x30 punpcklwd ymm2, ymm4 ; 23 34 vpblendd ymm3, ymm5, 0x30 vpblendd ymm5, ymm0, 0x30 punpcklwd ymm3, ymm5 ; 45 56 .v_w4_loop: vpbroadcastq ymm5, [srcq+strideq*1] lea srcq, [srcq+strideq*2] mova ymm4, ym10 vpdpwssd ymm4, ym12, ymm1 ; a0 b0 mova ymm1, ymm2 vpdpwssd ymm4, ym13, ymm2 ; a1 b1 mova ymm2, ymm3 vpdpwssd ymm4, ym14, ymm3 ; a2 b2 vpblendd ymm3, ymm0, ymm5, 0x30 vpbroadcastq ymm0, [srcq+strideq*0] vpblendd ymm5, ymm0, 0x30 punpcklwd ymm3, ymm5 ; 67 78 vpdpwssd ymm4, ym15, ymm3 ; a3 b3 vpermb ymm4, ym11, ymm4 mova [tmpq], xmm4 add tmpq, 16 sub hd, 2 jg .v_w4_loop vzeroupper RET .v_w8: vbroadcasti32x4 m2, [srcq+strideq*2] vinserti32x4 m1, m2, [srcq+strideq*0], 0 vinserti32x4 m1, [srcq+strideq*1], 1 ; 0 1 2 add srcq, r6 vinserti32x4 ym2, [srcq+strideq*0], 1 vinserti32x4 m2, [srcq+strideq*1], 2 ; 2 3 4 mova m6, [spel_v_shuf8] movu xm0, [srcq+strideq*1] vinserti32x4 ym0, [srcq+strideq*2], 1 add srcq, r6 vinserti32x4 m0, [srcq+strideq*0], 2 ; 4 5 6 mova ym11, [prep_endB] vpermb m1, m6, m1 ; 01 12 vpermb m2, m6, m2 ; 23 34 vpermb m3, m6, m0 ; 45 56 .v_w8_loop: vinserti32x4 m0, [srcq+strideq*1], 3 lea srcq, [srcq+strideq*2] movu xm5, [srcq+strideq*0] mova m4, m10 vpdpwssd m4, m12, m1 ; a0 b0 mova m1, m2 vshufi32x4 m0, m5, q1032 ; 6 7 8 vpdpwssd m4, m13, m2 ; a1 b1 mova m2, m3 vpdpwssd m4, m14, m3 ; a2 b2 vpermb m3, m6, m0 ; 67 78 vpdpwssd m4, m15, m3 ; a3 b3 vpermb m4, m11, m4 mova [tmpq], ym4 add tmpq, 32 sub hd, 2 jg .v_w8_loop RET .v_w16: vbroadcasti32x8 m1, [srcq+strideq*1] vinserti32x8 m0, m1, [srcq+strideq*0], 0 vinserti32x8 m1, [srcq+strideq*2], 1 mova m8, [spel_v_shuf16] add srcq, r6 movu ym3, [srcq+strideq*0] vinserti32x8 m3, [srcq+strideq*1], 1 movu ym5, [srcq+strideq*2] add srcq, r6 vinserti32x8 m5, [srcq+strideq*0], 1 mova m11, [prep_endA] vpermb m0, m8, m0 ; 01 vpermb m1, m8, m1 ; 12 vpermb m3, m8, m3 ; 34 vpermb m5, m8, m5 ; 56 vpshrdd m2, m1, m3, 16 ; 23 vpshrdd m4, m3, m5, 16 ; 45 .v_w16_loop: 
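; two output rows per iteration: row a accumulates the 01/23/45/67 row pairs and row b
; the 12/34/56/78 pairs, each vpdpwssd folding one pair of taps (m12-m15) into
; accumulators seeded with the rounding bias in m10; the prep path then permutes the
; results through m11 (prep_endA) instead of clamping to pixel_max.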
mova m6, m10 mova m7, m10 vpdpwssd m6, m12, m0 ; a0 mova m0, m2 vpdpwssd m7, m12, m1 ; b0 mova m1, m3 vpdpwssd m6, m13, m2 ; a1 mova m2, m4 vpdpwssd m7, m13, m3 ; b1 mova m3, m5 vpdpwssd m6, m14, m4 ; a2 mova m4, m5 vpdpwssd m7, m14, m5 ; b2 movu ym5, [srcq+strideq*1] lea srcq, [srcq+strideq*2] vinserti32x8 m5, [srcq+strideq*0], 1 vpermb m5, m8, m5 ; 78 vpshrdd m4, m5, 16 ; 67 vpdpwssd m6, m15, m4 ; a3 vpdpwssd m7, m15, m5 ; b3 vpermt2b m6, m11, m7 mova [tmpq], m6 add tmpq, 64 sub hd, 2 jg .v_w16_loop RET .v_w32: .v_w64: .v_w128: %if WIN64 PUSH r8 movaps [rsp+stack_offset+8], xmm6 %endif lea r5, [hq+wq*8-256] mov r7, srcq mov r8, tmpq .v_w32_loop0: movu m16, [srcq+strideq*0] movu m17, [srcq+strideq*1] movu m18, [srcq+strideq*2] add srcq, r6 movu m19, [srcq+strideq*0] movu m20, [srcq+strideq*1] movu m21, [srcq+strideq*2] add srcq, r6 movu m22, [srcq+strideq*0] mova m11, [prep_endC] punpcklwd m0, m16, m17 ; 01l punpckhwd m16, m17 ; 01h punpcklwd m1, m17, m18 ; 12l punpckhwd m17, m18 ; 12h punpcklwd m2, m18, m19 ; 23l punpckhwd m18, m19 ; 23h punpcklwd m3, m19, m20 ; 34l punpckhwd m19, m20 ; 34h punpcklwd m4, m20, m21 ; 45l punpckhwd m20, m21 ; 45h punpcklwd m5, m21, m22 ; 56l punpckhwd m21, m22 ; 56h .v_w32_loop: mova m6, m10 vpdpwssd m6, m12, m0 ; a0l mova m8, m10 vpdpwssd m8, m12, m16 ; a0h mova m7, m10 vpdpwssd m7, m12, m1 ; b0l mova m9, m10 vpdpwssd m9, m12, m17 ; b0h mova m0, m2 vpdpwssd m6, m13, m2 ; a1l mova m16, m18 vpdpwssd m8, m13, m18 ; a1h mova m1, m3 vpdpwssd m7, m13, m3 ; b1l mova m17, m19 vpdpwssd m9, m13, m19 ; b1h mova m2, m4 vpdpwssd m6, m14, m4 ; a2l mova m18, m20 vpdpwssd m8, m14, m20 ; a2h mova m3, m5 vpdpwssd m7, m14, m5 ; b2l mova m19, m21 vpdpwssd m9, m14, m21 ; b2h movu m21, [srcq+strideq*1] lea srcq, [srcq+strideq*2] punpcklwd m4, m22, m21 ; 67l punpckhwd m20, m22, m21 ; 67h movu m22, [srcq+strideq*0] vpdpwssd m6, m15, m4 ; a3l vpdpwssd m8, m15, m20 ; a3h punpcklwd m5, m21, m22 ; 78l punpckhwd m21, m22 ; 78h vpdpwssd m7, m15, m5 ; b3l vpdpwssd m9, m15, m21 ; b3h vpermt2b m6, m11, m8 vpermt2b m7, m11, m9 mova [tmpq+wq*0], m6 mova [tmpq+wq*2], m7 lea tmpq, [tmpq+wq*4] sub hd, 2 jg .v_w32_loop add r7, 64 add r8, 64 movzx hd, r5b mov srcq, r7 mov tmpq, r8 sub r5d, 1<<8 jg .v_w32_loop0 %if WIN64 movaps xmm6, [rsp+stack_offset+8] POP r8 %endif vzeroupper RET .hv: cmp wd, 4 jg .hv_w8 movzx mxd, mxb pmovsxbw xmm0, [base+subpel_filters+mxq*8] movzx mxd, myb shr myd, 16 cmp hd, 4 cmove myd, mxd mov r5d, r7m pmovsxbw xmm1, [base+subpel_filters+myq*8] lea r6, [strideq*3] sub srcq, 2 shr r5d, 11 sub srcq, r6 psllw xmm0, [base+prep_hv_shift+r5*8] psllw xmm1, 2 vpbroadcastd m10, [prep_8tap_rnd] vpbroadcastd ym11, [pd_128] mova xm21, [prep_endA] mova [tmpq+ 0], xmm0 mova [tmpq+16], xmm1 vpbroadcastd m8, [tmpq+ 4] vpbroadcastd m9, [tmpq+ 8] vpbroadcastd ym12, xmm1 vpbroadcastd ym13, [tmpq+20] vpbroadcastd ym14, [tmpq+24] vpbroadcastd ym15, [tmpq+28] movu xm4, [srcq+strideq*0] vinserti32x4 ym4, [srcq+strideq*1], 1 vinserti32x4 m4, [srcq+strideq*2], 2 add srcq, r6 vinserti32x4 m4, [srcq+strideq*0], 3 ; 0 1 2 3 movu xm0, [srcq+strideq*1] vinserti32x4 ym0, [srcq+strideq*2], 1 add srcq, r6 vinserti32x4 m0, [srcq+strideq*0], 2 ; 4 5 6 vbroadcasti32x4 m19, [spel_h_shufA] vbroadcasti32x4 m20, [spel_h_shufB] mova ym6, [spel_shuf4a] mova ym7, [spel_shuf4b] mova m2, m10 mova m3, m10 pshufb m1, m4, m19 vpdpwssd m2, m8, m1 pshufb m1, m0, m19 vpdpwssd m3, m8, m1 pshufb m4, m20 vpdpwssd m2, m9, m4 pshufb m0, m20 vpdpwssd m3, m9, m0 vpermb m1, m6, m2 ; 01 12 vshufi32x4 m2, m3, q1032 vpermb m3, m6, 
m3 ; 45 56 vpermb m2, m6, m2 ; 23 34 .hv_w4_loop: movu xm18, [srcq+strideq*1] lea srcq, [srcq+strideq*2] vinserti128 ym18, [srcq+strideq*0], 1 mova ym16, ym11 mova ym4, ym10 pshufb ym17, ym18, ym19 vpdpwssd ym16, ym12, ym1 ; a0 b0 vpdpwssd ym4, ym8, ym17 pshufb ym18, ym20 mova ym1, ym2 vpdpwssd ym16, ym13, ym2 ; a1 b1 vpdpwssd ym4, ym9, ym18 ; 7 8 mova ym2, ym3 vpdpwssd ym16, ym14, ym3 ; a2 b2 vpermt2b ym3, ym7, ym4 ; 67 78 vpdpwssd ym16, ym15, ym3 ; a3 b3 vpermb ym16, ym21, ym16 mova [tmpq], xm16 add tmpq, 16 sub hd, 2 jg .hv_w4_loop vzeroupper RET .hv_w8: shr mxd, 16 pmovsxbw xmm0, [base+subpel_filters+mxq*8] movzx mxd, myb shr myd, 16 cmp hd, 6 cmovs myd, mxd mov r5d, r7m pmovsxbw xmm1, [base+subpel_filters+myq*8] lea r6, [strideq*3] sub srcq, 6 shr r5d, 11 sub srcq, r6 vpbroadcastd m10, [prep_8tap_rnd] vpbroadcastd m11, [pd_128] psllw xmm0, [base+prep_hv_shift+r5*8] psllw xmm1, 2 mova [tmpq+ 0], xmm0 mova [tmpq+16], xmm1 vpbroadcastd m12, xmm0 vpbroadcastd m13, [tmpq+ 4] vpbroadcastd m14, [tmpq+ 8] vpbroadcastd m15, [tmpq+12] vpbroadcastd m16, xmm1 vpbroadcastd m17, [tmpq+20] vpbroadcastd m18, [tmpq+24] vpbroadcastd m19, [tmpq+28] cmp wd, 16 je .hv_w16 jg .hv_w32 WIN64_SPILL_XMM 23 mova m5, [spel_h_shufA] movu ym0, [srcq+strideq*0] vinserti32x8 m0, [srcq+strideq*1], 1 ; 0 1 movu ym9, [srcq+strideq*2] add srcq, r6 vinserti32x8 m9, [srcq+strideq*0], 1 ; 2 3 movu ym20, [srcq+strideq*1] vinserti32x8 m20, [srcq+strideq*2], 1 ; 4 5 add srcq, r6 movu ym21, [srcq+strideq*0] ; 6 movu m6, [spel_h_shufB] movu m7, [spel_h_shufC] mova ym22, [prep_endB] vpermb m8, m5, m0 mova m1, m10 vpdpwssd m1, m12, m8 ; a0 b0 vpermb m8, m5, m9 mova m2, m10 vpdpwssd m2, m12, m8 ; c0 d0 vpermb m8, m5, m20 mova m3, m10 vpdpwssd m3, m12, m8 ; e0 f0 vpermb m8, m5, m21 mova m4, m10 vpdpwssd m4, m12, m8 ; g0 vpermb m8, m6, m0 vpdpwssd m1, m13, m8 ; a1 b1 vpermb m8, m6, m9 vpdpwssd m2, m13, m8 ; c1 d1 vpermb m8, m6, m20 vpdpwssd m3, m13, m8 ; e1 f1 vpermb m8, m6, m21 vpdpwssd m4, m13, m8 ; g1 vpermb m8, m7, m0 vpdpwssd m1, m14, m8 ; a2 b2 vpermb m8, m7, m9 vpdpwssd m2, m14, m8 ; c2 d2 vpermb m8, m7, m20 vpdpwssd m3, m14, m8 ; e2 f2 vpermb m8, m7, m21 vpdpwssd m4, m14, m8 ; g2 mova m8, [spel_h_shufD] vpermb m0, m8, m0 vpdpwssd m1, m15, m0 ; a3 b3 mova m0, [spel_shuf8a] vpermb m9, m8, m9 vpdpwssd m2, m15, m9 ; c3 d3 mova m9, [spel_shuf8b] vpermb m20, m8, m20 vpdpwssd m3, m15, m20 ; e3 f3 vpermb m21, m8, m21 vpdpwssd m4, m15, m21 ; g3 vpermt2b m1, m0, m2 ; 01 12 vpermt2b m2, m0, m3 ; 23 34 vpermt2b m3, m0, m4 ; 45 56 .hv_w8_loop: movu ym0, [srcq+strideq*1] lea srcq, [srcq+strideq*2] vinserti32x8 m0, [srcq+strideq*0], 1 mova m4, m10 mova m20, m11 vpermb m21, m5, m0 vpdpwssd m4, m12, m21 ; h0 i0 vpermb m21, m6, m0 vpdpwssd m20, m16, m1 ; A0 B0 vpdpwssd m4, m13, m21 ; h1 i1 vpermb m21, m7, m0 mova m1, m2 vpdpwssd m20, m17, m2 ; A1 B1 vpdpwssd m4, m14, m21 ; h2 i2 vpermb m21, m8, m0 mova m2, m3 vpdpwssd m20, m18, m3 ; A2 B2 vpdpwssd m4, m15, m21 ; h3 i3 vpermt2b m3, m9, m4 ; 67 78 vpdpwssd m20, m19, m3 ; A3 B3 vpermb m20, m22, m20 mova [tmpq], ym20 add tmpq, 32 sub hd, 2 jg .hv_w8_loop RET .hv_w16: %assign stack_offset stack_offset - stack_size_padded WIN64_SPILL_XMM 27 vbroadcasti32x8 m5, [srcq+strideq*0+ 8] vinserti32x8 m4, m5, [srcq+strideq*0+ 0], 0 vinserti32x8 m5, [srcq+strideq*0+16], 1 ; 0 movu ym6, [srcq+strideq*1+ 0] movu ym7, [srcq+strideq*1+16] vinserti32x8 m6, [srcq+strideq*2+ 0], 1 vinserti32x8 m7, [srcq+strideq*2+16], 1 ; 1 2 add srcq, r6 movu ym22, [srcq+strideq*0+ 0] movu ym23, [srcq+strideq*0+16] vinserti32x8 
m22, [srcq+strideq*1+ 0], 1 vinserti32x8 m23, [srcq+strideq*1+16], 1 ; 3 4 movu ym24, [srcq+strideq*2+ 0] movu ym25, [srcq+strideq*2+16] add srcq, r6 vinserti32x8 m24, [srcq+strideq*0+ 0], 1 vinserti32x8 m25, [srcq+strideq*0+16], 1 ; 5 6 vbroadcasti32x4 m20, [spel_h_shufA] vbroadcasti32x4 m21, [spel_h_shufB] mova m9, [spel_shuf16] mova m26, [prep_endB] pshufb m0, m4, m20 mova m1, m10 vpdpwssd m1, m12, m0 ; a0 pshufb m0, m6, m20 mova m2, m10 vpdpwssd m2, m12, m0 ; b0 pshufb m0, m7, m20 mova m3, m10 vpdpwssd m3, m14, m0 ; c2 pshufb m0, m4, m21 vpdpwssd m1, m13, m0 ; a1 pshufb m0, m6, m21 vpdpwssd m2, m13, m0 ; b1 pshufb m0, m7, m21 vpdpwssd m3, m15, m0 ; c3 pshufb m0, m5, m20 vpdpwssd m1, m14, m0 ; a2 shufpd m6, m7, 0x55 pshufb m7, m6, m20 vpdpwssd m2, m14, m7 ; b2 vpdpwssd m3, m12, m7 ; c0 pshufb m5, m21 vpdpwssd m1, m15, m5 ; a3 pshufb m6, m21 vpdpwssd m2, m15, m6 ; b3 vpdpwssd m3, m13, m6 ; c1 pshufb m0, m22, m20 mova m4, m10 vpdpwssd m4, m12, m0 ; d0 pshufb m0, m23, m20 mova m5, m10 vpdpwssd m5, m14, m0 ; e2 pshufb m0, m24, m20 mova m6, m10 vpdpwssd m6, m12, m0 ; f0 pshufb m0, m25, m20 mova m7, m10 vpdpwssd m7, m14, m0 ; g2 pshufb m0, m22, m21 vpdpwssd m4, m13, m0 ; d1 pshufb m0, m23, m21 vpdpwssd m5, m15, m0 ; e3 pshufb m0, m24, m21 vpdpwssd m6, m13, m0 ; f1 pshufb m0, m25, m21 vpdpwssd m7, m15, m0 ; g3 shufpd m22, m23, 0x55 pshufb m23, m22, m20 vpdpwssd m4, m14, m23 ; d2 vpdpwssd m5, m12, m23 ; e0 shufpd m24, m25, 0x55 pshufb m25, m24, m20 vpdpwssd m6, m14, m25 ; f2 vpdpwssd m7, m12, m25 ; g0 pshufb m22, m21 vpdpwssd m4, m15, m22 ; d3 vpdpwssd m5, m13, m22 ; e1 pshufb m24, m21 vpdpwssd m6, m15, m24 ; f3 vpdpwssd m7, m13, m24 ; g1 pslldq m1, 1 vpermt2b m2, m9, m3 ; 12 vpermt2b m4, m9, m5 ; 34 vpermt2b m6, m9, m7 ; 56 vpshrdd m1, m2, 16 ; 01 vpshrdd m3, m2, m4, 16 ; 23 vpshrdd m5, m4, m6, 16 ; 45 .hv_w16_loop: movu ym24, [srcq+strideq*1+ 0] movu ym25, [srcq+strideq*1+16] lea srcq, [srcq+strideq*2] vinserti32x8 m24, [srcq+strideq*0+ 0], 1 vinserti32x8 m25, [srcq+strideq*0+16], 1 mova m7, m10 mova m8, m10 pshufb m0, m24, m20 vpdpwssd m7, m12, m0 ; h0 mova m22, m11 pshufb m0, m25, m20 vpdpwssd m8, m14, m0 ; i2 mova m23, m11 vpdpwssd m22, m16, m1 ; A0 mova m1, m3 vpdpwssd m23, m16, m2 ; B0 mova m2, m4 pshufb m0, m24, m21 vpdpwssd m7, m13, m0 ; h1 pshufb m0, m25, m21 vpdpwssd m8, m15, m0 ; i3 vpdpwssd m22, m17, m3 ; A1 mova m3, m5 vpdpwssd m23, m17, m4 ; B1 mova m4, m6 shufpd m24, m25, 0x55 pshufb m25, m24, m20 vpdpwssd m7, m14, m25 ; h2 vpdpwssd m8, m12, m25 ; i0 vpdpwssd m22, m18, m5 ; A2 vpdpwssd m23, m18, m6 ; B2 pshufb m24, m21 vpdpwssd m7, m15, m24 ; h3 vpdpwssd m8, m13, m24 ; i1 vpermt2b m7, m9, m8 ; 78 vpshrdd m5, m6, m7, 16 ; 67 vpdpwssd m22, m19, m5 ; A3 vpdpwssd m23, m19, m7 ; B3 mova m6, m7 vpermt2b m22, m26, m23 mova [tmpq], m22 add tmpq, 64 sub hd, 2 jg .hv_w16_loop RET .hv_w32: %if WIN64 %assign stack_offset stack_offset - stack_size_padded PUSH r8 %assign regs_used regs_used + 1 WIN64_SPILL_XMM 32 %endif vbroadcasti32x4 m20, [spel_h_shufA] vbroadcasti32x4 m21, [spel_h_shufB] mova m22, [spel_shuf32] lea r5d, [hq+wq*8-256] mov r7, srcq mov r8, tmpq .hv_w32_loop0: movu m6, [srcq+strideq*0+ 0] movu m7, [srcq+strideq*0+ 8] movu m8, [srcq+strideq*0+16] mova m0, m10 mova m23, m10 pshufb m9, m6, m20 vpdpwssd m0, m12, m9 ; a0l pshufb m9, m7, m20 vpdpwssd m23, m12, m9 ; a0h vpdpwssd m0, m14, m9 ; a2l pshufb m7, m21 vpdpwssd m23, m13, m7 ; a1h vpdpwssd m0, m15, m7 ; a3l pshufb m7, m8, m20 vpdpwssd m23, m14, m7 ; a2h pshufb m6, m21 vpdpwssd m0, m13, m6 ; a1l pshufb m8, m21 vpdpwssd m23, 
m15, m8 ; a3h PUT_8TAP_HV_W32 1, 24, strideq, 1, 2 ; 12 PUT_8TAP_HV_W32 3, 26, strideq, 0, 1 ; 34 PUT_8TAP_HV_W32 5, 28, strideq, 2, 0 ; 56 vpshrdd m2, m1, m3, 16 ; 23l vpshrdd m25, m24, m26, 16 ; 23h vpshrdd m4, m3, m5, 16 ; 45l vpshrdd m27, m26, m28, 16 ; 45h .hv_w32_loop: movu m7, [srcq+strideq*1+ 0] movu m9, [srcq+strideq*2+ 0] movu m6, [srcq+strideq*1+ 8] movu m8, [srcq+strideq*2+ 8] mova m29, m10 mova m31, m10 pshufb m30, m7, m20 vpdpwssd m29, m12, m30 ; h0l pshufb m30, m9, m20 vpdpwssd m31, m12, m30 ; i0l pshufb m7, m21 vpdpwssd m29, m13, m7 ; h1l pshufb m9, m21 vpdpwssd m31, m13, m9 ; i1l pshufb m7, m6, m20 vpdpwssd m29, m14, m7 ; h2l pshufb m9, m8, m20 vpdpwssd m31, m14, m9 ; i2l pshufb m6, m21 vpdpwssd m29, m15, m6 ; h3l pshufb m8, m21 vpdpwssd m31, m15, m8 ; i3l mova m30, m10 vpdpwssd m30, m12, m7 ; h0h movu m7, [srcq+strideq*1+16] lea srcq, [srcq+strideq*2] vpermt2b m29, m22, m31 ; 78l mova m31, m10 vpdpwssd m31, m12, m9 ; i0h movu m9, [srcq+strideq*0+16] vpdpwssd m30, m13, m6 ; h1h pshufb m6, m7, m20 vpdpwssd m31, m13, m8 ; i1h pshufb m8, m9, m20 vpdpwssd m30, m14, m6 ; h2h mova m6, m11 vpdpwssd m6, m16, m0 ; A0l pshufb m7, m21 vpdpwssd m31, m14, m8 ; i2h mova m8, m11 vpdpwssd m8, m16, m23 ; A0h pshufb m9, m21 vpdpwssd m30, m15, m7 ; h3h mova m7, m11 vpdpwssd m7, m16, m1 ; B0l vpdpwssd m31, m15, m9 ; i3h mova m9, m11 vpdpwssd m9, m16, m24 ; B0h mova m0, m2 vpdpwssd m6, m17, m2 ; A1l mova m23, m25 vpdpwssd m8, m17, m25 ; A1h mova m1, m3 vpdpwssd m7, m17, m3 ; B1l mova m24, m26 vpdpwssd m9, m17, m26 ; B1h vpermt2b m30, m22, m31 ; 78h mova m31, [prep_endC] vpdpwssd m6, m18, m4 ; A2l mova m2, m4 vpdpwssd m8, m18, m27 ; A2h mova m25, m27 vpdpwssd m7, m18, m5 ; B2l mova m3, m5 vpdpwssd m9, m18, m28 ; B2h mova m26, m28 vpshrdd m4, m5, m29, 16 ; 67l vpdpwssd m6, m19, m4 ; A3l vpshrdd m27, m28, m30, 16 ; 67h vpdpwssd m8, m19, m27 ; A3h mova m5, m29 vpdpwssd m7, m19, m29 ; B3l mova m28, m30 vpdpwssd m9, m19, m30 ; B3h vpermt2b m6, m31, m8 vpermt2b m7, m31, m9 mova [tmpq+wq*0], m6 mova [tmpq+wq*2], m7 lea tmpq, [tmpq+wq*4] sub hd, 2 jg .hv_w32_loop add r7, 64 add r8, 64 movzx hd, r5b mov srcq, r7 mov tmpq, r8 sub r5d, 1<<8 jg .hv_w32_loop0 RET %if WIN64 DECLARE_REG_TMP 5 %else DECLARE_REG_TMP 7 %endif cglobal warp_affine_8x8t_16bpc, 4, 7, 22, tmp, ts %define base r6-pd_0to7 mov t0d, r7m lea r6, [pd_0to7] shr t0d, 11 vpbroadcastd m8, [base+warp_8x8t_rnd_v] vpbroadcastd m1, [base+warp_8x8_rnd_h+t0*4] call mangle(private_prefix %+ _warp_affine_8x8_16bpc_avx512icl).main psrad m14, m16, 15 call mangle(private_prefix %+ _warp_affine_8x8_16bpc_avx512icl).main2 psrad m16, 15 packssdw m14, m16 call mangle(private_prefix %+ _warp_affine_8x8_16bpc_avx512icl).main2 psrad m15, m16, 15 call mangle(private_prefix %+ _warp_affine_8x8_16bpc_avx512icl).main2 add tsq, tsq psrad m16, 15 packssdw m15, m16 jmp mangle(private_prefix %+ _warp_affine_8x8_16bpc_avx512icl).end cglobal warp_affine_8x8_16bpc, 4, 7, 22, dst, ds, src, ss, abcd mov t0d, r7m ; pixel_max lea r6, [pd_0to7] shr t0d, 11 vpbroadcastd m1, [base+warp_8x8_rnd_h+t0*4] vpbroadcastd m8, [base+warp_8x8_rnd_v+t0*4] call .main psrad m14, m16, 13 call .main2 psrad m16, 13 packusdw m14, m16 call .main2 psrad m15, m16, 13 call .main2 vpbroadcastd m0, [base+bidir_shift+t0*4] vpsrlvw m14, m0 psrad m16, 13 packusdw m15, m16 vpsrlvw m15, m0 .end: mova m0, [base+warp8x8_end] vpermb m16, m0, m14 lea r2, [dsq*3] mova [dstq+dsq*0], xm16 vextracti128 [dstq+dsq*1], ym16, 1 vextracti32x4 [dstq+dsq*2], m16, 2 vextracti32x4 [dstq+r2 ], m16, 3 vpermb m16, m0, m15 lea 
dstq, [dstq+dsq*4] mova [dstq+dsq*0], xm16 vextracti128 [dstq+dsq*1], ym16, 1 vextracti32x4 [dstq+dsq*2], m16, 2 vextracti32x4 [dstq+r2 ], m16, 3 RET .main: vpbroadcastd ym3, [base+pd_512] %if WIN64 mov abcdq, r5mp vpaddd ym18, ym3, r6m {1to8} ; mx %else add r5d, 512 vpbroadcastd ym18, r5d %endif vpaddd ym20, ym3, r7m {1to8} ; my mova ym16, [base+pd_0to7] vpbroadcastd ym19, [abcdq+4*0] ; alpha vpbroadcastd ym21, [abcdq+4*1] ; gamma lea r4, [ssq*3+6] vpdpwssd ym18, ym19, ym16 ; tmx vpdpwssd ym20, ym21, ym16 ; tmy sub srcq, r4 mova m10, [base+warp8x8_permA] lea r4, [mc_warp_filter+64*8] vbroadcasti32x4 m12, [base+warp8x8_permC] kxnorb k1, k1, k1 vbroadcasti32x4 m13, [base+warp8x8_permD] movu ym5, [srcq+0] vinserti32x8 m5, [srcq+8], 1 psrad ym17, ym18, 10 mova m11, [base+warp8x8_permB] kmovb k2, k1 vpgatherdq m3{k1}, [r4+ym17*8] ; filter_x0 psrad ym19, 16 ; beta psrad ym21, 16 ; delta paddd ym18, ym19 vpermb m4, m10, m5 vpbroadcastq m9, [base+warp_shift_h+t0*8] pshufd m3, m3, q3120 paddd m7, m1, m1 pshufb m2, m3, m12 vpdpwssd m1, m4, m2 vpermb m5, m11, m5 vshufi32x4 m4, m5, q1021 pshufb m3, m13 vpdpwssd m1, m4, m3 call .h psllq m2, m1, 32 paddd m1, m2 vpmultishiftqb m1, m9, m1 vpshrdq m1, m0, 48 ; 01 12 call .h vpshrdq m2, m1, m0, 48 ; 23 34 call .h vpshrdq m3, m2, m0, 48 ; 45 56 .main2: call .h psrad ym6, ym20, 10 kmovb k1, k2 paddd ym17, ym20, ym21 ; my += delta vpgatherdq m20{k2}, [r4+ym6*8] ; filter_y0 psrad ym16, ym17, 10 kmovb k2, k1 vpgatherdq m6{k1}, [r4+ym16*8] ; filter_y1 shufps m5, m20, m6, q2020 mova m16, m8 pshufb m4, m5, m12 vpdpwssd m16, m1, m4 ; a0 b0 pshufb m5, m13 mova m1, m2 vpdpwssd m16, m2, m5 ; a1 b1 shufps m6, m20, m6, q3131 paddd ym20, ym17, ym21 pshufb m4, m6, m12 mova m2, m3 vpdpwssd m16, m3, m4 ; a2 b2 vpshrdq m3, m0, 48 ; 67 78 pshufb m6, m13 vpdpwssd m16, m3, m6 ; a3 b3 ret ALIGN function_align .h: movu ym16, [srcq+ssq*1] psrad ym6, ym18, 10 lea srcq, [srcq+ssq*2] vinserti32x8 m5, m16, [srcq+ssq*0], 1 kmovb k1, k2 paddd ym17, ym18, ym19 ; mx += beta vpgatherdq m18{k2}, [r4+ym6*8] ; filter_x1 psrad ym16, ym17, 10 kmovb k2, k1 vpgatherdq m6{k1}, [r4+ym16*8] ; filter_x2 vpermb m4, m10, m5 shufps m16, m18, m6, q2020 shufps m6, m18, m6, q3131 mova m0, m7 pshufb m18, m16, m12 vpdpwssd m0, m4, m18 ; a0 b0 vpermb m5, m11, m5 pshufb m18, m6, m13 vpdpwssd m0, m5, m18 ; a3 b3 paddd ym18, ym17, ym19 vshufi32x4 m17, m4, m5, q1021 pshufb m16, m13 vpdpwssd m0, m17, m16 ; a1 b1 vshufi32x4 m4, m5, q2132 pshufb m6, m12 vpdpwssd m0, m4, m6 ; a2 b2 vpmultishiftqb m0, m9, m0 ; a a b b ret %macro BIDIR_FN 0 call .main lea stride3q, [strideq*3] jmp wq .w4: movq [dstq ], xm0 movhps [dstq+strideq*1], xm0 vextracti32x4 xm2, ym0, 1 movq [dstq+strideq*2], xm2 movhps [dstq+stride3q ], xm2 cmp hd, 8 jl .w4_end vextracti32x4 xm2, m0, 2 lea dstq, [dstq+strideq*4] movq [dstq ], xm2 movhps [dstq+strideq*1], xm2 vextracti32x4 xm0, m0, 3 movq [dstq+strideq*2], xm0 movhps [dstq+stride3q ], xm0 je .w4_end lea dstq, [dstq+strideq*4] movq [dstq ], xm1 movhps [dstq+strideq*1], xm1 vextracti32x4 xm0, ym1, 1 movq [dstq+strideq*2], xm0 movhps [dstq+stride3q ], xm0 vextracti32x4 xm0, m1, 2 lea dstq, [dstq+strideq*4] movq [dstq ], xm0 movhps [dstq+strideq*1], xm0 vextracti32x4 xm1, m1, 3 movq [dstq+strideq*2], xm1 movhps [dstq+stride3q ], xm1 .w4_end: RET .w8_loop: call .main lea dstq, [dstq+strideq*4] .w8: mova [dstq+strideq*0], xm0 vextracti32x4 [dstq+strideq*1], ym0, 1 vextracti32x4 [dstq+strideq*2], m0, 2 vextracti32x4 [dstq+stride3q ], m0, 3 sub hd, 8 jl .w8_end lea dstq, [dstq+strideq*4] mova 
[dstq+strideq*0], xm1 vextracti32x4 [dstq+strideq*1], ym1, 1 vextracti32x4 [dstq+strideq*2], m1, 2 vextracti32x4 [dstq+stride3q ], m1, 3 jg .w8_loop .w8_end: RET .w16_loop: call .main lea dstq, [dstq+strideq*4] .w16: mova [dstq+strideq*0], ym0 vextracti32x8 [dstq+strideq*1], m0, 1 mova [dstq+strideq*2], ym1 vextracti32x8 [dstq+stride3q ], m1, 1 sub hd, 4 jg .w16_loop RET .w32_loop: call .main lea dstq, [dstq+strideq*2] .w32: mova [dstq+strideq*0], m0 mova [dstq+strideq*1], m1 sub hd, 2 jg .w32_loop RET .w64_loop: call .main add dstq, strideq .w64: mova [dstq+64*0], m0 mova [dstq+64*1], m1 dec hd jg .w64_loop RET .w128_loop: call .main add dstq, strideq .w128: mova [dstq+64*0], m0 mova [dstq+64*1], m1 call .main mova [dstq+64*2], m0 mova [dstq+64*3], m1 dec hd jg .w128_loop RET %endmacro %if WIN64 DECLARE_REG_TMP 5 %else DECLARE_REG_TMP 7 %endif cglobal avg_16bpc, 4, 7, 4, dst, stride, tmp1, tmp2, w, h, stride3 %define base r6-avg_avx512icl_table lea r6, [avg_avx512icl_table] tzcnt wd, wm mov t0d, r6m ; pixel_max movsxd wq, [r6+wq*4] shr t0d, 11 vpbroadcastd m2, [base+avg_round+t0*4] vpbroadcastd m3, [base+avg_shift+t0*4] movifnidn hd, hm add wq, r6 BIDIR_FN ALIGN function_align .main: mova m0, [tmp1q+64*0] paddsw m0, [tmp2q+64*0] mova m1, [tmp1q+64*1] paddsw m1, [tmp2q+64*1] add tmp1q, 64*2 add tmp2q, 64*2 pmaxsw m0, m2 pmaxsw m1, m2 psubsw m0, m2 psubsw m1, m2 vpsrlvw m0, m3 vpsrlvw m1, m3 ret cglobal w_avg_16bpc, 4, 7, 8, dst, stride, tmp1, tmp2, w, h, stride3 %define base r6-w_avg_avx512icl_table lea r6, [w_avg_avx512icl_table] tzcnt wd, wm mov t0d, r7m ; pixel_max shr t0d, 11 movsxd wq, [r6+wq*4] vpbroadcastd m5, [base+w_avg_round+t0*4] vpbroadcastd m7, [base+bidir_shift+t0*4] add wq, r6 mov r6d, r6m ; weight lea t0d, [r6-16] shl r6d, 16 sub r6d, t0d ; 16-weight, weight movifnidn hd, hm vpbroadcastd m6, r6d BIDIR_FN ALIGN function_align .main: mova m3, [tmp1q+64*0] mova m1, [tmp2q+64*0] mova m0, [tmp1q+64*1] mova m4, [tmp2q+64*1] add tmp1q, 64*2 add tmp2q, 64*2 punpcklwd m2, m1, m3 punpckhwd m1, m3 punpcklwd m3, m4, m0 punpckhwd m4, m0 mova m0, m5 vpdpwssd m0, m6, m2 mova m2, m5 vpdpwssd m2, m6, m1 mova m1, m5 vpdpwssd m1, m6, m3 mova m3, m5 vpdpwssd m3, m6, m4 REPX {psrad x, 2}, m0, m2, m1, m3 packusdw m0, m2 packusdw m1, m3 vpsrlvw m0, m7 vpsrlvw m1, m7 ret cglobal mask_16bpc, 4, 8, 11, dst, stride, tmp1, tmp2, w, h, mask, stride3 %define base r7-mask_avx512icl_table lea r7, [mask_avx512icl_table] tzcnt wd, wm mov r6d, r7m ; pixel_max movifnidn hd, hm shr r6d, 11 movsxd wq, [r7+wq*4] vpbroadcastd m8, [base+pw_64] vpbroadcastd m9, [base+mask_round+r6*4] vpbroadcastd m10, [base+bidir_shift+r6*4] mov maskq, maskmp add wq, r7 BIDIR_FN ALIGN function_align .main: pmovzxbw m1, [maskq+32*0] mova m4, [tmp1q+64*0] mova m2, [tmp2q+64*0] pmovzxbw m6, [maskq+32*1] mova m5, [tmp1q+64*1] mova m3, [tmp2q+64*1] add maskq, 32*2 add tmp1q, 64*2 add tmp2q, 64*2 punpcklwd m7, m4, m2 punpckhwd m4, m2 psubw m0, m8, m1 punpcklwd m2, m1, m0 ; m, 64-m punpckhwd m1, m0 mova m0, m9 vpdpwssd m0, m7, m2 mova m2, m9 vpdpwssd m2, m4, m1 ; tmp1 * m + tmp2 * (64-m) punpcklwd m7, m5, m3 punpckhwd m5, m3 psubw m1, m8, m6 punpcklwd m3, m6, m1 punpckhwd m6, m1 mova m1, m9 vpdpwssd m1, m7, m3 mova m3, m9 vpdpwssd m3, m5, m6 REPX {psrad x, 4}, m0, m2, m1, m3 packusdw m0, m2 packusdw m1, m3 vpsrlvw m0, m10 vpsrlvw m1, m10 ret cglobal w_mask_420_16bpc, 4, 8, 16, dst, stride, tmp1, tmp2, w, h, mask, stride3 %define base r7-w_mask_420_avx512icl_table lea r7, [w_mask_420_avx512icl_table] tzcnt wd, wm mov r6d, r8m ; pixel_max 
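; bitdepth_max >> 11 is 0 for 10-bit and 1 for 12-bit input, selecting the
; per-bitdepth rounding and shift constants loaded below.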
movifnidn hd, hm shr r6d, 11 movsxd wq, [r7+wq*4] vpbroadcastd m10, [base+pw_27615] ; ((64 - 38) << 10) + 1023 - 32 vpbroadcastd m11, [base+pw_64] vpbroadcastd m12, [base+mask_round+r6*4] vpbroadcastd m13, [base+bidir_shift+r6*4] mov r6d, r7m ; sign vpbroadcastd m14, [base+w_mask_round+r6*4] mova ym15, [w_mask_end42x] mov maskq, maskmp add wq, r7 call .main lea stride3q, [strideq*3] jmp wq .w4: mova m4, [w_mask_shuf4] vpermt2b m2, m4, m3 mova m3, m14 vpdpbusd m3, m2, [pb_64] {1to16} vpermb m3, m15, m3 movq [dstq+strideq*0], xm0 movhps [dstq+strideq*1], xm0 vextracti32x4 xm2, ym0, 1 movq [dstq+strideq*2], xm2 movhps [dstq+stride3q ], xm2 mova [maskq], xm3 cmp hd, 8 jl .w4_end vextracti32x4 xm2, m0, 2 lea dstq, [dstq+strideq*4] movq [dstq+strideq*0], xm2 movhps [dstq+strideq*1], xm2 vextracti32x4 xm0, m0, 3 movq [dstq+strideq*2], xm0 movhps [dstq+stride3q ], xm0 je .w4_end lea dstq, [dstq+strideq*4] movq [dstq+strideq*0], xm1 movhps [dstq+strideq*1], xm1 vextracti32x4 xm2, ym1, 1 movq [dstq+strideq*2], xm2 movhps [dstq+stride3q ], xm2 vextracti32x4 xm2, m1, 2 lea dstq, [dstq+strideq*4] movq [dstq+strideq*0], xm2 movhps [dstq+strideq*1], xm2 vextracti32x4 xm1, m1, 3 movq [dstq+strideq*2], xm1 movhps [dstq+stride3q ], xm1 .w4_end: RET .w8: mova m8, [w_mask_shuf8] vpbroadcastd m9, [pb_64] jmp .w8_start .w8_loop: call .main lea dstq, [dstq+strideq*4] add maskq, 16 .w8_start: vpermt2b m2, m8, m3 mova m3, m14 vpdpbusd m3, m2, m9 vpermb m3, m15, m3 mova [dstq+strideq*0], xm0 vextracti32x4 [dstq+strideq*1], ym0, 1 vextracti32x4 [dstq+strideq*2], m0, 2 vextracti32x4 [dstq+stride3q ], m0, 3 mova [maskq], xm3 sub hd, 8 jl .w8_end lea dstq, [dstq+strideq*4] mova [dstq+strideq*0], xm1 vextracti32x4 [dstq+strideq*1], ym1, 1 vextracti32x4 [dstq+strideq*2], m1, 2 vextracti32x4 [dstq+stride3q ], m1, 3 jg .w8_loop .w8_end: RET .w16: mova m8, [w_mask_shuf16] vpbroadcastd m9, [pb_64] jmp .w16_start .w16_loop: call .main lea dstq, [dstq+strideq*4] add maskq, 16 .w16_start: vpermt2b m2, m8, m3 mova m3, m14 vpdpbusd m3, m2, m9 vpermb m3, m15, m3 mova [dstq+strideq*0], ym0 vextracti32x8 [dstq+strideq*1], m0, 1 mova [dstq+strideq*2], ym1 vextracti32x8 [dstq+stride3q ], m1, 1 mova [maskq], xm3 sub hd, 4 jg .w16_loop RET .w32_loop: call .main lea dstq, [dstq+strideq*4] add maskq, 32 .w32: paddw m2, m3 mova m8, m14 vpdpwssd m8, m11, m2 mova [dstq+strideq*0], m0 mova [dstq+strideq*1], m1 call .main paddw m2, m3 mova m3, m14 vpdpwssd m3, m11, m2 vpermt2b m8, m15, m3 mova [dstq+strideq*2], m0 mova [dstq+stride3q ], m1 mova [maskq], ym8 sub hd, 4 jg .w32_loop RET .w64_loop: call .main lea dstq, [dstq+strideq*2] add maskq, 32 .w64: mova m8, m2 mova m9, m3 mova [dstq+strideq*0+64*0], m0 mova [dstq+strideq*0+64*1], m1 call .main paddw m8, m2 paddw m9, m3 mova m2, m14 vpdpwssd m2, m11, m8 mova m3, m14 vpdpwssd m3, m11, m9 vpermt2b m2, m15, m3 mova [dstq+strideq*1+64*0], m0 mova [dstq+strideq*1+64*1], m1 mova [maskq], ym2 sub hd, 2 jg .w64_loop RET .w128_loop: call .main lea dstq, [dstq+strideq*2] add maskq, 64 .w128: mova m16, m2 mova m8, m3 mova [dstq+strideq*0+64*0], m0 mova [dstq+strideq*0+64*1], m1 call .main mova m17, m2 mova m9, m3 mova [dstq+strideq*0+64*2], m0 mova [dstq+strideq*0+64*3], m1 call .main paddw m2, m16 paddw m3, m8 mova m16, m14 vpdpwssd m16, m11, m2 mova m8, m14 vpdpwssd m8, m11, m3 mova [dstq+strideq*1+64*0], m0 mova [dstq+strideq*1+64*1], m1 call .main paddw m2, m17 paddw m3, m9 mova m17, m14 vpdpwssd m17, m11, m2 mova m9, m14 vpdpwssd m9, m11, m3 vpermt2b m16, m15, m8 vpermt2b m17, m15, m9 mova 
[dstq+strideq*1+64*2], m0 mova [dstq+strideq*1+64*3], m1 mova [maskq+32*0], ym16 mova [maskq+32*1], ym17 sub hd, 2 jg .w128_loop vzeroupper RET ALIGN function_align .main: mova m1, [tmp1q+64*0] mova m3, [tmp2q+64*0] mova m4, [tmp1q+64*1] mova m7, [tmp2q+64*1] add tmp1q, 64*2 add tmp2q, 64*2 psubsw m6, m1, m3 punpcklwd m5, m3, m1 pabsw m6, m6 punpckhwd m3, m1 psubusw m6, m10, m6 psrlw m6, 10 ; 64-m psubw m2, m11, m6 ; m punpcklwd m1, m6, m2 punpckhwd m6, m2 mova m0, m12 vpdpwssd m0, m5, m1 mova m1, m12 vpdpwssd m1, m3, m6 psubsw m5, m4, m7 punpcklwd m6, m7, m4 pabsw m5, m5 punpckhwd m7, m4 psubusw m5, m10, m5 psrlw m5, 10 psubw m3, m11, m5 punpcklwd m4, m5, m3 psrad m0, 4 punpckhwd m5, m3 psrad m1, 4 packusdw m0, m1 mova m1, m12 vpdpwssd m1, m6, m4 mova m4, m12 vpdpwssd m4, m7, m5 psrad m1, 4 psrad m4, 4 packusdw m1, m4 vpsrlvw m0, m13 vpsrlvw m1, m13 ret cglobal w_mask_422_16bpc, 4, 8, 15, dst, stride, tmp1, tmp2, w, h, mask, stride3 %define base r7-w_mask_422_avx512icl_table lea r7, [w_mask_422_avx512icl_table] tzcnt wd, wm mov r6d, r8m ; pixel_max movifnidn hd, hm shr r6d, 11 movsxd wq, [r7+wq*4] vpbroadcastd m8, [base+pw_27615] ; ((64 - 38) << 10) + 1023 - 32 vpbroadcastd m9, [base+pw_64] vpbroadcastd m10, [base+mask_round+r6*4] vpbroadcastd m11, [base+bidir_shift+r6*4] mov r6d, r7m ; sign vpbroadcastd m12, [base+w_mask_round+r6*4] mova ym13, [w_mask_end42x] mov maskq, maskmp add wq, r7 paddw m14, m9, m9 ; pw_128 call .main lea stride3q, [strideq*3] jmp wq .w4: movq [dstq+strideq*0], xm0 movhps [dstq+strideq*1], xm0 vextracti32x4 xm2, ym0, 1 movq [dstq+strideq*2], xm2 movhps [dstq+stride3q ], xm2 cmp hd, 8 jl .w4_end vextracti32x4 xm2, m0, 2 lea dstq, [dstq+strideq*4] movq [dstq+strideq*0], xm2 movhps [dstq+strideq*1], xm2 vextracti32x4 xm0, m0, 3 movq [dstq+strideq*2], xm0 movhps [dstq+stride3q ], xm0 je .w4_end lea dstq, [dstq+strideq*4] movq [dstq+strideq*0], xm1 movhps [dstq+strideq*1], xm1 vextracti32x4 xm2, ym1, 1 movq [dstq+strideq*2], xm2 movhps [dstq+stride3q ], xm2 vextracti32x4 xm2, m1, 2 lea dstq, [dstq+strideq*4] movq [dstq+strideq*0], xm2 movhps [dstq+strideq*1], xm2 vextracti32x4 xm1, m1, 3 movq [dstq+strideq*2], xm1 movhps [dstq+stride3q ], xm1 .w4_end: RET .w8_loop: call .main lea dstq, [dstq+strideq*4] .w8: mova [dstq+strideq*0], xm0 vextracti32x4 [dstq+strideq*1], ym0, 1 vextracti32x4 [dstq+strideq*2], m0, 2 vextracti32x4 [dstq+stride3q ], m0, 3 sub hd, 8 jl .w8_end lea dstq, [dstq+strideq*4] mova [dstq+strideq*0], xm1 vextracti32x4 [dstq+strideq*1], ym1, 1 vextracti32x4 [dstq+strideq*2], m1, 2 vextracti32x4 [dstq+stride3q ], m1, 3 jg .w8_loop .w8_end: RET .w16_loop: call .main lea dstq, [dstq+strideq*4] .w16: mova [dstq+strideq*0], ym0 vextracti32x8 [dstq+strideq*1], m0, 1 mova [dstq+strideq*2], ym1 vextracti32x8 [dstq+stride3q ], m1, 1 sub hd, 4 jg .w16_loop RET .w32_loop: call .main lea dstq, [dstq+strideq*2] .w32: mova [dstq+strideq*0], m0 mova [dstq+strideq*1], m1 sub hd, 2 jg .w32_loop RET .w64_loop: call .main add dstq, strideq .w64: mova [dstq+64*0], m0 mova [dstq+64*1], m1 dec hd jg .w64_loop RET .w128_loop: call .main add dstq, strideq .w128: mova [dstq+64*0], m0 mova [dstq+64*1], m1 call .main mova [dstq+64*2], m0 mova [dstq+64*3], m1 dec hd jg .w128_loop RET ALIGN function_align .main: mova m1, [tmp1q+64*0] mova m3, [tmp2q+64*0] mova m4, [tmp1q+64*1] mova m7, [tmp2q+64*1] add tmp1q, 64*2 add tmp2q, 64*2 psubsw m6, m1, m3 punpcklwd m5, m3, m1 pabsw m6, m6 punpckhwd m3, m1 psubusw m6, m8, m6 psrlw m6, 10 psubw m2, m9, m6 punpcklwd m1, m6, m2 punpckhwd m6, m2 
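; the blend weights 64-m (derived from |tmp1-tmp2| via the pw_27615 clamp and >>10)
; and m (pw_64 minus that) are interleaved as word pairs in m1/m6, matching the
; interleaved (tmp2, tmp1) samples in m5/m3 for the vpdpwssd blend below.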
mova m0, m10 vpdpwssd m0, m5, m1 mova m1, m10 vpdpwssd m1, m3, m6 psubsw m5, m4, m7 punpcklwd m6, m7, m4 pabsw m5, m5 punpckhwd m7, m4 psubusw m5, m8, m5 psrlw m5, 10 psubw m3, m9, m5 punpcklwd m4, m5, m3 psrad m0, 4 punpckhwd m5, m3 psrad m1, 4 packusdw m0, m1 mova m1, m10 vpdpwssd m1, m6, m4 mova m4, m10 vpdpwssd m4, m7, m5 mova m5, m12 vpdpwssd m5, m14, m2 mova m2, m12 vpdpwssd m2, m14, m3 psrad m1, 4 psrad m4, 4 packusdw m1, m4 vpermt2b m5, m13, m2 vpsrlvw m0, m11 vpsrlvw m1, m11 mova [maskq], ym5 add maskq, 32 ret cglobal w_mask_444_16bpc, 4, 8, 13, dst, stride, tmp1, tmp2, w, h, mask, stride3 %define base r7-w_mask_444_avx512icl_table lea r7, [w_mask_444_avx512icl_table] tzcnt wd, wm mov r6d, r8m ; pixel_max movifnidn hd, hm shr r6d, 11 movsxd wq, [r7+wq*4] vpbroadcastd m8, [base+pw_27615] ; ((64 - 38) << 10) + 1023 - 32 vpbroadcastd m9, [base+pw_64] vpbroadcastd m10, [base+mask_round+r6*4] mova m11, [w_mask_end444] vpbroadcastd m12, [base+bidir_shift+r6*4] mov maskq, maskmp add wq, r7 call .main lea stride3q, [strideq*3] jmp wq .w4: movq [dstq+strideq*0], xm0 movhps [dstq+strideq*1], xm0 vextracti32x4 xm2, ym0, 1 movq [dstq+strideq*2], xm2 movhps [dstq+stride3q ], xm2 cmp hd, 8 jl .w4_end vextracti32x4 xm2, m0, 2 lea dstq, [dstq+strideq*4] movq [dstq+strideq*0], xm2 movhps [dstq+strideq*1], xm2 vextracti32x4 xm0, m0, 3 movq [dstq+strideq*2], xm0 movhps [dstq+stride3q ], xm0 je .w4_end lea dstq, [dstq+strideq*4] movq [dstq+strideq*0], xm1 movhps [dstq+strideq*1], xm1 vextracti32x4 xm2, ym1, 1 movq [dstq+strideq*2], xm2 movhps [dstq+stride3q ], xm2 vextracti32x4 xm2, m1, 2 lea dstq, [dstq+strideq*4] movq [dstq+strideq*0], xm2 movhps [dstq+strideq*1], xm2 vextracti32x4 xm1, m1, 3 movq [dstq+strideq*2], xm1 movhps [dstq+stride3q ], xm1 .w4_end: RET .w8_loop: call .main lea dstq, [dstq+strideq*4] .w8: mova [dstq+strideq*0], xm0 vextracti32x4 [dstq+strideq*1], ym0, 1 vextracti32x4 [dstq+strideq*2], m0, 2 vextracti32x4 [dstq+stride3q ], m0, 3 sub hd, 8 jl .w8_end lea dstq, [dstq+strideq*4] mova [dstq+strideq*0], xm1 vextracti32x4 [dstq+strideq*1], ym1, 1 vextracti32x4 [dstq+strideq*2], m1, 2 vextracti32x4 [dstq+stride3q ], m1, 3 jg .w8_loop .w8_end: RET .w16_loop: call .main lea dstq, [dstq+strideq*4] .w16: mova [dstq+strideq*0], ym0 vextracti32x8 [dstq+strideq*1], m0, 1 mova [dstq+strideq*2], ym1 vextracti32x8 [dstq+stride3q ], m1, 1 sub hd, 4 jg .w16_loop RET .w32_loop: call .main lea dstq, [dstq+strideq*2] .w32: mova [dstq+strideq*0], m0 mova [dstq+strideq*1], m1 sub hd, 2 jg .w32_loop RET .w64_loop: call .main add dstq, strideq .w64: mova [dstq+64*0], m0 mova [dstq+64*1], m1 dec hd jg .w64_loop RET .w128_loop: call .main add dstq, strideq .w128: mova [dstq+64*0], m0 mova [dstq+64*1], m1 call .main mova [dstq+64*2], m0 mova [dstq+64*3], m1 dec hd jg .w128_loop RET ALIGN function_align .main: mova m1, [tmp1q+64*0] mova m3, [tmp2q+64*0] mova m4, [tmp1q+64*1] mova m7, [tmp2q+64*1] add tmp1q, 64*2 add tmp2q, 64*2 psubsw m6, m1, m3 punpcklwd m5, m3, m1 pabsw m6, m6 punpckhwd m3, m1 psubusw m6, m8, m6 psrlw m6, 10 psubw m2, m9, m6 punpcklwd m1, m6, m2 punpckhwd m6, m2 mova m0, m10 vpdpwssd m0, m5, m1 mova m1, m10 vpdpwssd m1, m3, m6 psubsw m5, m4, m7 punpcklwd m6, m7, m4 pabsw m5, m5 punpckhwd m7, m4 psubusw m5, m8, m5 psrlw m5, 10 psubw m3, m9, m5 punpcklwd m4, m5, m3 psrad m0, 4 punpckhwd m5, m3 psrad m1, 4 packusdw m0, m1 mova m1, m10 vpdpwssd m1, m6, m4 mova m4, m10 vpdpwssd m4, m7, m5 vpermt2b m2, m11, m3 psrad m1, 4 psrad m4, 4 packusdw m1, m4 vpsrlvw m0, m12 vpsrlvw m1, m12 mova 
[maskq], m2 add maskq, 64 ret cglobal blend_16bpc, 3, 7, 7, dst, ds, tmp, w, h, mask %define base r6-blend_avx512icl_table lea r6, [blend_avx512icl_table] tzcnt wd, wm movifnidn hd, hm movsxd wq, [r6+wq*4] movifnidn maskq, maskmp vpbroadcastd m6, [base+pw_m512] add wq, r6 lea r6, [dsq*3] jmp wq .w4: pmovzxbw ym19, [maskq] movq xm16, [dstq+dsq*0] movhps xm16, [dstq+dsq*1] vpbroadcastq ym17, [dstq+dsq*2] vpbroadcastq ym18, [dstq+r6 ] pmullw ym19, ym6 vpblendd ym16, ym17, 0x30 vpblendd ym16, ym18, 0xc0 psubw ym17, ym16, [tmpq] add maskq, 16 add tmpq, 32 pmulhrsw ym17, ym19 paddw ym16, ym17 vextracti128 xm17, ym16, 1 movq [dstq+dsq*0], xm16 movhps [dstq+dsq*1], xm16 movq [dstq+dsq*2], xm17 movhps [dstq+r6 ], xm17 lea dstq, [dstq+dsq*4] sub hd, 4 jg .w4 vzeroupper RET .w8: pmovzxbw m2, [maskq] mova xm0, [dstq+dsq*0] vinserti32x4 ym0, [dstq+dsq*1], 1 vinserti32x4 m0, [dstq+dsq*2], 2 vinserti32x4 m0, [dstq+r6 ], 3 pmullw m2, m6 psubw m1, m0, [tmpq] add maskq, 32 add tmpq, 64 pmulhrsw m1, m2 paddw m0, m1 mova [dstq+dsq*0], xm0 vextracti32x4 [dstq+dsq*1], ym0, 1 vextracti32x4 [dstq+dsq*2], m0, 2 vextracti32x4 [dstq+r6 ], m0, 3 lea dstq, [dstq+dsq*4] sub hd, 4 jg .w8 RET .w16: pmovzxbw m4, [maskq+32*0] pmovzxbw m5, [maskq+32*1] mova ym0, [dstq+dsq*0] vinserti32x8 m0, [dstq+dsq*1], 1 mova ym1, [dstq+dsq*2] vinserti32x8 m1, [dstq+r6 ], 1 pmullw m4, m6 pmullw m5, m6 psubw m2, m0, [tmpq+64*0] psubw m3, m1, [tmpq+64*1] add maskq, 32*2 add tmpq, 64*2 pmulhrsw m2, m4 pmulhrsw m3, m5 paddw m0, m2 paddw m1, m3 mova [dstq+dsq*0], ym0 vextracti32x8 [dstq+dsq*1], m0, 1 mova [dstq+dsq*2], ym1 vextracti32x8 [dstq+r6 ], m1, 1 lea dstq, [dstq+dsq*4] sub hd, 4 jg .w16 RET .w32: pmovzxbw m4, [maskq+32*0] pmovzxbw m5, [maskq+32*1] mova m0, [dstq+dsq*0] mova m1, [dstq+dsq*1] pmullw m4, m6 pmullw m5, m6 psubw m2, m0, [tmpq+ 64*0] psubw m3, m1, [tmpq+ 64*1] add maskq, 32*2 add tmpq, 64*2 pmulhrsw m2, m4 pmulhrsw m3, m5 paddw m0, m2 paddw m1, m3 mova [dstq+dsq*0], m0 mova [dstq+dsq*1], m1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .w32 RET cglobal blend_v_16bpc, 3, 6, 5, dst, ds, tmp, w, h lea r5, [blend_v_avx512icl_table] tzcnt wd, wm movifnidn hd, hm movsxd wq, [r5+wq*4] add wq, r5 jmp wq .w2: vpbroadcastd xmm2, [obmc_masks_avx2+2*2] .w2_loop: movd xmm0, [dstq+dsq*0] pinsrd xmm0, [dstq+dsq*1], 1 movq xmm1, [tmpq] add tmpq, 4*2 psubw xmm1, xmm0, xmm1 pmulhrsw xmm1, xmm2 paddw xmm0, xmm1 movd [dstq+dsq*0], xmm0 pextrd [dstq+dsq*1], xmm0, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .w2_loop RET .w4: vpbroadcastq xmm2, [obmc_masks_avx2+4*2] .w4_loop: movq xmm0, [dstq+dsq*0] movhps xmm0, [dstq+dsq*1] psubw xmm1, xmm0, [tmpq] add tmpq, 8*2 pmulhrsw xmm1, xmm2 paddw xmm0, xmm1 movq [dstq+dsq*0], xmm0 movhps [dstq+dsq*1], xmm0 lea dstq, [dstq+dsq*2] sub hd, 2 jg .w4_loop RET .w8: vbroadcasti32x4 ym2, [obmc_masks_avx2+8*2] .w8_loop: mova xm0, [dstq+dsq*0] vinserti32x4 ym0, [dstq+dsq*1], 1 psubw ym1, ym0, [tmpq] add tmpq, 16*2 pmulhrsw ym1, ym2 paddw ym0, ym1 mova [dstq+dsq*0], xm0 vextracti32x4 [dstq+dsq*1], ym0, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .w8_loop RET .w16: vbroadcasti32x8 m2, [obmc_masks_avx2+16*2] .w16_loop: mova ym0, [dstq+dsq*0] vinserti32x8 m0, [dstq+dsq*1], 1 psubw m1, m0, [tmpq] add tmpq, 32*2 pmulhrsw m1, m2 paddw m0, m1 mova [dstq+dsq*0], ym0 vextracti32x8 [dstq+dsq*1], m0, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .w16_loop RET .w32: mova m4, [obmc_masks_avx2+32*2] .w32_loop: mova m0, [dstq+dsq*0] psubw m2, m0, [tmpq+ 64*0] mova m1, [dstq+dsq*1] psubw m3, m1, [tmpq+ 64*1] add tmpq, 64*2 pmulhrsw m2, m4 pmulhrsw m3, m4 paddw 
m0, m2 paddw m1, m3 mova [dstq+dsq*0], m0 mova [dstq+dsq*1], m1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .w32_loop RET cglobal blend_h_16bpc, 3, 7, 9, dst, ds, tmp, w, h, mask %define base r6-$$ lea r6, [$$] tzcnt wd, wm mov hd, hm movsxd wq, [base+blend_h_avx512icl_table+wq*4] lea maskq, [base+obmc_masks_avx2+hq*2] lea hd, [hq*3] lea wq, [base+blend_h_avx512icl_table+wq] shr hd, 2 ; h * 3/4 lea maskq, [maskq+hq*2] neg hq jmp wq .w2: movd xmm0, [dstq+dsq*0] pinsrd xmm0, [dstq+dsq*1], 1 movd xmm2, [maskq+hq*2] movq xmm1, [tmpq] add tmpq, 4*2 punpcklwd xmm2, xmm2 psubw xmm1, xmm0, xmm1 pmulhrsw xmm1, xmm2 paddw xmm0, xmm1 movd [dstq+dsq*0], xmm0 pextrd [dstq+dsq*1], xmm0, 1 lea dstq, [dstq+dsq*2] add hq, 2 jl .w2 RET .w4: mova xmm3, [blend_shuf] .w4_loop: movq xmm0, [dstq+dsq*0] movhps xmm0, [dstq+dsq*1] movd xmm2, [maskq+hq*2] psubw xmm1, xmm0, [tmpq] add tmpq, 8*2 pshufb xmm2, xmm3 pmulhrsw xmm1, xmm2 paddw xmm0, xmm1 movq [dstq+dsq*0], xmm0 movhps [dstq+dsq*1], xmm0 lea dstq, [dstq+dsq*2] add hq, 2 jl .w4_loop RET .w8: vbroadcasti32x4 ym3, [blend_shuf] shufpd ym3, ym3, 0x0c .w8_loop: mova xm0, [dstq+dsq*0] vinserti32x4 ym0, [dstq+dsq*1], 1 vpbroadcastd ym2, [maskq+hq*2] psubw ym1, ym0, [tmpq] add tmpq, 16*2 pshufb ym2, ym3 pmulhrsw ym1, ym2 paddw ym0, ym1 mova [dstq+dsq*0], xm0 vextracti32x4 [dstq+dsq*1], ym0, 1 lea dstq, [dstq+dsq*2] add hq, 2 jl .w8_loop RET .w16: vbroadcasti32x4 m3, [blend_shuf] shufpd m3, m3, 0xf0 .w16_loop: mova ym0, [dstq+dsq*0] vinserti32x8 m0, [dstq+dsq*1], 1 vpbroadcastd m2, [maskq+hq*2] psubw m1, m0, [tmpq] add tmpq, 32*2 pshufb m2, m3 pmulhrsw m1, m2 paddw m0, m1 mova [dstq+dsq*0], ym0 vextracti32x8 [dstq+dsq*1], m0, 1 lea dstq, [dstq+dsq*2] add hq, 2 jl .w16_loop RET .w32: vpbroadcastw m4, [maskq+hq*2] vpbroadcastw m5, [maskq+hq*2+2] mova m0, [dstq+dsq*0] psubw m2, m0, [tmpq+ 64*0] mova m1, [dstq+dsq*1] psubw m3, m1, [tmpq+ 64*1] add tmpq, 64*2 pmulhrsw m2, m4 pmulhrsw m3, m5 paddw m0, m2 paddw m1, m3 mova [dstq+dsq*0], m0 mova [dstq+dsq*1], m1 lea dstq, [dstq+dsq*2] add hq, 2 jl .w32 RET .w64: vpbroadcastw m4, [maskq+hq*2] mova m0, [dstq+64*0] psubw m2, m0, [tmpq+64*0] mova m1, [dstq+64*1] psubw m3, m1, [tmpq+64*1] add tmpq, 64*2 pmulhrsw m2, m4 pmulhrsw m3, m4 paddw m0, m2 paddw m1, m3 mova [dstq+64*0], m0 mova [dstq+64*1], m1 add dstq, dsq inc hq jl .w64 RET .w128: vpbroadcastw m8, [maskq+hq*2] mova m0, [dstq+64*0] psubw m4, m0, [tmpq+64*0] mova m1, [dstq+64*1] psubw m5, m1, [tmpq+64*1] mova m2, [dstq+64*2] psubw m6, m2, [tmpq+64*2] mova m3, [dstq+64*3] psubw m7, m3, [tmpq+64*3] add tmpq, 64*4 REPX {pmulhrsw x, m8}, m4, m5, m6, m7 paddw m0, m4 paddw m1, m5 paddw m2, m6 paddw m3, m7 mova [dstq+64*0], m0 mova [dstq+64*1], m1 mova [dstq+64*2], m2 mova [dstq+64*3], m3 add dstq, dsq inc hq jl .w128 RET cglobal resize_16bpc, 6, 12, 32, dst, dst_stride, src, src_stride, \ dst_w, h, src_w, dx, mx0, pxmax sub dword mx0m, 4<<14 sub dword src_wm, 8 mov r6, ~0 vpbroadcastd m5, dxm vpbroadcastd m8, mx0m vpbroadcastd m6, src_wm kmovq k6, r6 DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, h, x, _, _, pxmax LEA r7, $$ %define base r7-$$ vpbroadcastd m3, [base+pd_16384] vpbroadcastd m7, [base+pd_63] mova m24, [base+resize_permA] mova m25, [base+resize_permB] mova m26, [base+resize_permC] mova m27, [base+resize_permD] vbroadcasti32x4 m28, [base+resize_shufA] vbroadcasti32x4 m29, [base+resize_shufB] mova m30, [base+resize_permE] vpbroadcastw ym31, pxmaxm vpdpwssd m8, m5, [base+rescale_mul] ; mx+dx*[0-15] pslld m5, 4 ; dx*16 pslld m6, 14 pxor m2, m2 .loop_y: xor xd, xd mova 
m4, m8 ; per-line working version of mx .loop_x: pmaxsd m0, m4, m2 psrad m9, m4, 8 ; filter offset (unmasked) pminsd m0, m6 ; iclip(mx, 0, src_w-8) psubd m1, m4, m0 ; pshufb offset psrad m0, 14 ; clipped src_x offset psrad m1, 14 ; pshufb edge_emu offset vptestmd k5, m1, m1 pand m9, m7 ; filter offset (masked) ktestw k5, k5 jz .load vpbroadcastq m14, [base+pd_0_4] vpermq m10, m0, q1100 vpermq m11, m0, q3322 vpermq m20, m1, q1100 vpermq m21, m1, q3322 punpckldq m10, m10 punpckldq m11, m11 punpckldq m20, m20 punpckldq m21, m21 paddd m10, m14 paddd m11, m14 paddd m20, m14 paddd m21, m14 vextracti32x8 ym12, m10, 1 vextracti32x8 ym13, m11, 1 vextracti32x8 ym22, m20, 1 vextracti32x8 ym23, m21, 1 kmovq k1, k6 kmovq k2, k6 kmovq k3, k6 kmovq k4, k6 vpgatherdq m16{k1}, [srcq+ym10*2] ; 0 1 2 3 vpgatherdq m17{k2}, [srcq+ym11*2] ; 4 5 6 7 vpgatherdq m18{k3}, [srcq+ym12*2] ; 8 9 A B vpgatherdq m19{k4}, [srcq+ym13*2] ; C D E F kmovq k1, k6 kmovq k2, k6 kmovq k3, k6 kmovq k4, k6 vpgatherdq m0{k1}, [base+resize_shuf+8+ym20*2] vpgatherdq m1{k2}, [base+resize_shuf+8+ym21*2] vpgatherdq m14{k3}, [base+resize_shuf+8+ym22*2] vpgatherdq m15{k4}, [base+resize_shuf+8+ym23*2] pshufb m16, m0 pshufb m17, m1 pshufb m18, m14 pshufb m19, m15 mova m20, m24 mova m22, m24 mova m21, m25 mova m23, m25 vpermi2d m20, m16, m17 ; 0-3a 0-3b 4-7a 4-7b vpermi2d m21, m16, m17 ; 0-3c 0-3d 4-7c 4-7d vpermi2d m22, m18, m19 ; 8-Ba 8-Bb C-Fa C-Fb vpermi2d m23, m18, m19 ; 8-Bc 8-Bd C-Fc C-Fd mova m15, m26 mova m17, m26 mova m16, m27 mova m18, m27 vpermi2q m15, m20, m22 ; 0-3a 4-7a 8-Ba C-Fa vpermi2q m16, m20, m22 ; 0-3b 4-7b 8-Bb C-Fb vpermi2q m17, m21, m23 ; 0-3c 4-7c 8-Bc C-Fc vpermi2q m18, m21, m23 ; 0-3d 4-7d 8-Bd C-Fd kmovq k1, k6 kmovq k2, k6 vpgatherdd m11{k1}, [base+resize_filter+m9*8+0] vpgatherdd m13{k2}, [base+resize_filter+m9*8+4] pshufb m10, m11, m28 pshufb m11, m11, m29 pshufb m12, m13, m28 pshufb m13, m13, m29 jmp .filter .load: kmovq k1, k6 kmovq k2, k6 kmovq k3, k6 kmovq k4, k6 vpgatherdd m11{k1}, [base+resize_filter+m9*8+0] vpgatherdd m13{k2}, [base+resize_filter+m9*8+4] pshufb m10, m11, m28 pshufb m11, m11, m29 pshufb m12, m13, m28 pshufb m13, m13, m29 vpgatherdd m15{k3}, [srcq+m0*2+ 0] vpgatherdd m16{k4}, [srcq+m0*2+ 4] kmovq k1, k6 kmovq k2, k6 vpgatherdd m17{k1}, [srcq+m0*2+ 8] vpgatherdd m18{k2}, [srcq+m0*2+12] .filter: mova m14, m2 vpdpwssd m14, m15, m10 vpdpwssd m14, m16, m11 vpdpwssd m14, m17, m12 vpdpwssd m14, m18, m13 psubd m14, m3, m14 psrad m14, 15 packusdw m14, m14 vpermq m14, m30, m14 pminsw ym14, ym31 mova [dstq+xq*2], ym14 paddd m4, m5 add xd, 16 cmp xd, dst_wd jl .loop_x add dstq, dst_strideq add srcq, src_strideq dec hd jg .loop_y RET %endif ; ARCH_X86_64 av-scenechange-0.14.1/src/asm/x86/mc16_sse.asm000064400000000000000000010371341046102023000167050ustar 00000000000000; Copyright © 2021, VideoLAN and dav1d authors ; Copyright © 2021, Two Orioles, LLC ; All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions are met: ; ; 1. Redistributions of source code must retain the above copyright notice, this ; list of conditions and the following disclaimer. ; ; 2. Redistributions in binary form must reproduce the above copyright notice, ; this list of conditions and the following disclaimer in the documentation ; and/or other materials provided with the distribution. 
; ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. %include "config.asm" %include "src/asm/x86/x86inc.asm" SECTION_RODATA ; dav1d_obmc_masks[] << 9 obmc_masks: dw 0, 0, 9728, 0, 12800, 7168, 2560, 0 dw 14336, 11264, 8192, 5632, 3584, 1536, 0, 0 dw 15360, 13824, 12288, 10752, 9216, 7680, 6144, 5120 dw 4096, 3072, 2048, 1536, 0, 0, 0, 0 dw 15872, 14848, 14336, 13312, 12288, 11776, 10752, 10240 dw 9728, 8704, 8192, 7168, 6656, 6144, 5632, 4608 dw 4096, 3584, 3072, 2560, 2048, 2048, 1536, 1024 blend_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3 spel_h_shufA: db 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9 spel_h_shufB: db 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13 spel_h_shuf2: db 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9 spel_s_shuf2: db 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7 spel_s_shuf8: db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 unpckw: db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 rescale_mul: dd 0, 1, 2, 3 resize_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7 db 8, 9, 10, 11, 12, 13, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15 bdct_lb_q: times 8 db 0 times 8 db 4 times 8 db 8 times 8 db 12 pw_2: times 8 dw 2 pw_16: times 4 dw 16 prep_mul: times 4 dw 16 times 8 dw 4 pw_64: times 8 dw 64 pw_256: times 8 dw 256 pw_2048: times 4 dw 2048 bidir_mul: times 4 dw 2048 pw_8192: times 8 dw 8192 pw_27615: times 8 dw 27615 pw_32766: times 8 dw 32766 pw_m512: times 8 dw -512 pd_63: times 4 dd 63 pd_64: times 4 dd 64 pd_512: times 4 dd 512 pd_m524256: times 4 dd -524256 ; -8192 << 6 + 32 pd_0x3ff: times 4 dd 0x3ff pd_0x4000: times 4 dd 0x4000 pq_0x400000: times 2 dq 0x400000 pq_0x40000000: times 2 dq 0x40000000 pd_65538: times 2 dd 65538 put_bilin_h_rnd: times 4 dw 8 times 4 dw 10 s_8tap_h_rnd: times 2 dd 2 times 2 dd 8 put_s_8tap_v_rnd: times 2 dd 512 times 2 dd 128 s_8tap_h_sh: dd 2, 4 put_s_8tap_v_sh: dd 10, 8 bidir_rnd: times 4 dw -16400 times 4 dw -16388 put_8tap_h_rnd: dd 34, 34, 40, 40 prep_8tap_1d_rnd: times 2 dd 8 - (8192 << 4) prep_8tap_2d_rnd: times 4 dd 32 - (8192 << 5) warp8x8_shift: dd 11, 13 warp8x8_rnd1: dd 1024, 1024, 4096, 4096 warp8x8_rnd2: times 4 dw 4096 times 4 dw 16384 warp8x8t_rnd: times 2 dd 16384 - (8192 << 15) %macro BIDIR_JMP_TABLE 2-* %xdefine %1_%2_table (%%table - 2*%3) %xdefine %%base %1_%2_table %xdefine %%prefix mangle(private_prefix %+ _%1_16bpc_%2) %%table: %rep %0 - 2 dd %%prefix %+ .w%3 - %%base %rotate 1 %endrep %endmacro BIDIR_JMP_TABLE avg, ssse3, 4, 8, 16, 32, 64, 128 BIDIR_JMP_TABLE w_avg, ssse3, 4, 8, 16, 32, 64, 128 BIDIR_JMP_TABLE mask, ssse3, 4, 8, 16, 32, 64, 128 BIDIR_JMP_TABLE w_mask_420, ssse3, 4, 8, 16, 32, 64, 128 BIDIR_JMP_TABLE w_mask_422, ssse3, 4, 8, 16, 32, 64, 128 BIDIR_JMP_TABLE w_mask_444, ssse3, 4, 8, 16, 32, 64, 128 BIDIR_JMP_TABLE blend, ssse3, 4, 8, 16, 32 
BIDIR_JMP_TABLE blend_v, ssse3, 2, 4, 8, 16, 32 BIDIR_JMP_TABLE blend_h, ssse3, 2, 4, 8, 16, 32, 64, 128 %macro BASE_JMP_TABLE 3-* %xdefine %1_%2_table (%%table - %3) %xdefine %%base %1_%2 %%table: %rep %0 - 2 dw %%base %+ _w%3 - %%base %rotate 1 %endrep %endmacro %xdefine put_ssse3 mangle(private_prefix %+ _put_bilin_16bpc_ssse3.put) %xdefine prep_ssse3 mangle(private_prefix %+ _prep_bilin_16bpc_ssse3.prep) BASE_JMP_TABLE put, ssse3, 2, 4, 8, 16, 32, 64, 128 BASE_JMP_TABLE prep, ssse3, 4, 8, 16, 32, 64, 128 %macro SCALED_JMP_TABLE 2-* %xdefine %1_%2_table (%%table - %3) %xdefine %%base mangle(private_prefix %+ _%1_16bpc_%2) %%table: %rep %0 - 2 dw %%base %+ .w%3 - %%base %rotate 1 %endrep %rotate 2 %%dy_1024: %xdefine %1_%2_dy1_table (%%dy_1024 - %3) %rep %0 - 2 dw %%base %+ .dy1_w%3 - %%base %rotate 1 %endrep %rotate 2 %%dy_2048: %xdefine %1_%2_dy2_table (%%dy_2048 - %3) %rep %0 - 2 dw %%base %+ .dy2_w%3 - %%base %rotate 1 %endrep %endmacro SCALED_JMP_TABLE put_8tap_scaled, ssse3, 2, 4, 8, 16, 32, 64, 128 SCALED_JMP_TABLE prep_8tap_scaled, ssse3, 4, 8, 16, 32, 64, 128 cextern mc_subpel_filters %define subpel_filters (mangle(private_prefix %+ _mc_subpel_filters)-8) cextern mc_warp_filter cextern resize_filter SECTION .text %if UNIX64 DECLARE_REG_TMP 7 %else DECLARE_REG_TMP 5 %endif INIT_XMM ssse3 cglobal put_bilin_16bpc, 4, 7, 0, dst, ds, src, ss, w, h, mxy %define base t0-put_ssse3 mov mxyd, r6m ; mx LEA t0, put_ssse3 movifnidn wd, wm test mxyd, mxyd jnz .h mov mxyd, r7m ; my test mxyd, mxyd jnz .v .put: tzcnt wd, wd movzx wd, word [base+put_ssse3_table+wq*2] add wq, t0 movifnidn hd, hm jmp wq .put_w2: mov r4d, [srcq+ssq*0] mov r6d, [srcq+ssq*1] lea srcq, [srcq+ssq*2] mov [dstq+dsq*0], r4d mov [dstq+dsq*1], r6d lea dstq, [dstq+dsq*2] sub hd, 2 jg .put_w2 RET .put_w4: movq m0, [srcq+ssq*0] movq m1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] movq [dstq+dsq*0], m0 movq [dstq+dsq*1], m1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .put_w4 RET .put_w8: movu m0, [srcq+ssq*0] movu m1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] mova [dstq+dsq*0], m0 mova [dstq+dsq*1], m1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .put_w8 RET .put_w16: movu m0, [srcq+ssq*0+16*0] movu m1, [srcq+ssq*0+16*1] movu m2, [srcq+ssq*1+16*0] movu m3, [srcq+ssq*1+16*1] lea srcq, [srcq+ssq*2] mova [dstq+dsq*0+16*0], m0 mova [dstq+dsq*0+16*1], m1 mova [dstq+dsq*1+16*0], m2 mova [dstq+dsq*1+16*1], m3 lea dstq, [dstq+dsq*2] sub hd, 2 jg .put_w16 RET .put_w32: movu m0, [srcq+16*0] movu m1, [srcq+16*1] movu m2, [srcq+16*2] movu m3, [srcq+16*3] add srcq, ssq mova [dstq+16*0], m0 mova [dstq+16*1], m1 mova [dstq+16*2], m2 mova [dstq+16*3], m3 add dstq, dsq dec hd jg .put_w32 RET .put_w64: movu m0, [srcq+16*0] movu m1, [srcq+16*1] movu m2, [srcq+16*2] movu m3, [srcq+16*3] mova [dstq+16*0], m0 mova [dstq+16*1], m1 mova [dstq+16*2], m2 mova [dstq+16*3], m3 movu m0, [srcq+16*4] movu m1, [srcq+16*5] movu m2, [srcq+16*6] movu m3, [srcq+16*7] add srcq, ssq mova [dstq+16*4], m0 mova [dstq+16*5], m1 mova [dstq+16*6], m2 mova [dstq+16*7], m3 add dstq, dsq dec hd jg .put_w64 RET .put_w128: add srcq, 16*8 add dstq, 16*8 .put_w128_loop: movu m0, [srcq-16*8] movu m1, [srcq-16*7] movu m2, [srcq-16*6] movu m3, [srcq-16*5] mova [dstq-16*8], m0 mova [dstq-16*7], m1 mova [dstq-16*6], m2 mova [dstq-16*5], m3 movu m0, [srcq-16*4] movu m1, [srcq-16*3] movu m2, [srcq-16*2] movu m3, [srcq-16*1] mova [dstq-16*4], m0 mova [dstq-16*3], m1 mova [dstq-16*2], m2 mova [dstq-16*1], m3 movu m0, [srcq+16*0] movu m1, [srcq+16*1] movu m2, [srcq+16*2] movu m3, [srcq+16*3] mova [dstq+16*0], m0 mova 
[dstq+16*1], m1 mova [dstq+16*2], m2 mova [dstq+16*3], m3 movu m0, [srcq+16*4] movu m1, [srcq+16*5] movu m2, [srcq+16*6] movu m3, [srcq+16*7] add srcq, ssq mova [dstq+16*4], m0 mova [dstq+16*5], m1 mova [dstq+16*6], m2 mova [dstq+16*7], m3 add dstq, dsq dec hd jg .put_w128_loop RET .h: movd m5, mxyd mov mxyd, r7m ; my mova m4, [base+pw_16] pshufb m5, [base+pw_256] psubw m4, m5 test mxyd, mxyd jnz .hv ; 12-bit is rounded twice so we can't use the same pmulhrsw approach as .v mov r6d, r8m ; bitdepth_max shr r6d, 11 movddup m3, [base+put_bilin_h_rnd+r6*8] movifnidn hd, hm sub wd, 8 jg .h_w16 je .h_w8 cmp wd, -4 je .h_w4 .h_w2: movq m1, [srcq+ssq*0] movhps m1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] pmullw m0, m4, m1 psrlq m1, 16 pmullw m1, m5 paddw m0, m3 paddw m0, m1 psrlw m0, 4 movd [dstq+dsq*0], m0 punpckhqdq m0, m0 movd [dstq+dsq*1], m0 lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w2 RET .h_w4: movq m0, [srcq+ssq*0] movhps m0, [srcq+ssq*1] movq m1, [srcq+ssq*0+2] movhps m1, [srcq+ssq*1+2] lea srcq, [srcq+ssq*2] pmullw m0, m4 pmullw m1, m5 paddw m0, m3 paddw m0, m1 psrlw m0, 4 movq [dstq+dsq*0], m0 movhps [dstq+dsq*1], m0 lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w4 RET .h_w8: movu m0, [srcq+ssq*0] movu m1, [srcq+ssq*0+2] pmullw m0, m4 pmullw m1, m5 paddw m0, m3 paddw m0, m1 movu m1, [srcq+ssq*1] movu m2, [srcq+ssq*1+2] lea srcq, [srcq+ssq*2] pmullw m1, m4 pmullw m2, m5 paddw m1, m3 paddw m1, m2 psrlw m0, 4 psrlw m1, 4 mova [dstq+dsq*0], m0 mova [dstq+dsq*1], m1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w8 RET .h_w16: lea srcq, [srcq+wq*2] lea dstq, [dstq+wq*2] neg wq .h_w16_loop0: mov r6, wq .h_w16_loop: movu m0, [srcq+r6*2+ 0] movu m1, [srcq+r6*2+ 2] pmullw m0, m4 pmullw m1, m5 paddw m0, m3 paddw m0, m1 movu m1, [srcq+r6*2+16] movu m2, [srcq+r6*2+18] pmullw m1, m4 pmullw m2, m5 paddw m1, m3 paddw m1, m2 psrlw m0, 4 psrlw m1, 4 mova [dstq+r6*2+16*0], m0 mova [dstq+r6*2+16*1], m1 add r6, 16 jl .h_w16_loop add srcq, ssq add dstq, dsq dec hd jg .h_w16_loop0 RET .v: shl mxyd, 11 movd m5, mxyd pshufb m5, [base+pw_256] movifnidn hd, hm cmp wd, 4 jg .v_w8 je .v_w4 .v_w2: movd m0, [srcq+ssq*0] .v_w2_loop: movd m1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] punpcklqdq m2, m0, m1 movd m0, [srcq+ssq*0] punpcklqdq m1, m0 psubw m1, m2 pmulhrsw m1, m5 paddw m1, m2 movd [dstq+dsq*0], m1 punpckhqdq m1, m1 movd [dstq+dsq*1], m1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w2_loop RET .v_w4: movq m0, [srcq+ssq*0] .v_w4_loop: movq m1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] punpcklqdq m2, m0, m1 movq m0, [srcq+ssq*0] punpcklqdq m1, m0 psubw m1, m2 pmulhrsw m1, m5 paddw m1, m2 movq [dstq+dsq*0], m1 movhps [dstq+dsq*1], m1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w4_loop RET .v_w8: %if ARCH_X86_64 %if WIN64 push r7 %endif shl wd, 5 mov r7, srcq lea r6d, [wq+hq-256] mov r4, dstq %else mov r6, srcq %endif .v_w8_loop0: movu m0, [srcq+ssq*0] .v_w8_loop: movu m3, [srcq+ssq*1] lea srcq, [srcq+ssq*2] psubw m1, m3, m0 pmulhrsw m1, m5 paddw m1, m0 movu m0, [srcq+ssq*0] psubw m2, m0, m3 pmulhrsw m2, m5 paddw m2, m3 mova [dstq+dsq*0], m1 mova [dstq+dsq*1], m2 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w8_loop %if ARCH_X86_64 add r7, 16 add r4, 16 movzx hd, r6b mov srcq, r7 mov dstq, r4 sub r6d, 1<<8 %else mov dstq, dstmp add r6, 16 mov hd, hm add dstq, 16 mov srcq, r6 mov dstmp, dstq sub wd, 8 %endif jg .v_w8_loop0 %if WIN64 pop r7 %endif RET .hv: WIN64_SPILL_XMM 8 shl mxyd, 11 mova m3, [base+pw_2] movd m6, mxyd mova m7, [base+pw_8192] pshufb m6, [base+pw_256] test dword r8m, 0x800 jnz .hv_12bpc psllw m4, 2 psllw m5, 2 mova m7, [base+pw_2048] .hv_12bpc: 
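; .hv outline (as implied by the constants selected above): the horizontal pass
; computes ((16-mx)*px[x] + mx*px[x+1] + 2) >> 2, with the weights pre-shifted
; left by 2 for 10-bit so the intermediate keeps 4 extra fraction bits (2 bits
; for 12-bit, which is why m7 stays pw_8192 there); the vertical pass then adds
; diff*my/16 via the doubled-difference pmulhw(my<<11) trick, and the final
; pmulhrsw with pw_2048 (10-bit) or pw_8192 (12-bit) performs the remaining
; rounded >>4 or >>2 shift back to pixel scale.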
movifnidn hd, hm cmp wd, 4 jg .hv_w8 je .hv_w4 .hv_w2: movddup m0, [srcq+ssq*0] pshufhw m1, m0, q0321 pmullw m0, m4 pmullw m1, m5 paddw m0, m3 paddw m0, m1 psrlw m0, 2 .hv_w2_loop: movq m2, [srcq+ssq*1] lea srcq, [srcq+ssq*2] movhps m2, [srcq+ssq*0] pmullw m1, m4, m2 psrlq m2, 16 pmullw m2, m5 paddw m1, m3 paddw m1, m2 psrlw m1, 2 ; 1 _ 2 _ shufpd m2, m0, m1, 0x01 ; 0 _ 1 _ mova m0, m1 psubw m1, m2 paddw m1, m1 pmulhw m1, m6 paddw m1, m2 pmulhrsw m1, m7 movd [dstq+dsq*0], m1 punpckhqdq m1, m1 movd [dstq+dsq*1], m1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w2_loop RET .hv_w4: movddup m0, [srcq+ssq*0] movddup m1, [srcq+ssq*0+2] pmullw m0, m4 pmullw m1, m5 paddw m0, m3 paddw m0, m1 psrlw m0, 2 .hv_w4_loop: movq m1, [srcq+ssq*1] movq m2, [srcq+ssq*1+2] lea srcq, [srcq+ssq*2] movhps m1, [srcq+ssq*0] movhps m2, [srcq+ssq*0+2] pmullw m1, m4 pmullw m2, m5 paddw m1, m3 paddw m1, m2 psrlw m1, 2 ; 1 2 shufpd m2, m0, m1, 0x01 ; 0 1 mova m0, m1 psubw m1, m2 paddw m1, m1 pmulhw m1, m6 paddw m1, m2 pmulhrsw m1, m7 movq [dstq+dsq*0], m1 movhps [dstq+dsq*1], m1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w4_loop RET .hv_w8: %if ARCH_X86_64 %if WIN64 push r7 %endif shl wd, 5 lea r6d, [wq+hq-256] mov r4, srcq mov r7, dstq %else mov r6, srcq %endif .hv_w8_loop0: movu m0, [srcq+ssq*0] movu m1, [srcq+ssq*0+2] pmullw m0, m4 pmullw m1, m5 paddw m0, m3 paddw m0, m1 psrlw m0, 2 .hv_w8_loop: movu m1, [srcq+ssq*1] movu m2, [srcq+ssq*1+2] lea srcq, [srcq+ssq*2] pmullw m1, m4 pmullw m2, m5 paddw m1, m3 paddw m1, m2 psrlw m1, 2 psubw m2, m1, m0 paddw m2, m2 pmulhw m2, m6 paddw m2, m0 pmulhrsw m2, m7 mova [dstq+dsq*0], m2 movu m0, [srcq+ssq*0] movu m2, [srcq+ssq*0+2] pmullw m0, m4 pmullw m2, m5 paddw m0, m3 paddw m0, m2 psrlw m0, 2 psubw m2, m0, m1 paddw m2, m2 pmulhw m2, m6 paddw m2, m1 pmulhrsw m2, m7 mova [dstq+dsq*1], m2 lea dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w8_loop %if ARCH_X86_64 add r4, 16 add r7, 16 movzx hd, r6b mov srcq, r4 mov dstq, r7 sub r6d, 1<<8 %else mov dstq, dstmp add r6, 16 mov hd, hm add dstq, 16 mov srcq, r6 mov dstmp, dstq sub wd, 8 %endif jg .hv_w8_loop0 %if WIN64 pop r7 %endif RET cglobal prep_bilin_16bpc, 4, 7, 0, tmp, src, stride, w, h, mxy, stride3 %define base r6-prep_ssse3 movifnidn mxyd, r5m ; mx LEA r6, prep_ssse3 movifnidn hd, hm test mxyd, mxyd jnz .h mov mxyd, r6m ; my test mxyd, mxyd jnz .v .prep: tzcnt wd, wd movzx wd, word [base+prep_ssse3_table+wq*2] mov r5d, r7m ; bitdepth_max mova m5, [base+pw_8192] add wq, r6 shr r5d, 11 movddup m4, [base+prep_mul+r5*8] lea stride3q, [strideq*3] jmp wq .prep_w4: movq m0, [srcq+strideq*0] movhps m0, [srcq+strideq*1] movq m1, [srcq+strideq*2] movhps m1, [srcq+stride3q ] lea srcq, [srcq+strideq*4] pmullw m0, m4 pmullw m1, m4 psubw m0, m5 psubw m1, m5 mova [tmpq+16*0], m0 mova [tmpq+16*1], m1 add tmpq, 16*2 sub hd, 4 jg .prep_w4 RET .prep_w8: movu m0, [srcq+strideq*0] movu m1, [srcq+strideq*1] movu m2, [srcq+strideq*2] movu m3, [srcq+stride3q ] lea srcq, [srcq+strideq*4] REPX {pmullw x, m4}, m0, m1, m2, m3 REPX {psubw x, m5}, m0, m1, m2, m3 mova [tmpq+16*0], m0 mova [tmpq+16*1], m1 mova [tmpq+16*2], m2 mova [tmpq+16*3], m3 add tmpq, 16*4 sub hd, 4 jg .prep_w8 RET .prep_w16: movu m0, [srcq+strideq*0+16*0] movu m1, [srcq+strideq*0+16*1] movu m2, [srcq+strideq*1+16*0] movu m3, [srcq+strideq*1+16*1] lea srcq, [srcq+strideq*2] REPX {pmullw x, m4}, m0, m1, m2, m3 REPX {psubw x, m5}, m0, m1, m2, m3 mova [tmpq+16*0], m0 mova [tmpq+16*1], m1 mova [tmpq+16*2], m2 mova [tmpq+16*3], m3 add tmpq, 16*4 sub hd, 2 jg .prep_w16 RET .prep_w32: movu m0, [srcq+16*0] movu m1, 
[srcq+16*1] movu m2, [srcq+16*2] movu m3, [srcq+16*3] add srcq, strideq REPX {pmullw x, m4}, m0, m1, m2, m3 REPX {psubw x, m5}, m0, m1, m2, m3 mova [tmpq+16*0], m0 mova [tmpq+16*1], m1 mova [tmpq+16*2], m2 mova [tmpq+16*3], m3 add tmpq, 16*4 dec hd jg .prep_w32 RET .prep_w64: movu m0, [srcq+16*0] movu m1, [srcq+16*1] movu m2, [srcq+16*2] movu m3, [srcq+16*3] REPX {pmullw x, m4}, m0, m1, m2, m3 REPX {psubw x, m5}, m0, m1, m2, m3 mova [tmpq+16*0], m0 mova [tmpq+16*1], m1 mova [tmpq+16*2], m2 mova [tmpq+16*3], m3 movu m0, [srcq+16*4] movu m1, [srcq+16*5] movu m2, [srcq+16*6] movu m3, [srcq+16*7] add srcq, strideq REPX {pmullw x, m4}, m0, m1, m2, m3 REPX {psubw x, m5}, m0, m1, m2, m3 mova [tmpq+16*4], m0 mova [tmpq+16*5], m1 mova [tmpq+16*6], m2 mova [tmpq+16*7], m3 add tmpq, 16*8 dec hd jg .prep_w64 RET .prep_w128: movu m0, [srcq+16* 0] movu m1, [srcq+16* 1] movu m2, [srcq+16* 2] movu m3, [srcq+16* 3] REPX {pmullw x, m4}, m0, m1, m2, m3 REPX {psubw x, m5}, m0, m1, m2, m3 mova [tmpq+16*0], m0 mova [tmpq+16*1], m1 mova [tmpq+16*2], m2 mova [tmpq+16*3], m3 movu m0, [srcq+16* 4] movu m1, [srcq+16* 5] movu m2, [srcq+16* 6] movu m3, [srcq+16* 7] REPX {pmullw x, m4}, m0, m1, m2, m3 REPX {psubw x, m5}, m0, m1, m2, m3 mova [tmpq+16*4], m0 mova [tmpq+16*5], m1 mova [tmpq+16*6], m2 mova [tmpq+16*7], m3 movu m0, [srcq+16* 8] movu m1, [srcq+16* 9] movu m2, [srcq+16*10] movu m3, [srcq+16*11] add tmpq, 16*16 REPX {pmullw x, m4}, m0, m1, m2, m3 REPX {psubw x, m5}, m0, m1, m2, m3 mova [tmpq-16*8], m0 mova [tmpq-16*7], m1 mova [tmpq-16*6], m2 mova [tmpq-16*5], m3 movu m0, [srcq+16*12] movu m1, [srcq+16*13] movu m2, [srcq+16*14] movu m3, [srcq+16*15] add srcq, strideq REPX {pmullw x, m4}, m0, m1, m2, m3 REPX {psubw x, m5}, m0, m1, m2, m3 mova [tmpq-16*4], m0 mova [tmpq-16*3], m1 mova [tmpq-16*2], m2 mova [tmpq-16*1], m3 dec hd jg .prep_w128 RET .h: movd m4, mxyd mov mxyd, r6m ; my mova m3, [base+pw_16] pshufb m4, [base+pw_256] mova m5, [base+pw_32766] psubw m3, m4 test dword r7m, 0x800 jnz .h_12bpc psllw m3, 2 psllw m4, 2 .h_12bpc: test mxyd, mxyd jnz .hv sub wd, 8 je .h_w8 jg .h_w16 .h_w4: movq m0, [srcq+strideq*0] movhps m0, [srcq+strideq*1] movq m1, [srcq+strideq*0+2] movhps m1, [srcq+strideq*1+2] lea srcq, [srcq+strideq*2] pmullw m0, m3 pmullw m1, m4 psubw m0, m5 paddw m0, m1 psraw m0, 2 mova [tmpq], m0 add tmpq, 16 sub hd, 2 jg .h_w4 RET .h_w8: movu m0, [srcq+strideq*0] movu m1, [srcq+strideq*0+2] pmullw m0, m3 pmullw m1, m4 psubw m0, m5 paddw m0, m1 movu m1, [srcq+strideq*1] movu m2, [srcq+strideq*1+2] lea srcq, [srcq+strideq*2] pmullw m1, m3 pmullw m2, m4 psubw m1, m5 paddw m1, m2 psraw m0, 2 psraw m1, 2 mova [tmpq+16*0], m0 mova [tmpq+16*1], m1 add tmpq, 16*2 sub hd, 2 jg .h_w8 RET .h_w16: lea srcq, [srcq+wq*2] neg wq .h_w16_loop0: mov r6, wq .h_w16_loop: movu m0, [srcq+r6*2+ 0] movu m1, [srcq+r6*2+ 2] pmullw m0, m3 pmullw m1, m4 psubw m0, m5 paddw m0, m1 movu m1, [srcq+r6*2+16] movu m2, [srcq+r6*2+18] pmullw m1, m3 pmullw m2, m4 psubw m1, m5 paddw m1, m2 psraw m0, 2 psraw m1, 2 mova [tmpq+16*0], m0 mova [tmpq+16*1], m1 add tmpq, 16*2 add r6, 16 jl .h_w16_loop add srcq, strideq dec hd jg .h_w16_loop0 RET .v: movd m4, mxyd mova m3, [base+pw_16] pshufb m4, [base+pw_256] mova m5, [base+pw_32766] psubw m3, m4 test dword r7m, 0x800 jnz .v_12bpc psllw m3, 2 psllw m4, 2 .v_12bpc: cmp wd, 8 je .v_w8 jg .v_w16 .v_w4: movq m0, [srcq+strideq*0] .v_w4_loop: movq m2, [srcq+strideq*1] lea srcq, [srcq+strideq*2] punpcklqdq m1, m0, m2 ; 0 1 movq m0, [srcq+strideq*0] punpcklqdq m2, m0 ; 1 2 pmullw m1, m3 pmullw m2, m4 
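; note on pw_32766 (m5) below: 32766 = 4*8192 - 2, so with the weights w0+w1
; summing to 64 (10-bit; 16 for 12-bit) the wrapped 16-bit value
; (w0*a + w1*b - 32766) >> 2 equals ((w0*a + w1*b + 2) >> 2) - 8192, i.e. the
; rounded prep-scale interpolation minus the 8192 intermediate bias; the
; subtraction also pulls the (possibly >0x7fff) weighted sum back into signed
; range before the arithmetic shift.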
psubw m1, m5 paddw m1, m2 psraw m1, 2 mova [tmpq], m1 add tmpq, 16 sub hd, 2 jg .v_w4_loop RET .v_w8: movu m0, [srcq+strideq*0] .v_w8_loop: movu m2, [srcq+strideq*1] lea srcq, [srcq+strideq*2] pmullw m0, m3 pmullw m1, m4, m2 psubw m0, m5 paddw m1, m0 movu m0, [srcq+strideq*0] psraw m1, 2 pmullw m2, m3 mova [tmpq+16*0], m1 pmullw m1, m4, m0 psubw m2, m5 paddw m1, m2 psraw m1, 2 mova [tmpq+16*1], m1 add tmpq, 16*2 sub hd, 2 jg .v_w8_loop RET .v_w16: %if WIN64 push r7 %endif mov r5, srcq %if ARCH_X86_64 lea r6d, [wq*4-32] mov wd, wd lea r6d, [hq+r6*8] mov r7, tmpq %else mov r6d, wd %endif .v_w16_loop0: movu m0, [srcq+strideq*0] .v_w16_loop: movu m2, [srcq+strideq*1] lea srcq, [srcq+strideq*2] pmullw m0, m3 pmullw m1, m4, m2 psubw m0, m5 paddw m1, m0 movu m0, [srcq+strideq*0] psraw m1, 2 pmullw m2, m3 mova [tmpq+wq*0], m1 pmullw m1, m4, m0 psubw m2, m5 paddw m1, m2 psraw m1, 2 mova [tmpq+wq*2], m1 lea tmpq, [tmpq+wq*4] sub hd, 2 jg .v_w16_loop %if ARCH_X86_64 add r5, 16 add r7, 16 movzx hd, r6b mov srcq, r5 mov tmpq, r7 sub r6d, 1<<8 %else mov tmpq, tmpmp add r5, 16 mov hd, hm add tmpq, 16 mov srcq, r5 mov tmpmp, tmpq sub r6d, 8 %endif jg .v_w16_loop0 %if WIN64 pop r7 %endif RET .hv: WIN64_SPILL_XMM 7 shl mxyd, 11 movd m6, mxyd pshufb m6, [base+pw_256] cmp wd, 8 je .hv_w8 jg .hv_w16 .hv_w4: movddup m0, [srcq+strideq*0] movddup m1, [srcq+strideq*0+2] pmullw m0, m3 pmullw m1, m4 psubw m0, m5 paddw m0, m1 psraw m0, 2 .hv_w4_loop: movq m1, [srcq+strideq*1] movq m2, [srcq+strideq*1+2] lea srcq, [srcq+strideq*2] movhps m1, [srcq+strideq*0] movhps m2, [srcq+strideq*0+2] pmullw m1, m3 pmullw m2, m4 psubw m1, m5 paddw m1, m2 psraw m1, 2 ; 1 2 shufpd m2, m0, m1, 0x01 ; 0 1 mova m0, m1 psubw m1, m2 pmulhrsw m1, m6 paddw m1, m2 mova [tmpq], m1 add tmpq, 16 sub hd, 2 jg .hv_w4_loop RET .hv_w8: movu m0, [srcq+strideq*0] movu m1, [srcq+strideq*0+2] pmullw m0, m3 pmullw m1, m4 psubw m0, m5 paddw m0, m1 psraw m0, 2 .hv_w8_loop: movu m1, [srcq+strideq*1] movu m2, [srcq+strideq*1+2] lea srcq, [srcq+strideq*2] pmullw m1, m3 pmullw m2, m4 psubw m1, m5 paddw m1, m2 psraw m1, 2 psubw m2, m1, m0 pmulhrsw m2, m6 paddw m2, m0 mova [tmpq+16*0], m2 movu m0, [srcq+strideq*0] movu m2, [srcq+strideq*0+2] pmullw m0, m3 pmullw m2, m4 psubw m0, m5 paddw m0, m2 psraw m0, 2 psubw m2, m0, m1 pmulhrsw m2, m6 paddw m2, m1 mova [tmpq+16*1], m2 add tmpq, 16*2 sub hd, 2 jg .hv_w8_loop RET .hv_w16: %if WIN64 push r7 %endif mov r5, srcq %if ARCH_X86_64 lea r6d, [wq*4-32] mov wd, wd lea r6d, [hq+r6*8] mov r7, tmpq %else mov r6d, wd %endif .hv_w16_loop0: movu m0, [srcq+strideq*0] movu m1, [srcq+strideq*0+2] pmullw m0, m3 pmullw m1, m4 psubw m0, m5 paddw m0, m1 psraw m0, 2 .hv_w16_loop: movu m1, [srcq+strideq*1] movu m2, [srcq+strideq*1+2] lea srcq, [srcq+strideq*2] pmullw m1, m3 pmullw m2, m4 psubw m1, m5 paddw m1, m2 psraw m1, 2 psubw m2, m1, m0 pmulhrsw m2, m6 paddw m2, m0 mova [tmpq+wq*0], m2 movu m0, [srcq+strideq*0] movu m2, [srcq+strideq*0+2] pmullw m0, m3 pmullw m2, m4 psubw m0, m5 paddw m0, m2 psraw m0, 2 psubw m2, m0, m1 pmulhrsw m2, m6 paddw m2, m1 mova [tmpq+wq*2], m2 lea tmpq, [tmpq+wq*4] sub hd, 2 jg .hv_w16_loop %if ARCH_X86_64 add r5, 16 add r7, 16 movzx hd, r6b mov srcq, r5 mov tmpq, r7 sub r6d, 1<<8 %else mov tmpq, tmpmp add r5, 16 mov hd, hm add tmpq, 16 mov srcq, r5 mov tmpmp, tmpq sub r6d, 8 %endif jg .hv_w16_loop0 %if WIN64 pop r7 %endif RET ; int8_t subpel_filters[5][15][8] %assign FILTER_REGULAR (0*15 << 16) | 3*15 %assign FILTER_SMOOTH (1*15 << 16) | 4*15 %assign FILTER_SHARP (2*15 << 16) | 3*15 %macro FN 4 ; 
prefix, type, type_h, type_v cglobal %1_%2_16bpc mov t0d, FILTER_%3 %ifidn %3, %4 mov t1d, t0d %else mov t1d, FILTER_%4 %endif %ifnidn %2, regular ; skip the jump in the last filter jmp mangle(private_prefix %+ _%1_16bpc %+ SUFFIX) %endif %endmacro %if ARCH_X86_32 DECLARE_REG_TMP 1, 2, 6 %elif WIN64 DECLARE_REG_TMP 4, 5, 8 %else DECLARE_REG_TMP 7, 8, 8 %endif %define PUT_8TAP_FN FN put_8tap, PUT_8TAP_FN sharp, SHARP, SHARP PUT_8TAP_FN sharp_smooth, SHARP, SMOOTH PUT_8TAP_FN smooth_sharp, SMOOTH, SHARP PUT_8TAP_FN smooth, SMOOTH, SMOOTH PUT_8TAP_FN sharp_regular, SHARP, REGULAR PUT_8TAP_FN regular_sharp, REGULAR, SHARP PUT_8TAP_FN smooth_regular, SMOOTH, REGULAR PUT_8TAP_FN regular_smooth, REGULAR, SMOOTH PUT_8TAP_FN regular, REGULAR, REGULAR %if ARCH_X86_32 cglobal put_8tap_16bpc, 0, 7, 8, dst, ds, src, ss, w, h, mx, my %define mxb r0b %define mxd r0 %define mxq r0 %define myb r1b %define myd r1 %define myq r1 %define m8 [esp+16*0] %define m9 [esp+16*1] %define m10 [esp+16*2] %define m11 [esp+16*3] %define m12 [esp+16*4] %define m13 [esp+16*5] %define m14 [esp+16*6] %define m15 [esp+16*7] %else cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my %endif %define base t2-put_ssse3 imul mxd, mxm, 0x010101 add mxd, t0d ; 8tap_h, mx, 4tap_h imul myd, mym, 0x010101 add myd, t1d ; 8tap_v, my, 4tap_v LEA t2, put_ssse3 movifnidn wd, wm movifnidn srcq, srcmp movifnidn ssq, ssmp movifnidn hd, hm test mxd, 0xf00 jnz .h test myd, 0xf00 jnz .v tzcnt wd, wd movzx wd, word [base+put_ssse3_table+wq*2] movifnidn dstq, dstmp movifnidn dsq, dsmp add wq, t2 %if WIN64 pop r8 pop r7 %endif jmp wq .h: test myd, 0xf00 jnz .hv mov myd, r8m movd m5, r8m shr myd, 11 movddup m4, [base+put_8tap_h_rnd+myq*8] movifnidn dsq, dsmp pshufb m5, [base+pw_256] cmp wd, 4 jg .h_w8 movzx mxd, mxb lea srcq, [srcq-2] movq m3, [base+subpel_filters+mxq*8] movifnidn dstq, dstmp punpcklbw m3, m3 psraw m3, 8 ; sign-extend je .h_w4 .h_w2: mova m2, [base+spel_h_shuf2] pshufd m3, m3, q2121 .h_w2_loop: movu m0, [srcq+ssq*0] movu m1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] pshufb m0, m2 pshufb m1, m2 pmaddwd m0, m3 pmaddwd m1, m3 phaddd m0, m1 paddd m0, m4 psrad m0, 6 packssdw m0, m0 pxor m1, m1 pminsw m0, m5 pmaxsw m0, m1 movd [dstq+dsq*0], m0 pshuflw m0, m0, q3232 movd [dstq+dsq*1], m0 lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w2_loop RET .h_w4: WIN64_SPILL_XMM 8 mova m6, [base+spel_h_shufA] mova m7, [base+spel_h_shufB] pshufd m2, m3, q1111 pshufd m3, m3, q2222 .h_w4_loop: movu m1, [srcq] add srcq, ssq pshufb m0, m1, m6 ; 0 1 1 2 2 3 3 4 pshufb m1, m7 ; 2 3 3 4 4 5 5 6 pmaddwd m0, m2 pmaddwd m1, m3 paddd m0, m4 paddd m0, m1 psrad m0, 6 packssdw m0, m0 pxor m1, m1 pminsw m0, m5 pmaxsw m0, m1 movq [dstq], m0 add dstq, dsq dec hd jg .h_w4_loop RET .h_w8: %if WIN64 %assign stack_offset stack_offset - stack_size_padded WIN64_SPILL_XMM 12 %endif shr mxd, 16 movq m3, [base+subpel_filters+mxq*8] movifnidn dstq, dstmp mova m6, [base+spel_h_shufA] mova m7, [base+spel_h_shufB] %if UNIX64 mov wd, wd %endif lea srcq, [srcq+wq*2] punpcklbw m3, m3 lea dstq, [dstq+wq*2] psraw m3, 8 neg wq %if ARCH_X86_32 ALLOC_STACK -16*4 pshufd m0, m3, q0000 pshufd m1, m3, q1111 pshufd m2, m3, q2222 pshufd m3, m3, q3333 mova m8, m0 mova m9, m1 mova m10, m2 mova m11, m3 %else pshufd m8, m3, q0000 pshufd m9, m3, q1111 pshufd m10, m3, q2222 pshufd m11, m3, q3333 %endif .h_w8_loop0: mov r6, wq .h_w8_loop: movu m0, [srcq+r6*2- 6] movu m1, [srcq+r6*2+ 2] pshufb m2, m0, m6 ; 0 1 1 2 2 3 3 4 pshufb m0, m7 ; 2 3 3 4 4 5 5 6 pmaddwd m2, m8 ; abcd0 pmaddwd m0, m9 ; abcd1 pshufb 
m3, m1, m6 ; 4 5 5 6 6 7 7 8 pshufb m1, m7 ; 6 7 7 8 8 9 9 a paddd m2, m4 paddd m0, m2 pmaddwd m2, m10, m3 ; abcd2 pmaddwd m3, m8 ; efgh0 paddd m0, m2 pmaddwd m2, m11, m1 ; abcd3 pmaddwd m1, m9 ; efgh1 paddd m0, m2 movu m2, [srcq+r6*2+10] paddd m3, m4 paddd m1, m3 pshufb m3, m2, m6 ; 8 9 9 a a b b c pshufb m2, m7 ; a b b c c d d e pmaddwd m3, m10 ; efgh2 pmaddwd m2, m11 ; efgh3 paddd m1, m3 paddd m1, m2 psrad m0, 6 psrad m1, 6 packssdw m0, m1 pxor m1, m1 pminsw m0, m5 pmaxsw m0, m1 mova [dstq+r6*2], m0 add r6, 8 jl .h_w8_loop add srcq, ssq add dstq, dsq dec hd jg .h_w8_loop0 RET .v: movzx mxd, myb shr myd, 16 cmp hd, 6 cmovb myd, mxd movq m3, [base+subpel_filters+myq*8] %if STACK_ALIGNMENT < 16 %xdefine rstk rsp %else %assign stack_offset stack_offset - stack_size_padded %endif %if WIN64 WIN64_SPILL_XMM 15 %endif movd m7, r8m movifnidn dstq, dstmp movifnidn dsq, dsmp punpcklbw m3, m3 pshufb m7, [base+pw_256] psraw m3, 8 ; sign-extend %if ARCH_X86_32 ALLOC_STACK -16*7 pshufd m0, m3, q0000 pshufd m1, m3, q1111 pshufd m2, m3, q2222 pshufd m3, m3, q3333 mova m8, m0 mova m9, m1 mova m10, m2 mova m11, m3 %else pshufd m8, m3, q0000 pshufd m9, m3, q1111 pshufd m10, m3, q2222 pshufd m11, m3, q3333 %endif lea r6, [ssq*3] sub srcq, r6 cmp wd, 2 jne .v_w4 .v_w2: movd m1, [srcq+ssq*0] movd m4, [srcq+ssq*1] movd m2, [srcq+ssq*2] add srcq, r6 movd m5, [srcq+ssq*0] movd m3, [srcq+ssq*1] movd m6, [srcq+ssq*2] add srcq, r6 movd m0, [srcq+ssq*0] punpckldq m1, m4 ; 0 1 punpckldq m4, m2 ; 1 2 punpckldq m2, m5 ; 2 3 punpckldq m5, m3 ; 3 4 punpckldq m3, m6 ; 4 5 punpckldq m6, m0 ; 5 6 punpcklwd m1, m4 ; 01 12 punpcklwd m2, m5 ; 23 34 punpcklwd m3, m6 ; 45 56 pxor m6, m6 .v_w2_loop: movd m4, [srcq+ssq*1] lea srcq, [srcq+ssq*2] pmaddwd m5, m8, m1 ; a0 b0 mova m1, m2 pmaddwd m2, m9 ; a1 b1 paddd m5, m2 mova m2, m3 pmaddwd m3, m10 ; a2 b2 paddd m5, m3 punpckldq m3, m0, m4 ; 6 7 movd m0, [srcq+ssq*0] punpckldq m4, m0 ; 7 8 punpcklwd m3, m4 ; 67 78 pmaddwd m4, m11, m3 ; a3 b3 paddd m5, m4 psrad m5, 5 packssdw m5, m5 pmaxsw m5, m6 pavgw m5, m6 pminsw m5, m7 movd [dstq+dsq*0], m5 pshuflw m5, m5, q3232 movd [dstq+dsq*1], m5 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w2_loop RET .v_w4: %if ARCH_X86_32 shl wd, 14 %if STACK_ALIGNMENT < 16 mov [esp+4*29], srcq mov [esp+4*30], dstq %else mov srcmp, srcq %endif lea wd, [wq+hq-(1<<16)] %else shl wd, 6 mov r7, srcq mov r8, dstq lea wd, [wq+hq-(1<<8)] %endif .v_w4_loop0: movq m1, [srcq+ssq*0] movq m2, [srcq+ssq*1] movq m3, [srcq+ssq*2] add srcq, r6 movq m4, [srcq+ssq*0] movq m5, [srcq+ssq*1] movq m6, [srcq+ssq*2] add srcq, r6 movq m0, [srcq+ssq*0] punpcklwd m1, m2 ; 01 punpcklwd m2, m3 ; 12 punpcklwd m3, m4 ; 23 punpcklwd m4, m5 ; 34 punpcklwd m5, m6 ; 45 punpcklwd m6, m0 ; 56 %if ARCH_X86_32 jmp .v_w4_loop_start .v_w4_loop: mova m1, m12 mova m2, m13 mova m3, m14 .v_w4_loop_start: pmaddwd m1, m8 ; a0 pmaddwd m2, m8 ; b0 mova m12, m3 mova m13, m4 pmaddwd m3, m9 ; a1 pmaddwd m4, m9 ; b1 paddd m1, m3 paddd m2, m4 mova m14, m5 mova m4, m6 pmaddwd m5, m10 ; a2 pmaddwd m6, m10 ; b2 paddd m1, m5 paddd m2, m6 movq m6, [srcq+ssq*1] lea srcq, [srcq+ssq*2] punpcklwd m5, m0, m6 ; 67 movq m0, [srcq+ssq*0] pmaddwd m3, m11, m5 ; a3 punpcklwd m6, m0 ; 78 paddd m1, m3 pmaddwd m3, m11, m6 ; b3 paddd m2, m3 psrad m1, 5 psrad m2, 5 packssdw m1, m2 pxor m2, m2 pmaxsw m1, m2 pavgw m1, m2 pminsw m1, m7 movq [dstq+dsq*0], m1 movhps [dstq+dsq*1], m1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w4_loop %if STACK_ALIGNMENT < 16 mov srcq, [esp+4*29] mov dstq, [esp+4*30] movzx hd, ww add srcq, 8 add dstq, 8 mov 
[esp+4*29], srcq mov [esp+4*30], dstq %else mov srcq, srcmp mov dstq, dstmp movzx hd, ww add srcq, 8 add dstq, 8 mov srcmp, srcq mov dstmp, dstq %endif sub wd, 1<<16 %else .v_w4_loop: pmaddwd m12, m8, m1 ; a0 pmaddwd m13, m8, m2 ; b0 mova m1, m3 mova m2, m4 pmaddwd m3, m9 ; a1 pmaddwd m4, m9 ; b1 paddd m12, m3 paddd m13, m4 mova m3, m5 mova m4, m6 pmaddwd m5, m10 ; a2 pmaddwd m6, m10 ; b2 paddd m12, m5 paddd m13, m6 movq m6, [srcq+ssq*1] lea srcq, [srcq+ssq*2] punpcklwd m5, m0, m6 ; 67 movq m0, [srcq+ssq*0] pmaddwd m14, m11, m5 ; a3 punpcklwd m6, m0 ; 78 paddd m12, m14 pmaddwd m14, m11, m6 ; b3 paddd m13, m14 psrad m12, 5 psrad m13, 5 packssdw m12, m13 pxor m13, m13 pmaxsw m12, m13 pavgw m12, m13 pminsw m12, m7 movq [dstq+dsq*0], m12 movhps [dstq+dsq*1], m12 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w4_loop add r7, 8 add r8, 8 movzx hd, wb mov srcq, r7 mov dstq, r8 sub wd, 1<<8 %endif jg .v_w4_loop0 RET .hv: %if STACK_ALIGNMENT < 16 %xdefine rstk rsp %else %assign stack_offset stack_offset - stack_size_padded %endif %if ARCH_X86_32 movd m4, r8m mova m6, [base+pd_512] pshufb m4, [base+pw_256] %else %if WIN64 ALLOC_STACK 16*6, 16 %endif movd m15, r8m pshufb m15, [base+pw_256] %endif cmp wd, 4 jg .hv_w8 movzx mxd, mxb je .hv_w4 movq m0, [base+subpel_filters+mxq*8] movzx mxd, myb shr myd, 16 cmp hd, 6 cmovb myd, mxd movq m3, [base+subpel_filters+myq*8] %if ARCH_X86_32 mov dstq, dstmp mov dsq, dsmp mova m5, [base+spel_h_shuf2] ALLOC_STACK -16*8 %else mova m6, [base+pd_512] mova m9, [base+spel_h_shuf2] %endif pshuflw m0, m0, q2121 pxor m7, m7 punpcklbw m7, m0 punpcklbw m3, m3 psraw m3, 8 ; sign-extend test dword r8m, 0x800 jz .hv_w2_10bpc psraw m7, 2 psllw m3, 2 .hv_w2_10bpc: lea r6, [ssq*3] sub srcq, 2 sub srcq, r6 %if ARCH_X86_32 pshufd m0, m3, q0000 pshufd m1, m3, q1111 pshufd m2, m3, q2222 pshufd m3, m3, q3333 mova m9, m5 mova m11, m0 mova m12, m1 mova m13, m2 mova m14, m3 mova m15, m4 %else pshufd m11, m3, q0000 pshufd m12, m3, q1111 pshufd m13, m3, q2222 pshufd m14, m3, q3333 %endif movu m2, [srcq+ssq*0] movu m3, [srcq+ssq*1] movu m1, [srcq+ssq*2] add srcq, r6 movu m4, [srcq+ssq*0] %if ARCH_X86_32 REPX {pshufb x, m5}, m2, m3, m1, m4 %else REPX {pshufb x, m9}, m2, m3, m1, m4 %endif REPX {pmaddwd x, m7}, m2, m3, m1, m4 phaddd m2, m3 ; 0 1 phaddd m1, m4 ; 2 3 movu m3, [srcq+ssq*1] movu m4, [srcq+ssq*2] add srcq, r6 movu m0, [srcq+ssq*0] %if ARCH_X86_32 REPX {pshufb x, m5}, m3, m4, m0 %else REPX {pshufb x, m9}, m3, m4, m0 %endif REPX {pmaddwd x, m7}, m3, m4, m0 phaddd m3, m4 ; 4 5 phaddd m0, m0 ; 6 6 REPX {paddd x, m6}, m2, m1, m3, m0 REPX {psrad x, 10}, m2, m1, m3, m0 packssdw m2, m1 ; 0 1 2 3 packssdw m3, m0 ; 4 5 6 _ palignr m4, m3, m2, 4 ; 1 2 3 4 pshufd m5, m3, q0321 ; 5 6 _ _ punpcklwd m1, m2, m4 ; 01 12 punpckhwd m2, m4 ; 23 34 punpcklwd m3, m5 ; 45 56 .hv_w2_loop: movu m4, [srcq+ssq*1] lea srcq, [srcq+ssq*2] movu m5, [srcq+ssq*0] pshufb m4, m9 pshufb m5, m9 pmaddwd m4, m7 pmaddwd m5, m7 phaddd m4, m5 pmaddwd m5, m11, m1 ; a0 b0 mova m1, m2 pmaddwd m2, m12 ; a1 b1 paddd m5, m2 mova m2, m3 pmaddwd m3, m13 ; a2 b2 paddd m5, m3 paddd m4, m6 psrad m4, 10 ; 7 8 packssdw m0, m4 pshufd m3, m0, q2103 punpckhwd m3, m0 ; 67 78 mova m0, m4 pmaddwd m4, m14, m3 ; a3 b3 paddd m5, m6 paddd m5, m4 psrad m5, 10 packssdw m5, m5 pxor m4, m4 pminsw m5, m15 pmaxsw m5, m4 movd [dstq+dsq*0], m5 pshuflw m5, m5, q3232 movd [dstq+dsq*1], m5 lea dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w2_loop RET .hv_w8: shr mxd, 16 .hv_w4: movq m2, [base+subpel_filters+mxq*8] movzx mxd, myb shr myd, 16 cmp hd, 6 cmovb myd, mxd movq m3, 
[base+subpel_filters+myq*8] %if ARCH_X86_32 %if STACK_ALIGNMENT < 16 %xdefine rstk rsp %else %assign stack_offset stack_offset - stack_size_padded %endif mov dstq, dstmp mov dsq, dsmp mova m0, [base+spel_h_shufA] mova m1, [base+spel_h_shufB] ALLOC_STACK -16*15 mova m8, m0 mova m9, m1 mova m14, m6 %else mova m8, [base+spel_h_shufA] mova m9, [base+spel_h_shufB] %endif pxor m0, m0 punpcklbw m0, m2 punpcklbw m3, m3 psraw m3, 8 test dword r8m, 0x800 jz .hv_w4_10bpc psraw m0, 2 psllw m3, 2 .hv_w4_10bpc: lea r6, [ssq*3] sub srcq, 6 sub srcq, r6 %if ARCH_X86_32 %define tmp esp+16*8 shl wd, 14 %if STACK_ALIGNMENT < 16 mov [esp+4*61], srcq mov [esp+4*62], dstq %else mov srcmp, srcq %endif mova [tmp+16*5], m4 lea wd, [wq+hq-(1<<16)] pshufd m1, m0, q0000 pshufd m2, m0, q1111 pshufd m5, m0, q2222 pshufd m0, m0, q3333 mova m10, m1 mova m11, m2 mova m12, m5 mova m13, m0 %else %if WIN64 %define tmp rsp %else %define tmp rsp-104 ; red zone %endif shl wd, 6 mov r7, srcq mov r8, dstq lea wd, [wq+hq-(1<<8)] pshufd m10, m0, q0000 pshufd m11, m0, q1111 pshufd m12, m0, q2222 pshufd m13, m0, q3333 mova [tmp+16*5], m15 %endif pshufd m0, m3, q0000 pshufd m1, m3, q1111 pshufd m2, m3, q2222 pshufd m3, m3, q3333 mova [tmp+16*1], m0 mova [tmp+16*2], m1 mova [tmp+16*3], m2 mova [tmp+16*4], m3 %macro PUT_8TAP_HV_H 4-5 m14 ; dst/src+0, src+8, tmp, shift, [pd_512] pshufb m%3, m%1, m8 ; 0 1 1 2 2 3 3 4 pshufb m%1, m9 ; 2 3 3 4 4 5 5 6 pmaddwd m%3, m10 pmaddwd m%1, m11 paddd m%3, %5 paddd m%1, m%3 pshufb m%3, m%2, m8 ; 4 5 5 6 6 7 7 8 pshufb m%2, m9 ; 6 7 7 8 8 9 9 a pmaddwd m%3, m12 pmaddwd m%2, m13 paddd m%1, m%3 paddd m%1, m%2 psrad m%1, %4 %endmacro .hv_w4_loop0: %if ARCH_X86_64 mova m14, [pd_512] %endif movu m4, [srcq+ssq*0+0] movu m1, [srcq+ssq*0+8] movu m5, [srcq+ssq*1+0] movu m2, [srcq+ssq*1+8] movu m6, [srcq+ssq*2+0] movu m3, [srcq+ssq*2+8] add srcq, r6 PUT_8TAP_HV_H 4, 1, 0, 10 PUT_8TAP_HV_H 5, 2, 0, 10 PUT_8TAP_HV_H 6, 3, 0, 10 movu m7, [srcq+ssq*0+0] movu m2, [srcq+ssq*0+8] movu m1, [srcq+ssq*1+0] movu m3, [srcq+ssq*1+8] PUT_8TAP_HV_H 7, 2, 0, 10 PUT_8TAP_HV_H 1, 3, 0, 10 movu m2, [srcq+ssq*2+0] movu m3, [srcq+ssq*2+8] add srcq, r6 PUT_8TAP_HV_H 2, 3, 0, 10 packssdw m4, m7 ; 0 3 packssdw m5, m1 ; 1 4 movu m0, [srcq+ssq*0+0] movu m1, [srcq+ssq*0+8] PUT_8TAP_HV_H 0, 1, 3, 10 packssdw m6, m2 ; 2 5 packssdw m7, m0 ; 3 6 punpcklwd m1, m4, m5 ; 01 punpckhwd m4, m5 ; 34 punpcklwd m2, m5, m6 ; 12 punpckhwd m5, m6 ; 45 punpcklwd m3, m6, m7 ; 23 punpckhwd m6, m7 ; 56 %if ARCH_X86_32 jmp .hv_w4_loop_start .hv_w4_loop: mova m1, [tmp+16*6] mova m2, m15 .hv_w4_loop_start: mova m7, [tmp+16*1] pmaddwd m1, m7 ; a0 pmaddwd m2, m7 ; b0 mova m7, [tmp+16*2] mova [tmp+16*6], m3 pmaddwd m3, m7 ; a1 mova m15, m4 pmaddwd m4, m7 ; b1 mova m7, [tmp+16*3] paddd m1, m3 paddd m2, m4 mova m3, m5 pmaddwd m5, m7 ; a2 mova m4, m6 pmaddwd m6, m7 ; b2 paddd m1, m5 paddd m2, m6 movu m7, [srcq+ssq*1+0] movu m5, [srcq+ssq*1+8] lea srcq, [srcq+ssq*2] PUT_8TAP_HV_H 7, 5, 6, 10 packssdw m0, m7 ; 6 7 mova [tmp+16*0], m0 movu m0, [srcq+ssq*0+0] movu m5, [srcq+ssq*0+8] PUT_8TAP_HV_H 0, 5, 6, 10 mova m6, [tmp+16*0] packssdw m7, m0 ; 7 8 punpcklwd m5, m6, m7 ; 67 punpckhwd m6, m7 ; 78 pmaddwd m7, m5, [tmp+16*4] paddd m1, m7 ; a3 pmaddwd m7, m6, [tmp+16*4] paddd m2, m7 ; b3 psrad m1, 9 psrad m2, 9 packssdw m1, m2 pxor m7, m7 pmaxsw m1, m7 pavgw m7, m1 pminsw m7, [tmp+16*5] movq [dstq+dsq*0], m7 movhps [dstq+dsq*1], m7 lea dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w4_loop %if STACK_ALIGNMENT < 16 mov srcq, [esp+4*61] mov dstq, [esp+4*62] add srcq, 8 add dstq, 8 mov 
[esp+4*61], srcq mov [esp+4*62], dstq %else mov srcq, srcmp mov dstq, dstmp add srcq, 8 add dstq, 8 mov srcmp, srcq mov dstmp, dstq %endif movzx hd, ww sub wd, 1<<16 %else .hv_w4_loop: mova m15, [tmp+16*1] pmaddwd m14, m15, m1 ; a0 pmaddwd m15, m2 ; b0 mova m7, [tmp+16*2] mova m1, m3 pmaddwd m3, m7 ; a1 mova m2, m4 pmaddwd m4, m7 ; b1 mova m7, [tmp+16*3] paddd m14, m3 paddd m15, m4 mova m3, m5 pmaddwd m5, m7 ; a2 mova m4, m6 pmaddwd m6, m7 ; b2 paddd m14, m5 paddd m15, m6 movu m7, [srcq+ssq*1+0] movu m5, [srcq+ssq*1+8] lea srcq, [srcq+ssq*2] PUT_8TAP_HV_H 7, 5, 6, 10, [pd_512] packssdw m0, m7 ; 6 7 mova [tmp+16*0], m0 movu m0, [srcq+ssq*0+0] movu m5, [srcq+ssq*0+8] PUT_8TAP_HV_H 0, 5, 6, 10, [pd_512] mova m6, [tmp+16*0] packssdw m7, m0 ; 7 8 punpcklwd m5, m6, m7 ; 67 punpckhwd m6, m7 ; 78 pmaddwd m7, m5, [tmp+16*4] paddd m14, m7 ; a3 pmaddwd m7, m6, [tmp+16*4] paddd m15, m7 ; b3 psrad m14, 9 psrad m15, 9 packssdw m14, m15 pxor m7, m7 pmaxsw m14, m7 pavgw m7, m14 pminsw m7, [tmp+16*5] movq [dstq+dsq*0], m7 movhps [dstq+dsq*1], m7 lea dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w4_loop add r7, 8 add r8, 8 movzx hd, wb mov srcq, r7 mov dstq, r8 sub wd, 1<<8 %endif jg .hv_w4_loop0 RET %undef tmp %if ARCH_X86_32 DECLARE_REG_TMP 2, 1, 6, 4 %elif WIN64 DECLARE_REG_TMP 6, 4, 7, 4 %else DECLARE_REG_TMP 6, 7, 7, 8 %endif %define PREP_8TAP_FN FN prep_8tap, PREP_8TAP_FN sharp, SHARP, SHARP PREP_8TAP_FN sharp_smooth, SHARP, SMOOTH PREP_8TAP_FN smooth_sharp, SMOOTH, SHARP PREP_8TAP_FN smooth, SMOOTH, SMOOTH PREP_8TAP_FN sharp_regular, SHARP, REGULAR PREP_8TAP_FN regular_sharp, REGULAR, SHARP PREP_8TAP_FN smooth_regular, SMOOTH, REGULAR PREP_8TAP_FN regular_smooth, REGULAR, SMOOTH PREP_8TAP_FN regular, REGULAR, REGULAR %if ARCH_X86_32 cglobal prep_8tap_16bpc, 0, 7, 8, tmp, src, ss, w, h, mx, my %define mxb r0b %define mxd r0 %define mxq r0 %define myb r2b %define myd r2 %define myq r2 %else cglobal prep_8tap_16bpc, 4, 8, 0, tmp, src, ss, w, h, mx, my %endif %define base t2-prep_ssse3 imul mxd, mxm, 0x010101 add mxd, t0d ; 8tap_h, mx, 4tap_h imul myd, mym, 0x010101 add myd, t1d ; 8tap_v, my, 4tap_v LEA t2, prep_ssse3 movifnidn wd, wm movifnidn srcq, srcmp test mxd, 0xf00 jnz .h movifnidn hd, hm test myd, 0xf00 jnz .v tzcnt wd, wd mov myd, r7m ; bitdepth_max movzx wd, word [base+prep_ssse3_table+wq*2] mova m5, [base+pw_8192] shr myd, 11 add wq, t2 movddup m4, [base+prep_mul+myq*8] movifnidn ssq, ssmp movifnidn tmpq, tmpmp lea r6, [ssq*3] %if WIN64 pop r7 %endif jmp wq .h: test myd, 0xf00 jnz .hv movifnidn ssq, r2mp movifnidn hd, r4m movddup m5, [base+prep_8tap_1d_rnd] cmp wd, 4 jne .h_w8 movzx mxd, mxb movq m0, [base+subpel_filters+mxq*8] mova m3, [base+spel_h_shufA] mova m4, [base+spel_h_shufB] movifnidn tmpq, tmpmp sub srcq, 2 WIN64_SPILL_XMM 8 punpcklbw m0, m0 psraw m0, 8 test dword r7m, 0x800 jnz .h_w4_12bpc psllw m0, 2 .h_w4_12bpc: pshufd m6, m0, q1111 pshufd m7, m0, q2222 .h_w4_loop: movu m1, [srcq+ssq*0] movu m2, [srcq+ssq*1] lea srcq, [srcq+ssq*2] pshufb m0, m1, m3 ; 0 1 1 2 2 3 3 4 pshufb m1, m4 ; 2 3 3 4 4 5 5 6 pmaddwd m0, m6 pmaddwd m1, m7 paddd m0, m5 paddd m0, m1 pshufb m1, m2, m3 pshufb m2, m4 pmaddwd m1, m6 pmaddwd m2, m7 paddd m1, m5 paddd m1, m2 psrad m0, 4 psrad m1, 4 packssdw m0, m1 mova [tmpq], m0 add tmpq, 16 sub hd, 2 jg .h_w4_loop RET .h_w8: WIN64_SPILL_XMM 11 shr mxd, 16 movq m2, [base+subpel_filters+mxq*8] mova m4, [base+spel_h_shufA] mova m6, [base+spel_h_shufB] movifnidn tmpq, r0mp add wd, wd punpcklbw m2, m2 add srcq, wq psraw m2, 8 add tmpq, wq neg wq test dword r7m, 0x800 jnz 
.h_w8_12bpc psllw m2, 2 .h_w8_12bpc: pshufd m7, m2, q0000 %if ARCH_X86_32 ALLOC_STACK -16*3 pshufd m0, m2, q1111 pshufd m1, m2, q2222 pshufd m2, m2, q3333 mova m8, m0 mova m9, m1 mova m10, m2 %else pshufd m8, m2, q1111 pshufd m9, m2, q2222 pshufd m10, m2, q3333 %endif .h_w8_loop0: mov r6, wq .h_w8_loop: movu m0, [srcq+r6- 6] movu m1, [srcq+r6+ 2] pshufb m2, m0, m4 ; 0 1 1 2 2 3 3 4 pshufb m0, m6 ; 2 3 3 4 4 5 5 6 pmaddwd m2, m7 ; abcd0 pmaddwd m0, m8 ; abcd1 pshufb m3, m1, m4 ; 4 5 5 6 6 7 7 8 pshufb m1, m6 ; 6 7 7 8 8 9 9 a paddd m2, m5 paddd m0, m2 pmaddwd m2, m9, m3 ; abcd2 pmaddwd m3, m7 ; efgh0 paddd m0, m2 pmaddwd m2, m10, m1 ; abcd3 pmaddwd m1, m8 ; efgh1 paddd m0, m2 movu m2, [srcq+r6+10] paddd m3, m5 paddd m1, m3 pshufb m3, m2, m4 ; a b b c c d d e pshufb m2, m6 ; 8 9 9 a a b b c pmaddwd m3, m9 ; efgh2 pmaddwd m2, m10 ; efgh3 paddd m1, m3 paddd m1, m2 psrad m0, 4 psrad m1, 4 packssdw m0, m1 mova [tmpq+r6], m0 add r6, 16 jl .h_w8_loop add srcq, ssq sub tmpq, wq dec hd jg .h_w8_loop0 RET .v: movzx mxd, myb shr myd, 16 cmp hd, 4 cmove myd, mxd movq m3, [base+subpel_filters+myq*8] %if STACK_ALIGNMENT < 16 %xdefine rstk rsp %else %assign stack_offset stack_offset - stack_size_padded %endif WIN64_SPILL_XMM 15 movddup m7, [base+prep_8tap_1d_rnd] movifnidn ssq, r2mp movifnidn tmpq, r0mp punpcklbw m3, m3 psraw m3, 8 ; sign-extend test dword r7m, 0x800 jnz .v_12bpc psllw m3, 2 .v_12bpc: %if ARCH_X86_32 ALLOC_STACK -16*7 pshufd m0, m3, q0000 pshufd m1, m3, q1111 pshufd m2, m3, q2222 pshufd m3, m3, q3333 mova m8, m0 mova m9, m1 mova m10, m2 mova m11, m3 %else pshufd m8, m3, q0000 pshufd m9, m3, q1111 pshufd m10, m3, q2222 pshufd m11, m3, q3333 %endif lea r6, [ssq*3] sub srcq, r6 mov r6d, wd shl wd, 6 mov r5, srcq %if ARCH_X86_64 mov r7, tmpq %elif STACK_ALIGNMENT < 16 mov [esp+4*29], tmpq %endif lea wd, [wq+hq-(1<<8)] .v_loop0: movq m1, [srcq+ssq*0] movq m2, [srcq+ssq*1] lea srcq, [srcq+ssq*2] movq m3, [srcq+ssq*0] movq m4, [srcq+ssq*1] lea srcq, [srcq+ssq*2] movq m5, [srcq+ssq*0] movq m6, [srcq+ssq*1] lea srcq, [srcq+ssq*2] movq m0, [srcq+ssq*0] punpcklwd m1, m2 ; 01 punpcklwd m2, m3 ; 12 punpcklwd m3, m4 ; 23 punpcklwd m4, m5 ; 34 punpcklwd m5, m6 ; 45 punpcklwd m6, m0 ; 56 %if ARCH_X86_32 jmp .v_loop_start .v_loop: mova m1, m12 mova m2, m13 mova m3, m14 .v_loop_start: pmaddwd m1, m8 ; a0 pmaddwd m2, m8 ; b0 mova m12, m3 mova m13, m4 pmaddwd m3, m9 ; a1 pmaddwd m4, m9 ; b1 paddd m1, m3 paddd m2, m4 mova m14, m5 mova m4, m6 pmaddwd m5, m10 ; a2 pmaddwd m6, m10 ; b2 paddd m1, m5 paddd m2, m6 movq m6, [srcq+ssq*1] lea srcq, [srcq+ssq*2] punpcklwd m5, m0, m6 ; 67 movq m0, [srcq+ssq*0] pmaddwd m3, m11, m5 ; a3 punpcklwd m6, m0 ; 78 paddd m1, m7 paddd m1, m3 pmaddwd m3, m11, m6 ; b3 paddd m2, m7 paddd m2, m3 psrad m1, 4 psrad m2, 4 packssdw m1, m2 movq [tmpq+r6*0], m1 movhps [tmpq+r6*2], m1 lea tmpq, [tmpq+r6*4] sub hd, 2 jg .v_loop %if STACK_ALIGNMENT < 16 mov tmpq, [esp+4*29] add r5, 8 add tmpq, 8 mov srcq, r5 mov [esp+4*29], tmpq %else mov tmpq, tmpmp add r5, 8 add tmpq, 8 mov srcq, r5 mov tmpmp, tmpq %endif %else .v_loop: pmaddwd m12, m8, m1 ; a0 pmaddwd m13, m8, m2 ; b0 mova m1, m3 mova m2, m4 pmaddwd m3, m9 ; a1 pmaddwd m4, m9 ; b1 paddd m12, m3 paddd m13, m4 mova m3, m5 mova m4, m6 pmaddwd m5, m10 ; a2 pmaddwd m6, m10 ; b2 paddd m12, m5 paddd m13, m6 movq m6, [srcq+ssq*1] lea srcq, [srcq+ssq*2] punpcklwd m5, m0, m6 ; 67 movq m0, [srcq+ssq*0] pmaddwd m14, m11, m5 ; a3 punpcklwd m6, m0 ; 78 paddd m12, m7 paddd m12, m14 pmaddwd m14, m11, m6 ; b3 paddd m13, m7 paddd m13, m14 psrad m12, 4 psrad 
m13, 4 packssdw m12, m13 movq [tmpq+r6*0], m12 movhps [tmpq+r6*2], m12 lea tmpq, [tmpq+r6*4] sub hd, 2 jg .v_loop add r5, 8 add r7, 8 mov srcq, r5 mov tmpq, r7 %endif movzx hd, wb sub wd, 1<<8 jg .v_loop0 RET .hv: %if STACK_ALIGNMENT < 16 %xdefine rstk rsp %else %assign stack_offset stack_offset - stack_size_padded %endif movzx t3d, mxb shr mxd, 16 cmp wd, 4 cmove mxd, t3d movifnidn hd, r4m movq m2, [base+subpel_filters+mxq*8] movzx mxd, myb shr myd, 16 cmp hd, 4 cmove myd, mxd movq m3, [base+subpel_filters+myq*8] %if ARCH_X86_32 mov ssq, r2mp mov tmpq, r0mp mova m0, [base+spel_h_shufA] mova m1, [base+spel_h_shufB] mova m4, [base+prep_8tap_2d_rnd] ALLOC_STACK -16*14 mova m8, m0 mova m9, m1 mova m14, m4 %else %if WIN64 ALLOC_STACK 16*6, 16 %endif mova m8, [base+spel_h_shufA] mova m9, [base+spel_h_shufB] %endif pxor m0, m0 punpcklbw m0, m2 punpcklbw m3, m3 psraw m0, 4 psraw m3, 8 test dword r7m, 0x800 jz .hv_10bpc psraw m0, 2 .hv_10bpc: lea r6, [ssq*3] sub srcq, 6 sub srcq, r6 mov r6d, wd shl wd, 6 mov r5, srcq %if ARCH_X86_32 %define tmp esp+16*8 %if STACK_ALIGNMENT < 16 mov [esp+4*61], tmpq %endif pshufd m1, m0, q0000 pshufd m2, m0, q1111 pshufd m5, m0, q2222 pshufd m0, m0, q3333 mova m10, m1 mova m11, m2 mova m12, m5 mova m13, m0 %else %if WIN64 %define tmp rsp %else %define tmp rsp-88 ; red zone %endif mov r7, tmpq pshufd m10, m0, q0000 pshufd m11, m0, q1111 pshufd m12, m0, q2222 pshufd m13, m0, q3333 %endif lea wd, [wq+hq-(1<<8)] pshufd m0, m3, q0000 pshufd m1, m3, q1111 pshufd m2, m3, q2222 pshufd m3, m3, q3333 mova [tmp+16*1], m0 mova [tmp+16*2], m1 mova [tmp+16*3], m2 mova [tmp+16*4], m3 .hv_loop0: %if ARCH_X86_64 mova m14, [prep_8tap_2d_rnd] %endif movu m4, [srcq+ssq*0+0] movu m1, [srcq+ssq*0+8] movu m5, [srcq+ssq*1+0] movu m2, [srcq+ssq*1+8] lea srcq, [srcq+ssq*2] movu m6, [srcq+ssq*0+0] movu m3, [srcq+ssq*0+8] PUT_8TAP_HV_H 4, 1, 0, 6 PUT_8TAP_HV_H 5, 2, 0, 6 PUT_8TAP_HV_H 6, 3, 0, 6 movu m7, [srcq+ssq*1+0] movu m2, [srcq+ssq*1+8] lea srcq, [srcq+ssq*2] movu m1, [srcq+ssq*0+0] movu m3, [srcq+ssq*0+8] PUT_8TAP_HV_H 7, 2, 0, 6 PUT_8TAP_HV_H 1, 3, 0, 6 movu m2, [srcq+ssq*1+0] movu m3, [srcq+ssq*1+8] lea srcq, [srcq+ssq*2] PUT_8TAP_HV_H 2, 3, 0, 6 packssdw m4, m7 ; 0 3 packssdw m5, m1 ; 1 4 movu m0, [srcq+ssq*0+0] movu m1, [srcq+ssq*0+8] PUT_8TAP_HV_H 0, 1, 3, 6 packssdw m6, m2 ; 2 5 packssdw m7, m0 ; 3 6 punpcklwd m1, m4, m5 ; 01 punpckhwd m4, m5 ; 34 punpcklwd m2, m5, m6 ; 12 punpckhwd m5, m6 ; 45 punpcklwd m3, m6, m7 ; 23 punpckhwd m6, m7 ; 56 %if ARCH_X86_32 jmp .hv_loop_start .hv_loop: mova m1, [tmp+16*5] mova m2, m15 .hv_loop_start: mova m7, [tmp+16*1] pmaddwd m1, m7 ; a0 pmaddwd m2, m7 ; b0 mova m7, [tmp+16*2] mova [tmp+16*5], m3 pmaddwd m3, m7 ; a1 mova m15, m4 pmaddwd m4, m7 ; b1 mova m7, [tmp+16*3] paddd m1, m14 paddd m2, m14 paddd m1, m3 paddd m2, m4 mova m3, m5 pmaddwd m5, m7 ; a2 mova m4, m6 pmaddwd m6, m7 ; b2 paddd m1, m5 paddd m2, m6 movu m7, [srcq+ssq*1+0] movu m5, [srcq+ssq*1+8] lea srcq, [srcq+ssq*2] PUT_8TAP_HV_H 7, 5, 6, 6 packssdw m0, m7 ; 6 7 mova [tmp+16*0], m0 movu m0, [srcq+ssq*0+0] movu m5, [srcq+ssq*0+8] PUT_8TAP_HV_H 0, 5, 6, 6 mova m6, [tmp+16*0] packssdw m7, m0 ; 7 8 punpcklwd m5, m6, m7 ; 67 punpckhwd m6, m7 ; 78 pmaddwd m7, m5, [tmp+16*4] paddd m1, m7 ; a3 pmaddwd m7, m6, [tmp+16*4] paddd m2, m7 ; b3 psrad m1, 6 psrad m2, 6 packssdw m1, m2 movq [tmpq+r6*0], m1 movhps [tmpq+r6*2], m1 lea tmpq, [tmpq+r6*4] sub hd, 2 jg .hv_loop %if STACK_ALIGNMENT < 16 mov tmpq, [esp+4*61] add r5, 8 add tmpq, 8 mov srcq, r5 mov [esp+4*61], tmpq %else mov tmpq, tmpmp add 
r5, 8 add tmpq, 8 mov srcq, r5 mov tmpmp, tmpq %endif %else .hv_loop: mova m15, [tmp+16*1] mova m7, [prep_8tap_2d_rnd] pmaddwd m14, m15, m1 ; a0 pmaddwd m15, m2 ; b0 paddd m14, m7 paddd m15, m7 mova m7, [tmp+16*2] mova m1, m3 pmaddwd m3, m7 ; a1 mova m2, m4 pmaddwd m4, m7 ; b1 mova m7, [tmp+16*3] paddd m14, m3 paddd m15, m4 mova m3, m5 pmaddwd m5, m7 ; a2 mova m4, m6 pmaddwd m6, m7 ; b2 paddd m14, m5 paddd m15, m6 movu m7, [srcq+ssq*1+0] movu m5, [srcq+ssq*1+8] lea srcq, [srcq+ssq*2] PUT_8TAP_HV_H 7, 5, 6, 6, [prep_8tap_2d_rnd] packssdw m0, m7 ; 6 7 mova [tmp+16*0], m0 movu m0, [srcq+ssq*0+0] movu m5, [srcq+ssq*0+8] PUT_8TAP_HV_H 0, 5, 6, 6, [prep_8tap_2d_rnd] mova m6, [tmp+16*0] packssdw m7, m0 ; 7 8 punpcklwd m5, m6, m7 ; 67 punpckhwd m6, m7 ; 78 pmaddwd m7, m5, [tmp+16*4] paddd m14, m7 ; a3 pmaddwd m7, m6, [tmp+16*4] paddd m15, m7 ; b3 psrad m14, 6 psrad m15, 6 packssdw m14, m15 movq [tmpq+r6*0], m14 movhps [tmpq+r6*2], m14 lea tmpq, [tmpq+r6*4] sub hd, 2 jg .hv_loop add r5, 8 add r7, 8 mov srcq, r5 mov tmpq, r7 %endif movzx hd, wb sub wd, 1<<8 jg .hv_loop0 RET %undef tmp %macro movifprep 2 %if isprep mov %1, %2 %endif %endmacro %macro SAVE_REG 1 %xdefine r%1_save r%1 %xdefine r%1q_save r%1q %xdefine r%1d_save r%1d %if ARCH_X86_32 %define r%1m_save [rstk+stack_offset+(%1+1)*4] %endif %endmacro %macro LOAD_REG 1 %xdefine r%1 r%1_save %xdefine r%1q r%1q_save %xdefine r%1d r%1d_save %if ARCH_X86_32 %define r%1m r%1m_save %endif %undef r%1d_save %undef r%1q_save %undef r%1_save %endmacro %macro REMAP_REG 2-3 %xdefine r%1 r%2 %xdefine r%1q r%2q %xdefine r%1d r%2d %if ARCH_X86_32 %if %3 == 0 %xdefine r%1m r%2m %else %define r%1m [rstk+stack_offset+(%1+1)*4] %endif %endif %endmacro %macro MCT_8TAP_SCALED_REMAP_REGS_TO_PREV 0 %if isprep %if ARCH_X86_64 SAVE_REG 14 %assign %%i 14 %rep 14 %assign %%j %%i-1 REMAP_REG %%i, %%j %assign %%i %%i-1 %endrep %else SAVE_REG 5 %assign %%i 5 %rep 5 %assign %%j %%i-1 REMAP_REG %%i, %%j, 0 %assign %%i %%i-1 %endrep %endif %endif %endmacro %macro MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT 0 %if isprep %assign %%i 1 %if ARCH_X86_64 %rep 13 %assign %%j %%i+1 REMAP_REG %%i, %%j %assign %%i %%i+1 %endrep LOAD_REG 14 %else %rep 4 %assign %%j %%i+1 REMAP_REG %%i, %%j, 1 %assign %%i %%i+1 %endrep LOAD_REG 5 %endif %endif %endmacro %macro MC_8TAP_SCALED_RET 0-1 1 ; leave_mapping_unchanged MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT RET %if %1 MCT_8TAP_SCALED_REMAP_REGS_TO_PREV %endif %endmacro %if ARCH_X86_32 %macro MC_4TAP_SCALED_H 1 ; dst_mem movu m7, [srcq+ssq*0] movu m2, [srcq+ssq*1] movu m5, [r4 +ssq*0] movu m6, [r4 +ssq*1] lea srcq, [srcq+ssq*2] lea r4, [r4 +ssq*2] REPX {pshufb x, m12}, m7, m2 REPX {pmaddwd x, m13}, m7, m2 REPX {pshufb x, m14}, m5, m6 REPX {pmaddwd x, m15}, m5, m6 phaddd m7, m5 phaddd m2, m6 mova m5, [esp+0x00] movd m6, [esp+0x10] paddd m7, m5 paddd m2, m5 psrad m7, m6 psrad m2, m6 packssdw m7, m2 mova [stk+%1], m7 %endmacro %endif %if ARCH_X86_64 %macro MC_8TAP_SCALED_H 8 ; dst, tmp[0-6] movu m%1, [srcq+ r4*2] movu m%2, [srcq+ r6*2] movu m%3, [srcq+ r7*2] movu m%4, [srcq+ r9*2] movu m%5, [srcq+r10*2] movu m%6, [srcq+r11*2] movu m%7, [srcq+r13*2] movu m%8, [srcq+ rX*2] add srcq, ssq pmaddwd m%1, [stk+0x10] pmaddwd m%2, [stk+0x20] pmaddwd m%3, [stk+0x30] pmaddwd m%4, [stk+0x40] pmaddwd m%5, [stk+0x50] pmaddwd m%6, [stk+0x60] pmaddwd m%7, [stk+0x70] pmaddwd m%8, [stk+0x80] phaddd m%1, m%2 phaddd m%3, m%4 phaddd m%5, m%6 phaddd m%7, m%8 phaddd m%1, m%3 phaddd m%5, m%7 paddd m%1, hround paddd m%5, hround psrad m%1, m12 psrad m%5, m12 packssdw m%1, m%5 
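; (recap of the block above: the eight movu loads fetch the 8-tap window for
;  eight output columns at their per-column x offsets in r4/r6/r7/r9/r10/r11/
;  r13/rX, the pmaddwd ops apply the per-column horizontal filters staged at
;  stk+0x10..0x80, and the phaddd tree reduces each column to a single dword
;  before the shared rounding (hround), shift (m12) and pack down to 8 words
;  in m%1.)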
%endmacro %else %macro MC_8TAP_SCALED_H 2-3 1 ; weights_mem_start, h_mem, load_fh_offsets %if %3 == 1 mov r0, [stk+ 0] mov rX, [stk+ 4] mov r4, [stk+ 8] mov r5, [stk+12] %endif movu m0, [srcq+r0*2] movu m1, [srcq+rX*2] movu m2, [srcq+r4*2] movu m3, [srcq+r5*2] mov r0, [stk+16] mov rX, [stk+20] mov r4, [stk+24] mov r5, [stk+28] pmaddwd m0, [stk+%1+0x00] pmaddwd m1, [stk+%1+0x10] pmaddwd m2, [stk+%1+0x20] pmaddwd m3, [stk+%1+0x30] phaddd m0, m1 phaddd m2, m3 movu m4, [srcq+r0*2] movu m5, [srcq+rX*2] movu m6, [srcq+r4*2] movu m7, [srcq+r5*2] add srcq, ssq pmaddwd m4, [stk+%1+0xa0] pmaddwd m5, [stk+%1+0xb0] pmaddwd m6, [stk+%1+0xc0] pmaddwd m7, [stk+%1+0xd0] phaddd m4, m5 phaddd m6, m7 phaddd m0, m2 phaddd m4, m6 paddd m0, hround paddd m4, hround psrad m0, m12 psrad m4, m12 packssdw m0, m4 %if %2 != 0 mova [stk+%2], m0 %endif %endmacro %endif %macro MC_8TAP_SCALED 1 %ifidn %1, put %assign isput 1 %assign isprep 0 %if ARCH_X86_64 %if required_stack_alignment <= STACK_ALIGNMENT cglobal put_8tap_scaled_16bpc, 2, 15, 16, 0x1c0, dst, ds, src, ss, w, h, mx, my, dx, dy, pxmax %else cglobal put_8tap_scaled_16bpc, 2, 14, 16, 0x1c0, dst, ds, src, ss, w, h, mx, my, dx, dy, pxmax %endif %else ; ARCH_X86_32 %if required_stack_alignment <= STACK_ALIGNMENT cglobal put_8tap_scaled_16bpc, 0, 7, 8, 0x200, dst, ds, src, ss, w, h, mx, my, dx, dy, pxmax %else cglobal put_8tap_scaled_16bpc, 0, 7, 8, -0x200-0x30, dst, ds, src, ss, w, h, mx, my, dx, dy, pxmax %endif %endif %xdefine base_reg r12 %else ; prep %assign isput 0 %assign isprep 1 %if ARCH_X86_64 %if required_stack_alignment <= STACK_ALIGNMENT cglobal prep_8tap_scaled_16bpc, 2, 15, 16, 0x1c0, tmp, src, ss, w, h, mx, my, dx, dy, pxmax %xdefine tmp_stridem r14q %else cglobal prep_8tap_scaled_16bpc, 2, 14, 16, 0x1c0, tmp, src, ss, w, h, mx, my, dx, dy, pxmax %define tmp_stridem qword [stk+0x138] %endif %xdefine base_reg r11 %else ; ARCH_X86_32 %if required_stack_alignment <= STACK_ALIGNMENT cglobal prep_8tap_scaled_16bpc, 0, 7, 8, 0x200, tmp, src, ss, w, h, mx, my, dx, dy, pxmax %else cglobal prep_8tap_scaled_16bpc, 0, 6, 8, 0x200, tmp, src, ss, w, h, mx, my, dx, dy, pxmax %endif %define tmp_stridem dword [stk+0x138] %endif %endif %if ARCH_X86_32 mov [esp+0x1f0], t0d mov [esp+0x1f4], t1d %if isput && required_stack_alignment > STACK_ALIGNMENT mov dstd, dstm mov dsd, dsm mov srcd, srcm mov ssd, ssm mov hd, hm mov r4, mxm %define r0m [esp+0x200] %define dsm [esp+0x204] %define dsmp dsm %define r1m dsm %define r2m [esp+0x208] %define ssm [esp+0x20c] %define r3m ssm %define hm [esp+0x210] %define mxm [esp+0x214] mov r0m, dstd mov dsm, dsd mov r2m, srcd mov ssm, ssd mov hm, hd mov r0, mym mov r1, dxm mov r2, dym %define mym [esp+0x218] %define dxm [esp+0x21c] %define dym [esp+0x220] mov mxm, r4 mov mym, r0 mov dxm, r1 mov dym, r2 tzcnt wd, wm %endif %if isput mov r3, pxmaxm %define pxmaxm r3 %else mov r2, pxmaxm %endif %if isprep && required_stack_alignment > STACK_ALIGNMENT %xdefine base_reg r5 %else %xdefine base_reg r6 %endif %endif LEA base_reg, %1_8tap_scaled_16bpc_ssse3 %xdefine base base_reg-%1_8tap_scaled_16bpc_ssse3 %if ARCH_X86_64 || isprep || required_stack_alignment <= STACK_ALIGNMENT tzcnt wd, wm %endif %if ARCH_X86_64 %if isput mov r7d, pxmaxm %endif %else %define m8 m0 %define m9 m1 %define m14 m4 %define m15 m3 %endif movd m8, dxm movd m14, mxm %if isput movd m15, pxmaxm %endif pshufd m8, m8, q0000 pshufd m14, m14, q0000 %if isput pshuflw m15, m15, q0000 punpcklqdq m15, m15 %endif %if isprep %if UNIX64 mov r5d, t0d DECLARE_REG_TMP 5, 7 %endif %if 
ARCH_X86_64 mov r6d, pxmaxm %endif %endif %if ARCH_X86_64 mov dyd, dym %endif %if isput %if WIN64 mov r8d, hm DEFINE_ARGS dst, ds, src, ss, w, _, _, my, h, dy, ss3 %define hm r5m %define dxm r8m %elif ARCH_X86_64 DEFINE_ARGS dst, ds, src, ss, w, h, _, my, dx, dy, ss3 %define hm r6m %else %endif %if ARCH_X86_64 %if required_stack_alignment > STACK_ALIGNMENT %define dsm [rsp+0x138] %define rX r1 %define rXd r1d %else %define dsm dsq %define rX r14 %define rXd r14d %endif %else %define rX r1 %endif %else ; prep %if WIN64 mov r7d, hm DEFINE_ARGS tmp, src, ss, w, _, _, my, h, dy, ss3 %define hm r4m %define dxm r7m %elif ARCH_X86_64 DEFINE_ARGS tmp, src, ss, w, h, _, my, dx, dy, ss3 %xdefine hm r7m %endif MCT_8TAP_SCALED_REMAP_REGS_TO_PREV %if ARCH_X86_64 %define rX r14 %define rXd r14d %else %define rX r3 %endif %endif %if ARCH_X86_64 shr r7d, 11 mova m10, [base+pd_0x3ff] movddup m11, [base+s_8tap_h_rnd+r7*8] movd m12, [base+s_8tap_h_sh+r7*4] %if isput movddup m13, [base+put_s_8tap_v_rnd+r7*8] movd m7, [base+put_s_8tap_v_sh+r7*4] %define pxmaxm [rsp] mova pxmaxm, m15 punpcklqdq m12, m7 %endif lea ss3q, [ssq*3] movzx r7d, t1b shr t1d, 16 cmp hd, 6 cmovs t1d, r7d sub srcq, ss3q %else %define m10 [base+pd_0x3ff] %define m11 [esp+0x00] %define m12 [esp+0x10] shr r3, 11 movddup m1, [base+s_8tap_h_rnd+r3*8] movd m2, [base+s_8tap_h_sh+r3*4] %if isput %define m13 [esp+0x20] %define pxmaxm [esp+0x30] %define stk esp+0x40 movddup m5, [base+put_s_8tap_v_rnd+r3*8] movd m6, [base+put_s_8tap_v_sh+r3*4] mova pxmaxm, m15 punpcklqdq m2, m6 mova m13, m5 %else %define m13 [base+pd_m524256] %endif mov ssd, ssm mova m11, m1 mova m12, m2 MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT mov r1, [esp+0x1f4] lea r0, [ssd*3] movzx r2, r1b shr r1, 16 cmp dword hm, 6 cmovs r1, r2 mov [esp+0x1f4], r1 %if isprep mov r1, r1m %endif mov r2, r2m sub srcq, r0 MCT_8TAP_SCALED_REMAP_REGS_TO_PREV %define ss3q r0 %define myd r4 %define dyd dword dym %define hd dword hm %endif cmp dyd, 1024 je .dy1 cmp dyd, 2048 je .dy2 movzx wd, word [base+%1_8tap_scaled_ssse3_table+wq*2] add wq, base_reg jmp wq %if isput .w2: %if ARCH_X86_64 mov myd, mym movzx t0d, t0b sub srcq, 2 movd m15, t0d %else movzx r4, byte [esp+0x1f0] sub srcq, 2 movd m15, r4 %endif pxor m9, m9 punpckldq m9, m8 paddd m14, m9 ; mx+dx*[0-1] %if ARCH_X86_64 mova m9, [base+pd_0x4000] %endif pshufd m15, m15, q0000 pand m8, m14, m10 psrld m8, 6 paddd m15, m8 movd r4d, m15 pshufd m15, m15, q0321 %if ARCH_X86_64 movd r6d, m15 %else movd r3d, m15 %endif mova m5, [base+bdct_lb_q] mova m6, [base+spel_s_shuf2] movd m15, [base+subpel_filters+r4*8+2] %if ARCH_X86_64 movd m7, [base+subpel_filters+r6*8+2] %else movd m7, [base+subpel_filters+r3*8+2] %endif pxor m2, m2 pcmpeqd m8, m2 psrld m14, 10 paddd m14, m14 %if ARCH_X86_32 mov r3, r3m pshufb m14, m5 paddb m14, m6 mova [stk], m14 SWAP m5, m0 SWAP m6, m3 %define m15 m6 %endif movu m0, [srcq+ssq*0] movu m1, [srcq+ssq*1] movu m2, [srcq+ssq*2] movu m3, [srcq+ss3q ] lea srcq, [srcq+ssq*4] punpckldq m15, m7 %if ARCH_X86_64 pshufb m14, m5 paddb m14, m6 pand m9, m8 pandn m8, m15 SWAP m15, m8 por m15, m9 movu m4, [srcq+ssq*0] movu m5, [srcq+ssq*1] movu m6, [srcq+ssq*2] movu m7, [srcq+ss3q ] lea srcq, [srcq+ssq*4] %else pand m7, m5, [base+pd_0x4000] pandn m5, m15 por m5, m7 %define m15 m5 %endif punpcklbw m15, m15 psraw m15, 8 REPX {pshufb x, m14}, m0, m1, m2, m3 REPX {pmaddwd x, m15}, m0, m1, m2, m3 %if ARCH_X86_64 REPX {pshufb x, m14}, m4, m5, m6, m7 REPX {pmaddwd x, m15}, m4, m5, m6, m7 phaddd m0, m1 phaddd m2, m3 phaddd m4, m5 phaddd m6, m7 REPX {paddd 
x, m11}, m0, m2, m4, m6 REPX {psrad x, m12}, m0, m2, m4, m6 packssdw m0, m2 ; 0 1 2 3 packssdw m4, m6 ; 4 5 6 7 SWAP m1, m4 %else mova [stk+0x10], m15 phaddd m0, m1 phaddd m2, m3 movu m1, [srcq+ssq*0] movu m7, [srcq+ssq*1] movu m6, [srcq+ssq*2] movu m3, [srcq+ss3q ] lea srcq, [srcq+ssq*4] REPX {pshufb x, m14}, m1, m7, m6, m3 REPX {pmaddwd x, m15}, m1, m7, m6, m3 phaddd m1, m7 phaddd m6, m3 REPX {paddd x, m11}, m0, m2, m1, m6 REPX {psrad x, m12}, m0, m2, m1, m6 packssdw m0, m2 packssdw m1, m6 %define m14 [stk+0x00] %define m15 [stk+0x10] %endif palignr m2, m1, m0, 4 ; 1 2 3 4 punpcklwd m3, m0, m2 ; 01 12 punpckhwd m0, m2 ; 23 34 pshufd m5, m1, q0321 ; 5 6 7 _ punpcklwd m2, m1, m5 ; 45 56 punpckhwd m4, m1, m5 ; 67 __ %if ARCH_X86_32 mov myd, mym mov r0, r0m mova [stk+0x20], m3 mova [stk+0x30], m0 mova [stk+0x40], m2 mova [stk+0x50], m4 %endif .w2_loop: and myd, 0x3ff %if ARCH_X86_64 mov r6d, 64 << 24 mov r4d, myd shr r4d, 6 lea r4d, [t1+r4] cmovnz r6q, [base+subpel_filters+r4*8] movq m10, r6q punpcklbw m10, m10 psraw m10, 8 pshufd m7, m10, q0000 pshufd m8, m10, q1111 pmaddwd m5, m3, m7 pmaddwd m6, m0, m8 pshufd m9, m10, q2222 pshufd m10, m10, q3333 pmaddwd m7, m2, m9 pmaddwd m8, m4, m10 paddd m5, m6 paddd m7, m8 %else mov r1, [esp+0x1f4] xor r3, r3 mov r5, myd shr r5, 6 lea r1, [r1+r5] mov r5, 64 << 24 cmovnz r3, [base+subpel_filters+r1*8+4] cmovnz r5, [base+subpel_filters+r1*8+0] movd m6, r3 movd m7, r5 punpckldq m7, m6 punpcklbw m7, m7 psraw m7, 8 pshufd m5, m7, q0000 pshufd m6, m7, q1111 pmaddwd m3, m5 pmaddwd m0, m6 pshufd m5, m7, q2222 pshufd m7, m7, q3333 pmaddwd m2, m5 pmaddwd m4, m7 paddd m3, m0 paddd m2, m4 SWAP m5, m3 SWAP m7, m2 %define m8 m3 %endif paddd m5, m13 pshufd m6, m12, q1032 pxor m8, m8 paddd m5, m7 psrad m5, m6 packssdw m5, m5 pmaxsw m5, m8 pminsw m5, pxmaxm movd [dstq], m5 add dstq, dsmp dec hd jz .ret %if ARCH_X86_64 add myd, dyd %else add myd, dym %endif test myd, ~0x3ff %if ARCH_X86_32 SWAP m3, m5 SWAP m2, m7 mova m3, [stk+0x20] mova m0, [stk+0x30] mova m2, [stk+0x40] mova m4, [stk+0x50] %endif jz .w2_loop %if ARCH_X86_32 mov r3, r3m %endif movu m5, [srcq] test myd, 0x400 jz .w2_skip_line add srcq, ssq shufps m3, m0, q1032 ; 01 12 shufps m0, m2, q1032 ; 23 34 shufps m2, m4, q1032 ; 45 56 pshufb m5, m14 pmaddwd m5, m15 phaddd m5, m5 paddd m5, m11 psrad m5, m12 packssdw m5, m5 palignr m4, m5, m1, 12 punpcklqdq m1, m4, m4 ; 6 7 6 7 punpcklwd m4, m1, m5 ; 67 __ %if ARCH_X86_32 mova [stk+0x20], m3 mova [stk+0x30], m0 mova [stk+0x40], m2 mova [stk+0x50], m4 %endif jmp .w2_loop .w2_skip_line: movu m6, [srcq+ssq*1] lea srcq, [srcq+ssq*2] mova m3, m0 ; 01 12 mova m0, m2 ; 23 34 pshufb m5, m14 pshufb m6, m14 pmaddwd m5, m15 pmaddwd m6, m15 phaddd m5, m6 paddd m5, m11 psrad m5, m12 packssdw m5, m5 ; 6 7 6 7 punpckhqdq m1, m5 ; 4 5 6 7 pshufd m5, m1, q0321 ; 5 6 7 _ punpcklwd m2, m1, m5 ; 45 56 punpckhwd m4, m1, m5 ; 67 __ %if ARCH_X86_32 mova [stk+0x20], m3 mova [stk+0x30], m0 mova [stk+0x40], m2 mova [stk+0x50], m4 %endif jmp .w2_loop %endif INIT_XMM ssse3 .w4: %if ARCH_X86_64 mov myd, mym mova [rsp+0x10], m11 mova [rsp+0x20], m12 %if isput mova [rsp+0x30], m13 %endif movzx t0d, t0b sub srcq, 2 movd m15, t0d %else %define m8 m0 %xdefine m14 m4 %define m15 m3 movzx r4, byte [esp+0x1f0] sub srcq, 2 movd m15, r4 %endif pmaddwd m8, [base+rescale_mul] %if ARCH_X86_64 mova m9, [base+pd_0x4000] %else %define m9 [base+pd_0x4000] %endif pshufd m15, m15, q0000 paddd m14, m8 ; mx+dx*[0-3] pand m0, m14, m10 psrld m0, 6 paddd m15, m0 pshufd m7, m15, q1032 %if ARCH_X86_64 movd r4d, m15 
movd r11d, m7 pshufd m15, m15, q0321 pshufd m7, m7, q0321 movd r6d, m15 movd r13d, m7 mova m10, [base+bdct_lb_q+ 0] mova m11, [base+bdct_lb_q+16] movd m13, [base+subpel_filters+ r4*8+2] movd m2, [base+subpel_filters+ r6*8+2] movd m15, [base+subpel_filters+r11*8+2] movd m4, [base+subpel_filters+r13*8+2] %else movd r0, m15 movd r4, m7 pshufd m15, m15, q0321 pshufd m7, m7, q0321 movd rX, m15 movd r5, m7 mova m5, [base+bdct_lb_q+ 0] mova m6, [base+bdct_lb_q+16] movd m1, [base+subpel_filters+r0*8+2] movd m2, [base+subpel_filters+rX*8+2] movd m3, [base+subpel_filters+r4*8+2] movd m7, [base+subpel_filters+r5*8+2] movifprep r3, r3m SWAP m4, m7 %define m10 m5 %define m11 m6 %define m12 m1 %define m13 m1 %endif psrld m14, 10 paddd m14, m14 punpckldq m13, m2 punpckldq m15, m4 punpcklqdq m13, m15 pxor m2, m2 pcmpeqd m0, m2 %if ARCH_X86_64 pand m9, m0 %else pand m2, m9, m0 %define m9 m2 SWAP m7, m4 %endif pandn m0, m13 %if ARCH_X86_64 SWAP m13, m0 %else %define m13 m0 %endif por m13, m9 punpckhbw m15, m13, m13 punpcklbw m13, m13 psraw m15, 8 psraw m13, 8 pshufb m12, m14, m10 pshufb m14, m11 mova m10, [base+spel_s_shuf2] movd r4d, m14 shr r4d, 24 %if ARCH_X86_32 mova [stk+0x20], m13 mova [stk+0x30], m15 pxor m2, m2 %endif pshufb m7, m14, m2 psubb m14, m7 paddb m12, m10 paddb m14, m10 %if ARCH_X86_64 lea r6, [r4+ssq*1] lea r11, [r4+ssq*2] lea r13, [r4+ss3q ] movu m7, [srcq+ssq*0] movu m9, [srcq+ssq*1] movu m8, [srcq+ssq*2] movu m10, [srcq+ss3q ] movu m1, [srcq+r4 ] movu m3, [srcq+r6 ] movu m2, [srcq+r11 ] movu m4, [srcq+r13 ] lea srcq, [srcq+ssq*4] REPX {pshufb x, m12}, m7, m9, m8, m10 REPX {pmaddwd x, m13}, m7, m9, m8, m10 REPX {pshufb x, m14}, m1, m2, m3, m4 REPX {pmaddwd x, m15}, m1, m2, m3, m4 mova m5, [rsp+0x10] movd xm6, [rsp+0x20] phaddd m7, m1 phaddd m9, m3 phaddd m8, m2 phaddd m10, m4 movu m1, [srcq+ssq*0] movu m2, [srcq+ssq*1] movu m3, [srcq+ssq*2] movu m4, [srcq+ss3q ] REPX {paddd x, m5}, m7, m9, m8, m10 REPX {psrad x, xm6}, m7, m9, m8, m10 packssdw m7, m9 ; 0 1 packssdw m8, m10 ; 2 3 movu m0, [srcq+r4 ] movu m9, [srcq+r6 ] movu m10, [srcq+r11 ] movu m11, [srcq+r13 ] lea srcq, [srcq+ssq*4] REPX {pshufb x, m12}, m1, m2, m3, m4 REPX {pmaddwd x, m13}, m1, m2, m3, m4 REPX {pshufb x, m14}, m0, m9, m10, m11 REPX {pmaddwd x, m15}, m0, m9, m10, m11 phaddd m1, m0 phaddd m2, m9 phaddd m3, m10 phaddd m4, m11 REPX {paddd x, m5}, m1, m2, m3, m4 REPX {psrad x, xm6}, m1, m2, m3, m4 packssdw m1, m2 ; 4 5 packssdw m3, m4 ; 6 7 SWAP m9, m1 shufps m4, m7, m8, q1032 ; 1 2 shufps m5, m8, m9, q1032 ; 3 4 shufps m6, m9, m3, q1032 ; 5 6 pshufd m10, m3, q1032 ; 7 _ punpcklwd m0, m7, m4 ; 01 punpckhwd m7, m4 ; 12 punpcklwd m1, m8, m5 ; 23 punpckhwd m8, m5 ; 34 punpcklwd m2, m9, m6 ; 45 punpckhwd m9, m6 ; 56 punpcklwd m3, m10 ; 67 mova [rsp+0x40], m7 mova [rsp+0x50], m8 mova [rsp+0x60], m9 %else mova [stk+0x00], m12 mova [stk+0x10], m14 add r4, srcq MC_4TAP_SCALED_H 0x40 ; 0 1 MC_4TAP_SCALED_H 0x50 ; 2 3 MC_4TAP_SCALED_H 0x60 ; 4 5 MC_4TAP_SCALED_H 0x70 ; 6 7 mova m4, [stk+0x40] mova m5, [stk+0x50] mova m6, [stk+0x60] mova m7, [stk+0x70] mov [stk+0xc0], r4 shufps m1, m4, m5, q1032 ; 1 2 shufps m2, m5, m6, q1032 ; 3 4 shufps m3, m6, m7, q1032 ; 5 6 pshufd m0, m7, q1032 ; 7 _ mova [stk+0xb0], m0 punpcklwd m0, m4, m1 ; 01 punpckhwd m4, m1 ; 12 punpcklwd m1, m5, m2 ; 23 punpckhwd m5, m2 ; 34 punpcklwd m2, m6, m3 ; 45 punpckhwd m6, m3 ; 56 punpcklwd m3, m7, [stk+0xb0] ; 67 mov myd, mym mov r0, r0m mova [stk+0x40], m0 ; 01 mova [stk+0x50], m1 ; 23 mova [stk+0x60], m2 ; 45 mova [stk+0x70], m3 ; 67 mova [stk+0x80], m4 ; 12 mova 
[stk+0x90], m5 ; 34 mova [stk+0xa0], m6 ; 56 %define m12 [stk+0x00] %define m14 [stk+0x10] %define m13 [stk+0x20] %define m15 [stk+0x30] %define hrnd_mem [esp+0x00] %define hsh_mem [esp+0x10] %if isput %define vrnd_mem [esp+0x20] %else %define vrnd_mem [base+pd_m524256] %endif %endif .w4_loop: and myd, 0x3ff %if ARCH_X86_64 mov r11d, 64 << 24 mov r13d, myd shr r13d, 6 lea r13d, [t1+r13] cmovnz r11q, [base+subpel_filters+r13*8] movq m9, r11q punpcklbw m9, m9 psraw m9, 8 pshufd m7, m9, q0000 pshufd m8, m9, q1111 pmaddwd m4, m0, m7 pmaddwd m5, m1, m8 pshufd m7, m9, q2222 pshufd m9, m9, q3333 pmaddwd m6, m2, m7 pmaddwd m8, m3, m9 %if isput movd m9, [rsp+0x28] %define vrnd_mem [rsp+0x30] %else %define vrnd_mem [base+pd_m524256] %endif paddd m4, m5 paddd m6, m8 paddd m4, m6 paddd m4, vrnd_mem %else mov mym, myd mov r5, [esp+0x1f4] xor r3, r3 shr r4, 6 lea r5, [r5+r4] mov r4, 64 << 24 cmovnz r4, [base+subpel_filters+r5*8+0] cmovnz r3, [base+subpel_filters+r5*8+4] movd m7, r4 movd m6, r3 punpckldq m7, m6 punpcklbw m7, m7 psraw m7, 8 pshufd m4, m7, q0000 pshufd m5, m7, q1111 pshufd m6, m7, q2222 pshufd m7, m7, q3333 pmaddwd m0, m4 pmaddwd m1, m5 pmaddwd m2, m6 pmaddwd m3, m7 %if isput movd m4, [esp+0x18] %endif paddd m0, m1 paddd m2, m3 paddd m0, vrnd_mem paddd m0, m2 SWAP m4, m0 %define m9 m0 %endif %if isput pxor m5, m5 psrad m4, m9 packssdw m4, m4 pmaxsw m4, m5 pminsw m4, pxmaxm movq [dstq], m4 add dstq, dsmp %else psrad m4, 6 packssdw m4, m4 movq [tmpq], m4 add tmpq, 8 %endif dec hd jz .ret %if ARCH_X86_64 add myd, dyd test myd, ~0x3ff jz .w4_loop mova m8, [rsp+0x10] movd m9, [rsp+0x20] movu m4, [srcq] movu m5, [srcq+r4] test myd, 0x400 jz .w4_skip_line mova m0, [rsp+0x40] mova [rsp+0x40], m1 mova m1, [rsp+0x50] mova [rsp+0x50], m2 mova m2, [rsp+0x60] mova [rsp+0x60], m3 pshufb m4, m12 pshufb m5, m14 pmaddwd m4, m13 pmaddwd m5, m15 phaddd m4, m5 paddd m4, m8 psrad m4, m9 packssdw m4, m4 punpcklwd m3, m10, m4 mova m10, m4 add srcq, ssq jmp .w4_loop .w4_skip_line: movu m6, [srcq+ssq*1] movu m7, [srcq+r6] mova m0, [rsp+0x50] mova m11, [rsp+0x60] pshufb m4, m12 pshufb m6, m12 pshufb m5, m14 pshufb m7, m14 pmaddwd m4, m13 pmaddwd m6, m13 pmaddwd m5, m15 pmaddwd m7, m15 mova [rsp+0x40], m0 mova [rsp+0x50], m11 phaddd m4, m5 phaddd m6, m7 paddd m4, m8 paddd m6, m8 psrad m4, m9 psrad m6, m9 packssdw m4, m6 punpcklwd m9, m10, m4 mova [rsp+0x60], m9 pshufd m10, m4, q1032 mova m0, m1 mova m1, m2 mova m2, m3 punpcklwd m3, m4, m10 lea srcq, [srcq+ssq*2] jmp .w4_loop %else SWAP m0, m4 mov myd, mym mov r3, r3m add myd, dym test myd, ~0x3ff jnz .w4_next_line mova m0, [stk+0x40] mova m1, [stk+0x50] mova m2, [stk+0x60] mova m3, [stk+0x70] jmp .w4_loop .w4_next_line: mov r5, [stk+0xc0] movu m4, [srcq] movu m5, [r5] test myd, 0x400 jz .w4_skip_line add [stk+0xc0], ssq mova m0, [stk+0x80] mova m3, [stk+0x50] mova [stk+0x40], m0 mova [stk+0x80], m3 mova m1, [stk+0x90] mova m6, [stk+0x60] mova [stk+0x50], m1 mova [stk+0x90], m6 mova m2, [stk+0xa0] mova m7, [stk+0x70] mova [stk+0x60], m2 mova [stk+0xa0], m7 pshufb m4, m12 pshufb m5, m14 pmaddwd m4, m13 pmaddwd m5, m15 phaddd m4, m5 paddd m4, hrnd_mem psrad m4, hsh_mem packssdw m4, m4 punpcklwd m3, [stk+0xb0], m4 mova [stk+0xb0], m4 mova [stk+0x70], m3 add srcq, ssq jmp .w4_loop .w4_skip_line: movu m6, [srcq+ssq*1] movu m7, [r5 +ssq*1] lea r5, [r5 +ssq*2] mov [stk+0xc0], r5 mova m0, [stk+0x50] mova m1, [stk+0x60] mova m2, [stk+0x70] mova m3, [stk+0x90] pshufb m4, m12 pshufb m6, m12 pshufb m5, m14 pshufb m7, m14 pmaddwd m4, m13 pmaddwd m6, m13 pmaddwd m5, m15 pmaddwd 
m7, m15 mova [stk+0x40], m0 mova [stk+0x50], m1 mova [stk+0x60], m2 mova [stk+0x80], m3 phaddd m4, m5 phaddd m6, m7 mova m5, [stk+0xa0] mova m7, [stk+0xb0] paddd m4, hrnd_mem paddd m6, hrnd_mem psrad m4, hsh_mem psrad m6, hsh_mem packssdw m4, m6 punpcklwd m7, m4 pshufd m6, m4, q1032 mova [stk+0x90], m5 mova [stk+0xa0], m7 mova [stk+0xb0], m6 punpcklwd m3, m4, m6 mova [stk+0x70], m3 lea srcq, [srcq+ssq*2] jmp .w4_loop %endif INIT_XMM ssse3 %if ARCH_X86_64 %define stk rsp+0x20 %endif .w8: mov dword [stk+0xf0], 1 movifprep tmp_stridem, 16 jmp .w_start .w16: mov dword [stk+0xf0], 2 movifprep tmp_stridem, 32 jmp .w_start .w32: mov dword [stk+0xf0], 4 movifprep tmp_stridem, 64 jmp .w_start .w64: mov dword [stk+0xf0], 8 movifprep tmp_stridem, 128 jmp .w_start .w128: mov dword [stk+0xf0], 16 movifprep tmp_stridem, 256 .w_start: %if ARCH_X86_64 %ifidn %1, put movifnidn dsm, dsq %endif mova [rsp+0x10], m11 %define hround m11 shr t0d, 16 movd m15, t0d %if isprep mova m13, [base+pd_m524256] %endif %else %define hround [esp+0x00] %define m12 [esp+0x10] %define m10 [base+pd_0x3ff] %define m8 m0 %xdefine m14 m4 %define m15 m3 %if isprep %define ssq ssm %endif mov r4, [esp+0x1f0] shr r4, 16 movd m15, r4 mov r0, r0m mov myd, mym %endif sub srcq, 6 pslld m7, m8, 2 ; dx*4 pmaddwd m8, [base+rescale_mul] ; dx*[0-3] pshufd m15, m15, q0000 paddd m14, m8 ; mx+dx*[0-3] mova [stk+0x100], m7 mova [stk+0x120], m15 mov [stk+0x0f8], srcq mov [stk+0x130], r0q ; dstq / tmpq %if ARCH_X86_64 && UNIX64 mov hm, hd %elif ARCH_X86_32 mov r5, hm mov [stk+0x0f4], myd mov [stk+0x134], r5 %endif jmp .hloop .hloop_prep: dec dword [stk+0x0f0] jz .ret %if ARCH_X86_64 add qword [stk+0x130], 16 mov hd, hm %else add dword [stk+0x130], 16 mov myd, [stk+0x0f4] mov r5, [stk+0x134] mov r0, [stk+0x130] %endif mova m7, [stk+0x100] mova m14, [stk+0x110] %if ARCH_X86_64 mova m10, [base+pd_0x3ff] mova m11, [rsp+0x10] %endif mova m15, [stk+0x120] mov srcq, [stk+0x0f8] %if ARCH_X86_64 mov r0q, [stk+0x130] ; dstq / tmpq %else mov mym, myd mov hm, r5 mov r0m, r0 mov r3, r3m %endif paddd m14, m7 .hloop: %if ARCH_X86_64 mova m9, [base+pq_0x40000000] %else %define m9 [base+pq_0x40000000] %endif pxor m1, m1 psrld m2, m14, 10 mova [stk], m2 pand m6, m14, m10 psrld m6, 6 paddd m5, m15, m6 pcmpeqd m6, m1 pshufd m2, m5, q1032 %if ARCH_X86_64 movd r4d, m5 movd r6d, m2 pshufd m5, m5, q0321 pshufd m2, m2, q0321 movd r7d, m5 movd r9d, m2 movq m0, [base+subpel_filters+r4*8] movq m1, [base+subpel_filters+r6*8] movhps m0, [base+subpel_filters+r7*8] movhps m1, [base+subpel_filters+r9*8] %else movd r0, m5 movd rX, m2 pshufd m5, m5, q0321 pshufd m2, m2, q0321 movd r4, m5 movd r5, m2 movq m0, [base+subpel_filters+r0*8] movq m1, [base+subpel_filters+rX*8] movhps m0, [base+subpel_filters+r4*8] movhps m1, [base+subpel_filters+r5*8] %endif paddd m14, m7 ; mx+dx*[4-7] pand m5, m14, m10 psrld m5, 6 paddd m15, m5 pxor m2, m2 pcmpeqd m5, m2 mova [stk+0x110], m14 pshufd m4, m15, q1032 %if ARCH_X86_64 movd r10d, m15 movd r11d, m4 pshufd m15, m15, q0321 pshufd m4, m4, q0321 movd r13d, m15 movd rXd, m4 movq m2, [base+subpel_filters+r10*8] movq m3, [base+subpel_filters+r11*8] movhps m2, [base+subpel_filters+r13*8] movhps m3, [base+subpel_filters+ rX*8] psrld m14, 10 movq r11, m14 punpckhqdq m14, m14 movq rX, m14 mov r10d, r11d shr r11, 32 mov r13d, rXd shr rX, 32 mov r4d, [stk+ 0] mov r6d, [stk+ 4] mov r7d, [stk+ 8] mov r9d, [stk+12] pshufd m4, m6, q1100 pshufd m6, m6, q3322 pshufd m14, m5, q1100 pshufd m5, m5, q3322 pand m7, m9, m4 pand m8, m9, m6 pand m15, m9, m14 pand m9, m9, 
m5 pandn m4, m0 pandn m6, m1 pandn m14, m2 pandn m5, m3 por m7, m4 por m8, m6 por m15, m14 por m9, m5 punpcklbw m0, m7, m7 punpckhbw m7, m7 punpcklbw m1, m8, m8 punpckhbw m8, m8 psraw m0, 8 psraw m7, 8 psraw m1, 8 psraw m8, 8 punpcklbw m2, m15, m15 punpckhbw m15, m15 punpcklbw m3, m9, m9 punpckhbw m9, m9 psraw m2, 8 psraw m15, 8 psraw m3, 8 psraw m9, 8 mova [stk+0x10], m0 mova [stk+0x20], m7 mova [stk+0x30], m1 mova [stk+0x40], m8 mova [stk+0x50], m2 mova [stk+0x60], m15 mova [stk+0x70], m3 mova [stk+0x80], m9 MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 9, 10 ; 0 mova [stk+0x90], m1 MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 1, 9, 10 ; 1 mova [stk+0xa0], m2 MC_8TAP_SCALED_H 3, 4, 5, 6, 1, 2, 9, 10 ; 2 mova [stk+0xb0], m3 MC_8TAP_SCALED_H 4, 5, 6, 1, 2, 3, 9, 10 ; 3 mova [stk+0xc0], m4 MC_8TAP_SCALED_H 5, 6, 1, 2, 3, 4, 9, 10 ; 4 mova [stk+0xd0], m5 MC_8TAP_SCALED_H 6, 1, 2, 3, 4, 5, 9, 10 ; 5 MC_8TAP_SCALED_H 7, 1, 2, 3, 4, 5, 9, 10 ; 6 MC_8TAP_SCALED_H 8, 1, 2, 3, 4, 5, 9, 10 ; 7 mova m5, [stk+0xd0] mova m1, [stk+0x90] mova m2, [stk+0xa0] mova m3, [stk+0xb0] mova m9, [stk+0xc0] mov myd, mym mov dyd, dym punpcklwd m4, m5, m6 ; 45a punpckhwd m5, m6 ; 45b punpcklwd m6, m7, m8 ; 67a punpckhwd m7, m8 ; 67b punpcklwd m0, m1, m2 ; 01a punpckhwd m1, m2 ; 01b punpcklwd m2, m3, m9 ; 23a punpckhwd m3, m9 ; 23b mova [stk+0x90], m4 mova [stk+0xa0], m5 mova [stk+0xb0], m6 mova [stk+0xc0], m7 %define hround [rsp+0x10] .vloop: and myd, 0x3ff mov r6d, 64 << 24 mov r4d, myd shr r4d, 6 lea r4d, [t1+r4] cmovnz r6q, [base+subpel_filters+r4*8] movq m11, r6q punpcklbw m11, m11 psraw m11, 8 pshufd m5, m11, q0000 pshufd m7, m11, q1111 pshufd m10, m11, q2222 pshufd m11, m11, q3333 pmaddwd m4, m5, m0 pmaddwd m5, m5, m1 pmaddwd m6, m7, m2 pmaddwd m7, m7, m3 paddd m4, m13 paddd m5, m13 paddd m4, m6 paddd m5, m7 pmaddwd m6, [stk+0x90], m10 pmaddwd m7, [stk+0xa0], m10 pmaddwd m8, [stk+0xb0], m11 pmaddwd m9, [stk+0xc0], m11 paddd m4, m6 paddd m5, m7 %if isput pshufd m6, m12, q1032 %endif paddd m4, m8 paddd m5, m9 %else movd r0, m15 movd rX, m4 pshufd m15, m15, q0321 pshufd m4, m4, q0321 movd r4, m15 movd r5, m4 mova m14, [stk+0x110] movq m2, [base+subpel_filters+r0*8] movq m3, [base+subpel_filters+rX*8] movhps m2, [base+subpel_filters+r4*8] movhps m3, [base+subpel_filters+r5*8] psrld m14, 10 mova [stk+16], m14 mov r0, [stk+ 0] mov rX, [stk+ 4] mov r4, [stk+ 8] mov r5, [stk+12] mova [stk+0x20], m0 mova [stk+0x30], m1 mova [stk+0x40], m2 mova [stk+0x50], m3 pshufd m4, m6, q1100 pshufd m6, m6, q3322 pshufd m7, m5, q1100 pshufd m5, m5, q3322 pand m0, m9, m4 pand m1, m9, m6 pand m2, m9, m7 pand m3, m9, m5 pandn m4, [stk+0x20] pandn m6, [stk+0x30] pandn m7, [stk+0x40] pandn m5, [stk+0x50] por m0, m4 por m1, m6 por m2, m7 por m3, m5 punpcklbw m4, m0, m0 punpckhbw m0, m0 punpcklbw m5, m1, m1 punpckhbw m1, m1 psraw m4, 8 psraw m0, 8 psraw m5, 8 psraw m1, 8 punpcklbw m6, m2, m2 punpckhbw m2, m2 punpcklbw m7, m3, m3 punpckhbw m3, m3 psraw m6, 8 psraw m2, 8 psraw m7, 8 psraw m3, 8 mova [stk+0x0a0], m4 mova [stk+0x0b0], m0 mova [stk+0x0c0], m5 mova [stk+0x0d0], m1 mova [stk+0x140], m6 mova [stk+0x150], m2 mova [stk+0x160], m7 mova [stk+0x170], m3 MC_8TAP_SCALED_H 0xa0, 0x20, 0 ; 0 MC_8TAP_SCALED_H 0xa0, 0x30 ; 1 MC_8TAP_SCALED_H 0xa0, 0x40 ; 2 MC_8TAP_SCALED_H 0xa0, 0x50 ; 3 MC_8TAP_SCALED_H 0xa0, 0x60 ; 4 MC_8TAP_SCALED_H 0xa0, 0x70 ; 5 MC_8TAP_SCALED_H 0xa0, 0x80 ; 6 MC_8TAP_SCALED_H 0xa0, 0x90 ; 7 mova m5, [stk+0x60] mova m6, [stk+0x70] mova m7, [stk+0x80] mova m0, [stk+0x90] mov myd, mym punpcklwd m4, m5, m6 ; 45a punpckhwd m5, m6 ; 45b punpcklwd 
m6, m7, m0 ; 67a punpckhwd m7, m0 ; 67b mova [stk+0x60], m4 mova [stk+0x70], m5 mova [stk+0x80], m6 mova [stk+0x90], m7 mova m1, [stk+0x20] mova m2, [stk+0x30] mova m3, [stk+0x40] mova m4, [stk+0x50] punpcklwd m0, m1, m2 ; 01a punpckhwd m1, m2 ; 01b punpcklwd m2, m3, m4 ; 23a punpckhwd m3, m4 ; 23b mova [stk+0x20], m0 mova [stk+0x30], m1 mova [stk+0x40], m2 mova [stk+0x50], m3 .vloop: mov r0, r0m mov r5, [esp+0x1f4] and myd, 0x3ff mov mym, myd xor r3, r3 shr r4, 6 lea r5, [r5+r4] mov r4, 64 << 24 cmovnz r4, [base+subpel_filters+r5*8+0] cmovnz r3, [base+subpel_filters+r5*8+4] movd m7, r4 movd m6, r3 punpckldq m7, m6 punpcklbw m7, m7 psraw m7, 8 pshufd m4, m7, q0000 pshufd m5, m7, q1111 pmaddwd m0, m4 pmaddwd m1, m4 pmaddwd m2, m5 pmaddwd m3, m5 pshufd m6, m7, q2222 pshufd m7, m7, q3333 paddd m0, m2 paddd m1, m3 pmaddwd m2, [stk+0x60], m6 pmaddwd m3, [stk+0x70], m6 pmaddwd m4, [stk+0x80], m7 pmaddwd m5, [stk+0x90], m7 %if isput movd m6, [esp+0x18] %endif paddd m0, m2 paddd m1, m3 paddd m0, vrnd_mem paddd m1, vrnd_mem paddd m4, m0 paddd m5, m1 %endif %ifidn %1, put psrad m4, m6 psrad m5, m6 packssdw m4, m5 pxor m7, m7 pmaxsw m4, m7 pminsw m4, pxmaxm mova [dstq], m4 add dstq, dsm %else psrad m4, 6 psrad m5, 6 packssdw m4, m5 mova [tmpq], m4 add tmpq, tmp_stridem %endif dec hd jz .hloop_prep %if ARCH_X86_64 add myd, dyd test myd, ~0x3ff jz .vloop test myd, 0x400 mov [stk+0x140], myd mov r4d, [stk+ 0] mov r6d, [stk+ 4] mov r7d, [stk+ 8] mov r9d, [stk+12] jz .skip_line mova m14, [base+unpckw] movu m8, [srcq+r10*2] movu m9, [srcq+r11*2] movu m10, [srcq+r13*2] movu m11, [srcq+ rX*2] movu m4, [srcq+ r4*2] movu m5, [srcq+ r6*2] movu m6, [srcq+ r7*2] movu m7, [srcq+ r9*2] add srcq, ssq mov myd, [stk+0x140] mov dyd, dym pshufd m15, m14, q1032 pshufb m0, m14 ; 0a 1a pshufb m1, m14 ; 0b 1b pshufb m2, m15 ; 3a 2a pshufb m3, m15 ; 3b 2b pmaddwd m8, [stk+0x50] pmaddwd m9, [stk+0x60] pmaddwd m10, [stk+0x70] pmaddwd m11, [stk+0x80] pmaddwd m4, [stk+0x10] pmaddwd m5, [stk+0x20] pmaddwd m6, [stk+0x30] pmaddwd m7, [stk+0x40] phaddd m8, m9 phaddd m10, m11 mova m11, hround phaddd m4, m5 phaddd m6, m7 phaddd m8, m10 phaddd m4, m6 paddd m4, m11 paddd m8, m11 psrad m4, m12 psrad m8, m12 packssdw m4, m8 pshufb m5, [stk+0x90], m14 ; 4a 5a pshufb m6, [stk+0xa0], m14 ; 4b 5b pshufb m7, [stk+0xb0], m15 ; 7a 6a pshufb m8, [stk+0xc0], m15 ; 7b 6b punpckhwd m0, m2 ; 12a punpckhwd m1, m3 ; 12b punpcklwd m2, m5 ; 34a punpcklwd m3, m6 ; 34b punpckhwd m5, m7 ; 56a punpckhwd m6, m8 ; 56b punpcklwd m7, m4 ; 78a punpckhqdq m4, m4 punpcklwd m8, m4 ; 78b mova [stk+0x90], m5 mova [stk+0xa0], m6 mova [stk+0xb0], m7 mova [stk+0xc0], m8 jmp .vloop .skip_line: MC_8TAP_SCALED_H 4, 8, 5, 6, 7, 9, 10, 11 MC_8TAP_SCALED_H 8, 5, 6, 7, 9, 0, 10, 11 mov myd, [stk+0x140] mov dyd, dym mova m0, m2 ; 01a mova m1, m3 ; 01b mova m2, [stk+0x90] ; 23a mova m3, [stk+0xa0] ; 23b mova m5, [stk+0xb0] ; 45a mova m6, [stk+0xc0] ; 45b punpcklwd m7, m4, m8 ; 67a punpckhwd m4, m8 ; 67b mova [stk+0x90], m5 mova [stk+0xa0], m6 mova [stk+0xb0], m7 mova [stk+0xc0], m4 %else mov r0m, r0 mov myd, mym mov r3, r3m add myd, dym test myd, ~0x3ff mov mym, myd jnz .next_line mova m0, [stk+0x20] mova m1, [stk+0x30] mova m2, [stk+0x40] mova m3, [stk+0x50] jmp .vloop .next_line: test myd, 0x400 mov r0, [stk+ 0] mov rX, [stk+ 4] mov r4, [stk+ 8] mov r5, [stk+12] jz .skip_line MC_8TAP_SCALED_H 0xa0, 0xe0, 0 ; 8 mova m7, [base+unpckw] pshufd m4, m7, q1032 pshufb m0, [stk+0x20], m7 ; 0a 1a pshufb m1, [stk+0x30], m7 ; 0b 1b pshufb m2, [stk+0x40], m4 ; 3a 2a pshufb m3, [stk+0x50], 
m4 ; 3b 2b pshufb m5, [stk+0x60], m7 ; 4a 5a pshufb m6, [stk+0x70], m7 ; 4b 5b pshufb m7, [stk+0x80], m4 ; 7a 6a punpckhwd m0, m2 ; 12a punpckhwd m1, m3 ; 12b punpcklwd m2, m5 ; 34a punpcklwd m3, m6 ; 34b mova [stk+0x20], m0 mova [stk+0x30], m1 mova [stk+0x40], m2 mova [stk+0x50], m3 punpckhwd m5, m7 ; 56a mova [stk+0x60], m5 pshufb m5, [stk+0x90], m4 ; 7b 6b punpcklwd m7, [stk+0xe0] ; 78a punpckhwd m6, m5 ; 56b mova [stk+0x70], m6 movq m6, [stk+0xe8] mova [stk+0x80], m7 punpcklwd m5, m6 mov myd, mym mova [stk+0x90], m5 jmp .vloop .skip_line: MC_8TAP_SCALED_H 0xa0, 0xe0, 0 ; 8 MC_8TAP_SCALED_H 0xa0, 0 ; 9 mova m7, [stk+0xe0] mova m2, [stk+0x60] ; 23a mova m3, [stk+0x70] ; 23b mova m4, [stk+0x80] ; 45a mova m5, [stk+0x90] ; 45b punpcklwd m6, m7, m0 ; 67a punpckhwd m7, m0 ; 67b mova m0, [stk+0x40] ; 01a mova m1, [stk+0x50] ; 01b mov myd, mym mova [stk+0x40], m2 mova [stk+0x50], m3 mova [stk+0x60], m4 mova [stk+0x70], m5 mova [stk+0x80], m6 mova [stk+0x90], m7 mova [stk+0x20], m0 mova [stk+0x30], m1 %endif jmp .vloop INIT_XMM ssse3 .dy1: movzx wd, word [base+%1_8tap_scaled_ssse3_dy1_table+wq*2] add wq, base_reg jmp wq %if isput .dy1_w2: %if ARCH_X86_64 mov myd, mym movzx t0d, t0b sub srcq, 2 movd m15, t0d %else %define m8 m0 %define m9 m1 %define m14 m4 %define m15 m3 %define m11 [esp+0x00] %define m12 [esp+0x10] %define m13 [esp+0x20] movzx r5, byte [esp+0x1f0] sub srcq, 2 movd m15, r5 mov r1, r1m %endif pxor m9, m9 punpckldq m9, m8 paddd m14, m9 ; mx+dx*[0-1] %if ARCH_X86_64 mova m9, [base+pd_0x4000] %endif pshufd m15, m15, q0000 pand m8, m14, m10 psrld m8, 6 paddd m15, m8 movd r4d, m15 pshufd m15, m15, q0321 %if ARCH_X86_64 movd r6d, m15 %else movd r3d, m15 %endif mova m5, [base+bdct_lb_q] mova m6, [base+spel_s_shuf2] movd m15, [base+subpel_filters+r4*8+2] %if ARCH_X86_64 movd m7, [base+subpel_filters+r6*8+2] %else movd m7, [base+subpel_filters+r3*8+2] %endif pxor m2, m2 pcmpeqd m8, m2 psrld m14, 10 paddd m14, m14 %if ARCH_X86_32 mov r3, r3m pshufb m14, m5 paddb m14, m6 mova [stk], m14 SWAP m5, m0 SWAP m6, m3 %define m15 m6 %endif movu m0, [srcq+ssq*0] movu m1, [srcq+ssq*1] movu m2, [srcq+ssq*2] movu m3, [srcq+ss3q ] lea srcq, [srcq+ssq*4] punpckldq m15, m7 %if ARCH_X86_64 pshufb m14, m5 paddb m14, m6 pand m9, m8 pandn m8, m15 SWAP m15, m8 por m15, m9 movu m4, [srcq+ssq*0] movu m5, [srcq+ssq*1] movu m6, [srcq+ssq*2] add srcq, ss3q shr myd, 6 mov r4d, 64 << 24 lea myd, [t1+myq] cmovnz r4q, [base+subpel_filters+myq*8] %else pand m7, m5, [base+pd_0x4000] pandn m5, m15 por m5, m7 %define m15 m5 mov myd, mym mov r5, [esp+0x1f4] xor r3, r3 shr myd, 6 lea r5, [r5+myd] mov r4, 64 << 24 cmovnz r4, [base+subpel_filters+r5*8+0] cmovnz r3, [base+subpel_filters+r5*8+4] mov [stk+0x20], r3 mov r3, r3m %endif punpcklbw m15, m15 psraw m15, 8 REPX {pshufb x, m14}, m0, m1, m2, m3 REPX {pmaddwd x, m15}, m0, m1, m2, m3 %if ARCH_X86_64 REPX {pshufb x, m14}, m4, m5, m6 REPX {pmaddwd x, m15}, m4, m5, m6 phaddd m0, m1 phaddd m2, m3 phaddd m4, m5 phaddd m6, m6 REPX {paddd x, m11}, m0, m2, m4, m6 REPX {psrad x, m12}, m0, m2, m4, m6 packssdw m0, m2 ; 0 1 2 3 packssdw m4, m6 ; 4 5 6 SWAP m1, m4 movq m10, r4 %else mova [stk+0x10], m15 phaddd m0, m1 phaddd m2, m3 movu m1, [srcq+ssq*0] movu m7, [srcq+ssq*1] movu m6, [srcq+ssq*2] add srcq, ss3q REPX {pshufb x, m14}, m1, m7, m6 REPX {pmaddwd x, m15}, m1, m7, m6 %define m14 [stk+0x00] %define m15 [stk+0x10] phaddd m1, m7 phaddd m6, m6 REPX {paddd x, m11}, m0, m2, m1, m6 REPX {psrad x, m12}, m0, m2, m1, m6 packssdw m0, m2 packssdw m1, m6 %define m8 m6 %define m9 m4 %define 
m10 m5 movd m10, r4 movd m9, [stk+0x20] punpckldq m10, m9 %endif punpcklbw m10, m10 psraw m10, 8 pshufd m7, m10, q0000 pshufd m8, m10, q1111 pshufd m9, m10, q2222 pshufd m10, m10, q3333 %if ARCH_X86_32 mova [stk+0x50], m7 mova [stk+0x60], m8 mova [stk+0x70], m9 mova [stk+0x80], m10 %define m7 [stk+0x50] %define m8 [stk+0x60] %define m9 [stk+0x70] %define m10 [stk+0x80] %endif palignr m2, m1, m0, 4 ; 1 2 3 4 punpcklwd m3, m0, m2 ; 01 12 punpckhwd m0, m2 ; 23 34 pshufd m4, m1, q2121 ; 5 6 5 6 punpcklwd m2, m1, m4 ; 45 56 %if ARCH_X86_32 mov r0, r0m %endif .dy1_w2_loop: movu m1, [srcq+ssq*0] movu m6, [srcq+ssq*1] lea srcq, [srcq+ssq*2] pmaddwd m5, m3, m7 mova m3, m0 pmaddwd m0, m8 pshufb m1, m14 pshufb m6, m14 pmaddwd m1, m15 pmaddwd m6, m15 phaddd m1, m6 paddd m1, m11 psrad m1, m12 packssdw m1, m1 paddd m5, m0 mova m0, m2 pmaddwd m2, m9 paddd m5, m2 palignr m2, m1, m4, 12 punpcklwd m2, m1 ; 67 78 pmaddwd m4, m2, m10 paddd m5, m13 paddd m5, m4 pxor m6, m6 mova m4, m1 pshufd m1, m12, q1032 psrad m5, m1 packssdw m5, m5 pmaxsw m5, m6 pminsw m5, pxmaxm movd [dstq+dsq*0], m5 pshuflw m5, m5, q1032 movd [dstq+dsq*1], m5 lea dstq, [dstq+dsq*2] sub hd, 2 jg .dy1_w2_loop RET %endif INIT_XMM ssse3 .dy1_w4: %if ARCH_X86_64 mov myd, mym mova [rsp+0x10], m11 mova [rsp+0x20], m12 %if isput mova [rsp+0x30], m13 %define vrnd_mem [rsp+0x30] %define stk rsp+0x40 %else %define vrnd_mem [base+pd_m524256] %define stk rsp+0x30 %endif movzx t0d, t0b sub srcq, 2 movd m15, t0d %else %define m10 [base+pd_0x3ff] %define m9 [base+pd_0x4000] %define m8 m0 %xdefine m14 m4 %define m15 m3 %if isprep %define ssq r3 %endif movzx r5, byte [esp+0x1f0] sub srcq, 2 movd m15, r5 %endif pmaddwd m8, [base+rescale_mul] %if ARCH_X86_64 mova m9, [base+pd_0x4000] %endif pshufd m15, m15, q0000 paddd m14, m8 ; mx+dx*[0-3] pand m0, m14, m10 psrld m0, 6 paddd m15, m0 pshufd m7, m15, q1032 %if ARCH_X86_64 movd r4d, m15 movd r11d, m7 pshufd m15, m15, q0321 pshufd m7, m7, q0321 movd r6d, m15 movd r13d, m7 mova m10, [base+bdct_lb_q+ 0] mova m11, [base+bdct_lb_q+16] movd m13, [base+subpel_filters+ r4*8+2] movd m2, [base+subpel_filters+ r6*8+2] movd m15, [base+subpel_filters+r11*8+2] movd m4, [base+subpel_filters+r13*8+2] %else movd r0, m15 movd r4, m7 pshufd m15, m15, q0321 pshufd m7, m7, q0321 movd rX, m15 movd r5, m7 mova m5, [base+bdct_lb_q+ 0] mova m6, [base+bdct_lb_q+16] movd m1, [base+subpel_filters+r0*8+2] movd m2, [base+subpel_filters+rX*8+2] movd m3, [base+subpel_filters+r4*8+2] movd m7, [base+subpel_filters+r5*8+2] SWAP m4, m7 %if isprep mov r3, r3m %endif %define m10 m5 %define m11 m6 %define m12 m1 %define m13 m1 %endif psrld m14, 10 paddd m14, m14 punpckldq m13, m2 punpckldq m15, m4 punpcklqdq m13, m15 pxor m2, m2 pcmpeqd m0, m2 %if ARCH_X86_64 pand m9, m0 %else pand m2, m9, m0 %define m9 m2 SWAP m7, m4 %endif pandn m0, m13 %if ARCH_X86_64 SWAP m13, m0 %else %define m13 m0 %endif por m13, m9 punpckhbw m15, m13, m13 punpcklbw m13, m13 psraw m15, 8 psraw m13, 8 pshufb m12, m14, m10 pshufb m14, m11 mova m10, [base+spel_s_shuf2] movd r4d, m14 shr r4d, 24 %if ARCH_X86_32 mova [stk+0x40], m13 mova [stk+0x50], m15 pxor m2, m2 %endif pshufb m7, m14, m2 psubb m14, m7 paddb m12, m10 paddb m14, m10 %if ARCH_X86_64 lea r6, [r4+ssq*1] lea r11, [r4+ssq*2] lea r13, [r4+ss3q ] movu m7, [srcq+ssq*0] movu m9, [srcq+ssq*1] movu m8, [srcq+ssq*2] movu m10, [srcq+ss3q ] movu m1, [srcq+r4 ] movu m3, [srcq+r6 ] movu m2, [srcq+r11 ] movu m4, [srcq+r13 ] lea srcq, [srcq+ssq*4] REPX {pshufb x, m12}, m7, m9, m8, m10 REPX {pmaddwd x, m13}, m7, m9, m8, m10 REPX 
{pshufb x, m14}, m1, m3, m2, m4 REPX {pmaddwd x, m15}, m1, m3, m2, m4 mova m5, [rsp+0x10] movd xm6, [rsp+0x20] phaddd m7, m1 phaddd m9, m3 phaddd m8, m2 phaddd m10, m4 movu m1, [srcq+ssq*0] movu m2, [srcq+ssq*1] movu m3, [srcq+ssq*2] REPX {paddd x, m5}, m7, m9, m8, m10 REPX {psrad x, xm6}, m7, m9, m8, m10 packssdw m7, m9 ; 0 1 packssdw m8, m10 ; 2 3 movu m0, [srcq+r4 ] movu m9, [srcq+r6 ] movu m10, [srcq+r11 ] add srcq, ss3q REPX {pshufb x, m12}, m1, m2, m3 REPX {pmaddwd x, m13}, m1, m2, m3 REPX {pshufb x, m14}, m0, m9, m10 REPX {pmaddwd x, m15}, m0, m9, m10 phaddd m1, m0 phaddd m2, m9 phaddd m3, m10 shr myd, 6 mov r13d, 64 << 24 lea myd, [t1+myq] cmovnz r13q, [base+subpel_filters+myq*8] REPX {paddd x, m5}, m1, m2, m3 REPX {psrad x, xm6}, m1, m2, m3 packssdw m1, m2 ; 4 5 packssdw m3, m3 ; 6 6 SWAP m9, m1 shufps m4, m7, m8, q1032 ; 1 2 shufps m5, m8, m9, q1032 ; 3 4 shufps m6, m9, m3, q1032 ; 5 6 punpcklwd m0, m7, m4 ; 01 punpckhwd m7, m4 ; 12 punpcklwd m1, m8, m5 ; 23 punpckhwd m8, m5 ; 34 punpcklwd m2, m9, m6 ; 45 punpckhwd m9, m6 ; 56 movq m10, r13 mova [stk+0x00], m1 mova [stk+0x10], m8 mova [stk+0x20], m2 mova [stk+0x30], m9 mova [stk+0x40], m3 %define hrnd_mem [rsp+0x10] %define hsh_mem [rsp+0x20] %define vsh_mem [rsp+0x28] %if isput %define vrnd_mem [rsp+0x30] %else %define vrnd_mem [base+pd_m524256] %endif %else mova [stk+0x20], m12 mova [stk+0x30], m14 add r4, srcq MC_4TAP_SCALED_H 0x60 ; 0 1 MC_4TAP_SCALED_H 0x70 ; 2 3 MC_4TAP_SCALED_H 0x80 ; 4 5 movu m7, [srcq] movu m2, [r4] add srcq, ssq add r4, ssq mov [stk+0xb0], r4 pshufb m7, m12 pshufb m2, m14 pmaddwd m7, m13 pmaddwd m2, m15 phaddd m7, m2 paddd m7, [esp+0x00] psrad m7, [esp+0x10] packssdw m7, m7 ; 6 6 mova m4, [stk+0x60] mova m5, [stk+0x70] mova m6, [stk+0x80] mov myd, mym mov rX, [esp+0x1f4] xor r5, r5 shr myd, 6 lea rX, [rX+myd] mov r4, 64 << 24 cmovnz r4, [base+subpel_filters+rX*8+0] cmovnz r5, [base+subpel_filters+rX*8+4] mov r3, r3m shufps m1, m4, m5, q1032 ; 1 2 shufps m2, m5, m6, q1032 ; 3 4 shufps m3, m6, m7, q1032 ; 5 6 mova [stk+0xa0], m7 punpcklwd m0, m4, m1 ; 01 punpckhwd m4, m1 ; 12 punpcklwd m1, m5, m2 ; 23 punpckhwd m5, m2 ; 34 punpcklwd m2, m6, m3 ; 45 punpckhwd m6, m3 ; 56 movd m7, r4 movd m3, r5 mov r0, r0m %if isput mov r1, r1m %endif mov r4, [stk+0xb0] mova [stk+0xc0], m4 ; 12 mova [stk+0x60], m1 ; 23 mova [stk+0x70], m2 ; 45 mova [stk+0x80], m5 ; 34 mova [stk+0x90], m6 ; 56 %define m12 [stk+0x20] %define m14 [stk+0x30] %define m13 [stk+0x40] %define m15 [stk+0x50] %define hrnd_mem [esp+0x00] %define hsh_mem [esp+0x10] %define vsh_mem [esp+0x18] %if isput %define vrnd_mem [esp+0x20] %else %define vrnd_mem [base+pd_m524256] %endif %define m10 m7 punpckldq m10, m3 %endif punpcklbw m10, m10 psraw m10, 8 pshufd m3, m10, q0000 pshufd m4, m10, q1111 pshufd m5, m10, q2222 pshufd m10, m10, q3333 %if ARCH_X86_32 %xdefine m8 m3 %xdefine m9 m6 %xdefine m11 m5 %xdefine m6 m4 mova [stk+0x100], m3 mova [stk+0x110], m4 mova [stk+0x120], m5 mova [stk+0x130], m10 %define m3 [stk+0x100] %define m4 [stk+0x110] %define m5 [stk+0x120] %define m10 [stk+0x130] mova m7, [stk+0xc0] mova m8, [stk+0x80] %endif .dy1_w4_loop: movu m11, [srcq+ssq*0] movu m6, [srcq+ssq*1] pmaddwd m0, m3 pmaddwd m7, m3 pmaddwd m1, m4 pmaddwd m8, m4 pmaddwd m2, m5 pmaddwd m9, m5 paddd m1, m0 paddd m8, m7 %if ARCH_X86_64 movu m0, [srcq+r4] movu m7, [srcq+r6] %else movu m0, [r4+ssq*0] movu m7, [r4+ssq*1] lea r4, [r4+ssq*2] %endif lea srcq, [srcq+ssq*2] paddd m1, m2 paddd m8, m9 pshufb m11, m12 pshufb m6, m12 pmaddwd m11, m13 pmaddwd m6, m13 pshufb m0, m14 
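; finish the horizontal filter for the two newly loaded source rows (packed as "7 8" below)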
pshufb m7, m14 pmaddwd m0, m15 pmaddwd m7, m15 phaddd m11, m0 phaddd m6, m7 paddd m11, hrnd_mem paddd m6, hrnd_mem psrad m11, hsh_mem psrad m6, hsh_mem packssdw m11, m6 ; 7 8 %if ARCH_X86_64 shufps m9, [stk+0x40], m11, q1032 ; 6 7 mova m0, [stk+0x00] mova [stk+0x40], m11 %else shufps m9, [stk+0xa0], m11, q1032 ; 6 7 mova m0, [stk+0x60] mova [stk+0xa0], m11 %endif punpcklwd m2, m9, m11 ; 67 punpckhwd m9, m11 ; 78 pmaddwd m6, m2, m10 pmaddwd m7, m9, m10 %if isput movd m11, vsh_mem %endif paddd m1, vrnd_mem paddd m8, vrnd_mem paddd m1, m6 paddd m8, m7 %if ARCH_X86_64 mova m7, [stk+0x10] %else mova m7, [stk+0x80] %endif %if isput psrad m1, m11 psrad m8, m11 %else psrad m1, 6 psrad m8, 6 %endif packssdw m1, m8 %if ARCH_X86_64 mova m8, [stk+0x30] %else mova m8, [stk+0x90] %endif %if isput pxor m6, m6 pmaxsw m1, m6 pminsw m1, pxmaxm movq [dstq+dsq*0], m1 movhps [dstq+dsq*1], m1 lea dstq, [dstq+dsq*2] %else mova [tmpq], m1 add tmpq, 16 %endif %if ARCH_X86_64 mova m1, [stk+0x20] mova [stk+0x10], m8 mova [stk+0x00], m1 mova [stk+0x20], m2 mova [stk+0x30], m9 %else mova m1, [stk+0x70] mova [stk+0x80], m8 mova [stk+0x60], m1 mova [stk+0x70], m2 mova [stk+0x90], m9 %endif sub hd, 2 jg .dy1_w4_loop MC_8TAP_SCALED_RET ; why not jz .ret? INIT_XMM ssse3 .dy1_w8: mov dword [stk+0xf0], 1 movifprep tmp_stridem, 16 jmp .dy1_w_start .dy1_w16: mov dword [stk+0xf0], 2 movifprep tmp_stridem, 32 jmp .dy1_w_start .dy1_w32: mov dword [stk+0xf0], 4 movifprep tmp_stridem, 64 jmp .dy1_w_start .dy1_w64: mov dword [stk+0xf0], 8 movifprep tmp_stridem, 128 jmp .dy1_w_start .dy1_w128: mov dword [stk+0xf0], 16 movifprep tmp_stridem, 256 .dy1_w_start: mov myd, mym %if ARCH_X86_64 %ifidn %1, put movifnidn dsm, dsq %endif mova [rsp+0x10], m11 mova [rsp+0x20], m12 %define hround m11 %if isput mova [rsp+0x30], m13 %else mova m13, [base+pd_m524256] %endif shr t0d, 16 shr myd, 6 mov r4d, 64 << 24 lea myd, [t1+myq] cmovnz r4q, [base+subpel_filters+myq*8] movd m15, t0d %else %define hround [esp+0x00] %define m12 [esp+0x10] %define m10 [base+pd_0x3ff] %define m8 m0 %xdefine m14 m4 %xdefine m15 m3 %if isprep %define ssq ssm %endif mov r5, [esp+0x1f0] mov r3, [esp+0x1f4] shr r5, 16 movd m15, r5 xor r5, r5 shr myd, 6 lea r3, [r3+myd] mov r4, 64 << 24 cmovnz r4, [base+subpel_filters+r3*8+0] cmovnz r5, [base+subpel_filters+r3*8+4] mov r0, r0m mov r3, r3m %endif sub srcq, 6 pslld m7, m8, 2 ; dx*4 pmaddwd m8, [base+rescale_mul] ; dx*[0-3] pshufd m15, m15, q0000 paddd m14, m8 ; mx+dx*[0-3] %if ARCH_X86_64 movq m3, r4q %else movd m5, r4 movd m6, r5 punpckldq m5, m6 SWAP m3, m5 %endif punpcklbw m3, m3 psraw m3, 8 mova [stk+0x100], m7 mova [stk+0x120], m15 mov [stk+0x0f8], srcq mov [stk+0x130], r0q ; dstq / tmpq pshufd m0, m3, q0000 pshufd m1, m3, q1111 pshufd m2, m3, q2222 pshufd m3, m3, q3333 %if ARCH_X86_64 mova [stk+0x140], m0 mova [stk+0x150], m1 mova [stk+0x160], m2 mova [stk+0x170], m3 %if UNIX64 mov hm, hd %endif %else mova [stk+0x180], m0 mova [stk+0x190], m1 mova [stk+0x1a0], m2 mova [stk+0x1b0], m3 SWAP m5, m3 mov r5, hm mov [stk+0x134], r5 %endif jmp .dy1_hloop .dy1_hloop_prep: dec dword [stk+0x0f0] jz .ret %if ARCH_X86_64 add qword [stk+0x130], 16 mov hd, hm %else add dword [stk+0x130], 16 mov r5, [stk+0x134] mov r0, [stk+0x130] %endif mova m7, [stk+0x100] mova m14, [stk+0x110] %if ARCH_X86_64 mova m10, [base+pd_0x3ff] mova m11, [rsp+0x10] %endif mova m15, [stk+0x120] mov srcq, [stk+0x0f8] %if ARCH_X86_64 mov r0q, [stk+0x130] ; dstq / tmpq %else mov hm, r5 mov r0m, r0 mov r3, r3m %endif paddd m14, m7 .dy1_hloop: %if ARCH_X86_64 mova 
m9, [base+pq_0x40000000] %else %define m9 [base+pq_0x40000000] %endif pxor m1, m1 psrld m2, m14, 10 mova [stk], m2 pand m6, m14, m10 psrld m6, 6 paddd m5, m15, m6 pcmpeqd m6, m1 pshufd m2, m5, q1032 %if ARCH_X86_64 movd r4d, m5 movd r6d, m2 pshufd m5, m5, q0321 pshufd m2, m2, q0321 movd r7d, m5 movd r9d, m2 movq m0, [base+subpel_filters+r4*8] movq m1, [base+subpel_filters+r6*8] movhps m0, [base+subpel_filters+r7*8] movhps m1, [base+subpel_filters+r9*8] %else movd r0, m5 movd rX, m2 pshufd m5, m5, q0321 pshufd m2, m2, q0321 movd r4, m5 movd r5, m2 movq m0, [base+subpel_filters+r0*8] movq m1, [base+subpel_filters+rX*8] movhps m0, [base+subpel_filters+r4*8] movhps m1, [base+subpel_filters+r5*8] %endif paddd m14, m7 ; mx+dx*[4-7] pand m5, m14, m10 psrld m5, 6 paddd m15, m5 pxor m2, m2 pcmpeqd m5, m2 mova [stk+0x110], m14 pshufd m4, m15, q1032 %if ARCH_X86_64 movd r10d, m15 movd r11d, m4 pshufd m15, m15, q0321 pshufd m4, m4, q0321 movd r13d, m15 movd rXd, m4 movq m2, [base+subpel_filters+r10*8] movq m3, [base+subpel_filters+r11*8] movhps m2, [base+subpel_filters+r13*8] movhps m3, [base+subpel_filters+ rX*8] psrld m14, 10 movq r11, m14 punpckhqdq m14, m14 movq rX, m14 mov r10d, r11d shr r11, 32 mov r13d, rXd shr rX, 32 mov r4d, [stk+ 0] mov r6d, [stk+ 4] mov r7d, [stk+ 8] mov r9d, [stk+12] pshufd m4, m6, q1100 pshufd m6, m6, q3322 pshufd m14, m5, q1100 pshufd m5, m5, q3322 pand m7, m9, m4 pand m8, m9, m6 pand m15, m9, m14 pand m9, m9, m5 pandn m4, m0 pandn m6, m1 pandn m14, m2 pandn m5, m3 por m7, m4 por m8, m6 por m15, m14 por m9, m5 punpcklbw m0, m7, m7 punpckhbw m7, m7 punpcklbw m1, m8, m8 punpckhbw m8, m8 psraw m0, 8 psraw m7, 8 psraw m1, 8 psraw m8, 8 punpcklbw m2, m15, m15 punpckhbw m15, m15 punpcklbw m3, m9, m9 punpckhbw m9, m9 psraw m2, 8 psraw m15, 8 psraw m3, 8 psraw m9, 8 mova [stk+0x10], m0 mova [stk+0x20], m7 mova [stk+0x30], m1 mova [stk+0x40], m8 mova [stk+0x50], m2 mova [stk+0x60], m15 mova [stk+0x70], m3 mova [stk+0x80], m9 MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 9, 10 ; 0 mova [stk+0x90], m1 MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 1, 9, 10 ; 1 mova [stk+0xa0], m2 MC_8TAP_SCALED_H 3, 4, 5, 6, 1, 2, 9, 10 ; 2 mova [stk+0xb0], m3 MC_8TAP_SCALED_H 4, 5, 6, 1, 2, 3, 9, 10 ; 3 mova [stk+0xc0], m4 MC_8TAP_SCALED_H 5, 6, 1, 2, 3, 4, 9, 10 ; 4 mova [stk+0xd0], m5 MC_8TAP_SCALED_H 6, 1, 2, 3, 4, 5, 9, 10 ; 5 MC_8TAP_SCALED_H 7, 1, 2, 3, 4, 5, 9, 10 ; 6 MC_8TAP_SCALED_H 8, 1, 2, 3, 4, 5, 9, 10 ; 7 mova m5, [stk+0xd0] mova m1, [stk+0x90] mova m2, [stk+0xa0] mova m3, [stk+0xb0] mova m9, [stk+0xc0] punpcklwd m4, m5, m6 ; 45a punpckhwd m5, m6 ; 45b punpcklwd m6, m7, m8 ; 67a punpckhwd m7, m8 ; 67b punpcklwd m0, m1, m2 ; 01a punpckhwd m1, m2 ; 01b punpcklwd m2, m3, m9 ; 23a punpckhwd m3, m9 ; 23b mova m10, [stk+0x140] mova m11, [stk+0x150] mova m14, [stk+0x160] mova m15, [stk+0x170] mova [stk+0x90], m4 mova [stk+0xa0], m5 mova [stk+0xb0], m6 mova [stk+0xc0], m7 %define hround [rsp+0x10] %define shift [rsp+0x20] %if isput %define vround [rsp+0x30] %else %define vround [base+pd_m524256] %endif .dy1_vloop: pmaddwd m4, m0, m10 pmaddwd m5, m1, m10 pmaddwd m6, m2, m11 pmaddwd m7, m3, m11 paddd m4, m13 paddd m5, m13 paddd m4, m6 paddd m5, m7 pmaddwd m6, [stk+0x90], m14 pmaddwd m7, [stk+0xa0], m14 pmaddwd m8, [stk+0xb0], m15 pmaddwd m9, [stk+0xc0], m15 paddd m4, m6 paddd m5, m7 %if isput pshufd m6, m12, q1032 %endif paddd m4, m8 paddd m5, m9 %else movd r0, m15 movd rX, m4 pshufd m15, m15, q0321 pshufd m4, m4, q0321 movd r4, m15 movd r5, m4 mova m14, [stk+0x110] movq m2, [base+subpel_filters+r0*8] movq m3, 
[base+subpel_filters+rX*8] movhps m2, [base+subpel_filters+r4*8] movhps m3, [base+subpel_filters+r5*8] psrld m14, 10 mova [stk+16], m14 mov r0, [stk+ 0] mov rX, [stk+ 4] mov r4, [stk+ 8] mov r5, [stk+12] mova [stk+0x20], m0 mova [stk+0x30], m1 mova [stk+0x40], m2 mova [stk+0x50], m3 pshufd m4, m6, q1100 pshufd m6, m6, q3322 pshufd m7, m5, q1100 pshufd m5, m5, q3322 pand m0, m9, m4 pand m1, m9, m6 pand m2, m9, m7 pand m3, m9, m5 pandn m4, [stk+0x20] pandn m6, [stk+0x30] pandn m7, [stk+0x40] pandn m5, [stk+0x50] por m0, m4 por m1, m6 por m2, m7 por m3, m5 punpcklbw m4, m0, m0 punpckhbw m0, m0 punpcklbw m5, m1, m1 punpckhbw m1, m1 psraw m4, 8 psraw m0, 8 psraw m5, 8 psraw m1, 8 punpcklbw m6, m2, m2 punpckhbw m2, m2 punpcklbw m7, m3, m3 punpckhbw m3, m3 psraw m6, 8 psraw m2, 8 psraw m7, 8 psraw m3, 8 mova [stk+0x0a0], m4 mova [stk+0x0b0], m0 mova [stk+0x0c0], m5 mova [stk+0x0d0], m1 mova [stk+0x140], m6 mova [stk+0x150], m2 mova [stk+0x160], m7 mova [stk+0x170], m3 MC_8TAP_SCALED_H 0xa0, 0x20, 0 ; 0 MC_8TAP_SCALED_H 0xa0, 0x30 ; 1 MC_8TAP_SCALED_H 0xa0, 0x40 ; 2 MC_8TAP_SCALED_H 0xa0, 0x50 ; 3 MC_8TAP_SCALED_H 0xa0, 0x60 ; 4 MC_8TAP_SCALED_H 0xa0, 0x70 ; 5 MC_8TAP_SCALED_H 0xa0, 0x80 ; 6 MC_8TAP_SCALED_H 0xa0, 0x90 ; 7 mova m5, [stk+0x60] mova m6, [stk+0x70] mova m7, [stk+0x80] mova m0, [stk+0x90] mov r0, r0m punpcklwd m4, m5, m6 ; 45a punpckhwd m5, m6 ; 45b punpcklwd m6, m7, m0 ; 67a punpckhwd m7, m0 ; 67b mova [stk+0x60], m4 mova [stk+0x70], m5 mova [stk+0x80], m6 mova [stk+0x90], m7 mova m1, [stk+0x20] mova m2, [stk+0x30] mova m3, [stk+0x40] mova m4, [stk+0x50] punpcklwd m0, m1, m2 ; 01a punpckhwd m1, m2 ; 01b punpcklwd m2, m3, m4 ; 23a punpckhwd m3, m4 ; 23b mova m4, [stk+0x180] mova m5, [stk+0x190] mova m6, [stk+0x1a0] mova m7, [stk+0x1b0] mova [stk+0x20], m0 mova [stk+0x30], m1 mova [stk+0x40], m2 mova [stk+0x50], m3 .dy1_vloop: pmaddwd m0, m4 pmaddwd m1, m4 pmaddwd m2, m5 pmaddwd m3, m5 paddd m0, m2 paddd m1, m3 pmaddwd m2, [stk+0x60], m6 pmaddwd m3, [stk+0x70], m6 pmaddwd m4, [stk+0x80], m7 pmaddwd m5, [stk+0x90], m7 %if isput movd m6, [esp+0x18] %endif paddd m0, m2 paddd m1, m3 paddd m0, vrnd_mem paddd m1, vrnd_mem paddd m4, m0 paddd m5, m1 %endif %ifidn %1, put psrad m4, m6 psrad m5, m6 packssdw m4, m5 pxor m7, m7 pmaxsw m4, m7 pminsw m4, pxmaxm mova [dstq], m4 add dstq, dsm %else psrad m4, 6 psrad m5, 6 packssdw m4, m5 mova [tmpq], m4 add tmpq, tmp_stridem %endif dec hd jz .dy1_hloop_prep %if ARCH_X86_64 movu m8, [srcq+r10*2] movu m9, [srcq+r11*2] movu m12, [srcq+r13*2] movu m13, [srcq+ rX*2] movu m4, [srcq+ r4*2] movu m5, [srcq+ r6*2] movu m6, [srcq+ r7*2] movu m7, [srcq+ r9*2] add srcq, ssq pmaddwd m8, [stk+0x50] pmaddwd m9, [stk+0x60] pmaddwd m12, [stk+0x70] pmaddwd m13, [stk+0x80] pmaddwd m4, [stk+0x10] pmaddwd m5, [stk+0x20] pmaddwd m6, [stk+0x30] pmaddwd m7, [stk+0x40] phaddd m8, m9 phaddd m12, m13 mova m9, [base+unpckw] mova m13, hround phaddd m4, m5 phaddd m6, m7 phaddd m8, m12 phaddd m4, m6 pshufd m5, m9, q1032 pshufb m0, m9 ; 0a 1a pshufb m1, m9 ; 0b 1b pshufb m2, m5 ; 3a 2a pshufb m3, m5 ; 3b 2b mova m12, shift paddd m4, m13 paddd m8, m13 psrad m4, m12 psrad m8, m12 packssdw m4, m8 pshufb m6, [stk+0x90], m9 ; 4a 5a pshufb m7, [stk+0xa0], m9 ; 4b 5b pshufb m8, [stk+0xb0], m5 ; 7a 6a pshufb m13, [stk+0xc0], m5 ; 7b 6b punpckhwd m0, m2 ; 12a punpckhwd m1, m3 ; 12b punpcklwd m2, m6 ; 34a punpcklwd m3, m7 ; 34b punpckhwd m6, m8 ; 56a punpckhwd m7, m13 ; 56b punpcklwd m8, m4 ; 78a punpckhqdq m4, m4 punpcklwd m13, m4 ; 78b mova [stk+0x90], m6 mova [stk+0xa0], m7 mova [stk+0xb0], 
m8 mova [stk+0xc0], m13 mova m13, vround %else mov r0m, r0 mov r3, r3m mov r0, [stk+ 0] mov rX, [stk+ 4] mov r4, [stk+ 8] mov r5, [stk+12] MC_8TAP_SCALED_H 0xa0, 0xe0, 0 ; 8 mova m7, [base+unpckw] pshufd m4, m7, q1032 pshufb m0, [stk+0x20], m7 ; 0a 1a pshufb m1, [stk+0x30], m7 ; 0b 1b pshufb m2, [stk+0x40], m4 ; 3a 2a pshufb m3, [stk+0x50], m4 ; 3b 2b pshufb m5, [stk+0x60], m7 ; 4a 5a pshufb m6, [stk+0x70], m7 ; 4b 5b pshufb m7, [stk+0x80], m4 ; 7a 6a punpckhwd m0, m2 ; 12a punpckhwd m1, m3 ; 12b punpcklwd m2, m5 ; 34a punpcklwd m3, m6 ; 34b mova [stk+0x20], m0 mova [stk+0x30], m1 mova [stk+0x40], m2 mova [stk+0x50], m3 punpckhwd m5, m7 ; 56a mova [stk+0x60], m5 pshufb m5, [stk+0x90], m4 ; 7b 6b punpcklwd m7, [stk+0xe0] ; 78a mova m4, [stk+0x180] punpckhwd m6, m5 ; 56b mova [stk+0x70], m6 movq m6, [stk+0xe8] mova [stk+0x80], m7 mova m7, [stk+0x1b0] punpcklwd m5, m6 mova m6, [stk+0x1a0] mova [stk+0x90], m5 mova m5, [stk+0x190] mov r0, r0m %endif jmp .dy1_vloop INIT_XMM ssse3 %if ARCH_X86_64 %define stk rsp+0x20 %endif .dy2: movzx wd, word [base+%1_8tap_scaled_ssse3_dy2_table+wq*2] add wq, base_reg jmp wq %if isput .dy2_w2: %if ARCH_X86_64 mov myd, mym mova [rsp+0x10], m13 %define vrnd_mem [rsp+0x10] movzx t0d, t0b sub srcq, 2 movd m15, t0d %else %define m8 m0 %define m9 m1 %define m14 m4 %define m15 m3 %define m11 [esp+0x00] %define m12 [esp+0x10] %define vrnd_mem [esp+0x20] mov r1, r1m movzx r5, byte [esp+0x1f0] sub srcq, 2 movd m15, r5 %endif pxor m9, m9 punpckldq m9, m8 paddd m14, m9 ; mx+dx*[0-1] %if ARCH_X86_64 mova m9, [base+pd_0x4000] %endif pshufd m15, m15, q0000 pand m8, m14, m10 psrld m8, 6 paddd m15, m8 movd r4d, m15 pshufd m15, m15, q0321 %if ARCH_X86_64 movd r6d, m15 %else movd r3d, m15 %endif mova m5, [base+bdct_lb_q] mova m6, [base+spel_s_shuf2] movd m15, [base+subpel_filters+r4*8+2] %if ARCH_X86_64 movd m7, [base+subpel_filters+r6*8+2] %else movd m7, [base+subpel_filters+r3*8+2] %endif pxor m2, m2 pcmpeqd m8, m2 psrld m14, 10 paddd m14, m14 %if ARCH_X86_32 mov r3, r3m pshufb m14, m5 paddb m14, m6 mova [stk], m14 SWAP m5, m0 SWAP m6, m3 %define m15 m6 %endif movu m0, [srcq+ssq*0] movu m1, [srcq+ssq*2] movu m2, [srcq+ssq*4] punpckldq m15, m7 %if ARCH_X86_64 pshufb m14, m5 paddb m14, m6 pand m9, m8 pandn m8, m15 SWAP m15, m8 por m15, m9 movu m4, [srcq+ssq*1] movu m5, [srcq+ss3q ] lea srcq, [srcq+ssq*4] movu m6, [srcq+ssq*1] lea srcq, [srcq+ssq*2] shr myd, 6 mov r4d, 64 << 24 lea myd, [t1+myq] cmovnz r4q, [base+subpel_filters+myq*8] %else pand m7, m5, [base+pd_0x4000] pandn m5, m15 por m5, m7 %define m15 m5 mov myd, mym mov r5, [esp+0x1f4] xor r3, r3 shr myd, 6 lea r5, [r5+myd] mov r4, 64 << 24 cmovnz r4, [base+subpel_filters+r5*8+0] cmovnz r3, [base+subpel_filters+r5*8+4] mov [stk+0x20], r3 mov r3, r3m %endif punpcklbw m15, m15 psraw m15, 8 REPX {pshufb x, m14}, m0, m1, m2 REPX {pmaddwd x, m15}, m0, m1, m2 %if ARCH_X86_64 REPX {pshufb x, m14}, m4, m5, m6 REPX {pmaddwd x, m15}, m4, m5, m6 phaddd m0, m1 phaddd m1, m2 phaddd m4, m5 phaddd m5, m6 REPX {paddd x, m11}, m0, m1, m4, m5 REPX {psrad x, m12}, m0, m1, m4, m5 packssdw m0, m1 ; 0 2 2 4 packssdw m4, m5 ; 1 3 3 5 SWAP m2, m4 movq m10, r4 %else mova [stk+0x10], m15 phaddd m0, m1 phaddd m1, m2 movu m2, [srcq+ssq*1] movu m7, [srcq+ss3q ] lea srcq, [srcq+ssq*4] movu m6, [srcq+ssq*1] lea srcq, [srcq+ssq*2] REPX {pshufb x, m14}, m2, m7, m6 REPX {pmaddwd x, m15}, m2, m7, m6 %define m14 [stk+0x00] %define m15 [stk+0x10] phaddd m2, m7 phaddd m7, m6 REPX {paddd x, m11}, m0, m1, m2, m7 REPX {psrad x, m12}, m0, m1, m2, m7 packssdw m0, m1 
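; x86-32: m0 packs rows 0 2 2 4 and m2 (below) packs rows 1 3 3 5, matching the x86-64 path's comments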
packssdw m2, m7 %define m8 m6 %define m9 m4 %define m10 m5 movd m10, r4 movd m9, [stk+0x20] punpckldq m10, m9 %endif punpcklbw m10, m10 psraw m10, 8 pshufd m7, m10, q0000 pshufd m8, m10, q1111 pshufd m9, m10, q2222 pshufd m10, m10, q3333 %if ARCH_X86_32 mova [stk+0x50], m7 mova [stk+0x60], m8 mova [stk+0x70], m9 mova [stk+0x80], m10 %xdefine m13 m7 %define m7 [stk+0x50] %define m8 [stk+0x60] %define m9 [stk+0x70] %define m10 [stk+0x80] %endif punpcklwd m1, m0, m2 ; 01 23 punpckhwd m3, m0, m2 ; 23 45 %if ARCH_X86_32 mov r4, r0m %define dstq r4 mova [stk+0x20], m3 mova [stk+0x30], m0 %endif .dy2_w2_loop: movu m4, [srcq+ssq*0] movu m5, [srcq+ssq*1] movu m6, [srcq+ssq*2] movu m13, [srcq+ss3q ] lea srcq, [srcq+ssq*4] pmaddwd m3, m8 REPX {pshufb x, m14}, m4, m5, m6, m13 REPX {pmaddwd x, m15}, m4, m5, m6, m13 phaddd m4, m5 phaddd m6, m13 pmaddwd m5, m1, m7 paddd m4, m11 paddd m6, m11 psrad m4, m12 psrad m6, m12 packssdw m4, m6 ; 6 7 8 9 paddd m5, m3 pshufd m3, m4, q2200 pshufd m4, m4, q3311 palignr m3, m0, 12 ; 4 6 6 8 palignr m4, m2, 12 ; 5 7 7 9 mova m0, m3 mova m2, m4 punpcklwd m1, m3, m4 punpckhwd m3, m4 pmaddwd m6, m1, m9 pmaddwd m4, m3, m10 paddd m5, vrnd_mem paddd m6, m4 paddd m5, m6 pshufd m4, m12, q1032 pxor m6, m6 psrad m5, m4 packssdw m5, m5 pmaxsw m5, m6 pminsw m5, pxmaxm movd [dstq+dsq*0], m5 pshuflw m5, m5, q1032 movd [dstq+dsq*1], m5 lea dstq, [dstq+dsq*2] sub hd, 2 jg .dy2_w2_loop RET %endif INIT_XMM ssse3 .dy2_w4: %if ARCH_X86_64 mov myd, mym mova [rsp+0x10], m11 mova [rsp+0x20], m12 %if isput mova [rsp+0x30], m13 %define vrnd_mem [rsp+0x30] %define stk rsp+0x40 %else %define vrnd_mem [base+pd_m524256] %define stk rsp+0x30 %endif movzx t0d, t0b sub srcq, 2 movd m15, t0d %else %define m10 [base+pd_0x3ff] %define m9 [base+pd_0x4000] %define m8 m0 %xdefine m14 m4 %define m15 m3 %if isprep %define ssq r3 %endif movzx r5, byte [esp+0x1f0] sub srcq, 2 movd m15, r5 %endif pmaddwd m8, [base+rescale_mul] %if ARCH_X86_64 mova m9, [base+pd_0x4000] %endif pshufd m15, m15, q0000 paddd m14, m8 ; mx+dx*[0-3] pand m0, m14, m10 psrld m0, 6 paddd m15, m0 pshufd m7, m15, q1032 %if ARCH_X86_64 movd r4d, m15 movd r11d, m7 pshufd m15, m15, q0321 pshufd m7, m7, q0321 movd r6d, m15 movd r13d, m7 mova m10, [base+bdct_lb_q+ 0] mova m11, [base+bdct_lb_q+16] movd m13, [base+subpel_filters+ r4*8+2] movd m2, [base+subpel_filters+ r6*8+2] movd m15, [base+subpel_filters+r11*8+2] movd m4, [base+subpel_filters+r13*8+2] %else movd r1, m15 movd r4, m7 pshufd m15, m15, q0321 pshufd m7, m7, q0321 movd r3, m15 movd r5, m7 mova m5, [base+bdct_lb_q+ 0] mova m6, [base+bdct_lb_q+16] movd m1, [base+subpel_filters+r1*8+2] movd m2, [base+subpel_filters+r3*8+2] movd m3, [base+subpel_filters+r4*8+2] movd m7, [base+subpel_filters+r5*8+2] SWAP m4, m7 mov r3, r3m %if isprep lea ss3q, [ssq*3] %endif %define m10 m5 %define m11 m6 %define m12 m1 %define m13 m1 %endif psrld m14, 10 paddd m14, m14 punpckldq m13, m2 punpckldq m15, m4 punpcklqdq m13, m15 pxor m2, m2 pcmpeqd m0, m2 %if ARCH_X86_64 pand m9, m0 %else pand m2, m9, m0 %define m9 m2 SWAP m7, m4 %endif pandn m0, m13 %if ARCH_X86_64 SWAP m13, m0 %else %define m13 m0 %endif por m13, m9 punpckhbw m15, m13, m13 punpcklbw m13, m13 psraw m15, 8 psraw m13, 8 pshufb m12, m14, m10 pshufb m14, m11 mova m10, [base+spel_s_shuf2] movd r4d, m14 shr r4d, 24 %if ARCH_X86_32 mova [stk+0x40], m13 mova [stk+0x50], m15 pxor m2, m2 %endif pshufb m7, m14, m2 psubb m14, m7 paddb m12, m10 paddb m14, m10 %if ARCH_X86_64 lea r6, [r4+ssq*1] lea r11, [r4+ssq*2] lea r13, [r4+ss3q ] movu m1, [srcq+ssq*0] 
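; load the first four source rows for the width-4 dy=2 setup; the [srcq+r4]-relative loads that follow fetch the other half of each row's horizontal window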
movu m8, [srcq+ssq*2] movu m9, [srcq+ssq*1] movu m10, [srcq+ss3q ] movu m7, [srcq+r4 ] movu m2, [srcq+r11 ] movu m3, [srcq+r6 ] movu m4, [srcq+r13 ] lea srcq, [srcq+ssq*4] REPX {pshufb x, m12}, m1, m9, m8, m10 REPX {pmaddwd x, m13}, m1, m9, m8, m10 REPX {pshufb x, m14}, m7, m3, m2, m4 REPX {pmaddwd x, m15}, m7, m3, m2, m4 mova m5, [rsp+0x10] movd xm6, [rsp+0x20] phaddd m1, m7 phaddd m8, m2 phaddd m9, m3 phaddd m10, m4 movu m2, [srcq+ssq*0] movu m3, [srcq+ssq*1] REPX {paddd x, m5}, m1, m9, m8, m10 REPX {psrad x, xm6}, m1, m9, m8, m10 packssdw m1, m8 ; 0 2 packssdw m9, m10 ; 1 3 movu m0, [srcq+r4 ] movu m8, [srcq+r6 ] lea srcq, [srcq+ssq*2] REPX {pshufb x, m12}, m2, m3 REPX {pmaddwd x, m13}, m2, m3 REPX {pshufb x, m14}, m0, m8 REPX {pmaddwd x, m15}, m0, m8 phaddd m2, m0 phaddd m3, m8 shr myd, 6 mov r9d, 64 << 24 lea myd, [t1+myq] cmovnz r9q, [base+subpel_filters+myq*8] REPX {paddd x, m5}, m2, m3 REPX {psrad x, xm6}, m2, m3 packssdw m2, m3 ; 4 5 pshufd m3, m2, q1032 ; 5 _ punpcklwd m0, m1, m9 ; 01 punpckhwd m1, m9 ; 23 punpcklwd m2, m3 ; 45 movq m10, r9 %define hrnd_mem [rsp+0x10] %define hsh_mem [rsp+0x20] %define vsh_mem [rsp+0x28] %if isput %define vrnd_mem [rsp+0x30] %else %define vrnd_mem [base+pd_m524256] %endif %else mova [stk+0x20], m12 mova [stk+0x30], m14 add r4, srcq MC_4TAP_SCALED_H 0x60 ; 0 1 MC_4TAP_SCALED_H 0x70 ; 2 3 MC_4TAP_SCALED_H 0x80 ; 4 5 mov [stk+0xe0], r4 mova m3, [base+spel_s_shuf8] mova m0, [stk+0x60] mova m1, [stk+0x70] mova m2, [stk+0x80] mov myd, mym mov rX, [esp+0x1f4] xor r5, r5 shr myd, 6 lea rX, [rX+myd] mov r4, 64 << 24 cmovnz r4, [base+subpel_filters+rX*8+0] cmovnz r5, [base+subpel_filters+rX*8+4] mov r3, r3m pshufb m0, m3 ; 01 pshufb m1, m3 ; 23 pshufb m2, m3 ; 45 movd m7, r4 movd m4, r5 mov r5, r0m %if isput mov r1, r1m %endif mov r4, [stk+0xe0] %define dstq r5 %define tmpq r5 %define m12 [stk+0x20] %define m14 [stk+0x30] %define m13 [stk+0x40] %define m15 [stk+0x50] %define hrnd_mem [esp+0x00] %define hsh_mem [esp+0x10] %define vsh_mem [esp+0x18] %if isput %define vrnd_mem [esp+0x20] %else %define vrnd_mem [base+pd_m524256] %endif %define m10 m7 punpckldq m10, m4 %endif punpcklbw m10, m10 psraw m10, 8 pshufd m3, m10, q0000 pshufd m4, m10, q1111 pshufd m5, m10, q2222 pshufd m10, m10, q3333 %if ARCH_X86_32 %xdefine m8 m3 %xdefine m9 m6 %xdefine m11 m5 %xdefine m6 m4 mova [stk+0x100], m3 mova [stk+0x110], m4 mova [stk+0x120], m5 mova [stk+0x130], m10 %define m3 [stk+0x100] %define m4 [stk+0x110] %define m5 [stk+0x120] %define m10 [stk+0x130] %endif .dy2_w4_loop: pmaddwd m8, m0, m3 pmaddwd m9, m1, m3 mova m0, m2 pmaddwd m1, m4 pmaddwd m11, m2, m4 paddd m8, vrnd_mem paddd m9, vrnd_mem pmaddwd m2, m5 paddd m8, m1 paddd m9, m11 paddd m8, m2 movu m6, [srcq+ssq*0] movu m1, [srcq+ssq*2] %if ARCH_X86_64 movu m11, [srcq+r4 ] movu m2, [srcq+r11] %else movu m11, [r4+ssq*0] movu m2, [r4+ssq*2] %endif pshufb m6, m12 pshufb m1, m12 pmaddwd m6, m13 pmaddwd m1, m13 pshufb m11, m14 pshufb m2, m14 pmaddwd m11, m15 pmaddwd m2, m15 phaddd m6, m11 phaddd m1, m2 paddd m6, hrnd_mem paddd m1, hrnd_mem psrad m6, hsh_mem psrad m1, hsh_mem movu m7, [srcq+ssq*1] movu m11, [srcq+ss3q ] packssdw m6, m1 ; 6 8 %if ARCH_X86_64 movu m2, [srcq+r6 ] movu m1, [srcq+r13] %else movu m2, [r4+ssq*1] movu m1, [r4+ss3q ] %endif pshufb m7, m12 pshufb m11, m12 pmaddwd m7, m13 pmaddwd m11, m13 pshufb m2, m14 pshufb m1, m14 pmaddwd m2, m15 pmaddwd m1, m15 phaddd m7, m2 phaddd m11, m1 paddd m7, hrnd_mem paddd m11, hrnd_mem psrad m7, hsh_mem psrad m11, hsh_mem packssdw m7, m11 ; 7 9 %if ARCH_X86_32 lea r4, 
[r4+ssq*4] %endif lea srcq, [srcq+ssq*4] punpcklwd m1, m6, m7 ; 67 punpckhwd m6, m7 ; 89 mova m2, m6 pmaddwd m11, m1, m5 pmaddwd m7, m1, m10 pmaddwd m6, m10 paddd m9, m11 %if isput movd m11, vsh_mem %endif paddd m8, m7 paddd m9, m6 %if isput psrad m8, m11 psrad m9, m11 packssdw m8, m9 pxor m7, m7 pmaxsw m8, m7 pminsw m8, pxmaxm movq [dstq+dsq*0], m8 movhps [dstq+dsq*1], m8 lea dstq, [dstq+dsq*2] %else psrad m8, 6 psrad m9, 6 packssdw m8, m9 mova [tmpq], m8 add tmpq, 16 %endif sub hd, 2 jg .dy2_w4_loop MC_8TAP_SCALED_RET ; why not jz .ret? INIT_XMM ssse3 .dy2_w8: mov dword [stk+0xf0], 1 movifprep tmp_stridem, 16 jmp .dy2_w_start .dy2_w16: mov dword [stk+0xf0], 2 movifprep tmp_stridem, 32 jmp .dy2_w_start .dy2_w32: mov dword [stk+0xf0], 4 movifprep tmp_stridem, 64 jmp .dy2_w_start .dy2_w64: mov dword [stk+0xf0], 8 movifprep tmp_stridem, 128 jmp .dy2_w_start .dy2_w128: mov dword [stk+0xf0], 16 movifprep tmp_stridem, 256 .dy2_w_start: mov myd, mym %if ARCH_X86_64 %ifidn %1, put movifnidn dsm, dsq %endif mova [rsp+0x10], m11 mova [rsp+0x20], m12 %define hround m11 %if isput mova [rsp+0x30], m13 %else mova m13, [base+pd_m524256] %endif shr t0d, 16 shr myd, 6 mov r4d, 64 << 24 lea myd, [t1+myq] cmovnz r4q, [base+subpel_filters+myq*8] movd m15, t0d %else %define hround [esp+0x00] %define m12 [esp+0x10] %define m10 [base+pd_0x3ff] %define m8 m0 %xdefine m14 m4 %xdefine m15 m3 %if isput %define dstq r0 %else %define tmpq r0 %define ssq ssm %endif mov r5, [esp+0x1f0] mov r3, [esp+0x1f4] shr r5, 16 movd m15, r5 xor r5, r5 shr myd, 6 lea r3, [r3+myd] mov r4, 64 << 24 cmovnz r4, [base+subpel_filters+r3*8+0] cmovnz r5, [base+subpel_filters+r3*8+4] mov r0, r0m mov r3, r3m %endif sub srcq, 6 pslld m7, m8, 2 ; dx*4 pmaddwd m8, [base+rescale_mul] ; dx*[0-3] pshufd m15, m15, q0000 paddd m14, m8 ; mx+dx*[0-3] %if ARCH_X86_64 movq m3, r4q %else movd m5, r4 movd m6, r5 punpckldq m5, m6 SWAP m3, m5 %endif punpcklbw m3, m3 psraw m3, 8 mova [stk+0x100], m7 mova [stk+0x120], m15 mov [stk+0x0f8], srcq mov [stk+0x130], r0q ; dstq / tmpq pshufd m0, m3, q0000 pshufd m1, m3, q1111 pshufd m2, m3, q2222 pshufd m3, m3, q3333 %if ARCH_X86_64 mova [stk+0x140], m0 mova [stk+0x150], m1 mova [stk+0x160], m2 mova [stk+0x170], m3 %if UNIX64 mov hm, hd %endif %else mova [stk+0x180], m0 mova [stk+0x190], m1 mova [stk+0x1a0], m2 mova [stk+0x1b0], m3 SWAP m5, m3 mov r5, hm mov [stk+0x134], r5 %endif jmp .dy2_hloop .dy2_hloop_prep: dec dword [stk+0x0f0] jz .ret %if ARCH_X86_64 add qword [stk+0x130], 16 mov hd, hm %else add dword [stk+0x130], 16 mov r5, [stk+0x134] mov r0, [stk+0x130] %endif mova m7, [stk+0x100] mova m14, [stk+0x110] %if ARCH_X86_64 mova m10, [base+pd_0x3ff] mova m11, [rsp+0x10] %endif mova m15, [stk+0x120] mov srcq, [stk+0x0f8] %if ARCH_X86_64 mov r0q, [stk+0x130] ; dstq / tmpq %else mov hm, r5 mov r0m, r0 mov r3, r3m %endif paddd m14, m7 .dy2_hloop: %if ARCH_X86_64 mova m9, [base+pq_0x40000000] %else %define m9 [base+pq_0x40000000] %endif pxor m1, m1 psrld m2, m14, 10 mova [stk], m2 pand m6, m14, m10 psrld m6, 6 paddd m5, m15, m6 pcmpeqd m6, m1 pshufd m2, m5, q1032 %if ARCH_X86_64 movd r4d, m5 movd r6d, m2 pshufd m5, m5, q0321 pshufd m2, m2, q0321 movd r7d, m5 movd r9d, m2 movq m0, [base+subpel_filters+r4*8] movq m1, [base+subpel_filters+r6*8] movhps m0, [base+subpel_filters+r7*8] movhps m1, [base+subpel_filters+r9*8] %else movd r0, m5 movd rX, m2 pshufd m5, m5, q0321 pshufd m2, m2, q0321 movd r4, m5 movd r5, m2 movq m0, [base+subpel_filters+r0*8] movq m1, [base+subpel_filters+rX*8] movhps m0, 
[base+subpel_filters+r4*8] movhps m1, [base+subpel_filters+r5*8] %endif paddd m14, m7 ; mx+dx*[4-7] pand m5, m14, m10 psrld m5, 6 paddd m15, m5 pxor m2, m2 pcmpeqd m5, m2 mova [stk+0x110], m14 pshufd m4, m15, q1032 %if ARCH_X86_64 movd r10d, m15 movd r11d, m4 pshufd m15, m15, q0321 pshufd m4, m4, q0321 movd r13d, m15 movd rXd, m4 movq m2, [base+subpel_filters+r10*8] movq m3, [base+subpel_filters+r11*8] movhps m2, [base+subpel_filters+r13*8] movhps m3, [base+subpel_filters+ rX*8] psrld m14, 10 movq r11, m14 punpckhqdq m14, m14 movq rX, m14 mov r10d, r11d shr r11, 32 mov r13d, rXd shr rX, 32 mov r4d, [stk+ 0] mov r6d, [stk+ 4] mov r7d, [stk+ 8] mov r9d, [stk+12] pshufd m4, m6, q1100 pshufd m6, m6, q3322 pshufd m14, m5, q1100 pshufd m5, m5, q3322 pand m7, m9, m4 pand m8, m9, m6 pand m15, m9, m14 pand m9, m9, m5 pandn m4, m0 pandn m6, m1 pandn m14, m2 pandn m5, m3 por m7, m4 por m8, m6 por m15, m14 por m9, m5 punpcklbw m0, m7, m7 punpckhbw m7, m7 punpcklbw m1, m8, m8 punpckhbw m8, m8 psraw m0, 8 psraw m7, 8 psraw m1, 8 psraw m8, 8 punpcklbw m2, m15, m15 punpckhbw m15, m15 punpcklbw m3, m9, m9 punpckhbw m9, m9 psraw m2, 8 psraw m15, 8 psraw m3, 8 psraw m9, 8 mova [stk+0x10], m0 mova [stk+0x20], m7 mova [stk+0x30], m1 mova [stk+0x40], m8 mova [stk+0x50], m2 mova [stk+0x60], m15 mova [stk+0x70], m3 mova [stk+0x80], m9 MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 9, 10 ; 0 mova [stk+0x90], m1 MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 1, 9, 10 ; 1 mova [stk+0xa0], m2 MC_8TAP_SCALED_H 3, 4, 5, 6, 1, 2, 9, 10 ; 2 mova [stk+0xb0], m3 MC_8TAP_SCALED_H 4, 5, 6, 1, 2, 3, 9, 10 ; 3 mova [stk+0xc0], m4 MC_8TAP_SCALED_H 5, 6, 1, 2, 3, 4, 9, 10 ; 4 mova [stk+0xd0], m5 MC_8TAP_SCALED_H 6, 1, 2, 3, 4, 5, 9, 10 ; 5 MC_8TAP_SCALED_H 7, 1, 2, 3, 4, 5, 9, 10 ; 6 MC_8TAP_SCALED_H 8, 1, 2, 3, 4, 5, 9, 10 ; 7 mova m5, [stk+0xd0] mova m1, [stk+0x90] mova m2, [stk+0xa0] mova m3, [stk+0xb0] mova m9, [stk+0xc0] punpcklwd m4, m5, m6 ; 45a punpckhwd m5, m6 ; 45b punpcklwd m6, m7, m8 ; 67a punpckhwd m7, m8 ; 67b punpcklwd m0, m1, m2 ; 01a punpckhwd m1, m2 ; 01b punpcklwd m2, m3, m9 ; 23a punpckhwd m3, m9 ; 23b mova m10, [stk+0x140] mova m11, [stk+0x150] mova m14, [stk+0x160] mova m15, [stk+0x170] mova [stk+0x90], m4 mova [stk+0xa0], m5 mova [stk+0xb0], m6 mova [stk+0xc0], m7 %define hround [rsp+0x10] %define shift [rsp+0x20] %if isput %define vround [rsp+0x30] %else %define vround [base+pd_m524256] %endif .dy2_vloop: pmaddwd m4, m0, m10 pmaddwd m5, m1, m10 pmaddwd m6, m2, m11 pmaddwd m7, m3, m11 paddd m4, m13 paddd m5, m13 paddd m4, m6 paddd m5, m7 pmaddwd m6, [stk+0x90], m14 pmaddwd m7, [stk+0xa0], m14 pmaddwd m8, [stk+0xb0], m15 pmaddwd m9, [stk+0xc0], m15 paddd m4, m6 paddd m5, m7 %if isput pshufd m6, m12, q1032 %endif paddd m4, m8 paddd m5, m9 %else movd r0, m15 movd rX, m4 pshufd m15, m15, q0321 pshufd m4, m4, q0321 movd r4, m15 movd r5, m4 mova m14, [stk+0x110] movq m2, [base+subpel_filters+r0*8] movq m3, [base+subpel_filters+rX*8] movhps m2, [base+subpel_filters+r4*8] movhps m3, [base+subpel_filters+r5*8] psrld m14, 10 mova [stk+16], m14 mov r0, [stk+ 0] mov rX, [stk+ 4] mov r4, [stk+ 8] mov r5, [stk+12] mova [stk+0x20], m0 mova [stk+0x30], m1 mova [stk+0x40], m2 mova [stk+0x50], m3 pshufd m4, m6, q1100 pshufd m6, m6, q3322 pshufd m7, m5, q1100 pshufd m5, m5, q3322 pand m0, m9, m4 pand m1, m9, m6 pand m2, m9, m7 pand m3, m9, m5 pandn m4, [stk+0x20] pandn m6, [stk+0x30] pandn m7, [stk+0x40] pandn m5, [stk+0x50] por m0, m4 por m1, m6 por m2, m7 por m3, m5 punpcklbw m4, m0, m0 punpckhbw m0, m0 punpcklbw m5, m1, m1 punpckhbw m1, m1 psraw 
m4, 8 psraw m0, 8 psraw m5, 8 psraw m1, 8 punpcklbw m6, m2, m2 punpckhbw m2, m2 punpcklbw m7, m3, m3 punpckhbw m3, m3 psraw m6, 8 psraw m2, 8 psraw m7, 8 psraw m3, 8 mova [stk+0x0a0], m4 mova [stk+0x0b0], m0 mova [stk+0x0c0], m5 mova [stk+0x0d0], m1 mova [stk+0x140], m6 mova [stk+0x150], m2 mova [stk+0x160], m7 mova [stk+0x170], m3 MC_8TAP_SCALED_H 0xa0, 0x20, 0 ; 0 MC_8TAP_SCALED_H 0xa0, 0x30 ; 1 MC_8TAP_SCALED_H 0xa0, 0x40 ; 2 MC_8TAP_SCALED_H 0xa0, 0x50 ; 3 MC_8TAP_SCALED_H 0xa0, 0x60 ; 4 MC_8TAP_SCALED_H 0xa0, 0x70 ; 5 MC_8TAP_SCALED_H 0xa0, 0x80 ; 6 MC_8TAP_SCALED_H 0xa0, 0x90 ; 7 mova m5, [stk+0x60] mova m6, [stk+0x70] mova m7, [stk+0x80] mova m0, [stk+0x90] mov r0, r0m punpcklwd m4, m5, m6 ; 45a punpckhwd m5, m6 ; 45b punpcklwd m6, m7, m0 ; 67a punpckhwd m7, m0 ; 67b mova [stk+0x60], m4 mova [stk+0x70], m5 mova [stk+0x80], m6 mova [stk+0x90], m7 mova m1, [stk+0x20] mova m2, [stk+0x30] mova m3, [stk+0x40] mova m4, [stk+0x50] punpcklwd m0, m1, m2 ; 01a punpckhwd m1, m2 ; 01b punpcklwd m2, m3, m4 ; 23a punpckhwd m3, m4 ; 23b mova m4, [stk+0x180] mova m5, [stk+0x190] mova m6, [stk+0x1a0] mova m7, [stk+0x1b0] mova [stk+0x40], m2 mova [stk+0x50], m3 .dy2_vloop: pmaddwd m0, m4 pmaddwd m1, m4 pmaddwd m2, m5 pmaddwd m3, m5 paddd m0, m2 paddd m1, m3 pmaddwd m2, [stk+0x60], m6 pmaddwd m3, [stk+0x70], m6 pmaddwd m4, [stk+0x80], m7 pmaddwd m5, [stk+0x90], m7 %if isput movd m6, [esp+0x18] %endif paddd m0, m2 paddd m1, m3 paddd m0, vrnd_mem paddd m1, vrnd_mem paddd m4, m0 paddd m5, m1 %endif %ifidn %1, put psrad m4, m6 psrad m5, m6 packssdw m4, m5 pxor m7, m7 pmaxsw m4, m7 pminsw m4, pxmaxm mova [dstq], m4 add dstq, dsm %else psrad m4, 6 psrad m5, 6 packssdw m4, m5 mova [tmpq], m4 add tmpq, tmp_stridem %endif dec hd jz .dy2_hloop_prep %if ARCH_X86_64 MC_8TAP_SCALED_H 4, 8, 5, 6, 7, 9, 0, 1 mova [stk+0xd0], m4 MC_8TAP_SCALED_H 8, 5, 6, 7, 9, 4, 0, 1 mova m4, [stk+0xd0] mova m0, m2 ; 01a mova m1, m3 ; 01b mova m2, [stk+0x90] ; 23a mova m3, [stk+0xa0] ; 23b mova m5, [stk+0xb0] ; 45a mova m6, [stk+0xc0] ; 45b punpcklwd m7, m4, m8 ; 67a punpckhwd m4, m8 ; 67b mova [stk+0x90], m5 mova [stk+0xa0], m6 mova [stk+0xb0], m7 mova [stk+0xc0], m4 %else mov r0m, r0 mov r3, r3m MC_8TAP_SCALED_H 0xa0, 0xe0 ; 8 MC_8TAP_SCALED_H 0xa0, 0 ; 9 mova m7, [stk+0xe0] mova m2, [stk+0x60] ; 23a mova m3, [stk+0x70] ; 23b mova m4, [stk+0x80] ; 45a mova m5, [stk+0x90] ; 45b punpcklwd m6, m7, m0 ; 67a punpckhwd m7, m0 ; 67b mova m0, [stk+0x40] ; 01a mova m1, [stk+0x50] ; 01b mova [stk+0x40], m2 mova [stk+0x50], m3 mova [stk+0x60], m4 mova [stk+0x70], m5 mova m4, [stk+0x180] mova m5, [stk+0x190] mova [stk+0x80], m6 mova [stk+0x90], m7 mova m6, [stk+0x1a0] mova m7, [stk+0x1b0] mov r0, r0m %endif jmp .dy2_vloop INIT_XMM ssse3 .ret: MC_8TAP_SCALED_RET 0 %if ARCH_X86_32 && !isprep && required_stack_alignment > STACK_ALIGNMENT %define r0m [rstk+stack_offset+ 4] %define r1m [rstk+stack_offset+ 8] %define r2m [rstk+stack_offset+12] %define r3m [rstk+stack_offset+16] %endif %undef isput %undef isprep %endmacro %macro BILIN_SCALED_FN 1 cglobal %1_bilin_scaled_16bpc mov t0d, (5*15 << 16) | 5*15 mov t1d, (5*15 << 16) | 5*15 jmp mangle(private_prefix %+ _%1_8tap_scaled_16bpc %+ SUFFIX) %endmacro %if WIN64 DECLARE_REG_TMP 6, 5 %elif ARCH_X86_64 DECLARE_REG_TMP 6, 8 %else DECLARE_REG_TMP 1, 2 %endif BILIN_SCALED_FN put FN put_8tap_scaled, sharp, SHARP, SHARP FN put_8tap_scaled, sharp_smooth, SHARP, SMOOTH FN put_8tap_scaled, smooth_sharp, SMOOTH, SHARP FN put_8tap_scaled, smooth, SMOOTH, SMOOTH FN put_8tap_scaled, sharp_regular, SHARP, REGULAR 
FN put_8tap_scaled, regular_sharp, REGULAR, SHARP FN put_8tap_scaled, smooth_regular, SMOOTH, REGULAR FN put_8tap_scaled, regular_smooth, REGULAR, SMOOTH FN put_8tap_scaled, regular, REGULAR, REGULAR MC_8TAP_SCALED put %if WIN64 DECLARE_REG_TMP 5, 4 %elif ARCH_X86_64 DECLARE_REG_TMP 6, 7 %else DECLARE_REG_TMP 1, 2 %endif BILIN_SCALED_FN prep FN prep_8tap_scaled, sharp, SHARP, SHARP FN prep_8tap_scaled, sharp_smooth, SHARP, SMOOTH FN prep_8tap_scaled, smooth_sharp, SMOOTH, SHARP FN prep_8tap_scaled, smooth, SMOOTH, SMOOTH FN prep_8tap_scaled, sharp_regular, SHARP, REGULAR FN prep_8tap_scaled, regular_sharp, REGULAR, SHARP FN prep_8tap_scaled, smooth_regular, SMOOTH, REGULAR FN prep_8tap_scaled, regular_smooth, REGULAR, SMOOTH FN prep_8tap_scaled, regular, REGULAR, REGULAR MC_8TAP_SCALED prep %if ARCH_X86_64 DECLARE_REG_TMP 6 %else DECLARE_REG_TMP 2 %endif %if ARCH_X86_64 ; warp8x8t spills one less xmm register than warp8x8 on WIN64, compensate that ; by allocating 16 bytes more stack space so that stack offsets match up. %if WIN64 && STACK_ALIGNMENT == 16 %assign stksz 16*14 %else %assign stksz 16*13 %endif cglobal warp_affine_8x8t_16bpc, 4, 13, 9, stksz, dst, ds, src, ss, delta, \ mx, tmp, alpha, beta, \ filter, my, gamma, cnt %assign stack_size_padded_8x8t stack_size_padded %else cglobal warp_affine_8x8t_16bpc, 0, 7, 8, -16*17, alpha, gamma, src, tmp, \ filter, mx, my %define m8 [esp+16*13] %define m9 [esp+16*14] %define cntd dword [esp+4*63] %define dstq tmpq %define dsq 0 %if STACK_ALIGNMENT < 16 %define dstm [esp+4*65] %define dsm [esp+4*66] %else %define dstm r0m %define dsm r1m %endif %endif %define base filterq-$$ mov t0d, r7m LEA filterq, $$ shr t0d, 11 %if ARCH_X86_64 movddup m8, [base+warp8x8t_rnd] %else movddup m1, [base+warp8x8t_rnd] mov r1, r1m add r1, r1 mova m8, m1 mov r1m, r1 ; ds *= 2 %endif call mangle(private_prefix %+ _warp_affine_8x8_16bpc_ssse3).main jmp .start .loop: %if ARCH_X86_64 lea dstq, [dstq+dsq*4] %else add dstq, dsm mov dstm, dstq %endif call mangle(private_prefix %+ _warp_affine_8x8_16bpc_ssse3).main2 .start: %if ARCH_X86_32 mov dstq, dstm %endif paddd m1, m8 paddd m2, m8 psrad m1, 15 psrad m2, 15 packssdw m1, m2 mova [dstq+dsq*0], m1 call mangle(private_prefix %+ _warp_affine_8x8_16bpc_ssse3).main3 %if ARCH_X86_32 mov dstq, dstm add dstq, dsm %endif paddd m1, m8 paddd m2, m8 psrad m1, 15 psrad m2, 15 packssdw m1, m2 mova [dstq+dsq*2], m1 dec cntd jg .loop RET %if ARCH_X86_64 cglobal warp_affine_8x8_16bpc, 4, 13, 10, 16*13, dst, ds, src, ss, delta, \ mx, tmp, alpha, beta, \ filter, my, gamma, cnt ASSERT stack_size_padded == stack_size_padded_8x8t %else cglobal warp_affine_8x8_16bpc, 0, 7, 8, -16*17, alpha, gamma, src, tmp, \ filter, mx, my %endif mov t0d, r7m LEA filterq, $$ shr t0d, 11 %if ARCH_X86_64 movddup m8, [base+warp8x8_rnd2+t0*8] movd m9, r7m ; pixel_max pshufb m9, [base+pw_256] %else movddup m1, [base+warp8x8_rnd2+t0*8] movd m2, r7m ; pixel_max pshufb m2, [base+pw_256] mova m8, m1 mova m9, m2 %endif call .main jmp .start .loop: %if ARCH_X86_64 lea dstq, [dstq+dsq*2] %else add dstq, dsm mov dstm, dstq %endif call .main2 .start: %if ARCH_X86_32 mov dstq, dstm %endif psrad m1, 16 psrad m2, 16 packssdw m1, m2 pmaxsw m1, m6 pmulhrsw m1, m8 pminsw m1, m9 mova [dstq+dsq*0], m1 call .main3 %if ARCH_X86_32 mov dstq, dstm add dstq, dsm %endif psrad m1, 16 psrad m2, 16 packssdw m1, m2 pmaxsw m1, m6 pmulhrsw m1, m8 pminsw m1, m9 mova [dstq+dsq*1], m1 dec cntd jg .loop RET ALIGN function_align .main: ; Stack args offset by one (r4m -> r5m etc.) 
due to call %if WIN64 mov deltaq, r5m mov mxd, r6m %endif movd m0, [base+warp8x8_shift+t0*4] movddup m7, [base+warp8x8_rnd1+t0*8] add filterq, mc_warp_filter-$$ %if ARCH_X86_64 movsx alphad, word [deltaq+2*0] movsx betad, word [deltaq+2*1] movsx gammad, word [deltaq+2*2] movsx deltad, word [deltaq+2*3] lea tmpq, [ssq*3] add mxd, 512+(64<<10) sub srcq, tmpq ; src -= ss*3 imul tmpd, alphad, -7 mov myd, r7m add betad, tmpd ; beta -= alpha*7 imul tmpd, gammad, -7 add myd, 512+(64<<10) mov cntd, 4 add deltad, tmpd ; delta -= gamma*7 %else %if STACK_ALIGNMENT < 16 %assign stack_offset stack_offset - gprsize %endif mov r3d, r5m ; abcd %if STACK_ALIGNMENT < 16 mov r0, r1m ; dst mov r1, r2m ; ds mov [esp+gprsize+4*65], r0 mov [esp+gprsize+4*66], r1 %endif movsx alphad, word [r3+2*0] movsx r2d, word [r3+2*1] movsx gammad, word [r3+2*2] movsx r3d, word [r3+2*3] imul r5d, alphad, -7 add r2d, r5d ; beta -= alpha*7 imul r5d, gammad, -7 mov [esp+gprsize+4*60], r2d add r3d, r5d ; delta -= gamma*7 mov [esp+gprsize+4*61], r3d mov r3d, r4m ; ss mov srcq, r3m mov mxd, r6m mov myd, r7m mov dword [esp+gprsize+4*63], 4 ; cnt mov [esp+gprsize+4*62], r3 lea r3, [r3*3] add mxd, 512+(64<<10) add myd, 512+(64<<10) sub srcq, r3 ; src -= ss*3 %if STACK_ALIGNMENT < 16 %assign stack_offset stack_offset + gprsize %endif %endif mova [rsp+gprsize], m0 pxor m6, m6 call .h mova m5, m0 call .h punpcklwd m1, m5, m0 ; 01 punpckhwd m5, m0 mova [rsp+gprsize+16* 1], m1 mova [rsp+gprsize+16* 4], m5 mova m5, m0 call .h punpcklwd m1, m5, m0 ; 12 punpckhwd m5, m0 mova [rsp+gprsize+16* 7], m1 mova [rsp+gprsize+16*10], m5 mova m5, m0 call .h punpcklwd m1, m5, m0 ; 23 punpckhwd m5, m0 mova [rsp+gprsize+16* 2], m1 mova [rsp+gprsize+16* 5], m5 mova m5, m0 call .h punpcklwd m1, m5, m0 ; 34 punpckhwd m5, m0 mova [rsp+gprsize+16* 8], m1 mova [rsp+gprsize+16*11], m5 mova m5, m0 call .h punpcklwd m1, m5, m0 ; 45 punpckhwd m5, m0 mova [rsp+gprsize+16* 3], m1 mova [rsp+gprsize+16* 6], m5 mova m5, m0 call .h punpcklwd m1, m5, m0 ; 56 punpckhwd m5, m0 mova [rsp+gprsize+16* 9], m1 mova [rsp+gprsize+16*12], m5 mova m5, m0 .main2: call .h %macro WARP_V 6 ; 01l, 23l, 45l, 01h, 23h, 45h lea tmpd, [myq+gammaq] shr myd, 10 movq m4, [filterq+myq*8] ; a lea myd, [tmpq+gammaq] shr tmpd, 10 movq m2, [filterq+tmpq*8] ; b lea tmpd, [myq+gammaq] shr myd, 10 movq m3, [filterq+myq*8] ; c lea myd, [tmpq+gammaq] shr tmpd, 10 movq m1, [filterq+tmpq*8] ; d lea tmpd, [myq+gammaq] shr myd, 10 punpcklwd m4, m2 punpcklwd m3, m1 punpckldq m2, m4, m3 punpckhdq m4, m3 punpcklbw m1, m6, m2 ; a0 a1 b0 b1 c0 c1 d0 d1 << 8 pmaddwd m1, [rsp+gprsize+16*%1] punpckhbw m3, m6, m2 ; a2 a3 b2 b3 c2 c3 d2 d3 << 8 mova m2, [rsp+gprsize+16*%2] pmaddwd m3, m2 mova [rsp+gprsize+16*%1], m2 paddd m1, m3 punpcklbw m3, m6, m4 ; a4 a5 b4 b5 c4 c5 d4 d5 << 8 mova m2, [rsp+gprsize+16*%3] pmaddwd m3, m2 mova [rsp+gprsize+16*%2], m2 paddd m1, m3 punpcklwd m3, m5, m0 ; 67 punpckhbw m2, m6, m4 ; a6 a7 b6 b7 c6 c7 d6 d7 << 8 pmaddwd m2, m3 mova [rsp+gprsize+16*%3], m3 paddd m1, m2 movq m4, [filterq+myq*8] ; e lea myd, [tmpq+gammaq] shr tmpd, 10 movq m3, [filterq+tmpq*8] ; f lea tmpd, [myq+gammaq] shr myd, 10 movq m2, [filterq+myq*8] ; g %if ARCH_X86_64 lea myd, [tmpq+deltaq] ; my += delta %else mov myd, [esp+gprsize+4*61] add myd, tmpd %endif shr tmpd, 10 punpcklwd m4, m3 movq m3, [filterq+tmpq*8] ; h punpcklwd m2, m3 punpckldq m3, m4, m2 punpckhdq m4, m2 punpcklbw m2, m6, m3 ; e0 e1 f0 f1 g0 g1 h0 h1 << 8 pmaddwd m2, [rsp+gprsize+16*%4] punpckhbw m6, m3 ; e2 e3 f2 f3 g2 g3 h2 h3 << 8 mova m3, 
[rsp+gprsize+16*%5] pmaddwd m6, m3 mova [rsp+gprsize+16*%4], m3 pxor m3, m3 paddd m2, m6 punpcklbw m3, m4 ; e4 e5 f4 f5 g4 g5 h4 h5 << 8 mova m6, [rsp+gprsize+16*%6] pmaddwd m3, m6 mova [rsp+gprsize+16*%5], m6 punpckhwd m5, m0 pxor m6, m6 paddd m2, m3 punpckhbw m3, m6, m4 ; e6 e7 f6 f7 g6 g7 h6 h7 << 8 pmaddwd m3, m5 mova [rsp+gprsize+16*%6], m5 mova m5, m0 paddd m2, m3 %endmacro WARP_V 1, 2, 3, 4, 5, 6 ret .main3: call .h WARP_V 7, 8, 9, 10, 11, 12 ret ALIGN function_align .h: lea tmpd, [mxq+alphaq] shr mxd, 10 movq m3, [filterq+mxq*8] punpcklbw m0, m6, m3 movu m3, [srcq-6] pmaddwd m0, m3 ; 0 lea mxd, [tmpq+alphaq] shr tmpd, 10 movq m3, [filterq+tmpq*8] punpcklbw m2, m6, m3 movu m3, [srcq-4] pmaddwd m2, m3 ; 1 lea tmpd, [mxq+alphaq] shr mxd, 10 movq m3, [filterq+mxq*8] phaddd m0, m2 ; 0 1 punpcklbw m2, m6, m3 movu m3, [srcq-2] pmaddwd m2, m3 ; 2 lea mxd, [tmpq+alphaq] shr tmpd, 10 movq m3, [filterq+tmpq*8] punpcklbw m1, m6, m3 movu m3, [srcq+0] pmaddwd m1, m3 ; 3 lea tmpd, [mxq+alphaq] shr mxd, 10 movq m3, [filterq+mxq*8] phaddd m2, m1 ; 2 3 punpcklbw m1, m6, m3 movu m3, [srcq+2] pmaddwd m1, m3 ; 4 lea mxd, [tmpq+alphaq] shr tmpd, 10 movq m3, [filterq+tmpq*8] phaddd m0, m2 ; 0 1 2 3 punpcklbw m2, m6, m3 movu m3, [srcq+4] pmaddwd m2, m3 ; 5 lea tmpd, [mxq+alphaq] shr mxd, 10 movq m3, [filterq+mxq*8] phaddd m1, m2 ; 4 5 punpcklbw m2, m6, m3 movu m3, [srcq+6] pmaddwd m2, m3 ; 6 %if ARCH_X86_64 lea mxd, [tmpq+betaq] ; mx += beta %else mov mxd, [esp+gprsize*2+4*60] add mxd, tmpd %endif shr tmpd, 10 movq m3, [filterq+tmpq*8] punpcklbw m4, m6, m3 movu m3, [srcq+8] %if ARCH_X86_64 add srcq, ssq %else add srcq, [esp+gprsize*2+4*62] %endif pmaddwd m3, m4 ; 7 phaddd m2, m3 ; 6 7 phaddd m1, m2 ; 4 5 6 7 paddd m0, m7 paddd m1, m7 psrad m0, [rsp+gprsize*2] psrad m1, [rsp+gprsize*2] packssdw m0, m1 ret %macro BIDIR_FN 0 call .main jmp wq .w4_loop: call .main lea dstq, [dstq+strideq*2] .w4: movq [dstq+strideq*0], m0 movhps [dstq+strideq*1], m0 lea dstq, [dstq+strideq*2] movq [dstq+strideq*0], m1 movhps [dstq+strideq*1], m1 sub hd, 4 jg .w4_loop .ret: RET .w8_loop: call .main lea dstq, [dstq+strideq*2] .w8: mova [dstq+strideq*0], m0 mova [dstq+strideq*1], m1 sub hd, 2 jne .w8_loop RET .w16_loop: call .main add dstq, strideq .w16: mova [dstq+16*0], m0 mova [dstq+16*1], m1 dec hd jg .w16_loop RET .w32_loop: call .main add dstq, strideq .w32: mova [dstq+16*0], m0 mova [dstq+16*1], m1 call .main mova [dstq+16*2], m0 mova [dstq+16*3], m1 dec hd jg .w32_loop RET .w64_loop: call .main add dstq, strideq .w64: mova [dstq+16*0], m0 mova [dstq+16*1], m1 call .main mova [dstq+16*2], m0 mova [dstq+16*3], m1 call .main mova [dstq+16*4], m0 mova [dstq+16*5], m1 call .main mova [dstq+16*6], m0 mova [dstq+16*7], m1 dec hd jg .w64_loop RET .w128_loop: call .main add dstq, strideq .w128: mova [dstq+16* 0], m0 mova [dstq+16* 1], m1 call .main mova [dstq+16* 2], m0 mova [dstq+16* 3], m1 call .main mova [dstq+16* 4], m0 mova [dstq+16* 5], m1 call .main mova [dstq+16* 6], m0 mova [dstq+16* 7], m1 call .main mova [dstq+16* 8], m0 mova [dstq+16* 9], m1 call .main mova [dstq+16*10], m0 mova [dstq+16*11], m1 call .main mova [dstq+16*12], m0 mova [dstq+16*13], m1 call .main mova [dstq+16*14], m0 mova [dstq+16*15], m1 dec hd jg .w128_loop RET %endmacro %if UNIX64 DECLARE_REG_TMP 7 %else DECLARE_REG_TMP 5 %endif cglobal avg_16bpc, 4, 7, 4, dst, stride, tmp1, tmp2, w, h %define base r6-avg_ssse3_table LEA r6, avg_ssse3_table tzcnt wd, wm mov t0d, r6m ; pixel_max movsxd wq, [r6+wq*4] shr t0d, 11 movddup m2, [base+bidir_rnd+t0*8] movddup 
m3, [base+bidir_mul+t0*8] movifnidn hd, hm add wq, r6 BIDIR_FN ALIGN function_align .main: mova m0, [tmp1q+16*0] paddsw m0, [tmp2q+16*0] mova m1, [tmp1q+16*1] paddsw m1, [tmp2q+16*1] add tmp1q, 16*2 add tmp2q, 16*2 pmaxsw m0, m2 pmaxsw m1, m2 psubsw m0, m2 psubsw m1, m2 pmulhw m0, m3 pmulhw m1, m3 ret cglobal w_avg_16bpc, 4, 7, 8, dst, stride, tmp1, tmp2, w, h %define base r6-w_avg_ssse3_table LEA r6, w_avg_ssse3_table tzcnt wd, wm mov t0d, r6m ; weight movd m6, r7m ; pixel_max movddup m5, [base+pd_65538] movsxd wq, [r6+wq*4] pshufb m6, [base+pw_256] add wq, r6 lea r6d, [t0-16] shl t0d, 16 sub t0d, r6d ; 16-weight, weight paddw m5, m6 mov r6d, t0d shl t0d, 2 test dword r7m, 0x800 cmovnz r6d, t0d movifnidn hd, hm movd m4, r6d pslld m5, 7 pxor m7, m7 pshufd m4, m4, q0000 BIDIR_FN ALIGN function_align .main: mova m2, [tmp1q+16*0] mova m0, [tmp2q+16*0] punpckhwd m3, m0, m2 punpcklwd m0, m2 mova m2, [tmp1q+16*1] mova m1, [tmp2q+16*1] add tmp1q, 16*2 add tmp2q, 16*2 pmaddwd m3, m4 pmaddwd m0, m4 paddd m3, m5 paddd m0, m5 psrad m3, 8 psrad m0, 8 packssdw m0, m3 punpckhwd m3, m1, m2 punpcklwd m1, m2 pmaddwd m3, m4 pmaddwd m1, m4 paddd m3, m5 paddd m1, m5 psrad m3, 8 psrad m1, 8 packssdw m1, m3 pminsw m0, m6 pminsw m1, m6 pmaxsw m0, m7 pmaxsw m1, m7 ret %if ARCH_X86_64 cglobal mask_16bpc, 4, 7, 9, dst, stride, tmp1, tmp2, w, h, mask %else cglobal mask_16bpc, 4, 7, 8, dst, stride, tmp1, tmp2, w, mask %define hd dword r5m %define m8 [base+pw_64] %endif %define base r6-mask_ssse3_table LEA r6, mask_ssse3_table tzcnt wd, wm mov t0d, r7m ; pixel_max shr t0d, 11 movsxd wq, [r6+wq*4] movddup m6, [base+bidir_rnd+t0*8] movddup m7, [base+bidir_mul+t0*8] %if ARCH_X86_64 mova m8, [base+pw_64] movifnidn hd, hm %endif add wq, r6 mov maskq, r6mp BIDIR_FN ALIGN function_align .main: movq m3, [maskq+8*0] mova m0, [tmp1q+16*0] mova m4, [tmp2q+16*0] pxor m5, m5 punpcklbw m3, m5 punpckhwd m2, m0, m4 punpcklwd m0, m4 psubw m1, m8, m3 punpckhwd m4, m3, m1 ; m, 64-m punpcklwd m3, m1 pmaddwd m2, m4 ; tmp1 * m + tmp2 * (64-m) pmaddwd m0, m3 movq m3, [maskq+8*1] mova m1, [tmp1q+16*1] mova m4, [tmp2q+16*1] add maskq, 8*2 add tmp1q, 16*2 add tmp2q, 16*2 psrad m2, 5 psrad m0, 5 packssdw m0, m2 punpcklbw m3, m5 punpckhwd m2, m1, m4 punpcklwd m1, m4 psubw m5, m8, m3 punpckhwd m4, m3, m5 ; m, 64-m punpcklwd m3, m5 pmaddwd m2, m4 ; tmp1 * m + tmp2 * (64-m) pmaddwd m1, m3 psrad m2, 5 psrad m1, 5 packssdw m1, m2 pmaxsw m0, m6 pmaxsw m1, m6 psubsw m0, m6 psubsw m1, m6 pmulhw m0, m7 pmulhw m1, m7 ret cglobal w_mask_420_16bpc, 4, 7, 12, dst, stride, tmp1, tmp2, w, h, mask %define base t0-w_mask_420_ssse3_table LEA t0, w_mask_420_ssse3_table tzcnt wd, wm mov r6d, r8m ; pixel_max movd m0, r7m ; sign shr r6d, 11 movsxd wq, [t0+wq*4] %if ARCH_X86_64 mova m8, [base+pw_27615] ; ((64 - 38) << 10) + 1023 - 32 mova m9, [base+pw_64] movddup m10, [base+bidir_rnd+r6*8] movddup m11, [base+bidir_mul+r6*8] %else mova m1, [base+pw_27615] ; ((64 - 38) << 10) + 1023 - 32 mova m2, [base+pw_64] movddup m3, [base+bidir_rnd+r6*8] movddup m4, [base+bidir_mul+r6*8] ALLOC_STACK -16*4 mova [rsp+16*0], m1 mova [rsp+16*1], m2 mova [rsp+16*2], m3 mova [rsp+16*3], m4 %define m8 [rsp+gprsize+16*0] %define m9 [rsp+gprsize+16*1] %define m10 [rsp+gprsize+16*2] %define m11 [rsp+gprsize+16*3] %endif movd m7, [base+pw_2] psubw m7, m0 pshufb m7, [base+pw_256] add wq, t0 movifnidn hd, r5m mov maskq, r6mp call .main jmp wq .w4_loop: call .main lea dstq, [dstq+strideq*2] add maskq, 4 .w4: movq [dstq+strideq*0], m0 phaddw m2, m3 movhps [dstq+strideq*1], m0 phaddd m2, m2 lea 
dstq, [dstq+strideq*2] paddw m2, m7 movq [dstq+strideq*0], m1 psrlw m2, 2 movhps [dstq+strideq*1], m1 packuswb m2, m2 movd [maskq], m2 sub hd, 4 jg .w4_loop RET .w8_loop: call .main lea dstq, [dstq+strideq*2] add maskq, 4 .w8: mova [dstq+strideq*0], m0 paddw m2, m3 phaddw m2, m2 mova [dstq+strideq*1], m1 paddw m2, m7 psrlw m2, 2 packuswb m2, m2 movd [maskq], m2 sub hd, 2 jg .w8_loop RET .w16_loop: call .main lea dstq, [dstq+strideq*2] add maskq, 8 .w16: mova [dstq+strideq*1+16*0], m2 mova [dstq+strideq*0+16*0], m0 mova [dstq+strideq*1+16*1], m3 mova [dstq+strideq*0+16*1], m1 call .main paddw m2, [dstq+strideq*1+16*0] paddw m3, [dstq+strideq*1+16*1] mova [dstq+strideq*1+16*0], m0 phaddw m2, m3 mova [dstq+strideq*1+16*1], m1 paddw m2, m7 psrlw m2, 2 packuswb m2, m2 movq [maskq], m2 sub hd, 2 jg .w16_loop RET .w32_loop: call .main lea dstq, [dstq+strideq*2] add maskq, 16 .w32: mova [dstq+strideq*1+16*0], m2 mova [dstq+strideq*0+16*0], m0 mova [dstq+strideq*1+16*1], m3 mova [dstq+strideq*0+16*1], m1 call .main mova [dstq+strideq*0+16*2], m0 phaddw m2, m3 mova [dstq+strideq*1+16*3], m2 mova [dstq+strideq*0+16*3], m1 call .main paddw m2, [dstq+strideq*1+16*0] paddw m3, [dstq+strideq*1+16*1] mova [dstq+strideq*1+16*0], m0 phaddw m2, m3 mova [dstq+strideq*1+16*2], m2 mova [dstq+strideq*1+16*1], m1 call .main phaddw m2, m3 paddw m3, m7, [dstq+strideq*1+16*2] paddw m2, [dstq+strideq*1+16*3] mova [dstq+strideq*1+16*2], m0 paddw m2, m7 psrlw m3, 2 psrlw m2, 2 mova [dstq+strideq*1+16*3], m1 packuswb m3, m2 mova [maskq], m3 sub hd, 2 jg .w32_loop RET .w64_loop: call .main lea dstq, [dstq+strideq*2] add maskq, 16*2 .w64: mova [dstq+strideq*1+16*1], m2 mova [dstq+strideq*0+16*0], m0 mova [dstq+strideq*1+16*2], m3 mova [dstq+strideq*0+16*1], m1 call .main mova [dstq+strideq*1+16*3], m2 mova [dstq+strideq*0+16*2], m0 mova [dstq+strideq*1+16*4], m3 mova [dstq+strideq*0+16*3], m1 call .main mova [dstq+strideq*1+16*5], m2 mova [dstq+strideq*0+16*4], m0 mova [dstq+strideq*1+16*6], m3 mova [dstq+strideq*0+16*5], m1 call .main mova [dstq+strideq*0+16*6], m0 phaddw m2, m3 mova [dstq+strideq*1+16*7], m2 mova [dstq+strideq*0+16*7], m1 call .main paddw m2, [dstq+strideq*1+16*1] paddw m3, [dstq+strideq*1+16*2] mova [dstq+strideq*1+16*0], m0 phaddw m2, m3 mova [dstq+strideq*1+16*2], m2 mova [dstq+strideq*1+16*1], m1 call .main paddw m2, [dstq+strideq*1+16*3] paddw m3, [dstq+strideq*1+16*4] phaddw m2, m3 paddw m3, m7, [dstq+strideq*1+16*2] mova [dstq+strideq*1+16*2], m0 paddw m2, m7 psrlw m3, 2 psrlw m2, 2 mova [dstq+strideq*1+16*3], m1 packuswb m3, m2 mova [maskq+16*0], m3 call .main paddw m2, [dstq+strideq*1+16*5] paddw m3, [dstq+strideq*1+16*6] mova [dstq+strideq*1+16*4], m0 phaddw m2, m3 mova [dstq+strideq*1+16*6], m2 mova [dstq+strideq*1+16*5], m1 call .main phaddw m2, m3 paddw m3, m7, [dstq+strideq*1+16*6] paddw m2, [dstq+strideq*1+16*7] mova [dstq+strideq*1+16*6], m0 paddw m2, m7 psrlw m3, 2 psrlw m2, 2 mova [dstq+strideq*1+16*7], m1 packuswb m3, m2 mova [maskq+16*1], m3 sub hd, 2 jg .w64_loop RET .w128_loop: call .main lea dstq, [dstq+strideq*2] add maskq, 16*4 .w128: mova [dstq+strideq*1+16* 1], m2 mova [dstq+strideq*0+16* 0], m0 mova [dstq+strideq*1+16* 2], m3 mova [dstq+strideq*0+16* 1], m1 call .main mova [dstq+strideq*1+16* 3], m2 mova [dstq+strideq*0+16* 2], m0 mova [dstq+strideq*1+16* 4], m3 mova [dstq+strideq*0+16* 3], m1 call .main mova [dstq+strideq*1+16* 5], m2 mova [dstq+strideq*0+16* 4], m0 mova [dstq+strideq*1+16* 6], m3 mova [dstq+strideq*0+16* 5], m1 call .main mova [dstq+strideq*1+16* 7], m2 
mova [dstq+strideq*0+16* 6], m0 mova [dstq+strideq*1+16* 8], m3 mova [dstq+strideq*0+16* 7], m1 call .main mova [dstq+strideq*1+16* 9], m2 mova [dstq+strideq*0+16* 8], m0 mova [dstq+strideq*1+16*10], m3 mova [dstq+strideq*0+16* 9], m1 call .main mova [dstq+strideq*1+16*11], m2 mova [dstq+strideq*0+16*10], m0 mova [dstq+strideq*1+16*12], m3 mova [dstq+strideq*0+16*11], m1 call .main mova [dstq+strideq*1+16*13], m2 mova [dstq+strideq*0+16*12], m0 mova [dstq+strideq*1+16*14], m3 mova [dstq+strideq*0+16*13], m1 call .main mova [dstq+strideq*0+16*14], m0 phaddw m2, m3 mova [dstq+strideq*1+16*15], m2 mova [dstq+strideq*0+16*15], m1 call .main paddw m2, [dstq+strideq*1+16* 1] paddw m3, [dstq+strideq*1+16* 2] mova [dstq+strideq*1+16* 0], m0 phaddw m2, m3 mova [dstq+strideq*1+16* 2], m2 mova [dstq+strideq*1+16* 1], m1 call .main paddw m2, [dstq+strideq*1+16* 3] paddw m3, [dstq+strideq*1+16* 4] phaddw m2, m3 paddw m3, m7, [dstq+strideq*1+16* 2] mova [dstq+strideq*1+16* 2], m0 paddw m2, m7 psrlw m3, 2 psrlw m2, 2 mova [dstq+strideq*1+16* 3], m1 packuswb m3, m2 mova [maskq+16*0], m3 call .main paddw m2, [dstq+strideq*1+16* 5] paddw m3, [dstq+strideq*1+16* 6] mova [dstq+strideq*1+16* 4], m0 phaddw m2, m3 mova [dstq+strideq*1+16* 6], m2 mova [dstq+strideq*1+16* 5], m1 call .main paddw m2, [dstq+strideq*1+16* 7] paddw m3, [dstq+strideq*1+16* 8] phaddw m2, m3 paddw m3, m7, [dstq+strideq*1+16* 6] mova [dstq+strideq*1+16* 6], m0 paddw m2, m7 psrlw m3, 2 psrlw m2, 2 mova [dstq+strideq*1+16* 7], m1 packuswb m3, m2 mova [maskq+16*1], m3 call .main paddw m2, [dstq+strideq*1+16* 9] paddw m3, [dstq+strideq*1+16*10] mova [dstq+strideq*1+16* 8], m0 phaddw m2, m3 mova [dstq+strideq*1+16*10], m2 mova [dstq+strideq*1+16* 9], m1 call .main paddw m2, [dstq+strideq*1+16*11] paddw m3, [dstq+strideq*1+16*12] phaddw m2, m3 paddw m3, m7, [dstq+strideq*1+16*10] mova [dstq+strideq*1+16*10], m0 paddw m2, m7 psrlw m3, 2 psrlw m2, 2 mova [dstq+strideq*1+16*11], m1 packuswb m3, m2 mova [maskq+16*2], m3 call .main paddw m2, [dstq+strideq*1+16*13] paddw m3, [dstq+strideq*1+16*14] mova [dstq+strideq*1+16*12], m0 phaddw m2, m3 mova [dstq+strideq*1+16*14], m2 mova [dstq+strideq*1+16*13], m1 call .main phaddw m2, m3 paddw m3, m7, [dstq+strideq*1+16*14] paddw m2, [dstq+strideq*1+16*15] mova [dstq+strideq*1+16*14], m0 paddw m2, m7 psrlw m3, 2 psrlw m2, 2 mova [dstq+strideq*1+16*15], m1 packuswb m3, m2 mova [maskq+16*3], m3 sub hd, 2 jg .w128_loop RET ALIGN function_align .main: %macro W_MASK 2 ; dst/tmp_offset, mask mova m%1, [tmp1q+16*%1] mova m%2, [tmp2q+16*%1] punpcklwd m4, m%2, m%1 punpckhwd m5, m%2, m%1 psubsw m%1, m%2 pabsw m%1, m%1 psubusw m6, m8, m%1 psrlw m6, 10 ; 64-m psubw m%2, m9, m6 ; m punpcklwd m%1, m6, m%2 punpckhwd m6, m%2 pmaddwd m%1, m4 pmaddwd m6, m5 psrad m%1, 5 psrad m6, 5 packssdw m%1, m6 pmaxsw m%1, m10 psubsw m%1, m10 pmulhw m%1, m11 %endmacro W_MASK 0, 2 W_MASK 1, 3 add tmp1q, 16*2 add tmp2q, 16*2 ret cglobal w_mask_422_16bpc, 4, 7, 12, dst, stride, tmp1, tmp2, w, h, mask %define base t0-w_mask_422_ssse3_table LEA t0, w_mask_422_ssse3_table tzcnt wd, wm mov r6d, r8m ; pixel_max movd m7, r7m ; sign shr r6d, 11 movsxd wq, [t0+wq*4] %if ARCH_X86_64 mova m8, [base+pw_27615] mova m9, [base+pw_64] movddup m10, [base+bidir_rnd+r6*8] movddup m11, [base+bidir_mul+r6*8] %else mova m1, [base+pw_27615] mova m2, [base+pw_64] movddup m3, [base+bidir_rnd+r6*8] movddup m4, [base+bidir_mul+r6*8] ALLOC_STACK -16*4 mova [rsp+16*0], m1 mova [rsp+16*1], m2 mova [rsp+16*2], m3 mova [rsp+16*3], m4 %endif pxor m0, m0 add wq, t0 pshufb 
m7, m0 movifnidn hd, r5m mov maskq, r6mp call .main jmp wq .w4_loop: call .main lea dstq, [dstq+strideq*2] .w4: movq [dstq+strideq*0], m0 movhps [dstq+strideq*1], m0 lea dstq, [dstq+strideq*2] movq [dstq+strideq*0], m1 movhps [dstq+strideq*1], m1 sub hd, 4 jg .w4_loop .end: RET .w8_loop: call .main lea dstq, [dstq+strideq*2] .w8: mova [dstq+strideq*0], m0 mova [dstq+strideq*1], m1 sub hd, 2 jg .w8_loop .w8_end: RET .w16_loop: call .main lea dstq, [dstq+strideq*2] .w16: mova [dstq+strideq*0+16*0], m0 mova [dstq+strideq*0+16*1], m1 call .main mova [dstq+strideq*1+16*0], m0 mova [dstq+strideq*1+16*1], m1 sub hd, 2 jg .w16_loop RET .w32_loop: call .main add dstq, strideq .w32: mova [dstq+16*0], m0 mova [dstq+16*1], m1 call .main mova [dstq+16*2], m0 mova [dstq+16*3], m1 dec hd jg .w32_loop RET .w64_loop: call .main add dstq, strideq .w64: mova [dstq+16*0], m0 mova [dstq+16*1], m1 call .main mova [dstq+16*2], m0 mova [dstq+16*3], m1 call .main mova [dstq+16*4], m0 mova [dstq+16*5], m1 call .main mova [dstq+16*6], m0 mova [dstq+16*7], m1 dec hd jg .w64_loop RET .w128_loop: call .main add dstq, strideq .w128: mova [dstq+16* 0], m0 mova [dstq+16* 1], m1 call .main mova [dstq+16* 2], m0 mova [dstq+16* 3], m1 call .main mova [dstq+16* 4], m0 mova [dstq+16* 5], m1 call .main mova [dstq+16* 6], m0 mova [dstq+16* 7], m1 call .main mova [dstq+16* 8], m0 mova [dstq+16* 9], m1 call .main mova [dstq+16*10], m0 mova [dstq+16*11], m1 call .main mova [dstq+16*12], m0 mova [dstq+16*13], m1 call .main mova [dstq+16*14], m0 mova [dstq+16*15], m1 dec hd jg .w128_loop RET ALIGN function_align .main: W_MASK 0, 2 W_MASK 1, 3 phaddw m2, m3 add tmp1q, 16*2 add tmp2q, 16*2 packuswb m2, m2 pxor m3, m3 psubb m2, m7 pavgb m2, m3 movq [maskq], m2 add maskq, 8 ret cglobal w_mask_444_16bpc, 4, 7, 12, dst, stride, tmp1, tmp2, w, h, mask %define base t0-w_mask_444_ssse3_table LEA t0, w_mask_444_ssse3_table tzcnt wd, wm mov r6d, r8m ; pixel_max shr r6d, 11 movsxd wq, [t0+wq*4] %if ARCH_X86_64 mova m8, [base+pw_27615] mova m9, [base+pw_64] movddup m10, [base+bidir_rnd+r6*8] movddup m11, [base+bidir_mul+r6*8] %else mova m1, [base+pw_27615] mova m2, [base+pw_64] movddup m3, [base+bidir_rnd+r6*8] movddup m7, [base+bidir_mul+r6*8] ALLOC_STACK -16*3 mova [rsp+16*0], m1 mova [rsp+16*1], m2 mova [rsp+16*2], m3 %define m11 m7 %endif add wq, t0 movifnidn hd, r5m mov maskq, r6mp call .main jmp wq .w4_loop: call .main lea dstq, [dstq+strideq*2] .w4: movq [dstq+strideq*0], m0 movhps [dstq+strideq*1], m0 lea dstq, [dstq+strideq*2] movq [dstq+strideq*0], m1 movhps [dstq+strideq*1], m1 sub hd, 4 jg .w4_loop .end: RET .w8_loop: call .main lea dstq, [dstq+strideq*2] .w8: mova [dstq+strideq*0], m0 mova [dstq+strideq*1], m1 sub hd, 2 jg .w8_loop .w8_end: RET .w16_loop: call .main lea dstq, [dstq+strideq*2] .w16: mova [dstq+strideq*0+16*0], m0 mova [dstq+strideq*0+16*1], m1 call .main mova [dstq+strideq*1+16*0], m0 mova [dstq+strideq*1+16*1], m1 sub hd, 2 jg .w16_loop RET .w32_loop: call .main add dstq, strideq .w32: mova [dstq+16*0], m0 mova [dstq+16*1], m1 call .main mova [dstq+16*2], m0 mova [dstq+16*3], m1 dec hd jg .w32_loop RET .w64_loop: call .main add dstq, strideq .w64: mova [dstq+16*0], m0 mova [dstq+16*1], m1 call .main mova [dstq+16*2], m0 mova [dstq+16*3], m1 call .main mova [dstq+16*4], m0 mova [dstq+16*5], m1 call .main mova [dstq+16*6], m0 mova [dstq+16*7], m1 dec hd jg .w64_loop RET .w128_loop: call .main add dstq, strideq .w128: mova [dstq+16* 0], m0 mova [dstq+16* 1], m1 call .main mova [dstq+16* 2], m0 mova [dstq+16* 3], m1 call 
.main mova [dstq+16* 4], m0 mova [dstq+16* 5], m1 call .main mova [dstq+16* 6], m0 mova [dstq+16* 7], m1 call .main mova [dstq+16* 8], m0 mova [dstq+16* 9], m1 call .main mova [dstq+16*10], m0 mova [dstq+16*11], m1 call .main mova [dstq+16*12], m0 mova [dstq+16*13], m1 call .main mova [dstq+16*14], m0 mova [dstq+16*15], m1 dec hd jg .w128_loop RET ALIGN function_align .main: W_MASK 0, 2 W_MASK 1, 3 packuswb m2, m3 add tmp1q, 16*2 add tmp2q, 16*2 mova [maskq], m2 add maskq, 16 ret ; (a * (64 - m) + b * m + 32) >> 6 ; = (((b - a) * m + 32) >> 6) + a ; = (((b - a) * (m << 9) + 16384) >> 15) + a ; except m << 9 overflows int16_t when m == 64 (which is possible), ; but if we negate m it works out (-64 << 9 == -32768). ; = (((a - b) * (m * -512) + 16384) >> 15) + a cglobal blend_16bpc, 3, 7, 8, dst, stride, tmp, w, h, mask, stride3 %define base r6-blend_ssse3_table LEA r6, blend_ssse3_table tzcnt wd, wm movifnidn hd, hm movsxd wq, [r6+wq*4] movifnidn maskq, maskmp mova m7, [base+pw_m512] add wq, r6 lea stride3q, [strideq*3] pxor m6, m6 jmp wq .w4: mova m5, [maskq] movq m0, [dstq+strideq*0] movhps m0, [dstq+strideq*1] movq m1, [dstq+strideq*2] movhps m1, [dstq+stride3q ] psubw m2, m0, [tmpq+16*0] psubw m3, m1, [tmpq+16*1] add maskq, 16 add tmpq, 32 punpcklbw m4, m5, m6 punpckhbw m5, m6 pmullw m4, m7 pmullw m5, m7 pmulhrsw m2, m4 pmulhrsw m3, m5 paddw m0, m2 paddw m1, m3 movq [dstq+strideq*0], m0 movhps [dstq+strideq*1], m0 movq [dstq+strideq*2], m1 movhps [dstq+stride3q ], m1 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w4 RET .w8: mova m5, [maskq] mova m0, [dstq+strideq*0] mova m1, [dstq+strideq*1] psubw m2, m0, [tmpq+16*0] psubw m3, m1, [tmpq+16*1] add maskq, 16 add tmpq, 32 punpcklbw m4, m5, m6 punpckhbw m5, m6 pmullw m4, m7 pmullw m5, m7 pmulhrsw m2, m4 pmulhrsw m3, m5 paddw m0, m2 paddw m1, m3 mova [dstq+strideq*0], m0 mova [dstq+strideq*1], m1 lea dstq, [dstq+strideq*2] sub hd, 2 jg .w8 RET .w16: mova m5, [maskq] mova m0, [dstq+16*0] mova m1, [dstq+16*1] psubw m2, m0, [tmpq+16*0] psubw m3, m1, [tmpq+16*1] add maskq, 16 add tmpq, 32 punpcklbw m4, m5, m6 punpckhbw m5, m6 pmullw m4, m7 pmullw m5, m7 pmulhrsw m2, m4 pmulhrsw m3, m5 paddw m0, m2 paddw m1, m3 mova [dstq+16*0], m0 mova [dstq+16*1], m1 add dstq, strideq dec hd jg .w16 RET .w32: mova m5, [maskq+16*0] mova m0, [dstq+16*0] mova m1, [dstq+16*1] psubw m2, m0, [tmpq+16*0] psubw m3, m1, [tmpq+16*1] punpcklbw m4, m5, m6 punpckhbw m5, m6 pmullw m4, m7 pmullw m5, m7 pmulhrsw m2, m4 pmulhrsw m3, m5 paddw m0, m2 paddw m1, m3 mova [dstq+16*0], m0 mova [dstq+16*1], m1 mova m5, [maskq+16*1] mova m0, [dstq+16*2] mova m1, [dstq+16*3] psubw m2, m0, [tmpq+16*2] psubw m3, m1, [tmpq+16*3] add maskq, 32 add tmpq, 64 punpcklbw m4, m5, m6 punpckhbw m5, m6 pmullw m4, m7 pmullw m5, m7 pmulhrsw m2, m4 pmulhrsw m3, m5 paddw m0, m2 paddw m1, m3 mova [dstq+16*2], m0 mova [dstq+16*3], m1 add dstq, strideq dec hd jg .w32 RET cglobal blend_v_16bpc, 3, 6, 6, dst, stride, tmp, w, h %define base r5-blend_v_ssse3_table LEA r5, blend_v_ssse3_table tzcnt wd, wm movifnidn hd, hm movsxd wq, [r5+wq*4] add wq, r5 jmp wq .w2: movd m4, [base+obmc_masks+2*2] .w2_loop: movd m0, [dstq+strideq*0] movd m2, [tmpq+4*0] movd m1, [dstq+strideq*1] movd m3, [tmpq+4*1] add tmpq, 4*2 psubw m2, m0 psubw m3, m1 pmulhrsw m2, m4 pmulhrsw m3, m4 paddw m0, m2 paddw m1, m3 movd [dstq+strideq*0], m0 movd [dstq+strideq*1], m1 lea dstq, [dstq+strideq*2] sub hd, 2 jg .w2_loop RET .w4: movddup m2, [base+obmc_masks+4*2] .w4_loop: movq m0, [dstq+strideq*0] movhps m0, [dstq+strideq*1] mova m1, [tmpq] add 
tmpq, 8*2 psubw m1, m0 pmulhrsw m1, m2 paddw m0, m1 movq [dstq+strideq*0], m0 movhps [dstq+strideq*1], m0 lea dstq, [dstq+strideq*2] sub hd, 2 jg .w4_loop RET .w8: mova m4, [base+obmc_masks+8*2] .w8_loop: mova m0, [dstq+strideq*0] mova m2, [tmpq+16*0] mova m1, [dstq+strideq*1] mova m3, [tmpq+16*1] add tmpq, 16*2 psubw m2, m0 psubw m3, m1 pmulhrsw m2, m4 pmulhrsw m3, m4 paddw m0, m2 paddw m1, m3 mova [dstq+strideq*0], m0 mova [dstq+strideq*1], m1 lea dstq, [dstq+strideq*2] sub hd, 2 jg .w8_loop RET .w16: mova m4, [base+obmc_masks+16*2] movq m5, [base+obmc_masks+16*3] .w16_loop: mova m0, [dstq+16*0] mova m2, [tmpq+16*0] mova m1, [dstq+16*1] mova m3, [tmpq+16*1] add tmpq, 16*2 psubw m2, m0 psubw m3, m1 pmulhrsw m2, m4 pmulhrsw m3, m5 paddw m0, m2 paddw m1, m3 mova [dstq+16*0], m0 mova [dstq+16*1], m1 add dstq, strideq dec hd jg .w16_loop RET .w32: %if WIN64 movaps [rsp+8], m6 %endif mova m4, [base+obmc_masks+16*4] mova m5, [base+obmc_masks+16*5] mova m6, [base+obmc_masks+16*6] .w32_loop: mova m0, [dstq+16*0] mova m2, [tmpq+16*0] mova m1, [dstq+16*1] mova m3, [tmpq+16*1] psubw m2, m0 psubw m3, m1 pmulhrsw m2, m4 pmulhrsw m3, m5 paddw m0, m2 mova m2, [dstq+16*2] paddw m1, m3 mova m3, [tmpq+16*2] add tmpq, 16*4 psubw m3, m2 pmulhrsw m3, m6 paddw m2, m3 mova [dstq+16*0], m0 mova [dstq+16*1], m1 mova [dstq+16*2], m2 add dstq, strideq dec hd jg .w32_loop %if WIN64 movaps m6, [rsp+8] %endif RET %macro BLEND_H_ROW 2-3 0; dst_off, tmp_off, inc_tmp mova m0, [dstq+16*(%1+0)] mova m2, [tmpq+16*(%2+0)] mova m1, [dstq+16*(%1+1)] mova m3, [tmpq+16*(%2+1)] %if %3 add tmpq, 16*%3 %endif psubw m2, m0 psubw m3, m1 pmulhrsw m2, m5 pmulhrsw m3, m5 paddw m0, m2 paddw m1, m3 mova [dstq+16*(%1+0)], m0 mova [dstq+16*(%1+1)], m1 %endmacro cglobal blend_h_16bpc, 3, 7, 6, dst, ds, tmp, w, h, mask %define base r6-blend_h_ssse3_table LEA r6, blend_h_ssse3_table tzcnt wd, wm mov hd, hm movsxd wq, [r6+wq*4] movddup m4, [base+blend_shuf] lea maskq, [base+obmc_masks+hq*2] lea hd, [hq*3] add wq, r6 shr hd, 2 ; h * 3/4 lea maskq, [maskq+hq*2] neg hq jmp wq .w2: movd m0, [dstq+dsq*0] movd m2, [dstq+dsq*1] movd m3, [maskq+hq*2] movq m1, [tmpq] add tmpq, 4*2 punpckldq m0, m2 punpcklwd m3, m3 psubw m1, m0 pmulhrsw m1, m3 paddw m0, m1 movd [dstq+dsq*0], m0 psrlq m0, 32 movd [dstq+dsq*1], m0 lea dstq, [dstq+dsq*2] add hq, 2 jl .w2 RET .w4: mova m3, [base+blend_shuf] .w4_loop: movq m0, [dstq+dsq*0] movhps m0, [dstq+dsq*1] movd m2, [maskq+hq*2] mova m1, [tmpq] add tmpq, 8*2 psubw m1, m0 pshufb m2, m3 pmulhrsw m1, m2 paddw m0, m1 movq [dstq+dsq*0], m0 movhps [dstq+dsq*1], m0 lea dstq, [dstq+dsq*2] add hq, 2 jl .w4_loop RET .w8: movddup m5, [base+blend_shuf+8] %if WIN64 movaps [rsp+ 8], m6 movaps [rsp+24], m7 %endif .w8_loop: movd m7, [maskq+hq*2] mova m0, [dstq+dsq*0] mova m2, [tmpq+16*0] mova m1, [dstq+dsq*1] mova m3, [tmpq+16*1] add tmpq, 16*2 pshufb m6, m7, m4 psubw m2, m0 pshufb m7, m5 psubw m3, m1 pmulhrsw m2, m6 pmulhrsw m3, m7 paddw m0, m2 paddw m1, m3 mova [dstq+dsq*0], m0 mova [dstq+dsq*1], m1 lea dstq, [dstq+dsq*2] add hq, 2 jl .w8_loop %if WIN64 movaps m6, [rsp+ 8] movaps m7, [rsp+24] %endif RET .w16: movd m5, [maskq+hq*2] pshufb m5, m4 BLEND_H_ROW 0, 0, 2 add dstq, dsq inc hq jl .w16 RET .w32: movd m5, [maskq+hq*2] pshufb m5, m4 BLEND_H_ROW 0, 0 BLEND_H_ROW 2, 2, 4 add dstq, dsq inc hq jl .w32 RET .w64: movd m5, [maskq+hq*2] pshufb m5, m4 BLEND_H_ROW 0, 0 BLEND_H_ROW 2, 2 BLEND_H_ROW 4, 4 BLEND_H_ROW 6, 6, 8 add dstq, dsq inc hq jl .w64 RET .w128: movd m5, [maskq+hq*2] pshufb m5, m4 BLEND_H_ROW 0, 0 BLEND_H_ROW 2, 2 
BLEND_H_ROW 4, 4 BLEND_H_ROW 6, 6, 16 BLEND_H_ROW 8, -8 BLEND_H_ROW 10, -6 BLEND_H_ROW 12, -4 BLEND_H_ROW 14, -2 add dstq, dsq inc hq jl .w128 RET ; emu_edge args: ; const intptr_t bw, const intptr_t bh, const intptr_t iw, const intptr_t ih, ; const intptr_t x, const intptr_t y, pixel *dst, const ptrdiff_t dst_stride, ; const pixel *ref, const ptrdiff_t ref_stride ; ; bw, bh total filled size ; iw, ih, copied block -> fill bottom, right ; x, y, offset in bw/bh -> fill top, left cglobal emu_edge_16bpc, 10, 13, 1, bw, bh, iw, ih, x, \ y, dst, dstride, src, sstride, \ bottomext, rightext, blk ; we assume that the buffer (stride) is larger than width, so we can ; safely overwrite by a few bytes %if ARCH_X86_64 %define reg_zero r12q %define reg_tmp r10 %define reg_src srcq %define reg_bottomext bottomextq %define reg_rightext rightextq %define reg_blkm r9m %else %define reg_zero r6 %define reg_tmp r0 %define reg_src r1 %define reg_bottomext r0 %define reg_rightext r1 %define reg_blkm r2m %endif ; ; ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride) xor reg_zero, reg_zero lea reg_tmp, [ihq-1] cmp yq, ihq cmovs reg_tmp, yq test yq, yq cmovs reg_tmp, reg_zero %if ARCH_X86_64 imul reg_tmp, sstrideq add srcq, reg_tmp %else imul reg_tmp, sstridem mov reg_src, srcm add reg_src, reg_tmp %endif ; ; ref += iclip(x, 0, iw - 1) lea reg_tmp, [iwq-1] cmp xq, iwq cmovs reg_tmp, xq test xq, xq cmovs reg_tmp, reg_zero lea reg_src, [reg_src+reg_tmp*2] %if ARCH_X86_32 mov srcm, reg_src %endif ; ; bottom_ext = iclip(y + bh - ih, 0, bh - 1) %if ARCH_X86_32 mov r1, r1m ; restore bh %endif lea reg_bottomext, [yq+bhq] sub reg_bottomext, ihq lea r3, [bhq-1] cmovs reg_bottomext, reg_zero ; DEFINE_ARGS bw, bh, iw, ih, x, \ topext, dst, dstride, src, sstride, \ bottomext, rightext, blk ; top_ext = iclip(-y, 0, bh - 1) neg topextq cmovs topextq, reg_zero cmp reg_bottomext, bhq cmovns reg_bottomext, r3 cmp topextq, bhq cmovg topextq, r3 %if ARCH_X86_32 mov r4m, reg_bottomext ; ; right_ext = iclip(x + bw - iw, 0, bw - 1) mov r0, r0m ; restore bw %endif lea reg_rightext, [xq+bwq] sub reg_rightext, iwq lea r2, [bwq-1] cmovs reg_rightext, reg_zero DEFINE_ARGS bw, bh, iw, ih, leftext, \ topext, dst, dstride, src, sstride, \ bottomext, rightext, blk ; left_ext = iclip(-x, 0, bw - 1) neg leftextq cmovs leftextq, reg_zero cmp reg_rightext, bwq cmovns reg_rightext, r2 %if ARCH_X86_32 mov r3m, r1 %endif cmp leftextq, bwq cmovns leftextq, r2 %undef reg_zero %undef reg_tmp %undef reg_src %undef reg_bottomext %undef reg_rightext DEFINE_ARGS bw, centerh, centerw, dummy, leftext, \ topext, dst, dstride, src, sstride, \ bottomext, rightext, blk ; center_h = bh - top_ext - bottom_ext %if ARCH_X86_64 lea r3, [bottomextq+topextq] sub centerhq, r3 %else mov r1, centerhm ; restore r1 sub centerhq, topextq sub centerhq, r4m mov r1m, centerhq %endif ; ; blk += top_ext * PXSTRIDE(dst_stride) mov r2, topextq %if ARCH_X86_64 imul r2, dstrideq %else mov r6, r6m ; restore dstq imul r2, dstridem %endif add dstq, r2 mov reg_blkm, dstq ; save pointer for ext ; ; center_w = bw - left_ext - right_ext mov centerwq, bwq %if ARCH_X86_64 lea r3, [rightextq+leftextq] sub centerwq, r3 %else sub centerwq, r3m sub centerwq, leftextq %endif ; vloop Macro %macro v_loop 3 ; need_left_ext, need_right_ext, suffix %if ARCH_X86_64 %define reg_tmp r12 %else %define reg_tmp r0 %endif .v_loop_%3: %if ARCH_X86_32 mov r0, r0m mov r1, r1m %endif %if %1 ; left extension %if ARCH_X86_64 movd m0, [srcq] %else mov r3, srcm movd m0, [r3] %endif pshuflw m0, m0, q0000 punpcklqdq m0, 
m0 xor r3, r3 .left_loop_%3: mova [dstq+r3*2], m0 add r3, mmsize/2 cmp r3, leftextq jl .left_loop_%3 ; body lea reg_tmp, [dstq+leftextq*2] %endif xor r3, r3 .body_loop_%3: %if ARCH_X86_64 movu m0, [srcq+r3*2] %else mov r1, srcm movu m0, [r1+r3*2] %endif %if %1 movu [reg_tmp+r3*2], m0 %else movu [dstq+r3*2], m0 %endif add r3, mmsize/2 cmp r3, centerwq jl .body_loop_%3 %if %2 ; right extension %if %1 lea reg_tmp, [reg_tmp+centerwq*2] %else lea reg_tmp, [dstq+centerwq*2] %endif %if ARCH_X86_64 movd m0, [srcq+centerwq*2-2] %else mov r3, srcm movd m0, [r3+centerwq*2-2] %endif pshuflw m0, m0, q0000 punpcklqdq m0, m0 xor r3, r3 .right_loop_%3: movu [reg_tmp+r3*2], m0 add r3, mmsize/2 %if ARCH_X86_64 cmp r3, rightextq %else cmp r3, r3m %endif jl .right_loop_%3 %endif %if ARCH_X86_64 add dstq, dstrideq add srcq, sstrideq dec centerhq jg .v_loop_%3 %else add dstq, dstridem mov r0, sstridem add srcm, r0 sub dword centerhm, 1 jg .v_loop_%3 mov r0, r0m ; restore r0 %endif %endmacro ; vloop MACRO test leftextq, leftextq jnz .need_left_ext %if ARCH_X86_64 test rightextq, rightextq jnz .need_right_ext %else cmp leftextq, r3m ; leftextq == 0 jne .need_right_ext %endif v_loop 0, 0, 0 jmp .body_done ;left right extensions .need_left_ext: %if ARCH_X86_64 test rightextq, rightextq %else mov r3, r3m test r3, r3 %endif jnz .need_left_right_ext v_loop 1, 0, 1 jmp .body_done .need_left_right_ext: v_loop 1, 1, 2 jmp .body_done .need_right_ext: v_loop 0, 1, 3 .body_done: ; r0 ; bw ; r1 ;; x loop ; r4 ;; y loop ; r5 ; topextq ; r6 ;dstq ; r7 ;dstrideq ; r8 ; srcq %if ARCH_X86_64 %define reg_dstride dstrideq %else %define reg_dstride r2 %endif ; ; bottom edge extension %if ARCH_X86_64 test bottomextq, bottomextq jz .top %else xor r1, r1 cmp r1, r4m je .top %endif ; %if ARCH_X86_64 mov srcq, dstq sub srcq, dstrideq xor r1, r1 %else mov r3, dstq mov reg_dstride, dstridem sub r3, reg_dstride mov srcm, r3 %endif ; .bottom_x_loop: %if ARCH_X86_64 mova m0, [srcq+r1*2] lea r3, [dstq+r1*2] mov r4, bottomextq %else mov r3, srcm mova m0, [r3+r1*2] lea r3, [dstq+r1*2] mov r4, r4m %endif ; .bottom_y_loop: mova [r3], m0 add r3, reg_dstride dec r4 jg .bottom_y_loop add r1, mmsize/2 cmp r1, bwq jl .bottom_x_loop .top: ; top edge extension test topextq, topextq jz .end %if ARCH_X86_64 mov srcq, reg_blkm %else mov r3, reg_blkm mov reg_dstride, dstridem %endif mov dstq, dstm xor r1, r1 ; .top_x_loop: %if ARCH_X86_64 mova m0, [srcq+r1*2] %else mov r3, reg_blkm mova m0, [r3+r1*2] %endif lea r3, [dstq+r1*2] mov r4, topextq ; .top_y_loop: mova [r3], m0 add r3, reg_dstride dec r4 jg .top_y_loop add r1, mmsize/2 cmp r1, bwq jl .top_x_loop .end: RET %undef reg_dstride %undef reg_blkm %undef reg_tmp %macro SCRATCH 3 %if ARCH_X86_32 mova [rsp+%3*mmsize], m%1 %define m%2 [rsp+%3*mmsize] %else SWAP %1, %2 %endif %endmacro %if ARCH_X86_64 cglobal resize_16bpc, 0, 12, 16, 1*16, dst, dst_stride, src, src_stride, \ dst_w, h, src_w, dx, mx0, pxmax %elif STACK_ALIGNMENT >= 16 cglobal resize_16bpc, 0, 7, 8, 6*16, dst, dst_stride, src, src_stride, \ dst_w, h, src_w, dx, mx0, pxmax %else cglobal resize_16bpc, 0, 6, 8, 6*16, dst, dst_stride, src, src_stride, \ dst_w, h, src_w, dx, mx0, pxmax %endif movifnidn dstq, dstmp movifnidn srcq, srcmp %if STACK_ALIGNMENT >= 16 movifnidn dst_wd, dst_wm %endif %if ARCH_X86_64 movifnidn hd, hm %endif sub dword mx0m, 4<<14 sub dword src_wm, 8 movd m4, pxmaxm movd m7, dxm movd m6, mx0m movd m5, src_wm punpcklwd m4, m4 pshufd m4, m4, q0000 pshufd m7, m7, q0000 pshufd m6, m6, q0000 pshufd m5, m5, q0000 mova 
[rsp+16*3*ARCH_X86_32], m4 %if ARCH_X86_64 DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, h, x LEA r7, $$ %define base r7-$$ %else DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, x %define hd dword r5m %if STACK_ALIGNMENT >= 16 LEA r6, $$ %define base r6-$$ %else LEA r4, $$ %define base r4-$$ %endif %endif %if ARCH_X86_64 mova m12, [base+pd_64] mova m11, [base+pd_63] %else %define m12 [base+pd_64] %define m11 [base+pd_63] %endif pmaddwd m4, m7, [base+rescale_mul] ; dx*[0,1,2,3] pslld m7, 2 ; dx*4 pslld m5, 14 paddd m6, m4 ; mx+[0..3]*dx SCRATCH 7, 15, 0 SCRATCH 6, 14, 1 SCRATCH 5, 13, 2 pxor m1, m1 .loop_y: xor xd, xd mova m0, m14 ; per-line working version of mx .loop_x: pcmpgtd m1, m0 pandn m1, m0 psrad m2, m0, 8 ; filter offset (unmasked) pcmpgtd m3, m13, m1 pand m1, m3 pandn m3, m13 por m1, m3 psubd m3, m0, m1 ; pshufb offset psrad m1, 14 ; clipped src_x offset psrad m3, 14 ; pshufb edge_emu offset pand m2, m11 ; filter offset (masked) ; load source pixels %if ARCH_X86_64 movd r8d, m1 pshuflw m1, m1, q3232 movd r9d, m1 punpckhqdq m1, m1 movd r10d, m1 psrlq m1, 32 movd r11d, m1 movu m4, [srcq+r8*2] movu m5, [srcq+r9*2] movu m6, [srcq+r10*2] movu m7, [srcq+r11*2] ; if no emulation is required, we don't need to shuffle or emulate edges packssdw m3, m3 movq r11, m3 test r11, r11 jz .filter movsx r8, r11w sar r11, 16 movsx r9, r11w sar r11, 16 movsx r10, r11w sar r11, 16 movu m1, [base+resize_shuf+8+r8*2] movu m3, [base+resize_shuf+8+r9*2] movu m8, [base+resize_shuf+8+r10*2] movu m9, [base+resize_shuf+8+r11*2] pshufb m4, m1 pshufb m5, m3 pshufb m6, m8 pshufb m7, m9 .filter: movd r8d, m2 pshuflw m2, m2, q3232 movd r9d, m2 punpckhqdq m2, m2 movd r10d, m2 psrlq m2, 32 movd r11d, m2 movq m8, [base+resize_filter+r8*8] movq m2, [base+resize_filter+r9*8] pxor m9, m9 punpcklbw m1, m9, m8 punpcklbw m3, m9, m2 psraw m1, 8 psraw m3, 8 movq m10, [base+resize_filter+r10*8] movq m2, [base+resize_filter+r11*8] punpcklbw m8, m9, m10 punpcklbw m9, m2 psraw m8, 8 psraw m9, 8 pmaddwd m4, m1 pmaddwd m5, m3 pmaddwd m6, m8 pmaddwd m7, m9 phaddd m4, m5 %else movd r3, m1 pshuflw m1, m1, q3232 movd r1, m1 punpckhqdq m1, m1 movu m4, [srcq+r3*2] movu m5, [srcq+r1*2] movd r3, m1 psrlq m1, 32 movd r1, m1 movu m6, [srcq+r3*2] movu m7, [srcq+r1*2] ; if no emulation is required, we don't need to shuffle or emulate edges pxor m1, m1 pcmpeqb m1, m3 pmovmskb r3d, m1 cmp r3d, 0xffff je .filter movd r3, m3 movu m1, [base+resize_shuf+8+r3*2] pshuflw m3, m3, q3232 movd r1, m3 pshufb m4, m1 movu m1, [base+resize_shuf+8+r1*2] punpckhqdq m3, m3 movd r3, m3 pshufb m5, m1 movu m1, [base+resize_shuf+8+r3*2] psrlq m3, 32 movd r1, m3 pshufb m6, m1 movu m1, [base+resize_shuf+8+r1*2] pshufb m7, m1 .filter: mova [esp+4*16], m6 mova [esp+5*16], m7 movd r3, m2 pshuflw m2, m2, q3232 movd r1, m2 movq m6, [base+resize_filter+r3*8] movq m7, [base+resize_filter+r1*8] pxor m3, m3 punpcklbw m1, m3, m6 punpcklbw m3, m7 psraw m1, 8 psraw m3, 8 pmaddwd m4, m1 pmaddwd m5, m3 punpckhqdq m2, m2 movd r3, m2 psrlq m2, 32 movd r1, m2 phaddd m4, m5 movq m2, [base+resize_filter+r3*8] movq m5, [base+resize_filter+r1*8] mova m6, [esp+4*16] mova m7, [esp+5*16] pxor m3, m3 punpcklbw m1, m3, m2 punpcklbw m3, m5 psraw m1, 8 psraw m3, 8 pmaddwd m6, m1 pmaddwd m7, m3 %endif phaddd m6, m7 phaddd m4, m6 pxor m1, m1 psubd m2, m12, m4 psrad m2, 7 packssdw m2, m2 pmaxsw m2, m1 pminsw m2, [rsp+16*3*ARCH_X86_32] movq [dstq+xq*2], m2 paddd m0, m15 add xd, 4 %if STACK_ALIGNMENT >= 16 cmp xd, dst_wd %else cmp xd, dst_wm %endif jl .loop_x add dstq, dst_stridemp add 
srcq, src_stridemp dec hd jg .loop_y RET av-scenechange-0.14.1/src/asm/x86/mc_avx2.asm000064400000000000000000005701011046102023000166170ustar 00000000000000; Copyright © 2018-2021, VideoLAN and dav1d authors ; Copyright © 2018-2021, Two Orioles, LLC ; All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions are met: ; ; 1. Redistributions of source code must retain the above copyright notice, this ; list of conditions and the following disclaimer. ; ; 2. Redistributions in binary form must reproduce the above copyright notice, ; this list of conditions and the following disclaimer in the documentation ; and/or other materials provided with the distribution. ; ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. %include "config.asm" %include "src/asm/x86/x86inc.asm" %if ARCH_X86_64 SECTION_RODATA 32 ; dav1d_obmc_masks[] with 64-x interleaved obmc_masks: db 0, 0, 0, 0 ; 2 db 45, 19, 64, 0 ; 4 db 39, 25, 50, 14, 59, 5, 64, 0 ; 8 db 36, 28, 42, 22, 48, 16, 53, 11, 57, 7, 61, 3, 64, 0, 64, 0 ; 16 db 34, 30, 37, 27, 40, 24, 43, 21, 46, 18, 49, 15, 52, 12, 54, 10 db 56, 8, 58, 6, 60, 4, 61, 3, 64, 0, 64, 0, 64, 0, 64, 0 ; 32 db 33, 31, 35, 29, 36, 28, 38, 26, 40, 24, 41, 23, 43, 21, 44, 20 db 45, 19, 47, 17, 48, 16, 50, 14, 51, 13, 52, 12, 53, 11, 55, 9 db 56, 8, 57, 7, 58, 6, 59, 5, 60, 4, 60, 4, 61, 3, 62, 2 db 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0 warp_8x8_shufA: db 0, 2, 4, 6, 1, 3, 5, 7, 1, 3, 5, 7, 2, 4, 6, 8 db 4, 6, 8, 10, 5, 7, 9, 11, 5, 7, 9, 11, 6, 8, 10, 12 warp_8x8_shufB: db 2, 4, 6, 8, 3, 5, 7, 9, 3, 5, 7, 9, 4, 6, 8, 10 db 6, 8, 10, 12, 7, 9, 11, 13, 7, 9, 11, 13, 8, 10, 12, 14 subpel_h_shuf4: db 0, 1, 2, 3, 1, 2, 3, 4, 8, 9, 10, 11, 9, 10, 11, 12 db 2, 3, 4, 5, 3, 4, 5, 6, 10, 11, 12, 13, 11, 12, 13, 14 subpel_h_shufA: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 subpel_h_shufB: db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 subpel_h_shufC: db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 subpel_v_shuf4: db 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 subpel_s_shuf2: db 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 8, 9, 10, 11 subpel_s_shuf8: db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 bilin_h_shuf4: db 0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12 bilin_h_shuf8: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 bilin_v_shuf4: db 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7 deint_shuf4: db 0, 4, 1, 5, 2, 6, 3, 7, 4, 8, 5, 9, 6, 10, 7, 11 blend_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3 pb_8x0_8x8: db 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8 bdct_lb_dw: db 0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12 wswap: db 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13 rescale_mul: dd 0, 1, 
2, 3, 4, 5, 6, 7 resize_shuf: db 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7 wm_420_sign: dd 0x01020102, 0x01010101 wm_422_sign: dd 0x80808080, 0x7f7f7f7f pb_64: times 4 db 64 pw_m256: times 2 dw -256 pw_15: times 2 dw 15 pw_32: times 2 dw 32 pw_34: times 2 dw 34 pw_258: times 2 dw 258 pw_512: times 2 dw 512 pw_1024: times 2 dw 1024 pw_2048: times 2 dw 2048 pw_6903: times 2 dw 6903 pw_8192: times 2 dw 8192 pd_32: dd 32 pd_63: dd 63 pd_512: dd 512 pd_32768: dd 32768 pd_0x3ff: dd 0x3ff pd_0x4000: dd 0x4000 pq_0x40000000: dq 0x40000000 cextern mc_subpel_filters cextern mc_warp_filter2 cextern resize_filter %define subpel_filters (mangle(private_prefix %+ _mc_subpel_filters)-8) %macro BASE_JMP_TABLE 3-* %xdefine %1_%2_table (%%table - %3) %xdefine %%base %1_%2 %%table: %rep %0 - 2 dw %%base %+ _w%3 - %%base %rotate 1 %endrep %endmacro %macro HV_JMP_TABLE 5-* %xdefine %%prefix mangle(private_prefix %+ _%1_%2_8bpc_%3) %xdefine %%base %1_%3 %assign %%types %4 %if %%types & 1 %xdefine %1_%2_h_%3_table (%%h - %5) %%h: %rep %0 - 4 dw %%prefix %+ .h_w%5 - %%base %rotate 1 %endrep %rotate 4 %endif %if %%types & 2 %xdefine %1_%2_v_%3_table (%%v - %5) %%v: %rep %0 - 4 dw %%prefix %+ .v_w%5 - %%base %rotate 1 %endrep %rotate 4 %endif %if %%types & 4 %xdefine %1_%2_hv_%3_table (%%hv - %5) %%hv: %rep %0 - 4 dw %%prefix %+ .hv_w%5 - %%base %rotate 1 %endrep %endif %endmacro %macro BIDIR_JMP_TABLE 2-* %xdefine %1_%2_table (%%table - 2*%3) %xdefine %%base %1_%2_table %xdefine %%prefix mangle(private_prefix %+ _%1_8bpc_%2) %%table: %rep %0 - 2 dd %%prefix %+ .w%3 - %%base %rotate 1 %endrep %endmacro %macro SCALED_JMP_TABLE 2-* %xdefine %1_%2_table (%%table - %3) %xdefine %%base mangle(private_prefix %+ _%1_8bpc_%2) %%table: %rep %0 - 2 dw %%base %+ .w%3 - %%base %rotate 1 %endrep %rotate 2 %%dy_1024: %xdefine %1_%2_dy1_table (%%dy_1024 - %3) %rep %0 - 2 dw %%base %+ .dy1_w%3 - %%base %rotate 1 %endrep %rotate 2 %%dy_2048: %xdefine %1_%2_dy2_table (%%dy_2048 - %3) %rep %0 - 2 dw %%base %+ .dy2_w%3 - %%base %rotate 1 %endrep %endmacro %xdefine put_avx2 mangle(private_prefix %+ _put_bilin_8bpc_avx2.put) %xdefine prep_avx2 mangle(private_prefix %+ _prep_bilin_8bpc_avx2.prep) %define table_offset(type, fn) type %+ fn %+ SUFFIX %+ _table - type %+ SUFFIX BASE_JMP_TABLE put, avx2, 2, 4, 8, 16, 32, 64, 128 BASE_JMP_TABLE prep, avx2, 4, 8, 16, 32, 64, 128 HV_JMP_TABLE put, bilin, avx2, 7, 2, 4, 8, 16, 32, 64, 128 HV_JMP_TABLE prep, bilin, avx2, 7, 4, 8, 16, 32, 64, 128 HV_JMP_TABLE put, 8tap, avx2, 3, 2, 4, 8, 16, 32, 64, 128 HV_JMP_TABLE prep, 8tap, avx2, 1, 4, 8, 16, 32, 64, 128 SCALED_JMP_TABLE put_8tap_scaled, avx2, 2, 4, 8, 16, 32, 64, 128 SCALED_JMP_TABLE prep_8tap_scaled, avx2, 4, 8, 16, 32, 64, 128 BIDIR_JMP_TABLE avg, avx2, 4, 8, 16, 32, 64, 128 BIDIR_JMP_TABLE w_avg, avx2, 4, 8, 16, 32, 64, 128 BIDIR_JMP_TABLE mask, avx2, 4, 8, 16, 32, 64, 128 BIDIR_JMP_TABLE w_mask_420, avx2, 4, 8, 16, 32, 64, 128 BIDIR_JMP_TABLE w_mask_422, avx2, 4, 8, 16, 32, 64, 128 BIDIR_JMP_TABLE w_mask_444, avx2, 4, 8, 16, 32, 64, 128 BIDIR_JMP_TABLE blend, avx2, 4, 8, 16, 32 BIDIR_JMP_TABLE blend_v, avx2, 2, 4, 8, 16, 32 BIDIR_JMP_TABLE blend_h, avx2, 2, 4, 8, 16, 32, 32, 32 SECTION .text INIT_XMM avx2 cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w, h, mxy movifnidn mxyd, r6m ; mx lea r7, [put_avx2] tzcnt wd, wm movifnidn hd, hm test mxyd, mxyd jnz .h mov mxyd, r7m ; my test mxyd, mxyd jnz .v .put: movzx wd, word [r7+wq*2+table_offset(put,)] add wq, r7 jmp wq .put_w2: movzx r6d, word [srcq+ssq*0] movzx r7d, word [srcq+ssq*1] 
lea srcq, [srcq+ssq*2] mov [dstq+dsq*0], r6w mov [dstq+dsq*1], r7w lea dstq, [dstq+dsq*2] sub hd, 2 jg .put_w2 RET .put_w4: mov r6d, [srcq+ssq*0] mov r7d, [srcq+ssq*1] lea srcq, [srcq+ssq*2] mov [dstq+dsq*0], r6d mov [dstq+dsq*1], r7d lea dstq, [dstq+dsq*2] sub hd, 2 jg .put_w4 RET .put_w8: mov r6, [srcq+ssq*0] mov r7, [srcq+ssq*1] lea srcq, [srcq+ssq*2] mov [dstq+dsq*0], r6 mov [dstq+dsq*1], r7 lea dstq, [dstq+dsq*2] sub hd, 2 jg .put_w8 RET .put_w16: movu m0, [srcq+ssq*0] movu m1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] mova [dstq+dsq*0], m0 mova [dstq+dsq*1], m1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .put_w16 RET INIT_YMM avx2 .put_w32: movu m0, [srcq+ssq*0] movu m1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] mova [dstq+dsq*0], m0 mova [dstq+dsq*1], m1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .put_w32 RET .put_w64: movu m0, [srcq+ssq*0+32*0] movu m1, [srcq+ssq*0+32*1] movu m2, [srcq+ssq*1+32*0] movu m3, [srcq+ssq*1+32*1] lea srcq, [srcq+ssq*2] mova [dstq+dsq*0+32*0], m0 mova [dstq+dsq*0+32*1], m1 mova [dstq+dsq*1+32*0], m2 mova [dstq+dsq*1+32*1], m3 lea dstq, [dstq+dsq*2] sub hd, 2 jg .put_w64 RET .put_w128: movu m0, [srcq+32*0] movu m1, [srcq+32*1] movu m2, [srcq+32*2] movu m3, [srcq+32*3] add srcq, ssq mova [dstq+32*0], m0 mova [dstq+32*1], m1 mova [dstq+32*2], m2 mova [dstq+32*3], m3 add dstq, dsq dec hd jg .put_w128 RET .h: ; (16 * src[x] + (mx * (src[x + 1] - src[x])) + 8) >> 4 ; = ((16 - mx) * src[x] + mx * src[x + 1] + 8) >> 4 imul mxyd, 255 vbroadcasti128 m4, [bilin_h_shuf8] add mxyd, 16 movd xm5, mxyd mov mxyd, r7m ; my vpbroadcastw m5, xm5 test mxyd, mxyd jnz .hv movzx wd, word [r7+wq*2+table_offset(put, _bilin_h)] vpbroadcastd m3, [pw_2048] add wq, r7 jmp wq .h_w2: movd xm0, [srcq+ssq*0] pinsrd xm0, [srcq+ssq*1], 1 lea srcq, [srcq+ssq*2] pshufb xm0, xm4 pmaddubsw xm0, xm5 pmulhrsw xm0, xm3 packuswb xm0, xm0 pextrw [dstq+dsq*0], xm0, 0 pextrw [dstq+dsq*1], xm0, 2 lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w2 RET .h_w4: mova xm4, [bilin_h_shuf4] .h_w4_loop: movq xm0, [srcq+ssq*0] movhps xm0, [srcq+ssq*1] lea srcq, [srcq+ssq*2] pshufb xm0, xm4 pmaddubsw xm0, xm5 pmulhrsw xm0, xm3 packuswb xm0, xm0 movd [dstq+dsq*0], xm0 pextrd [dstq+dsq*1], xm0, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w4_loop RET .h_w8: movu xm0, [srcq+ssq*0] movu xm1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] pshufb xm0, xm4 pshufb xm1, xm4 pmaddubsw xm0, xm5 pmaddubsw xm1, xm5 pmulhrsw xm0, xm3 pmulhrsw xm1, xm3 packuswb xm0, xm1 movq [dstq+dsq*0], xm0 movhps [dstq+dsq*1], xm0 lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w8 RET .h_w16: movu xm0, [srcq+ssq*0+8*0] vinserti128 m0, [srcq+ssq*1+8*0], 1 movu xm1, [srcq+ssq*0+8*1] vinserti128 m1, [srcq+ssq*1+8*1], 1 lea srcq, [srcq+ssq*2] pshufb m0, m4 pshufb m1, m4 pmaddubsw m0, m5 pmaddubsw m1, m5 pmulhrsw m0, m3 pmulhrsw m1, m3 packuswb m0, m1 mova [dstq+dsq*0], xm0 vextracti128 [dstq+dsq*1], m0, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w16 RET .h_w32: movu m0, [srcq+8*0] movu m1, [srcq+8*1] add srcq, ssq pshufb m0, m4 pshufb m1, m4 pmaddubsw m0, m5 pmaddubsw m1, m5 pmulhrsw m0, m3 pmulhrsw m1, m3 packuswb m0, m1 mova [dstq], m0 add dstq, dsq dec hd jg .h_w32 RET .h_w64: movu m0, [srcq+8*0] movu m1, [srcq+8*1] pshufb m0, m4 pshufb m1, m4 pmaddubsw m0, m5 pmaddubsw m1, m5 pmulhrsw m0, m3 pmulhrsw m1, m3 packuswb m0, m1 movu m1, [srcq+8*4] movu m2, [srcq+8*5] add srcq, ssq pshufb m1, m4 pshufb m2, m4 pmaddubsw m1, m5 pmaddubsw m2, m5 pmulhrsw m1, m3 pmulhrsw m2, m3 packuswb m1, m2 mova [dstq+32*0], m0 mova [dstq+32*1], m1 add dstq, dsq dec hd jg .h_w64 RET .h_w128: mov r6, -32*3 .h_w128_loop: movu 
m0, [srcq+r6+32*3+8*0] movu m1, [srcq+r6+32*3+8*1] pshufb m0, m4 pshufb m1, m4 pmaddubsw m0, m5 pmaddubsw m1, m5 pmulhrsw m0, m3 pmulhrsw m1, m3 packuswb m0, m1 mova [dstq+r6+32*3], m0 add r6, 32 jle .h_w128_loop add srcq, ssq add dstq, dsq dec hd jg .h_w128 RET .v: movzx wd, word [r7+wq*2+table_offset(put, _bilin_v)] imul mxyd, 255 vpbroadcastd m5, [pw_2048] add mxyd, 16 add wq, r7 movd xm4, mxyd vpbroadcastw m4, xm4 jmp wq .v_w2: movd xm0, [srcq+ssq*0] .v_w2_loop: pinsrw xm1, xm0, [srcq+ssq*1], 1 ; 0 1 lea srcq, [srcq+ssq*2] pinsrw xm0, xm1, [srcq+ssq*0], 0 ; 2 1 pshuflw xm1, xm1, q2301 ; 1 0 punpcklbw xm1, xm0 pmaddubsw xm1, xm4 pmulhrsw xm1, xm5 packuswb xm1, xm1 pextrw [dstq+dsq*0], xm1, 1 pextrw [dstq+dsq*1], xm1, 0 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w2_loop RET .v_w4: movd xm0, [srcq+ssq*0] .v_w4_loop: vpbroadcastd xm2, [srcq+ssq*1] lea srcq, [srcq+ssq*2] vpblendd xm1, xm2, xm0, 0x01 ; 0 1 vpbroadcastd xm0, [srcq+ssq*0] vpblendd xm2, xm0, 0x02 ; 1 2 punpcklbw xm1, xm2 pmaddubsw xm1, xm4 pmulhrsw xm1, xm5 packuswb xm1, xm1 movd [dstq+dsq*0], xm1 pextrd [dstq+dsq*1], xm1, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w4_loop RET .v_w8: movq xm0, [srcq+ssq*0] .v_w8_loop: movq xm2, [srcq+ssq*1] lea srcq, [srcq+ssq*2] punpcklbw xm1, xm0, xm2 movq xm0, [srcq+ssq*0] punpcklbw xm2, xm0 pmaddubsw xm1, xm4 pmaddubsw xm2, xm4 pmulhrsw xm1, xm5 pmulhrsw xm2, xm5 packuswb xm1, xm2 movq [dstq+dsq*0], xm1 movhps [dstq+dsq*1], xm1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w8_loop RET .v_w16: movu xm0, [srcq+ssq*0] .v_w16_loop: vbroadcasti128 m3, [srcq+ssq*1] lea srcq, [srcq+ssq*2] vpblendd m2, m3, m0, 0x0f ; 0 1 vbroadcasti128 m0, [srcq+ssq*0] vpblendd m3, m0, 0xf0 ; 1 2 punpcklbw m1, m2, m3 punpckhbw m2, m3 pmaddubsw m1, m4 pmaddubsw m2, m4 pmulhrsw m1, m5 pmulhrsw m2, m5 packuswb m1, m2 mova [dstq+dsq*0], xm1 vextracti128 [dstq+dsq*1], m1, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w16_loop RET .v_w32: %macro PUT_BILIN_V_W32 0 movu m0, [srcq+ssq*0] %%loop: movu m3, [srcq+ssq*1] lea srcq, [srcq+ssq*2] punpcklbw m1, m0, m3 punpckhbw m2, m0, m3 movu m0, [srcq+ssq*0] pmaddubsw m1, m4 pmaddubsw m2, m4 pmulhrsw m1, m5 pmulhrsw m2, m5 packuswb m1, m2 punpcklbw m2, m3, m0 punpckhbw m3, m0 pmaddubsw m2, m4 pmaddubsw m3, m4 pmulhrsw m2, m5 pmulhrsw m3, m5 packuswb m2, m3 mova [dstq+dsq*0], m1 mova [dstq+dsq*1], m2 lea dstq, [dstq+dsq*2] sub hd, 2 jg %%loop %endmacro PUT_BILIN_V_W32 RET .v_w64: movu m0, [srcq+32*0] movu m1, [srcq+32*1] .v_w64_loop: add srcq, ssq movu m3, [srcq+32*0] punpcklbw m2, m0, m3 punpckhbw m0, m3 pmaddubsw m2, m4 pmaddubsw m0, m4 pmulhrsw m2, m5 pmulhrsw m0, m5 packuswb m2, m0 mova m0, m3 movu m3, [srcq+32*1] mova [dstq+32*0], m2 punpcklbw m2, m1, m3 punpckhbw m1, m3 pmaddubsw m2, m4 pmaddubsw m1, m4 pmulhrsw m2, m5 pmulhrsw m1, m5 packuswb m2, m1 mova m1, m3 mova [dstq+32*1], m2 add dstq, dsq dec hd jg .v_w64_loop RET .v_w128: lea r6d, [hq+(3<<8)] mov r4, srcq mov r7, dstq .v_w128_loop: PUT_BILIN_V_W32 add r4, 32 add r7, 32 movzx hd, r6b mov srcq, r4 mov dstq, r7 sub r6d, 1<<8 jg .v_w128_loop RET .hv: ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 128) >> 8 ; = (src[x] + ((my * (src[x + src_stride] - src[x])) >> 4) + 8) >> 4 movzx wd, word [r7+wq*2+table_offset(put, _bilin_hv)] WIN64_SPILL_XMM 8 shl mxyd, 11 ; can't shift by 12 due to signed overflow vpbroadcastd m7, [pw_15] movd xm6, mxyd add wq, r7 paddb m5, m5 vpbroadcastw m6, xm6 jmp wq .hv_w2: vpbroadcastd xm0, [srcq+ssq*0] pshufb xm0, xm4 pmaddubsw xm0, xm5 .hv_w2_loop: movd xm1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] pinsrd 
xm1, [srcq+ssq*0], 1 pshufb xm1, xm4 pmaddubsw xm1, xm5 ; 1 _ 2 _ shufps xm2, xm0, xm1, q1032 ; 0 _ 1 _ mova xm0, xm1 psubw xm1, xm2 pmulhw xm1, xm6 pavgw xm2, xm7 paddw xm1, xm2 psrlw xm1, 4 packuswb xm1, xm1 pextrw [dstq+dsq*0], xm1, 0 pextrw [dstq+dsq*1], xm1, 2 lea dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w2_loop RET .hv_w4: mova xm4, [bilin_h_shuf4] movddup xm0, [srcq+ssq*0] pshufb xm0, xm4 pmaddubsw xm0, xm5 .hv_w4_loop: movq xm1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] movhps xm1, [srcq+ssq*0] pshufb xm1, xm4 pmaddubsw xm1, xm5 ; 1 2 shufps xm2, xm0, xm1, q1032 ; 0 1 mova xm0, xm1 psubw xm1, xm2 pmulhw xm1, xm6 pavgw xm2, xm7 paddw xm1, xm2 psrlw xm1, 4 packuswb xm1, xm1 movd [dstq+dsq*0], xm1 pextrd [dstq+dsq*1], xm1, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w4_loop RET .hv_w8: vbroadcasti128 m0, [srcq+ssq*0] pshufb m0, m4 pmaddubsw m0, m5 .hv_w8_loop: movu xm1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] vinserti128 m1, [srcq+ssq*0], 1 pshufb m1, m4 pmaddubsw m1, m5 ; 1 2 vperm2i128 m2, m0, m1, 0x21 ; 0 1 mova m0, m1 psubw m1, m2 pmulhw m1, m6 pavgw m2, m7 paddw m1, m2 psrlw m1, 4 vextracti128 xm2, m1, 1 packuswb xm1, xm2 movq [dstq+dsq*0], xm1 movhps [dstq+dsq*1], xm1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w8_loop RET .hv_w16: movu m0, [srcq+ssq*0+8*0] vinserti128 m0, [srcq+ssq*0+8*1], 1 pshufb m0, m4 pmaddubsw m0, m5 .hv_w16_loop: movu xm2, [srcq+ssq*1+8*0] vinserti128 m2, [srcq+ssq*1+8*1], 1 lea srcq, [srcq+ssq*2] movu xm3, [srcq+ssq*0+8*0] vinserti128 m3, [srcq+ssq*0+8*1], 1 pshufb m2, m4 pshufb m3, m4 pmaddubsw m2, m5 psubw m1, m2, m0 pmulhw m1, m6 pavgw m0, m7 paddw m1, m0 pmaddubsw m0, m3, m5 psubw m3, m0, m2 pmulhw m3, m6 pavgw m2, m7 paddw m3, m2 psrlw m1, 4 psrlw m3, 4 packuswb m1, m3 vpermq m1, m1, q3120 mova [dstq+dsq*0], xm1 vextracti128 [dstq+dsq*1], m1, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w16_loop RET .hv_w128: lea r6d, [hq+(3<<16)] jmp .hv_w32_start .hv_w64: lea r6d, [hq+(1<<16)] .hv_w32_start: mov r4, srcq mov r7, dstq .hv_w32: %if WIN64 movaps r4m, xmm8 %endif .hv_w32_loop0: movu m0, [srcq+8*0] movu m1, [srcq+8*1] pshufb m0, m4 pshufb m1, m4 pmaddubsw m0, m5 pmaddubsw m1, m5 .hv_w32_loop: add srcq, ssq movu m2, [srcq+8*0] movu m3, [srcq+8*1] pshufb m2, m4 pshufb m3, m4 pmaddubsw m2, m5 pmaddubsw m3, m5 psubw m8, m2, m0 pmulhw m8, m6 pavgw m0, m7 paddw m8, m0 mova m0, m2 psubw m2, m3, m1 pmulhw m2, m6 pavgw m1, m7 paddw m2, m1 mova m1, m3 psrlw m8, 4 psrlw m2, 4 packuswb m8, m2 mova [dstq], m8 add dstq, dsq dec hd jg .hv_w32_loop add r4, 32 add r7, 32 movzx hd, r6b mov srcq, r4 mov dstq, r7 sub r6d, 1<<16 jg .hv_w32_loop0 %if WIN64 movaps xmm8, r4m %endif RET cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 movifnidn mxyd, r5m ; mx lea r6, [prep%+SUFFIX] tzcnt wd, wm movifnidn hd, hm test mxyd, mxyd jnz .h mov mxyd, r6m ; my test mxyd, mxyd jnz .v .prep: movzx wd, word [r6+wq*2+table_offset(prep,)] add wq, r6 lea stride3q, [strideq*3] jmp wq .prep_w4: movd xm0, [srcq+strideq*0] pinsrd xm0, [srcq+strideq*1], 1 pinsrd xm0, [srcq+strideq*2], 2 pinsrd xm0, [srcq+stride3q ], 3 lea srcq, [srcq+strideq*4] pmovzxbw m0, xm0 psllw m0, 4 mova [tmpq], m0 add tmpq, 32 sub hd, 4 jg .prep_w4 RET .prep_w8: movq xm0, [srcq+strideq*0] movhps xm0, [srcq+strideq*1] movq xm1, [srcq+strideq*2] movhps xm1, [srcq+stride3q ] lea srcq, [srcq+strideq*4] pmovzxbw m0, xm0 pmovzxbw m1, xm1 psllw m0, 4 psllw m1, 4 mova [tmpq+32*0], m0 mova [tmpq+32*1], m1 add tmpq, 32*2 sub hd, 4 jg .prep_w8 RET .prep_w16: pmovzxbw m0, [srcq+strideq*0] pmovzxbw m1, [srcq+strideq*1] pmovzxbw m2, 
[srcq+strideq*2] pmovzxbw m3, [srcq+stride3q ] lea srcq, [srcq+strideq*4] psllw m0, 4 psllw m1, 4 psllw m2, 4 psllw m3, 4 mova [tmpq+32*0], m0 mova [tmpq+32*1], m1 mova [tmpq+32*2], m2 mova [tmpq+32*3], m3 add tmpq, 32*4 sub hd, 4 jg .prep_w16 RET .prep_w32: pmovzxbw m0, [srcq+strideq*0+16*0] pmovzxbw m1, [srcq+strideq*0+16*1] pmovzxbw m2, [srcq+strideq*1+16*0] pmovzxbw m3, [srcq+strideq*1+16*1] lea srcq, [srcq+strideq*2] psllw m0, 4 psllw m1, 4 psllw m2, 4 psllw m3, 4 mova [tmpq+32*0], m0 mova [tmpq+32*1], m1 mova [tmpq+32*2], m2 mova [tmpq+32*3], m3 add tmpq, 32*4 sub hd, 2 jg .prep_w32 RET .prep_w64: pmovzxbw m0, [srcq+16*0] pmovzxbw m1, [srcq+16*1] pmovzxbw m2, [srcq+16*2] pmovzxbw m3, [srcq+16*3] add srcq, strideq psllw m0, 4 psllw m1, 4 psllw m2, 4 psllw m3, 4 mova [tmpq+32*0], m0 mova [tmpq+32*1], m1 mova [tmpq+32*2], m2 mova [tmpq+32*3], m3 add tmpq, 32*4 dec hd jg .prep_w64 RET .prep_w128: pmovzxbw m0, [srcq+16*0] pmovzxbw m1, [srcq+16*1] pmovzxbw m2, [srcq+16*2] pmovzxbw m3, [srcq+16*3] psllw m0, 4 psllw m1, 4 psllw m2, 4 psllw m3, 4 mova [tmpq+32*0], m0 mova [tmpq+32*1], m1 mova [tmpq+32*2], m2 mova [tmpq+32*3], m3 pmovzxbw m0, [srcq+16*4] pmovzxbw m1, [srcq+16*5] pmovzxbw m2, [srcq+16*6] pmovzxbw m3, [srcq+16*7] add tmpq, 32*8 add srcq, strideq psllw m0, 4 psllw m1, 4 psllw m2, 4 psllw m3, 4 mova [tmpq-32*4], m0 mova [tmpq-32*3], m1 mova [tmpq-32*2], m2 mova [tmpq-32*1], m3 dec hd jg .prep_w128 RET .h: ; 16 * src[x] + (mx * (src[x + 1] - src[x])) ; = (16 - mx) * src[x] + mx * src[x + 1] imul mxyd, 255 vbroadcasti128 m4, [bilin_h_shuf8] add mxyd, 16 movd xm5, mxyd mov mxyd, r6m ; my vpbroadcastw m5, xm5 test mxyd, mxyd jnz .hv movzx wd, word [r6+wq*2+table_offset(prep, _bilin_h)] add wq, r6 lea stride3q, [strideq*3] jmp wq .h_w4: vbroadcasti128 m4, [bilin_h_shuf4] .h_w4_loop: movq xm0, [srcq+strideq*0] movhps xm0, [srcq+strideq*1] movq xm1, [srcq+strideq*2] movhps xm1, [srcq+stride3q ] lea srcq, [srcq+strideq*4] vinserti128 m0, xm1, 1 pshufb m0, m4 pmaddubsw m0, m5 mova [tmpq], m0 add tmpq, 32 sub hd, 4 jg .h_w4_loop RET .h_w8: .h_w8_loop: movu xm0, [srcq+strideq*0] vinserti128 m0, [srcq+strideq*1], 1 movu xm1, [srcq+strideq*2] vinserti128 m1, [srcq+stride3q ], 1 lea srcq, [srcq+strideq*4] pshufb m0, m4 pshufb m1, m4 pmaddubsw m0, m5 pmaddubsw m1, m5 mova [tmpq+32*0], m0 mova [tmpq+32*1], m1 add tmpq, 32*2 sub hd, 4 jg .h_w8_loop RET .h_w16: .h_w16_loop: movu xm0, [srcq+strideq*0+8*0] vinserti128 m0, [srcq+strideq*0+8*1], 1 movu xm1, [srcq+strideq*1+8*0] vinserti128 m1, [srcq+strideq*1+8*1], 1 movu xm2, [srcq+strideq*2+8*0] vinserti128 m2, [srcq+strideq*2+8*1], 1 movu xm3, [srcq+stride3q +8*0] vinserti128 m3, [srcq+stride3q +8*1], 1 lea srcq, [srcq+strideq*4] pshufb m0, m4 pshufb m1, m4 pshufb m2, m4 pshufb m3, m4 pmaddubsw m0, m5 pmaddubsw m1, m5 pmaddubsw m2, m5 pmaddubsw m3, m5 mova [tmpq+32*0], m0 mova [tmpq+32*1], m1 mova [tmpq+32*2], m2 mova [tmpq+32*3], m3 add tmpq, 32*4 sub hd, 4 jg .h_w16_loop RET .h_w32: .h_w32_loop: movu xm0, [srcq+strideq*0+8*0] vinserti128 m0, [srcq+strideq*0+8*1], 1 movu xm1, [srcq+strideq*0+8*2] vinserti128 m1, [srcq+strideq*0+8*3], 1 movu xm2, [srcq+strideq*1+8*0] vinserti128 m2, [srcq+strideq*1+8*1], 1 movu xm3, [srcq+strideq*1+8*2] vinserti128 m3, [srcq+strideq*1+8*3], 1 lea srcq, [srcq+strideq*2] pshufb m0, m4 pshufb m1, m4 pshufb m2, m4 pshufb m3, m4 pmaddubsw m0, m5 pmaddubsw m1, m5 pmaddubsw m2, m5 pmaddubsw m3, m5 mova [tmpq+32*0], m0 mova [tmpq+32*1], m1 mova [tmpq+32*2], m2 mova [tmpq+32*3], m3 add tmpq, 32*4 sub hd, 2 jg .h_w32_loop RET 
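; Note on the bilinear .h coefficient packing used above (an explanatory
; sketch, derived from the formula noted at .h, not additional code):
;   mxyd*255 + 16 == (mxy << 8) | (16 - mxy)
; so the word broadcast into m5 holds the byte pair (16-mx, mx), and
; pmaddubsw over the (src[x], src[x+1]) pairs gathered by bilin_h_shuf*
; computes (16-mx)*src[x] + mx*src[x+1] per pixel, i.e.
; 16*src[x] + mx*(src[x+1] - src[x]). For example, mx == 8 packs the
; byte pair (8, 8), giving 8*(src[x] + src[x+1]): the two-tap average
; kept at the same 4-bit scale as the psllw 4 used by the unfiltered
; .prep_w* paths.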
.h_w64: movu xm0, [srcq+8*0] vinserti128 m0, [srcq+8*1], 1 movu xm1, [srcq+8*2] vinserti128 m1, [srcq+8*3], 1 movu xm2, [srcq+8*4] vinserti128 m2, [srcq+8*5], 1 movu xm3, [srcq+8*6] vinserti128 m3, [srcq+8*7], 1 add srcq, strideq pshufb m0, m4 pshufb m1, m4 pshufb m2, m4 pshufb m3, m4 pmaddubsw m0, m5 pmaddubsw m1, m5 pmaddubsw m2, m5 pmaddubsw m3, m5 mova [tmpq+32*0], m0 mova [tmpq+32*1], m1 mova [tmpq+32*2], m2 mova [tmpq+32*3], m3 add tmpq, 32*4 dec hd jg .h_w64 RET .h_w128: movu xm0, [srcq+8*0] vinserti128 m0, [srcq+8*1], 1 movu xm1, [srcq+8*2] vinserti128 m1, [srcq+8*3], 1 movu xm2, [srcq+8*4] vinserti128 m2, [srcq+8*5], 1 movu xm3, [srcq+8*6] vinserti128 m3, [srcq+8*7], 1 pshufb m0, m4 pshufb m1, m4 pshufb m2, m4 pshufb m3, m4 pmaddubsw m0, m5 pmaddubsw m1, m5 pmaddubsw m2, m5 pmaddubsw m3, m5 mova [tmpq+32*0], m0 mova [tmpq+32*1], m1 mova [tmpq+32*2], m2 mova [tmpq+32*3], m3 movu xm0, [srcq+8* 8] vinserti128 m0, [srcq+8* 9], 1 movu xm1, [srcq+8*10] vinserti128 m1, [srcq+8*11], 1 movu xm2, [srcq+8*12] vinserti128 m2, [srcq+8*13], 1 movu xm3, [srcq+8*14] vinserti128 m3, [srcq+8*15], 1 add tmpq, 32*8 add srcq, strideq pshufb m0, m4 pshufb m1, m4 pshufb m2, m4 pshufb m3, m4 pmaddubsw m0, m5 pmaddubsw m1, m5 pmaddubsw m2, m5 pmaddubsw m3, m5 mova [tmpq-32*4], m0 mova [tmpq-32*3], m1 mova [tmpq-32*2], m2 mova [tmpq-32*1], m3 dec hd jg .h_w128 RET .v: WIN64_SPILL_XMM 7 movzx wd, word [r6+wq*2+table_offset(prep, _bilin_v)] imul mxyd, 255 add mxyd, 16 add wq, r6 lea stride3q, [strideq*3] movd xm6, mxyd vpbroadcastw m6, xm6 jmp wq .v_w4: movd xm0, [srcq+strideq*0] .v_w4_loop: vpbroadcastd m1, [srcq+strideq*2] vpbroadcastd xm2, [srcq+strideq*1] vpbroadcastd m3, [srcq+stride3q ] lea srcq, [srcq+strideq*4] vpblendd m1, m0, 0x05 ; 0 2 2 2 vpbroadcastd m0, [srcq+strideq*0] vpblendd m3, m2, 0x0f ; 1 1 3 3 vpblendd m2, m1, m0, 0xa0 ; 0 2 2 4 vpblendd m1, m3, 0xaa ; 0 1 2 3 vpblendd m2, m3, 0x55 ; 1 2 3 4 punpcklbw m1, m2 pmaddubsw m1, m6 mova [tmpq], m1 add tmpq, 32 sub hd, 4 jg .v_w4_loop RET .v_w8: movq xm0, [srcq+strideq*0] .v_w8_loop: vpbroadcastq m1, [srcq+strideq*2] vpbroadcastq m2, [srcq+strideq*1] vpbroadcastq m3, [srcq+stride3q ] lea srcq, [srcq+strideq*4] vpblendd m1, m0, 0x03 ; 0 2 2 2 vpbroadcastq m0, [srcq+strideq*0] vpblendd m2, m3, 0xcc ; 1 3 1 3 vpblendd m3, m2, m1, 0xf0 ; 1 3 2 2 vpblendd m2, m1, 0x0f ; 0 2 1 3 vpblendd m3, m0, 0xc0 ; 1 3 2 4 punpcklbw m1, m2, m3 punpckhbw m2, m3 pmaddubsw m1, m6 pmaddubsw m2, m6 mova [tmpq+32*0], m1 mova [tmpq+32*1], m2 add tmpq, 32*2 sub hd, 4 jg .v_w8_loop RET .v_w16: vbroadcasti128 m0, [srcq+strideq*0] .v_w16_loop: vbroadcasti128 m1, [srcq+strideq*1] vbroadcasti128 m2, [srcq+strideq*2] vbroadcasti128 m3, [srcq+stride3q ] lea srcq, [srcq+strideq*4] shufpd m4, m0, m2, 0x0c ; 0 2 vbroadcasti128 m0, [srcq+strideq*0] shufpd m1, m3, 0x0c ; 1 3 shufpd m2, m0, 0x0c ; 2 4 punpcklbw m3, m4, m1 punpcklbw m5, m1, m2 punpckhbw m4, m1 punpckhbw m1, m2 pmaddubsw m3, m6 pmaddubsw m5, m6 pmaddubsw m4, m6 pmaddubsw m1, m6 mova [tmpq+32*0], m3 mova [tmpq+32*1], m5 mova [tmpq+32*2], m4 mova [tmpq+32*3], m1 add tmpq, 32*4 sub hd, 4 jg .v_w16_loop RET .v_w32: vpermq m0, [srcq+strideq*0], q3120 .v_w32_loop: vpermq m1, [srcq+strideq*1], q3120 vpermq m2, [srcq+strideq*2], q3120 vpermq m3, [srcq+stride3q ], q3120 lea srcq, [srcq+strideq*4] punpcklbw m4, m0, m1 punpckhbw m5, m0, m1 vpermq m0, [srcq+strideq*0], q3120 pmaddubsw m4, m6 pmaddubsw m5, m6 mova [tmpq+32*0], m4 mova [tmpq+32*1], m5 punpcklbw m4, m1, m2 punpckhbw m1, m2 pmaddubsw m4, m6 pmaddubsw m1, m6 punpcklbw 
m5, m2, m3 punpckhbw m2, m3 pmaddubsw m5, m6 pmaddubsw m2, m6 mova [tmpq+32*2], m4 mova [tmpq+32*3], m1 add tmpq, 32*8 punpcklbw m1, m3, m0 punpckhbw m3, m0 pmaddubsw m1, m6 pmaddubsw m3, m6 mova [tmpq-32*4], m5 mova [tmpq-32*3], m2 mova [tmpq-32*2], m1 mova [tmpq-32*1], m3 sub hd, 4 jg .v_w32_loop RET .v_w64: vpermq m0, [srcq+strideq*0+32*0], q3120 vpermq m1, [srcq+strideq*0+32*1], q3120 .v_w64_loop: vpermq m2, [srcq+strideq*1+32*0], q3120 vpermq m3, [srcq+strideq*1+32*1], q3120 lea srcq, [srcq+strideq*2] punpcklbw m4, m0, m2 punpckhbw m0, m2 pmaddubsw m4, m6 pmaddubsw m0, m6 mova [tmpq+32*0], m4 mova [tmpq+32*1], m0 punpcklbw m4, m1, m3 punpckhbw m5, m1, m3 vpermq m0, [srcq+strideq*0+32*0], q3120 vpermq m1, [srcq+strideq*0+32*1], q3120 pmaddubsw m4, m6 pmaddubsw m5, m6 mova [tmpq+32*2], m4 mova [tmpq+32*3], m5 add tmpq, 32*8 punpcklbw m4, m2, m0 punpckhbw m2, m0 punpcklbw m5, m3, m1 punpckhbw m3, m1 pmaddubsw m4, m6 pmaddubsw m2, m6 pmaddubsw m5, m6 pmaddubsw m3, m6 mova [tmpq-32*4], m4 mova [tmpq-32*3], m2 mova [tmpq-32*2], m5 mova [tmpq-32*1], m3 sub hd, 2 jg .v_w64_loop RET .v_w128: lea r6d, [hq+(3<<8)] mov r3, srcq mov r5, tmpq .v_w128_loop0: vpermq m0, [srcq+strideq*0], q3120 .v_w128_loop: vpermq m1, [srcq+strideq*1], q3120 lea srcq, [srcq+strideq*2] punpcklbw m2, m0, m1 punpckhbw m3, m0, m1 vpermq m0, [srcq+strideq*0], q3120 pmaddubsw m2, m6 pmaddubsw m3, m6 punpcklbw m4, m1, m0 punpckhbw m1, m0 pmaddubsw m4, m6 pmaddubsw m1, m6 mova [tmpq+32*0], m2 mova [tmpq+32*1], m3 mova [tmpq+32*8], m4 mova [tmpq+32*9], m1 add tmpq, 32*16 sub hd, 2 jg .v_w128_loop add r3, 32 add r5, 64 movzx hd, r6b mov srcq, r3 mov tmpq, r5 sub r6d, 1<<8 jg .v_w128_loop0 RET .hv: ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 8) >> 4 ; = src[x] + (((my * (src[x + src_stride] - src[x])) + 8) >> 4) %assign stack_offset stack_offset - stack_size_padded WIN64_SPILL_XMM 7 movzx wd, word [r6+wq*2+table_offset(prep, _bilin_hv)] shl mxyd, 11 movd xm6, mxyd vpbroadcastw m6, xm6 add wq, r6 lea stride3q, [strideq*3] jmp wq .hv_w4: vbroadcasti128 m4, [bilin_h_shuf4] vpbroadcastq m0, [srcq+strideq*0] pshufb m0, m4 pmaddubsw m0, m5 .hv_w4_loop: movq xm1, [srcq+strideq*1] movhps xm1, [srcq+strideq*2] movq xm2, [srcq+stride3q ] lea srcq, [srcq+strideq*4] movhps xm2, [srcq+strideq*0] vinserti128 m1, xm2, 1 pshufb m1, m4 pmaddubsw m1, m5 ; 1 2 3 4 vpblendd m2, m1, m0, 0xc0 vpermq m2, m2, q2103 ; 0 1 2 3 mova m0, m1 psubw m1, m2 pmulhrsw m1, m6 paddw m1, m2 mova [tmpq], m1 add tmpq, 32 sub hd, 4 jg .hv_w4_loop RET .hv_w8: vbroadcasti128 m0, [srcq+strideq*0] pshufb m0, m4 pmaddubsw m0, m5 .hv_w8_loop: movu xm1, [srcq+strideq*1] vinserti128 m1, [srcq+strideq*2], 1 movu xm2, [srcq+stride3q ] lea srcq, [srcq+strideq*4] vinserti128 m2, [srcq+strideq*0], 1 pshufb m1, m4 pshufb m2, m4 pmaddubsw m1, m5 ; 1 2 vperm2i128 m3, m0, m1, 0x21 ; 0 1 pmaddubsw m0, m2, m5 ; 3 4 vperm2i128 m2, m1, m0, 0x21 ; 2 3 psubw m1, m3 pmulhrsw m1, m6 paddw m1, m3 psubw m3, m0, m2 pmulhrsw m3, m6 paddw m3, m2 mova [tmpq+32*0], m1 mova [tmpq+32*1], m3 add tmpq, 32*2 sub hd, 4 jg .hv_w8_loop RET .hv_w16: movu xm0, [srcq+strideq*0+8*0] vinserti128 m0, [srcq+strideq*0+8*1], 1 pshufb m0, m4 pmaddubsw m0, m5 .hv_w16_loop: movu xm1, [srcq+strideq*1+8*0] vinserti128 m1, [srcq+strideq*1+8*1], 1 lea srcq, [srcq+strideq*2] movu xm2, [srcq+strideq*0+8*0] vinserti128 m2, [srcq+strideq*0+8*1], 1 pshufb m1, m4 pshufb m2, m4 pmaddubsw m1, m5 psubw m3, m1, m0 pmulhrsw m3, m6 paddw m3, m0 pmaddubsw m0, m2, m5 psubw m2, m0, m1 pmulhrsw m2, m6 paddw m2, m1 mova 
[tmpq+32*0], m3 mova [tmpq+32*1], m2 add tmpq, 32*2 sub hd, 2 jg .hv_w16_loop RET .hv_w32: movu xm0, [srcq+8*0] vinserti128 m0, [srcq+8*1], 1 movu xm1, [srcq+8*2] vinserti128 m1, [srcq+8*3], 1 pshufb m0, m4 pshufb m1, m4 pmaddubsw m0, m5 pmaddubsw m1, m5 .hv_w32_loop: add srcq, strideq movu xm2, [srcq+8*0] vinserti128 m2, [srcq+8*1], 1 pshufb m2, m4 pmaddubsw m2, m5 psubw m3, m2, m0 pmulhrsw m3, m6 paddw m3, m0 mova m0, m2 movu xm2, [srcq+8*2] vinserti128 m2, [srcq+8*3], 1 pshufb m2, m4 pmaddubsw m2, m5 mova [tmpq+32*0], m3 psubw m3, m2, m1 pmulhrsw m3, m6 paddw m3, m1 mova m1, m2 mova [tmpq+32*1], m3 add tmpq, 32*2 dec hd jg .hv_w32_loop RET .hv_w128: lea r3d, [hq+(7<<8)] mov r6d, 256 jmp .hv_w64_start .hv_w64: lea r3d, [hq+(3<<8)] mov r6d, 128 .hv_w64_start: %if WIN64 PUSH r7 %endif mov r5, srcq mov r7, tmpq .hv_w64_loop0: movu xm0, [srcq+strideq*0+8*0] vinserti128 m0, [srcq+strideq*0+8*1], 1 pshufb m0, m4 pmaddubsw m0, m5 .hv_w64_loop: movu xm1, [srcq+strideq*1+8*0] vinserti128 m1, [srcq+strideq*1+8*1], 1 lea srcq, [srcq+strideq*2] movu xm2, [srcq+strideq*0+8*0] vinserti128 m2, [srcq+strideq*0+8*1], 1 pshufb m1, m4 pshufb m2, m4 pmaddubsw m1, m5 psubw m3, m1, m0 pmulhrsw m3, m6 paddw m3, m0 pmaddubsw m0, m2, m5 psubw m2, m0, m1 pmulhrsw m2, m6 paddw m2, m1 mova [tmpq+r6*0], m3 mova [tmpq+r6*1], m2 lea tmpq, [tmpq+r6*2] sub hd, 2 jg .hv_w64_loop add r5, 16 add r7, 32 movzx hd, r3b mov srcq, r5 mov tmpq, r7 sub r3d, 1<<8 jg .hv_w64_loop0 %if WIN64 POP r7 %endif RET ; int8_t subpel_filters[5][15][8] %assign FILTER_REGULAR (0*15 << 16) | 3*15 %assign FILTER_SMOOTH (1*15 << 16) | 4*15 %assign FILTER_SHARP (2*15 << 16) | 3*15 %macro FN 4 ; fn, type, type_h, type_v cglobal %1_%2_8bpc mov t0d, FILTER_%3 %ifidn %3, %4 mov t1d, t0d %else mov t1d, FILTER_%4 %endif %ifnidn %2, regular ; skip the jump in the last filter jmp mangle(private_prefix %+ _%1_8bpc %+ SUFFIX) %endif %endmacro %if WIN64 DECLARE_REG_TMP 4, 5 %else DECLARE_REG_TMP 7, 8 %endif %define PUT_8TAP_FN FN put_8tap, PUT_8TAP_FN sharp, SHARP, SHARP PUT_8TAP_FN sharp_smooth, SHARP, SMOOTH PUT_8TAP_FN smooth_sharp, SMOOTH, SHARP PUT_8TAP_FN smooth, SMOOTH, SMOOTH PUT_8TAP_FN sharp_regular, SHARP, REGULAR PUT_8TAP_FN regular_sharp, REGULAR, SHARP PUT_8TAP_FN smooth_regular, SMOOTH, REGULAR PUT_8TAP_FN regular_smooth, REGULAR, SMOOTH PUT_8TAP_FN regular, REGULAR, REGULAR cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 imul mxd, mxm, 0x010101 add mxd, t0d ; 8tap_h, mx, 4tap_h imul myd, mym, 0x010101 add myd, t1d ; 8tap_v, my, 4tap_v lea r8, [put_avx2] movsxd wq, wm movifnidn hd, hm test mxd, 0xf00 jnz .h test myd, 0xf00 jnz .v tzcnt wd, wd movzx wd, word [r8+wq*2+table_offset(put,)] add wq, r8 lea r6, [ssq*3] lea r7, [dsq*3] %if WIN64 pop r8 %endif jmp wq .h: test myd, 0xf00 jnz .hv vpbroadcastd m5, [pw_34] ; 2 + (8 << 2) WIN64_SPILL_XMM 11 cmp wd, 4 jl .h_w2 vbroadcasti128 m6, [subpel_h_shufA] je .h_w4 tzcnt wd, wd vbroadcasti128 m7, [subpel_h_shufB] vbroadcasti128 m8, [subpel_h_shufC] shr mxd, 16 sub srcq, 3 movzx wd, word [r8+wq*2+table_offset(put, _8tap_h)] vpbroadcastd m9, [r8+mxq*8+subpel_filters-put_avx2+0] vpbroadcastd m10, [r8+mxq*8+subpel_filters-put_avx2+4] add wq, r8 jmp wq .h_w2: movzx mxd, mxb dec srcq mova xm4, [subpel_h_shuf4] vpbroadcastd xm3, [r8+mxq*8+subpel_filters-put_avx2+2] .h_w2_loop: movq xm0, [srcq+ssq*0] movhps xm0, [srcq+ssq*1] lea srcq, [srcq+ssq*2] pshufb xm0, xm4 pmaddubsw xm0, xm3 phaddw xm0, xm0 paddw xm0, xm5 psraw xm0, 6 packuswb xm0, xm0 pextrw [dstq+dsq*0], xm0, 0 pextrw [dstq+dsq*1], 
xm0, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w2_loop RET .h_w4: movzx mxd, mxb dec srcq vpbroadcastd xm3, [r8+mxq*8+subpel_filters-put_avx2+2] .h_w4_loop: movq xm0, [srcq+ssq*0] movq xm1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] pshufb xm0, xm6 pshufb xm1, xm6 pmaddubsw xm0, xm3 pmaddubsw xm1, xm3 phaddw xm0, xm1 paddw xm0, xm5 psraw xm0, 6 packuswb xm0, xm0 movd [dstq+dsq*0], xm0 pextrd [dstq+dsq*1], xm0, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w4_loop RET .h_w8: %macro PUT_8TAP_H 4 ; dst/src, tmp[1-3] pshufb m%2, m%1, m7 pshufb m%3, m%1, m8 pshufb m%1, m6 pmaddubsw m%4, m%2, m9 pmaddubsw m%2, m10 pmaddubsw m%3, m10 pmaddubsw m%1, m9 paddw m%3, m%4 paddw m%1, m%2 phaddw m%1, m%3 paddw m%1, m5 psraw m%1, 6 %endmacro movu xm0, [srcq+ssq*0] vinserti128 m0, [srcq+ssq*1], 1 lea srcq, [srcq+ssq*2] PUT_8TAP_H 0, 1, 2, 3 vextracti128 xm1, m0, 1 packuswb xm0, xm1 movq [dstq+dsq*0], xm0 movhps [dstq+dsq*1], xm0 lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w8 RET .h_w16: movu xm0, [srcq+ssq*0+8*0] vinserti128 m0, [srcq+ssq*1+8*0], 1 movu xm1, [srcq+ssq*0+8*1] vinserti128 m1, [srcq+ssq*1+8*1], 1 PUT_8TAP_H 0, 2, 3, 4 lea srcq, [srcq+ssq*2] PUT_8TAP_H 1, 2, 3, 4 packuswb m0, m1 mova [dstq+dsq*0], xm0 vextracti128 [dstq+dsq*1], m0, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w16 RET .h_w32: xor r6d, r6d jmp .h_start .h_w64: mov r6, -32*1 jmp .h_start .h_w128: mov r6, -32*3 .h_start: sub srcq, r6 sub dstq, r6 mov r4, r6 .h_loop: movu m0, [srcq+r6+8*0] movu m1, [srcq+r6+8*1] PUT_8TAP_H 0, 2, 3, 4 PUT_8TAP_H 1, 2, 3, 4 packuswb m0, m1 mova [dstq+r6], m0 add r6, 32 jle .h_loop add srcq, ssq add dstq, dsq mov r6, r4 dec hd jg .h_loop RET .v: %assign stack_offset stack_offset - stack_size_padded WIN64_SPILL_XMM 16 movzx mxd, myb shr myd, 16 cmp hd, 6 cmovs myd, mxd tzcnt r6d, wd movzx r6d, word [r8+r6*2+table_offset(put, _8tap_v)] vpbroadcastd m7, [pw_512] lea myq, [r8+myq*8+subpel_filters-put_avx2] vpbroadcastw m8, [myq+0] vpbroadcastw m9, [myq+2] vpbroadcastw m10, [myq+4] vpbroadcastw m11, [myq+6] add r6, r8 lea ss3q, [ssq*3] sub srcq, ss3q jmp r6 .v_w2: movd xm2, [srcq+ssq*0] pinsrw xm2, [srcq+ssq*1], 2 pinsrw xm2, [srcq+ssq*2], 4 add srcq, ss3q pinsrw xm2, [srcq+ssq*0], 6 ; 0 1 2 3 movd xm3, [srcq+ssq*1] vpbroadcastd xm1, [srcq+ssq*2] add srcq, ss3q vpbroadcastd xm0, [srcq+ssq*0] vpblendd xm3, xm1, 0x02 ; 4 5 vpblendd xm1, xm0, 0x02 ; 5 6 palignr xm4, xm3, xm2, 4 ; 1 2 3 4 punpcklbw xm3, xm1 ; 45 56 punpcklbw xm1, xm2, xm4 ; 01 12 punpckhbw xm2, xm4 ; 23 34 .v_w2_loop: pmaddubsw xm5, xm1, xm8 ; a0 b0 mova xm1, xm2 pmaddubsw xm2, xm9 ; a1 b1 paddw xm5, xm2 mova xm2, xm3 pmaddubsw xm3, xm10 ; a2 b2 paddw xm5, xm3 vpbroadcastd xm4, [srcq+ssq*1] lea srcq, [srcq+ssq*2] vpblendd xm3, xm0, xm4, 0x02 ; 6 7 vpbroadcastd xm0, [srcq+ssq*0] vpblendd xm4, xm0, 0x02 ; 7 8 punpcklbw xm3, xm4 ; 67 78 pmaddubsw xm4, xm3, xm11 ; a3 b3 paddw xm5, xm4 pmulhrsw xm5, xm7 packuswb xm5, xm5 pextrw [dstq+dsq*0], xm5, 0 pextrw [dstq+dsq*1], xm5, 2 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w2_loop RET .v_w4: movd xm2, [srcq+ssq*0] pinsrd xm2, [srcq+ssq*1], 1 pinsrd xm2, [srcq+ssq*2], 2 add srcq, ss3q pinsrd xm2, [srcq+ssq*0], 3 ; 0 1 2 3 movd xm3, [srcq+ssq*1] vpbroadcastd xm1, [srcq+ssq*2] add srcq, ss3q vpbroadcastd xm0, [srcq+ssq*0] vpblendd xm3, xm1, 0x02 ; 4 5 vpblendd xm1, xm0, 0x02 ; 5 6 palignr xm4, xm3, xm2, 4 ; 1 2 3 4 punpcklbw xm3, xm1 ; 45 56 punpcklbw xm1, xm2, xm4 ; 01 12 punpckhbw xm2, xm4 ; 23 34 .v_w4_loop: pmaddubsw xm5, xm1, xm8 ; a0 b0 mova xm1, xm2 pmaddubsw xm2, xm9 ; a1 b1 paddw xm5, xm2 mova xm2, xm3 pmaddubsw xm3, xm10 
; a2 b2 paddw xm5, xm3 vpbroadcastd xm4, [srcq+ssq*1] lea srcq, [srcq+ssq*2] vpblendd xm3, xm0, xm4, 0x02 ; 6 7 vpbroadcastd xm0, [srcq+ssq*0] vpblendd xm4, xm0, 0x02 ; 7 8 punpcklbw xm3, xm4 ; 67 78 pmaddubsw xm4, xm3, xm11 ; a3 b3 paddw xm5, xm4 pmulhrsw xm5, xm7 packuswb xm5, xm5 movd [dstq+dsq*0], xm5 pextrd [dstq+dsq*1], xm5, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w4_loop RET .v_w8: movq xm1, [srcq+ssq*0] vpbroadcastq m4, [srcq+ssq*1] vpbroadcastq m2, [srcq+ssq*2] add srcq, ss3q vpbroadcastq m5, [srcq+ssq*0] vpbroadcastq m3, [srcq+ssq*1] vpbroadcastq m6, [srcq+ssq*2] add srcq, ss3q vpbroadcastq m0, [srcq+ssq*0] vpblendd m1, m4, 0x30 vpblendd m4, m2, 0x30 punpcklbw m1, m4 ; 01 12 vpblendd m2, m5, 0x30 vpblendd m5, m3, 0x30 punpcklbw m2, m5 ; 23 34 vpblendd m3, m6, 0x30 vpblendd m6, m0, 0x30 punpcklbw m3, m6 ; 45 56 .v_w8_loop: vpbroadcastq m4, [srcq+ssq*1] lea srcq, [srcq+ssq*2] pmaddubsw m5, m1, m8 ; a0 b0 mova m1, m2 pmaddubsw m2, m9 ; a1 b1 paddw m5, m2 mova m2, m3 pmaddubsw m3, m10 ; a2 b2 paddw m5, m3 vpblendd m3, m0, m4, 0x30 vpbroadcastq m0, [srcq+ssq*0] vpblendd m4, m0, 0x30 punpcklbw m3, m4 ; 67 78 pmaddubsw m4, m3, m11 ; a3 b3 paddw m5, m4 pmulhrsw m5, m7 vextracti128 xm4, m5, 1 packuswb xm5, xm4 movq [dstq+dsq*0], xm5 movhps [dstq+dsq*1], xm5 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w8_loop RET .v_w16: .v_w32: .v_w64: .v_w128: lea r6d, [wq*8-128] mov r4, srcq mov r7, dstq lea r6d, [hq+r6*2] .v_w16_loop0: vbroadcasti128 m4, [srcq+ssq*0] vbroadcasti128 m5, [srcq+ssq*1] vbroadcasti128 m6, [srcq+ssq*2] add srcq, ss3q vbroadcasti128 m0, [srcq+ssq*0] vbroadcasti128 m1, [srcq+ssq*1] vbroadcasti128 m2, [srcq+ssq*2] add srcq, ss3q vbroadcasti128 m3, [srcq+ssq*0] shufpd m4, m0, 0x0c shufpd m5, m1, 0x0c punpcklbw m1, m4, m5 ; 01 punpckhbw m4, m5 ; 34 shufpd m6, m2, 0x0c punpcklbw m2, m5, m6 ; 12 punpckhbw m5, m6 ; 45 shufpd m0, m3, 0x0c punpcklbw m3, m6, m0 ; 23 punpckhbw m6, m0 ; 56 .v_w16_loop: vbroadcasti128 m12, [srcq+ssq*1] lea srcq, [srcq+ssq*2] vbroadcasti128 m13, [srcq+ssq*0] pmaddubsw m14, m1, m8 ; a0 pmaddubsw m15, m2, m8 ; b0 mova m1, m3 mova m2, m4 pmaddubsw m3, m9 ; a1 pmaddubsw m4, m9 ; b1 paddw m14, m3 paddw m15, m4 mova m3, m5 mova m4, m6 pmaddubsw m5, m10 ; a2 pmaddubsw m6, m10 ; b2 paddw m14, m5 paddw m15, m6 shufpd m6, m0, m12, 0x0d shufpd m0, m12, m13, 0x0c punpcklbw m5, m6, m0 ; 67 punpckhbw m6, m0 ; 78 pmaddubsw m12, m5, m11 ; a3 pmaddubsw m13, m6, m11 ; b3 paddw m14, m12 paddw m15, m13 pmulhrsw m14, m7 pmulhrsw m15, m7 packuswb m14, m15 vpermq m14, m14, q3120 mova [dstq+dsq*0], xm14 vextracti128 [dstq+dsq*1], m14, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w16_loop add r4, 16 add r7, 16 movzx hd, r6b mov srcq, r4 mov dstq, r7 sub r6d, 1<<8 jg .v_w16_loop0 RET .hv: %assign stack_offset stack_offset - stack_size_padded WIN64_SPILL_XMM 16 cmp wd, 4 jg .hv_w8 movzx mxd, mxb dec srcq vpbroadcastd m7, [r8+mxq*8+subpel_filters-put_avx2+2] movzx mxd, myb shr myd, 16 cmp hd, 6 cmovs myd, mxd vpbroadcastq m0, [r8+myq*8+subpel_filters-put_avx2] lea ss3q, [ssq*3] sub srcq, ss3q punpcklbw m0, m0 psraw m0, 8 ; sign-extend vpbroadcastd m8, [pw_8192] vpbroadcastd m9, [pd_512] pshufd m10, m0, q0000 pshufd m11, m0, q1111 pshufd m12, m0, q2222 pshufd m13, m0, q3333 cmp wd, 4 je .hv_w4 vbroadcasti128 m6, [subpel_h_shuf4] movq xm2, [srcq+ssq*0] movhps xm2, [srcq+ssq*1] movq xm0, [srcq+ssq*2] add srcq, ss3q movhps xm0, [srcq+ssq*0] vpbroadcastq m3, [srcq+ssq*1] vpbroadcastq m4, [srcq+ssq*2] add srcq, ss3q vpbroadcastq m1, [srcq+ssq*0] vpblendd m2, m3, 0x30 vpblendd m0, m1, 0x30 vpblendd 
m2, m4, 0xc0 pshufb m2, m6 pshufb m0, m6 pmaddubsw m2, m7 pmaddubsw m0, m7 phaddw m2, m0 pmulhrsw m2, m8 vextracti128 xm3, m2, 1 palignr xm4, xm3, xm2, 4 punpcklwd xm1, xm2, xm4 ; 01 12 punpckhwd xm2, xm4 ; 23 34 pshufd xm0, xm3, q2121 punpcklwd xm3, xm0 ; 45 56 .hv_w2_loop: movq xm4, [srcq+ssq*1] lea srcq, [srcq+ssq*2] movhps xm4, [srcq+ssq*0] pshufb xm4, xm6 pmaddubsw xm4, xm7 pmaddwd xm5, xm1, xm10 ; a0 b0 mova xm1, xm2 pmaddwd xm2, xm11 ; a1 b1 paddd xm5, xm2 mova xm2, xm3 pmaddwd xm3, xm12 ; a2 b2 phaddw xm4, xm4 pmulhrsw xm4, xm8 paddd xm5, xm3 palignr xm3, xm4, xm0, 12 mova xm0, xm4 punpcklwd xm3, xm0 ; 67 78 pmaddwd xm4, xm3, xm13 ; a3 b3 paddd xm5, xm9 paddd xm5, xm4 psrad xm5, 10 packssdw xm5, xm5 packuswb xm5, xm5 pextrw [dstq+dsq*0], xm5, 0 pextrw [dstq+dsq*1], xm5, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w2_loop RET .hv_w4: mova m6, [subpel_h_shuf4] vpbroadcastq m2, [srcq+ssq*0] vpbroadcastq m4, [srcq+ssq*1] vpbroadcastq m0, [srcq+ssq*2] add srcq, ss3q vpbroadcastq m5, [srcq+ssq*0] vpbroadcastq m3, [srcq+ssq*1] vpblendd m2, m4, 0xcc ; 0 1 vpbroadcastq m4, [srcq+ssq*2] add srcq, ss3q vpbroadcastq m1, [srcq+ssq*0] vpblendd m0, m5, 0xcc ; 2 3 vpblendd m3, m4, 0xcc ; 4 5 pshufb m2, m6 pshufb m0, m6 pshufb m3, m6 pshufb m1, m6 pmaddubsw m2, m7 pmaddubsw m0, m7 pmaddubsw m3, m7 pmaddubsw m1, m7 phaddw m2, m0 phaddw m3, m1 pmulhrsw m2, m8 pmulhrsw m3, m8 palignr m4, m3, m2, 4 punpcklwd m1, m2, m4 ; 01 12 punpckhwd m2, m4 ; 23 34 pshufd m0, m3, q2121 punpcklwd m3, m0 ; 45 56 .hv_w4_loop: vpbroadcastq m4, [srcq+ssq*1] lea srcq, [srcq+ssq*2] pmaddwd m5, m1, m10 ; a0 b0 mova m1, m2 pmaddwd m2, m11 ; a1 b1 paddd m5, m2 mova m2, m3 pmaddwd m3, m12 ; a2 b2 paddd m5, m3 vpbroadcastq m3, [srcq+ssq*0] vpblendd m4, m3, 0xcc ; 7 8 pshufb m4, m6 pmaddubsw m4, m7 phaddw m4, m4 pmulhrsw m4, m8 palignr m3, m4, m0, 12 mova m0, m4 punpcklwd m3, m0 ; 67 78 pmaddwd m4, m3, m13 ; a3 b3 paddd m5, m9 paddd m5, m4 psrad m5, 10 vextracti128 xm4, m5, 1 packssdw xm5, xm4 packuswb xm5, xm5 pshuflw xm5, xm5, q3120 movd [dstq+dsq*0], xm5 pextrd [dstq+dsq*1], xm5, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w4_loop RET .hv_w8: shr mxd, 16 sub srcq, 3 vpbroadcastd m10, [r8+mxq*8+subpel_filters-put_avx2+0] vpbroadcastd m11, [r8+mxq*8+subpel_filters-put_avx2+4] movzx mxd, myb shr myd, 16 cmp hd, 6 cmovs myd, mxd vpbroadcastq m0, [r8+myq*8+subpel_filters-put_avx2] lea ss3q, [ssq*3] sub srcq, ss3q punpcklbw m0, m0 psraw m0, 8 ; sign-extend pshufd m12, m0, q0000 pshufd m13, m0, q1111 pshufd m14, m0, q2222 pshufd m15, m0, q3333 lea r6d, [wq*8-64] mov r4, srcq mov r7, dstq lea r6d, [hq+r6*4] .hv_w8_loop0: vbroadcasti128 m7, [subpel_h_shufA] movu xm4, [srcq+ssq*0] vbroadcasti128 m8, [subpel_h_shufB] movu xm5, [srcq+ssq*1] vbroadcasti128 m9, [subpel_h_shufC] movu xm6, [srcq+ssq*2] add srcq, ss3q vbroadcasti128 m0, [srcq+ssq*0] vpblendd m4, m0, 0xf0 ; 0 3 vinserti128 m5, [srcq+ssq*1], 1 ; 1 4 vinserti128 m6, [srcq+ssq*2], 1 ; 2 5 add srcq, ss3q vinserti128 m0, [srcq+ssq*0], 1 ; 3 6 %macro HV_H_W8 4-7 ; src/dst, tmp[1-3], shuf[1-3] pshufb %3, %1, %6 pshufb %4, %1, %7 pshufb %1, %5 pmaddubsw %2, %3, m10 pmaddubsw %4, m11 pmaddubsw %3, m11 pmaddubsw %1, m10 paddw %2, %4 paddw %1, %3 phaddw %1, %2 %endmacro HV_H_W8 m4, m1, m2, m3, m7, m8, m9 HV_H_W8 m5, m1, m2, m3, m7, m8, m9 HV_H_W8 m6, m1, m2, m3, m7, m8, m9 HV_H_W8 m0, m1, m2, m3, m7, m8, m9 vpbroadcastd m7, [pw_8192] vpermq m4, m4, q3120 vpermq m5, m5, q3120 vpermq m6, m6, q3120 pmulhrsw m0, m7 pmulhrsw m4, m7 pmulhrsw m5, m7 pmulhrsw m6, m7 vpermq m7, m0, q3120 punpcklwd 
m1, m4, m5 ; 01 punpckhwd m4, m5 ; 34 punpcklwd m2, m5, m6 ; 12 punpckhwd m5, m6 ; 45 punpcklwd m3, m6, m7 ; 23 punpckhwd m6, m7 ; 56 .hv_w8_loop: vextracti128 r6m, m0, 1 ; not enough registers movu xm0, [srcq+ssq*1] lea srcq, [srcq+ssq*2] vinserti128 m0, [srcq+ssq*0], 1 ; 7 8 pmaddwd m8, m1, m12 ; a0 pmaddwd m9, m2, m12 ; b0 mova m1, m3 mova m2, m4 pmaddwd m3, m13 ; a1 pmaddwd m4, m13 ; b1 paddd m8, m3 paddd m9, m4 mova m3, m5 mova m4, m6 pmaddwd m5, m14 ; a2 pmaddwd m6, m14 ; b2 paddd m8, m5 paddd m9, m6 vbroadcasti128 m6, [subpel_h_shufB] vbroadcasti128 m7, [subpel_h_shufC] vbroadcasti128 m5, [subpel_h_shufA] HV_H_W8 m0, m5, m6, m7, m5, m6, m7 vpbroadcastd m5, [pw_8192] vpbroadcastd m7, [pd_512] vbroadcasti128 m6, r6m pmulhrsw m0, m5 paddd m8, m7 paddd m9, m7 vpermq m7, m0, q3120 ; 7 8 shufpd m6, m6, m7, 0x04 ; 6 7 punpcklwd m5, m6, m7 ; 67 punpckhwd m6, m7 ; 78 pmaddwd m7, m5, m15 ; a3 paddd m8, m7 pmaddwd m7, m6, m15 ; b3 paddd m7, m9 psrad m8, 10 psrad m7, 10 packssdw m8, m7 vextracti128 xm7, m8, 1 packuswb xm8, xm7 pshufd xm7, xm8, q3120 movq [dstq+dsq*0], xm7 movhps [dstq+dsq*1], xm7 lea dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w8_loop add r4, 8 add r7, 8 movzx hd, r6b mov srcq, r4 mov dstq, r7 sub r6d, 1<<8 jg .hv_w8_loop0 RET %macro PREP_8TAP_H 0 pshufb m1, m0, m5 pshufb m2, m0, m6 pshufb m3, m0, m7 pmaddubsw m1, m8 pmaddubsw m0, m2, m8 pmaddubsw m2, m9 pmaddubsw m3, m9 paddw m1, m2 paddw m0, m3 phaddw m0, m1, m0 pmulhrsw m0, m4 %endmacro %if WIN64 DECLARE_REG_TMP 6, 4 %else DECLARE_REG_TMP 6, 7 %endif %define PREP_8TAP_FN FN prep_8tap, PREP_8TAP_FN sharp, SHARP, SHARP PREP_8TAP_FN sharp_smooth, SHARP, SMOOTH PREP_8TAP_FN smooth_sharp, SMOOTH, SHARP PREP_8TAP_FN smooth, SMOOTH, SMOOTH PREP_8TAP_FN sharp_regular, SHARP, REGULAR PREP_8TAP_FN regular_sharp, REGULAR, SHARP PREP_8TAP_FN smooth_regular, SMOOTH, REGULAR PREP_8TAP_FN regular_smooth, REGULAR, SMOOTH PREP_8TAP_FN regular, REGULAR, REGULAR cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3 imul mxd, mxm, 0x010101 add mxd, t0d ; 8tap_h, mx, 4tap_h imul myd, mym, 0x010101 add myd, t1d ; 8tap_v, my, 4tap_v lea r7, [prep%+SUFFIX] movsxd wq, wm movifnidn hd, hm test mxd, 0xf00 jnz .h test myd, 0xf00 jnz .v tzcnt wd, wd movzx wd, word [r7+wq*2+table_offset(prep,)] add wq, r7 lea r6, [strideq*3] %if WIN64 pop r7 %endif jmp wq .h: test myd, 0xf00 jnz .hv vpbroadcastd m4, [pw_8192] vbroadcasti128 m5, [subpel_h_shufA] WIN64_SPILL_XMM 10 cmp wd, 4 je .h_w4 tzcnt wd, wd vbroadcasti128 m6, [subpel_h_shufB] vbroadcasti128 m7, [subpel_h_shufC] shr mxd, 16 sub srcq, 3 movzx wd, word [r7+wq*2+table_offset(prep, _8tap_h)] vpbroadcastd m8, [r7+mxq*8+subpel_filters-prep%+SUFFIX+0] vpbroadcastd m9, [r7+mxq*8+subpel_filters-prep%+SUFFIX+4] add wq, r7 jmp wq .h_w4: movzx mxd, mxb dec srcq vpbroadcastd m6, [r7+mxq*8+subpel_filters-prep%+SUFFIX+2] lea stride3q, [strideq*3] .h_w4_loop: movq xm0, [srcq+strideq*0] vpbroadcastq m2, [srcq+strideq*2] movq xm1, [srcq+strideq*1] vpblendd m0, m2, 0xf0 vpbroadcastq m2, [srcq+stride3q ] lea srcq, [srcq+strideq*4] vpblendd m1, m2, 0xf0 pshufb m0, m5 pshufb m1, m5 pmaddubsw m0, m6 pmaddubsw m1, m6 phaddw m0, m1 pmulhrsw m0, m4 mova [tmpq], m0 add tmpq, 32 sub hd, 4 jg .h_w4_loop RET .h_w8: movu xm0, [srcq+strideq*0] vinserti128 m0, [srcq+strideq*1], 1 lea srcq, [srcq+strideq*2] PREP_8TAP_H mova [tmpq], m0 add tmpq, 32 sub hd, 2 jg .h_w8 RET .h_w16: movu xm0, [srcq+strideq*0+8*0] vinserti128 m0, [srcq+strideq*0+8*1], 1 PREP_8TAP_H mova [tmpq+32*0], m0 movu xm0, [srcq+strideq*1+8*0] vinserti128 
m0, [srcq+strideq*1+8*1], 1 lea srcq, [srcq+strideq*2] PREP_8TAP_H mova [tmpq+32*1], m0 add tmpq, 32*2 sub hd, 2 jg .h_w16 RET .h_w32: xor r6d, r6d jmp .h_start .h_w64: mov r6, -32*1 jmp .h_start .h_w128: mov r6, -32*3 .h_start: sub srcq, r6 mov r5, r6 .h_loop: movu xm0, [srcq+r6+8*0] vinserti128 m0, [srcq+r6+8*1], 1 PREP_8TAP_H mova [tmpq+32*0], m0 movu xm0, [srcq+r6+8*2] vinserti128 m0, [srcq+r6+8*3], 1 PREP_8TAP_H mova [tmpq+32*1], m0 add tmpq, 32*2 add r6, 32 jle .h_loop add srcq, strideq mov r6, r5 dec hd jg .h_loop RET .v: %assign stack_offset stack_offset - stack_size_padded WIN64_SPILL_XMM 16 movzx mxd, myb ; Select 4-tap/8-tap filter multipliers. shr myd, 16 ; Note that the code is 8-tap only, having cmp hd, 4 ; a separate 4-tap code path for (4|8|16)x4 cmove myd, mxd ; had a negligible effect on performance. ; TODO: Would a 6-tap code path be worth it? lea myq, [r7+myq*8+subpel_filters-prep%+SUFFIX] lea stride3q, [strideq*3] sub srcq, stride3q vpbroadcastd m7, [pw_8192] vpbroadcastw m8, [myq+0] vpbroadcastw m9, [myq+2] vpbroadcastw m10, [myq+4] vpbroadcastw m11, [myq+6] cmp wd, 8 jg .v_w16 je .v_w8 .v_w4: movd xm0, [srcq+strideq*0] vpbroadcastd m1, [srcq+strideq*2] vpbroadcastd xm2, [srcq+strideq*1] add srcq, stride3q vpbroadcastd m3, [srcq+strideq*0] vpblendd m1, m0, 0x01 ; 0 2 2 _ 2 _ _ _ vpblendd m3, m2, 0x03 ; 1 1 3 3 3 3 _ _ vpbroadcastd m0, [srcq+strideq*1] vpbroadcastd m2, [srcq+strideq*2] vpblendd m1, m0, 0x68 ; 0 2 2 4 2 4 4 _ vpbroadcastd m0, [srcq+stride3q ] vbroadcasti128 m5, [deint_shuf4] vpblendd m3, m2, 0xc0 ; 1 1 3 3 3 3 5 5 vpblendd m2, m3, m1, 0x55 ; 0 1 2 3 2 3 4 5 vpblendd m3, m1, 0xaa ; 1 2 3 4 3 4 5 _ punpcklbw m1, m2, m3 ; 01 12 23 34 vpblendd m3, m0, 0x80 ; 1 2 3 4 3 4 5 6 punpckhbw m2, m3 ; 23 34 45 56 .v_w4_loop: lea srcq, [srcq+strideq*4] pinsrd xm0, [srcq+strideq*0], 1 vpbroadcastd m3, [srcq+strideq*1] vpbroadcastd m4, [srcq+strideq*2] vpblendd m3, m0, 0x03 ; 6 7 8 _ 8 _ _ _ vpbroadcastd m0, [srcq+stride3q ] vpblendd m3, m4, 0x20 ; 6 7 8 _ 8 9 _ _ vpblendd m3, m0, 0x40 ; 6 7 8 _ 8 9 a _ pshufb m3, m5 ; 67 78 89 9a pmaddubsw m4, m1, m8 vperm2i128 m1, m2, m3, 0x21 ; 45 56 67 78 pmaddubsw m2, m9 paddw m4, m2 mova m2, m3 pmaddubsw m3, m11 paddw m3, m4 pmaddubsw m4, m1, m10 paddw m3, m4 pmulhrsw m3, m7 mova [tmpq], m3 add tmpq, 32 sub hd, 4 jg .v_w4_loop RET .v_w8: movq xm1, [srcq+strideq*0] vpbroadcastq m4, [srcq+strideq*1] vpbroadcastq m2, [srcq+strideq*2] vpbroadcastq m5, [srcq+stride3q ] lea srcq, [srcq+strideq*4] vpbroadcastq m3, [srcq+strideq*0] vpbroadcastq m6, [srcq+strideq*1] vpbroadcastq m0, [srcq+strideq*2] vpblendd m1, m4, 0x30 vpblendd m4, m2, 0x30 punpcklbw m1, m4 ; 01 12 vpblendd m2, m5, 0x30 vpblendd m5, m3, 0x30 punpcklbw m2, m5 ; 23 34 vpblendd m3, m6, 0x30 vpblendd m6, m0, 0x30 punpcklbw m3, m6 ; 45 56 .v_w8_loop: vpbroadcastq m4, [srcq+stride3q ] lea srcq, [srcq+strideq*4] pmaddubsw m5, m2, m9 ; a1 pmaddubsw m6, m2, m8 ; b0 vpblendd m2, m0, m4, 0x30 vpbroadcastq m0, [srcq+strideq*0] vpblendd m4, m0, 0x30 punpcklbw m2, m4 ; 67 78 pmaddubsw m1, m8 ; a0 pmaddubsw m4, m3, m9 ; b1 paddw m5, m1 mova m1, m3 pmaddubsw m3, m10 ; a2 paddw m6, m4 paddw m5, m3 vpbroadcastq m4, [srcq+strideq*1] vpblendd m3, m0, m4, 0x30 vpbroadcastq m0, [srcq+strideq*2] vpblendd m4, m0, 0x30 punpcklbw m3, m4 ; 89 9a pmaddubsw m4, m2, m11 ; a3 paddw m5, m4 pmaddubsw m4, m2, m10 ; b2 paddw m6, m4 pmaddubsw m4, m3, m11 ; b3 paddw m6, m4 pmulhrsw m5, m7 pmulhrsw m6, m7 mova [tmpq+32*0], m5 mova [tmpq+32*1], m6 add tmpq, 32*2 sub hd, 4 jg .v_w8_loop RET .v_w16: add wd, wd 
mov r5, srcq mov r7, tmpq lea r6d, [hq+wq*8-256] .v_w16_loop0: vbroadcasti128 m4, [srcq+strideq*0] vbroadcasti128 m5, [srcq+strideq*1] lea srcq, [srcq+strideq*2] vbroadcasti128 m0, [srcq+strideq*1] vbroadcasti128 m6, [srcq+strideq*0] lea srcq, [srcq+strideq*2] vbroadcasti128 m1, [srcq+strideq*0] vbroadcasti128 m2, [srcq+strideq*1] lea srcq, [srcq+strideq*2] vbroadcasti128 m3, [srcq+strideq*0] shufpd m4, m4, m0, 0x0c shufpd m5, m5, m1, 0x0c punpcklbw m1, m4, m5 ; 01 punpckhbw m4, m5 ; 34 shufpd m6, m6, m2, 0x0c punpcklbw m2, m5, m6 ; 12 punpckhbw m5, m6 ; 45 shufpd m0, m0, m3, 0x0c punpcklbw m3, m6, m0 ; 23 punpckhbw m6, m0 ; 56 .v_w16_loop: vbroadcasti128 m12, [srcq+strideq*1] lea srcq, [srcq+strideq*2] vbroadcasti128 m13, [srcq+strideq*0] pmaddubsw m14, m1, m8 ; a0 pmaddubsw m15, m2, m8 ; b0 mova m1, m3 mova m2, m4 pmaddubsw m3, m9 ; a1 pmaddubsw m4, m9 ; b1 paddw m14, m3 paddw m15, m4 mova m3, m5 mova m4, m6 pmaddubsw m5, m10 ; a2 pmaddubsw m6, m10 ; b2 paddw m14, m5 paddw m15, m6 shufpd m6, m0, m12, 0x0d shufpd m0, m12, m13, 0x0c punpcklbw m5, m6, m0 ; 67 punpckhbw m6, m0 ; 78 pmaddubsw m12, m5, m11 ; a3 pmaddubsw m13, m6, m11 ; b3 paddw m14, m12 paddw m15, m13 pmulhrsw m14, m7 pmulhrsw m15, m7 mova [tmpq+wq*0], m14 mova [tmpq+wq*1], m15 lea tmpq, [tmpq+wq*2] sub hd, 2 jg .v_w16_loop add r5, 16 add r7, 32 movzx hd, r6b mov srcq, r5 mov tmpq, r7 sub r6d, 1<<8 jg .v_w16_loop0 RET .hv: %assign stack_offset stack_offset - stack_size_padded %assign stack_size_padded 0 WIN64_SPILL_XMM 16 cmp wd, 4 je .hv_w4 shr mxd, 16 sub srcq, 3 vpbroadcastd m10, [r7+mxq*8+subpel_filters-prep%+SUFFIX+0] vpbroadcastd m11, [r7+mxq*8+subpel_filters-prep%+SUFFIX+4] movzx mxd, myb shr myd, 16 cmp hd, 4 cmove myd, mxd vpbroadcastq m0, [r7+myq*8+subpel_filters-prep%+SUFFIX] lea stride3q, [strideq*3] sub srcq, stride3q punpcklbw m0, m0 psraw m0, 8 ; sign-extend pshufd m12, m0, q0000 pshufd m13, m0, q1111 pshufd m14, m0, q2222 pshufd m15, m0, q3333 jmp .hv_w8 .hv_w4: movzx mxd, mxb dec srcq vpbroadcastd m8, [r7+mxq*8+subpel_filters-prep%+SUFFIX+2] movzx mxd, myb shr myd, 16 cmp hd, 4 cmove myd, mxd vpbroadcastq m0, [r7+myq*8+subpel_filters-prep%+SUFFIX] lea stride3q, [strideq*3] sub srcq, stride3q mova m7, [subpel_h_shuf4] pmovzxbd m9, [deint_shuf4] vpbroadcastd m10, [pw_8192] punpcklbw m0, m0 psraw m0, 8 ; sign-extend vpbroadcastd m11, [pd_32] pshufd m12, m0, q0000 pshufd m13, m0, q1111 pshufd m14, m0, q2222 pshufd m15, m0, q3333 vpbroadcastq m2, [srcq+strideq*0] vpbroadcastq m4, [srcq+strideq*1] vpbroadcastq m0, [srcq+strideq*2] vpbroadcastq m5, [srcq+stride3q ] lea srcq, [srcq+strideq*4] vpbroadcastq m3, [srcq+strideq*0] vpbroadcastq m6, [srcq+strideq*1] vpbroadcastq m1, [srcq+strideq*2] vpblendd m2, m4, 0xcc ; 0 1 vpblendd m0, m5, 0xcc ; 2 3 vpblendd m3, m6, 0xcc ; 4 5 pshufb m2, m7 ; 00 01 10 11 02 03 12 13 pshufb m0, m7 ; 20 21 30 31 22 23 32 33 pshufb m3, m7 ; 40 41 50 51 42 43 52 53 pshufb m1, m7 ; 60 61 60 61 62 63 62 63 pmaddubsw m2, m8 pmaddubsw m0, m8 pmaddubsw m3, m8 pmaddubsw m1, m8 phaddw m2, m0 ; 0a 1a 2a 3a 0b 1b 2b 3b phaddw m3, m1 ; 4a 5a 6a __ 4b 5b 6b __ pmulhrsw m2, m10 pmulhrsw m3, m10 palignr m4, m3, m2, 4 ; 1a 2a 3a 4a 1b 2b 3b 4b punpcklwd m1, m2, m4 ; 01 12 punpckhwd m2, m4 ; 23 34 pshufd m0, m3, q2121 punpcklwd m3, m0 ; 45 56 .hv_w4_loop: pmaddwd m5, m1, m12 ; a0 b0 pmaddwd m6, m2, m12 ; c0 d0 pmaddwd m2, m13 ; a1 b1 pmaddwd m4, m3, m13 ; c1 d1 mova m1, m3 pmaddwd m3, m14 ; a2 b2 paddd m5, m2 vpbroadcastq m2, [srcq+stride3q ] lea srcq, [srcq+strideq*4] paddd m6, m4 vpbroadcastq m4, 
[srcq+strideq*0] paddd m5, m3 vpbroadcastq m3, [srcq+strideq*1] vpblendd m2, m4, 0xcc vpbroadcastq m4, [srcq+strideq*2] vpblendd m3, m4, 0xcc pshufb m2, m7 pshufb m3, m7 pmaddubsw m2, m8 pmaddubsw m3, m8 phaddw m2, m3 pmulhrsw m2, m10 palignr m3, m2, m0, 12 mova m0, m2 punpcklwd m2, m3, m0 ; 67 78 punpckhwd m3, m0 ; 89 9a pmaddwd m4, m2, m14 ; c2 d2 paddd m6, m11 paddd m5, m11 paddd m6, m4 pmaddwd m4, m2, m15 ; a3 b3 paddd m5, m4 pmaddwd m4, m3, m15 ; c3 d3 paddd m6, m4 psrad m5, 6 psrad m6, 6 packssdw m5, m6 vpermd m5, m9, m5 mova [tmpq], m5 add tmpq, 32 sub hd, 4 jg .hv_w4_loop RET .hv_w8: lea r6d, [wq*8-64] mov r5, srcq mov r7, tmpq lea r6d, [hq+r6*4] .hv_w8_loop0: vbroadcasti128 m7, [subpel_h_shufA] movu xm4, [srcq+strideq*0] vbroadcasti128 m8, [subpel_h_shufB] movu xm5, [srcq+strideq*1] lea srcq, [srcq+strideq*2] vbroadcasti128 m9, [subpel_h_shufC] movu xm6, [srcq+strideq*0] vbroadcasti128 m0, [srcq+strideq*1] lea srcq, [srcq+strideq*2] vpblendd m4, m0, 0xf0 ; 0 3 vinserti128 m5, [srcq+strideq*0], 1 ; 1 4 vinserti128 m6, [srcq+strideq*1], 1 ; 2 5 lea srcq, [srcq+strideq*2] vinserti128 m0, [srcq+strideq*0], 1 ; 3 6 HV_H_W8 m4, m1, m2, m3, m7, m8, m9 HV_H_W8 m5, m1, m2, m3, m7, m8, m9 HV_H_W8 m6, m1, m2, m3, m7, m8, m9 HV_H_W8 m0, m1, m2, m3, m7, m8, m9 vpbroadcastd m7, [pw_8192] vpermq m4, m4, q3120 vpermq m5, m5, q3120 vpermq m6, m6, q3120 pmulhrsw m0, m7 pmulhrsw m4, m7 pmulhrsw m5, m7 pmulhrsw m6, m7 vpermq m7, m0, q3120 punpcklwd m1, m4, m5 ; 01 punpckhwd m4, m5 ; 34 punpcklwd m2, m5, m6 ; 12 punpckhwd m5, m6 ; 45 punpcklwd m3, m6, m7 ; 23 punpckhwd m6, m7 ; 56 .hv_w8_loop: vextracti128 [tmpq], m0, 1 ; not enough registers movu xm0, [srcq+strideq*1] lea srcq, [srcq+strideq*2] vinserti128 m0, [srcq+strideq*0], 1 ; 7 8 pmaddwd m8, m1, m12 ; a0 pmaddwd m9, m2, m12 ; b0 mova m1, m3 mova m2, m4 pmaddwd m3, m13 ; a1 pmaddwd m4, m13 ; b1 paddd m8, m3 paddd m9, m4 mova m3, m5 mova m4, m6 pmaddwd m5, m14 ; a2 pmaddwd m6, m14 ; b2 paddd m8, m5 paddd m9, m6 vbroadcasti128 m6, [subpel_h_shufB] vbroadcasti128 m7, [subpel_h_shufC] vbroadcasti128 m5, [subpel_h_shufA] HV_H_W8 m0, m5, m6, m7, m5, m6, m7 vpbroadcastd m5, [pw_8192] vpbroadcastd m7, [pd_32] vbroadcasti128 m6, [tmpq] pmulhrsw m0, m5 paddd m8, m7 paddd m9, m7 vpermq m7, m0, q3120 ; 7 8 shufpd m6, m6, m7, 0x04 ; 6 7 punpcklwd m5, m6, m7 ; 67 punpckhwd m6, m7 ; 78 pmaddwd m7, m5, m15 ; a3 paddd m8, m7 pmaddwd m7, m6, m15 ; b3 paddd m7, m9 psrad m8, 6 psrad m7, 6 packssdw m8, m7 vpermq m7, m8, q3120 mova [tmpq+wq*0], xm7 vextracti128 [tmpq+wq*2], m7, 1 lea tmpq, [tmpq+wq*4] sub hd, 2 jg .hv_w8_loop add r5, 8 add r7, 16 movzx hd, r6b mov srcq, r5 mov tmpq, r7 sub r6d, 1<<8 jg .hv_w8_loop0 RET %macro movifprep 2 %if isprep mov %1, %2 %endif %endmacro %macro REMAP_REG 2 %xdefine r%1 r%2 %xdefine r%1q r%2q %xdefine r%1d r%2d %endmacro %macro MCT_8TAP_SCALED_REMAP_REGS_TO_PREV 0 %if isprep %xdefine r14_save r14 %assign %%i 14 %rep 14 %assign %%j %%i-1 REMAP_REG %%i, %%j %assign %%i %%i-1 %endrep %endif %endmacro %macro MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT 0 %if isprep %assign %%i 1 %rep 13 %assign %%j %%i+1 REMAP_REG %%i, %%j %assign %%i %%i+1 %endrep %xdefine r14 r14_save %undef r14_save %endif %endmacro %macro MC_8TAP_SCALED_RET 0-1 1 ; leave_mapping_unchanged MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT RET %if %1 MCT_8TAP_SCALED_REMAP_REGS_TO_PREV %endif %endmacro %macro MC_8TAP_SCALED_H 8 ; dst, tmp[0-6] movq xm%1, [srcq+ r4] movq xm%2, [srcq+ r6] movhps xm%1, [srcq+ r7] movhps xm%2, [srcq+ r9] vinserti128 m%1, [srcq+r10], 1 vinserti128 m%2, 
[srcq+r11], 1 vpbroadcastq m%5, [srcq+r13] vpbroadcastq m%6, [srcq+ rX] add srcq, ssq movq xm%3, [srcq+ r4] movq xm%4, [srcq+ r6] movhps xm%3, [srcq+ r7] movhps xm%4, [srcq+ r9] vinserti128 m%3, [srcq+r10], 1 vinserti128 m%4, [srcq+r11], 1 vpbroadcastq m%7, [srcq+r13] vpbroadcastq m%8, [srcq+ rX] add srcq, ssq vpblendd m%1, m%5, 0xc0 vpblendd m%2, m%6, 0xc0 vpblendd m%3, m%7, 0xc0 vpblendd m%4, m%8, 0xc0 pmaddubsw m%1, m15 pmaddubsw m%2, m10 pmaddubsw m%3, m15 pmaddubsw m%4, m10 phaddw m%1, m%2 phaddw m%3, m%4 phaddw m%1, m%3 pmulhrsw m%1, m12 %endmacro %macro MC_8TAP_SCALED 1 %ifidn %1, put %assign isprep 0 cglobal put_8tap_scaled_8bpc, 4, 14, 16, 128, dst, ds, src, ss, w, h, mx, my, dx, dy %xdefine base_reg r12 %define rndshift 10 %else %assign isprep 1 cglobal prep_8tap_scaled_8bpc, 4, 14, 16, 128, tmp, src, ss, w, h, mx, my, dx, dy %define tmp_stridem qword [rsp+120] %xdefine base_reg r11 %define rndshift 6 %endif lea base_reg, [%1_8tap_scaled_8bpc_avx2] %define base base_reg-%1_8tap_scaled_8bpc_avx2 tzcnt wd, wm vpbroadcastd m8, dxm %if isprep && UNIX64 movd xm14, mxd vpbroadcastd m14, xm14 mov r5d, t0d DECLARE_REG_TMP 5, 7 %else vpbroadcastd m14, mxm %endif mov dyd, dym %ifidn %1, put %if WIN64 mov r8d, hm DEFINE_ARGS dst, ds, src, ss, w, _, _, my, h, dy, ss3 %define hm r5m %define dxm r8m %else DEFINE_ARGS dst, ds, src, ss, w, h, _, my, dx, dy, ss3 %define hm r6m %endif %define dsm [rsp+112] %define rX r1 %define rXd r1d %else ; prep %if WIN64 mov r7d, hm DEFINE_ARGS tmp, src, ss, w, _, _, my, h, dy, ss3 %define hm r4m %define dxm r7m %else DEFINE_ARGS tmp, src, ss, w, h, _, my, dx, dy, ss3 %define hm [rsp+112] %endif MCT_8TAP_SCALED_REMAP_REGS_TO_PREV %define rX r14 %define rXd r14d %endif vpbroadcastd m10, [base+pd_0x3ff] vpbroadcastd m12, [base+pw_8192] %ifidn %1, put vpbroadcastd m13, [base+pd_512] %else vpbroadcastd m13, [base+pd_32] %endif pxor m9, m9 lea ss3q, [ssq*3] movzx r7d, t1b shr t1d, 16 cmp hd, 6 cmovs t1d, r7d sub srcq, ss3q cmp dyd, 1024 je .dy1 cmp dyd, 2048 je .dy2 movzx wd, word [base+%1_8tap_scaled_avx2_table+wq*2] add wq, base_reg jmp wq %ifidn %1, put .w2: mov myd, mym movzx t0d, t0b dec srcq movd xm15, t0d punpckldq m8, m9, m8 paddd m14, m8 ; mx+dx*[0,1] vpbroadcastd m11, [base+pd_0x4000] vpbroadcastd xm15, xm15 pand m8, m14, m10 psrld m8, 6 paddd xm15, xm8 movd r4d, xm15 pextrd r6d, xm15, 1 vbroadcasti128 m5, [base+bdct_lb_dw] vbroadcasti128 m6, [base+subpel_s_shuf2] vpbroadcastd m15, [base+subpel_filters+r4*8+2] vpbroadcastd m7, [base+subpel_filters+r6*8+2] pcmpeqd m8, m9 psrld m14, 10 movq xm0, [srcq+ssq*0] movq xm1, [srcq+ssq*2] movhps xm0, [srcq+ssq*1] movhps xm1, [srcq+ss3q ] lea srcq, [srcq+ssq*4] pshufb m14, m5 paddb m14, m6 vinserti128 m0, [srcq+ssq*0], 1 vinserti128 m1, [srcq+ssq*2], 1 vpbroadcastq m2, [srcq+ssq*1] vpbroadcastq m3, [srcq+ss3q ] lea srcq, [srcq+ssq*4] vpblendd m15, m7, 0xaa vpblendd m0, m2, 0xc0 ; 0 1 4 5 vpblendd m1, m3, 0xc0 ; 2 3 6 7 pblendvb m15, m11, m8 pshufb m0, m14 pshufb m1, m14 pmaddubsw m0, m15 pmaddubsw m1, m15 phaddw m0, m1 pmulhrsw m0, m12 ; 0 1 2 3 4 5 6 7 vextracti128 xm1, m0, 1 ; 4 5 6 7 palignr xm2, xm1, xm0, 4 ; 1 2 3 4 punpcklwd xm3, xm0, xm2 ; 01 12 punpckhwd xm0, xm2 ; 23 34 pshufd xm4, xm1, q0321 ; 5 6 7 _ punpcklwd xm2, xm1, xm4 ; 45 56 punpckhwd xm4, xm1, xm4 ; 67 __ .w2_loop: and myd, 0x3ff mov r6d, 64 << 24 mov r4d, myd shr r4d, 6 lea r4d, [t1+r4] cmovnz r6q, [base+subpel_filters+r4*8] movq xm11, r6q pmovsxbw xm11, xm11 pshufd xm8, xm11, q0000 pshufd xm9, xm11, q1111 pshufd xm10, xm11, q2222 pshufd xm11, 
xm11, q3333 pmaddwd xm5, xm3, xm8 pmaddwd xm6, xm0, xm9 pmaddwd xm7, xm2, xm10 pmaddwd xm8, xm4, xm11 paddd xm5, xm6 paddd xm7, xm8 paddd xm5, xm13 paddd xm5, xm7 psrad xm5, 10 packssdw xm5, xm5 packuswb xm5, xm5 pextrw [dstq], xm5, 0 add dstq, dsq dec hd jz .ret add myd, dyd test myd, ~0x3ff jz .w2_loop movq xm5, [srcq] test myd, 0x400 jz .w2_skip_line add srcq, ssq shufps xm3, xm0, q1032 ; 01 12 shufps xm0, xm2, q1032 ; 23 34 shufps xm2, xm4, q1032 ; 45 56 pshufb xm5, xm14 pmaddubsw xm5, xm15 phaddw xm5, xm5 pmulhrsw xm5, xm12 palignr xm1, xm5, xm1, 12 punpcklqdq xm1, xm1 ; 6 7 6 7 punpcklwd xm4, xm1, xm5 ; 67 __ jmp .w2_loop .w2_skip_line: movhps xm5, [srcq+ssq*1] lea srcq, [srcq+ssq*2] mova xm3, xm0 ; 01 12 mova xm0, xm2 ; 23 34 pshufb xm5, xm14 pmaddubsw xm5, xm15 phaddw xm5, xm5 pmulhrsw xm5, xm12 ; 6 7 6 7 palignr xm1, xm5, xm1, 8 ; 4 5 6 7 pshufd xm5, xm1, q0321 ; 5 6 7 _ punpcklwd xm2, xm1, xm5 ; 45 56 punpckhwd xm4, xm1, xm5 ; 67 __ jmp .w2_loop %endif .w4: mov myd, mym vbroadcasti128 m7, [base+rescale_mul] movzx t0d, t0b dec srcq movd xm15, t0d pmaddwd m8, m7 vpbroadcastd m11, [base+pd_0x4000] vpbroadcastd xm15, xm15 paddd m14, m8 ; mx+dx*[0-3] pand m0, m14, m10 psrld m0, 6 paddd xm15, xm0 movd r4d, xm15 pextrd r6d, xm15, 1 pextrd r11d, xm15, 2 pextrd r13d, xm15, 3 movd xm15, [base+subpel_filters+r4*8+2] vbroadcasti128 m5, [base+bdct_lb_dw] vpbroadcastq m6, [base+subpel_s_shuf2] pinsrd xm15, [base+subpel_filters+r6*8+2], 1 pcmpeqd m0, m9 psrld m14, 10 movu xm7, [srcq+ssq*0] movu xm9, [srcq+ssq*1] pinsrd xm15, [base+subpel_filters+r11*8+2], 2 movu xm8, [srcq+ssq*2] movu xm10, [srcq+ss3q ] pinsrd xm15, [base+subpel_filters+r13*8+2], 3 lea srcq, [srcq+ssq*4] pshufb m14, m5 paddb m14, m6 vinserti128 m7, [srcq+ssq*0], 1 vinserti128 m9, [srcq+ssq*1], 1 vinserti128 m15, xm15, 1 vinserti128 m8, [srcq+ssq*2], 1 vinserti128 m10, [srcq+ss3q ], 1 lea srcq, [srcq+ssq*4] pblendvb m15, m11, m0 pshufb m7, m14 pshufb m9, m14 pshufb m8, m14 pshufb m10, m14 pmaddubsw m7, m15 pmaddubsw m9, m15 pmaddubsw m8, m15 pmaddubsw m10, m15 phaddw m7, m9 phaddw m8, m10 pmulhrsw m7, m12 ; 0 1 4 5 pmulhrsw m8, m12 ; 2 3 6 7 vextracti128 xm9, m7, 1 ; 4 5 vextracti128 xm3, m8, 1 ; 6 7 shufps xm4, xm7, xm8, q1032 ; 1 2 shufps xm5, xm8, xm9, q1032 ; 3 4 shufps xm6, xm9, xm3, q1032 ; 5 6 psrldq xm11, xm3, 8 ; 7 _ punpcklwd xm0, xm7, xm4 ; 01 punpckhwd xm7, xm4 ; 12 punpcklwd xm1, xm8, xm5 ; 23 punpckhwd xm8, xm5 ; 34 punpcklwd xm2, xm9, xm6 ; 45 punpckhwd xm9, xm6 ; 56 punpcklwd xm3, xm11 ; 67 mova [rsp+0x00], xm7 mova [rsp+0x10], xm8 mova [rsp+0x20], xm9 .w4_loop: and myd, 0x3ff mov r6d, 64 << 24 mov r4d, myd shr r4d, 6 lea r4d, [t1+r4] cmovnz r6q, [base+subpel_filters+r4*8] movq xm10, r6q pmovsxbw xm10, xm10 pshufd xm7, xm10, q0000 pshufd xm8, xm10, q1111 pshufd xm9, xm10, q2222 pshufd xm10, xm10, q3333 pmaddwd xm4, xm0, xm7 pmaddwd xm5, xm1, xm8 pmaddwd xm6, xm2, xm9 pmaddwd xm7, xm3, xm10 paddd xm4, xm5 paddd xm6, xm7 paddd xm4, xm13 paddd xm4, xm6 psrad xm4, rndshift packssdw xm4, xm4 %ifidn %1, put packuswb xm4, xm4 movd [dstq], xm4 add dstq, dsq %else movq [tmpq], xm4 add tmpq, 8 %endif dec hd jz .ret add myd, dyd test myd, ~0x3ff jz .w4_loop movu xm4, [srcq] test myd, 0x400 jz .w4_skip_line mova xm0, [rsp+0x00] mova [rsp+0x00], xm1 mova xm1, [rsp+0x10] mova [rsp+0x10], xm2 mova xm2, [rsp+0x20] mova [rsp+0x20], xm3 pshufb xm4, xm14 pmaddubsw xm4, xm15 phaddw xm4, xm4 pmulhrsw xm4, xm12 punpcklwd xm3, xm11, xm4 mova xm11, xm4 add srcq, ssq jmp .w4_loop .w4_skip_line: movu xm5, [srcq+ssq*1] movu m6, [rsp+0x10] 
pshufb xm4, xm14 pshufb xm5, xm14 pmaddubsw xm4, xm15 pmaddubsw xm5, xm15 movu [rsp+0x00], m6 phaddw xm4, xm5 pmulhrsw xm4, xm12 punpcklwd xm9, xm11, xm4 mova [rsp+0x20], xm9 psrldq xm11, xm4, 8 mova xm0, xm1 mova xm1, xm2 mova xm2, xm3 punpcklwd xm3, xm4, xm11 lea srcq, [srcq+ssq*2] jmp .w4_loop .w8: mov dword [rsp+48], 1 movifprep tmp_stridem, 16 jmp .w_start .w16: mov dword [rsp+48], 2 movifprep tmp_stridem, 32 jmp .w_start .w32: mov dword [rsp+48], 4 movifprep tmp_stridem, 64 jmp .w_start .w64: mov dword [rsp+48], 8 movifprep tmp_stridem, 128 jmp .w_start .w128: mov dword [rsp+48], 16 movifprep tmp_stridem, 256 .w_start: %ifidn %1, put movifnidn dsm, dsq %endif shr t0d, 16 sub srcq, 3 pmaddwd m8, [base+rescale_mul] movd xm15, t0d mov [rsp+72], t0d mov [rsp+56], srcq mov [rsp+64], r0q ; dstq / tmpq %if UNIX64 mov hm, hd %endif shl dword dxm, 3 ; dx*8 vpbroadcastd m15, xm15 paddd m14, m8 ; mx+dx*[0-7] jmp .hloop .hloop_prep: dec dword [rsp+48] jz .ret add qword [rsp+64], 8*(isprep+1) mov hd, hm vpbroadcastd m8, dxm vpbroadcastd m10, [base+pd_0x3ff] paddd m14, m8, [rsp+16] vpbroadcastd m15, [rsp+72] pxor m9, m9 mov srcq, [rsp+56] mov r0q, [rsp+64] ; dstq / tmpq .hloop: vpbroadcastq m11, [base+pq_0x40000000] pand m6, m14, m10 psrld m6, 6 paddd m15, m6 pcmpeqd m6, m9 vextracti128 xm7, m15, 1 movd r4d, xm15 pextrd r6d, xm15, 2 pextrd r7d, xm15, 1 pextrd r9d, xm15, 3 movd r10d, xm7 pextrd r11d, xm7, 2 pextrd r13d, xm7, 1 pextrd rXd, xm7, 3 movu [rsp+16], m14 movq xm15, [base+subpel_filters+ r4*8] movq xm10, [base+subpel_filters+ r6*8] movhps xm15, [base+subpel_filters+ r7*8] movhps xm10, [base+subpel_filters+ r9*8] vinserti128 m15, [base+subpel_filters+r10*8], 1 vinserti128 m10, [base+subpel_filters+r11*8], 1 vpbroadcastq m9, [base+subpel_filters+r13*8] vpbroadcastq m8, [base+subpel_filters+ rX*8] psrld m14, 10 vextracti128 xm7, m14, 1 mova [rsp], xm14 movd r4d, xm14 pextrd r6d, xm14, 2 pextrd r7d, xm14, 1 pextrd r9d, xm14, 3 movd r10d, xm7 pextrd r11d, xm7, 2 pextrd r13d, xm7, 1 pextrd rXd, xm7, 3 pshufd m5, m6, q1100 pshufd m6, m6, q3322 vpblendd m15, m9, 0xc0 vpblendd m10, m8, 0xc0 pblendvb m15, m11, m5 pblendvb m10, m11, m6 vbroadcasti128 m14, [base+subpel_s_shuf8] MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7 ; 0a 1a 0b 1b MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8 ; 2a 3a 2b 3b MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9 ; 4a 5a 4b 5b MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 11 ; 6a 7a 6b 7b mov myd, mym mov dyd, dym pshufb m0, m14 ; 01a 01b pshufb m1, m14 ; 23a 23b pshufb m2, m14 ; 45a 45b pshufb m3, m14 ; 67a 67b vbroadcasti128 m14, [base+wswap] .vloop: and myd, 0x3ff mov r6d, 64 << 24 mov r4d, myd shr r4d, 6 lea r4d, [t1+r4] cmovnz r6q, [base+subpel_filters+r4*8] movq xm11, r6q punpcklqdq xm11, xm11 pmovsxbw m11, xm11 pshufd m8, m11, q0000 pshufd m9, m11, q1111 pmaddwd m4, m0, m8 pmaddwd m5, m1, m9 pshufd m8, m11, q2222 pshufd m11, m11, q3333 pmaddwd m6, m2, m8 pmaddwd m7, m3, m11 paddd m4, m5 paddd m6, m7 paddd m4, m13 paddd m4, m6 psrad m4, rndshift vextracti128 xm5, m4, 1 packssdw xm4, xm5 %ifidn %1, put packuswb xm4, xm4 movq [dstq], xm4 add dstq, dsm %else mova [tmpq], xm4 add tmpq, tmp_stridem %endif dec hd jz .hloop_prep add myd, dyd test myd, ~0x3ff jz .vloop test myd, 0x400 mov [rsp+52], myd mov r4d, [rsp+ 0] mov r6d, [rsp+ 8] mov r7d, [rsp+ 4] mov r9d, [rsp+12] jz .skip_line vpbroadcastq m6, [srcq+r13] vpbroadcastq m7, [srcq+ rX] movq xm4, [srcq+ r4] movq xm5, [srcq+ r6] movhps xm4, [srcq+ r7] movhps xm5, [srcq+ r9] vinserti128 m4, [srcq+r10], 1 vinserti128 m5, [srcq+r11], 1 add srcq, ssq 
mov myd, [rsp+52] mov dyd, dym pshufb m0, m14 pshufb m1, m14 pshufb m2, m14 pshufb m3, m14 vpblendd m4, m6, 0xc0 vpblendd m5, m7, 0xc0 pmaddubsw m4, m15 pmaddubsw m5, m10 phaddw m4, m5 pslld m5, m4, 16 paddw m4, m5 pmulhrsw m4, m12 pblendw m0, m1, 0xaa pblendw m1, m2, 0xaa pblendw m2, m3, 0xaa pblendw m3, m4, 0xaa jmp .vloop .skip_line: mova m0, m1 mova m1, m2 mova m2, m3 vpbroadcastq m7, [srcq+r13] vpbroadcastq m8, [srcq+ rX] movq xm3, [srcq+ r4] movq xm4, [srcq+ r6] movhps xm3, [srcq+ r7] movhps xm4, [srcq+ r9] vinserti128 m3, [srcq+r10], 1 vinserti128 m4, [srcq+r11], 1 add srcq, ssq movq xm5, [srcq+ r4] movq xm6, [srcq+ r6] movhps xm5, [srcq+ r7] movhps xm6, [srcq+ r9] vinserti128 m5, [srcq+r10], 1 vinserti128 m6, [srcq+r11], 1 vpbroadcastq m9, [srcq+r13] vpbroadcastq m11, [srcq+ rX] add srcq, ssq mov myd, [rsp+52] mov dyd, dym vpblendd m3, m7, 0xc0 vpblendd m4, m8, 0xc0 vpblendd m5, m9, 0xc0 vpblendd m6, m11, 0xc0 pmaddubsw m3, m15 pmaddubsw m4, m10 pmaddubsw m5, m15 pmaddubsw m6, m10 phaddw m3, m4 phaddw m5, m6 psrld m4, m3, 16 pslld m6, m5, 16 paddw m3, m4 paddw m5, m6 pblendw m3, m5, 0xaa pmulhrsw m3, m12 jmp .vloop .dy1: movzx wd, word [base+%1_8tap_scaled_avx2_dy1_table+wq*2] add wq, base_reg jmp wq %ifidn %1, put .dy1_w2: mov myd, mym movzx t0d, t0b dec srcq movd xm15, t0d punpckldq m8, m9, m8 paddd m14, m8 ; mx+dx*[0-1] vpbroadcastd m11, [base+pd_0x4000] vpbroadcastd xm15, xm15 pand m8, m14, m10 psrld m8, 6 paddd xm15, xm8 movd r4d, xm15 pextrd r6d, xm15, 1 vbroadcasti128 m5, [base+bdct_lb_dw] vbroadcasti128 m6, [base+subpel_s_shuf2] vpbroadcastd m15, [base+subpel_filters+r4*8+2] vpbroadcastd m7, [base+subpel_filters+r6*8+2] pcmpeqd m8, m9 psrld m14, 10 movq xm0, [srcq+ssq*0] movq xm1, [srcq+ssq*2] movhps xm0, [srcq+ssq*1] movhps xm1, [srcq+ss3q ] lea srcq, [srcq+ssq*4] shr myd, 6 mov r4d, 64 << 24 lea myd, [t1+myq] cmovnz r4q, [base+subpel_filters+myq*8] pshufb m14, m5 paddb m14, m6 vinserti128 m0, [srcq+ssq*0], 1 vinserti128 m1, [srcq+ssq*2], 1 vpbroadcastq m2, [srcq+ssq*1] add srcq, ss3q movq xm10, r4q pmovsxbw xm10, xm10 vpblendd m15, m7, 0xaa pblendvb m15, m11, m8 pshufd xm8, xm10, q0000 pshufd xm9, xm10, q1111 pshufd xm11, xm10, q3333 pshufd xm10, xm10, q2222 vpblendd m0, m2, 0xc0 pshufb m1, m14 pshufb m0, m14 pmaddubsw m1, m15 pmaddubsw m0, m15 phaddw m0, m1 pmulhrsw m0, m12 vextracti128 xm1, m0, 1 palignr xm2, xm1, xm0, 4 pshufd xm4, xm1, q2121 punpcklwd xm3, xm0, xm2 ; 01 12 punpckhwd xm0, xm2 ; 23 34 punpcklwd xm2, xm1, xm4 ; 45 56 .dy1_w2_loop: movq xm1, [srcq+ssq*0] movhps xm1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] pmaddwd xm5, xm3, xm8 pmaddwd xm6, xm0, xm9 pmaddwd xm7, xm2, xm10 mova xm3, xm0 mova xm0, xm2 paddd xm5, xm13 paddd xm6, xm7 pshufb xm1, xm14 pmaddubsw xm1, xm15 phaddw xm1, xm1 pmulhrsw xm1, xm12 palignr xm7, xm1, xm4, 12 punpcklwd xm2, xm7, xm1 ; 67 78 pmaddwd xm7, xm2, xm11 mova xm4, xm1 paddd xm5, xm6 paddd xm5, xm7 psrad xm5, rndshift packssdw xm5, xm5 packuswb xm5, xm5 pextrw [dstq+dsq*0], xm5, 0 pextrw [dstq+dsq*1], xm5, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .dy1_w2_loop RET %endif .dy1_w4: mov myd, mym vbroadcasti128 m7, [base+rescale_mul] movzx t0d, t0b dec srcq movd xm15, t0d pmaddwd m8, m7 vpbroadcastd m11, [base+pd_0x4000] vpbroadcastd xm15, xm15 paddd m14, m8 ; mx+dx*[0-3] pand m8, m14, m10 psrld m8, 6 paddd xm15, xm8 vpermq m8, m8, q3120 movd r4d, xm15 pextrd r6d, xm15, 2 pextrd r11d, xm15, 1 pextrd r13d, xm15, 3 movd xm15, [base+subpel_filters+r4*8+2] vpbroadcastd m7, [base+subpel_filters+r6*8+2] movu xm2, [srcq+ssq*0] movu xm3, [srcq+ssq*2] 
vbroadcasti128 m5, [base+bdct_lb_dw] vpbroadcastq m6, [base+subpel_s_shuf2] pcmpeqd m8, m9 psrld m14, 10 pinsrd xm15, [base+subpel_filters+r11*8+2], 1 vpblendd m7, [base+subpel_filters+r13*8+2-20], 0x20 vinserti128 m2, [srcq+ssq*1], 1 vinserti128 m3, [srcq+ss3q ], 1 lea srcq, [srcq+ssq*4] shr myd, 6 mov r4d, 64 << 24 lea myd, [t1+myq] cmovnz r4q, [base+subpel_filters+myq*8] pshufb m14, m5 paddb m14, m6 movu xm4, [srcq+ssq*0] movu xm5, [srcq+ssq*2] vinserti128 m4, [srcq+ssq*1], 1 add srcq, ss3q vpblendd m15, m7, 0x30 punpcklqdq m15, m15 pblendvb m15, m11, m8 movq xm10, r4q punpcklqdq xm10, xm10 pmovsxbw m10, xm10 pshufb m2, m14 pshufb m3, m14 pshufb m4, m14 pshufb xm5, xm14 vpermq m2, m2, q3120 vpermq m3, m3, q3120 vpermq m4, m4, q3120 vpermq m5, m5, q3120 pshufd m7, m10, q0000 pshufd m8, m10, q1111 pshufd m9, m10, q2222 pshufd m10, m10, q3333 pmaddubsw m2, m15 pmaddubsw m3, m15 pmaddubsw m4, m15 pmaddubsw m5, m15 phaddw m2, m3 phaddw m4, m5 pmulhrsw m2, m12 pmulhrsw m4, m12 palignr m5, m4, m2, 4 pshufd m3, m4, q2121 punpcklwd m0, m2, m5 ; 01 12 punpckhwd m1, m2, m5 ; 23 34 punpcklwd m2, m4, m3 ; 45 56 .dy1_w4_loop: movu xm11, [srcq+ssq*0] vinserti128 m11, [srcq+ssq*1], 1 lea srcq, [srcq+ssq*2] pmaddwd m4, m0, m7 pmaddwd m5, m1, m8 pmaddwd m6, m2, m9 mova m0, m1 mova m1, m2 paddd m4, m13 paddd m5, m6 pshufb m11, m14 vpermq m11, m11, q3120 pmaddubsw m11, m15 phaddw m11, m11 pmulhrsw m11, m12 palignr m6, m11, m3, 12 punpcklwd m2, m6, m11 ; 67 78 mova m3, m11 pmaddwd m6, m2, m10 paddd m4, m5 paddd m4, m6 psrad m4, rndshift vextracti128 xm5, m4, 1 packssdw xm4, xm5 %ifidn %1, put packuswb xm4, xm4 pshuflw xm4, xm4, q3120 movd [dstq+dsq*0], xm4 pextrd [dstq+dsq*1], xm4, 1 lea dstq, [dstq+dsq*2] %else pshufd xm4, xm4, q3120 mova [tmpq], xm4 add tmpq, 16 %endif sub hd, 2 jg .dy1_w4_loop MC_8TAP_SCALED_RET .dy1_w8: mov dword [rsp+72], 1 movifprep tmp_stridem, 16 jmp .dy1_w_start .dy1_w16: mov dword [rsp+72], 2 movifprep tmp_stridem, 32 jmp .dy1_w_start .dy1_w32: mov dword [rsp+72], 4 movifprep tmp_stridem, 64 jmp .dy1_w_start .dy1_w64: mov dword [rsp+72], 8 movifprep tmp_stridem, 128 jmp .dy1_w_start .dy1_w128: mov dword [rsp+72], 16 movifprep tmp_stridem, 256 .dy1_w_start: mov myd, mym %ifidn %1, put movifnidn dsm, dsq %endif shr t0d, 16 sub srcq, 3 shr myd, 6 mov r4d, 64 << 24 lea myd, [t1+myq] cmovnz r4q, [base+subpel_filters+myq*8] pmaddwd m8, [base+rescale_mul] movd xm15, t0d mov [rsp+76], t0d mov [rsp+80], srcq mov [rsp+88], r0q ; dstq / tmpq %if UNIX64 mov hm, hd %endif shl dword dxm, 3 ; dx*8 vpbroadcastd m15, xm15 paddd m14, m8 ; mx+dx*[0-7] movq xm0, r4q pmovsxbw xm0, xm0 mova [rsp+96], xm0 jmp .dy1_hloop .dy1_hloop_prep: dec dword [rsp+72] jz .ret add qword [rsp+88], 8*(isprep+1) mov hd, hm vpbroadcastd m8, dxm vpbroadcastd m10, [base+pd_0x3ff] paddd m14, m8, [rsp+32] vpbroadcastd m15, [rsp+76] pxor m9, m9 mov srcq, [rsp+80] mov r0q, [rsp+88] ; dstq / tmpq .dy1_hloop: vpbroadcastq m11, [base+pq_0x40000000] pand m6, m14, m10 psrld m6, 6 paddd m15, m6 pcmpeqd m6, m9 vextracti128 xm7, m15, 1 movd r4d, xm15 pextrd r6d, xm15, 2 pextrd r7d, xm15, 1 pextrd r9d, xm15, 3 movd r10d, xm7 pextrd r11d, xm7, 2 pextrd r13d, xm7, 1 pextrd rXd, xm7, 3 movu [rsp+32], m14 movq xm15, [base+subpel_filters+ r4*8] movq xm10, [base+subpel_filters+ r6*8] movhps xm15, [base+subpel_filters+ r7*8] movhps xm10, [base+subpel_filters+ r9*8] vinserti128 m15, [base+subpel_filters+r10*8], 1 vinserti128 m10, [base+subpel_filters+r11*8], 1 vpbroadcastq m9, [base+subpel_filters+r13*8] vpbroadcastq m8, [base+subpel_filters+ 
rX*8] psrld m14, 10 vextracti128 xm7, m14, 1 movq [rsp+64], xm14 movd r4d, xm14 pextrd r6d, xm14, 2 pextrd r7d, xm14, 1 pextrd r9d, xm14, 3 movd r10d, xm7 pextrd r11d, xm7, 2 pextrd r13d, xm7, 1 pextrd rXd, xm7, 3 pshufd m5, m6, q1100 pshufd m6, m6, q3322 vpblendd m15, m9, 0xc0 vpblendd m10, m8, 0xc0 pblendvb m15, m11, m5 pblendvb m10, m11, m6 vbroadcasti128 m14, [base+subpel_s_shuf8] MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7 ; 0a 1a 0b 1b MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8 ; 2a 3a 2b 3b MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9 ; 4a 5a 4b 5b MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 11 ; 6a 7a 6b 7b movu [rsp], m10 vpbroadcastd m8, [rsp+0x60] vpbroadcastd m9, [rsp+0x64] vpbroadcastd m10, [rsp+0x68] vpbroadcastd m11, [rsp+0x6c] pshufb m0, m14 ; 01a 01b pshufb m1, m14 ; 23a 23b pshufb m2, m14 ; 45a 45b pshufb m3, m14 ; 67a 67b vbroadcasti128 m14, [base+wswap] .dy1_vloop: pmaddwd m4, m0, m8 pmaddwd m5, m1, m9 pmaddwd m6, m2, m10 pmaddwd m7, m3, m11 paddd m4, m5 paddd m6, m7 paddd m4, m13 paddd m4, m6 psrad m4, rndshift vextracti128 xm5, m4, 1 packssdw xm4, xm5 %ifidn %1, put packuswb xm4, xm4 movq [dstq], xm4 add dstq, dsm %else mova [tmpq], xm4 add tmpq, tmp_stridem %endif dec hd jz .dy1_hloop_prep movq xm4, [srcq+ r4] movq xm5, [srcq+ r6] movhps xm4, [srcq+ r7] movhps xm5, [srcq+ r9] vinserti128 m4, [srcq+r10], 1 vinserti128 m5, [srcq+r11], 1 vpbroadcastq m6, [srcq+r13] vpbroadcastq m7, [srcq+ rX] add srcq, ssq pshufb m0, m14 pshufb m1, m14 pshufb m2, m14 pshufb m3, m14 vpblendd m4, m6, 0xc0 vpblendd m5, m7, 0xc0 pmaddubsw m4, m15 pmaddubsw m5, [rsp] phaddw m4, m5 pslld m5, m4, 16 paddw m4, m5 pmulhrsw m4, m12 pblendw m0, m1, 0xaa pblendw m1, m2, 0xaa pblendw m2, m3, 0xaa pblendw m3, m4, 0xaa jmp .dy1_vloop .dy2: movzx wd, word [base+%1_8tap_scaled_avx2_dy2_table+wq*2] add wq, base_reg jmp wq %ifidn %1, put .dy2_w2: mov myd, mym movzx t0d, t0b dec srcq movd xm15, t0d punpckldq m8, m9, m8 paddd m14, m8 ; mx+dx*[0-1] vpbroadcastd m11, [base+pd_0x4000] vpbroadcastd xm15, xm15 pand m8, m14, m10 psrld m8, 6 paddd xm15, xm8 movd r4d, xm15 pextrd r6d, xm15, 1 vbroadcasti128 m5, [base+bdct_lb_dw] vbroadcasti128 m6, [base+subpel_s_shuf2] vpbroadcastd m15, [base+subpel_filters+r4*8+2] vpbroadcastd m7, [base+subpel_filters+r6*8+2] pcmpeqd m8, m9 psrld m14, 10 movq xm0, [srcq+ssq*0] vpbroadcastq m2, [srcq+ssq*1] movhps xm0, [srcq+ssq*2] vpbroadcastq m3, [srcq+ss3q ] lea srcq, [srcq+ssq*4] pshufb m14, m5 paddb m14, m6 vpblendd m15, m7, 0xaa pblendvb m15, m11, m8 movhps xm1, [srcq+ssq*0] vpbroadcastq m4, [srcq+ssq*1] lea srcq, [srcq+ssq*2] shr myd, 6 mov r4d, 64 << 24 lea myd, [t1+myq] cmovnz r4q, [base+subpel_filters+myq*8] vpblendd m0, m2, 0x30 vpblendd m1, m4, 0xc0 vpblendd m0, m3, 0xc0 pshufb m0, m14 pshufb m1, m14 pmaddubsw m0, m15 pmaddubsw m1, m15 movq xm11, r4q pmovsxbw xm11, xm11 phaddw m0, m1 pmulhrsw m0, m12 ; 0 2 _ 4 1 3 _ 5 pshufd xm8, xm11, q0000 pshufd xm9, xm11, q1111 pshufd xm10, xm11, q2222 pshufd xm11, xm11, q3333 pshufd m2, m0, q3110 ; 0 2 2 4 1 3 3 5 vextracti128 xm1, m2, 1 punpcklwd xm3, xm2, xm1 ; 01 23 punpckhwd xm2, xm1 ; 23 45 .dy2_w2_loop: movq xm6, [srcq+ssq*0] vpbroadcastq m7, [srcq+ssq*1] movhps xm6, [srcq+ssq*2] vpbroadcastq m1, [srcq+ss3q ] lea srcq, [srcq+ssq*4] pmaddwd xm4, xm3, xm8 pmaddwd xm5, xm2, xm9 vpblendd m6, m7, 0x30 vpblendd m6, m1, 0xc0 pshufb m6, m14 pmaddubsw m6, m15 phaddw m6, m6 pmulhrsw m6, m12 palignr m0, m6, m0, 8 pshufd m2, m0, q3221 vextracti128 xm1, m2, 1 punpcklwd xm3, xm2, xm1 ; 45 67 punpckhwd xm2, xm1 ; 67 89 pmaddwd xm6, xm3, xm10 pmaddwd xm7, 
xm2, xm11 paddd xm4, xm5 paddd xm4, xm13 paddd xm6, xm7 paddd xm4, xm6 psrad xm4, rndshift packssdw xm4, xm4 packuswb xm4, xm4 pextrw [dstq+dsq*0], xm4, 0 pextrw [dstq+dsq*1], xm4, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .dy2_w2_loop RET %endif .dy2_w4: mov myd, mym vbroadcasti128 m7, [base+rescale_mul] movzx t0d, t0b dec srcq movd xm15, t0d pmaddwd m8, m7 vpbroadcastd m11, [base+pd_0x4000] vpbroadcastd xm15, xm15 paddd m14, m8 ; mx+dx*[0-3] pand m8, m14, m10 psrld m8, 6 paddd xm15, xm8 movd r4d, xm15 pextrd r6d, xm15, 1 pextrd r11d, xm15, 2 pextrd r13d, xm15, 3 movd xm15, [base+subpel_filters+r4*8+2] vbroadcasti128 m5, [base+bdct_lb_dw] vpbroadcastq m6, [base+subpel_s_shuf2] pinsrd xm15, [base+subpel_filters+r6*8+2], 1 pcmpeqd m8, m9 psrld m14, 10 movu xm0, [srcq+ssq*0] movu xm2, [srcq+ssq*2] pinsrd xm15, [base+subpel_filters+r11*8+2], 2 movu xm1, [srcq+ssq*1] movu xm3, [srcq+ss3q ] pinsrd xm15, [base+subpel_filters+r13*8+2], 3 lea srcq, [srcq+ssq*4] shr myd, 6 mov r4d, 64 << 24 lea myd, [t1+myq] cmovnz r4q, [base+subpel_filters+myq*8] vinserti128 m15, xm15, 1 pshufb m14, m5 paddb m14, m6 vinserti128 m2, [srcq+ssq*0], 1 vinserti128 m3, [srcq+ssq*1], 1 lea srcq, [srcq+ssq*2] pblendvb m15, m11, m8 pshufb xm0, xm14 pshufb m2, m14 pshufb xm1, xm14 pshufb m3, m14 pmaddubsw xm0, xm15 pmaddubsw m2, m15 pmaddubsw xm1, xm15 pmaddubsw m3, m15 movq xm11, r4q punpcklqdq xm11, xm11 pmovsxbw m11, xm11 phaddw m0, m2 phaddw m1, m3 pmulhrsw m0, m12 ; 0 2 _ 4 pmulhrsw m1, m12 ; 1 3 _ 5 pshufd m8, m11, q0000 pshufd m9, m11, q1111 pshufd m10, m11, q2222 pshufd m11, m11, q3333 punpcklwd xm2, xm0, xm1 punpckhwd m1, m0, m1 ; 23 45 vinserti128 m0, m2, xm1, 1 ; 01 23 .dy2_w4_loop: movu xm6, [srcq+ssq*0] movu xm7, [srcq+ssq*1] vinserti128 m6, [srcq+ssq*2], 1 vinserti128 m7, [srcq+ss3q ], 1 lea srcq, [srcq+ssq*4] pmaddwd m4, m0, m8 pmaddwd m5, m1, m9 pshufb m6, m14 pshufb m7, m14 pmaddubsw m6, m15 pmaddubsw m7, m15 psrld m2, m6, 16 pslld m3, m7, 16 paddw m6, m2 paddw m7, m3 pblendw m6, m7, 0xaa ; 67 89 pmulhrsw m6, m12 paddd m4, m5 vperm2i128 m0, m1, m6, 0x21 ; 45 67 mova m1, m6 pmaddwd m6, m0, m10 pmaddwd m7, m1, m11 paddd m4, m13 paddd m6, m7 paddd m4, m6 psrad m4, rndshift vextracti128 xm5, m4, 1 packssdw xm4, xm5 %ifidn %1, put packuswb xm4, xm4 movd [dstq+dsq*0], xm4 pextrd [dstq+dsq*1], xm4, 1 lea dstq, [dstq+dsq*2] %else mova [tmpq], xm4 add tmpq, 16 %endif sub hd, 2 jg .dy2_w4_loop MC_8TAP_SCALED_RET .dy2_w8: mov dword [rsp+40], 1 movifprep tmp_stridem, 16 jmp .dy2_w_start .dy2_w16: mov dword [rsp+40], 2 movifprep tmp_stridem, 32 jmp .dy2_w_start .dy2_w32: mov dword [rsp+40], 4 movifprep tmp_stridem, 64 jmp .dy2_w_start .dy2_w64: mov dword [rsp+40], 8 movifprep tmp_stridem, 128 jmp .dy2_w_start .dy2_w128: mov dword [rsp+40], 16 movifprep tmp_stridem, 256 .dy2_w_start: mov myd, mym %ifidn %1, put movifnidn dsm, dsq %endif shr t0d, 16 sub srcq, 3 shr myd, 6 mov r4d, 64 << 24 lea myd, [t1+myq] cmovnz r4q, [base+subpel_filters+myq*8] pmaddwd m8, [base+rescale_mul] movd xm15, t0d mov [rsp+64], t0d mov [rsp+48], srcq mov [rsp+56], r0q ; dstq / tmpq %if UNIX64 mov hm, hd %endif shl dword dxm, 3 ; dx*8 vpbroadcastd m15, xm15 paddd m14, m8 ; mx+dx*[0-7] movq xm0, r4q pmovsxbw xm0, xm0 mova [rsp+0x50], xm0 jmp .dy2_hloop .dy2_hloop_prep: dec dword [rsp+40] jz .ret add qword [rsp+56], 8*(isprep+1) mov hd, hm vpbroadcastd m8, dxm vpbroadcastd m10, [base+pd_0x3ff] paddd m14, m8, [rsp] vpbroadcastd m15, [rsp+64] pxor m9, m9 mov srcq, [rsp+48] mov r0q, [rsp+56] ; dstq / tmpq .dy2_hloop: vpbroadcastq m11, 
[base+pq_0x40000000] pand m6, m14, m10 psrld m6, 6 paddd m15, m6 pcmpeqd m6, m9 vextracti128 xm7, m15, 1 movd r4d, xm15 pextrd r6d, xm15, 2 pextrd r7d, xm15, 1 pextrd r9d, xm15, 3 movd r10d, xm7 pextrd r11d, xm7, 2 pextrd r13d, xm7, 1 pextrd rXd, xm7, 3 movu [rsp], m14 movq xm15, [base+subpel_filters+ r4*8] movq xm10, [base+subpel_filters+ r6*8] movhps xm15, [base+subpel_filters+ r7*8] movhps xm10, [base+subpel_filters+ r9*8] vinserti128 m15, [base+subpel_filters+r10*8], 1 vinserti128 m10, [base+subpel_filters+r11*8], 1 vpbroadcastq m9, [base+subpel_filters+r13*8] vpbroadcastq m8, [base+subpel_filters+ rX*8] psrld m14, 10 vextracti128 xm7, m14, 1 movd r4d, xm14 pextrd r6d, xm14, 2 pextrd r7d, xm14, 1 pextrd r9d, xm14, 3 movd r10d, xm7 pextrd r11d, xm7, 2 pextrd r13d, xm7, 1 pextrd rXd, xm7, 3 pshufd m5, m6, q1100 pshufd m6, m6, q3322 vpblendd m15, m9, 0xc0 vpblendd m10, m8, 0xc0 pblendvb m15, m11, m5 pblendvb m10, m11, m6 vbroadcasti128 m14, [base+subpel_s_shuf8] MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7 ; 0a 1a 0b 1b MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8 ; 2a 3a 2b 3b MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9 ; 4a 5a 4b 5b MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 11 ; 6a 7a 6b 7b vpbroadcastd m8, [rsp+0x50] vpbroadcastd m9, [rsp+0x54] vpbroadcastd m11, [rsp+0x58] vpbroadcastd m4, [rsp+0x5c] pshufb m0, m14 ; 01a 01b pshufb m1, m14 ; 23a 23b pshufb m2, m14 ; 45a 45b pshufb m3, m14 ; 67a 67b SWAP m14, m4 .dy2_vloop: pmaddwd m4, m0, m8 pmaddwd m5, m1, m9 pmaddwd m6, m2, m11 pmaddwd m7, m3, m14 paddd m4, m5 paddd m6, m7 paddd m4, m13 paddd m4, m6 psrad m4, rndshift vextracti128 xm5, m4, 1 packssdw xm4, xm5 %ifidn %1, put packuswb xm4, xm4 movq [dstq], xm4 add dstq, dsm %else mova [tmpq], xm4 add tmpq, tmp_stridem %endif dec hd jz .dy2_hloop_prep mova m0, m1 mova m1, m2 mova m2, m3 movq xm3, [srcq+ r4] movq xm4, [srcq+ r6] movhps xm3, [srcq+ r7] movhps xm4, [srcq+ r9] vinserti128 m3, [srcq+r10], 1 vinserti128 m4, [srcq+r11], 1 vpbroadcastq m5, [srcq+r13] vpbroadcastq m6, [srcq+ rX] add srcq, ssq vpblendd m3, m5, 0xc0 vpblendd m4, m6, 0xc0 pmaddubsw m3, m15 pmaddubsw m4, m10 phaddw m3, m4 movq xm4, [srcq+ r4] movq xm5, [srcq+ r6] movhps xm4, [srcq+ r7] movhps xm5, [srcq+ r9] vinserti128 m4, [srcq+r10], 1 vinserti128 m5, [srcq+r11], 1 vpbroadcastq m6, [srcq+r13] vpbroadcastq m7, [srcq+ rX] add srcq, ssq vpblendd m4, m6, 0xc0 vpblendd m5, m7, 0xc0 pmaddubsw m4, m15 pmaddubsw m5, m10 phaddw m4, m5 psrld m5, m3, 16 pslld m6, m4, 16 paddw m3, m5 paddw m4, m6 pblendw m3, m4, 0xaa pmulhrsw m3, m12 jmp .dy2_vloop .ret: MC_8TAP_SCALED_RET 0 %undef isprep %endmacro %macro BILIN_SCALED_FN 1 cglobal %1_bilin_scaled_8bpc mov t0d, (5*15 << 16) | 5*15 mov t1d, t0d jmp mangle(private_prefix %+ _%1_8tap_scaled_8bpc %+ SUFFIX) %endmacro %if WIN64 DECLARE_REG_TMP 6, 5 %else DECLARE_REG_TMP 6, 8 %endif %define PUT_8TAP_SCALED_FN FN put_8tap_scaled, %define PREP_8TAP_SCALED_FN FN prep_8tap_scaled, BILIN_SCALED_FN put PUT_8TAP_SCALED_FN sharp, SHARP, SHARP PUT_8TAP_SCALED_FN sharp_smooth, SHARP, SMOOTH PUT_8TAP_SCALED_FN smooth_sharp, SMOOTH, SHARP PUT_8TAP_SCALED_FN smooth, SMOOTH, SMOOTH PUT_8TAP_SCALED_FN sharp_regular, SHARP, REGULAR PUT_8TAP_SCALED_FN regular_sharp, REGULAR, SHARP PUT_8TAP_SCALED_FN smooth_regular, SMOOTH, REGULAR PUT_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH PUT_8TAP_SCALED_FN regular, REGULAR, REGULAR MC_8TAP_SCALED put %if WIN64 DECLARE_REG_TMP 5, 4 %else DECLARE_REG_TMP 6, 7 %endif BILIN_SCALED_FN prep PREP_8TAP_SCALED_FN sharp, SHARP, SHARP PREP_8TAP_SCALED_FN sharp_smooth, SHARP, SMOOTH 
PREP_8TAP_SCALED_FN smooth_sharp, SMOOTH, SHARP PREP_8TAP_SCALED_FN smooth, SMOOTH, SMOOTH PREP_8TAP_SCALED_FN sharp_regular, SHARP, REGULAR PREP_8TAP_SCALED_FN regular_sharp, REGULAR, SHARP PREP_8TAP_SCALED_FN smooth_regular, SMOOTH, REGULAR PREP_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH PREP_8TAP_SCALED_FN regular, REGULAR, REGULAR MC_8TAP_SCALED prep %macro WARP_V 5 ; dst, 02, 46, 13, 57 ; Can be done using gathers, but that's terribly slow on many CPU:s lea tmp1d, [myq+deltaq*4] lea tmp2d, [myq+deltaq*1] shr myd, 10 shr tmp1d, 10 movq xm8, [filterq+myq *8] vinserti128 m8, [filterq+tmp1q*8], 1 ; a e lea tmp1d, [tmp2q+deltaq*4] lea myd, [tmp2q+deltaq*1] shr tmp2d, 10 shr tmp1d, 10 movq xm0, [filterq+tmp2q*8] vinserti128 m0, [filterq+tmp1q*8], 1 ; b f lea tmp1d, [myq+deltaq*4] lea tmp2d, [myq+deltaq*1] shr myd, 10 shr tmp1d, 10 movq xm9, [filterq+myq *8] vinserti128 m9, [filterq+tmp1q*8], 1 ; c g lea tmp1d, [tmp2q+deltaq*4] lea myd, [tmp2q+gammaq] ; my += gamma shr tmp2d, 10 shr tmp1d, 10 punpcklwd m8, m0 movq xm0, [filterq+tmp2q*8] vinserti128 m0, [filterq+tmp1q*8], 1 ; d h punpcklwd m0, m9, m0 punpckldq m9, m8, m0 punpckhdq m0, m8, m0 punpcklbw m8, m11, m9 ; a0 a2 b0 b2 c0 c2 d0 d2 << 8 punpckhbw m9, m11, m9 ; a4 a6 b4 b6 c4 c6 d4 d6 << 8 pmaddwd m%2, m8 pmaddwd m9, m%3 punpcklbw m8, m11, m0 ; a1 a3 b1 b3 c1 c3 d1 d3 << 8 punpckhbw m0, m11, m0 ; a5 a7 b5 b7 c5 c7 d5 d7 << 8 pmaddwd m8, m%4 pmaddwd m0, m%5 paddd m%2, m9 paddd m0, m8 paddd m%1, m0, m%2 %endmacro cglobal warp_affine_8x8t_8bpc, 0, 14, 0, tmp, ts %if WIN64 sub rsp, 0xa0 %endif call mangle(private_prefix %+ _warp_affine_8x8_8bpc_avx2).main .loop: psrad m7, 13 psrad m0, 13 packssdw m7, m0 pmulhrsw m7, m14 ; (x + (1 << 6)) >> 7 vpermq m7, m7, q3120 mova [tmpq+tsq*0], xm7 vextracti128 [tmpq+tsq*2], m7, 1 dec r4d jz mangle(private_prefix %+ _warp_affine_8x8_8bpc_avx2).end call mangle(private_prefix %+ _warp_affine_8x8_8bpc_avx2).main2 lea tmpq, [tmpq+tsq*4] jmp .loop cglobal warp_affine_8x8_8bpc, 0, 14, 0, dst, ds, src, ss, abcd, mx, tmp2, alpha, \ beta, filter, tmp1, delta, my, gamma %if WIN64 sub rsp, 0xa0 %assign xmm_regs_used 16 %assign stack_size_padded 0xa0 %assign stack_offset stack_offset+stack_size_padded %endif call .main jmp .start .loop: call .main2 lea dstq, [dstq+dsq*2] .start: psrad m7, 18 psrad m0, 18 packusdw m7, m0 pavgw m7, m11 ; (x + (1 << 10)) >> 11 vextracti128 xm0, m7, 1 packuswb xm7, xm0 pshufd xm7, xm7, q3120 movq [dstq+dsq*0], xm7 movhps [dstq+dsq*1], xm7 dec r4d jg .loop .end: RET ALIGN function_align .main: ; Stack args offset by one (r4m -> r5m etc.) 
due to call %if WIN64 mov abcdq, r5m mov mxd, r6m movaps [rsp+stack_offset+0x10], xmm6 movaps [rsp+stack_offset+0x20], xmm7 movaps [rsp+0x28], xmm8 movaps [rsp+0x38], xmm9 movaps [rsp+0x48], xmm10 movaps [rsp+0x58], xmm11 movaps [rsp+0x68], xmm12 movaps [rsp+0x78], xmm13 movaps [rsp+0x88], xmm14 movaps [rsp+0x98], xmm15 %endif movsx alphad, word [abcdq+2*0] movsx betad, word [abcdq+2*1] mova m12, [warp_8x8_shufA] mova m13, [warp_8x8_shufB] vpbroadcastd m14, [pw_8192] vpbroadcastd m15, [pd_32768] pxor m11, m11 lea filterq, [mc_warp_filter2] lea tmp1q, [ssq*3+3] add mxd, 512+(64<<10) lea tmp2d, [alphaq*3] sub srcq, tmp1q ; src -= src_stride*3 + 3 sub betad, tmp2d ; beta -= alpha*3 mov myd, r7m call .h psrld m1, m0, 16 call .h psrld m4, m0, 16 call .h pblendw m1, m0, 0xaa ; 02 call .h pblendw m4, m0, 0xaa ; 13 call .h psrld m2, m1, 16 pblendw m2, m0, 0xaa ; 24 call .h psrld m5, m4, 16 pblendw m5, m0, 0xaa ; 35 call .h psrld m3, m2, 16 pblendw m3, m0, 0xaa ; 46 movsx deltad, word [abcdq+2*2] movsx gammad, word [abcdq+2*3] add myd, 512+(64<<10) mov r4d, 4 lea tmp1d, [deltaq*3] sub gammad, tmp1d ; gamma -= delta*3 .main2: call .h psrld m6, m5, 16 pblendw m6, m0, 0xaa ; 57 WARP_V 7, 1, 3, 4, 6 call .h mova m1, m2 mova m2, m3 psrld m3, 16 pblendw m3, m0, 0xaa ; 68 WARP_V 0, 4, 6, 1, 3 mova m4, m5 mova m5, m6 ret ALIGN function_align .h: lea tmp1d, [mxq+alphaq*4] lea tmp2d, [mxq+alphaq*1] vbroadcasti128 m10, [srcq] shr mxd, 10 shr tmp1d, 10 movq xm8, [filterq+mxq *8] vinserti128 m8, [filterq+tmp1q*8], 1 lea tmp1d, [tmp2q+alphaq*4] lea mxd, [tmp2q+alphaq*1] shr tmp2d, 10 shr tmp1d, 10 movq xm0, [filterq+tmp2q*8] vinserti128 m0, [filterq+tmp1q*8], 1 lea tmp1d, [mxq+alphaq*4] lea tmp2d, [mxq+alphaq*1] shr mxd, 10 shr tmp1d, 10 movq xm9, [filterq+mxq *8] vinserti128 m9, [filterq+tmp1q*8], 1 lea tmp1d, [tmp2q+alphaq*4] lea mxd, [tmp2q+betaq] ; mx += beta shr tmp2d, 10 shr tmp1d, 10 punpcklqdq m8, m0 ; 0 1 4 5 movq xm0, [filterq+tmp2q*8] vinserti128 m0, [filterq+tmp1q*8], 1 punpcklqdq m9, m0 ; 2 3 6 7 pshufb m0, m10, m12 pmaddubsw m0, m8 pshufb m10, m13 pmaddubsw m10, m9 add srcq, ssq phaddw m0, m10 pmaddwd m0, m14 ; 17-bit intermediate, upshifted by 13 paddd m0, m15 ; rounded 14-bit result in upper 16 bits of dword ret %macro BIDIR_FN 1 ; op %1 0 lea stride3q, [strideq*3] jmp wq .w4: vextracti128 xm1, m0, 1 movd [dstq ], xm0 pextrd [dstq+strideq*1], xm0, 1 movd [dstq+strideq*2], xm1 pextrd [dstq+stride3q ], xm1, 1 cmp hd, 4 je .ret lea dstq, [dstq+strideq*4] pextrd [dstq ], xm0, 2 pextrd [dstq+strideq*1], xm0, 3 pextrd [dstq+strideq*2], xm1, 2 pextrd [dstq+stride3q ], xm1, 3 cmp hd, 8 je .ret %1 2 lea dstq, [dstq+strideq*4] vextracti128 xm1, m0, 1 movd [dstq ], xm0 pextrd [dstq+strideq*1], xm0, 1 movd [dstq+strideq*2], xm1 pextrd [dstq+stride3q ], xm1, 1 lea dstq, [dstq+strideq*4] pextrd [dstq ], xm0, 2 pextrd [dstq+strideq*1], xm0, 3 pextrd [dstq+strideq*2], xm1, 2 pextrd [dstq+stride3q ], xm1, 3 .ret: RET .w8_loop: %1_INC_PTR 2 %1 0 lea dstq, [dstq+strideq*4] .w8: vextracti128 xm1, m0, 1 movq [dstq ], xm0 movq [dstq+strideq*1], xm1 movhps [dstq+strideq*2], xm0 movhps [dstq+stride3q ], xm1 sub hd, 4 jg .w8_loop RET .w16_loop: %1_INC_PTR 4 %1 0 lea dstq, [dstq+strideq*4] .w16: vpermq m0, m0, q3120 mova [dstq ], xm0 vextracti128 [dstq+strideq*1], m0, 1 %1 2 vpermq m0, m0, q3120 mova [dstq+strideq*2], xm0 vextracti128 [dstq+stride3q ], m0, 1 sub hd, 4 jg .w16_loop RET .w32_loop: %1_INC_PTR 4 %1 0 lea dstq, [dstq+strideq*2] .w32: vpermq m0, m0, q3120 mova [dstq+strideq*0], m0 %1 2 vpermq m0, m0, q3120 mova 
[dstq+strideq*1], m0 sub hd, 2 jg .w32_loop RET .w64_loop: %1_INC_PTR 4 %1 0 add dstq, strideq .w64: vpermq m0, m0, q3120 mova [dstq], m0 %1 2 vpermq m0, m0, q3120 mova [dstq+32], m0 dec hd jg .w64_loop RET .w128_loop: %1 0 add dstq, strideq .w128: vpermq m0, m0, q3120 mova [dstq+0*32], m0 %1 2 vpermq m0, m0, q3120 mova [dstq+1*32], m0 %1_INC_PTR 8 %1 -4 vpermq m0, m0, q3120 mova [dstq+2*32], m0 %1 -2 vpermq m0, m0, q3120 mova [dstq+3*32], m0 dec hd jg .w128_loop RET %endmacro %macro AVG 1 ; src_offset mova m0, [tmp1q+(%1+0)*32] paddw m0, [tmp2q+(%1+0)*32] mova m1, [tmp1q+(%1+1)*32] paddw m1, [tmp2q+(%1+1)*32] pmulhrsw m0, m2 pmulhrsw m1, m2 packuswb m0, m1 %endmacro %macro AVG_INC_PTR 1 add tmp1q, %1*32 add tmp2q, %1*32 %endmacro cglobal avg_8bpc, 4, 7, 3, dst, stride, tmp1, tmp2, w, h, stride3 %define base r6-avg %+ SUFFIX %+ _table lea r6, [avg %+ SUFFIX %+ _table] tzcnt wd, wm movifnidn hd, hm movsxd wq, dword [r6+wq*4] vpbroadcastd m2, [base+pw_1024] add wq, r6 BIDIR_FN AVG %macro W_AVG 1 ; src_offset ; (a * weight + b * (16 - weight) + 128) >> 8 ; = ((a - b) * weight + (b << 4) + 128) >> 8 ; = ((((a - b) * ((weight-16) << 12)) >> 16) + a + 8) >> 4 ; = ((((b - a) * (-weight << 12)) >> 16) + b + 8) >> 4 mova m0, [tmp1q+(%1+0)*32] psubw m2, m0, [tmp2q+(%1+0)*32] mova m1, [tmp1q+(%1+1)*32] psubw m3, m1, [tmp2q+(%1+1)*32] pmulhw m2, m4 pmulhw m3, m4 paddw m0, m2 paddw m1, m3 pmulhrsw m0, m5 pmulhrsw m1, m5 packuswb m0, m1 %endmacro %define W_AVG_INC_PTR AVG_INC_PTR cglobal w_avg_8bpc, 4, 7, 6, dst, stride, tmp1, tmp2, w, h, stride3 %define base r6-w_avg %+ SUFFIX %+ _table lea r6, [w_avg %+ SUFFIX %+ _table] tzcnt wd, wm movifnidn hd, hm vpbroadcastw m4, r6m ; weight movsxd wq, dword [r6+wq*4] vpbroadcastd m5, [base+pw_2048] psllw m4, 12 ; (weight-16) << 12 when interpreted as signed add wq, r6 cmp dword r6m, 7 jg .weight_gt7 mov r6, tmp1q pxor m0, m0 mov tmp1q, tmp2q psubw m4, m0, m4 ; -weight mov tmp2q, r6 .weight_gt7: BIDIR_FN W_AVG %macro MASK 1 ; src_offset ; (a * m + b * (64 - m) + 512) >> 10 ; = ((a - b) * m + (b << 6) + 512) >> 10 ; = ((((b - a) * (-m << 10)) >> 16) + b + 8) >> 4 vpermq m3, [maskq+%1*16], q3120 mova m0, [tmp2q+(%1+0)*32] psubw m1, m0, [tmp1q+(%1+0)*32] psubb m3, m4, m3 paddw m1, m1 ; (b - a) << 1 paddb m3, m3 punpcklbw m2, m4, m3 ; -m << 9 pmulhw m1, m2 paddw m0, m1 mova m1, [tmp2q+(%1+1)*32] psubw m2, m1, [tmp1q+(%1+1)*32] paddw m2, m2 punpckhbw m3, m4, m3 pmulhw m2, m3 paddw m1, m2 pmulhrsw m0, m5 pmulhrsw m1, m5 packuswb m0, m1 %endmacro %macro MASK_INC_PTR 1 add maskq, %1*16 add tmp2q, %1*32 add tmp1q, %1*32 %endmacro cglobal mask_8bpc, 4, 8, 6, dst, stride, tmp1, tmp2, w, h, mask, stride3 %define base r7-mask %+ SUFFIX %+ _table lea r7, [mask %+ SUFFIX %+ _table] tzcnt wd, wm movifnidn hd, hm mov maskq, maskmp movsxd wq, dword [r7+wq*4] vpbroadcastd m5, [base+pw_2048] pxor m4, m4 add wq, r7 BIDIR_FN MASK %macro W_MASK 4-5 0 ; dst, mask, tmp_offset[1-2], 4:4:4 mova m%1, [tmp1q+32*%3] mova m1, [tmp2q+32*%3] psubw m1, m%1 pabsw m%2, m1 psubusw m%2, m6, m%2 psrlw m%2, 8 ; 64 - m psllw m2, m%2, 10 pmulhw m1, m2 paddw m%1, m1 mova m1, [tmp1q+32*%4] mova m2, [tmp2q+32*%4] psubw m2, m1 pabsw m3, m2 psubusw m3, m6, m3 psrlw m3, 8 %if %5 packuswb m%2, m3 psubb m%2, m5, m%2 vpermq m%2, m%2, q3120 %else phaddw m%2, m3 %endif psllw m3, 10 pmulhw m2, m3 paddw m1, m2 pmulhrsw m%1, m7 pmulhrsw m1, m7 packuswb m%1, m1 %endmacro cglobal blend_8bpc, 3, 7, 7, dst, ds, tmp, w, h, mask %define base r6-blend_avx2_table lea r6, [blend_avx2_table] tzcnt wd, wm movifnidn maskq, maskmp 
movifnidn hd, hm movsxd wq, dword [r6+wq*4] vpbroadcastd m4, [base+pb_64] vpbroadcastd m5, [base+pw_512] sub tmpq, maskq add wq, r6 lea r6, [dsq*3] jmp wq .w4: movd xm0, [dstq+dsq*0] pinsrd xm0, [dstq+dsq*1], 1 vpbroadcastd xm1, [dstq+dsq*2] pinsrd xm1, [dstq+r6 ], 3 mova xm6, [maskq] psubb xm3, xm4, xm6 punpcklbw xm2, xm3, xm6 punpckhbw xm3, xm6 mova xm6, [maskq+tmpq] add maskq, 4*4 punpcklbw xm0, xm6 punpckhbw xm1, xm6 pmaddubsw xm0, xm2 pmaddubsw xm1, xm3 pmulhrsw xm0, xm5 pmulhrsw xm1, xm5 packuswb xm0, xm1 movd [dstq+dsq*0], xm0 pextrd [dstq+dsq*1], xm0, 1 pextrd [dstq+dsq*2], xm0, 2 pextrd [dstq+r6 ], xm0, 3 lea dstq, [dstq+dsq*4] sub hd, 4 jg .w4 RET ALIGN function_align .w8: movq xm1, [dstq+dsq*0] movhps xm1, [dstq+dsq*1] vpbroadcastq m2, [dstq+dsq*2] vpbroadcastq m3, [dstq+r6 ] mova m0, [maskq] mova m6, [maskq+tmpq] add maskq, 8*4 vpblendd m1, m2, 0x30 vpblendd m1, m3, 0xc0 psubb m3, m4, m0 punpcklbw m2, m3, m0 punpckhbw m3, m0 punpcklbw m0, m1, m6 punpckhbw m1, m6 pmaddubsw m0, m2 pmaddubsw m1, m3 pmulhrsw m0, m5 pmulhrsw m1, m5 packuswb m0, m1 vextracti128 xm1, m0, 1 movq [dstq+dsq*0], xm0 movhps [dstq+dsq*1], xm0 movq [dstq+dsq*2], xm1 movhps [dstq+r6 ], xm1 lea dstq, [dstq+dsq*4] sub hd, 4 jg .w8 RET ALIGN function_align .w16: mova m0, [maskq] mova xm1, [dstq+dsq*0] vinserti128 m1, [dstq+dsq*1], 1 psubb m3, m4, m0 punpcklbw m2, m3, m0 punpckhbw m3, m0 mova m6, [maskq+tmpq] add maskq, 16*2 punpcklbw m0, m1, m6 punpckhbw m1, m6 pmaddubsw m0, m2 pmaddubsw m1, m3 pmulhrsw m0, m5 pmulhrsw m1, m5 packuswb m0, m1 mova [dstq+dsq*0], xm0 vextracti128 [dstq+dsq*1], m0, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .w16 RET ALIGN function_align .w32: mova m0, [maskq] mova m1, [dstq] mova m6, [maskq+tmpq] add maskq, 32 psubb m3, m4, m0 punpcklbw m2, m3, m0 punpckhbw m3, m0 punpcklbw m0, m1, m6 punpckhbw m1, m6 pmaddubsw m0, m2 pmaddubsw m1, m3 pmulhrsw m0, m5 pmulhrsw m1, m5 packuswb m0, m1 mova [dstq], m0 add dstq, dsq dec hd jg .w32 RET cglobal blend_v_8bpc, 3, 6, 6, dst, ds, tmp, w, h, mask %define base r5-blend_v_avx2_table lea r5, [blend_v_avx2_table] tzcnt wd, wm movifnidn hd, hm movsxd wq, dword [r5+wq*4] vpbroadcastd m5, [base+pw_512] add wq, r5 add maskq, obmc_masks-blend_v_avx2_table jmp wq .w2: vpbroadcastd xm2, [maskq+2*2] .w2_s0_loop: movd xm0, [dstq+dsq*0] pinsrw xm0, [dstq+dsq*1], 1 movd xm1, [tmpq] add tmpq, 2*2 punpcklbw xm0, xm1 pmaddubsw xm0, xm2 pmulhrsw xm0, xm5 packuswb xm0, xm0 pextrw [dstq+dsq*0], xm0, 0 pextrw [dstq+dsq*1], xm0, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .w2_s0_loop RET ALIGN function_align .w4: vpbroadcastq xm2, [maskq+4*2] .w4_loop: movd xm0, [dstq+dsq*0] pinsrd xm0, [dstq+dsq*1], 1 movq xm1, [tmpq] add tmpq, 4*2 punpcklbw xm0, xm1 pmaddubsw xm0, xm2 pmulhrsw xm0, xm5 packuswb xm0, xm0 movd [dstq+dsq*0], xm0 pextrd [dstq+dsq*1], xm0, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .w4_loop RET ALIGN function_align .w8: mova xm3, [maskq+8*2] .w8_loop: movq xm0, [dstq+dsq*0] vpbroadcastq xm1, [dstq+dsq*1] mova xm2, [tmpq] add tmpq, 8*2 punpcklbw xm0, xm2 punpckhbw xm1, xm2 pmaddubsw xm0, xm3 pmaddubsw xm1, xm3 pmulhrsw xm0, xm5 pmulhrsw xm1, xm5 packuswb xm0, xm1 movq [dstq+dsq*0], xm0 movhps [dstq+dsq*1], xm0 lea dstq, [dstq+dsq*2] sub hd, 2 jg .w8_loop RET ALIGN function_align .w16: vbroadcasti128 m3, [maskq+16*2] vbroadcasti128 m4, [maskq+16*3] .w16_loop: mova xm1, [dstq+dsq*0] vinserti128 m1, [dstq+dsq*1], 1 mova m2, [tmpq] add tmpq, 16*2 punpcklbw m0, m1, m2 punpckhbw m1, m2 pmaddubsw m0, m3 pmaddubsw m1, m4 pmulhrsw m0, m5 pmulhrsw m1, m5 packuswb m0, m1 mova 
[dstq+dsq*0], xm0 vextracti128 [dstq+dsq*1], m0, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .w16_loop RET ALIGN function_align .w32: mova xm3, [maskq+16*4] vinserti128 m3, [maskq+16*6], 1 mova xm4, [maskq+16*5] vinserti128 m4, [maskq+16*7], 1 .w32_loop: mova m1, [dstq] mova m2, [tmpq] add tmpq, 32 punpcklbw m0, m1, m2 punpckhbw m1, m2 pmaddubsw m0, m3 pmaddubsw m1, m4 pmulhrsw m0, m5 pmulhrsw m1, m5 packuswb m0, m1 mova [dstq], m0 add dstq, dsq dec hd jg .w32_loop RET cglobal blend_h_8bpc, 4, 7, 6, dst, ds, tmp, w, h, mask %define base r5-blend_h_avx2_table lea r5, [blend_h_avx2_table] mov r6d, wd tzcnt wd, wd mov hd, hm movsxd wq, dword [r5+wq*4] vpbroadcastd m5, [base+pw_512] add wq, r5 lea maskq, [base+obmc_masks+hq*2] lea hd, [hq*3] shr hd, 2 ; h * 3/4 lea maskq, [maskq+hq*2] neg hq jmp wq .w2: movd xm0, [dstq+dsq*0] pinsrw xm0, [dstq+dsq*1], 1 movd xm2, [maskq+hq*2] movd xm1, [tmpq] add tmpq, 2*2 punpcklwd xm2, xm2 punpcklbw xm0, xm1 pmaddubsw xm0, xm2 pmulhrsw xm0, xm5 packuswb xm0, xm0 pextrw [dstq+dsq*0], xm0, 0 pextrw [dstq+dsq*1], xm0, 1 lea dstq, [dstq+dsq*2] add hq, 2 jl .w2 RET ALIGN function_align .w4: mova xm3, [blend_shuf] .w4_loop: movd xm0, [dstq+dsq*0] pinsrd xm0, [dstq+dsq*1], 1 movd xm2, [maskq+hq*2] movq xm1, [tmpq] add tmpq, 4*2 pshufb xm2, xm3 punpcklbw xm0, xm1 pmaddubsw xm0, xm2 pmulhrsw xm0, xm5 packuswb xm0, xm0 movd [dstq+dsq*0], xm0 pextrd [dstq+dsq*1], xm0, 1 lea dstq, [dstq+dsq*2] add hq, 2 jl .w4_loop RET ALIGN function_align .w8: vbroadcasti128 m4, [blend_shuf] shufpd m4, m4, 0x03 .w8_loop: vpbroadcastq m1, [dstq+dsq*0] movq xm0, [dstq+dsq*1] vpblendd m0, m1, 0x30 vpbroadcastd m3, [maskq+hq*2] movq xm1, [tmpq+8*1] vinserti128 m1, [tmpq+8*0], 1 add tmpq, 8*2 pshufb m3, m4 punpcklbw m0, m1 pmaddubsw m0, m3 pmulhrsw m0, m5 vextracti128 xm1, m0, 1 packuswb xm0, xm1 movhps [dstq+dsq*0], xm0 movq [dstq+dsq*1], xm0 lea dstq, [dstq+dsq*2] add hq, 2 jl .w8_loop RET ALIGN function_align .w16: vbroadcasti128 m4, [blend_shuf] shufpd m4, m4, 0x0c .w16_loop: mova xm1, [dstq+dsq*0] vinserti128 m1, [dstq+dsq*1], 1 vpbroadcastd m3, [maskq+hq*2] mova m2, [tmpq] add tmpq, 16*2 pshufb m3, m4 punpcklbw m0, m1, m2 punpckhbw m1, m2 pmaddubsw m0, m3 pmaddubsw m1, m3 pmulhrsw m0, m5 pmulhrsw m1, m5 packuswb m0, m1 mova [dstq+dsq*0], xm0 vextracti128 [dstq+dsq*1], m0, 1 lea dstq, [dstq+dsq*2] add hq, 2 jl .w16_loop RET ALIGN function_align .w32: ; w32/w64/w128 sub dsq, r6 .w32_loop0: vpbroadcastw m3, [maskq+hq*2] mov wd, r6d .w32_loop: mova m1, [dstq] mova m2, [tmpq] add tmpq, 32 punpcklbw m0, m1, m2 punpckhbw m1, m2 pmaddubsw m0, m3 pmaddubsw m1, m3 pmulhrsw m0, m5 pmulhrsw m1, m5 packuswb m0, m1 mova [dstq], m0 add dstq, 32 sub wd, 32 jg .w32_loop add dstq, dsq inc hq jl .w32_loop0 RET cglobal emu_edge_8bpc, 10, 13, 1, bw, bh, iw, ih, x, y, dst, dstride, src, sstride, \ bottomext, rightext ; we assume that the buffer (stride) is larger than width, so we can ; safely overwrite by a few bytes ; ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride) xor r12d, r12d lea r10, [ihq-1] cmp yq, ihq cmovs r10, yq test yq, yq cmovs r10, r12 imul r10, sstrideq add srcq, r10 ; ref += iclip(x, 0, iw - 1) lea r10, [iwq-1] cmp xq, iwq cmovs r10, xq test xq, xq cmovs r10, r12 add srcq, r10 ; bottom_ext = iclip(y + bh - ih, 0, bh - 1) lea bottomextq, [yq+bhq] sub bottomextq, ihq lea r3, [bhq-1] cmovs bottomextq, r12 DEFINE_ARGS bw, bh, iw, ih, x, topext, dst, dstride, src, sstride, \ bottomext, rightext ; top_ext = iclip(-y, 0, bh - 1) neg topextq cmovs topextq, r12 cmp bottomextq, bhq cmovns bottomextq, r3 
cmp topextq, bhq cmovg topextq, r3 ; right_ext = iclip(x + bw - iw, 0, bw - 1) lea rightextq, [xq+bwq] sub rightextq, iwq lea r2, [bwq-1] cmovs rightextq, r12 DEFINE_ARGS bw, bh, iw, ih, leftext, topext, dst, dstride, src, sstride, \ bottomext, rightext ; left_ext = iclip(-x, 0, bw - 1) neg leftextq cmovs leftextq, r12 cmp rightextq, bwq cmovns rightextq, r2 cmp leftextq, bwq cmovns leftextq, r2 DEFINE_ARGS bw, centerh, centerw, dummy, leftext, topext, \ dst, dstride, src, sstride, bottomext, rightext ; center_h = bh - top_ext - bottom_ext lea r3, [bottomextq+topextq] sub centerhq, r3 ; blk += top_ext * PXSTRIDE(dst_stride) mov r2, topextq imul r2, dstrideq add dstq, r2 mov r9m, dstq ; center_w = bw - left_ext - right_ext mov centerwq, bwq lea r3, [rightextq+leftextq] sub centerwq, r3 %macro v_loop 3 ; need_left_ext, need_right_ext, suffix .v_loop_%3: %if %1 ; left extension xor r3, r3 vpbroadcastb m0, [srcq] .left_loop_%3: mova [dstq+r3], m0 add r3, 32 cmp r3, leftextq jl .left_loop_%3 ; body lea r12, [dstq+leftextq] %endif xor r3, r3 .body_loop_%3: movu m0, [srcq+r3] %if %1 movu [r12+r3], m0 %else movu [dstq+r3], m0 %endif add r3, 32 cmp r3, centerwq jl .body_loop_%3 %if %2 ; right extension %if %1 add r12, centerwq %else lea r12, [dstq+centerwq] %endif xor r3, r3 vpbroadcastb m0, [srcq+centerwq-1] .right_loop_%3: movu [r12+r3], m0 add r3, 32 cmp r3, rightextq jl .right_loop_%3 %endif add dstq, dstrideq add srcq, sstrideq dec centerhq jg .v_loop_%3 %endmacro test leftextq, leftextq jnz .need_left_ext test rightextq, rightextq jnz .need_right_ext v_loop 0, 0, 0 jmp .body_done .need_left_ext: test rightextq, rightextq jnz .need_left_right_ext v_loop 1, 0, 1 jmp .body_done .need_left_right_ext: v_loop 1, 1, 2 jmp .body_done .need_right_ext: v_loop 0, 1, 3 .body_done: ; bottom edge extension test bottomextq, bottomextq jz .top mov srcq, dstq sub srcq, dstrideq xor r1, r1 .bottom_x_loop: mova m0, [srcq+r1] lea r3, [dstq+r1] mov r4, bottomextq .bottom_y_loop: mova [r3], m0 add r3, dstrideq dec r4 jg .bottom_y_loop add r1, 32 cmp r1, bwq jl .bottom_x_loop .top: ; top edge extension test topextq, topextq jz .end mov srcq, r9m mov dstq, dstm xor r1, r1 .top_x_loop: mova m0, [srcq+r1] lea r3, [dstq+r1] mov r4, topextq .top_y_loop: mova [r3], m0 add r3, dstrideq dec r4 jg .top_y_loop add r1, 32 cmp r1, bwq jl .top_x_loop .end: RET cglobal resize_8bpc, 6, 12, 16, dst, dst_stride, src, src_stride, \ dst_w, h, src_w, dx, mx0 sub dword mx0m, 4<<14 sub dword src_wm, 8 vpbroadcastd m5, dxm vpbroadcastd m8, mx0m vpbroadcastd m6, src_wm DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, h, x LEA r7, $$ %define base r7-$$ vpbroadcastd xm3, [base+pw_m256] vpbroadcastd m7, [base+pd_63] vbroadcasti128 m15, [base+pb_8x0_8x8] pmaddwd m2, m5, [base+rescale_mul] ; dx*[0,1,2,3,4,5,6,7] pslld m5, 3 ; dx*8 pslld m6, 14 paddd m8, m2 ; mx+[0..7]*dx pxor m2, m2 ; m2 = 0, m3 = pmulhrsw constant for x=(x+64)>>7 ; m8 = mx+[0..7]*dx, m5 = dx*8, m6 = src_w, m7 = 0x3f, m15=0,8 .loop_y: xor xd, xd mova m4, m8 ; per-line working version of mx .loop_x: pmaxsd m0, m4, m2 psrad m9, m4, 8 ; filter offset (unmasked) pminsd m0, m6 ; iclip(mx, 0, src_w-8) psubd m1, m4, m0 ; pshufb offset psrad m0, 14 ; clipped src_x offset psrad m1, 14 ; pshufb edge_emu offset pand m9, m7 ; filter offset (masked) ; load source pixels - this ugly code is vpgatherdq emulation since ; directly using vpgatherdq on Haswell is quite a bit slower :( movd r8d, xm0 pextrd r9d, xm0, 1 pextrd r10d, xm0, 2 pextrd r11d, xm0, 3 vextracti128 xm0, m0, 1 movq xm12, 
[srcq+r8] movq xm13, [srcq+r10] movhps xm12, [srcq+r9] movhps xm13, [srcq+r11] movd r8d, xm0 pextrd r9d, xm0, 1 pextrd r10d, xm0, 2 pextrd r11d, xm0, 3 vinserti128 m12, [srcq+r8], 1 vinserti128 m13, [srcq+r10], 1 vpbroadcastq m10, [srcq+r9] vpbroadcastq m11, [srcq+r11] vpblendd m12, m10, 11000000b vpblendd m13, m11, 11000000b ; if no emulation is required, we don't need to shuffle or emulate edges ; this also saves 2 quasi-vpgatherdqs vptest m1, m1 jz .filter movq r9, xm1 pextrq r11, xm1, 1 movsxd r8, r9d sar r9, 32 movsxd r10, r11d sar r11, 32 vextracti128 xm1, m1, 1 movq xm14, [base+resize_shuf+4+r8] movq xm0, [base+resize_shuf+4+r10] movhps xm14, [base+resize_shuf+4+r9] movhps xm0, [base+resize_shuf+4+r11] movq r9, xm1 pextrq r11, xm1, 1 movsxd r8, r9d sar r9, 32 movsxd r10, r11d sar r11, 32 vinserti128 m14, [base+resize_shuf+4+r8], 1 vinserti128 m0, [base+resize_shuf+4+r10], 1 vpbroadcastq m10, [base+resize_shuf+4+r9] vpbroadcastq m11, [base+resize_shuf+4+r11] vpblendd m14, m10, 11000000b vpblendd m0, m11, 11000000b paddb m14, m15 paddb m0, m15 pshufb m12, m14 pshufb m13, m0 .filter: movd r8d, xm9 pextrd r9d, xm9, 1 pextrd r10d, xm9, 2 pextrd r11d, xm9, 3 vextracti128 xm9, m9, 1 movq xm10, [base+resize_filter+r8*8] movq xm11, [base+resize_filter+r10*8] movhps xm10, [base+resize_filter+r9*8] movhps xm11, [base+resize_filter+r11*8] movd r8d, xm9 pextrd r9d, xm9, 1 pextrd r10d, xm9, 2 pextrd r11d, xm9, 3 vinserti128 m10, [base+resize_filter+r8*8], 1 vinserti128 m11, [base+resize_filter+r10*8], 1 vpbroadcastq m14, [base+resize_filter+r9*8] vpbroadcastq m1, [base+resize_filter+r11*8] vpblendd m10, m14, 11000000b vpblendd m11, m1, 11000000b pmaddubsw m12, m10 pmaddubsw m13, m11 phaddw m12, m13 vextracti128 xm13, m12, 1 phaddsw xm12, xm13 pmulhrsw xm12, xm3 ; x=(x+64)>>7 packuswb xm12, xm12 movq [dstq+xq], xm12 paddd m4, m5 add xd, 8 cmp xd, dst_wd jl .loop_x add dstq, dst_strideq add srcq, src_strideq dec hd jg .loop_y RET cglobal w_mask_420_8bpc, 4, 8, 14, dst, stride, tmp1, tmp2, w, h, mask, stride3 %define base r7-w_mask_420_avx2_table lea r7, [w_mask_420_avx2_table] tzcnt wd, wm mov r6d, r7m ; sign movifnidn hd, hm movsxd wq, [r7+wq*4] vpbroadcastd m6, [base+pw_6903] ; ((64 - 38) << 8) + 255 - 8 vpbroadcastd m7, [base+pw_2048] pmovzxbd m9, [base+deint_shuf4] vpbroadcastd m8, [base+wm_420_sign+r6*4] ; 258 - sign add wq, r7 W_MASK 0, 4, 0, 1 mov maskq, maskmp lea stride3q, [strideq*3] jmp wq .w4: vextracti128 xm1, m0, 1 movd [dstq+strideq*0], xm0 pextrd [dstq+strideq*1], xm0, 1 movd [dstq+strideq*2], xm1 pextrd [dstq+stride3q ], xm1, 1 cmp hd, 8 jl .w4_end lea dstq, [dstq+strideq*4] pextrd [dstq+strideq*0], xm0, 2 pextrd [dstq+strideq*1], xm0, 3 pextrd [dstq+strideq*2], xm1, 2 pextrd [dstq+stride3q ], xm1, 3 jg .w4_h16 .w4_end: vextracti128 xm0, m4, 1 vpblendd xm1, xm4, xm0, 0x05 vpblendd xm4, xm0, 0x0a pshufd xm1, xm1, q2301 psubw xm4, xm8, xm4 psubw xm4, xm1 psrlw xm4, 2 packuswb xm4, xm4 movq [maskq], xm4 RET .w4_h16: W_MASK 0, 5, 2, 3 lea dstq, [dstq+strideq*4] phaddd m4, m5 vextracti128 xm1, m0, 1 psubw m4, m8, m4 psrlw m4, 2 vpermd m4, m9, m4 vextracti128 xm5, m4, 1 packuswb xm4, xm5 movd [dstq+strideq*0], xm0 pextrd [dstq+strideq*1], xm0, 1 movd [dstq+strideq*2], xm1 pextrd [dstq+stride3q], xm1, 1 lea dstq, [dstq+strideq*4] pextrd [dstq+strideq*0], xm0, 2 pextrd [dstq+strideq*1], xm0, 3 pextrd [dstq+strideq*2], xm1, 2 pextrd [dstq+stride3q ], xm1, 3 mova [maskq], xm4 RET .w8_loop: add tmp1q, 2*32 add tmp2q, 2*32 W_MASK 0, 4, 0, 1 lea dstq, [dstq+strideq*4] add maskq, 8 .w8: 
vextracti128 xm2, m4, 1 vextracti128 xm1, m0, 1 psubw xm4, xm8, xm4 psubw xm4, xm2 psrlw xm4, 2 packuswb xm4, xm4 movq [dstq+strideq*0], xm0 movq [dstq+strideq*1], xm1 movhps [dstq+strideq*2], xm0 movhps [dstq+stride3q ], xm1 movq [maskq], xm4 sub hd, 4 jg .w8_loop RET .w16_loop: add tmp1q, 4*32 add tmp2q, 4*32 W_MASK 0, 4, 0, 1 lea dstq, [dstq+strideq*4] add maskq, 16 .w16: vpermq m0, m0, q3120 mova [dstq+strideq*0], xm0 vextracti128 [dstq+strideq*1], m0, 1 W_MASK 0, 5, 2, 3 punpckhqdq m1, m4, m5 punpcklqdq m4, m5 psubw m1, m8, m1 psubw m1, m4 psrlw m1, 2 vpermq m0, m0, q3120 packuswb m1, m1 vpermd m1, m9, m1 mova [dstq+strideq*2], xm0 vextracti128 [dstq+stride3q ], m0, 1 mova [maskq], xm1 sub hd, 4 jg .w16_loop RET .w32_loop: add tmp1q, 4*32 add tmp2q, 4*32 W_MASK 0, 4, 0, 1 lea dstq, [dstq+strideq*2] add maskq, 16 .w32: vpermq m0, m0, q3120 mova [dstq+strideq*0], m0 W_MASK 0, 5, 2, 3 psubw m4, m8, m4 psubw m4, m5 psrlw m4, 2 vpermq m0, m0, q3120 packuswb m4, m4 vpermd m4, m9, m4 mova [dstq+strideq*1], m0 mova [maskq], xm4 sub hd, 2 jg .w32_loop RET .w64_loop_even: psubw m10, m8, m4 psubw m11, m8, m5 dec hd .w64_loop: add tmp1q, 4*32 add tmp2q, 4*32 W_MASK 0, 4, 0, 1 add dstq, strideq .w64: vpermq m0, m0, q3120 mova [dstq+32*0], m0 W_MASK 0, 5, 2, 3 vpermq m0, m0, q3120 mova [dstq+32*1], m0 test hd, 1 jz .w64_loop_even psubw m4, m10, m4 psubw m5, m11, m5 psrlw m4, 2 psrlw m5, 2 packuswb m4, m5 vpermd m4, m9, m4 mova [maskq], m4 add maskq, 32 dec hd jg .w64_loop RET .w128_loop_even: psubw m12, m8, m4 psubw m13, m8, m5 dec hd .w128_loop: W_MASK 0, 4, 0, 1 add dstq, strideq .w128: vpermq m0, m0, q3120 mova [dstq+32*0], m0 W_MASK 0, 5, 2, 3 vpermq m0, m0, q3120 mova [dstq+32*1], m0 add tmp1q, 8*32 add tmp2q, 8*32 test hd, 1 jz .w128_even psubw m4, m10, m4 psubw m5, m11, m5 psrlw m4, 2 psrlw m5, 2 packuswb m4, m5 vpermd m4, m9, m4 mova [maskq+32*0], m4 jmp .w128_odd .w128_even: psubw m10, m8, m4 psubw m11, m8, m5 .w128_odd: W_MASK 0, 4, -4, -3 vpermq m0, m0, q3120 mova [dstq+32*2], m0 W_MASK 0, 5, -2, -1 vpermq m0, m0, q3120 mova [dstq+32*3], m0 test hd, 1 jz .w128_loop_even psubw m4, m12, m4 psubw m5, m13, m5 psrlw m4, 2 psrlw m5, 2 packuswb m4, m5 vpermd m4, m9, m4 mova [maskq+32*1], m4 add maskq, 64 dec hd jg .w128_loop RET cglobal w_mask_422_8bpc, 4, 8, 11, dst, stride, tmp1, tmp2, w, h, mask, stride3 %define base r7-w_mask_422_avx2_table lea r7, [w_mask_422_avx2_table] tzcnt wd, wm mov r6d, r7m ; sign movifnidn hd, hm pxor m9, m9 movsxd wq, dword [r7+wq*4] vpbroadcastd m6, [base+pw_6903] ; ((64 - 38) << 8) + 255 - 8 vpbroadcastd m7, [base+pw_2048] pmovzxbd m10, [base+deint_shuf4] vpbroadcastd m8, [base+wm_422_sign+r6*4] ; 128 - sign add wq, r7 mov maskq, maskmp W_MASK 0, 4, 0, 1 lea stride3q, [strideq*3] jmp wq .w4: vextracti128 xm1, m0, 1 movd [dstq+strideq*0], xm0 pextrd [dstq+strideq*1], xm0, 1 movd [dstq+strideq*2], xm1 pextrd [dstq+stride3q ], xm1, 1 cmp hd, 8 jl .w4_end lea dstq, [dstq+strideq*4] pextrd [dstq+strideq*0], xm0, 2 pextrd [dstq+strideq*1], xm0, 3 pextrd [dstq+strideq*2], xm1, 2 pextrd [dstq+stride3q ], xm1, 3 jg .w4_h16 .w4_end: vextracti128 xm5, m4, 1 packuswb xm4, xm5 psubb xm5, xm8, xm4 pavgb xm5, xm9 pshufd xm5, xm5, q3120 mova [maskq], xm5 RET .w4_h16: W_MASK 0, 5, 2, 3 lea dstq, [dstq+strideq*4] packuswb m4, m5 psubb m5, m8, m4 pavgb m5, m9 vpermd m5, m10, m5 vextracti128 xm1, m0, 1 movd [dstq+strideq*0], xm0 pextrd [dstq+strideq*1], xm0, 1 movd [dstq+strideq*2], xm1 pextrd [dstq+stride3q ], xm1, 1 lea dstq, [dstq+strideq*4] pextrd [dstq+strideq*0], xm0, 2 pextrd 
[dstq+strideq*1], xm0, 3 pextrd [dstq+strideq*2], xm1, 2 pextrd [dstq+stride3q ], xm1, 3 mova [maskq], m5 RET .w8_loop: add tmp1q, 32*2 add tmp2q, 32*2 W_MASK 0, 4, 0, 1 lea dstq, [dstq+strideq*4] add maskq, 16 .w8: vextracti128 xm5, m4, 1 vextracti128 xm1, m0, 1 packuswb xm4, xm5 psubb xm5, xm8, xm4 pavgb xm5, xm9 pshufd xm5, xm5, q3120 movq [dstq+strideq*0], xm0 movq [dstq+strideq*1], xm1 movhps [dstq+strideq*2], xm0 movhps [dstq+stride3q ], xm1 mova [maskq], xm5 sub hd, 4 jg .w8_loop RET .w16_loop: add tmp1q, 32*4 add tmp2q, 32*4 W_MASK 0, 4, 0, 1 lea dstq, [dstq+strideq*4] add maskq, 32 .w16: vpermq m0, m0, q3120 mova [dstq+strideq*0], xm0 vextracti128 [dstq+strideq*1], m0, 1 W_MASK 0, 5, 2, 3 packuswb m4, m5 psubb m5, m8, m4 pavgb m5, m9 vpermq m0, m0, q3120 vpermd m5, m10, m5 mova [dstq+strideq*2], xm0 vextracti128 [dstq+stride3q ], m0, 1 mova [maskq], m5 sub hd, 4 jg .w16_loop RET .w32_loop: add tmp1q, 32*4 add tmp2q, 32*4 W_MASK 0, 4, 0, 1 lea dstq, [dstq+strideq*2] add maskq, 32 .w32: vpermq m0, m0, q3120 mova [dstq+strideq*0], m0 W_MASK 0, 5, 2, 3 packuswb m4, m5 psubb m5, m8, m4 pavgb m5, m9 vpermq m0, m0, q3120 vpermd m5, m10, m5 mova [dstq+strideq*1], m0 mova [maskq], m5 sub hd, 2 jg .w32_loop RET .w64_loop: add tmp1q, 32*4 add tmp2q, 32*4 W_MASK 0, 4, 0, 1 add dstq, strideq add maskq, 32 .w64: vpermq m0, m0, q3120 mova [dstq+32*0], m0 W_MASK 0, 5, 2, 3 packuswb m4, m5 psubb m5, m8, m4 pavgb m5, m9 vpermq m0, m0, q3120 vpermd m5, m10, m5 mova [dstq+32*1], m0 mova [maskq], m5 dec hd jg .w64_loop RET .w128_loop: add tmp1q, 32*8 add tmp2q, 32*8 W_MASK 0, 4, 0, 1 add dstq, strideq add maskq, 32*2 .w128: vpermq m0, m0, q3120 mova [dstq+32*0], m0 W_MASK 0, 5, 2, 3 packuswb m4, m5 psubb m5, m8, m4 pavgb m5, m9 vpermq m0, m0, q3120 vpermd m5, m10, m5 mova [dstq+32*1], m0 mova [maskq+32*0], m5 W_MASK 0, 4, 4, 5 vpermq m0, m0, q3120 mova [dstq+32*2], m0 W_MASK 0, 5, 6, 7 packuswb m4, m5 psubb m5, m8, m4 pavgb m5, m9 vpermq m0, m0, q3120 vpermd m5, m10, m5 mova [dstq+32*3], m0 mova [maskq+32*1], m5 dec hd jg .w128_loop RET cglobal w_mask_444_8bpc, 4, 8, 8, dst, stride, tmp1, tmp2, w, h, mask, stride3 %define base r7-w_mask_444_avx2_table lea r7, [w_mask_444_avx2_table] tzcnt wd, wm movifnidn hd, hm mov maskq, maskmp movsxd wq, dword [r7+wq*4] vpbroadcastd m6, [base+pw_6903] ; ((64 - 38) << 8) + 255 - 8 vpbroadcastd m5, [base+pb_64] vpbroadcastd m7, [base+pw_2048] add wq, r7 W_MASK 0, 4, 0, 1, 1 lea stride3q, [strideq*3] jmp wq .w4: vextracti128 xm1, m0, 1 movd [dstq+strideq*0], xm0 pextrd [dstq+strideq*1], xm0, 1 movd [dstq+strideq*2], xm1 pextrd [dstq+stride3q ], xm1, 1 mova [maskq+32*0], m4 cmp hd, 8 jl .w4_end lea dstq, [dstq+strideq*4] pextrd [dstq+strideq*0], xm0, 2 pextrd [dstq+strideq*1], xm0, 3 pextrd [dstq+strideq*2], xm1, 2 pextrd [dstq+stride3q ], xm1, 3 je .w4_end W_MASK 0, 4, 2, 3, 1 lea dstq, [dstq+strideq*4] vextracti128 xm1, m0, 1 movd [dstq+strideq*0], xm0 pextrd [dstq+strideq*1], xm0, 1 movd [dstq+strideq*2], xm1 pextrd [dstq+stride3q ], xm1, 1 lea dstq, [dstq+strideq*4] pextrd [dstq+strideq*0], xm0, 2 pextrd [dstq+strideq*1], xm0, 3 pextrd [dstq+strideq*2], xm1, 2 pextrd [dstq+stride3q ], xm1, 3 mova [maskq+32*1], m4 .w4_end: RET .w8_loop: add tmp1q, 32*2 add tmp2q, 32*2 W_MASK 0, 4, 0, 1, 1 lea dstq, [dstq+strideq*4] add maskq, 32 .w8: vextracti128 xm1, m0, 1 movq [dstq+strideq*0], xm0 movq [dstq+strideq*1], xm1 movhps [dstq+strideq*2], xm0 movhps [dstq+stride3q ], xm1 mova [maskq], m4 sub hd, 4 jg .w8_loop RET .w16_loop: add tmp1q, 32*2 add tmp2q, 32*2 W_MASK 0, 4, 
0, 1, 1 lea dstq, [dstq+strideq*2] add maskq, 32 .w16: vpermq m0, m0, q3120 mova [dstq+strideq*0], xm0 vextracti128 [dstq+strideq*1], m0, 1 mova [maskq], m4 sub hd, 2 jg .w16_loop RET .w32_loop: add tmp1q, 32*2 add tmp2q, 32*2 W_MASK 0, 4, 0, 1, 1 add dstq, strideq add maskq, 32 .w32: vpermq m0, m0, q3120 mova [dstq], m0 mova [maskq], m4 dec hd jg .w32_loop RET .w64_loop: add tmp1q, 32*4 add tmp2q, 32*4 W_MASK 0, 4, 0, 1, 1 add dstq, strideq add maskq, 32*2 .w64: vpermq m0, m0, q3120 mova [dstq+32*0], m0 mova [maskq+32*0], m4 W_MASK 0, 4, 2, 3, 1 vpermq m0, m0, q3120 mova [dstq+32*1], m0 mova [maskq+32*1], m4 dec hd jg .w64_loop RET .w128_loop: add tmp1q, 32*8 add tmp2q, 32*8 W_MASK 0, 4, 0, 1, 1 add dstq, strideq add maskq, 32*4 .w128: vpermq m0, m0, q3120 mova [dstq+32*0], m0 mova [maskq+32*0], m4 W_MASK 0, 4, 2, 3, 1 vpermq m0, m0, q3120 mova [dstq+32*1], m0 mova [maskq+32*1], m4 W_MASK 0, 4, 4, 5, 1 vpermq m0, m0, q3120 mova [dstq+32*2], m0 mova [maskq+32*2], m4 W_MASK 0, 4, 6, 7, 1 vpermq m0, m0, q3120 mova [dstq+32*3], m0 mova [maskq+32*3], m4 dec hd jg .w128_loop RET %endif ; ARCH_X86_64 av-scenechange-0.14.1/src/asm/x86/mc_avx512.asm000064400000000000000000005001711046102023000167650ustar 00000000000000; Copyright © 2020, VideoLAN and dav1d authors ; Copyright © 2020, Two Orioles, LLC ; All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions are met: ; ; 1. Redistributions of source code must retain the above copyright notice, this ; list of conditions and the following disclaimer. ; ; 2. Redistributions in binary form must reproduce the above copyright notice, ; this list of conditions and the following disclaimer in the documentation ; and/or other materials provided with the distribution. ; ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
%include "config.asm" %include "src/asm/x86/x86inc.asm" %if ARCH_X86_64 SECTION_RODATA 64 obmc_masks: pw_512: times 2 dw 512 ; 2 db 45, 19, 64, 0 ; 4 db 39, 25, 50, 14, 59, 5, 64, 0 ; 8 db 36, 28, 42, 22, 48, 16, 53, 11, 57, 7, 61, 3, 64, 0, 64, 0 ; 16 db 34, 30, 37, 27, 40, 24, 43, 21, 46, 18, 49, 15, 52, 12, 54, 10 db 56, 8, 58, 6, 60, 4, 61, 3, 64, 0, 64, 0, 64, 0, 64, 0 ; 32 db 33, 31, 35, 29, 36, 28, 38, 26, 40, 24, 41, 23, 43, 21, 44, 20 db 45, 19, 47, 17, 48, 16, 50, 14, 51, 13, 52, 12, 53, 11, 55, 9 db 56, 8, 57, 7, 58, 6, 59, 5, 60, 4, 60, 4, 61, 3, 62, 2 db 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0 warp_8x8_permA: db 4, 5, 6, 7, 16, 17, 18, 19, 5, 6, 7, 8, 17, 18, 19, 20 db 6, 7, 8, 9, 18, 19, 20, 21, 7, 8, 9, 10, 19, 20, 21, 22 db 8, 9, 10, 11, 20, 21, 22, 23, 9, 10, 11, 12, 21, 22, 23, 24 db 10, 11, 12, 13, 22, 23, 24, 25, 11, 12, 13, 14, 23, 24, 25, 26 warp_8x8_permB: db 0, 1, 2, 3, 20, 21, 22, 23, 1, 2, 3, 4, 21, 22, 23, 24 db 2, 3, 4, 5, 22, 23, 24, 25, 3, 4, 5, 6, 23, 24, 25, 26 db 4, 5, 6, 7, 24, 25, 26, 27, 5, 6, 7, 8, 25, 26, 27, 28 db 6, 7, 8, 9, 26, 27, 28, 29, 7, 8, 9, 10, 27, 28, 29, 30 warp_8x8_permC: db -1, 0, -1, 1, -1, 8, -1, 9, -1, 4, -1, 5, -1, 12, -1, 13 warp_8x8_permD: db -1, 2, -1, 3, -1, 10, -1, 11, -1, 6, -1, 7, -1, 14, -1, 15 pd_0to7: dd 0, 1, 2, 3, 4, 5, 6, 7 warp_8x8_hpack: db 3, 11, 3, 11, 35, 43, 35, 43 pd_16384: dd 16384 pd_262144: dd 262144 warp_8x8_end: db 0, 4, 16, 20, 32, 36, 48, 52, 2, 6, 18, 22, 34, 38, 50, 54 warp_8x8t_end: db 2, 3, 10, 11, 18, 19, 26, 27, 34, 35, 42, 43, 50, 51, 58, 59 db 6, 7, 14, 15, 22, 23, 30, 31, 38, 39, 46, 47, 54, 55, 62, 63 bidir_sctr_w4: dd 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 wm_420_perm4: db 1, 3, 9, 11, 5, 7, 13, 15, 17, 19, 25, 27, 21, 23, 29, 31 db 33, 35, 41, 43, 37, 39, 45, 47, 49, 51, 57, 59, 53, 55, 61, 63 db 0, 2, 8, 10, 4, 6, 12, 14, 16, 18, 24, 26, 20, 22, 28, 30 db 32, 34, 40, 42, 36, 38, 44, 46, 48, 50, 56, 58, 52, 54, 60, 62 wm_420_perm8: db 1, 3, 17, 19, 5, 7, 21, 23, 9, 11, 25, 27, 13, 15, 29, 31 db 33, 35, 49, 51, 37, 39, 53, 55, 41, 43, 57, 59, 45, 47, 61, 63 db 0, 2, 16, 18, 4, 6, 20, 22, 8, 10, 24, 26, 12, 14, 28, 30 db 32, 34, 48, 50, 36, 38, 52, 54, 40, 42, 56, 58, 44, 46, 60, 62 wm_420_perm16: db 1, 3, 33, 35, 5, 7, 37, 39, 9, 11, 41, 43, 13, 15, 45, 47 db 17, 19, 49, 51, 21, 23, 53, 55, 25, 27, 57, 59, 29, 31, 61, 63 db 0, 2, 32, 34, 4, 6, 36, 38, 8, 10, 40, 42, 12, 14, 44, 46 db 16, 18, 48, 50, 20, 22, 52, 54, 24, 26, 56, 58, 28, 30, 60, 62 wm_420_mask: db 3, 7, 11, 15, 19, 23, 27, 31, 35, 39, 43, 47, 51, 55, 59, 63 db 67, 71, 75, 79, 83, 87, 91, 95, 99,103,107,111,115,119,123,127 db 1, 5, 9, 13, 17, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57, 61 db 65, 69, 73, 77, 81, 85, 89, 93, 97,101,105,109,113,117,121,125 wm_422_mask: db 2, 6, 10, 14, 18, 22, 26, 30, 34, 38, 42, 46, 50, 54, 58, 62 db 1, 5, 9, 13, 17, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57, 61 db 66, 70, 74, 78, 82, 86, 90, 94, 98,102,106,110,114,118,122,126 db 65, 69, 73, 77, 81, 85, 89, 93, 97,101,105,109,113,117,121,125 wm_444_mask: db 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 db 33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63 db 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 db 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62 bilin_h_perm16: db 1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7 db 9, 8, 10, 9, 11, 10, 12, 11, 13, 12, 14, 13, 15, 14, 16, 15 db 33, 32, 34, 33, 35, 34, 36, 35, 37, 36, 38, 37, 39, 38, 40, 39 db 41, 40, 42, 41, 43, 42, 44, 43, 
45, 44, 46, 45, 47, 46, 48, 47 bilin_h_perm32: db 1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7 db 9, 8, 10, 9, 11, 10, 12, 11, 13, 12, 14, 13, 15, 14, 16, 15 db 17, 16, 18, 17, 19, 18, 20, 19, 21, 20, 22, 21, 23, 22, 24, 23 db 25, 24, 26, 25, 27, 26, 28, 27, 29, 28, 30, 29, 31, 30, 32, 31 bilin_v_perm8: db 16, 0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7 db 80, 16, 81, 17, 82, 18, 83, 19, 84, 20, 85, 21, 86, 22, 87, 23 db 32, 80, 33, 81, 34, 82, 35, 83, 36, 84, 37, 85, 38, 86, 39, 87 db 64, 32, 65, 33, 66, 34, 67, 35, 68, 36, 69, 37, 70, 38, 71, 39 bilin_v_perm16: db 16, 0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7 db 24, 8, 25, 9, 26, 10, 27, 11, 28, 12, 29, 13, 30, 14, 31, 15 db 64, 16, 65, 17, 66, 18, 67, 19, 68, 20, 69, 21, 70, 22, 71, 23 db 72, 24, 73, 25, 74, 26, 75, 27, 76, 28, 77, 29, 78, 30, 79, 31 bilin_v_perm32: db 64, 0, 65, 1, 66, 2, 67, 3, 68, 4, 69, 5, 70, 6, 71, 7 db 72, 8, 73, 9, 74, 10, 75, 11, 76, 12, 77, 13, 78, 14, 79, 15 db 80, 16, 81, 17, 82, 18, 83, 19, 84, 20, 85, 21, 86, 22, 87, 23 db 88, 24, 89, 25, 90, 26, 91, 27, 92, 28, 93, 29, 94, 30, 95, 31 bilin_v_perm64: dq 0, 4, 1, 5, 2, 6, 3, 7 spel_h_perm16a: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 db 32, 33, 34, 35, 33, 34, 35, 36, 34, 35, 36, 37, 35, 36, 37, 38 db 40, 41, 42, 43, 41, 42, 43, 44, 42, 43, 44, 45, 43, 44, 45, 46 spel_h_perm16b: db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 db 12, 13, 14, 15, 13, 14, 15, 16, 14, 15, 16, 17, 15, 16, 17, 18 db 36, 37, 38, 39, 37, 38, 39, 40, 38, 39, 40, 41, 39, 40, 41, 42 db 44, 45, 46, 47, 45, 46, 47, 48, 46, 47, 48, 49, 47, 48, 49, 50 spel_h_perm16c: db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 db 16, 17, 18, 19, 17, 18, 19, 20, 18, 19, 20, 21, 19, 20, 21, 22 db 40, 41, 42, 43, 41, 42, 43, 44, 42, 43, 44, 45, 43, 44, 45, 46 db 48, 49, 50, 51, 49, 50, 51, 52, 50, 51, 52, 53, 51, 52, 53, 54 spel_h_perm32a: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 db 16, 17, 18, 19, 17, 18, 19, 20, 18, 19, 20, 21, 19, 20, 21, 22 db 24, 25, 26, 27, 25, 26, 27, 28, 26, 27, 28, 29, 27, 28, 29, 30 spel_h_perm32b: db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 db 12, 13, 14, 15, 13, 14, 15, 16, 14, 15, 16, 17, 15, 16, 17, 18 db 20, 21, 22, 23, 21, 22, 23, 24, 22, 23, 24, 25, 23, 24, 25, 26 db 28, 29, 30, 31, 29, 30, 31, 32, 30, 31, 32, 33, 31, 32, 33, 34 spel_h_perm32c: db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 db 16, 17, 18, 19, 17, 18, 19, 20, 18, 19, 20, 21, 19, 20, 21, 22 db 24, 25, 26, 27, 25, 26, 27, 28, 26, 27, 28, 29, 27, 28, 29, 30 db 32, 33, 34, 35, 33, 34, 35, 36, 34, 35, 36, 37, 35, 36, 37, 38 spel_v_perm16: db 32, 0, 33, 1, 34, 2, 35, 3, 36, 4, 37, 5, 38, 6, 39, 7 db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15 db 40, 16, 41, 17, 42, 18, 43, 19, 44, 20, 45, 21, 46, 22, 47, 23 db 16, 24, 17, 25, 18, 26, 19, 27, 20, 28, 21, 29, 22, 30, 23, 31 spel_v_perm32: db 0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39 db 8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47 db 16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55 db 24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63 spel_hv_perm4a: db 8, 9, 16, 17, 10, 11, 18, 19, 12, 13, 20, 21, 14, 15, 22, 23 db 16, 17, 24, 25, 18, 19, 26, 27, 20, 21, 28, 29, 22, 23, 30, 31 spel_hv_perm4b: db 24, 25, 32, 33, 26, 27, 34, 35, 28, 29, 36, 37, 30, 31, 38, 39 db 32, 33, 40, 41, 34, 35, 42, 43, 36, 37, 
44, 45, 38, 39, 46, 47 spel_hv_perm4c: db 40, 41, 48, 49, 42, 43, 50, 51, 44, 45, 52, 53, 46, 47, 54, 55 db 48, 49, 56, 57, 50, 51, 58, 59, 52, 53, 60, 61, 54, 55, 62, 63 spel_hv_perm4d: db 18, 19, 0, 1, 22, 23, 4, 5, 26, 27, 8, 9, 30, 31, 12, 13 db 0, 1, 16, 17, 4, 5, 20, 21, 8, 9, 24, 25, 12, 13, 28, 29 spel_hv_perm8a: db 0, 1, 16, 17, 2, 3, 18, 19, 4, 5, 20, 21, 6, 7, 22, 23 db 8, 9, 24, 25, 10, 11, 26, 27, 12, 13, 28, 29, 14, 15, 30, 31 db 16, 17, 32, 33, 18, 19, 34, 35, 20, 21, 36, 37, 22, 23, 38, 39 db 24, 25, 40, 41, 26, 27, 42, 43, 28, 29, 44, 45, 30, 31, 46, 47 spel_hv_perm8b: db 32, 33, 48, 49, 34, 35, 50, 51, 36, 37, 52, 53, 38, 39, 54, 55 db 40, 41, 56, 57, 42, 43, 58, 59, 44, 45, 60, 61, 46, 47, 62, 63 db 48, 49, 64, 65, 50, 51, 66, 67, 52, 53, 68, 69, 54, 55, 70, 71 db 56, 57, 72, 73, 58, 59, 74, 75, 60, 61, 76, 77, 62, 63, 78, 79 spel_hv_perm8c: db 34, 35, 0, 1, 38, 39, 4, 5, 42, 43, 8, 9, 46, 47, 12, 13 db 50, 51, 16, 17, 54, 55, 20, 21, 58, 59, 24, 25, 62, 63, 28, 29 db 0, 1, 32, 33, 4, 5, 36, 37, 8, 9, 40, 41, 12, 13, 44, 45 db 16, 17, 48, 49, 20, 21, 52, 53, 24, 25, 56, 57, 28, 29, 60, 61 spel_hv_end16: db 1, 3, 17, 19, 5, 7, 21, 23, 33, 35, 49, 51, 37, 39, 53, 55 db 9, 11, 25, 27, 13, 15, 29, 31, 41, 43, 57, 59, 45, 47, 61, 63 spel_hv_perm16a:db 0, 1, 2, 3, 32, 33, 34, 35, 1, 2, 3, 4, 33, 34, 35, 36 db 2, 3, 4, 5, 34, 35, 36, 37, 3, 4, 5, 6, 35, 36, 37, 38 spel_hv_perm16c:db 8, 9, 10, 11, 40, 41, 42, 43, 9, 10, 11, 12, 41, 42, 43, 44 db 10, 11, 12, 13, 42, 43, 44, 45, 11, 12, 13, 14, 43, 44, 45, 46 db 16, 17, 18, 19, 48, 49, 50, 51, 17, 18, 19, 20, 49, 50, 51, 52 db 18, 19, 20, 21, 50, 51, 52, 53, 19, 20, 21, 22, 51, 52, 53, 54 spel_hv_perm16b:db 4, 5, 6, 7, 36, 37, 38, 39, 5, 6, 7, 8, 37, 38, 39, 40 db 6, 7, 8, 9, 38, 39, 40, 41, 7, 8, 9, 10, 39, 40, 41, 42 db 12, 13, 14, 15, 44, 45, 46, 47, 13, 14, 15, 16, 45, 46, 47, 48 db 14, 15, 16, 17, 46, 47, 48, 49, 15, 16, 17, 18, 47, 48, 49, 50 spel_hv_perm16d:db 0, 1, 2, 3, 1, 2, 3, 4, 4, 5, 6, 7, 5, 6, 7, 8 db 2, 3, 4, 5, 3, 4, 5, 6, 6, 7, 8, 9, 7, 8, 9, 10 db 8, 9, 10, 11, 9, 10, 11, 12, 12, 13, 14, 15, 13, 14, 15, 16 db 10, 11, 12, 13, 11, 12, 13, 14, 14, 15, 16, 17, 15, 16, 17, 18 spel_hv_perm16e:db 4, 5, 6, 7, 5, 6, 7, 8, 8, 9, 10, 11, 9, 10, 11, 12 db 6, 7, 8, 9, 7, 8, 9, 10, 10, 11, 12, 13, 11, 12, 13, 14 db 12, 13, 14, 15, 13, 14, 15, 16, 16, 17, 18, 19, 17, 18, 19, 20 db 14, 15, 16, 17, 15, 16, 17, 18, 18, 19, 20, 21, 19, 20, 21, 22 spel_hv_end: db 1, 3, 5, 7, 17, 19, 21, 23, 33, 35, 37, 39, 49, 51, 53, 55 deint_shuf4: db 0, 4, 1, 5, 2, 6, 3, 7, 4, 8, 5, 9, 6, 10, 7, 11 subpel_h_shuf4: db 0, 1, 2, 3, 1, 2, 3, 4, 8, 9, 10, 11, 9, 10, 11, 12 db 2, 3, 4, 5, 3, 4, 5, 6, 10, 11, 12, 13, 11, 12, 13, 14 subpel_h_shufA: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 subpel_h_shufB: db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 subpel_h_shufC: db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 bilin_h_shuf4: db 1, 0, 2, 1, 3, 2, 4, 3, 9, 8, 10, 9, 11, 10, 12, 11 bilin_h_shuf8: db 1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7 bilin_v_shuf4: db 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7 blend_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3 rescale_mul: dd 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 resize_shuf: db 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7 resize_permA: dd 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 resize_permB: dd 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 resize_permC: dd 0, 4, 8, 12 pb_02461357: db 0, 2, 4, 6, 1, 3, 5, 7 
wm_420_perm64: dq 0xfedcba9876543210 wm_sign: dd 0x40804080, 0xc0c0c0c0, 0x40404040 pb_8x0_8x8: times 8 db 0 times 8 db 8 pb_127: times 4 db 127 pw_m128 times 2 dw -128 pw_m256: times 2 dw -256 pw_1024: times 2 dw 1024 pw_2048: times 2 dw 2048 pw_6903: times 2 dw 6903 pw_8192: times 2 dw 8192 pd_32: dd 32 pd_34: dd 34 pd_63: dd 63 pd_512: dd 512 pd_32768: dd 32768 %define pb_m64 (wm_sign+4) %define pb_64 (wm_sign+8) %define pd_2 (pd_0to7+8) cextern mc_subpel_filters %define subpel_filters (mangle(private_prefix %+ _mc_subpel_filters)-8) cextern mc_warp_filter cextern resize_filter %macro BASE_JMP_TABLE 3-* %xdefine %1_%2_table (%%table - %3) %xdefine %%base %1_%2 %%table: %rep %0 - 2 dw %%base %+ _w%3 - %%base %rotate 1 %endrep %endmacro %macro HV_JMP_TABLE 5-* %xdefine %%prefix mangle(private_prefix %+ _%1_%2_8bpc_%3) %xdefine %%base %1_%3 %assign %%types %4 %if %%types & 1 %xdefine %1_%2_h_%3_table (%%h - %5) %%h: %rep %0 - 4 dw %%prefix %+ .h_w%5 - %%base %rotate 1 %endrep %rotate 4 %endif %if %%types & 2 %xdefine %1_%2_v_%3_table (%%v - %5) %%v: %rep %0 - 4 dw %%prefix %+ .v_w%5 - %%base %rotate 1 %endrep %rotate 4 %endif %if %%types & 4 %xdefine %1_%2_hv_%3_table (%%hv - %5) %%hv: %rep %0 - 4 dw %%prefix %+ .hv_w%5 - %%base %rotate 1 %endrep %endif %endmacro %macro BIDIR_JMP_TABLE 2-* %xdefine %1_%2_table (%%table - 2*%3) %xdefine %%base %1_%2_table %xdefine %%prefix mangle(private_prefix %+ _%1_8bpc_%2) %%table: %rep %0 - 2 dd %%prefix %+ .w%3 - %%base %rotate 1 %endrep %endmacro %xdefine put_avx512icl mangle(private_prefix %+ _put_bilin_8bpc_avx512icl.put) %xdefine prep_avx512icl mangle(private_prefix %+ _prep_bilin_8bpc_avx512icl.prep) %define table_offset(type, fn) type %+ fn %+ SUFFIX %+ _table - type %+ SUFFIX BASE_JMP_TABLE put, avx512icl, 2, 4, 8, 16, 32, 64, 128 BASE_JMP_TABLE prep, avx512icl, 4, 8, 16, 32, 64, 128 HV_JMP_TABLE put, bilin, avx512icl, 7, 2, 4, 8, 16, 32, 64, 128 HV_JMP_TABLE prep, bilin, avx512icl, 7, 4, 8, 16, 32, 64, 128 HV_JMP_TABLE put, 8tap, avx512icl, 3, 2, 4, 8, 16, 32, 64, 128 HV_JMP_TABLE prep, 8tap, avx512icl, 7, 4, 8, 16, 32, 64, 128 BIDIR_JMP_TABLE avg, avx512icl, 4, 8, 16, 32, 64, 128 BIDIR_JMP_TABLE w_avg, avx512icl, 4, 8, 16, 32, 64, 128 BIDIR_JMP_TABLE mask, avx512icl, 4, 8, 16, 32, 64, 128 BIDIR_JMP_TABLE w_mask_420, avx512icl, 4, 8, 16, 32, 64, 128 BIDIR_JMP_TABLE w_mask_422, avx512icl, 4, 8, 16, 32, 64, 128 BIDIR_JMP_TABLE w_mask_444, avx512icl, 4, 8, 16, 32, 64, 128 BIDIR_JMP_TABLE blend, avx512icl, 4, 8, 16, 32 BIDIR_JMP_TABLE blend_v, avx512icl, 2, 4, 8, 16, 32 BIDIR_JMP_TABLE blend_h, avx512icl, 2, 4, 8, 16, 32, 64, 128 SECTION .text %macro WRAP_YMM 1+ INIT_YMM cpuname %1 INIT_ZMM cpuname %endmacro INIT_ZMM avx512icl cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w, h, mxy movifnidn mxyd, r6m ; mx lea r7, [put_avx512icl] tzcnt wd, wm movifnidn hd, hm test mxyd, mxyd jnz .h mov mxyd, r7m ; my test mxyd, mxyd jnz .v .put: movzx wd, word [r7+wq*2+table_offset(put,)] add wq, r7 jmp wq .put_w2: movzx r6d, word [srcq+ssq*0] movzx r7d, word [srcq+ssq*1] lea srcq, [srcq+ssq*2] mov [dstq+dsq*0], r6w mov [dstq+dsq*1], r7w lea dstq, [dstq+dsq*2] sub hd, 2 jg .put_w2 RET .put_w4: mov r6d, [srcq+ssq*0] mov r7d, [srcq+ssq*1] lea srcq, [srcq+ssq*2] mov [dstq+dsq*0], r6d mov [dstq+dsq*1], r7d lea dstq, [dstq+dsq*2] sub hd, 2 jg .put_w4 RET .put_w8: mov r6, [srcq+ssq*0] mov r7, [srcq+ssq*1] lea srcq, [srcq+ssq*2] mov [dstq+dsq*0], r6 mov [dstq+dsq*1], r7 lea dstq, [dstq+dsq*2] sub hd, 2 jg .put_w8 RET .put_w16: movu xmm0, [srcq+ssq*0] movu xmm1, 
[srcq+ssq*1] lea srcq, [srcq+ssq*2] mova [dstq+dsq*0], xmm0 mova [dstq+dsq*1], xmm1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .put_w16 RET .put_w32: movu ym0, [srcq+ssq*0] movu ym1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] mova [dstq+dsq*0], ym0 mova [dstq+dsq*1], ym1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .put_w32 RET .put_w64: movu m0, [srcq+ssq*0] movu m1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] mova [dstq+dsq*0], m0 mova [dstq+dsq*1], m1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .put_w64 RET .put_w128: movu m0, [srcq+ssq*0+64*0] movu m1, [srcq+ssq*0+64*1] movu m2, [srcq+ssq*1+64*0] movu m3, [srcq+ssq*1+64*1] lea srcq, [srcq+ssq*2] mova [dstq+dsq*0+64*0], m0 mova [dstq+dsq*0+64*1], m1 mova [dstq+dsq*1+64*0], m2 mova [dstq+dsq*1+64*1], m3 lea dstq, [dstq+dsq*2] sub hd, 2 jg .put_w128 RET .h: ; (16 * src[x] + (mx * (src[x + 1] - src[x])) + 8) >> 4 ; = ((16 - mx) * src[x] + mx * src[x + 1] + 8) >> 4 imul mxyd, 0xff01 vbroadcasti128 m4, [bilin_h_shuf8] add mxyd, 16 << 8 vpbroadcastw m5, mxyd mov mxyd, r7m ; my test mxyd, mxyd jnz .hv movzx wd, word [r7+wq*2+table_offset(put, _bilin_h)] vpbroadcastd m3, [pw_2048] add wq, r7 jmp wq .h_w2: movd xmm0, [srcq+ssq*0] pinsrd xmm0, [srcq+ssq*1], 1 lea srcq, [srcq+ssq*2] pshufb xmm0, xm4 pmaddubsw xmm0, xm5 pmulhrsw xmm0, xm3 packuswb xmm0, xmm0 pextrw [dstq+dsq*0], xmm0, 0 pextrw [dstq+dsq*1], xmm0, 2 lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w2 RET .h_w4: mova xmm4, [bilin_h_shuf4] .h_w4_loop: movq xmm0, [srcq+ssq*0] movhps xmm0, [srcq+ssq*1] lea srcq, [srcq+ssq*2] pshufb xmm0, xmm4 pmaddubsw xmm0, xm5 pmulhrsw xmm0, xm3 packuswb xmm0, xmm0 movd [dstq+dsq*0], xmm0 pextrd [dstq+dsq*1], xmm0, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w4_loop RET .h_w8: movu xm0, [srcq+ssq*0] vinserti32x4 ym0, [srcq+ssq*1], 1 lea srcq, [srcq+ssq*2] pshufb ym0, ym4 pmaddubsw ym0, ym5 pmulhrsw ym0, ym3 vpmovuswb xm0, ym0 movq [dstq+dsq*0], xm0 movhps [dstq+dsq*1], xm0 lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w8 RET .h_w16: mova m4, [bilin_h_perm16] .h_w16_loop: movu ym0, [srcq+ssq*0] vinserti32x8 m0, [srcq+ssq*1], 1 lea srcq, [srcq+ssq*2] vpermb m0, m4, m0 pmaddubsw m0, m5 pmulhrsw m0, m3 vpmovuswb ym0, m0 mova [dstq+dsq*0], xm0 vextracti128 [dstq+dsq*1], ym0, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w16_loop RET .h_w32: movu ym0, [srcq+ssq*0+8*0] vinserti32x8 m0, [srcq+ssq*1+8*0], 1 movu ym1, [srcq+ssq*0+8*1] vinserti32x8 m1, [srcq+ssq*1+8*1], 1 lea srcq, [srcq+ssq*2] pshufb m0, m4 pshufb m1, m4 pmaddubsw m0, m5 pmaddubsw m1, m5 pmulhrsw m0, m3 pmulhrsw m1, m3 packuswb m0, m1 mova [dstq+dsq*0], ym0 vextracti32x8 [dstq+dsq*1], m0, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w32 RET .h_w64: movu m0, [srcq+8*0] movu m1, [srcq+8*1] pshufb m0, m4 pshufb m1, m4 pmaddubsw m0, m5 pmaddubsw m1, m5 pmulhrsw m0, m3 pmulhrsw m1, m3 packuswb m0, m1 add srcq, ssq mova [dstq], m0 add dstq, dsq dec hd jg .h_w64 RET .h_w128: movu m0, [srcq+8*0] movu m2, [srcq+8*1] movu m1, [srcq+8*8] movu m6, [srcq+8*9] add srcq, ssq REPX {pshufb x, m4}, m0, m2, m1, m6 REPX {pmaddubsw x, m5}, m0, m2, m1, m6 REPX {pmulhrsw x, m3}, m0, m2, m1, m6 packuswb m0, m2 packuswb m1, m6 mova [dstq+64*0], m0 mova [dstq+64*1], m1 add dstq, dsq dec hd jg .h_w128 RET .v: movzx wd, word [r7+wq*2+table_offset(put, _bilin_v)] imul mxyd, 0xff01 vpbroadcastd m5, [pw_2048] add mxyd, 16 << 8 add wq, r7 vpbroadcastw m4, mxyd jmp wq .v_w2: movd xmm0, [srcq+ssq*0] .v_w2_loop: pinsrw xmm1, xmm0, [srcq+ssq*1], 1 ; 0 1 lea srcq, [srcq+ssq*2] pinsrw xmm0, xmm1, [srcq+ssq*0], 0 ; 2 1 pshuflw xmm1, xmm1, q2301 ; 1 0 punpcklbw xmm1, xmm0, xmm1 pmaddubsw xmm1, xm4 
pmulhrsw xmm1, xm5 packuswb xmm1, xmm1 pextrw [dstq+dsq*0], xmm1, 1 pextrw [dstq+dsq*1], xmm1, 0 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w2_loop RET .v_w4: movd xmm0, [srcq+ssq*0] .v_w4_loop: vpbroadcastd xmm1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] vpblendd xmm2, xmm1, xmm0, 0x01 ; 0 1 vpbroadcastd xmm0, [srcq+ssq*0] vpblendd xmm1, xmm0, 0x02 ; 1 2 punpcklbw xmm1, xmm2 pmaddubsw xmm1, xm4 pmulhrsw xmm1, xm5 packuswb xmm1, xmm1 movd [dstq+dsq*0], xmm1 pextrd [dstq+dsq*1], xmm1, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w4_loop RET .v_w8: movq xmm0, [srcq+ssq*0] .v_w8_loop: movq xmm3, [srcq+ssq*1] lea srcq, [srcq+ssq*2] punpcklbw xmm1, xmm3, xmm0 movq xmm0, [srcq+ssq*0] punpcklbw xmm2, xmm0, xmm3 pmaddubsw xmm1, xm4 pmaddubsw xmm2, xm4 pmulhrsw xmm1, xm5 pmulhrsw xmm2, xm5 packuswb xmm1, xmm2 movq [dstq+dsq*0], xmm1 movhps [dstq+dsq*1], xmm1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w8_loop RET .v_w16: movu xmm0, [srcq+ssq*0] .v_w16_loop: vbroadcasti128 ymm2, [srcq+ssq*1] lea srcq, [srcq+ssq*2] vpblendd ymm3, ymm2, ymm0, 0x0f ; 0 1 vbroadcasti128 ymm0, [srcq+ssq*0] vpblendd ymm2, ymm2, ymm0, 0xf0 ; 1 2 punpcklbw ymm1, ymm2, ymm3 punpckhbw ymm2, ymm3 pmaddubsw ymm1, ym4 pmaddubsw ymm2, ym4 pmulhrsw ymm1, ym5 pmulhrsw ymm2, ym5 packuswb ymm1, ymm2 mova [dstq+dsq*0], xmm1 vextracti128 [dstq+dsq*1], ymm1, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w16_loop vzeroupper RET .v_w32: movu ym0, [srcq+ssq*0] kxnorb k1, k1, k1 .v_w32_loop: vbroadcasti32x8 m2, [srcq+ssq*1] lea srcq, [srcq+ssq*2] vpblendmd m3{k1}, m2, m0 ; 0 1 vbroadcasti32x8 m0, [srcq+ssq*0] vpblendmd m2{k1}, m0, m2 ; 1 2 punpcklbw m1, m2, m3 punpckhbw m2, m3 pmaddubsw m1, m4 pmaddubsw m2, m4 pmulhrsw m1, m5 pmulhrsw m2, m5 packuswb m1, m2 mova [dstq+dsq*0], ym1 vextracti32x8 [dstq+dsq*1], m1, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w32_loop RET .v_w64: movu m0, [srcq+ssq*0] .v_w64_loop: movu m3, [srcq+ssq*1] lea srcq, [srcq+ssq*2] punpcklbw m1, m3, m0 punpckhbw m6, m3, m0 movu m0, [srcq+ssq*0] pmaddubsw m1, m4 pmaddubsw m6, m4 punpcklbw m2, m0, m3 punpckhbw m7, m0, m3 pmaddubsw m2, m4 pmaddubsw m7, m4 REPX {pmulhrsw x, m5}, m1, m6, m2, m7 packuswb m1, m6 packuswb m2, m7 mova [dstq+dsq*0], m1 mova [dstq+dsq*1], m2 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w64_loop RET .v_w128: movu m0, [srcq+64*0] movu m1, [srcq+64*1] .v_w128_loop: add srcq, ssq movu m2, [srcq+64*0] movu m3, [srcq+64*1] punpcklbw m6, m2, m0 pmaddubsw m6, m4 punpckhbw m0, m2, m0 pmaddubsw m0, m4 punpcklbw m7, m3, m1 pmaddubsw m7, m4 punpckhbw m1, m3, m1 pmaddubsw m1, m4 REPX {pmulhrsw x, m5}, m6, m0, m7, m1 packuswb m6, m0 mova m0, m2 packuswb m7, m1 mova m1, m3 mova [dstq+64*0], m6 mova [dstq+64*1], m7 add dstq, dsq dec hd jg .v_w128_loop RET .hv: ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 128) >> 8 ; = (src[x] + ((my * (src[x + src_stride] - src[x])) >> 4) + 8) >> 4 movzx wd, word [r7+wq*2+table_offset(put, _bilin_hv)] WIN64_SPILL_XMM 8 shl mxyd, 11 ; can't shift by 12 due to signed overflow vpbroadcastd m7, [pw_2048] add wq, r7 vpbroadcastw m6, mxyd jmp wq .hv_w2: vpbroadcastd xmm0, [srcq+ssq*0] pshufb xmm0, xm4 pmaddubsw xmm0, xm5 .hv_w2_loop: movd xmm1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] pinsrd xmm1, [srcq+ssq*0], 1 pshufb xmm1, xm4 pmaddubsw xmm1, xm5 ; 1 _ 2 _ shufps xmm2, xmm0, xmm1, q1032 ; 0 _ 1 _ mova xmm0, xmm1 psubw xmm1, xmm2 paddw xmm1, xmm1 pmulhw xmm1, xm6 paddw xmm1, xmm2 pmulhrsw xmm1, xm7 packuswb xmm1, xmm1 pextrw [dstq+dsq*0], xmm1, 0 pextrw [dstq+dsq*1], xmm1, 2 lea dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w2_loop RET .hv_w4: mova xmm4, 
[bilin_h_shuf4] movddup xmm0, [srcq+ssq*0] pshufb xmm0, xmm4 pmaddubsw xmm0, xm5 .hv_w4_loop: movq xmm1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] movhps xmm1, [srcq+ssq*0] pshufb xmm1, xmm4 pmaddubsw xmm1, xm5 ; 1 2 shufps xmm2, xmm0, xmm1, q1032 ; 0 1 mova xmm0, xmm1 psubw xmm1, xmm2 paddw xmm1, xmm1 pmulhw xmm1, xm6 paddw xmm1, xmm2 pmulhrsw xmm1, xm7 packuswb xmm1, xmm1 movd [dstq+dsq*0], xmm1 pextrd [dstq+dsq*1], xmm1, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w4_loop RET .hv_w8: vbroadcasti128 ym0, [srcq+ssq*0] pshufb ym0, ym4 pmaddubsw ym0, ym5 .hv_w8_loop: movu xm1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] vinserti128 ym1, [srcq+ssq*0], 1 pshufb ym1, ym4 pmaddubsw ym1, ym5 ; 1 2 valignq ym2, ym1, ym0, 2 mova ym0, ym1 psubw ym1, ym2 paddw ym1, ym1 pmulhw ym1, ym6 paddw ym1, ym2 pmulhrsw ym1, ym7 vpmovuswb xm1, ym1 movq [dstq+dsq*0], xm1 movhps [dstq+dsq*1], xm1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w8_loop RET .hv_w16: vbroadcasti32x8 m0, [srcq+ssq*0] mova m4, [bilin_h_perm16] vpermb m0, m4, m0 pmaddubsw m0, m5 .hv_w16_loop: movu ym1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] vinserti32x8 m1, [srcq+ssq*0], 1 vpermb m1, m4, m1 pmaddubsw m1, m5 ; 1 2 valignq m2, m1, m0, 4 ; 0 1 mova m0, m1 psubw m1, m2 paddw m1, m1 pmulhw m1, m6 paddw m1, m2 pmulhrsw m1, m7 vpmovuswb ym1, m1 mova [dstq+dsq*0], xm1 vextracti32x4 [dstq+dsq*1], ym1, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w16_loop RET .hv_w32: mova m4, [bilin_h_perm32] vpermb m0, m4, [srcq+ssq*0] pmovzxbq m8, [pb_02461357] pmaddubsw m0, m5 .hv_w32_loop: vpermb m2, m4, [srcq+ssq*1] lea srcq, [srcq+ssq*2] vpermb m3, m4, [srcq+ssq*0] pmaddubsw m2, m5 psubw m1, m2, m0 paddw m1, m1 pmulhw m1, m6 paddw m1, m0 pmaddubsw m0, m3, m5 psubw m3, m0, m2 paddw m3, m3 pmulhw m3, m6 paddw m3, m2 pmulhrsw m1, m7 pmulhrsw m3, m7 packuswb m1, m3 vpermq m1, m8, m1 mova [dstq+dsq*0], ym1 vextracti32x8 [dstq+dsq*1], m1, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w32_loop RET .hv_w64: movu m0, [srcq+8*0] movu m1, [srcq+8*1] pshufb m0, m4 pshufb m1, m4 pmaddubsw m0, m5 pmaddubsw m1, m5 .hv_w64_loop: add srcq, ssq movu m2, [srcq+8*0] movu m3, [srcq+8*1] pshufb m2, m4 pshufb m3, m4 pmaddubsw m2, m5 pmaddubsw m3, m5 psubw m8, m2, m0 psubw m9, m3, m1 paddw m8, m8 pmulhw m8, m6 paddw m9, m9 pmulhw m9, m6 paddw m8, m0 pmulhrsw m8, m7 paddw m9, m1 pmulhrsw m9, m7 mova m0, m2 mova m1, m3 packuswb m8, m9 mova [dstq], m8 add dstq, dsq dec hd jg .hv_w64_loop RET .hv_w128: movu m0, [srcq+8*0] movu m1, [srcq+8*1] movu m2, [srcq+8*8] movu m3, [srcq+8*9] REPX {pshufb x, m4}, m0, m1, m2, m3 REPX {pmaddubsw x, m5}, m0, m1, m2, m3 .hv_w128_loop: add srcq, ssq movu m8, [srcq+8*0] movu m9, [srcq+8*1] movu m10, [srcq+8*8] movu m11, [srcq+8*9] REPX {pshufb x, m4}, m8, m9, m10, m11 REPX {pmaddubsw x, m5}, m8, m9, m10, m11 psubw m12, m8, m0 psubw m13, m9, m1 psubw m14, m10, m2 psubw m15, m11, m3 paddw m12, m12 pmulhw m12, m6 paddw m13, m13 pmulhw m13, m6 paddw m14, m14 pmulhw m14, m6 paddw m15, m15 pmulhw m15, m6 paddw m12, m0 pmulhrsw m12, m7 paddw m13, m1 pmulhrsw m13, m7 paddw m14, m2 pmulhrsw m14, m7 paddw m15, m3 pmulhrsw m15, m7 mova m0, m8 mova m1, m9 mova m2, m10 mova m3, m11 packuswb m12, m13 packuswb m14, m15 mova [dstq+64*0], m12 mova [dstq+64*1], m14 add dstq, dsq dec hd jg .hv_w128_loop RET DECLARE_REG_TMP 3, 5, 6 cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 movifnidn mxyd, r5m ; mx lea t2, [prep_avx512icl] tzcnt wd, wm movifnidn hd, hm test mxyd, mxyd jnz .h mov mxyd, r6m ; my test mxyd, mxyd jnz .v .prep: movzx wd, word [t2+wq*2+table_offset(prep,)] add wq, 
t2 lea stride3q, [strideq*3] jmp wq .prep_w4: movd xmm0, [srcq+strideq*0] pinsrd xmm0, [srcq+strideq*1], 1 pinsrd xmm0, [srcq+strideq*2], 2 pinsrd xmm0, [srcq+stride3q ], 3 lea srcq, [srcq+strideq*4] pmovzxbw ym0, xmm0 psllw ym0, 4 mova [tmpq], ym0 add tmpq, 32 sub hd, 4 jg .prep_w4 RET .prep_w8: movq xmm0, [srcq+strideq*0] movq xmm1, [srcq+strideq*1] vinserti128 ym0, ymm0, [srcq+strideq*2], 1 vinserti128 ym1, ymm1, [srcq+stride3q ], 1 lea srcq, [srcq+strideq*4] punpcklqdq ym0, ym1 pmovzxbw m0, ym0 psllw m0, 4 mova [tmpq], m0 add tmpq, 32*2 sub hd, 4 jg .prep_w8 RET .prep_w16: movu xmm0, [srcq+strideq*0] vinserti128 ym0, ymm0, [srcq+strideq*1], 1 movu xmm1, [srcq+strideq*2] vinserti128 ym1, ymm1, [srcq+stride3q ], 1 lea srcq, [srcq+strideq*4] pmovzxbw m0, ym0 pmovzxbw m1, ym1 psllw m0, 4 psllw m1, 4 mova [tmpq+64*0], m0 mova [tmpq+64*1], m1 add tmpq, 32*4 sub hd, 4 jg .prep_w16 RET .prep_w32: pmovzxbw m0, [srcq+strideq*0] pmovzxbw m1, [srcq+strideq*1] pmovzxbw m2, [srcq+strideq*2] pmovzxbw m3, [srcq+stride3q ] lea srcq, [srcq+strideq*4] REPX {psllw x, 4}, m0, m1, m2, m3 mova [tmpq+64*0], m0 mova [tmpq+64*1], m1 mova [tmpq+64*2], m2 mova [tmpq+64*3], m3 add tmpq, 64*4 sub hd, 4 jg .prep_w32 RET .prep_w64: pmovzxbw m0, [srcq+strideq*0+32*0] pmovzxbw m1, [srcq+strideq*0+32*1] pmovzxbw m2, [srcq+strideq*1+32*0] pmovzxbw m3, [srcq+strideq*1+32*1] lea srcq, [srcq+strideq*2] REPX {psllw x, 4}, m0, m1, m2, m3 mova [tmpq+64*0], m0 mova [tmpq+64*1], m1 mova [tmpq+64*2], m2 mova [tmpq+64*3], m3 add tmpq, 64*4 sub hd, 2 jg .prep_w64 RET .prep_w128: pmovzxbw m0, [srcq+32*0] pmovzxbw m1, [srcq+32*1] pmovzxbw m2, [srcq+32*2] pmovzxbw m3, [srcq+32*3] REPX {psllw x, 4}, m0, m1, m2, m3 mova [tmpq+64*0], m0 mova [tmpq+64*1], m1 mova [tmpq+64*2], m2 mova [tmpq+64*3], m3 add tmpq, 64*4 add srcq, strideq dec hd jg .prep_w128 RET .h: ; 16 * src[x] + (mx * (src[x + 1] - src[x])) ; = (16 - mx) * src[x] + mx * src[x + 1] imul mxyd, 0xff01 add mxyd, 16 << 8 vpbroadcastw m5, mxyd mov mxyd, r6m ; my test mxyd, mxyd jnz .hv movzx wd, word [t2+wq*2+table_offset(prep, _bilin_h)] add wq, t2 lea stride3q, [strideq*3] jmp wq .h_w4: vbroadcasti32x4 ym4, [bilin_h_shuf4] .h_w4_loop: movq xmm0, [srcq+strideq*0] movq xmm1, [srcq+strideq*1] vinserti32x4 ym0, ymm0, [srcq+strideq*2], 1 vinserti32x4 ym1, ymm1, [srcq+stride3q ], 1 lea srcq, [srcq+strideq*4] punpcklqdq ym0, ym1 pshufb ym0, ym4 pmaddubsw ym0, ym5 mova [tmpq], ym0 add tmpq, 32 sub hd, 4 jg .h_w4_loop RET .h_w8: vbroadcasti32x4 m4, [bilin_h_shuf8] .h_w8_loop: movu xmm0, [srcq+strideq*0] vinserti32x4 ym0, ymm0, [srcq+strideq*1], 1 vinserti32x4 m0, [srcq+strideq*2], 2 vinserti32x4 m0, [srcq+stride3q ], 3 lea srcq, [srcq+strideq*4] pshufb m0, m4 pmaddubsw m0, m5 mova [tmpq], m0 add tmpq, 64 sub hd, 4 jg .h_w8_loop RET .h_w16: mova m4, [bilin_h_perm16] .h_w16_loop: movu ym0, [srcq+strideq*0] vinserti32x8 m0, [srcq+strideq*1], 1 movu ym1, [srcq+strideq*2] vinserti32x8 m1, [srcq+stride3q ], 1 lea srcq, [srcq+strideq*4] vpermb m0, m4, m0 vpermb m1, m4, m1 pmaddubsw m0, m5 pmaddubsw m1, m5 mova [tmpq+64*0], m0 mova [tmpq+64*1], m1 add tmpq, 64*2 sub hd, 4 jg .h_w16_loop RET .h_w32: mova m4, [bilin_h_perm32] .h_w32_loop: vpermb m0, m4, [srcq+strideq*0] vpermb m1, m4, [srcq+strideq*1] vpermb m2, m4, [srcq+strideq*2] vpermb m3, m4, [srcq+stride3q ] lea srcq, [srcq+strideq*4] pmaddubsw m0, m5 pmaddubsw m1, m5 pmaddubsw m2, m5 pmaddubsw m3, m5 mova [tmpq+64*0], m0 mova [tmpq+64*1], m1 mova [tmpq+64*2], m2 mova [tmpq+64*3], m3 add tmpq, 64*4 sub hd, 4 jg .h_w32_loop RET .h_w64: mova 
m4, [bilin_h_perm32] .h_w64_loop: vpermb m0, m4, [srcq+strideq*0+32*0] vpermb m1, m4, [srcq+strideq*0+32*1] vpermb m2, m4, [srcq+strideq*1+32*0] vpermb m3, m4, [srcq+strideq*1+32*1] lea srcq, [srcq+strideq*2] pmaddubsw m0, m5 pmaddubsw m1, m5 pmaddubsw m2, m5 pmaddubsw m3, m5 mova [tmpq+64*0], m0 mova [tmpq+64*1], m1 mova [tmpq+64*2], m2 mova [tmpq+64*3], m3 add tmpq, 64*4 sub hd, 2 jg .h_w64_loop RET .h_w128: mova m4, [bilin_h_perm32] .h_w128_loop: vpermb m0, m4, [srcq+32*0] vpermb m1, m4, [srcq+32*1] vpermb m2, m4, [srcq+32*2] vpermb m3, m4, [srcq+32*3] pmaddubsw m0, m5 pmaddubsw m1, m5 pmaddubsw m2, m5 pmaddubsw m3, m5 mova [tmpq+64*0], m0 mova [tmpq+64*1], m1 mova [tmpq+64*2], m2 mova [tmpq+64*3], m3 add tmpq, 64*4 add srcq, strideq dec hd jg .h_w128_loop RET .v: WIN64_SPILL_XMM 7 movzx wd, word [t2+wq*2+table_offset(prep, _bilin_v)] imul mxyd, 0xff01 add mxyd, 16 << 8 add wq, t2 lea stride3q, [strideq*3] vpbroadcastw m6, mxyd jmp wq .v_w4: vpbroadcastd xm0, [srcq+strideq*0] mov r3d, 0x29 vbroadcasti32x4 ym3, [bilin_v_shuf4] kmovb k1, r3d .v_w4_loop: vpblendmd xm1{k1}, xm0, [srcq+strideq*1] {1to4} ; __01 ____ vpbroadcastd ym2, [srcq+strideq*2] vpbroadcastd ym2{k1}, [srcq+stride3q ] ; __2_ 23__ lea srcq, [srcq+strideq*4] vpbroadcastd ym0, [srcq+strideq*0] punpckhqdq ym2{k1}, ym1, ym0 ; 012_ 234_ pshufb ym2, ym3 pmaddubsw ym2, ym6 mova [tmpq], ym2 add tmpq, 32 sub hd, 4 jg .v_w4_loop RET .v_w8: mova m5, [bilin_v_perm8] vbroadcasti32x4 ym0, [srcq+strideq*0] .v_w8_loop: vinserti32x4 ym1, ym0, [srcq+strideq*1], 1 vpbroadcastq ym0, [srcq+strideq*2] vinserti32x4 m1, [srcq+stride3q ], 2 lea srcq, [srcq+strideq*4] vinserti32x4 ym0, [srcq+strideq*0], 0 vpermt2b m1, m5, m0 pmaddubsw m1, m6 mova [tmpq], m1 add tmpq, 64 sub hd, 4 jg .v_w8_loop RET .v_w16: mova m5, [bilin_v_perm16] movu xm0, [srcq+strideq*0] .v_w16_loop: movu xm2, [srcq+strideq*2] vinserti32x4 ym1, ym0, [srcq+strideq*1], 1 vpermt2b m1, m5, m2 vinserti32x4 ym2, [srcq+stride3q ], 1 lea srcq, [srcq+strideq*4] movu xm0, [srcq+strideq*0] vpermt2b m2, m5, m0 pmaddubsw m1, m6 pmaddubsw m2, m6 mova [tmpq+64*0], m1 mova [tmpq+64*1], m2 add tmpq, 64*2 sub hd, 4 jg .v_w16_loop RET .v_w32: mova m5, [bilin_v_perm32] movu ym0, [srcq+strideq*0] .v_w32_loop: movu ym2, [srcq+strideq*1] movu ym3, [srcq+strideq*2] movu ym4, [srcq+stride3q ] lea srcq, [srcq+strideq*4] vpermt2b m0, m5, m2 vpermt2b m2, m5, m3 vpermt2b m3, m5, m4 pmaddubsw m1, m0, m6 movu ym0, [srcq+strideq*0] vpermt2b m4, m5, m0 pmaddubsw m2, m6 pmaddubsw m3, m6 pmaddubsw m4, m6 mova [tmpq+64*0], m1 mova [tmpq+64*1], m2 mova [tmpq+64*2], m3 mova [tmpq+64*3], m4 add tmpq, 64*4 sub hd, 4 jg .v_w32_loop RET .v_w64: mova m5, [bilin_v_perm64] vpermq m0, m5, [srcq+strideq*0] .v_w64_loop: vpermq m1, m5, [srcq+strideq*1] lea srcq, [srcq+strideq*2] punpcklbw m4, m1, m0 punpckhbw m2, m1, m0 vpermq m0, m5, [srcq+strideq*0] punpcklbw m3, m0, m1 punpckhbw m1, m0, m1 pmaddubsw m4, m6 pmaddubsw m2, m6 pmaddubsw m3, m6 pmaddubsw m1, m6 mova [tmpq+64*0], m4 mova [tmpq+64*1], m2 mova [tmpq+64*2], m3 mova [tmpq+64*3], m1 add tmpq, 64*4 sub hd, 2 jg .v_w64_loop RET .v_w128: mova m5, [bilin_v_perm64] vpermq m0, m5, [srcq+strideq*0+ 0] vpermq m1, m5, [srcq+strideq*0+64] .v_w128_loop: vpermq m2, m5, [srcq+strideq*1+ 0] vpermq m3, m5, [srcq+strideq*1+64] lea srcq, [srcq+strideq*2] punpcklbw m4, m2, m0 punpckhbw m0, m2, m0 pmaddubsw m4, m6 pmaddubsw m0, m6 mova [tmpq+64*0], m4 mova [tmpq+64*1], m0 punpcklbw m4, m3, m1 punpckhbw m1, m3, m1 pmaddubsw m4, m6 pmaddubsw m1, m6 mova [tmpq+64*2], m4 mova [tmpq+64*3], 
m1 vpermq m0, m5, [srcq+strideq*0+ 0] vpermq m1, m5, [srcq+strideq*0+64] punpcklbw m4, m0, m2 punpckhbw m2, m0, m2 pmaddubsw m4, m6 pmaddubsw m2, m6 mova [tmpq+64*4], m4 mova [tmpq+64*5], m2 punpcklbw m4, m1, m3 punpckhbw m3, m1, m3 pmaddubsw m4, m6 pmaddubsw m3, m6 mova [tmpq+64*6], m4 mova [tmpq+64*7], m3 add tmpq, 64*8 sub hd, 2 jg .v_w128_loop RET .hv: ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 8) >> 4 ; = src[x] + (((my * (src[x + src_stride] - src[x])) + 8) >> 4) %assign stack_offset stack_offset - stack_size_padded WIN64_SPILL_XMM 7 movzx wd, word [t2+wq*2+table_offset(prep, _bilin_hv)] shl mxyd, 11 vpbroadcastw m6, mxyd add wq, t2 lea stride3q, [strideq*3] jmp wq .hv_w4: vbroadcasti32x4 ym4, [bilin_h_shuf4] vpbroadcastq ym0, [srcq+strideq*0] pshufb ym0, ym4 pmaddubsw ym0, ym5 .hv_w4_loop: movq xmm1, [srcq+strideq*1] movq xmm2, [srcq+strideq*2] vinserti32x4 ym1, ymm1, [srcq+stride3q ], 1 lea srcq, [srcq+strideq*4] vinserti32x4 ym2, ymm2, [srcq+strideq*0], 1 punpcklqdq ym1, ym2 pshufb ym1, ym4 pmaddubsw ym1, ym5 ; 1 2 3 4 valignq ym2, ym1, ym0, 3 ; 0 1 2 3 mova ym0, ym1 psubw ym1, ym2 pmulhrsw ym1, ym6 paddw ym1, ym2 mova [tmpq], ym1 add tmpq, 32 sub hd, 4 jg .hv_w4_loop RET .hv_w8: vbroadcasti32x4 m4, [bilin_h_shuf8] vbroadcasti32x4 m0, [srcq+strideq*0] pshufb m0, m4 pmaddubsw m0, m5 .hv_w8_loop: movu xmm1, [srcq+strideq*1] vinserti128 ym1, ymm1, [srcq+strideq*2], 1 vinserti128 m1, [srcq+stride3q ], 2 lea srcq, [srcq+strideq*4] vinserti128 m1, [srcq+strideq*0], 3 pshufb m1, m4 pmaddubsw m1, m5 ; 1 2 3 4 valignq m2, m1, m0, 6 ; 0 1 2 3 mova m0, m1 psubw m1, m2 pmulhrsw m1, m6 paddw m1, m2 mova [tmpq], m1 add tmpq, 64 sub hd, 4 jg .hv_w8_loop RET .hv_w16: mova m4, [bilin_h_perm16] vbroadcasti32x8 m0, [srcq+strideq*0] vpermb m0, m4, m0 pmaddubsw m0, m5 .hv_w16_loop: movu ym1, [srcq+strideq*1] vinserti32x8 m1, [srcq+strideq*2], 1 movu ym2, [srcq+stride3q ] lea srcq, [srcq+strideq*4] vinserti32x8 m2, [srcq+strideq*0], 1 vpermb m1, m4, m1 vpermb m2, m4, m2 pmaddubsw m1, m5 ; 1 2 vshufi32x4 m3, m0, m1, q1032 ; 0 1 pmaddubsw m0, m2, m5 ; 3 4 vshufi32x4 m2, m1, m0, q1032 ; 2 3 psubw m1, m3 pmulhrsw m1, m6 paddw m1, m3 psubw m3, m0, m2 pmulhrsw m3, m6 paddw m3, m2 mova [tmpq+64*0], m1 mova [tmpq+64*1], m3 add tmpq, 64*2 sub hd, 4 jg .hv_w16_loop RET .hv_w32: mova m4, [bilin_h_perm32] vpermb m0, m4, [srcq+strideq*0] pmaddubsw m0, m5 .hv_w32_loop: vpermb m1, m4, [srcq+strideq*1] lea srcq, [srcq+strideq*2] vpermb m2, m4, [srcq+strideq*0] pmaddubsw m1, m5 psubw m3, m1, m0 pmulhrsw m3, m6 paddw m3, m0 pmaddubsw m0, m2, m5 psubw m2, m0, m1 pmulhrsw m2, m6 paddw m2, m1 mova [tmpq+64*0], m3 mova [tmpq+64*1], m2 add tmpq, 64*2 sub hd, 2 jg .hv_w32_loop RET .hv_w64: mova m4, [bilin_h_perm32] vpermb m0, m4, [srcq+32*0] vpermb m1, m4, [srcq+32*1] pmaddubsw m0, m5 pmaddubsw m1, m5 .hv_w64_loop: add srcq, strideq vpermb m2, m4, [srcq+32*0] vpermb m3, m4, [srcq+32*1] pmaddubsw m2, m5 pmaddubsw m3, m5 psubw m7, m2, m0 psubw m8, m3, m1 pmulhrsw m7, m6 pmulhrsw m8, m6 paddw m7, m0 mova m0, m2 paddw m8, m1 mova m1, m3 mova [tmpq+64*0], m7 mova [tmpq+64*1], m8 add tmpq, 64*2 dec hd jg .hv_w64_loop RET .hv_w128: mova m4, [bilin_h_perm32] vpermb m0, m4, [srcq+32*0] vpermb m1, m4, [srcq+32*1] vpermb m2, m4, [srcq+32*2] vpermb m3, m4, [srcq+32*3] REPX {pmaddubsw x, m5}, m0, m1, m2, m3 .hv_w128_loop: add srcq, strideq vpermb m7, m4, [srcq+32*0] vpermb m8, m4, [srcq+32*1] vpermb m9, m4, [srcq+32*2] vpermb m10, m4, [srcq+32*3] REPX {pmaddubsw x, m5}, m7, m8, m9, m10 psubw m11, m7, m0 psubw m12, m8, m1 
psubw m13, m9, m2 psubw m14, m10, m3 REPX {pmulhrsw x, m6}, m11, m12, m13, m14 paddw m11, m0 mova m0, m7 paddw m12, m1 mova m1, m8 paddw m13, m2 mova m2, m9 paddw m14, m3 mova m3, m10 mova [tmpq+64*0], m11 mova [tmpq+64*1], m12 mova [tmpq+64*2], m13 mova [tmpq+64*3], m14 add tmpq, 64*4 dec hd jg .hv_w128_loop RET ; int8_t subpel_filters[5][15][8] %assign FILTER_REGULAR (0*15 << 16) | 3*15 %assign FILTER_SMOOTH (1*15 << 16) | 4*15 %assign FILTER_SHARP (2*15 << 16) | 3*15 %macro FN 4 ; fn, type, type_h, type_v cglobal %1_%2_8bpc mov t0d, FILTER_%3 %ifidn %3, %4 mov t1d, t0d %else mov t1d, FILTER_%4 %endif %ifnidn %2, regular ; skip the jump in the last filter jmp mangle(private_prefix %+ _%1_8bpc %+ SUFFIX) %endif %endmacro %macro PUT_8TAP_H 4-5 0 ; dst/src, tmp[1-3], vpermb %if %5 vpermb m%2, m6, m%1 vpermb m%3, m7, m%1 vpermb m%4, m8, m%1 %else %if %2 < %4 ; reuse a previous value if possible pshufb m%2, m%1, m6 %endif pshufb m%3, m%1, m7 pshufb m%4, m%1, m8 %endif mova m%1, m5 vpdpbusd m%1, m%2, m9 mova m%2, m5 vpdpbusd m%2, m%3, m9 vpdpbusd m%1, m%3, m10 vpdpbusd m%2, m%4, m10 packusdw m%1, m%2 psrlw m%1, 6 %endmacro %if WIN64 DECLARE_REG_TMP 4, 5 %else DECLARE_REG_TMP 7, 8 %endif %define PUT_8TAP_FN FN put_8tap, PUT_8TAP_FN sharp, SHARP, SHARP PUT_8TAP_FN sharp_smooth, SHARP, SMOOTH PUT_8TAP_FN smooth_sharp, SMOOTH, SHARP PUT_8TAP_FN smooth, SMOOTH, SMOOTH PUT_8TAP_FN sharp_regular, SHARP, REGULAR PUT_8TAP_FN regular_sharp, REGULAR, SHARP PUT_8TAP_FN smooth_regular, SMOOTH, REGULAR PUT_8TAP_FN regular_smooth, REGULAR, SMOOTH PUT_8TAP_FN regular, REGULAR, REGULAR cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 %define base r8-put_avx512icl imul mxd, mxm, 0x010101 add mxd, t0d ; 8tap_h, mx, 4tap_h imul myd, mym, 0x010101 add myd, t1d ; 8tap_v, my, 4tap_v lea r8, [put_avx512icl] movsxd wq, wm movifnidn hd, hm test mxd, 0xf00 jnz .h test myd, 0xf00 jnz .v tzcnt wd, wd movzx wd, word [r8+wq*2+table_offset(put,)] add wq, r8 lea r6, [ssq*3] lea r7, [dsq*3] %if WIN64 pop r8 %endif jmp wq .h: test myd, 0xf00 jnz .hv vpbroadcastd m5, [pd_34] ; 2 + (8 << 2) WIN64_SPILL_XMM 11 cmp wd, 4 jl .h_w2 vbroadcasti128 m6, [subpel_h_shufA] je .h_w4 tzcnt wd, wd vbroadcasti128 m7, [subpel_h_shufB] vbroadcasti128 m8, [subpel_h_shufC] shr mxd, 16 sub srcq, 3 movzx wd, word [r8+wq*2+table_offset(put, _8tap_h)] vpbroadcastd m9, [base+mxq*8+subpel_filters+0] vpbroadcastd m10, [base+mxq*8+subpel_filters+4] add wq, r8 jmp wq .h_w2: movzx mxd, mxb dec srcq mova xmm4, [subpel_h_shuf4] vpbroadcastd xmm3, [base+mxq*8+subpel_filters+2] .h_w2_loop: movq xmm0, [srcq+ssq*0] movhps xmm0, [srcq+ssq*1] lea srcq, [srcq+ssq*2] pshufb xmm0, xmm4 mova xmm1, xm5 vpdpbusd xmm1, xmm0, xmm3 packssdw xmm0, xmm1, xmm1 psraw xmm0, 6 packuswb xmm0, xm0 pextrw [dstq+dsq*0], xmm0, 0 pextrw [dstq+dsq*1], xmm0, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w2_loop RET .h_w4: movzx mxd, mxb dec srcq vpbroadcastd xmm3, [base+mxq*8+subpel_filters+2] .h_w4_loop: movq xmm0, [srcq+ssq*0] movq xmm1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] pshufb xmm0, xm6 pshufb xmm1, xm6 mova xmm2, xm5 vpdpbusd xmm2, xmm0, xmm3 mova xmm0, xm5 vpdpbusd xmm0, xmm1, xmm3 packssdw xmm0, xmm2, xmm0 psraw xmm0, 6 packuswb xmm0, xmm0 movd [dstq+dsq*0], xmm0 pextrd [dstq+dsq*1], xmm0, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w4_loop RET .h_w8: movu xm0, [srcq+ssq*0] vinserti32x4 ym0, [srcq+ssq*1], 1 lea srcq, [srcq+ssq*2] WRAP_YMM PUT_8TAP_H 0, 1, 2, 3 vpmovuswb xm0, ym0 movq [dstq+dsq*0], xm0 movhps [dstq+dsq*1], xm0 lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w8 
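    ; The .h_w16 path below replaces the subpel_h_shufA/B/C pshufb windows in
    ; m6-m8 with the spel_h_perm16a/b/c tables and calls PUT_8TAP_H with its
    ; 5th (vpermb) argument, so each tap window is gathered by a single
    ; cross-lane permute; the wider widths keep the in-lane pshufb variant.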
RET .h_w16: mova m6, [spel_h_perm16a] mova m7, [spel_h_perm16b] mova m8, [spel_h_perm16c] .h_w16_loop: movu ym0, [srcq+ssq*0] vinserti32x8 m0, [srcq+ssq*1], 1 lea srcq, [srcq+ssq*2] PUT_8TAP_H 0, 1, 2, 3, 1 vpmovuswb ym0, m0 mova [dstq+dsq*0], xm0 vextracti128 [dstq+dsq*1], ym0, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w16_loop RET .h_w32: movu ym0, [srcq+ssq*0+8*0] vinserti32x8 m0, [srcq+ssq*1+8*0], 1 movu ym1, [srcq+ssq*0+8*1] vinserti32x8 m1, [srcq+ssq*1+8*1], 1 lea srcq, [srcq+ssq*2] PUT_8TAP_H 0, 2, 3, 4 PUT_8TAP_H 1, 4, 3, 2 packuswb m0, m1 mova [dstq+dsq*0], ym0 vextracti32x8 [dstq+dsq*1], m0, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w32 RET .h_w64: movu m0, [srcq+8*0] movu m1, [srcq+8*1] add srcq, ssq PUT_8TAP_H 0, 2, 3, 4 PUT_8TAP_H 1, 4, 3, 2 packuswb m0, m1 mova [dstq], m0 add dstq, dsq dec hd jg .h_w64 RET .h_w128: movu m0, [srcq+8*0] movu m2, [srcq+8*1] movu m1, [srcq+8*8] movu m3, [srcq+8*9] add srcq, ssq PUT_8TAP_H 0, 4, 11, 12 PUT_8TAP_H 2, 12, 11, 4 PUT_8TAP_H 1, 4, 11, 12 PUT_8TAP_H 3, 12, 11, 4 packuswb m0, m2 packuswb m1, m3 mova [dstq+64*0], m0 mova [dstq+64*1], m1 add dstq, dsq dec hd jg .h_w128 RET .v: movzx mxd, myb shr myd, 16 cmp hd, 6 cmovs myd, mxd tzcnt r6d, wd movzx r6d, word [r8+r6*2+table_offset(put, _8tap_v)] vpbroadcastd m7, [pw_512] lea myq, [base+subpel_filters+myq*8] vpbroadcastw m8, [myq+0] vpbroadcastw m9, [myq+2] vpbroadcastw m10, [myq+4] vpbroadcastw m11, [myq+6] add r6, r8 lea ss3q, [ssq*3] sub srcq, ss3q jmp r6 .v_w2: movd xmm2, [srcq+ssq*0] pinsrw xmm2, [srcq+ssq*1], 2 pinsrw xmm2, [srcq+ssq*2], 4 add srcq, ss3q pinsrw xmm2, [srcq+ssq*0], 6 ; 0 1 2 3 movd xmm3, [srcq+ssq*1] vpbroadcastd xmm1, [srcq+ssq*2] add srcq, ss3q vpbroadcastd xmm0, [srcq+ssq*0] vpblendd xmm3, xmm3, xmm1, 0x02 ; 4 5 vpblendd xmm1, xmm1, xmm0, 0x02 ; 5 6 palignr xmm4, xmm3, xmm2, 4 ; 1 2 3 4 punpcklbw xmm3, xmm1 ; 45 56 punpcklbw xmm1, xmm2, xmm4 ; 01 12 punpckhbw xmm2, xmm4 ; 23 34 .v_w2_loop: pmaddubsw xmm5, xmm1, xm8 ; a0 b0 mova xmm1, xmm2 pmaddubsw xmm2, xm9 ; a1 b1 paddw xmm5, xmm2 mova xmm2, xmm3 pmaddubsw xmm3, xm10 ; a2 b2 paddw xmm5, xmm3 vpbroadcastd xmm4, [srcq+ssq*1] lea srcq, [srcq+ssq*2] vpblendd xmm3, xmm0, xmm4, 0x02 ; 6 7 vpbroadcastd xmm0, [srcq+ssq*0] vpblendd xmm4, xmm4, xmm0, 0x02 ; 7 8 punpcklbw xmm3, xmm4 ; 67 78 pmaddubsw xmm4, xmm3, xm11 ; a3 b3 paddw xmm5, xmm4 pmulhrsw xmm5, xm7 packuswb xmm5, xmm5 pextrw [dstq+dsq*0], xmm5, 0 pextrw [dstq+dsq*1], xmm5, 2 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w2_loop RET .v_w4: movd xmm2, [srcq+ssq*0] pinsrd xmm2, [srcq+ssq*1], 1 pinsrd xmm2, [srcq+ssq*2], 2 add srcq, ss3q pinsrd xmm2, [srcq+ssq*0], 3 ; 0 1 2 3 movd xmm3, [srcq+ssq*1] vpbroadcastd xmm1, [srcq+ssq*2] add srcq, ss3q vpbroadcastd xmm0, [srcq+ssq*0] vpblendd xmm3, xmm3, xmm1, 0x02 ; 4 5 vpblendd xmm1, xmm1, xmm0, 0x02 ; 5 6 palignr xmm4, xmm3, xmm2, 4 ; 1 2 3 4 punpcklbw xmm3, xmm1 ; 45 56 punpcklbw xmm1, xmm2, xmm4 ; 01 12 punpckhbw xmm2, xmm4 ; 23 34 .v_w4_loop: vpbroadcastd xmm4, [srcq+ssq*1] lea srcq, [srcq+ssq*2] pmaddubsw xmm5, xmm1, xm8 ; a0 b0 mova xmm1, xmm2 pmaddubsw xmm2, xm9 ; a1 b1 paddw xmm5, xmm2 mova xmm2, xmm3 pmaddubsw xmm3, xm10 ; a2 b2 paddw xmm5, xmm3 vpblendd xmm3, xmm0, xmm4, 0x02 ; 6 7 vpbroadcastd xmm0, [srcq+ssq*0] vpblendd xmm4, xmm4, xmm0, 0x02 ; 7 8 punpcklbw xmm3, xmm4 ; 67 78 pmaddubsw xmm4, xmm3, xm11 ; a3 b3 paddw xmm5, xmm4 pmulhrsw xmm5, xm7 packuswb xmm5, xmm5 movd [dstq+dsq*0], xmm5 pextrd [dstq+dsq*1], xmm5, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w4_loop RET .v_w8: movq xmm1, [srcq+ssq*0] vpbroadcastq ymm0, 
[srcq+ssq*1] vpbroadcastq ymm2, [srcq+ssq*2] add srcq, ss3q vpbroadcastq ymm5, [srcq+ssq*0] vpbroadcastq ymm3, [srcq+ssq*1] vpbroadcastq ymm4, [srcq+ssq*2] add srcq, ss3q vpblendd ymm1, ymm0, 0x30 vpblendd ymm0, ymm2, 0x30 punpcklbw ymm1, ymm0 ; 01 12 vpbroadcastq ymm0, [srcq+ssq*0] vpblendd ymm2, ymm5, 0x30 vpblendd ymm5, ymm3, 0x30 punpcklbw ymm2, ymm5 ; 23 34 vpblendd ymm3, ymm4, 0x30 vpblendd ymm4, ymm0, 0x30 punpcklbw ymm3, ymm4 ; 45 56 .v_w8_loop: vpbroadcastq ymm4, [srcq+ssq*1] lea srcq, [srcq+ssq*2] pmaddubsw ymm5, ymm1, ym8 ; a0 b0 mova ymm1, ymm2 pmaddubsw ymm2, ym9 ; a1 b1 paddw ymm5, ymm2 mova ymm2, ymm3 pmaddubsw ymm3, ym10 ; a2 b2 paddw ymm5, ymm3 vpblendd ymm3, ymm0, ymm4, 0x30 vpbroadcastq ymm0, [srcq+ssq*0] vpblendd ymm4, ymm4, ymm0, 0x30 punpcklbw ymm3, ymm4 ; 67 78 pmaddubsw ymm4, ymm3, ym11 ; a3 b3 paddw ymm5, ymm4 pmulhrsw ymm5, ym7 vextracti128 xmm4, ymm5, 1 packuswb xmm5, xmm4 movq [dstq+dsq*0], xmm5 movhps [dstq+dsq*1], xmm5 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w8_loop vzeroupper RET .v_w16: mova m12, [spel_v_perm16] vbroadcasti32x4 m1, [srcq+ssq*0] vbroadcasti32x4 ym4, [srcq+ssq*1] mov r6d, 0x0f vbroadcasti32x4 m2, [srcq+ssq*2] add srcq, ss3q vbroadcasti32x4 ym5, [srcq+ssq*0] kmovb k1, r6d vbroadcasti32x4 m3, [srcq+ssq*1] vbroadcasti32x4 ym6, [srcq+ssq*2] add srcq, ss3q vbroadcasti32x4 m0, [srcq+ssq*0] vshufpd m1{k1}, m4, m2, 0xcc vshufpd m2{k1}, m5, m3, 0xcc vshufpd m3{k1}, m6, m0, 0xcc vpermb m1, m12, m1 ; 01 12 vpermb m2, m12, m2 ; 23 34 vpermb m3, m12, m3 ; 45 56 .v_w16_loop: pmaddubsw m4, m1, m8 ; a0 b0 mova m1, m2 pmaddubsw m5, m2, m9 ; a1 b1 mova m2, m3 pmaddubsw m6, m3, m10 ; a2 b2 mova m3, m0 paddw m4, m5 vbroadcasti32x4 ym5, [srcq+ssq*1] lea srcq, [srcq+ssq*2] vbroadcasti32x4 m0, [srcq+ssq*0] vshufpd m3{k1}, m5, m0, 0xcc vpermb m3, m12, m3 ; 67 78 pmaddubsw m5, m3, m11 ; a3 b3 paddw m4, m6 paddw m4, m5 pmulhrsw m4, m7 vextracti32x8 ym5, m4, 1 packuswb ym4, ym5 mova [dstq+dsq*0], xm4 vextracti32x4 [dstq+dsq*1], ym4, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w16_loop RET .v_w32: mova m12, [spel_v_perm32] pmovzxbq m14, [pb_02461357] vpshrdw m13, m12, m12, 8 movu ym0, [srcq+ssq*0] vinserti32x8 m0, [srcq+ssq*1], 1 vpermb m1, m12, m0 ; 01 vinserti32x8 m0, [srcq+ssq*2], 0 add srcq, ss3q vpermb m2, m13, m0 ; 12 vinserti32x8 m0, [srcq+ssq*0], 1 vpermb m3, m12, m0 ; 23 vinserti32x8 m0, [srcq+ssq*1], 0 vpermb m4, m13, m0 ; 34 vinserti32x8 m0, [srcq+ssq*2], 1 add srcq, ss3q vpermb m5, m12, m0 ; 45 vinserti32x8 m0, [srcq+ssq*0], 0 vpermb m6, m13, m0 ; 56 .v_w32_loop: vinserti32x8 m0, [srcq+ssq*1], 1 lea srcq, [srcq+ssq*2] pmaddubsw m15, m1, m8 mova m1, m3 pmaddubsw m16, m2, m8 mova m2, m4 pmaddubsw m17, m3, m9 mova m3, m5 pmaddubsw m18, m4, m9 mova m4, m6 pmaddubsw m19, m5, m10 vpermb m5, m12, m0 ; 67 vinserti32x8 m0, [srcq+ssq*0], 0 pmaddubsw m20, m6, m10 vpermb m6, m13, m0 ; 78 paddw m15, m17 pmaddubsw m17, m5, m11 paddw m16, m18 pmaddubsw m18, m6, m11 paddw m15, m19 paddw m16, m20 paddw m15, m17 paddw m16, m18 pmulhrsw m15, m7 pmulhrsw m16, m7 packuswb m15, m16 vpermq m15, m14, m15 mova [dstq+dsq*0], ym15 vextracti32x8 [dstq+dsq*1], m15, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w32_loop vzeroupper RET .v_w64: .v_w128: lea r6d, [hq+wq*4-256] mov r4, srcq mov r7, dstq .v_loop0: movu m2, [srcq+ssq*0] movu m4, [srcq+ssq*1] movu m6, [srcq+ssq*2] add srcq, ss3q movu m13, [srcq+ssq*0] movu m15, [srcq+ssq*1] movu m17, [srcq+ssq*2] add srcq, ss3q movu m0, [srcq+ssq*0] punpcklbw m1, m2, m4 ; 01l punpckhbw m2, m4 ; 01h punpcklbw m3, m4, m6 ; 12l punpckhbw m4, m6 ; 12h 
punpcklbw m5, m6, m13 ; 23l punpckhbw m6, m13 ; 23h punpcklbw m12, m13, m15 ; 34l punpckhbw m13, m15 ; 34h punpcklbw m14, m15, m17 ; 45l punpckhbw m15, m17 ; 45h punpcklbw m16, m17, m0 ; 56l punpckhbw m17, m0 ; 56h .v_loop: pmaddubsw m18, m1, m8 ; a0l mova m1, m5 pmaddubsw m19, m2, m8 ; a0h mova m2, m6 pmaddubsw m20, m3, m8 ; b0l mova m3, m12 pmaddubsw m21, m4, m8 ; b0h mova m4, m13 pmaddubsw m5, m9 ; a1l pmaddubsw m6, m9 ; a1h pmaddubsw m12, m9 ; b1l pmaddubsw m13, m9 ; b1h paddw m18, m5 mova m5, m14 pmaddubsw m14, m10 ; a2l paddw m19, m6 mova m6, m15 pmaddubsw m15, m10 ; a2h paddw m20, m12 mova m12, m16 pmaddubsw m16, m10 ; b2l paddw m21, m13 mova m13, m17 pmaddubsw m17, m10 ; b2h paddw m18, m14 paddw m19, m15 paddw m20, m16 paddw m21, m17 movu m17, [srcq+ssq*1] lea srcq, [srcq+ssq*2] punpcklbw m14, m0, m17 ; 67l punpckhbw m15, m0, m17 ; 67h pmaddubsw m16, m14, m11 ; a3l pmaddubsw m0, m15, m11 ; a3h paddw m18, m16 paddw m19, m0 movu m0, [srcq+ssq*0] punpcklbw m16, m17, m0 ; 78l punpckhbw m17, m0 ; 78h pmulhrsw m18, m7 pmulhrsw m19, m7 packuswb m18, m19 mova [dstq+dsq*0], m18 pmaddubsw m18, m16, m11 ; b3l pmaddubsw m19, m17, m11 ; b3h paddw m18, m20 paddw m19, m21 pmulhrsw m18, m7 pmulhrsw m19, m7 packuswb m18, m19 mova [dstq+dsq*1], m18 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_loop add r4, 64 add r7, 64 movzx hd, r6b mov srcq, r4 mov dstq, r7 sub r6d, 256 jg .v_loop0 vzeroupper RET .hv: cmp wd, 4 jg .hv_w8 movzx mxd, mxb dec srcq vpbroadcastd m7, [base+subpel_filters+mxq*8+2] movzx mxd, myb shr myd, 16 cmp hd, 6 cmovs myd, mxd vpbroadcastd m8, [pd_2] vpbroadcastq ym0, [base+subpel_filters+myq*8] lea ss3q, [ssq*3] vpbroadcastd ym9, [pd_32768] mov r6, srcq punpcklbw ym0, ym8, ym0 sub r6, ss3q psraw ym0, 2 ; << 6 mova xm14, [spel_hv_end] pshufd ym10, ym0, q0000 pshufd ym11, ym0, q1111 pshufd ym12, ym0, q2222 pshufd ym13, ym0, q3333 cmp wd, 4 je .hv_w4 vbroadcasti128 ym6, [subpel_h_shuf4] movq xmm2, [r6+ssq*0] movhps xmm2, [r6+ssq*1] movq xmm0, [r6+ssq*2] movhps xmm0, [srcq+ssq*0] vpbroadcastq ymm3, [srcq+ssq*1] vpbroadcastq ymm4, [srcq+ssq*2] add srcq, ss3q vpbroadcastq ymm1, [srcq+ssq*0] vpblendd ymm2, ymm3, 0x30 vpblendd ymm0, ymm1, 0x30 ; 2 3 6 _ vpblendd ymm2, ymm4, 0xc0 ; 0 1 4 5 pshufb ymm2, ym6 pshufb ymm0, ym6 mova ymm1, ym8 vpdpbusd ymm1, ymm2, ym7 mova ymm2, ym8 vpdpbusd ymm2, ymm0, ym7 packssdw ymm2, ymm1, ymm2 psraw ymm2, 2 vextracti128 xmm3, ymm2, 1 palignr xmm4, xmm3, xmm2, 4 punpcklwd xmm1, xmm2, xmm4 ; 01 12 punpckhwd xmm2, xmm4 ; 23 34 pshufd xmm0, xmm3, q2121 punpcklwd xmm3, xmm0 ; 45 56 .hv_w2_loop: movq xmm4, [srcq+ssq*1] lea srcq, [srcq+ssq*2] movhps xmm4, [srcq+ssq*0] mova xmm5, xm9 vpdpwssd xmm5, xmm1, xm10 ; a0 b0 mova xmm1, xmm2 vpdpwssd xmm5, xmm2, xm11 ; a1 b1 pshufb xmm4, xm6 mova xmm2, xmm3 vpdpwssd xmm5, xmm3, xm12 ; a2 b2 mova xmm3, xm8 vpdpbusd xmm3, xmm4, xm7 packssdw xmm4, xmm3, xmm3 psraw xmm4, 2 palignr xmm3, xmm4, xmm0, 12 mova xmm0, xmm4 punpcklwd xmm3, xmm4 ; 67 78 vpdpwssd xmm5, xmm3, xm13 ; a3 b3 packuswb xmm5, xmm5 pshufb xmm5, xm14 pextrw [dstq+dsq*0], xmm5, 0 pextrw [dstq+dsq*1], xmm5, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w2_loop vzeroupper RET .hv_w4: movq xmm1, [r6+ssq*0] vpbroadcastq ym2, [r6+ssq*1] vinserti32x4 ym1, ymm1, [r6+ssq*2], 1 vinserti32x4 m2, [srcq+ssq*0], 2 vinserti32x4 m1, [srcq+ssq*1], 2 vinserti32x4 m2, [srcq+ssq*2], 3 ; _ 1 3 5 vbroadcasti32x4 m6, [subpel_h_shufA] add srcq, ss3q vinserti32x4 m1, [srcq+ssq*0], 3 ; 0 2 4 6 pshufb m2, m6 pshufb m1, m6 mova m0, m8 vpdpbusd m0, m2, m7 mova m4, m8 vpdpbusd m4, m1, m7 mova ym1, 
[spel_hv_perm4a] mova ym2, [spel_hv_perm4b] mova ym3, [spel_hv_perm4c] packssdw m0, m4 psraw m0, 2 ; _ 0 1 2 3 4 5 6 mov r6d, 0x5555 vpermb ym1, ym1, ym0 ; 01 12 vpermb m2, m2, m0 ; 23 34 vpermb m3, m3, m0 ; 45 56 kmovw k1, r6d mova ym15, [spel_hv_perm4d] .hv_w4_loop: movq xmm4, [srcq+ssq*1] lea srcq, [srcq+ssq*2] vinserti32x4 ym4, ymm4, [srcq+ssq*0], 1 mova ym5, ym9 vpdpwssd ym5, ym1, ym10 ; a0 b0 mova ym1, ym2 pshufb ym4, ym6 mova ym0, ym8 vpdpbusd ym0, ym4, ym7 vpdpwssd ym5, ym2, ym11 ; a1 b1 mova ym2, ym3 vpdpwssd ym5, ym3, ym12 ; a2 b2 vpsraw ym3{k1}, ym0, 2 ; 7 8 vpermb ym3, ym15, ym3 ; 67 78 vpdpwssd ym5, ym3, ym13 ; a3 b3 packuswb ym5, ym5 vpermb ym5, ym14, ym5 movd [dstq+dsq*0], xm5 pextrd [dstq+dsq*1], xm5, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w4_loop RET .hv_w8: shr mxd, 16 sub srcq, 3 vpbroadcastd m10, [base+subpel_filters+mxq*8+0] vpbroadcastd m11, [base+subpel_filters+mxq*8+4] movzx mxd, myb shr myd, 16 cmp hd, 6 cmovs myd, mxd vpbroadcastd m8, [pd_2] vpbroadcastq m0, [base+subpel_filters+myq*8] vpbroadcastd m9, [pd_32768] punpcklbw m0, m8, m0 lea ss3q, [ssq*3] psraw m0, 2 ; << 6 pshufd m12, m0, q0000 pshufd m13, m0, q1111 pshufd m14, m0, q2222 pshufd m15, m0, q3333 cmp wd, 8 jne .hv_w16 mov r6, srcq sub r6, ss3q movu xmm1, [r6+ssq*0] vinserti128 ymm1, [r6+ssq*1], 1 movu xmm2, [srcq+ssq*1] vinserti32x4 m6, zmm1, [r6+ssq*2], 2 vinserti128 ymm2, [srcq+ssq*2], 1 vinserti32x4 m6, [srcq+ssq*0], 3 ; 0 1 2 3 add srcq, ss3q vbroadcasti32x4 m4, [subpel_h_shufA] vinserti32x4 m0, zmm2, [srcq+ssq*0], 2 ; 4 5 6 _ vbroadcasti32x4 m7, [subpel_h_shufB] vbroadcasti32x4 m17, [subpel_h_shufC] pshufb m1, m6, m4 ; 0 1 2 3 0123 mova m2, m8 vpdpbusd m2, m1, m10 pshufb m5, m6, m7 ; 0 1 2 3 4567 mova m1, m8 vpdpbusd m1, m5, m10 pshufb m4, m0, m4 ; 4 5 6 _ 0123 mova m3, m8 vpdpbusd m3, m4, m10 pshufb m7, m0, m7 ; 4 5 6 _ 4567 mova m4, m8 vpdpbusd m4, m7, m10 pshufb m6, m17 vpdpbusd m2, m5, m11 vpdpbusd m1, m6, m11 pshufb m6, m0, m17 vpdpbusd m3, m7, m11 vpdpbusd m4, m6, m11 mova m5, [spel_hv_perm8a] mova m0, [spel_hv_perm8b] mov r6, 0x55555555ff00 packssdw m2, m1 packssdw m3, m4 mova m18, [spel_hv_perm8c] psraw m2, 2 ; 0 1 2 3 psraw m3, 2 ; 4 5 6 _ vpermb m1, m5, m2 ; 01 12 vbroadcasti32x8 m6, [subpel_h_shufA] kmovq k1, r6 vpermt2b m2, m0, m3 ; 23 34 vbroadcasti32x8 m7, [subpel_h_shufB] kshiftrq k2, k1, 16 mova xm16, [spel_hv_end] vpermb m3, m5, m3 ; 45 56 .hv_w8_loop: vbroadcasti32x4 ym4, [srcq+ssq*1] lea srcq, [srcq+ssq*2] vbroadcasti32x4 m4{k1}, [srcq+ssq*0] mova m0, m9 vpdpwssd m0, m1, m12 ; a0 b0 pshufb m1, m4, m6 ; 7 8 0123 4567 mova m5, m8 vpdpbusd m5, m1, m10 pshufb m4, m7 ; 7 8 4567 89ab vpdpwssd m0, m2, m13 ; a1 b1 mova m1, m2 vpdpbusd m5, m4, m11 mova m2, m3 vpdpwssd m0, m3, m14 ; a2 b2 psraw m3{k2}, m5, 2 ; 75 86 vpermb m3, m18, m3 ; 67 78 vpdpwssd m0, m3, m15 ; a3 b3 packuswb m0, m0 vpermb zmm1, m16, m0 movq [dstq+dsq*0], xmm1 movhps [dstq+dsq*1], xmm1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w8_loop vzeroupper RET .hv_w16: movu m7, [spel_hv_perm16a] sub srcq, ss3q mova m20, [spel_hv_perm16b] lea r6d, [wq*2-32] mova m21, [spel_hv_perm16c] mov r4, srcq mov r7, dstq mova ym16, [spel_hv_end16] lea r6d, [hq+r6*8] .hv_w16_loop0: movu ym17, [srcq+ssq*0] vinserti32x8 m17, [srcq+ssq*1], 1 ; 0 1 movu ym18, [srcq+ssq*2] add srcq, ss3q vinserti32x8 m18, [srcq+ssq*0], 1 ; 2 3 movu ym19, [srcq+ssq*1] vinserti32x8 m19, [srcq+ssq*2], 1 ; 4 5 add srcq, ss3q vpermb m2, m7, m17 ; 0 1 0123 89ab vpermb m0, m20, m17 ; 0 1 4567 cdef vpermb m4, m7, m18 ; 2 3 0123 89ab mova m1, m8 vpdpbusd m1, m2, m10 vpermb 
m5, m20, m18 ; 2 3 4567 cdef mova m2, m8 vpdpbusd m2, m0, m10 vpermb m17, m21, m17 ; 0 1 89ab ghij mova m3, m8 vpdpbusd m3, m4, m10 vpermb m6, m7, m19 ; 4 5 0123 89ab mova m4, m8 vpdpbusd m4, m5, m10 vpermb m18, m21, m18 ; 2 3 89ab ghij vpdpbusd m1, m0, m11 movu ym0, [srcq+ssq*0] ; 6 vpdpbusd m2, m17, m11 vpermb m17, m20, m19 ; 4 5 4567 cdef vpdpbusd m3, m5, m11 mova m5, m8 vpdpbusd m5, m6, m10 mova m6, m8 vpdpbusd m6, m17, m10 vpdpbusd m4, m18, m11 mova m18, [spel_hv_perm16d] vpermb m18, m18, m0 ; 6 0145 2367 89cd abef vpdpbusd m5, m17, m11 vpermb m19, m21, m19 ; 4 5 89ab ghij mova m17, m8 vpdpbusd m17, m18, m10 mova m18, [spel_hv_perm16e] vpermb m0, m18, m0 ; 6 4589 67ab cdgh efij packssdw m1, m2 ; 01 vpdpbusd m6, m19, m11 packssdw m3, m4 ; 23 vpdpbusd m17, m0, m11 psraw m1, 2 packssdw m5, m6 ; 45 psraw m3, 2 vpshrdd m2, m1, m3, 16 ; 12 psraw m5, 2 vpshrdd m4, m3, m5, 16 ; 34 psraw m17, 2 vpshrdd m6, m5, m17, 16 ; 56 .hv_w16_loop: movu ym18, [srcq+ssq*1] lea srcq, [srcq+ssq*2] vinserti32x8 m18, [srcq+ssq*0], 1 mova m0, m9 vpdpwssd m0, m1, m12 ; a0 vpermb m1, m7, m18 ; 7 8 0123 89ab mova m17, m9 vpdpwssd m17, m2, m12 ; b0 vpermb m2, m20, m18 ; 7 8 4567 cdef mova m19, m8 vpdpbusd m19, m1, m10 vpermb m18, m21, m18 mova m1, m8 vpdpbusd m1, m2, m10 vpdpwssd m0, m3, m13 ; a1 vpdpwssd m17, m4, m13 ; b1 vpdpbusd m19, m2, m11 mova m2, m4 vpdpbusd m1, m18, m11 mova m4, m6 vpdpwssd m0, m5, m14 ; a2 vpdpwssd m17, m6, m14 ; b2 packssdw m19, m1 mova m1, m3 mova m3, m5 psraw m6, m19, 2 ; 7 8 vpshrdd m5, m4, m6, 16 ; 6 7 vpdpwssd m17, m6, m15 ; b3 vpdpwssd m0, m5, m15 ; a3 packuswb m0, m17 vpermb zmm1, m16, m0 mova [dstq+dsq*0], xmm1 vextracti128 [dstq+dsq*1], ymm1, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w16_loop add r4, 16 add r7, 16 movzx hd, r6b mov srcq, r4 mov dstq, r7 sub r6d, 1<<8 jg .hv_w16_loop0 vzeroupper RET %macro PREP_8TAP_H 0 vpermb m10, m5, m0 vpermb m11, m5, m1 vpermb m12, m6, m0 vpermb m13, m6, m1 vpermb m14, m7, m0 vpermb m15, m7, m1 mova m0, m4 vpdpbusd m0, m10, m8 mova m2, m4 vpdpbusd m2, m12, m8 mova m1, m4 vpdpbusd m1, m11, m8 mova m3, m4 vpdpbusd m3, m13, m8 vpdpbusd m0, m12, m9 vpdpbusd m2, m14, m9 vpdpbusd m1, m13, m9 vpdpbusd m3, m15, m9 packssdw m0, m2 packssdw m1, m3 psraw m0, 2 psraw m1, 2 mova [tmpq+64*0], m0 mova [tmpq+64*1], m1 %endmacro %if WIN64 DECLARE_REG_TMP 6, 4 %else DECLARE_REG_TMP 6, 7 %endif %define PREP_8TAP_FN FN prep_8tap, PREP_8TAP_FN sharp, SHARP, SHARP PREP_8TAP_FN sharp_smooth, SHARP, SMOOTH PREP_8TAP_FN smooth_sharp, SMOOTH, SHARP PREP_8TAP_FN smooth, SMOOTH, SMOOTH PREP_8TAP_FN sharp_regular, SHARP, REGULAR PREP_8TAP_FN regular_sharp, REGULAR, SHARP PREP_8TAP_FN smooth_regular, SMOOTH, REGULAR PREP_8TAP_FN regular_smooth, REGULAR, SMOOTH PREP_8TAP_FN regular, REGULAR, REGULAR cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3 imul mxd, mxm, 0x010101 add mxd, t0d ; 8tap_h, mx, 4tap_h imul myd, mym, 0x010101 add myd, t1d ; 8tap_v, my, 4tap_v lea r7, [prep_avx512icl] movsxd wq, wm movifnidn hd, hm test mxd, 0xf00 jnz .h test myd, 0xf00 jnz .v tzcnt wd, wd movzx wd, word [r7+wq*2+table_offset(prep,)] add wq, r7 lea r6, [strideq*3] %if WIN64 pop r7 %endif jmp wq .h: test myd, 0xf00 jnz .hv vpbroadcastd m4, [pd_2] WIN64_SPILL_XMM 10 cmp wd, 4 je .h_w4 tzcnt wd, wd shr mxd, 16 sub srcq, 3 movzx wd, word [r7+wq*2+table_offset(prep, _8tap_h)] vpbroadcastd m8, [r7+mxq*8+subpel_filters-prep_avx512icl+0] vpbroadcastd m9, [r7+mxq*8+subpel_filters-prep_avx512icl+4] add wq, r7 jmp wq .h_w4: movzx mxd, mxb vbroadcasti128 ym5, [subpel_h_shufA] 
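    ; 4-tap horizontal prep: each iteration loads four rows (rows 2 and 3 are
    ; merged into the upper halves of ym2/ym3 via the k1-masked vpbroadcastq),
    ; shuffles them into overlapping 4-pixel windows with subpel_h_shufA, and
    ; filters with vpdpbusd accumulating into pd_2 (ym4); ym6 holds the middle
    ; four taps of the subpel filter (note the +2 byte offset), and the
    ; trailing psraw by 2 yields the rounded intermediate (sum + 2) >> 2.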
mov r3d, 0x4 dec srcq vpbroadcastd ym6, [r7+mxq*8+subpel_filters-prep_avx512icl+2] kmovb k1, r3d lea stride3q, [strideq*3] .h_w4_loop: movq xm2, [srcq+strideq*0] movq xm3, [srcq+strideq*1] vpbroadcastq ym2{k1}, [srcq+strideq*2] vpbroadcastq ym3{k1}, [srcq+stride3q ] lea srcq, [srcq+strideq*4] pshufb ym2, ym5 pshufb ym3, ym5 mova ym0, ym4 vpdpbusd ym0, ym2, ym6 mova ym1, ym4 vpdpbusd ym1, ym3, ym6 packssdw ym0, ym1 psraw ym0, 2 mova [tmpq], ym0 add tmpq, 32 sub hd, 4 jg .h_w4_loop RET .h_w8: vbroadcasti128 m5, [subpel_h_shufA] vbroadcasti128 m6, [subpel_h_shufB] vbroadcasti128 m7, [subpel_h_shufC] lea stride3q, [strideq*3] .h_w8_loop: movu xmm3, [srcq+strideq*0] vinserti128 ym3, ymm3, [srcq+strideq*1], 1 vinserti128 m3, [srcq+strideq*2], 2 vinserti128 m3, [srcq+stride3q ], 3 lea srcq, [srcq+strideq*4] pshufb m1, m3, m5 pshufb m2, m3, m6 mova m0, m4 vpdpbusd m0, m1, m8 mova m1, m4 vpdpbusd m1, m2, m8 pshufb m3, m7 vpdpbusd m0, m2, m9 vpdpbusd m1, m3, m9 packssdw m0, m1 psraw m0, 2 mova [tmpq], m0 add tmpq, 64 sub hd, 4 jg .h_w8_loop RET .h_w16: mova m5, [spel_h_perm16a] mova m6, [spel_h_perm16b] mova m7, [spel_h_perm16c] lea stride3q, [strideq*3] .h_w16_loop: movu ym0, [srcq+strideq*0] movu ym1, [srcq+strideq*2] vinserti32x8 m0, [srcq+strideq*1], 1 vinserti32x8 m1, [srcq+stride3q ], 1 lea srcq, [srcq+strideq*4] PREP_8TAP_H add tmpq, 64*2 sub hd, 4 jg .h_w16_loop RET .h_w32: mova m5, [spel_h_perm32a] mova m6, [spel_h_perm32b] mova m7, [spel_h_perm32c] .h_w32_loop: movu m0, [srcq+strideq*0] movu m1, [srcq+strideq*1] lea srcq, [srcq+strideq*2] PREP_8TAP_H add tmpq, 64*2 sub hd, 2 jg .h_w32_loop RET .h_w64: xor r6d, r6d jmp .h_start .h_w128: mov r6, -64*1 .h_start: mova m5, [spel_h_perm32a] mova m6, [spel_h_perm32b] mova m7, [spel_h_perm32c] sub srcq, r6 mov r5, r6 .h_loop: movu m0, [srcq+r6+32*0] movu m1, [srcq+r6+32*1] PREP_8TAP_H add tmpq, 64*2 add r6, 64 jle .h_loop add srcq, strideq mov r6, r5 dec hd jg .h_loop RET .v: movzx mxd, myb ; Select 4-tap/8-tap filter multipliers. shr myd, 16 ; Note that the code is 8-tap only, having tzcnt wd, wd cmp hd, 4 ; a separate 4-tap code path for (4|8|16)x4 cmove myd, mxd ; had a negligible effect on performance. ; TODO: Would a 6-tap code path be worth it? 
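    ; myd was formed at function entry as my*0x010101 + the packed FILTER_*
    ; constant, so after the movzx/shr above mxd holds the 4-tap filter index
    ; and myd the 8-tap one; the cmove picks the 4-tap variant for 4-row
    ; blocks, while all larger heights stay on the 8-tap path.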
lea myq, [r7+myq*8+subpel_filters-prep_avx512icl] movzx wd, word [r7+wq*2+table_offset(prep, _8tap_v)] add wq, r7 lea stride3q, [strideq*3] sub srcq, stride3q vpbroadcastd m7, [pw_8192] vpbroadcastw m8, [myq+0] vpbroadcastw m9, [myq+2] vpbroadcastw m10, [myq+4] vpbroadcastw m11, [myq+6] jmp wq .v_w4: movd xmm0, [srcq+strideq*0] vpbroadcastd ymm1, [srcq+strideq*2] vpbroadcastd xmm2, [srcq+strideq*1] vpbroadcastd ymm3, [srcq+stride3q ] lea srcq, [srcq+strideq*4] vpblendd ymm1, ymm0, 0x01 ; 0 2 2 _ 2 _ _ _ vpblendd ymm3, ymm2, 0x03 ; 1 1 3 3 3 3 _ _ vpbroadcastd ymm0, [srcq+strideq*0] vpbroadcastd ymm2, [srcq+strideq*1] vpblendd ymm1, ymm0, 0x68 ; 0 2 2 4 2 4 4 _ vpbroadcastd ymm0, [srcq+strideq*2] vbroadcasti128 ymm5, [deint_shuf4] vpblendd ymm3, ymm2, 0xc0 ; 1 1 3 3 3 3 5 5 vpblendd ymm2, ymm3, ymm1, 0x55 ; 0 1 2 3 2 3 4 5 vpblendd ymm3, ymm1, 0xaa ; 1 2 3 4 3 4 5 _ punpcklbw ymm1, ymm2, ymm3 ; 01 12 23 34 vpblendd ymm3, ymm0, 0x80 ; 1 2 3 4 3 4 5 6 punpckhbw ymm2, ymm3 ; 23 34 45 56 .v_w4_loop: pinsrd xmm0, [srcq+stride3q ], 1 lea srcq, [srcq+strideq*4] vpbroadcastd ymm3, [srcq+strideq*0] vpbroadcastd ymm4, [srcq+strideq*1] vpblendd ymm3, ymm4, 0x20 ; _ _ 8 _ 8 9 _ _ vpblendd ymm3, ymm0, 0x03 ; 6 7 8 _ 8 9 _ _ vpbroadcastd ymm0, [srcq+strideq*2] vpblendd ymm3, ymm0, 0x40 ; 6 7 8 _ 8 9 a _ pshufb ymm3, ymm5 ; 67 78 89 9a pmaddubsw ymm4, ymm1, ym8 vperm2i128 ymm1, ymm2, ymm3, 0x21 ; 45 56 67 78 pmaddubsw ymm2, ym9 paddw ymm4, ymm2 mova ymm2, ymm3 pmaddubsw ymm3, ym11 paddw ymm3, ymm4 pmaddubsw ymm4, ymm1, ym10 paddw ymm3, ymm4 pmulhrsw ymm3, ym7 mova [tmpq], ymm3 add tmpq, 32 sub hd, 4 jg .v_w4_loop vzeroupper RET .v_w8: mov r3d, 0xf044 kmovw k1, r3d kshiftrw k2, k1, 8 movq xm0, [srcq+strideq*0] vpbroadcastq ym1, [srcq+strideq*1] vpbroadcastq m2, [srcq+strideq*2] vpbroadcastq m3, [srcq+stride3q ] lea srcq, [srcq+strideq*4] vpbroadcastq m4, [srcq+strideq*0] vpbroadcastq m5, [srcq+strideq*1] vpbroadcastq m6, [srcq+strideq*2] vmovdqa64 ym0{k1}, ym1 vmovdqa64 ym1{k1}, ym2 vmovdqa64 m2{k1}, m3 vmovdqa64 m3{k1}, m4 vmovdqa64 m4{k1}, m5 vmovdqa64 m5{k1}, m6 punpcklbw ym0, ym1 ; 01 12 __ __ punpcklbw m2, m3 ; 23 34 23 34 punpcklbw m4, m5 ; 45 56 45 56 vmovdqa64 m0{k2}, m2 ; 01 12 23 34 vmovdqa64 m2{k2}, m4 ; 23 34 45 56 .v_w8_loop: vpbroadcastq m1, [srcq+stride3q ] lea srcq, [srcq+strideq*4] vpbroadcastq m3, [srcq+strideq*0] vpbroadcastq m5, [srcq+strideq*1] pmaddubsw m14, m0, m8 pmaddubsw m15, m2, m9 vpblendmq m0{k1}, m6, m1 vpblendmq m2{k1}, m1, m3 vpbroadcastq m6, [srcq+strideq*2] paddw m14, m15 punpcklbw m2, m0, m2 ; 67 78 67 78 vpblendmq m12{k1}, m3, m5 vpblendmq m13{k1}, m5, m6 vpblendmq m0{k2}, m4, m2 ; 45 56 67 78 punpcklbw m4, m12, m13 ; 89 9a 89 9a vmovdqa64 m2{k2}, m4 ; 67 78 89 9a pmaddubsw m12, m0, m10 pmaddubsw m13, m2, m11 paddw m14, m12 paddw m14, m13 pmulhrsw m14, m7 mova [tmpq], m14 add tmpq, 64 sub hd, 4 jg .v_w8_loop RET .v_w16: mov r3d, 0xf0 kmovb k1, r3d vbroadcasti128 m0, [srcq+strideq*0] vbroadcasti128 m1, [srcq+strideq*1] vbroadcasti128 m2, [srcq+strideq*2] vbroadcasti128 m3, [srcq+stride3q ] lea srcq, [srcq+strideq*4] vbroadcasti128 m4, [srcq+strideq*0] vbroadcasti128 m5, [srcq+strideq*1] vbroadcasti128 m6, [srcq+strideq*2] vmovdqa64 m0{k1}, m1 vmovdqa64 m1{k1}, m2 vmovdqa64 m2{k1}, m3 vmovdqa64 m3{k1}, m4 vmovdqa64 m4{k1}, m5 vmovdqa64 m5{k1}, m6 shufpd m0, m2, 0xcc ; 0a_2a 0b_2b 1a_3a 1b_3b shufpd m1, m3, 0xcc ; 1a_3a 1b_3b 2a_4a 2b_4b shufpd m4, m4, 0x44 ; 4a_-- 4b_-- 5a_-- 5b_-- shufpd m5, m5, 0x44 ; 5a_-- 5b_-- 6a_-- 6b_-- punpckhbw m2, m0, m1 ; 23a 23b 34a 34b 
punpcklbw m0, m1 ; 01a 01b 12a 12b punpcklbw m4, m5 ; 45a 45b 56a 56b .v_w16_loop: vbroadcasti128 m3, [srcq+stride3q ] lea srcq, [srcq+strideq*4] vbroadcasti128 m5, [srcq+strideq*0] vpblendmq m1{k1}, m6, m3 vmovdqa64 m3{k1}, m5 pmaddubsw m12, m0, m8 pmaddubsw m13, m2, m8 pmaddubsw m14, m2, m9 pmaddubsw m15, m4, m9 pmaddubsw m0, m4, m10 vbroadcasti128 m2, [srcq+strideq*1] vbroadcasti128 m6, [srcq+strideq*2] paddw m12, m14 paddw m13, m15 paddw m12, m0 vmovdqa64 m5{k1}, m2 vmovdqa64 m2{k1}, m6 mova m0, m4 shufpd m1, m5, 0xcc ; 6a_8a 6b_8b 7a_9a 7b_9b shufpd m3, m2, 0xcc ; 7a_9a 7b_9b 8a_Aa 8b_Ab punpcklbw m2, m1, m3 ; 67a 67b 78a 78b punpckhbw m4, m1, m3 ; 89a 89b 9Aa 9Ab pmaddubsw m14, m2, m10 pmaddubsw m15, m2, m11 paddw m13, m14 paddw m12, m15 pmaddubsw m14, m4, m11 paddw m13, m14 pmulhrsw m12, m7 pmulhrsw m13, m7 mova [tmpq+ 0], m12 mova [tmpq+64], m13 add tmpq, 64*2 sub hd, 4 jg .v_w16_loop RET .v_w32: mova m18, [bilin_v_perm64] movu ym0, [srcq+strideq*0] movu ym1, [srcq+strideq*1] lea srcq, [srcq+strideq*2] movu ym2, [srcq+strideq*0] movu ym3, [srcq+strideq*1] lea srcq, [srcq+strideq*2] movu ym4, [srcq+strideq*0] movu ym5, [srcq+strideq*1] lea srcq, [srcq+strideq*2] movu ym6, [srcq+strideq*0] vpermq m0, m18, m0 vpermq m1, m18, m1 vpermq m2, m18, m2 vpermq m3, m18, m3 vpermq m4, m18, m4 vpermq m5, m18, m5 vpermq m6, m18, m6 punpcklbw m0, m1 punpcklbw m1, m2 punpcklbw m2, m3 punpcklbw m3, m4 punpcklbw m4, m5 punpcklbw m5, m6 .v_w32_loop: movu ym12, [srcq+strideq*1] lea srcq, [srcq+strideq*2] movu ym13, [srcq+strideq*0] pmaddubsw m14, m0, m8 pmaddubsw m16, m2, m9 pmaddubsw m15, m1, m8 pmaddubsw m17, m3, m9 mova m0, m2 mova m1, m3 vpermq m12, m18, m12 vpermq m13, m18, m13 paddw m14, m16 paddw m15, m17 pmaddubsw m16, m4, m10 pmaddubsw m17, m5, m10 punpcklbw m6, m12 punpcklbw m12, m13 mova m2, m4 mova m3, m5 paddw m14, m16 paddw m15, m17 pmaddubsw m16, m6, m11 pmaddubsw m17, m12, m11 mova m4, m6 mova m5, m12 paddw m14, m16 paddw m15, m17 pmulhrsw m14, m7 pmulhrsw m15, m7 mova m6, m13 mova [tmpq+ 0], m14 mova [tmpq+64], m15 add tmpq, 64*2 sub hd, 2 jg .v_w32_loop vzeroupper RET .v_w64: mov wd, 64 jmp .v_start .v_w128: mov wd, 128 .v_start: WIN64_SPILL_XMM 27 mova m26, [bilin_v_perm64] lea r6d, [hq+wq*2] mov r5, srcq mov r7, tmpq .v_loop0: vpermq m0, m26, [srcq+strideq*0] vpermq m1, m26, [srcq+strideq*1] lea srcq, [srcq+strideq*2] vpermq m2, m26, [srcq+strideq*0] vpermq m3, m26, [srcq+strideq*1] lea srcq, [srcq+strideq*2] vpermq m4, m26, [srcq+strideq*0] vpermq m5, m26, [srcq+strideq*1] lea srcq, [srcq+strideq*2] vpermq m6, m26, [srcq+strideq*0] punpckhbw m12, m0, m1 punpcklbw m0, m1 punpckhbw m13, m1, m2 punpcklbw m1, m2 punpckhbw m14, m2, m3 punpcklbw m2, m3 punpckhbw m15, m3, m4 punpcklbw m3, m4 punpckhbw m16, m4, m5 punpcklbw m4, m5 punpckhbw m17, m5, m6 punpcklbw m5, m6 .v_loop: vpermq m18, m26, [srcq+strideq*1] lea srcq, [srcq+strideq*2] vpermq m19, m26, [srcq+strideq*0] pmaddubsw m20, m0, m8 pmaddubsw m21, m12, m8 pmaddubsw m22, m1, m8 pmaddubsw m23, m13, m8 mova m0, m2 mova m12, m14 mova m1, m3 mova m13, m15 pmaddubsw m2, m9 pmaddubsw m14, m9 pmaddubsw m3, m9 pmaddubsw m15, m9 punpckhbw m24, m6, m18 punpcklbw m6, m18 paddw m20, m2 paddw m21, m14 paddw m22, m3 paddw m23, m15 mova m2, m4 mova m14, m16 mova m3, m5 mova m15, m17 pmaddubsw m4, m10 pmaddubsw m16, m10 pmaddubsw m5, m10 pmaddubsw m17, m10 punpckhbw m25, m18, m19 punpcklbw m18, m19 paddw m20, m4 paddw m21, m16 paddw m22, m5 paddw m23, m17 mova m4, m6 mova m16, m24 mova m5, m18 mova m17, m25 pmaddubsw m6, m11 pmaddubsw m24, m11 
pmaddubsw m18, m11 pmaddubsw m25, m11 paddw m20, m6 paddw m21, m24 paddw m22, m18 paddw m23, m25 pmulhrsw m20, m7 pmulhrsw m21, m7 pmulhrsw m22, m7 pmulhrsw m23, m7 mova m6, m19 mova [tmpq+wq*0+ 0], m20 mova [tmpq+wq*0+64], m21 mova [tmpq+wq*2+ 0], m22 mova [tmpq+wq*2+64], m23 lea tmpq, [tmpq+wq*4] sub hd, 2 jg .v_loop add r5, 64 add r7, 128 movzx hd, r6b mov srcq, r5 mov tmpq, r7 sub r6d, 1<<8 jg .v_loop0 RET .hv: %assign stack_offset stack_offset - stack_size_padded %assign stack_size_padded 0 WIN64_SPILL_XMM 16 cmp wd, 4 je .hv_w4 shr mxd, 16 sub srcq, 3 vpbroadcastd m10, [r7+mxq*8+subpel_filters-prep_avx512icl+0] vpbroadcastd m11, [r7+mxq*8+subpel_filters-prep_avx512icl+4] movzx mxd, myb shr myd, 16 cmp hd, 4 cmove myd, mxd tzcnt wd, wd vpbroadcastd m8, [pd_2] movzx wd, word [r7+wq*2+table_offset(prep, _8tap_hv)] vpbroadcastd m9, [pd_32] add wq, r7 vpbroadcastq m0, [r7+myq*8+subpel_filters-prep_avx512icl] lea stride3q, [strideq*3] sub srcq, stride3q punpcklbw m0, m0 psraw m0, 8 ; sign-extend pshufd m12, m0, q0000 pshufd m13, m0, q1111 pshufd m14, m0, q2222 pshufd m15, m0, q3333 jmp wq .hv_w4: movzx mxd, mxb dec srcq vpbroadcastd m8, [r7+mxq*8+subpel_filters-prep_avx512icl+2] movzx mxd, myb shr myd, 16 cmp hd, 4 cmove myd, mxd vpbroadcastq m0, [r7+myq*8+subpel_filters-prep_avx512icl] lea stride3q, [strideq*3] sub srcq, stride3q mov r3d, 0x04 kmovb k1, r3d kshiftlb k2, k1, 2 kshiftlb k3, k1, 4 vpbroadcastd m10, [pd_2] vbroadcasti128 m16, [subpel_h_shufA] punpcklbw m0, m0 psraw m0, 8 ; sign-extend vpbroadcastd m11, [pd_32] pshufd m12, m0, q0000 pshufd m13, m0, q1111 pshufd m14, m0, q2222 pshufd m15, m0, q3333 movq xm3, [srcq+strideq*0] vpbroadcastq ym2, [srcq+strideq*1] vpbroadcastq ym3{k1}, [srcq+strideq*2] vpbroadcastq m2{k2}, [srcq+stride3q ] lea srcq, [srcq+strideq*4] vpbroadcastq m3{k2}, [srcq+strideq*0] vpbroadcastq m2{k3}, [srcq+strideq*1] vpbroadcastq m3{k3}, [srcq+strideq*2] mova m17, [spel_hv_perm4a] movu m18, [spel_hv_perm4b] mova m0, m10 mova m1, m10 pshufb m2, m16 pshufb m3, m16 vpdpbusd m0, m2, m8 vpdpbusd m1, m3, m8 packssdw m0, m1 ; _ 0 1 2 3 4 5 6 psraw m0, 2 vpermb m1, m17, m0 ; 01 12 23 34 vpermb m2, m18, m0 ; 23 34 45 56 .hv_w4_loop: movq xm3, [srcq+stride3q ] lea srcq, [srcq+strideq*4] movq xm4, [srcq+strideq*0] vpbroadcastq ym3{k1}, [srcq+strideq*1] vpbroadcastq ym4{k1}, [srcq+strideq*2] mova ym5, ym10 mova ym6, ym10 pshufb ym3, ym16 pshufb ym4, ym16 vpdpbusd ym5, ym3, ym8 vpdpbusd ym6, ym4, ym8 mova m7, m11 packssdw ym5, ym6 ; 7 8 9 a _ _ _ _ psraw ym5, 2 valignq m0, m5, m0, 4 ; _ 4 5 6 7 8 9 a vpdpwssd m7, m1, m12 vpdpwssd m7, m2, m13 vpermb m1, m17, m0 ; 45 56 67 78 vpermb m2, m18, m0 ; 67 78 89 9a vpdpwssd m7, m1, m14 vpdpwssd m7, m2, m15 psrad m7, 6 vpmovdw [tmpq], m7 add tmpq, 32 sub hd, 4 jg .hv_w4_loop vzeroupper RET .hv_w8: WIN64_SPILL_XMM 24 vbroadcasti128 m16, [subpel_h_shufA] vbroadcasti128 m17, [subpel_h_shufB] vbroadcasti128 m18, [subpel_h_shufC] vinserti128 ym0, [srcq+strideq*0], 1 vinserti128 m0, [srcq+strideq*1], 2 vinserti128 m0, [srcq+strideq*2], 3 movu xm1, [srcq+stride3q ] lea srcq, [srcq+strideq*4] vinserti128 ym1, [srcq+strideq*0], 1 vinserti128 m1, [srcq+strideq*1], 2 vinserti128 m1, [srcq+strideq*2], 3 mova m2, m8 mova m4, m8 mova m3, m8 mova m5, m8 pshufb m20, m0, m16 pshufb m21, m0, m17 pshufb m22, m0, m18 pshufb m23, m1, m16 pshufb m6, m1, m17 pshufb m7, m1, m18 vpdpbusd m2, m20, m10 vpdpbusd m4, m21, m10 vpdpbusd m2, m21, m11 vpdpbusd m4, m22, m11 vpdpbusd m3, m23, m10 vpdpbusd m5, m6, m10 vpdpbusd m3, m6, m11 vpdpbusd m5, m7, m11 
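    ; Rows 0-6 (loaded as "_ 0 1 2" / "3 4 5 6") are now filtered horizontally;
    ; below they are packed back to words, shifted right by 2 (rounded via the
    ; pd_2 bias in m8), then realigned with valignq and interleaved into the
    ; sliding row-pair registers (01 12 23 34 in m4/m5, 23 34 45 56 in m6/m7)
    ; that the vpdpwssd vertical pass in .hv_w8_loop consumes, accumulating
    ; into pd_32 (m9) ahead of the final psrad by 6.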
packssdw m2, m4 packssdw m3, m5 psraw m2, 2 ; _ 0 1 2 psraw m3, 2 ; 3 4 5 6 valignq m0, m3, m2, 2 ; 0 1 2 3 valignq m1, m3, m2, 4 ; 1 2 3 4 valignq m2, m3, m2, 6 ; 2 3 4 5 punpcklwd m4, m0, m1 ; 01a 12a 23a 34a punpckhwd m5, m0, m1 ; 01b 12b 23b 34b punpcklwd m6, m2, m3 ; 23a 34a 45a 56a punpckhwd m7, m2, m3 ; 23b 34b 45b 56b .hv_w8_loop: movu xm19, [srcq+stride3q ] lea srcq, [srcq+strideq*4] vinserti128 ym19, [srcq+strideq*0], 1 vinserti128 m19, [srcq+strideq*1], 2 vinserti128 m19, [srcq+strideq*2], 3 mova m20, m9 mova m21, m9 mova m22, m8 mova m23, m8 vpdpwssd m20, m4, m12 vpdpwssd m21, m5, m12 vpdpwssd m20, m6, m13 vpdpwssd m21, m7, m13 pshufb m0, m19, m16 pshufb m1, m19, m17 pshufb m2, m19, m18 vpdpbusd m22, m0, m10 vpdpbusd m23, m1, m10 vpdpbusd m22, m1, m11 vpdpbusd m23, m2, m11 packssdw m22, m23 psraw m22, 2 ; 7 8 9 A valignq m0, m22, m3, 2 ; 4 5 6 7 valignq m1, m22, m3, 4 ; 5 6 7 8 valignq m2, m22, m3, 6 ; 6 7 8 9 mova m3, m22 punpcklwd m4, m0, m1 ; 45a 56a 67a 78a punpckhwd m5, m0, m1 ; 45b 56b 67b 78b punpcklwd m6, m2, m3 ; 67a 78a 89a 9Aa punpckhwd m7, m2, m3 ; 67b 78b 89b 9Ab vpdpwssd m20, m4, m14 vpdpwssd m21, m5, m14 vpdpwssd m20, m6, m15 vpdpwssd m21, m7, m15 psrad m20, 6 psrad m21, 6 packssdw m20, m21 mova [tmpq], m20 add tmpq, 64 sub hd, 4 jg .hv_w8_loop RET .hv_w16: mov wd, 16*2 jmp .hv_start .hv_w32: mov wd, 32*2 jmp .hv_start .hv_w64: mov wd, 64*2 jmp .hv_start .hv_w128: mov wd, 128*2 .hv_start: WIN64_SPILL_XMM 31 mova m16, [spel_h_perm16a] mova m17, [spel_h_perm16b] mova m18, [spel_h_perm16c] lea r6d, [hq+wq*8-256] mov r5, srcq mov r7, tmpq .hv_loop0: movu ym0, [srcq+strideq*0] vinserti32x8 m0, [srcq+strideq*1], 1 lea srcq, [srcq+strideq*2] movu ym1, [srcq+strideq*0] vinserti32x8 m1, [srcq+strideq*1], 1 lea srcq, [srcq+strideq*2] movu ym2, [srcq+strideq*0] vinserti32x8 m2, [srcq+strideq*1], 1 lea srcq, [srcq+strideq*2] movu ym3, [srcq+strideq*0] mova m4, m8 mova m5, m8 mova m6, m8 mova m7, m8 vpermb m19, m16, m0 vpermb m20, m17, m0 vpermb m21, m18, m0 vpermb m22, m16, m1 vpermb m23, m17, m1 vpermb m24, m18, m1 vpermb m25, m16, m2 vpermb m26, m17, m2 vpermb m27, m18, m2 vpermb ym28, ym16, ym3 vpermb ym29, ym17, ym3 vpermb ym30, ym18, ym3 mova m0, m8 mova m1, m8 mova ym2, ym8 mova ym3, ym8 vpdpbusd m4, m19, m10 vpdpbusd m5, m20, m10 vpdpbusd m6, m22, m10 vpdpbusd m7, m23, m10 vpdpbusd m0, m25, m10 vpdpbusd m1, m26, m10 vpdpbusd ym2, ym28, ym10 vpdpbusd ym3, ym29, ym10 vpdpbusd m4, m20, m11 vpdpbusd m5, m21, m11 vpdpbusd m6, m23, m11 vpdpbusd m7, m24, m11 vpdpbusd m0, m26, m11 vpdpbusd m1, m27, m11 vpdpbusd ym2, ym29, ym11 vpdpbusd ym3, ym30, ym11 packssdw m4, m5 packssdw m6, m7 packssdw m0, m1 packssdw ym2, ym3 psraw m4, 2 ; 0a 0b 1a 1b psraw m6, 2 ; 2a 2b 3a 3b psraw m0, 2 ; 4a 4b 5a 5b psraw ym2, 2 ; 6a 6b __ __ vshufi32x4 m5, m4, m6, q1032 ; 1a 1b 2a 2b vshufi32x4 m7, m6, m0, q1032 ; 3a 3b 4a 4b vshufi32x4 m1, m0, m2, q1032 ; 5a 5b 6a 6b punpcklwd m2, m4, m5 ; 01a 01c 12a 12c punpckhwd m3, m4, m5 ; 01b 01d 12b 12d punpcklwd m4, m6, m7 ; 23a 23c 34a 34c punpckhwd m5, m6, m7 ; 23b 23d 34b 34d punpcklwd m6, m0, m1 ; 45a 45c 56a 56c punpckhwd m7, m0, m1 ; 45b 45d 56b 56d .hv_loop: movu ym19, [srcq+strideq*1] lea srcq, [srcq+strideq*2] vinserti32x8 m19, [srcq+strideq*0], 1 mova m20, m9 mova m21, m9 mova m22, m8 mova m23, m8 vpdpwssd m20, m2, m12 vpdpwssd m21, m3, m12 vpdpwssd m20, m4, m13 vpdpwssd m21, m5, m13 vpermb m24, m16, m19 vpermb m25, m17, m19 vpermb m26, m18, m19 vpdpbusd m22, m24, m10 vpdpbusd m23, m25, m10 vpdpbusd m22, m25, m11 vpdpbusd m23, m26, m11 packssdw 
m22, m23 psraw m22, 2 ; 7a 7b 8a 8b vshufi32x4 m0, m1, m22, q1032 ; 6a 6b 7a 7b mova m2, m4 mova m3, m5 mova m1, m22 mova m4, m6 mova m5, m7 punpcklwd m6, m0, m1 ; 67a 67c 78a 78c punpckhwd m7, m0, m1 ; 67b 67d 78b 78d vpdpwssd m20, m4, m14 vpdpwssd m21, m5, m14 vpdpwssd m20, m6, m15 vpdpwssd m21, m7, m15 psrad m20, 6 psrad m21, 6 packssdw m20, m21 mova [tmpq+wq*0], ym20 vextracti32x8 [tmpq+wq*1], m20, 1 lea tmpq, [tmpq+wq*2] sub hd, 2 jg .hv_loop add r5, 16 add r7, 32 movzx hd, r6b mov srcq, r5 mov tmpq, r7 sub r6d, 1<<8 jg .hv_loop0 RET cglobal warp_affine_8x8t_8bpc, 4, 7, 22, tmp, ts vpbroadcastd m9, [pd_16384] mova ym15, [warp_8x8t_end] call mangle(private_prefix %+ _warp_affine_8x8_8bpc_avx512icl).main jmp .start .loop: call mangle(private_prefix %+ _warp_affine_8x8_8bpc_avx512icl).main2 lea tmpq, [tmpq+tsq*4] .start: paddd m16, m16 vpermb m16, m15, m16 mova [tmpq+tsq*0], xm16 vextracti128 [tmpq+tsq*2], ym16, 1 sub r6d, 0x1800 jg .loop RET cglobal warp_affine_8x8_8bpc, 4, 7, 22, dst, ds, src, ss, abcd, filter vpbroadcastd m9, [pd_262144] mova xm15, [warp_8x8_end] call .main jmp .start .loop: call .main2 lea dstq, [dstq+dsq*2] .start: psrad m16, 19 packuswb m16, m16 vpermb m16, m15, m16 movq [dstq+dsq*0], xm16 movhps [dstq+dsq*1], xm16 sub r6d, 0x1800 jg .loop RET ALIGN function_align .main: vpbroadcastd m1, [pd_512] %if WIN64 mov abcdq, r5mp vpaddd ym18, ym1, r6m {1to8} ; mx %else add r5d, 512 vpbroadcastd ym18, r5d %endif vpaddd ym20, ym1, r7m {1to8} ; my mova ym16, [pd_0to7] vpbroadcastd ym19, [abcdq+4*0] vpbroadcastd ym21, [abcdq+4*1] lea r4, [ssq*3+3] mova m10, [warp_8x8_permA] mov r6d, 0x5555 mova m11, [warp_8x8_permB] lea filterq, [mc_warp_filter+64*8] vpbroadcastq m12, [warp_8x8_hpack] sub srcq, r4 ; src -= src_stride*3 + 3 vbroadcasti32x4 m13, [warp_8x8_permC] kxnorb k2, k2, k2 vbroadcasti32x4 m14, [warp_8x8_permD] vpdpwssd ym18, ym19, ym16 ; alpha vpdpwssd ym20, ym21, ym16 ; gamma vbroadcasti32x4 m0, [srcq] psrad ym19, 16 ; beta psrad ym21, 16 ; delta kmovw k1, r6d psrad ym16, ym18, 10 kmovb k3, k2 paddd ym18, ym19 vpgatherdq m2{k2}, [filterq+ym16*8] ; filter_x0 psrld m1, 8 ; pd_2 pshufb m0, m11 paddd m8, m1, m1 ; pd_4 vpdpbusd m1, m0, m2 call .h psllq m2, m1, 45 pslld m1, 13 paddd m1, m2 vpshrdq m1, m0, 48 ; 01 12 call .h vpshrdq m2, m1, m0, 48 ; 23 34 call .h vpshrdq m3, m2, m0, 48 ; 45 56 .main2: call .h psrad ym17, ym20, 10 kmovb k2, k3 paddd ym20, ym21 vpgatherdq m7{k3}, [filterq+ym17*8] ; filter_y0 psrad ym16, ym20, 10 kmovb k3, k2 paddd ym20, ym21 vpgatherdq m17{k2}, [filterq+ym16*8] ; filter_y1 shufps m5, m7, m17, q2020 ; a0 a1 a2 a3 b0 b1 b2 b3 A0 A1 A2 A3 B0 B1 B2 B3 mova m16, m9 pshufb m4, m5, m13 ; a0 a1 A0 A1 b0 b1 B0 B1 vpdpwssd m16, m1, m4 pshufb m5, m14 ; a2 a3 A2 A3 b2 b3 B2 B3 mova m1, m2 vpdpwssd m16, m2, m5 shufps m5, m7, m17, q3131 ; a4 a5 a6 a7 b4 b5 b6 b7 A4 A5 A6 A7 B4 B5 B6 B7 mova m2, m3 pshufb m4, m5, m13 ; a4 a5 A4 A5 b4 b5 B4 B5 vpdpwssd m16, m3, m4 vpshrdq m3, m0, 48 ; 67 78 pshufb m5, m14 ; a6 a7 A6 A7 b6 b7 B6 B7 vpdpwssd m16, m3, m5 ret ALIGN function_align .h: movu xm5, [srcq+ssq*1] psrad ym16, ym18, 10 lea srcq, [srcq+ssq*2] vinserti32x4 ym5, [srcq+ssq*0], 1 kmovb k2, k3 paddd ym18, ym19 vpgatherdq m6{k3}, [filterq+ym16*8] ; filter_x1 psrad ym17, ym18, 10 kmovb k3, k2 paddd ym18, ym19 vpgatherdq m16{k2}, [filterq+ym17*8] ; filter_x2 mova m0, m8 vpermb m4, m10, m5 ; a4 b0 a5 b1 a6 b2 a7 b3 a8 b4 a9 b5 aa b6 ab b7 vpshldq m17, m16, m6, 32 ; a4 a5 a6 a7 b0 b1 b2 b3 vpdpbusd m0, m4, m17 vpermb m5, m11, m5 ; a0 b4 a1 b5 a2 b6 a3 b7 a4 b8 a5 b9 a6 
ba a7 bb vmovdqa32 m16{k1}, m6 ; a0 a1 a2 a3 b4 b5 b6 b7 vpdpbusd m0, m5, m16 vpmultishiftqb m0, m12, m0 ; 1 1 2 2 (>> 3) ret %macro BIDIR_FN 1 ; op lea stride3q, [strideq*3] jmp wq .w4: cmp hd, 8 jg .w4_h16 WRAP_YMM %1 0 vextracti32x4 xm1, ym0, 1 movd [dstq ], xm0 pextrd [dstq+strideq*1], xm0, 1 movd [dstq+strideq*2], xm1 pextrd [dstq+stride3q ], xm1, 1 jl .w4_ret lea dstq, [dstq+strideq*4] pextrd [dstq ], xm0, 2 pextrd [dstq+strideq*1], xm0, 3 pextrd [dstq+strideq*2], xm1, 2 pextrd [dstq+stride3q ], xm1, 3 .w4_ret: RET .w4_h16: vpbroadcastd m7, strided pmulld m7, [bidir_sctr_w4] %1 0 kxnorw k1, k1, k1 vpscatterdd [dstq+m7]{k1}, m0 RET .w8: cmp hd, 4 jne .w8_h8 WRAP_YMM %1 0 vextracti32x4 xm1, ym0, 1 movq [dstq ], xm0 movq [dstq+strideq*1], xm1 movhps [dstq+strideq*2], xm0 movhps [dstq+stride3q ], xm1 RET .w8_loop: %1_INC_PTR 2 lea dstq, [dstq+strideq*4] .w8_h8: %1 0 vextracti32x4 xm1, ym0, 1 vextracti32x4 xm2, m0, 2 vextracti32x4 xm3, m0, 3 movq [dstq ], xm0 movq [dstq+strideq*1], xm1 movq [dstq+strideq*2], xm2 movq [dstq+stride3q ], xm3 lea dstq, [dstq+strideq*4] movhps [dstq ], xm0 movhps [dstq+strideq*1], xm1 movhps [dstq+strideq*2], xm2 movhps [dstq+stride3q ], xm3 sub hd, 8 jg .w8_loop RET .w16_loop: %1_INC_PTR 2 lea dstq, [dstq+strideq*4] .w16: %1 0 vpermq m0, m0, q3120 mova [dstq ], xm0 vextracti32x4 [dstq+strideq*1], m0, 2 vextracti32x4 [dstq+strideq*2], ym0, 1 vextracti32x4 [dstq+stride3q ], m0, 3 sub hd, 4 jg .w16_loop RET .w32: pmovzxbq m7, [pb_02461357] .w32_loop: %1 0 %1_INC_PTR 2 vpermq m0, m7, m0 mova [dstq+strideq*0], ym0 vextracti32x8 [dstq+strideq*1], m0, 1 lea dstq, [dstq+strideq*2] sub hd, 2 jg .w32_loop RET .w64: pmovzxbq m7, [pb_02461357] .w64_loop: %1 0 %1_INC_PTR 2 vpermq m0, m7, m0 mova [dstq], m0 add dstq, strideq dec hd jg .w64_loop RET .w128: pmovzxbq m7, [pb_02461357] .w128_loop: %1 0 vpermq m6, m7, m0 %1 2 mova [dstq+64*0], m6 %1_INC_PTR 4 vpermq m6, m7, m0 mova [dstq+64*1], m6 add dstq, strideq dec hd jg .w128_loop RET %endmacro %macro AVG 1 ; src_offset mova m0, [tmp1q+(%1+0)*mmsize] paddw m0, [tmp2q+(%1+0)*mmsize] mova m1, [tmp1q+(%1+1)*mmsize] paddw m1, [tmp2q+(%1+1)*mmsize] pmulhrsw m0, m4 pmulhrsw m1, m4 packuswb m0, m1 %endmacro %macro AVG_INC_PTR 1 add tmp1q, %1*mmsize add tmp2q, %1*mmsize %endmacro cglobal avg_8bpc, 4, 7, 5, dst, stride, tmp1, tmp2, w, h, stride3 %define base r6-avg_avx512icl_table lea r6, [avg_avx512icl_table] tzcnt wd, wm movifnidn hd, hm movsxd wq, dword [r6+wq*4] vpbroadcastd m4, [base+pw_1024] add wq, r6 BIDIR_FN AVG %macro W_AVG 1 ; src_offset ; (a * weight + b * (16 - weight) + 128) >> 8 ; = ((a - b) * weight + (b << 4) + 128) >> 8 ; = ((((a - b) * ((weight-16) << 12)) >> 16) + a + 8) >> 4 ; = ((((b - a) * (-weight << 12)) >> 16) + b + 8) >> 4 mova m0, [tmp1q+(%1+0)*mmsize] psubw m2, m0, [tmp2q+(%1+0)*mmsize] mova m1, [tmp1q+(%1+1)*mmsize] psubw m3, m1, [tmp2q+(%1+1)*mmsize] pmulhw m2, m4 pmulhw m3, m4 paddw m0, m2 paddw m1, m3 pmulhrsw m0, m5 pmulhrsw m1, m5 packuswb m0, m1 %endmacro %define W_AVG_INC_PTR AVG_INC_PTR cglobal w_avg_8bpc, 4, 7, 6, dst, stride, tmp1, tmp2, w, h, stride3 %define base r6-w_avg_avx512icl_table lea r6, [w_avg_avx512icl_table] tzcnt wd, wm movifnidn hd, hm vpbroadcastw m4, r6m ; weight movsxd wq, dword [r6+wq*4] vpbroadcastd m5, [base+pw_2048] psllw m4, 12 ; (weight-16) << 12 when interpreted as signed add wq, r6 cmp dword r6m, 7 jg .weight_gt7 mov r6, tmp1q pxor m0, m0 mov tmp1q, tmp2q psubw m4, m0, m4 ; -weight mov tmp2q, r6 .weight_gt7: BIDIR_FN W_AVG %macro MASK 1 ; src_offset ; (a * m + b * (64 
- m) + 512) >> 10 ; = ((a - b) * m + (b << 6) + 512) >> 10 ; = ((((b - a) * (-m << 10)) >> 16) + b + 8) >> 4 %if mmsize == 64 vpermq m3, m8, [maskq+%1*32] %else vpermq m3, [maskq+%1*16], q3120 %endif mova m0, [tmp2q+(%1+0)*mmsize] psubw m1, m0, [tmp1q+(%1+0)*mmsize] psubb m3, m4, m3 paddw m1, m1 ; (b - a) << 1 paddb m3, m3 punpcklbw m2, m4, m3 ; -m << 9 pmulhw m1, m2 paddw m0, m1 mova m1, [tmp2q+(%1+1)*mmsize] psubw m2, m1, [tmp1q+(%1+1)*mmsize] paddw m2, m2 punpckhbw m3, m4, m3 pmulhw m2, m3 paddw m1, m2 pmulhrsw m0, m5 pmulhrsw m1, m5 packuswb m0, m1 %endmacro %macro MASK_INC_PTR 1 add maskq, %1*32 add tmp2q, %1*64 add tmp1q, %1*64 %endmacro cglobal mask_8bpc, 4, 8, 6, dst, stride, tmp1, tmp2, w, h, mask, stride3 %define base r7-mask_avx512icl_table lea r7, [mask_avx512icl_table] tzcnt wd, wm movifnidn hd, hm mov maskq, maskmp movsxd wq, dword [r7+wq*4] pxor m4, m4 mova m8, [base+bilin_v_perm64] vpbroadcastd m5, [base+pw_2048] add wq, r7 BIDIR_FN MASK %macro W_MASK 4-5 0 ; dst, mask, tmp_offset[1-2], 4:4:4 mova m%1, [tmp1q+mmsize*%3] mova m1, [tmp2q+mmsize*%3] psubw m1, m%1 pabsw m%2, m1 psubusw m%2, m6, m%2 psrlw m%2, 8 ; 64 - m psllw m2, m%2, 10 pmulhw m1, m2 paddw m%1, m1 mova m1, [tmp1q+mmsize*%4] mova m2, [tmp2q+mmsize*%4] psubw m2, m1 pabsw m3, m2 psubusw m3, m6, m3 vpshldw m%2, m3, 8 psllw m3, m%2, 10 %if %5 psubb m%2, m5, m%2 %endif pmulhw m2, m3 paddw m1, m2 pmulhrsw m%1, m7 pmulhrsw m1, m7 packuswb m%1, m1 %endmacro cglobal w_mask_420_8bpc, 4, 8, 16, dst, stride, tmp1, tmp2, w, h, mask, stride3 %define base r7-w_mask_420_avx512icl_table lea r7, [w_mask_420_avx512icl_table] tzcnt wd, wm mov r6d, r7m ; sign movifnidn hd, hm movsxd wq, [r7+wq*4] vpbroadcastd m6, [base+pw_6903] ; ((64 - 38) << 8) + 255 - 8 vpbroadcastd m7, [base+pw_2048] vpbroadcastd m9, [base+pb_m64] ; -1 << 6 mova ym10, [base+wm_420_mask+32] vpbroadcastd m8, [base+wm_sign+r6*8] ; (258 - sign) << 6 add wq, r7 mov maskq, maskmp lea stride3q, [strideq*3] jmp wq .w4: mova m5, [wm_420_perm4] cmp hd, 8 jg .w4_h16 WRAP_YMM W_MASK 0, 4, 0, 1 vinserti128 ym5, [wm_420_perm4+32], 1 vpermb ym4, ym5, ym4 vpdpbusd ym8, ym4, ym9 vextracti32x4 xm1, m0, 1 movd [dstq+strideq*0], xm0 pextrd [dstq+strideq*1], xm0, 1 movd [dstq+strideq*2], xm1 pextrd [dstq+stride3q ], xm1, 1 jl .w4_end lea dstq, [dstq+strideq*4] pextrd [dstq+strideq*0], xm0, 2 pextrd [dstq+strideq*1], xm0, 3 pextrd [dstq+strideq*2], xm1, 2 pextrd [dstq+stride3q ], xm1, 3 .w4_end: vpermb ym8, ym10, ym8 movq [maskq], xm8 RET .w4_h16: vpbroadcastd m11, strided pmulld m11, [bidir_sctr_w4] W_MASK 0, 4, 0, 1 vpermb m4, m5, m4 vpdpbusd m8, m4, m9 kxnorw k1, k1, k1 vpermb m8, m10, m8 mova [maskq], xm8 vpscatterdd [dstq+m11]{k1}, m0 RET .w8: mova m5, [wm_420_perm8] cmp hd, 4 jne .w8_h8 WRAP_YMM W_MASK 0, 4, 0, 1 vinserti128 ym5, [wm_420_perm8+32], 1 vpermb ym4, ym5, ym4 vpdpbusd ym8, ym4, ym9 vpermb m8, m10, m8 mova [maskq], xm8 vextracti32x4 xm1, ym0, 1 movq [dstq+strideq*0], xm0 movq [dstq+strideq*1], xm1 movhps [dstq+strideq*2], xm0 movhps [dstq+stride3q ], xm1 RET .w8_loop: add tmp1q, 128 add tmp2q, 128 add maskq, 16 lea dstq, [dstq+strideq*4] .w8_h8: W_MASK 0, 4, 0, 1 vpermb m4, m5, m4 mova m1, m8 vpdpbusd m1, m4, m9 vpermb m1, m10, m1 mova [maskq], xm1 vextracti32x4 xm1, ym0, 1 vextracti32x4 xm2, m0, 2 vextracti32x4 xm3, m0, 3 movq [dstq+strideq*0], xm0 movq [dstq+strideq*1], xm1 movq [dstq+strideq*2], xm2 movq [dstq+stride3q ], xm3 lea dstq, [dstq+strideq*4] movhps [dstq+strideq*0], xm0 movhps [dstq+strideq*1], xm1 movhps [dstq+strideq*2], xm2 movhps [dstq+stride3q ], 
xm3 sub hd, 8 jg .w8_loop RET .w16: mova m5, [wm_420_perm16] .w16_loop: W_MASK 0, 4, 0, 1 vpermb m4, m5, m4 mova m1, m8 vpdpbusd m1, m4, m9 add tmp1q, 128 add tmp2q, 128 vpermb m1, m10, m1 vpermq m0, m0, q3120 mova [maskq], xm1 add maskq, 16 mova [dstq+strideq*0], xm0 vextracti32x4 [dstq+strideq*1], m0, 2 vextracti32x4 [dstq+strideq*2], ym0, 1 vextracti32x4 [dstq+stride3q ], m0, 3 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w16_loop RET .w32: pmovzxbq m5, [pb_02461357] .w32_loop: W_MASK 0, 4, 0, 1 mova m1, m8 vpdpbusd m1, m4, m9 add tmp1q, 128 add tmp2q, 128 vpermb m1, m10, m1 vpermq m0, m5, m0 mova [maskq], xm1 add maskq, 16 mova [dstq+strideq*0], ym0 vextracti32x8 [dstq+strideq*1], m0, 1 lea dstq, [dstq+strideq*2] sub hd, 2 jg .w32_loop RET .w64: pmovzxbq m12, [wm_420_perm64] ; 0, 2, 4, 6, 8, 10, 12, 14 psrlq m13, m12, 4 ; 1, 3, 5, 7, 9, 11, 13, 15 .w64_loop: W_MASK 0, 4, 0, 2 W_MASK 11, 5, 1, 3 mova m2, m8 vpdpbusd m2, m4, m9 mova m3, m8 vpdpbusd m3, m5, m9 add tmp1q, 256 add tmp2q, 256 vpermt2b m2, m10, m3 mova m1, m0 vpermt2q m0, m12, m11 vpermt2q m1, m13, m11 mova [maskq], ym2 add maskq, 32 mova [dstq+strideq*0], m0 mova [dstq+strideq*1], m1 lea dstq, [dstq+strideq*2] sub hd, 2 jg .w64_loop RET .w128: pmovzxbq m14, [wm_420_perm64] mova m10, [wm_420_mask] psrlq m15, m14, 4 .w128_loop: W_MASK 0, 12, 0, 4 W_MASK 11, 13, 1, 5 mova m4, m8 vpdpbusd m4, m12, m9 mova m5, m8 vpdpbusd m5, m13, m9 mova m1, m0 vpermt2q m0, m14, m11 vpermt2q m1, m15, m11 mova [dstq+strideq*0+64*0], m0 mova [dstq+strideq*1+64*0], m1 W_MASK 0, 12, 2, 6 W_MASK 11, 13, 3, 7 vprold m4, 16 vprold m5, 16 vpdpbusd m4, m12, m9 vpdpbusd m5, m13, m9 add tmp1q, 512 add tmp2q, 512 vpermt2b m4, m10, m5 mova m1, m0 vpermt2q m0, m14, m11 vpermt2q m1, m15, m11 mova [maskq], m4 add maskq, 64 mova [dstq+strideq*0+64*1], m0 mova [dstq+strideq*1+64*1], m1 lea dstq, [dstq+strideq*2] sub hd, 2 jg .w128_loop RET cglobal w_mask_422_8bpc, 4, 8, 14, dst, stride, tmp1, tmp2, w, h, mask, stride3 %define base r7-w_mask_422_avx512icl_table lea r7, [w_mask_422_avx512icl_table] tzcnt wd, wm mov r6d, r7m ; sign movifnidn hd, hm movsxd wq, dword [r7+wq*4] vpbroadcastd m6, [base+pw_6903] ; ((64 - 38) << 8) + 255 - 8 vpbroadcastd m7, [base+pw_2048] vpbroadcastd m9, [base+pw_m128] mova m10, [base+wm_422_mask] vpbroadcastd m11, [base+pb_127] add wq, r7 vpbroadcastd m8, [base+wm_sign+4+r6*4] mov maskq, maskmp lea stride3q, [strideq*3] jmp wq .w4: cmp hd, 8 jg .w4_h16 WRAP_YMM W_MASK 0, 4, 0, 1 movhps xm10, [wm_422_mask+16] vpdpwssd ym8, ym4, ym9 vpermb ym8, ym10, ym8 vextracti32x4 xm1, m0, 1 movd [dstq+strideq*0], xm0 pextrd [dstq+strideq*1], xm0, 1 movd [dstq+strideq*2], xm1 pextrd [dstq+stride3q ], xm1, 1 jl .w4_end lea dstq, [dstq+strideq*4] pextrd [dstq+strideq*0], xm0, 2 pextrd [dstq+strideq*1], xm0, 3 pextrd [dstq+strideq*2], xm1, 2 pextrd [dstq+stride3q ], xm1, 3 .w4_end: pand xm8, xm11 mova [maskq], xm8 RET .w4_h16: vpbroadcastd m5, strided pmulld m5, [bidir_sctr_w4] W_MASK 0, 4, 0, 1 vpdpwssd m8, m4, m9 kxnorw k1, k1, k1 vpermb m8, m10, m8 pand ym8, ym11 mova [maskq], ym8 vpscatterdd [dstq+m5]{k1}, m0 RET .w8: cmp hd, 4 jne .w8_h8 WRAP_YMM W_MASK 0, 4, 0, 1 movhps xm10, [wm_422_mask+16] vpdpwssd ym8, ym4, ym9 vpermb ym8, ym10, ym8 pand xm8, xm11 mova [maskq], xm8 vextracti32x4 xm1, ym0, 1 movq [dstq+strideq*0], xm0 movq [dstq+strideq*1], xm1 movhps [dstq+strideq*2], xm0 movhps [dstq+stride3q ], xm1 RET .w8_loop: add tmp1q, 128 add tmp2q, 128 add maskq, 32 lea dstq, [dstq+strideq*4] .w8_h8: W_MASK 0, 4, 0, 1 mova m1, m8 vpdpwssd m1, m4, m9 vpermb 
m1, m10, m1 pand ym1, ym11 mova [maskq], ym1 vextracti32x4 xm1, ym0, 1 vextracti32x4 xm2, m0, 2 vextracti32x4 xm3, m0, 3 movq [dstq+strideq*0], xm0 movq [dstq+strideq*1], xm1 movq [dstq+strideq*2], xm2 movq [dstq+stride3q ], xm3 lea dstq, [dstq+strideq*4] movhps [dstq+strideq*0], xm0 movhps [dstq+strideq*1], xm1 movhps [dstq+strideq*2], xm2 movhps [dstq+stride3q ], xm3 sub hd, 8 jg .w8_loop RET .w16_loop: add tmp1q, 128 add tmp2q, 128 add maskq, 32 lea dstq, [dstq+strideq*4] .w16: W_MASK 0, 4, 0, 1 mova m1, m8 vpdpwssd m1, m4, m9 vpermb m1, m10, m1 vpermq m0, m0, q3120 pand ym1, ym11 mova [maskq], ym1 mova [dstq+strideq*0], xm0 vextracti32x4 [dstq+strideq*1], m0, 2 vextracti32x4 [dstq+strideq*2], ym0, 1 vextracti32x4 [dstq+stride3q ], m0, 3 sub hd, 4 jg .w16_loop RET .w32: pmovzxbq m5, [pb_02461357] .w32_loop: W_MASK 0, 4, 0, 1 mova m1, m8 vpdpwssd m1, m4, m9 add tmp1q, 128 add tmp2q, 128 vpermb m1, m10, m1 vpermq m0, m5, m0 pand ym1, ym11 mova [maskq], ym1 add maskq, 32 mova [dstq+strideq*0], ym0 vextracti32x8 [dstq+strideq*1], m0, 1 lea dstq, [dstq+strideq*2] sub hd, 2 jg .w32_loop RET .w64: pmovzxbq m5, [pb_02461357] .w64_loop: W_MASK 0, 4, 0, 1 mova m1, m8 vpdpwssd m1, m4, m9 add tmp1q, 128 add tmp2q, 128 vpermb m1, m10, m1 vpermq m0, m5, m0 pand ym1, ym11 mova [maskq], ym1 add maskq, 32 mova [dstq], m0 add dstq, strideq dec hd jg .w64_loop RET .w128: pmovzxbq m13, [pb_02461357] .w128_loop: W_MASK 0, 4, 0, 1 W_MASK 12, 5, 2, 3 mova m2, m8 vpdpwssd m2, m4, m9 mova m3, m8 vpdpwssd m3, m5, m9 add tmp1q, 256 add tmp2q, 256 vpermt2b m2, m10, m3 vpermq m0, m13, m0 vpermq m1, m13, m12 pand m2, m11 mova [maskq], m2 add maskq, 64 mova [dstq+64*0], m0 mova [dstq+64*1], m1 add dstq, strideq dec hd jg .w128_loop RET cglobal w_mask_444_8bpc, 4, 8, 12, dst, stride, tmp1, tmp2, w, h, mask, stride3 %define base r7-w_mask_444_avx512icl_table lea r7, [w_mask_444_avx512icl_table] tzcnt wd, wm movifnidn hd, hm movsxd wq, dword [r7+wq*4] vpbroadcastd m6, [base+pw_6903] ; ((64 - 38) << 8) + 255 - 8 vpbroadcastd m5, [base+pb_64] vpbroadcastd m7, [base+pw_2048] mova m8, [base+wm_444_mask] add wq, r7 mov maskq, maskmp lea stride3q, [strideq*3] jmp wq .w4: cmp hd, 8 jg .w4_h16 WRAP_YMM W_MASK 0, 4, 0, 1, 1 vinserti128 ym8, [wm_444_mask+32], 1 vpermb ym4, ym8, ym4 mova [maskq], ym4 vextracti32x4 xm1, m0, 1 movd [dstq+strideq*0], xm0 pextrd [dstq+strideq*1], xm0, 1 movd [dstq+strideq*2], xm1 pextrd [dstq+stride3q ], xm1, 1 jl .w4_end lea dstq, [dstq+strideq*4] pextrd [dstq+strideq*0], xm0, 2 pextrd [dstq+strideq*1], xm0, 3 pextrd [dstq+strideq*2], xm1, 2 pextrd [dstq+stride3q ], xm1, 3 .w4_end: RET .w4_h16: vpbroadcastd m9, strided pmulld m9, [bidir_sctr_w4] W_MASK 0, 4, 0, 1, 1 vpermb m4, m8, m4 kxnorw k1, k1, k1 mova [maskq], m4 vpscatterdd [dstq+m9]{k1}, m0 RET .w8: cmp hd, 4 jne .w8_h8 WRAP_YMM W_MASK 0, 4, 0, 1, 1 vinserti128 ym8, [wm_444_mask+32], 1 vpermb ym4, ym8, ym4 mova [maskq], ym4 vextracti32x4 xm1, ym0, 1 movq [dstq+strideq*0], xm0 movq [dstq+strideq*1], xm1 movhps [dstq+strideq*2], xm0 movhps [dstq+stride3q ], xm1 RET .w8_loop: add tmp1q, 128 add tmp2q, 128 add maskq, 64 lea dstq, [dstq+strideq*4] .w8_h8: W_MASK 0, 4, 0, 1, 1 vpermb m4, m8, m4 mova [maskq], m4 vextracti32x4 xm1, ym0, 1 vextracti32x4 xm2, m0, 2 vextracti32x4 xm3, m0, 3 movq [dstq+strideq*0], xm0 movq [dstq+strideq*1], xm1 movq [dstq+strideq*2], xm2 movq [dstq+stride3q ], xm3 lea dstq, [dstq+strideq*4] movhps [dstq+strideq*0], xm0 movhps [dstq+strideq*1], xm1 movhps [dstq+strideq*2], xm2 movhps [dstq+stride3q ], xm3 sub hd, 8 jg 
.w8_loop RET .w16_loop: add tmp1q, 128 add tmp2q, 128 add maskq, 64 lea dstq, [dstq+strideq*4] .w16: W_MASK 0, 4, 0, 1, 1 vpermb m4, m8, m4 vpermq m0, m0, q3120 mova [maskq], m4 mova [dstq+strideq*0], xm0 vextracti32x4 [dstq+strideq*1], m0, 2 vextracti32x4 [dstq+strideq*2], ym0, 1 vextracti32x4 [dstq+stride3q ], m0, 3 sub hd, 4 jg .w16_loop RET .w32: pmovzxbq m9, [pb_02461357] .w32_loop: W_MASK 0, 4, 0, 1, 1 vpermb m4, m8, m4 add tmp1q, 128 add tmp2q, 128 vpermq m0, m9, m0 mova [maskq], m4 add maskq, 64 mova [dstq+strideq*0], ym0 vextracti32x8 [dstq+strideq*1], m0, 1 lea dstq, [dstq+strideq*2] sub hd, 2 jg .w32_loop RET .w64: pmovzxbq m9, [pb_02461357] .w64_loop: W_MASK 0, 4, 0, 1, 1 vpermb m4, m8, m4 add tmp1q, 128 add tmp2q, 128 vpermq m0, m9, m0 mova [maskq], m4 add maskq, 64 mova [dstq], m0 add dstq, strideq dec hd jg .w64_loop RET .w128: pmovzxbq m11, [pb_02461357] .w128_loop: W_MASK 0, 4, 0, 1, 1 W_MASK 10, 9, 2, 3, 1 vpermb m4, m8, m4 vpermb m9, m8, m9 add tmp1q, 256 add tmp2q, 256 vpermq m0, m11, m0 vpermq m10, m11, m10 mova [maskq+64*0], m4 mova [maskq+64*1], m9 add maskq, 128 mova [dstq+64*0], m0 mova [dstq+64*1], m10 add dstq, strideq dec hd jg .w128_loop RET cglobal blend_8bpc, 3, 7, 8, dst, ds, tmp, w, h, mask %define base r6-blend_avx512icl_table lea r6, [blend_avx512icl_table] tzcnt wd, wm movifnidn maskq, maskmp movifnidn hd, hm movsxd wq, [r6+wq*4] vpbroadcastd m6, [base+pb_64] vpbroadcastd m7, [base+pw_512] sub tmpq, maskq add wq, r6 lea r6, [dsq*3] jmp wq .w4: movd xmm0, [dstq+dsq*0] pinsrd xmm0, [dstq+dsq*1], 1 vpbroadcastd xmm1, [dstq+dsq*2] pinsrd xmm1, [dstq+r6 ], 3 mova xmm4, [maskq] mova xmm5, [maskq+tmpq] add maskq, 4*4 psubb xmm3, xm6, xmm4 punpcklbw xmm0, xmm5 punpcklbw xmm2, xmm3, xmm4 punpckhbw xmm1, xmm5 punpckhbw xmm3, xmm4 pmaddubsw xmm0, xmm2 pmaddubsw xmm1, xmm3 pmulhrsw xmm0, xm7 pmulhrsw xmm1, xm7 packuswb xmm0, xmm1 movd [dstq+dsq*0], xmm0 pextrd [dstq+dsq*1], xmm0, 1 pextrd [dstq+dsq*2], xmm0, 2 pextrd [dstq+r6 ], xmm0, 3 lea dstq, [dstq+dsq*4] sub hd, 4 jg .w4 RET .w8: movq xmm0, [dstq+dsq*0] vpbroadcastq xmm1, [dstq+dsq*1] vpbroadcastq ymm2, [dstq+dsq*2] vpbroadcastq ymm3, [dstq+r6 ] mova ymm4, [maskq] mova ymm5, [maskq+tmpq] add maskq, 8*4 vpblendd ymm0, ymm2, 0x30 vpblendd ymm1, ymm3, 0xc0 psubb ymm3, ym6, ymm4 punpcklbw ymm0, ymm5 punpcklbw ymm2, ymm3, ymm4 punpckhbw ymm1, ymm5 punpckhbw ymm3, ymm4 pmaddubsw ymm0, ymm2 pmaddubsw ymm1, ymm3 pmulhrsw ymm0, ym7 pmulhrsw ymm1, ym7 packuswb ymm0, ymm1 vextracti128 xmm1, ymm0, 1 movq [dstq+dsq*0], xmm0 movhps [dstq+dsq*1], xmm0 movq [dstq+dsq*2], xmm1 movhps [dstq+r6 ], xmm1 lea dstq, [dstq+dsq*4] sub hd, 4 jg .w8 vzeroupper RET .w16: mova xm1, [dstq+dsq*0] vinserti32x4 ym1, [dstq+dsq*1], 1 vinserti32x4 m1, [dstq+dsq*2], 2 mova m4, [maskq] vinserti32x4 m1, [dstq+r6 ], 3 mova m5, [maskq+tmpq] add maskq, 16*4 psubb m3, m6, m4 punpcklbw m0, m1, m5 punpcklbw m2, m3, m4 punpckhbw m1, m5 punpckhbw m3, m4 pmaddubsw m0, m2 pmaddubsw m1, m3 pmulhrsw m0, m7 pmulhrsw m1, m7 packuswb m0, m1 mova [dstq+dsq*0], xm0 vextracti32x4 [dstq+dsq*1], ym0, 1 vextracti32x4 [dstq+dsq*2], m0, 2 vextracti32x4 [dstq+r6 ], m0, 3 lea dstq, [dstq+dsq*4] sub hd, 4 jg .w16 RET .w32: mova ym1, [dstq+dsq*0] vinserti32x8 m1, [dstq+dsq*1], 1 mova m4, [maskq] mova m5, [maskq+tmpq] add maskq, 32*2 psubb m3, m6, m4 punpcklbw m0, m1, m5 punpcklbw m2, m3, m4 punpckhbw m1, m5 punpckhbw m3, m4 pmaddubsw m0, m2 pmaddubsw m1, m3 pmulhrsw m0, m7 pmulhrsw m1, m7 packuswb m0, m1 mova [dstq+dsq*0], ym0 vextracti32x8 [dstq+dsq*1], m0, 1 lea dstq, 
[dstq+dsq*2] sub hd, 2 jg .w32 RET cglobal blend_v_8bpc, 3, 6, 6, dst, ds, tmp, w, h, mask %define base r5-blend_v_avx512icl_table lea r5, [blend_v_avx512icl_table] tzcnt wd, wm movifnidn hd, hm movsxd wq, [r5+wq*4] vpbroadcastd m5, [base+pw_512] add wq, r5 add maskq, obmc_masks-blend_v_avx512icl_table jmp wq .w2: vpbroadcastd xmm2, [maskq+2*2] .w2_s0_loop: movd xmm0, [dstq+dsq*0] pinsrw xmm0, [dstq+dsq*1], 1 movd xmm1, [tmpq] add tmpq, 2*2 punpcklbw xmm0, xmm1 pmaddubsw xmm0, xmm2 pmulhrsw xmm0, xm5 packuswb xmm0, xmm0 pextrw [dstq+dsq*0], xmm0, 0 pextrw [dstq+dsq*1], xmm0, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .w2_s0_loop RET .w4: vpbroadcastq xmm2, [maskq+4*2] .w4_loop: movd xmm0, [dstq+dsq*0] pinsrd xmm0, [dstq+dsq*1], 1 movq xmm1, [tmpq] add tmpq, 4*2 punpcklbw xmm0, xmm1 pmaddubsw xmm0, xmm2 pmulhrsw xmm0, xm5 packuswb xmm0, xmm0 movd [dstq+dsq*0], xmm0 pextrd [dstq+dsq*1], xmm0, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .w4_loop RET .w8: mova xmm3, [maskq+8*2] .w8_loop: movq xmm0, [dstq+dsq*0] vpbroadcastq xmm1, [dstq+dsq*1] mova xmm2, [tmpq] add tmpq, 8*2 punpcklbw xmm0, xmm2 punpckhbw xmm1, xmm2 pmaddubsw xmm0, xmm3 pmaddubsw xmm1, xmm3 pmulhrsw xmm0, xm5 pmulhrsw xmm1, xm5 packuswb xmm0, xmm1 movq [dstq+dsq*0], xmm0 movhps [dstq+dsq*1], xmm0 lea dstq, [dstq+dsq*2] sub hd, 2 jg .w8_loop RET .w16: vbroadcasti32x4 ym3, [maskq+16*2] vbroadcasti32x4 ym4, [maskq+16*3] .w16_loop: mova xm1, [dstq+dsq*0] vinserti32x4 ym1, [dstq+dsq*1], 1 mova ym2, [tmpq] add tmpq, 16*2 punpcklbw ym0, ym1, ym2 punpckhbw ym1, ym2 pmaddubsw ym0, ym3 pmaddubsw ym1, ym4 pmulhrsw ym0, ym5 pmulhrsw ym1, ym5 packuswb ym0, ym1 mova [dstq+dsq*0], xm0 vextracti32x4 [dstq+dsq*1], m0, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .w16_loop RET .w32: mova m4, [maskq+32*2] vshufi32x4 m3, m4, m4, q2020 vshufi32x4 m4, m4, q3131 .w32_loop: mova ym1, [dstq+dsq*0] vinserti32x8 m1, [dstq+dsq*1], 1 mova m2, [tmpq] add tmpq, 32*2 punpcklbw m0, m1, m2 punpckhbw m1, m2 pmaddubsw m0, m3 pmaddubsw m1, m4 pmulhrsw m0, m5 pmulhrsw m1, m5 packuswb m0, m1 mova [dstq+dsq*0], ym0 vextracti32x8 [dstq+dsq*1], m0, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .w32_loop RET cglobal blend_h_8bpc, 3, 7, 6, dst, ds, tmp, w, h, mask %define base r6-blend_h_avx512icl_table lea r6, [blend_h_avx512icl_table] tzcnt wd, wm mov hd, hm movsxd wq, [r6+wq*4] lea maskq, [base+obmc_masks+hq*2] vpbroadcastd m5, [base+pw_512] lea hd, [hq*3] add wq, r6 shr hd, 2 ; h * 3/4 lea maskq, [maskq+hq*2] neg hq jmp wq .w2: movd xmm0, [dstq+dsq*0] pinsrw xmm0, [dstq+dsq*1], 1 movd xmm2, [maskq+hq*2] movd xmm1, [tmpq] add tmpq, 2*2 punpcklwd xmm2, xmm2 punpcklbw xmm0, xmm1 pmaddubsw xmm0, xmm2 pmulhrsw xmm0, xm5 packuswb xmm0, xmm0 pextrw [dstq+dsq*0], xmm0, 0 pextrw [dstq+dsq*1], xmm0, 1 lea dstq, [dstq+dsq*2] add hq, 2 jl .w2 RET .w4: mova xmm3, [blend_shuf] .w4_loop: movd xmm0, [dstq+dsq*0] pinsrd xmm0, [dstq+dsq*1], 1 movd xmm2, [maskq+hq*2] movq xmm1, [tmpq] add tmpq, 4*2 pshufb xmm2, xmm3 punpcklbw xmm0, xmm1 pmaddubsw xmm0, xmm2 pmulhrsw xmm0, xm5 packuswb xmm0, xmm0 movd [dstq+dsq*0], xmm0 pextrd [dstq+dsq*1], xmm0, 1 lea dstq, [dstq+dsq*2] add hq, 2 jl .w4_loop RET .w8: vbroadcasti128 ymm4, [blend_shuf] shufpd ymm4, ymm4, 0x03 .w8_loop: vpbroadcastq ymm1, [dstq+dsq*0] movq xmm0, [dstq+dsq*1] vpblendd ymm0, ymm1, 0x30 vpbroadcastd ymm3, [maskq+hq*2] movq xmm1, [tmpq+8*1] vinserti128 ymm1, [tmpq+8*0], 1 add tmpq, 8*2 pshufb ymm3, ymm4 punpcklbw ymm0, ymm1 pmaddubsw ymm0, ymm3 pmulhrsw ymm0, ym5 vextracti128 xmm1, ymm0, 1 packuswb xmm0, xmm1 movhps [dstq+dsq*0], xmm0 movq 
[dstq+dsq*1], xmm0 lea dstq, [dstq+dsq*2] add hq, 2 jl .w8_loop vzeroupper RET .w16: vbroadcasti32x4 ym4, [blend_shuf] shufpd ym4, ym4, 0x0c .w16_loop: mova xm1, [dstq+dsq*0] vinserti32x4 ym1, [dstq+dsq*1], 1 vpbroadcastd ym3, [maskq+hq*2] mova ym2, [tmpq] add tmpq, 16*2 pshufb ym3, ym4 punpcklbw ym0, ym1, ym2 punpckhbw ym1, ym2 pmaddubsw ym0, ym3 pmaddubsw ym1, ym3 pmulhrsw ym0, ym5 pmulhrsw ym1, ym5 packuswb ym0, ym1 mova [dstq+dsq*0], xm0 vextracti32x4 [dstq+dsq*1], m0, 1 lea dstq, [dstq+dsq*2] add hq, 2 jl .w16_loop RET .w32: vbroadcasti32x4 m4, [blend_shuf] shufpd m4, m4, 0xf0 .w32_loop: mova ym1, [dstq+dsq*0] vinserti32x8 m1, [dstq+dsq*1], 1 vpbroadcastd m3, [maskq+hq*2] mova m2, [tmpq] add tmpq, 32*2 pshufb m3, m4 punpcklbw m0, m1, m2 punpckhbw m1, m2 pmaddubsw m0, m3 pmaddubsw m1, m3 pmulhrsw m0, m5 pmulhrsw m1, m5 packuswb m0, m1 mova [dstq+dsq*0], ym0 vextracti32x8 [dstq+dsq*1], m0, 1 lea dstq, [dstq+dsq*2] add hq, 2 jl .w32_loop RET .w64: vpbroadcastw m3, [maskq+hq*2] mova m1, [dstq] mova m2, [tmpq] add tmpq, 32*2 punpcklbw m0, m1, m2 punpckhbw m1, m2 pmaddubsw m0, m3 pmaddubsw m1, m3 pmulhrsw m0, m5 pmulhrsw m1, m5 packuswb m0, m1 mova [dstq], m0 add dstq, dsq inc hq jl .w64 RET .w128: vpbroadcastw m6, [maskq+hq*2] mova m2, [dstq+64*0] mova m1, [tmpq+64*0] mova m3, [dstq+64*1] mova m4, [tmpq+64*1] add tmpq, 64*2 punpcklbw m0, m2, m1 punpckhbw m2, m1 pmaddubsw m0, m6 pmaddubsw m2, m6 punpcklbw m1, m3, m4 punpckhbw m3, m4 pmaddubsw m1, m6 pmaddubsw m3, m6 REPX {pmulhrsw x, m5}, m0, m2, m1, m3 packuswb m0, m2 packuswb m1, m3 mova [dstq+64*0], m0 mova [dstq+64*1], m1 add dstq, dsq inc hq jl .w128 RET cglobal resize_8bpc, 6, 12, 19, dst, dst_stride, src, src_stride, \ dst_w, h, src_w, dx, mx0 sub dword mx0m, 4<<14 sub dword src_wm, 8 mov r6, ~0 vpbroadcastd m5, dxm vpbroadcastd m8, mx0m vpbroadcastd m6, src_wm kmovq k3, r6 DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, h, x LEA r7, $$ %define base r7-$$ vpbroadcastd m3, [base+pw_m256] vpbroadcastd m7, [base+pd_63] vbroadcasti32x4 m15, [base+pb_8x0_8x8] vpdpwssd m8, m5, [base+rescale_mul] ; mx+dx*[0-15] pslld m5, 4 ; dx*16 pslld m6, 14 pxor m2, m2 mova m16, [base+resize_permA] mova m17, [base+resize_permB] mova xm18, [base+resize_permC] .loop_y: xor xd, xd mova m4, m8 ; per-line working version of mx .loop_x: pmaxsd m0, m4, m2 psrad m9, m4, 8 ; filter offset (unmasked) pminsd m0, m6 ; iclip(mx, 0, src_w-8) psubd m1, m4, m0 ; pshufb offset psrad m0, 14 ; clipped src_x offset psrad m1, 14 ; pshufb edge_emu offset vptestmd k4, m1, m1 pand m9, m7 ; filter offset (masked) ktestw k4, k4 jz .load vextracti32x8 ym12, m0, 1 vextracti32x8 ym13, m1, 1 kmovq k1, k3 kmovq k2, k3 vpgatherdq m10{k1}, [srcq+ym0] vpgatherdq m11{k2}, [srcq+ym12] kmovq k1, k3 kmovq k2, k3 vpgatherdq m14{k1}, [base+resize_shuf+4+ym1] vpgatherdq m0{k2}, [base+resize_shuf+4+ym13] mova m12, m16 mova m13, m17 paddb m14, m15 paddb m0, m15 pshufb m10, m14 pshufb m11, m0 vpermi2d m12, m10, m11 vpermi2d m13, m10, m11 jmp .filter .load: kmovq k1, k3 kmovq k2, k3 vpgatherdd m12{k1}, [srcq+m0+0] vpgatherdd m13{k2}, [srcq+m0+4] .filter: kmovq k1, k3 kmovq k2, k3 vpgatherdd m10{k1}, [base+resize_filter+m9*8+0] vpgatherdd m11{k2}, [base+resize_filter+m9*8+4] mova m14, m2 vpdpbusd m14, m12, m10 vpdpbusd m14, m13, m11 packssdw m14, m14 pmulhrsw m14, m3 packuswb m14, m14 vpermd m14, m18, m14 mova [dstq+xq], xm14 paddd m4, m5 add xd, 16 cmp xd, dst_wd jl .loop_x add dstq, dst_strideq add srcq, src_strideq dec hd jg .loop_y RET %endif ; ARCH_X86_64 
av-scenechange-0.14.1/src/asm/x86/mc_sse.asm000064400000000000000000011361121046102023000165320ustar 00000000000000; Copyright © 2018, VideoLAN and dav1d authors ; Copyright © 2018, Two Orioles, LLC ; Copyright © 2018, VideoLabs ; All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions are met: ; ; 1. Redistributions of source code must retain the above copyright notice, this ; list of conditions and the following disclaimer. ; ; 2. Redistributions in binary form must reproduce the above copyright notice, ; this list of conditions and the following disclaimer in the documentation ; and/or other materials provided with the distribution. ; ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. %include "config.asm" %include "src/asm/x86/x86inc.asm" SECTION_RODATA 16 ; dav1d_obmc_masks[] with 64-x interleaved obmc_masks: db 0, 0, 0, 0 ; 2 @4 db 45, 19, 64, 0 ; 4 @8 db 39, 25, 50, 14, 59, 5, 64, 0 ; 8 @16 db 36, 28, 42, 22, 48, 16, 53, 11, 57, 7, 61, 3, 64, 0, 64, 0 ; 16 @32 db 34, 30, 37, 27, 40, 24, 43, 21, 46, 18, 49, 15, 52, 12, 54, 10 db 56, 8, 58, 6, 60, 4, 61, 3, 64, 0, 64, 0, 64, 0, 64, 0 ; 32 @64 db 33, 31, 35, 29, 36, 28, 38, 26, 40, 24, 41, 23, 43, 21, 44, 20 db 45, 19, 47, 17, 48, 16, 50, 14, 51, 13, 52, 12, 53, 11, 55, 9 db 56, 8, 57, 7, 58, 6, 59, 5, 60, 4, 60, 4, 61, 3, 62, 2 warp_8x8_shufA: db 0, 2, 4, 6, 1, 3, 5, 7, 1, 3, 5, 7, 2, 4, 6, 8 warp_8x8_shufB: db 4, 6, 8, 10, 5, 7, 9, 11, 5, 7, 9, 11, 6, 8, 10, 12 warp_8x8_shufC: db 2, 4, 6, 8, 3, 5, 7, 9, 3, 5, 7, 9, 4, 6, 8, 10 warp_8x8_shufD: db 6, 8, 10, 12, 7, 9, 11, 13, 7, 9, 11, 13, 8, 10, 12, 14 blend_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3 subpel_h_shuf4: db 0, 1, 2, 3, 1, 2, 3, 4, 8, 9, 10, 11, 9, 10, 11, 12 db 2, 3, 4, 5, 3, 4, 5, 6, 10, 11, 12, 13, 11, 12, 13, 14 subpel_h_shufA: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 subpel_h_shufB: db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 subpel_h_shufC: db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 subpel_s_shuf2: db 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 8, 9, 10, 11 subpel_s_shuf8: db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 bilin_h_shuf4: db 0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12 bilin_h_shuf8: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 unpckw: db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 rescale_mul: dd 0, 1, 2, 3 resize_shuf: db 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7 wm_420_sign: times 4 dw 258 times 4 dw 257 wm_422_sign: times 8 db 128 times 8 db 127 pb_8x0_8x8: times 8 db 0 times 8 db 8 bdct_lb_dw: times 4 db 0 times 4 db 4 times 4 db 8 times 4 db 12 pb_64: times 16 db 64 pw_m256: times 8 dw -256 pw_1: times 8 dw 1 pw_2: times 8 dw 2 pw_8: times 8 dw 8 pw_15: times 8 
dw 15 pw_26: times 8 dw 26 pw_34: times 8 dw 34 pw_512: times 8 dw 512 pw_1024: times 8 dw 1024 pw_2048: times 8 dw 2048 pw_6903: times 8 dw 6903 pw_8192: times 8 dw 8192 pd_32: times 4 dd 32 pd_63: times 4 dd 63 pd_512: times 4 dd 512 pd_16384: times 4 dd 16484 pd_32768: times 4 dd 32768 pd_262144:times 4 dd 262144 pd_0x3ff: times 4 dd 0x3ff pd_0x4000:times 4 dd 0x4000 pq_0x40000000: times 2 dq 0x40000000 const mc_warp_filter2 ; dav1d_mc_warp_filter[] reordered for pmaddubsw usage ; [-1, 0) db 0, 127, 0, 0, 0, 1, 0, 0, 0, 127, 0, 0, -1, 2, 0, 0 db 1, 127, -1, 0, -3, 4, 0, 0, 1, 126, -2, 0, -4, 6, 1, 0 db 1, 126, -3, 0, -5, 8, 1, 0, 1, 125, -4, 0, -6, 11, 1, 0 db 1, 124, -4, 0, -7, 13, 1, 0, 2, 123, -5, 0, -8, 15, 1, 0 db 2, 122, -6, 0, -9, 18, 1, 0, 2, 121, -6, 0, -10, 20, 1, 0 db 2, 120, -7, 0, -11, 22, 2, 0, 2, 119, -8, 0, -12, 25, 2, 0 db 3, 117, -8, 0, -13, 27, 2, 0, 3, 116, -9, 0, -13, 29, 2, 0 db 3, 114, -10, 0, -14, 32, 3, 0, 3, 113, -10, 0, -15, 35, 2, 0 db 3, 111, -11, 0, -15, 37, 3, 0, 3, 109, -11, 0, -16, 40, 3, 0 db 3, 108, -12, 0, -16, 42, 3, 0, 4, 106, -13, 0, -17, 45, 3, 0 db 4, 104, -13, 0, -17, 47, 3, 0, 4, 102, -14, 0, -17, 50, 3, 0 db 4, 100, -14, 0, -17, 52, 3, 0, 4, 98, -15, 0, -18, 55, 4, 0 db 4, 96, -15, 0, -18, 58, 3, 0, 4, 94, -16, 0, -18, 60, 4, 0 db 4, 91, -16, 0, -18, 63, 4, 0, 4, 89, -16, 0, -18, 65, 4, 0 db 4, 87, -17, 0, -18, 68, 4, 0, 4, 85, -17, 0, -18, 70, 4, 0 db 4, 82, -17, 0, -18, 73, 4, 0, 4, 80, -17, 0, -18, 75, 4, 0 db 4, 78, -18, 0, -18, 78, 4, 0, 4, 75, -18, 0, -17, 80, 4, 0 db 4, 73, -18, 0, -17, 82, 4, 0, 4, 70, -18, 0, -17, 85, 4, 0 db 4, 68, -18, 0, -17, 87, 4, 0, 4, 65, -18, 0, -16, 89, 4, 0 db 4, 63, -18, 0, -16, 91, 4, 0, 4, 60, -18, 0, -16, 94, 4, 0 db 3, 58, -18, 0, -15, 96, 4, 0, 4, 55, -18, 0, -15, 98, 4, 0 db 3, 52, -17, 0, -14, 100, 4, 0, 3, 50, -17, 0, -14, 102, 4, 0 db 3, 47, -17, 0, -13, 104, 4, 0, 3, 45, -17, 0, -13, 106, 4, 0 db 3, 42, -16, 0, -12, 108, 3, 0, 3, 40, -16, 0, -11, 109, 3, 0 db 3, 37, -15, 0, -11, 111, 3, 0, 2, 35, -15, 0, -10, 113, 3, 0 db 3, 32, -14, 0, -10, 114, 3, 0, 2, 29, -13, 0, -9, 116, 3, 0 db 2, 27, -13, 0, -8, 117, 3, 0, 2, 25, -12, 0, -8, 119, 2, 0 db 2, 22, -11, 0, -7, 120, 2, 0, 1, 20, -10, 0, -6, 121, 2, 0 db 1, 18, -9, 0, -6, 122, 2, 0, 1, 15, -8, 0, -5, 123, 2, 0 db 1, 13, -7, 0, -4, 124, 1, 0, 1, 11, -6, 0, -4, 125, 1, 0 db 1, 8, -5, 0, -3, 126, 1, 0, 1, 6, -4, 0, -2, 126, 1, 0 db 0, 4, -3, 0, -1, 127, 1, 0, 0, 2, -1, 0, 0, 127, 0, 0 ; [0, 1) db 0, 0, 1, 0, 0, 127, 0, 0, 0, -1, 2, 0, 0, 127, 0, 0 db 0, -3, 4, 1, 1, 127, -2, 0, 0, -5, 6, 1, 1, 127, -2, 0 db 0, -6, 8, 1, 2, 126, -3, 0, -1, -7, 11, 2, 2, 126, -4, -1 db -1, -8, 13, 2, 3, 125, -5, -1, -1, -10, 16, 3, 3, 124, -6, -1 db -1, -11, 18, 3, 4, 123, -7, -1, -1, -12, 20, 3, 4, 122, -7, -1 db -1, -13, 23, 3, 4, 121, -8, -1, -2, -14, 25, 4, 5, 120, -9, -1 db -1, -15, 27, 4, 5, 119, -10, -1, -1, -16, 30, 4, 5, 118, -11, -1 db -2, -17, 33, 5, 6, 116, -12, -1, -2, -17, 35, 5, 6, 114, -12, -1 db -2, -18, 38, 5, 6, 113, -13, -1, -2, -19, 41, 6, 7, 111, -14, -2 db -2, -19, 43, 6, 7, 110, -15, -2, -2, -20, 46, 6, 7, 108, -15, -2 db -2, -20, 49, 6, 7, 106, -16, -2, -2, -21, 51, 7, 7, 104, -16, -2 db -2, -21, 54, 7, 7, 102, -17, -2, -2, -21, 56, 7, 8, 100, -18, -2 db -2, -22, 59, 7, 8, 98, -18, -2, -2, -22, 62, 7, 8, 96, -19, -2 db -2, -22, 64, 7, 8, 94, -19, -2, -2, -22, 67, 8, 8, 91, -20, -2 db -2, -22, 69, 8, 8, 89, -20, -2, -2, -22, 72, 8, 8, 87, -21, -2 db -2, -21, 74, 8, 8, 84, -21, -2, -2, -22, 77, 8, 8, 82, -21, -2 db -2, -21, 79, 8, 8, 79, -21, 
-2, -2, -21, 82, 8, 8, 77, -22, -2 db -2, -21, 84, 8, 8, 74, -21, -2, -2, -21, 87, 8, 8, 72, -22, -2 db -2, -20, 89, 8, 8, 69, -22, -2, -2, -20, 91, 8, 8, 67, -22, -2 db -2, -19, 94, 8, 7, 64, -22, -2, -2, -19, 96, 8, 7, 62, -22, -2 db -2, -18, 98, 8, 7, 59, -22, -2, -2, -18, 100, 8, 7, 56, -21, -2 db -2, -17, 102, 7, 7, 54, -21, -2, -2, -16, 104, 7, 7, 51, -21, -2 db -2, -16, 106, 7, 6, 49, -20, -2, -2, -15, 108, 7, 6, 46, -20, -2 db -2, -15, 110, 7, 6, 43, -19, -2, -2, -14, 111, 7, 6, 41, -19, -2 db -1, -13, 113, 6, 5, 38, -18, -2, -1, -12, 114, 6, 5, 35, -17, -2 db -1, -12, 116, 6, 5, 33, -17, -2, -1, -11, 118, 5, 4, 30, -16, -1 db -1, -10, 119, 5, 4, 27, -15, -1, -1, -9, 120, 5, 4, 25, -14, -2 db -1, -8, 121, 4, 3, 23, -13, -1, -1, -7, 122, 4, 3, 20, -12, -1 db -1, -7, 123, 4, 3, 18, -11, -1, -1, -6, 124, 3, 3, 16, -10, -1 db -1, -5, 125, 3, 2, 13, -8, -1, -1, -4, 126, 2, 2, 11, -7, -1 db 0, -3, 126, 2, 1, 8, -6, 0, 0, -2, 127, 1, 1, 6, -5, 0 db 0, -2, 127, 1, 1, 4, -3, 0, 0, 0, 127, 0, 0, 2, -1, 0 ; [1, 2) db 0, 0, 127, 0, 0, 1, 0, 0, 0, 0, 127, 0, 0, -1, 2, 0 db 0, 1, 127, -1, 0, -3, 4, 0, 0, 1, 126, -2, 0, -4, 6, 1 db 0, 1, 126, -3, 0, -5, 8, 1, 0, 1, 125, -4, 0, -6, 11, 1 db 0, 1, 124, -4, 0, -7, 13, 1, 0, 2, 123, -5, 0, -8, 15, 1 db 0, 2, 122, -6, 0, -9, 18, 1, 0, 2, 121, -6, 0, -10, 20, 1 db 0, 2, 120, -7, 0, -11, 22, 2, 0, 2, 119, -8, 0, -12, 25, 2 db 0, 3, 117, -8, 0, -13, 27, 2, 0, 3, 116, -9, 0, -13, 29, 2 db 0, 3, 114, -10, 0, -14, 32, 3, 0, 3, 113, -10, 0, -15, 35, 2 db 0, 3, 111, -11, 0, -15, 37, 3, 0, 3, 109, -11, 0, -16, 40, 3 db 0, 3, 108, -12, 0, -16, 42, 3, 0, 4, 106, -13, 0, -17, 45, 3 db 0, 4, 104, -13, 0, -17, 47, 3, 0, 4, 102, -14, 0, -17, 50, 3 db 0, 4, 100, -14, 0, -17, 52, 3, 0, 4, 98, -15, 0, -18, 55, 4 db 0, 4, 96, -15, 0, -18, 58, 3, 0, 4, 94, -16, 0, -18, 60, 4 db 0, 4, 91, -16, 0, -18, 63, 4, 0, 4, 89, -16, 0, -18, 65, 4 db 0, 4, 87, -17, 0, -18, 68, 4, 0, 4, 85, -17, 0, -18, 70, 4 db 0, 4, 82, -17, 0, -18, 73, 4, 0, 4, 80, -17, 0, -18, 75, 4 db 0, 4, 78, -18, 0, -18, 78, 4, 0, 4, 75, -18, 0, -17, 80, 4 db 0, 4, 73, -18, 0, -17, 82, 4, 0, 4, 70, -18, 0, -17, 85, 4 db 0, 4, 68, -18, 0, -17, 87, 4, 0, 4, 65, -18, 0, -16, 89, 4 db 0, 4, 63, -18, 0, -16, 91, 4, 0, 4, 60, -18, 0, -16, 94, 4 db 0, 3, 58, -18, 0, -15, 96, 4, 0, 4, 55, -18, 0, -15, 98, 4 db 0, 3, 52, -17, 0, -14, 100, 4, 0, 3, 50, -17, 0, -14, 102, 4 db 0, 3, 47, -17, 0, -13, 104, 4, 0, 3, 45, -17, 0, -13, 106, 4 db 0, 3, 42, -16, 0, -12, 108, 3, 0, 3, 40, -16, 0, -11, 109, 3 db 0, 3, 37, -15, 0, -11, 111, 3, 0, 2, 35, -15, 0, -10, 113, 3 db 0, 3, 32, -14, 0, -10, 114, 3, 0, 2, 29, -13, 0, -9, 116, 3 db 0, 2, 27, -13, 0, -8, 117, 3, 0, 2, 25, -12, 0, -8, 119, 2 db 0, 2, 22, -11, 0, -7, 120, 2, 0, 1, 20, -10, 0, -6, 121, 2 db 0, 1, 18, -9, 0, -6, 122, 2, 0, 1, 15, -8, 0, -5, 123, 2 db 0, 1, 13, -7, 0, -4, 124, 1, 0, 1, 11, -6, 0, -4, 125, 1 db 0, 1, 8, -5, 0, -3, 126, 1, 0, 1, 6, -4, 0, -2, 126, 1 db 0, 0, 4, -3, 0, -1, 127, 1, 0, 0, 2, -1, 0, 0, 127, 0 db 0, 0, 2, -1, 0, 0, 127, 0 pw_258: times 2 dw 258 cextern mc_subpel_filters %define subpel_filters (mangle(private_prefix %+ _mc_subpel_filters)-8) %macro BIDIR_JMP_TABLE 2-* ;evaluated at definition time (in loop below) %xdefine %1_%2_table (%%table - 2*%3) %xdefine %%base %1_%2_table %xdefine %%prefix mangle(private_prefix %+ _%1_8bpc_%2) ; dynamically generated label %%table: %rep %0 - 2 ; repeat for num args dd %%prefix %+ .w%3 - %%base %rotate 1 %endrep %endmacro BIDIR_JMP_TABLE avg, ssse3, 4, 8, 16, 32, 64, 128 BIDIR_JMP_TABLE w_avg, ssse3, 
4, 8, 16, 32, 64, 128 BIDIR_JMP_TABLE mask, ssse3, 4, 8, 16, 32, 64, 128 BIDIR_JMP_TABLE w_mask_420, ssse3, 4, 8, 16, 32, 64, 128 BIDIR_JMP_TABLE w_mask_422, ssse3, 4, 8, 16, 32, 64, 128 BIDIR_JMP_TABLE w_mask_444, ssse3, 4, 8, 16, 32, 64, 128 BIDIR_JMP_TABLE blend, ssse3, 4, 8, 16, 32 BIDIR_JMP_TABLE blend_v, ssse3, 2, 4, 8, 16, 32 BIDIR_JMP_TABLE blend_h, ssse3, 2, 4, 8, 16, 16, 16, 16 %macro BASE_JMP_TABLE 3-* %xdefine %1_%2_table (%%table - %3) %xdefine %%base %1_%2 %%table: %rep %0 - 2 dw %%base %+ _w%3 - %%base %rotate 1 %endrep %endmacro %xdefine prep_sse2 mangle(private_prefix %+ _prep_bilin_8bpc_sse2.prep) %xdefine put_ssse3 mangle(private_prefix %+ _put_bilin_8bpc_ssse3.put) %xdefine prep_ssse3 mangle(private_prefix %+ _prep_bilin_8bpc_ssse3.prep) BASE_JMP_TABLE put, ssse3, 2, 4, 8, 16, 32, 64, 128 BASE_JMP_TABLE prep, ssse3, 4, 8, 16, 32, 64, 128 %macro HV_JMP_TABLE 5-* %xdefine %%prefix mangle(private_prefix %+ _%1_%2_8bpc_%3) %xdefine %%base %1_%3 %assign %%types %4 %if %%types & 1 %xdefine %1_%2_h_%3_table (%%h - %5) %%h: %rep %0 - 4 dw %%prefix %+ .h_w%5 - %%base %rotate 1 %endrep %rotate 4 %endif %if %%types & 2 %xdefine %1_%2_v_%3_table (%%v - %5) %%v: %rep %0 - 4 dw %%prefix %+ .v_w%5 - %%base %rotate 1 %endrep %rotate 4 %endif %if %%types & 4 %xdefine %1_%2_hv_%3_table (%%hv - %5) %%hv: %rep %0 - 4 dw %%prefix %+ .hv_w%5 - %%base %rotate 1 %endrep %endif %endmacro HV_JMP_TABLE prep, 8tap, sse2, 1, 4, 8, 16, 32, 64, 128 HV_JMP_TABLE prep, bilin, sse2, 7, 4, 8, 16, 32, 64, 128 HV_JMP_TABLE put, 8tap, ssse3, 3, 2, 4, 8, 16, 32, 64, 128 HV_JMP_TABLE prep, 8tap, ssse3, 1, 4, 8, 16, 32, 64, 128 HV_JMP_TABLE put, bilin, ssse3, 7, 2, 4, 8, 16, 32, 64, 128 HV_JMP_TABLE prep, bilin, ssse3, 7, 4, 8, 16, 32, 64, 128 %macro SCALED_JMP_TABLE 2-* %xdefine %1_%2_table (%%table - %3) %xdefine %%base mangle(private_prefix %+ _%1_8bpc_%2) %%table: %rep %0 - 2 dw %%base %+ .w%3 - %%base %rotate 1 %endrep %rotate 2 %%dy_1024: %xdefine %1_%2_dy1_table (%%dy_1024 - %3) %rep %0 - 2 dw %%base %+ .dy1_w%3 - %%base %rotate 1 %endrep %rotate 2 %%dy_2048: %xdefine %1_%2_dy2_table (%%dy_2048 - %3) %rep %0 - 2 dw %%base %+ .dy2_w%3 - %%base %rotate 1 %endrep %endmacro SCALED_JMP_TABLE put_8tap_scaled, ssse3, 2, 4, 8, 16, 32, 64, 128 SCALED_JMP_TABLE prep_8tap_scaled, ssse3, 4, 8, 16, 32, 64, 128 %define table_offset(type, fn) type %+ fn %+ SUFFIX %+ _table - type %+ SUFFIX SECTION .text INIT_XMM ssse3 %if ARCH_X86_32 DECLARE_REG_TMP 1 %define base t0-put_ssse3 %else DECLARE_REG_TMP 7 %define base 0 %endif %macro RESTORE_DSQ_32 1 %if ARCH_X86_32 mov %1, dsm ; restore dsq %endif %endmacro cglobal put_bilin_8bpc, 1, 8, 0, dst, ds, src, ss, w, h, mxy movifnidn mxyd, r6m ; mx LEA t0, put_ssse3 movifnidn srcq, srcmp movifnidn ssq, ssmp tzcnt wd, wm mov hd, hm test mxyd, mxyd jnz .h mov mxyd, r7m ; my test mxyd, mxyd jnz .v .put: movzx wd, word [t0+wq*2+table_offset(put,)] add wq, t0 RESTORE_DSQ_32 t0 jmp wq .put_w2: movzx r4d, word [srcq+ssq*0] movzx r6d, word [srcq+ssq*1] lea srcq, [srcq+ssq*2] mov [dstq+dsq*0], r4w mov [dstq+dsq*1], r6w lea dstq, [dstq+dsq*2] sub hd, 2 jg .put_w2 RET .put_w4: mov r4d, [srcq+ssq*0] mov r6d, [srcq+ssq*1] lea srcq, [srcq+ssq*2] mov [dstq+dsq*0], r4d mov [dstq+dsq*1], r6d lea dstq, [dstq+dsq*2] sub hd, 2 jg .put_w4 RET .put_w8: movq m0, [srcq+ssq*0] movq m1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] movq [dstq+dsq*0], m0 movq [dstq+dsq*1], m1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .put_w8 RET .put_w16: movu m0, [srcq+ssq*0] movu m1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] mova 
[dstq+dsq*0], m0 mova [dstq+dsq*1], m1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .put_w16 RET .put_w32: movu m0, [srcq+ssq*0+16*0] movu m1, [srcq+ssq*0+16*1] movu m2, [srcq+ssq*1+16*0] movu m3, [srcq+ssq*1+16*1] lea srcq, [srcq+ssq*2] mova [dstq+dsq*0+16*0], m0 mova [dstq+dsq*0+16*1], m1 mova [dstq+dsq*1+16*0], m2 mova [dstq+dsq*1+16*1], m3 lea dstq, [dstq+dsq*2] sub hd, 2 jg .put_w32 RET .put_w64: movu m0, [srcq+16*0] movu m1, [srcq+16*1] movu m2, [srcq+16*2] movu m3, [srcq+16*3] add srcq, ssq mova [dstq+16*0], m0 mova [dstq+16*1], m1 mova [dstq+16*2], m2 mova [dstq+16*3], m3 add dstq, dsq dec hd jg .put_w64 RET .put_w128: movu m0, [srcq+16*0] movu m1, [srcq+16*1] movu m2, [srcq+16*2] movu m3, [srcq+16*3] mova [dstq+16*0], m0 mova [dstq+16*1], m1 mova [dstq+16*2], m2 mova [dstq+16*3], m3 movu m0, [srcq+16*4] movu m1, [srcq+16*5] movu m2, [srcq+16*6] movu m3, [srcq+16*7] mova [dstq+16*4], m0 mova [dstq+16*5], m1 mova [dstq+16*6], m2 mova [dstq+16*7], m3 add srcq, ssq add dstq, dsq dec hd jg .put_w128 RET .h: ; (16 * src[x] + (mx * (src[x + 1] - src[x])) + 8) >> 4 ; = ((16 - mx) * src[x] + mx * src[x + 1] + 8) >> 4 imul mxyd, 0x00ff00ff mova m4, [base+bilin_h_shuf8] mova m0, [base+bilin_h_shuf4] add mxyd, 0x00100010 movd m5, mxyd mov mxyd, r7m ; my pshufd m5, m5, q0000 test mxyd, mxyd jnz .hv movzx wd, word [t0+wq*2+table_offset(put, _bilin_h)] mova m3, [base+pw_2048] add wq, t0 movifnidn dsq, dsmp jmp wq .h_w2: pshufd m4, m4, q3120 ; m4 = {1, 0, 2, 1, 5, 4, 6, 5} .h_w2_loop: movd m0, [srcq+ssq*0] movd m1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] punpckldq m0, m1 pshufb m0, m4 pmaddubsw m0, m5 pmulhrsw m0, m3 packuswb m0, m0 movd r6d, m0 mov [dstq+dsq*0], r6w shr r6d, 16 mov [dstq+dsq*1], r6w lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w2_loop RET .h_w4: movq m4, [srcq+ssq*0] movhps m4, [srcq+ssq*1] lea srcq, [srcq+ssq*2] pshufb m4, m0 pmaddubsw m4, m5 pmulhrsw m4, m3 packuswb m4, m4 movd [dstq+dsq*0], m4 psrlq m4, 32 movd [dstq+dsq*1], m4 lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w4 RET .h_w8: movu m0, [srcq+ssq*0] movu m1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] pshufb m0, m4 pshufb m1, m4 pmaddubsw m0, m5 pmaddubsw m1, m5 pmulhrsw m0, m3 pmulhrsw m1, m3 packuswb m0, m1 movq [dstq+dsq*0], m0 movhps [dstq+dsq*1], m0 lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w8 RET .h_w16: movu m0, [srcq+8*0] movu m1, [srcq+8*1] add srcq, ssq pshufb m0, m4 pshufb m1, m4 pmaddubsw m0, m5 pmaddubsw m1, m5 pmulhrsw m0, m3 pmulhrsw m1, m3 packuswb m0, m1 mova [dstq], m0 add dstq, dsq dec hd jg .h_w16 RET .h_w32: movu m0, [srcq+mmsize*0+8*0] movu m1, [srcq+mmsize*0+8*1] pshufb m0, m4 pshufb m1, m4 pmaddubsw m0, m5 pmaddubsw m1, m5 pmulhrsw m0, m3 pmulhrsw m1, m3 packuswb m0, m1 movu m1, [srcq+mmsize*1+8*0] movu m2, [srcq+mmsize*1+8*1] add srcq, ssq pshufb m1, m4 pshufb m2, m4 pmaddubsw m1, m5 pmaddubsw m2, m5 pmulhrsw m1, m3 pmulhrsw m2, m3 packuswb m1, m2 mova [dstq+16*0], m0 mova [dstq+16*1], m1 add dstq, dsq dec hd jg .h_w32 RET .h_w64: mov r6, -16*3 .h_w64_loop: movu m0, [srcq+r6+16*3+8*0] movu m1, [srcq+r6+16*3+8*1] pshufb m0, m4 pshufb m1, m4 pmaddubsw m0, m5 pmaddubsw m1, m5 pmulhrsw m0, m3 pmulhrsw m1, m3 packuswb m0, m1 mova [dstq+r6+16*3], m0 add r6, 16 jle .h_w64_loop add srcq, ssq add dstq, dsq dec hd jg .h_w64 RET .h_w128: mov r6, -16*7 .h_w128_loop: movu m0, [srcq+r6+16*7+8*0] movu m1, [srcq+r6+16*7+8*1] pshufb m0, m4 pshufb m1, m4 pmaddubsw m0, m5 pmaddubsw m1, m5 pmulhrsw m0, m3 pmulhrsw m1, m3 packuswb m0, m1 mova [dstq+r6+16*7], m0 add r6, 16 jle .h_w128_loop add srcq, ssq add dstq, dsq dec hd jg .h_w128 RET .v: movzx 
wd, word [t0+wq*2+table_offset(put, _bilin_v)] imul mxyd, 0x00ff00ff mova m5, [base+pw_2048] add mxyd, 0x00100010 add wq, t0 movd m4, mxyd pshufd m4, m4, q0000 movifnidn dsq, dsmp jmp wq .v_w2: movd m0, [srcq+ssq*0] .v_w2_loop: pinsrw m0, [srcq+ssq*1], 1 ; 0 1 lea srcq, [srcq+ssq*2] pshuflw m1, m0, q2301 pinsrw m0, [srcq+ssq*0], 0 ; 2 1 punpcklbw m1, m0 pmaddubsw m1, m4 pmulhrsw m1, m5 packuswb m1, m1 movd r6d, m1 mov [dstq+dsq*1], r6w shr r6d, 16 mov [dstq+dsq*0], r6w lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w2_loop RET .v_w4: movd m0, [srcq+ssq*0] .v_w4_loop: movd m2, [srcq+ssq*1] lea srcq, [srcq+ssq*2] mova m1, m0 movd m0, [srcq+ssq*0] punpckldq m1, m2 ; 0 1 punpckldq m2, m0 ; 1 2 punpcklbw m1, m2 pmaddubsw m1, m4 pmulhrsw m1, m5 packuswb m1, m1 movd [dstq+dsq*0], m1 psrlq m1, 32 movd [dstq+dsq*1], m1 ; lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w4_loop RET .v_w8: movq m0, [srcq+ssq*0] .v_w8_loop: movq m2, [srcq+ssq*1] lea srcq, [srcq+ssq*2] mova m1, m0 movq m0, [srcq+ssq*0] punpcklbw m1, m2 punpcklbw m2, m0 pmaddubsw m1, m4 pmaddubsw m2, m4 pmulhrsw m1, m5 pmulhrsw m2, m5 packuswb m1, m2 movq [dstq+dsq*0], m1 movhps [dstq+dsq*1], m1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w8_loop RET %macro PUT_BILIN_V_W16 0 movu m0, [srcq+ssq*0] %%loop: movu m3, [srcq+ssq*1] lea srcq, [srcq+ssq*2] mova m1, m0 mova m2, m0 movu m0, [srcq+ssq*0] punpcklbw m1, m3 punpckhbw m2, m3 pmaddubsw m1, m4 pmaddubsw m2, m4 pmulhrsw m1, m5 pmulhrsw m2, m5 packuswb m1, m2 punpcklbw m2, m3, m0 punpckhbw m3, m0 pmaddubsw m2, m4 pmaddubsw m3, m4 pmulhrsw m2, m5 pmulhrsw m3, m5 packuswb m2, m3 mova [dstq+dsq*0], m1 mova [dstq+dsq*1], m2 lea dstq, [dstq+dsq*2] sub hd, 2 jg %%loop %endmacro .v_w16: PUT_BILIN_V_W16 RET .v_w128: lea r6d, [hq+(7<<16)] jmp .v_w16gt .v_w64: lea r6d, [hq+(3<<16)] jmp .v_w16gt .v_w32: lea r6d, [hq+(1<<16)] .v_w16gt: mov r4, srcq %if ARCH_X86_64 mov r7, dstq %endif .v_w16gt_loop: PUT_BILIN_V_W16 %if ARCH_X86_64 add r4, 16 add r7, 16 movzx hd, r6b mov srcq, r4 mov dstq, r7 %else mov dstq, dstmp add r4, 16 movzx hd, r6w add dstq, 16 mov srcq, r4 mov dstmp, dstq %endif sub r6d, 1<<16 jg .v_w16gt RET .hv: ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 128) >> 8 ; = (src[x] + ((my * (src[x + src_stride] - src[x])) >> 4) + 8) >> 4 movzx wd, word [t0+wq*2+table_offset(put, _bilin_hv)] WIN64_SPILL_XMM 8 shl mxyd, 11 ; can't shift by 12 due to signed overflow mova m7, [base+pw_15] movd m6, mxyd add wq, t0 pshuflw m6, m6, q0000 paddb m5, m5 punpcklqdq m6, m6 jmp wq .hv_w2: RESTORE_DSQ_32 t0 movd m0, [srcq+ssq*0] punpckldq m0, m0 pshufb m0, m4 pmaddubsw m0, m5 .hv_w2_loop: movd m1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] movd m2, [srcq+ssq*0] punpckldq m1, m2 pshufb m1, m4 pmaddubsw m1, m5 ; 1 _ 2 _ shufps m2, m0, m1, q1032 ; 0 _ 1 _ mova m0, m1 psubw m1, m2 ; 2 * (src[x + src_stride] - src[x]) pmulhw m1, m6 ; (my * (src[x + src_stride] - src[x]) >> 4 pavgw m2, m7 ; src[x] + 8 paddw m1, m2 ; src[x] + ((my * (src[x + src_stride] - src[x])) >> 4) + 8 psrlw m1, 4 packuswb m1, m1 %if ARCH_X86_64 movq r6, m1 %else pshuflw m1, m1, q2020 movd r6d, m1 %endif mov [dstq+dsq*0], r6w shr r6, gprsize*4 mov [dstq+dsq*1], r6w lea dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w2_loop RET .hv_w4: mova m4, [base+bilin_h_shuf4] movddup m0, [srcq+ssq*0] movifnidn dsq, dsmp pshufb m0, m4 pmaddubsw m0, m5 .hv_w4_loop: movq m1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] movhps m1, [srcq+ssq*0] pshufb m1, m4 pmaddubsw m1, m5 ; 1 2 shufps m2, m0, m1, q1032 ; 0 1 mova m0, m1 psubw m1, m2 pmulhw m1, m6 pavgw m2, m7 paddw m1, m2 psrlw m1, 4 packuswb m1, 
m1 movd [dstq+dsq*0], m1 psrlq m1, 32 movd [dstq+dsq*1], m1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w4_loop RET .hv_w8: movu m0, [srcq+ssq*0] movifnidn dsq, dsmp pshufb m0, m4 pmaddubsw m0, m5 .hv_w8_loop: movu m2, [srcq+ssq*1] lea srcq, [srcq+ssq*2] pshufb m2, m4 pmaddubsw m2, m5 psubw m1, m2, m0 pmulhw m1, m6 pavgw m0, m7 paddw m1, m0 movu m0, [srcq+ssq*0] pshufb m0, m4 pmaddubsw m0, m5 psubw m3, m0, m2 pmulhw m3, m6 pavgw m2, m7 paddw m3, m2 psrlw m1, 4 psrlw m3, 4 packuswb m1, m3 movq [dstq+dsq*0], m1 movhps [dstq+dsq*1], m1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w8_loop RET .hv_w128: lea r6d, [hq+(7<<16)] jmp .hv_w16_start .hv_w64: lea r6d, [hq+(3<<16)] jmp .hv_w16_start .hv_w32: lea r6d, [hq+(1<<16)] .hv_w16_start: mov r4, srcq %if ARCH_X86_32 %define m8 [dstq] %else mov r7, dstq %endif .hv_w16: movifnidn dsq, dsmp %if WIN64 movaps r4m, m8 %endif .hv_w16_loop0: movu m0, [srcq+8*0] movu m1, [srcq+8*1] pshufb m0, m4 pshufb m1, m4 pmaddubsw m0, m5 pmaddubsw m1, m5 .hv_w16_loop: add srcq, ssq movu m2, [srcq+8*0] movu m3, [srcq+8*1] pshufb m2, m4 pshufb m3, m4 pmaddubsw m2, m5 pmaddubsw m3, m5 mova m8, m2 psubw m2, m0 pmulhw m2, m6 pavgw m0, m7 paddw m2, m0 mova m0, m3 psubw m3, m1 pmulhw m3, m6 pavgw m1, m7 paddw m3, m1 mova m1, m0 mova m0, m8 psrlw m2, 4 psrlw m3, 4 packuswb m2, m3 mova [dstq], m2 add dstq, dsmp dec hd jg .hv_w16_loop %if ARCH_X86_32 mov dstq, dstm add r4, 16 movzx hd, r6w add dstq, 16 mov srcq, r4 mov dstm, dstq %else add r4, 16 add r7, 16 movzx hd, r6b mov srcq, r4 mov dstq, r7 %endif sub r6d, 1<<16 jg .hv_w16_loop0 %if WIN64 movaps m8, r4m %endif RET %macro PSHUFB_BILIN_H8 2 ; dst, src %if cpuflag(ssse3) pshufb %1, %2 %else psrldq %2, %1, 1 punpcklbw %1, %2 %endif %endmacro %macro PSHUFB_BILIN_H4 3 ; dst, src, tmp %if cpuflag(ssse3) pshufb %1, %2 %else psrldq %2, %1, 1 punpckhbw %3, %1, %2 punpcklbw %1, %2 punpcklqdq %1, %3 %endif %endmacro %macro PMADDUBSW 5 ; dst/src1, src2, zero, tmp, reset_zero %if cpuflag(ssse3) pmaddubsw %1, %2 %else %if %5 == 1 pxor %3, %3 %endif punpckhbw %4, %1, %3 punpcklbw %1, %1, %3 pmaddwd %4, %2 pmaddwd %1, %2 packssdw %1, %4 %endif %endmacro %macro PMULHRSW 5 ; dst, src, tmp, rndval, shift %if cpuflag(ssse3) pmulhrsw %1, %2 %else punpckhwd %3, %1, %4 punpcklwd %1, %4 pmaddwd %3, %2 pmaddwd %1, %2 psrad %3, %5 psrad %1, %5 packssdw %1, %3 %endif %endmacro %macro PREP_BILIN 0 %if ARCH_X86_32 %define base r6-prep%+SUFFIX %else %define base 0 %endif cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 movifnidn mxyd, r5m ; mx LEA r6, prep%+SUFFIX tzcnt wd, wm movifnidn hd, hm test mxyd, mxyd jnz .h mov mxyd, r6m ; my test mxyd, mxyd jnz .v .prep: %if notcpuflag(ssse3) add r6, prep_ssse3 - prep_sse2 jmp prep_ssse3 %else movzx wd, word [r6+wq*2+table_offset(prep,)] pxor m4, m4 add wq, r6 lea stride3q, [strideq*3] jmp wq .prep_w4: movd m0, [srcq+strideq*0] movd m1, [srcq+strideq*1] movd m2, [srcq+strideq*2] movd m3, [srcq+stride3q ] lea srcq, [srcq+strideq*4] punpckldq m0, m1 punpckldq m2, m3 punpcklbw m0, m4 punpcklbw m2, m4 psllw m0, 4 psllw m2, 4 mova [tmpq+16*0], m0 mova [tmpq+16*1], m2 add tmpq, 16*2 sub hd, 4 jg .prep_w4 RET .prep_w8: movq m0, [srcq+strideq*0] movq m1, [srcq+strideq*1] movq m2, [srcq+strideq*2] movq m3, [srcq+stride3q ] lea srcq, [srcq+strideq*4] punpcklbw m0, m4 punpcklbw m1, m4 punpcklbw m2, m4 punpcklbw m3, m4 psllw m0, 4 psllw m1, 4 psllw m2, 4 psllw m3, 4 mova [tmpq+16*0], m0 mova [tmpq+16*1], m1 mova [tmpq+16*2], m2 mova [tmpq+16*3], m3 add tmpq, 16*4 sub hd, 4 jg .prep_w8 RET .prep_w16: movu 
m1, [srcq+strideq*0] movu m3, [srcq+strideq*1] lea srcq, [srcq+strideq*2] punpcklbw m0, m1, m4 punpckhbw m1, m4 punpcklbw m2, m3, m4 punpckhbw m3, m4 psllw m0, 4 psllw m1, 4 psllw m2, 4 psllw m3, 4 mova [tmpq+16*0], m0 mova [tmpq+16*1], m1 mova [tmpq+16*2], m2 mova [tmpq+16*3], m3 add tmpq, 16*4 sub hd, 2 jg .prep_w16 RET .prep_w128: mov r3, -128 jmp .prep_w32_start .prep_w64: mov r3, -64 jmp .prep_w32_start .prep_w32: mov r3, -32 .prep_w32_start: sub srcq, r3 .prep_w32_vloop: mov r6, r3 .prep_w32_hloop: movu m1, [srcq+r6+16*0] movu m3, [srcq+r6+16*1] punpcklbw m0, m1, m4 punpckhbw m1, m4 punpcklbw m2, m3, m4 punpckhbw m3, m4 psllw m0, 4 psllw m1, 4 psllw m2, 4 psllw m3, 4 mova [tmpq+16*0], m0 mova [tmpq+16*1], m1 mova [tmpq+16*2], m2 mova [tmpq+16*3], m3 add tmpq, 16*4 add r6, 32 jl .prep_w32_hloop add srcq, strideq dec hd jg .prep_w32_vloop RET %endif .h: ; 16 * src[x] + (mx * (src[x + 1] - src[x])) ; = (16 - mx) * src[x] + mx * src[x + 1] %if cpuflag(ssse3) imul mxyd, 0x00ff00ff mova m4, [base+bilin_h_shuf8] add mxyd, 0x00100010 %else imul mxyd, 0xffff add mxyd, 16 %endif movd m5, mxyd mov mxyd, r6m ; my pshufd m5, m5, q0000 test mxyd, mxyd jnz .hv movzx wd, word [r6+wq*2+table_offset(prep, _bilin_h)] %if notcpuflag(ssse3) WIN64_SPILL_XMM 8 pxor m6, m6 %endif add wq, r6 jmp wq .h_w4: %if cpuflag(ssse3) mova m4, [base+bilin_h_shuf4] %endif lea stride3q, [strideq*3] .h_w4_loop: movq m0, [srcq+strideq*0] movhps m0, [srcq+strideq*1] movq m1, [srcq+strideq*2] movhps m1, [srcq+stride3q ] lea srcq, [srcq+strideq*4] PSHUFB_BILIN_H4 m0, m4, m2 PMADDUBSW m0, m5, m6, m2, 0 PSHUFB_BILIN_H4 m1, m4, m2 PMADDUBSW m1, m5, m6, m2, 0 mova [tmpq+0 ], m0 mova [tmpq+16], m1 add tmpq, 32 sub hd, 4 jg .h_w4_loop RET .h_w8: lea stride3q, [strideq*3] .h_w8_loop: movu m0, [srcq+strideq*0] movu m1, [srcq+strideq*1] movu m2, [srcq+strideq*2] movu m3, [srcq+stride3q ] lea srcq, [srcq+strideq*4] PSHUFB_BILIN_H8 m0, m4 PSHUFB_BILIN_H8 m1, m4 PSHUFB_BILIN_H8 m2, m4 PSHUFB_BILIN_H8 m3, m4 PMADDUBSW m0, m5, m6, m7, 0 PMADDUBSW m1, m5, m6, m7, 0 PMADDUBSW m2, m5, m6, m7, 0 PMADDUBSW m3, m5, m6, m7, 0 mova [tmpq+16*0], m0 mova [tmpq+16*1], m1 mova [tmpq+16*2], m2 mova [tmpq+16*3], m3 add tmpq, 16*4 sub hd, 4 jg .h_w8_loop RET .h_w16: movu m0, [srcq+strideq*0+8*0] movu m1, [srcq+strideq*0+8*1] movu m2, [srcq+strideq*1+8*0] movu m3, [srcq+strideq*1+8*1] lea srcq, [srcq+strideq*2] PSHUFB_BILIN_H8 m0, m4 PSHUFB_BILIN_H8 m1, m4 PSHUFB_BILIN_H8 m2, m4 PSHUFB_BILIN_H8 m3, m4 PMADDUBSW m0, m5, m6, m7, 0 PMADDUBSW m1, m5, m6, m7, 0 PMADDUBSW m2, m5, m6, m7, 0 PMADDUBSW m3, m5, m6, m7, 0 mova [tmpq+16*0], m0 mova [tmpq+16*1], m1 mova [tmpq+16*2], m2 mova [tmpq+16*3], m3 add tmpq, 16*4 sub hd, 2 jg .h_w16 RET .h_w128: mov r3, -128 jmp .h_w32_start .h_w64: mov r3, -64 jmp .h_w32_start .h_w32: mov r3, -32 .h_w32_start: sub srcq, r3 .h_w32_vloop: mov r6, r3 .h_w32_hloop: movu m0, [srcq+r6+8*0] movu m1, [srcq+r6+8*1] movu m2, [srcq+r6+8*2] movu m3, [srcq+r6+8*3] PSHUFB_BILIN_H8 m0, m4 PSHUFB_BILIN_H8 m1, m4 PSHUFB_BILIN_H8 m2, m4 PSHUFB_BILIN_H8 m3, m4 PMADDUBSW m0, m5, m6, m7, 0 PMADDUBSW m1, m5, m6, m7, 0 PMADDUBSW m2, m5, m6, m7, 0 PMADDUBSW m3, m5, m6, m7, 0 mova [tmpq+16*0], m0 mova [tmpq+16*1], m1 mova [tmpq+16*2], m2 mova [tmpq+16*3], m3 add tmpq, 16*4 add r6, 32 jl .h_w32_hloop add srcq, strideq dec hd jg .h_w32_vloop RET .v: %if notcpuflag(ssse3) %assign stack_offset stack_offset - stack_size_padded WIN64_SPILL_XMM 8 %endif movzx wd, word [r6+wq*2+table_offset(prep, _bilin_v)] %if cpuflag(ssse3) imul mxyd, 0x00ff00ff add mxyd, 
0x00100010 %else imul mxyd, 0xffff pxor m6, m6 add mxyd, 16 %endif add wq, r6 lea stride3q, [strideq*3] movd m5, mxyd pshufd m5, m5, q0000 jmp wq .v_w4: movd m0, [srcq+strideq*0] .v_w4_loop: movd m1, [srcq+strideq*1] movd m2, [srcq+strideq*2] movd m3, [srcq+stride3q ] lea srcq, [srcq+strideq*4] punpckldq m0, m1 punpckldq m1, m2 punpcklbw m0, m1 ; 01 12 PMADDUBSW m0, m5, m6, m7, 0 mova [tmpq+16*0], m0 movd m0, [srcq+strideq*0] punpckldq m2, m3 punpckldq m3, m0 punpcklbw m2, m3 ; 23 34 PMADDUBSW m2, m5, m6, m7, 0 mova [tmpq+16*1], m2 add tmpq, 16*2 sub hd, 4 jg .v_w4_loop RET .v_w8: movq m0, [srcq+strideq*0] .v_w8_loop: movq m1, [srcq+strideq*1] movq m2, [srcq+strideq*2] movq m3, [srcq+stride3q ] lea srcq, [srcq+strideq*4] punpcklbw m0, m1 ; 01 punpcklbw m1, m2 ; 12 PMADDUBSW m0, m5, m6, m7, 0 PMADDUBSW m1, m5, m6, m7, 0 mova [tmpq+16*0], m0 movq m0, [srcq+strideq*0] punpcklbw m2, m3 ; 23 punpcklbw m3, m0 ; 34 PMADDUBSW m2, m5, m6, m7, 0 mova [tmpq+16*1], m1 PMADDUBSW m3, m5, m6, m7, 0 mova [tmpq+16*2], m2 mova [tmpq+16*3], m3 add tmpq, 16*4 sub hd, 4 jg .v_w8_loop RET .v_w16: movu m0, [srcq+strideq*0] .v_w16_loop: movu m1, [srcq+strideq*1] movu m2, [srcq+strideq*2] movu m3, [srcq+stride3q ] lea srcq, [srcq+strideq*4] punpcklbw m4, m0, m1 punpckhbw m0, m1 PMADDUBSW m4, m5, m6, m7, 0 PMADDUBSW m0, m5, m6, m7, 0 mova [tmpq+16*0], m4 punpcklbw m4, m1, m2 punpckhbw m1, m2 PMADDUBSW m4, m5, m6, m7, 0 mova [tmpq+16*1], m0 movu m0, [srcq+strideq*0] PMADDUBSW m1, m5, m6, m7, 0 mova [tmpq+16*2], m4 punpcklbw m4, m2, m3 punpckhbw m2, m3 PMADDUBSW m4, m5, m6, m7, 0 mova [tmpq+16*3], m1 PMADDUBSW m2, m5, m6, m7, 0 mova [tmpq+16*4], m4 punpcklbw m4, m3, m0 punpckhbw m3, m0 PMADDUBSW m4, m5, m6, m7, 0 mova [tmpq+16*5], m2 PMADDUBSW m3, m5, m6, m7, 0 mova [tmpq+16*6], m4 mova [tmpq+16*7], m3 add tmpq, 16*8 sub hd, 4 jg .v_w16_loop RET .v_w128: lea r3d, [hq+(3<<8)] mov r6d, 256 jmp .v_w32_start .v_w64: lea r3d, [hq+(1<<8)] mov r6d, 128 jmp .v_w32_start .v_w32: xor r3d, r3d mov r6d, 64 .v_w32_start: %if ARCH_X86_64 %if WIN64 PUSH r7 %endif mov r7, tmpq %endif mov r5, srcq .v_w32_hloop: movu m0, [srcq+strideq*0+16*0] movu m1, [srcq+strideq*0+16*1] .v_w32_vloop: movu m2, [srcq+strideq*1+16*0] movu m3, [srcq+strideq*1+16*1] lea srcq, [srcq+strideq*2] punpcklbw m4, m0, m2 punpckhbw m0, m2 PMADDUBSW m4, m5, m6, m7, 0 PMADDUBSW m0, m5, m6, m7, 0 mova [tmpq+16*0], m4 mova [tmpq+16*1], m0 movu m0, [srcq+strideq*0+16*0] punpcklbw m4, m1, m3 punpckhbw m1, m3 PMADDUBSW m4, m5, m6, m7, 0 PMADDUBSW m1, m5, m6, m7, 0 mova [tmpq+16*2], m4 mova [tmpq+16*3], m1 movu m1, [srcq+strideq*0+16*1] add tmpq, r6 punpcklbw m4, m2, m0 punpckhbw m2, m0 PMADDUBSW m4, m5, m6, m7, 0 PMADDUBSW m2, m5, m6, m7, 0 mova [tmpq+16*0], m4 mova [tmpq+16*1], m2 punpcklbw m4, m3, m1 punpckhbw m3, m1 PMADDUBSW m4, m5, m6, m7, 0 PMADDUBSW m3, m5, m6, m7, 0 mova [tmpq+16*2], m4 mova [tmpq+16*3], m3 add tmpq, r6 sub hd, 2 jg .v_w32_vloop add r5, 32 movzx hd, r3b mov srcq, r5 %if ARCH_X86_64 add r7, 16*4 mov tmpq, r7 %else mov tmpq, tmpmp add tmpq, 16*4 mov tmpmp, tmpq %endif sub r3d, 1<<8 jg .v_w32_hloop %if WIN64 POP r7 %endif RET .hv: ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 8) >> 4 ; = src[x] + (((my * (src[x + src_stride] - src[x])) + 8) >> 4) movzx wd, word [r6+wq*2+table_offset(prep, _bilin_hv)] %assign stack_offset stack_offset - stack_size_padded %if cpuflag(ssse3) imul mxyd, 0x08000800 WIN64_SPILL_XMM 8 %else or mxyd, 1<<16 WIN64_SPILL_XMM 9 %if ARCH_X86_64 mova m8, [base+pw_8] %else %define m8 [base+pw_8] %endif pxor m7, m7 
%endif movd m6, mxyd add wq, r6 pshufd m6, m6, q0000 jmp wq .hv_w4: %if cpuflag(ssse3) mova m4, [base+bilin_h_shuf4] movddup m0, [srcq+strideq*0] %else movhps m0, [srcq+strideq*0] %endif lea r3, [strideq*3] PSHUFB_BILIN_H4 m0, m4, m3 PMADDUBSW m0, m5, m7, m4, 0 ; _ 0 .hv_w4_loop: movq m1, [srcq+strideq*1] movhps m1, [srcq+strideq*2] movq m2, [srcq+r3 ] lea srcq, [srcq+strideq*4] movhps m2, [srcq+strideq*0] PSHUFB_BILIN_H4 m1, m4, m3 PSHUFB_BILIN_H4 m2, m4, m3 PMADDUBSW m1, m5, m7, m4, 0 ; 1 2 PMADDUBSW m2, m5, m7, m4, 0 ; 3 4 shufpd m0, m1, 0x01 ; 0 1 shufpd m3, m1, m2, 0x01 ; 2 3 psubw m1, m0 PMULHRSW m1, m6, m4, m8, 4 paddw m1, m0 mova m0, m2 psubw m2, m3 PMULHRSW m2, m6, m4, m8, 4 paddw m2, m3 mova [tmpq+16*0], m1 mova [tmpq+16*1], m2 add tmpq, 32 sub hd, 4 jg .hv_w4_loop RET .hv_w8: movu m0, [srcq+strideq*0] PSHUFB_BILIN_H8 m0, m4 PMADDUBSW m0, m5, m7, m4, 0 ; 0 .hv_w8_loop: movu m1, [srcq+strideq*1] lea srcq, [srcq+strideq*2] movu m2, [srcq+strideq*0] PSHUFB_BILIN_H8 m1, m4 PSHUFB_BILIN_H8 m2, m4 PMADDUBSW m1, m5, m7, m4, 0 ; 1 PMADDUBSW m2, m5, m7, m4, 0 ; 2 psubw m3, m1, m0 PMULHRSW m3, m6, m4, m8, 4 paddw m3, m0 mova m0, m2 psubw m2, m1 PMULHRSW m2, m6, m4, m8, 4 paddw m2, m1 mova [tmpq+16*0], m3 mova [tmpq+16*1], m2 add tmpq, 16*2 sub hd, 2 jg .hv_w8_loop RET .hv_w128: lea r3d, [hq+(7<<8)] mov r5d, 256 jmp .hv_w16_start .hv_w64: lea r3d, [hq+(3<<8)] mov r5d, 128 jmp .hv_w16_start .hv_w32: lea r3d, [hq+(1<<8)] mov r5d, 64 jmp .hv_w16_start .hv_w16: xor r3d, r3d mov r5d, 32 .hv_w16_start: %if ARCH_X86_64 || cpuflag(ssse3) mov r6, srcq %endif %if ARCH_X86_64 %if WIN64 PUSH r7 %endif mov r7, tmpq %endif .hv_w16_hloop: movu m0, [srcq+strideq*0+8*0] movu m1, [srcq+strideq*0+8*1] PSHUFB_BILIN_H8 m0, m4 PSHUFB_BILIN_H8 m1, m4 PMADDUBSW m0, m5, m7, m4, 0 ; 0a PMADDUBSW m1, m5, m7, m4, 0 ; 0b .hv_w16_vloop: movu m2, [srcq+strideq*1+8*0] PSHUFB_BILIN_H8 m2, m4 PMADDUBSW m2, m5, m7, m4, 0 ; 1a psubw m3, m2, m0 PMULHRSW m3, m6, m4, m8, 4 paddw m3, m0 mova [tmpq+16*0], m3 movu m3, [srcq+strideq*1+8*1] lea srcq, [srcq+strideq*2] PSHUFB_BILIN_H8 m3, m4 PMADDUBSW m3, m5, m7, m4, 0 ; 1b psubw m0, m3, m1 PMULHRSW m0, m6, m4, m8, 4 paddw m0, m1 mova [tmpq+16*1], m0 add tmpq, r5 movu m0, [srcq+strideq*0+8*0] PSHUFB_BILIN_H8 m0, m4 PMADDUBSW m0, m5, m7, m4, 0 ; 2a psubw m1, m0, m2 PMULHRSW m1, m6, m4, m8, 4 paddw m1, m2 mova [tmpq+16*0], m1 movu m1, [srcq+strideq*0+8*1] PSHUFB_BILIN_H8 m1, m4 PMADDUBSW m1, m5, m7, m4, 0 ; 2b psubw m2, m1, m3 PMULHRSW m2, m6, m4, m8, 4 paddw m2, m3 mova [tmpq+16*1], m2 add tmpq, r5 sub hd, 2 jg .hv_w16_vloop movzx hd, r3b %if ARCH_X86_64 add r6, 16 add r7, 2*16 mov srcq, r6 mov tmpq, r7 %elif cpuflag(ssse3) mov tmpq, tmpm add r6, 16 add tmpq, 2*16 mov srcq, r6 mov tmpm, tmpq %else mov srcq, srcm mov tmpq, tmpm add srcq, 16 add tmpq, 2*16 mov srcm, srcq mov tmpm, tmpq %endif sub r3d, 1<<8 jg .hv_w16_hloop %if WIN64 POP r7 %endif RET %endmacro ; int8_t subpel_filters[5][15][8] %assign FILTER_REGULAR (0*15 << 16) | 3*15 %assign FILTER_SMOOTH (1*15 << 16) | 4*15 %assign FILTER_SHARP (2*15 << 16) | 3*15 %macro FN 4 ; prefix, type, type_h, type_v cglobal %1_%2_8bpc mov t0d, FILTER_%3 %ifidn %3, %4 mov t1d, t0d %else mov t1d, FILTER_%4 %endif %ifnidn %2, regular ; skip the jump in the last filter jmp mangle(private_prefix %+ _%1_8bpc %+ SUFFIX) %endif %endmacro %if ARCH_X86_32 DECLARE_REG_TMP 1, 2 %elif WIN64 DECLARE_REG_TMP 4, 5 %else DECLARE_REG_TMP 7, 8 %endif FN put_8tap, sharp, SHARP, SHARP FN put_8tap, sharp_smooth, SHARP, SMOOTH FN put_8tap, smooth_sharp, SMOOTH, SHARP 
FN put_8tap, smooth, SMOOTH, SMOOTH FN put_8tap, sharp_regular, SHARP, REGULAR FN put_8tap, regular_sharp, REGULAR, SHARP FN put_8tap, smooth_regular, SMOOTH, REGULAR FN put_8tap, regular_smooth, REGULAR, SMOOTH FN put_8tap, regular, REGULAR, REGULAR %if ARCH_X86_32 %define base_reg r1 %define base base_reg-put_ssse3 %else %define base_reg r8 %define base 0 %endif cglobal put_8tap_8bpc, 1, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 %assign org_stack_offset stack_offset imul mxd, mxm, 0x010101 add mxd, t0d ; 8tap_h, mx, 4tap_h %if ARCH_X86_64 imul myd, mym, 0x010101 add myd, t1d ; 8tap_v, my, 4tap_v %else imul ssd, mym, 0x010101 add ssd, t1d ; 8tap_v, my, 4tap_v mov srcq, srcm %endif mov wd, wm movifnidn hd, hm LEA base_reg, put_ssse3 test mxd, 0xf00 jnz .h %if ARCH_X86_32 test ssd, 0xf00 %else test myd, 0xf00 %endif jnz .v tzcnt wd, wd movzx wd, word [base_reg+wq*2+table_offset(put,)] add wq, base_reg ; put_bilin mangling jump %assign stack_offset org_stack_offset movifnidn dsq, dsmp movifnidn ssq, ssmp %if WIN64 pop r8 %endif lea r6, [ssq*3] jmp wq .h: %if ARCH_X86_32 test ssd, 0xf00 %else test myd, 0xf00 %endif jnz .hv movifnidn ssq, ssmp WIN64_SPILL_XMM 12 cmp wd, 4 jl .h_w2 je .h_w4 tzcnt wd, wd %if ARCH_X86_64 mova m10, [base+subpel_h_shufA] mova m11, [base+subpel_h_shufB] mova m9, [base+subpel_h_shufC] %endif shr mxd, 16 sub srcq, 3 movzx wd, word [base_reg+wq*2+table_offset(put, _8tap_h)] movq m6, [base_reg+mxq*8+subpel_filters-put_ssse3] mova m7, [base+pw_34] ; 2 + (8 << 2) pshufd m5, m6, q0000 pshufd m6, m6, q1111 add wq, base_reg jmp wq .h_w2: %if ARCH_X86_32 and mxd, 0x7f %else movzx mxd, mxb %endif dec srcq mova m4, [base+subpel_h_shuf4] movd m3, [base_reg+mxq*8+subpel_filters-put_ssse3+2] mova m5, [base+pw_34] ; 2 + (8 << 2) pshufd m3, m3, q0000 movifnidn dsq, dsmp .h_w2_loop: movq m0, [srcq+ssq*0] movhps m0, [srcq+ssq*1] lea srcq, [srcq+ssq*2] pshufb m0, m4 pmaddubsw m0, m3 phaddw m0, m0 paddw m0, m5 ; pw34 psraw m0, 6 packuswb m0, m0 movd r6d, m0 mov [dstq+dsq*0], r6w shr r6d, 16 mov [dstq+dsq*1], r6w lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w2_loop RET .h_w4: %if ARCH_X86_32 and mxd, 0x7f %else movzx mxd, mxb %endif dec srcq movd m3, [base_reg+mxq*8+subpel_filters-put_ssse3+2] mova m6, [base+subpel_h_shufA] mova m5, [base+pw_34] ; 2 + (8 << 2) pshufd m3, m3, q0000 movifnidn dsq, dsmp .h_w4_loop: movq m0, [srcq+ssq*0] ; 1 movq m1, [srcq+ssq*1] ; 2 lea srcq, [srcq+ssq*2] pshufb m0, m6 ; subpel_h_shufA pshufb m1, m6 ; subpel_h_shufA pmaddubsw m0, m3 ; subpel_filters pmaddubsw m1, m3 ; subpel_filters phaddw m0, m1 paddw m0, m5 ; pw34 psraw m0, 6 packuswb m0, m0 movd [dstq+dsq*0], m0 psrlq m0, 32 movd [dstq+dsq*1], m0 lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w4_loop RET %macro PUT_8TAP_H 4 ; dst/src, tmp[1-3] %if ARCH_X86_32 pshufb %2, %1, [base+subpel_h_shufB] pshufb %3, %1, [base+subpel_h_shufC] pshufb %1, [base+subpel_h_shufA] %else pshufb %2, %1, m11; subpel_h_shufB pshufb %3, %1, m9 ; subpel_h_shufC pshufb %1, m10 ; subpel_h_shufA %endif pmaddubsw %4, %2, m5 ; subpel +0 B0 pmaddubsw %2, m6 ; subpel +4 B4 pmaddubsw %3, m6 ; C4 pmaddubsw %1, m5 ; A0 paddw %3, %4 ; C4+B0 paddw %1, %2 ; A0+B4 phaddw %1, %3 paddw %1, m7 ; pw34 psraw %1, 6 %endmacro .h_w8: movu m0, [srcq+ssq*0] movu m1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] PUT_8TAP_H m0, m2, m3, m4 PUT_8TAP_H m1, m2, m3, m4 packuswb m0, m1 %if ARCH_X86_32 movq [dstq], m0 add dstq, dsm movhps [dstq], m0 add dstq, dsm %else movq [dstq+dsq*0], m0 movhps [dstq+dsq*1], m0 lea dstq, [dstq+dsq*2] %endif sub hd, 2 jg .h_w8 RET .h_w128: mov r4, 
-16*7 jmp .h_w16_start .h_w64: mov r4, -16*3 jmp .h_w16_start .h_w32: mov r4, -16*1 jmp .h_w16_start .h_w16: xor r4d, r4d .h_w16_start: sub srcq, r4 sub dstq, r4 .h_w16_loop_v: mov r6, r4 .h_w16_loop_h: movu m0, [srcq+r6+8*0] movu m1, [srcq+r6+8*1] PUT_8TAP_H m0, m2, m3, m4 PUT_8TAP_H m1, m2, m3, m4 packuswb m0, m1 mova [dstq+r6], m0 add r6, 16 jle .h_w16_loop_h add srcq, ssq add dstq, dsmp dec hd jg .h_w16_loop_v RET .v: %if ARCH_X86_32 movzx mxd, ssb shr ssd, 16 cmp hd, 6 cmovs ssd, mxd movq m0, [base_reg+ssq*8+subpel_filters-put_ssse3] %else %assign stack_offset org_stack_offset WIN64_SPILL_XMM 16 movzx mxd, myb shr myd, 16 cmp hd, 6 cmovs myd, mxd movq m0, [base_reg+myq*8+subpel_filters-put_ssse3] %endif tzcnt r6d, wd movzx r6d, word [base_reg+r6*2+table_offset(put, _8tap_v)] punpcklwd m0, m0 mova m7, [base+pw_512] add r6, base_reg %if ARCH_X86_32 %define subpel0 [rsp+mmsize*0] %define subpel1 [rsp+mmsize*1] %define subpel2 [rsp+mmsize*2] %define subpel3 [rsp+mmsize*3] %assign regs_used 2 ; use r1 (ds) as tmp for stack alignment if needed ALLOC_STACK -16*4 %assign regs_used 7 pshufd m1, m0, q0000 mova subpel0, m1 pshufd m1, m0, q1111 mova subpel1, m1 pshufd m1, m0, q2222 mova subpel2, m1 pshufd m1, m0, q3333 mova subpel3, m1 mov ssq, [rstk+stack_offset+gprsize*4] lea ssq, [ssq*3] sub srcq, ssq mov ssq, [rstk+stack_offset+gprsize*4] mov dsq, [rstk+stack_offset+gprsize*2] %else %define subpel0 m8 %define subpel1 m9 %define subpel2 m10 %define subpel3 m11 lea ss3q, [ssq*3] pshufd m8, m0, q0000 sub srcq, ss3q pshufd m9, m0, q1111 pshufd m10, m0, q2222 pshufd m11, m0, q3333 %endif jmp r6 .v_w2: movd m1, [srcq+ssq*0] movd m0, [srcq+ssq*1] %if ARCH_X86_32 lea srcq, [srcq+ssq*2] movd m2, [srcq+ssq*0] movd m5, [srcq+ssq*1] lea srcq, [srcq+ssq*2] movd m3, [srcq+ssq*0] movd m4, [srcq+ssq*1] lea srcq, [srcq+ssq*2] %else movd m2, [srcq+ssq*2] add srcq, ss3q movd m5, [srcq+ssq*0] movd m3, [srcq+ssq*1] movd m4, [srcq+ssq*2] add srcq, ss3q %endif punpcklwd m1, m0 ; 0 1 punpcklwd m0, m2 ; 1 2 punpcklbw m1, m0 ; 01 12 movd m0, [srcq+ssq*0] punpcklwd m2, m5 ; 2 3 punpcklwd m5, m3 ; 3 4 punpcklwd m3, m4 ; 4 5 punpcklwd m4, m0 ; 5 6 punpcklbw m2, m5 ; 23 34 punpcklbw m3, m4 ; 45 56 .v_w2_loop: movd m4, [srcq+ssq*1] lea srcq, [srcq+ssq*2] pmaddubsw m5, m1, subpel0 ; a0 b0 mova m1, m2 pmaddubsw m2, subpel1 ; a1 b1 paddw m5, m2 mova m2, m3 pmaddubsw m3, subpel2 ; a2 b2 paddw m5, m3 punpcklwd m3, m0, m4 ; 6 7 movd m0, [srcq+ssq*0] punpcklwd m4, m0 ; 7 8 punpcklbw m3, m4 ; 67 78 pmaddubsw m4, m3, subpel3 ; a3 b3 paddw m5, m4 pmulhrsw m5, m7 packuswb m5, m5 movd r6d, m5 mov [dstq+dsq*0], r6w shr r6d, 16 mov [dstq+dsq*1], r6w lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w2_loop RET .v_w4: %if ARCH_X86_32 .v_w8: .v_w16: .v_w32: .v_w64: .v_w128: shl wd, 14 %if STACK_ALIGNMENT < 16 %define dstm [rsp+mmsize*4+gprsize] mov dstm, dstq %endif lea r6d, [hq+wq-(1<<16)] mov r4, srcq .v_w4_loop0: %endif movd m1, [srcq+ssq*0] movd m0, [srcq+ssq*1] %if ARCH_X86_32 lea srcq, [srcq+ssq*2] movd m2, [srcq+ssq*0] movd m5, [srcq+ssq*1] lea srcq, [srcq+ssq*2] movd m3, [srcq+ssq*0] movd m4, [srcq+ssq*1] lea srcq, [srcq+ssq*2] %else movd m2, [srcq+ssq*2] add srcq, ss3q movd m5, [srcq+ssq*0] movd m3, [srcq+ssq*1] movd m4, [srcq+ssq*2] add srcq, ss3q %endif punpckldq m1, m0 ; 0 1 punpckldq m0, m2 ; 1 2 punpcklbw m1, m0 ; 01 12 movd m0, [srcq+ssq*0] punpckldq m2, m5 ; 2 3 punpckldq m5, m3 ; 3 4 punpckldq m3, m4 ; 4 5 punpckldq m4, m0 ; 5 6 punpcklbw m2, m5 ; 23 34 punpcklbw m3, m4 ; 45 56 .v_w4_loop: movd m4, [srcq+ssq*1] lea srcq, 
[srcq+ssq*2] pmaddubsw m5, m1, subpel0 ; a0 b0 mova m1, m2 pmaddubsw m2, subpel1 ; a1 b1 paddw m5, m2 mova m2, m3 pmaddubsw m3, subpel2 ; a2 b2 paddw m5, m3 punpckldq m3, m0, m4 ; 6 7 _ _ movd m0, [srcq+ssq*0] punpckldq m4, m0 ; 7 8 _ _ punpcklbw m3, m4 ; 67 78 pmaddubsw m4, m3, subpel3 ; a3 b3 paddw m5, m4 pmulhrsw m5, m7 packuswb m5, m5 movd [dstq+dsq*0], m5 psrlq m5, 32 movd [dstq+dsq*1], m5 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w4_loop %if ARCH_X86_32 mov dstq, dstm add r4, 4 movzx hd, r6w add dstq, 4 mov srcq, r4 mov dstm, dstq sub r6d, 1<<16 jg .v_w4_loop0 %endif RET %if ARCH_X86_64 .v_w8: .v_w16: .v_w32: .v_w64: .v_w128: lea r6d, [wq*8-64] mov r4, srcq mov r7, dstq lea r6d, [hq+r6*4] .v_w8_loop0: movq m1, [srcq+ssq*0] movq m2, [srcq+ssq*1] movq m3, [srcq+ssq*2] add srcq, ss3q movq m4, [srcq+ssq*0] movq m5, [srcq+ssq*1] movq m6, [srcq+ssq*2] add srcq, ss3q movq m0, [srcq+ssq*0] punpcklbw m1, m2 ; 01 punpcklbw m2, m3 ; 12 punpcklbw m3, m4 ; 23 punpcklbw m4, m5 ; 34 punpcklbw m5, m6 ; 45 punpcklbw m6, m0 ; 56 .v_w8_loop: movq m13, [srcq+ssq*1] lea srcq, [srcq+ssq*2] pmaddubsw m14, m1, subpel0 ; a0 mova m1, m3 pmaddubsw m15, m2, subpel0 ; b0 mova m2, m4 pmaddubsw m3, subpel1 ; a1 mova m12, m0 pmaddubsw m4, subpel1 ; b1 movq m0, [srcq+ssq*0] paddw m14, m3 paddw m15, m4 mova m3, m5 pmaddubsw m5, subpel2 ; a2 mova m4, m6 pmaddubsw m6, subpel2 ; b2 punpcklbw m12, m13 ; 67 punpcklbw m13, m0 ; 78 paddw m14, m5 mova m5, m12 pmaddubsw m12, subpel3 ; a3 paddw m15, m6 mova m6, m13 pmaddubsw m13, subpel3 ; b3 paddw m14, m12 paddw m15, m13 pmulhrsw m14, m7 pmulhrsw m15, m7 packuswb m14, m15 movq [dstq+dsq*0], m14 movhps [dstq+dsq*1], m14 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w8_loop add r4, 8 add r7, 8 movzx hd, r6b mov srcq, r4 mov dstq, r7 sub r6d, 1<<8 jg .v_w8_loop0 RET %endif ;ARCH_X86_64 %undef subpel0 %undef subpel1 %undef subpel2 %undef subpel3 .hv: %assign stack_offset org_stack_offset cmp wd, 4 jg .hv_w8 %if ARCH_X86_32 and mxd, 0x7f %else movzx mxd, mxb %endif dec srcq movd m1, [base_reg+mxq*8+subpel_filters-put_ssse3+2] %if ARCH_X86_32 movzx mxd, ssb shr ssd, 16 cmp hd, 6 cmovs ssd, mxd movq m0, [base_reg+ssq*8+subpel_filters-put_ssse3] mov ssq, ssmp lea r6, [ssq*3] sub srcq, r6 %define base_reg r6 mov r6, r1; use as new base %assign regs_used 2 ALLOC_STACK -mmsize*14 %assign regs_used 7 mov dsq, [rstk+stack_offset+gprsize*2] %define subpelv0 [rsp+mmsize*0] %define subpelv1 [rsp+mmsize*1] %define subpelv2 [rsp+mmsize*2] %define subpelv3 [rsp+mmsize*3] punpcklbw m0, m0 psraw m0, 8 ; sign-extend pshufd m6, m0, q0000 mova subpelv0, m6 pshufd m6, m0, q1111 mova subpelv1, m6 pshufd m6, m0, q2222 mova subpelv2, m6 pshufd m6, m0, q3333 mova subpelv3, m6 %else movzx mxd, myb shr myd, 16 cmp hd, 6 cmovs myd, mxd movq m0, [base_reg+myq*8+subpel_filters-put_ssse3] ALLOC_STACK mmsize*14, 14 lea ss3q, [ssq*3] sub srcq, ss3q %define subpelv0 m10 %define subpelv1 m11 %define subpelv2 m12 %define subpelv3 m13 punpcklbw m0, m0 psraw m0, 8 ; sign-extend mova m8, [base+pw_8192] mova m9, [base+pd_512] pshufd m10, m0, q0000 pshufd m11, m0, q1111 pshufd m12, m0, q2222 pshufd m13, m0, q3333 %endif pshufd m7, m1, q0000 cmp wd, 4 je .hv_w4 .hv_w2: mova m6, [base+subpel_h_shuf4] movq m2, [srcq+ssq*0] ; 0 movhps m2, [srcq+ssq*1] ; 0 _ 1 %if ARCH_X86_32 %define w8192reg [base+pw_8192] %define d512reg [base+pd_512] lea srcq, [srcq+ssq*2] movq m0, [srcq+ssq*0] ; 2 movhps m0, [srcq+ssq*1] ; 2 _ 3 lea srcq, [srcq+ssq*2] %else %define w8192reg m8 %define d512reg m9 movq m0, [srcq+ssq*2] ; 2 add srcq, ss3q movhps m0, 
[srcq+ssq*0] ; 2 _ 3 %endif pshufb m2, m6 ; 0 ~ 1 ~ pshufb m0, m6 ; 2 ~ 3 ~ pmaddubsw m2, m7 ; subpel_filters pmaddubsw m0, m7 ; subpel_filters phaddw m2, m0 ; 0 1 2 3 pmulhrsw m2, w8192reg %if ARCH_X86_32 movq m3, [srcq+ssq*0] ; 4 movhps m3, [srcq+ssq*1] ; 4 _ 5 lea srcq, [srcq+ssq*2] %else movq m3, [srcq+ssq*1] ; 4 movhps m3, [srcq+ssq*2] ; 4 _ 5 add srcq, ss3q %endif movq m0, [srcq+ssq*0] ; 6 pshufb m3, m6 ; 4 ~ 5 ~ pshufb m0, m6 ; 6 ~ pmaddubsw m3, m7 ; subpel_filters pmaddubsw m0, m7 ; subpel_filters phaddw m3, m0 ; 4 5 6 _ pmulhrsw m3, w8192reg palignr m4, m3, m2, 4; V 1 2 3 4 punpcklwd m1, m2, m4 ; V 01 12 0 1 1 2 punpckhwd m2, m4 ; V 23 34 2 3 3 4 pshufd m0, m3, q2121; V 5 6 5 6 punpcklwd m3, m0 ; V 45 56 4 5 5 6 .hv_w2_loop: movq m4, [srcq+ssq*1] ; V 7 lea srcq, [srcq+ssq*2] ; V movhps m4, [srcq+ssq*0] ; V 7 8 pshufb m4, m6 pmaddubsw m4, m7 pmaddwd m5, m1, subpelv0; V a0 b0 mova m1, m2 ; V pmaddwd m2, subpelv1 ; V a1 b1 paddd m5, m2 ; V mova m2, m3 ; V pmaddwd m3, subpelv2 ; a2 b2 phaddw m4, m4 pmulhrsw m4, w8192reg paddd m5, m3 ; V palignr m3, m4, m0, 12 mova m0, m4 punpcklwd m3, m0 ; V 67 78 pmaddwd m4, m3, subpelv3 ; V a3 b3 paddd m5, d512reg paddd m5, m4 psrad m5, 10 packssdw m5, m5 packuswb m5, m5 movd r4d, m5 mov [dstq+dsq*0], r4w shr r4d, 16 mov [dstq+dsq*1], r4w lea dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w2_loop RET %undef w8192reg %undef d512reg .hv_w4: %define hv4_line_0_0 4 %define hv4_line_0_1 5 %define hv4_line_0_2 6 %define hv4_line_0_3 7 %define hv4_line_0_4 8 %define hv4_line_0_5 9 %define hv4_line_1_0 10 %define hv4_line_1_1 11 %define hv4_line_1_2 12 %define hv4_line_1_3 13 %macro SAVELINE_W4 3 mova [rsp+mmsize*hv4_line_%3_%2], %1 %endmacro %macro RESTORELINE_W4 3 mova %1, [rsp+mmsize*hv4_line_%3_%2] %endmacro %if ARCH_X86_32 %define w8192reg [base+pw_8192] %define d512reg [base+pd_512] %else %define w8192reg m8 %define d512reg m9 %endif ; lower shuffle 0 1 2 3 4 mova m6, [base+subpel_h_shuf4] movq m5, [srcq+ssq*0] ; 0 _ _ _ movhps m5, [srcq+ssq*1] ; 0 _ 1 _ %if ARCH_X86_32 lea srcq, [srcq+ssq*2] movq m4, [srcq+ssq*0] ; 2 _ _ _ movhps m4, [srcq+ssq*1] ; 2 _ 3 _ lea srcq, [srcq+ssq*2] %else movq m4, [srcq+ssq*2] ; 2 _ _ _ movhps m4, [srcq+ss3q ] ; 2 _ 3 _ lea srcq, [srcq+ssq*4] %endif pshufb m2, m5, m6 ;H subpel_h_shuf4 0 ~ 1 ~ pshufb m0, m4, m6 ;H subpel_h_shuf4 2 ~ 3 ~ pmaddubsw m2, m7 ;H subpel_filters pmaddubsw m0, m7 ;H subpel_filters phaddw m2, m0 ;H 0 1 2 3 pmulhrsw m2, w8192reg ;H pw_8192 SAVELINE_W4 m2, 2, 0 ; upper shuffle 2 3 4 5 6 mova m6, [base+subpel_h_shuf4+16] pshufb m2, m5, m6 ;H subpel_h_shuf4 0 ~ 1 ~ pshufb m0, m4, m6 ;H subpel_h_shuf4 2 ~ 3 ~ pmaddubsw m2, m7 ;H subpel_filters pmaddubsw m0, m7 ;H subpel_filters phaddw m2, m0 ;H 0 1 2 3 pmulhrsw m2, w8192reg ;H pw_8192 ; ; lower shuffle mova m6, [base+subpel_h_shuf4] movq m5, [srcq+ssq*0] ; 4 _ _ _ movhps m5, [srcq+ssq*1] ; 4 _ 5 _ %if ARCH_X86_32 lea srcq, [srcq+ssq*2] movq m4, [srcq+ssq*0] ; 6 _ _ _ add srcq, ssq %else movq m4, [srcq+ssq*2] ; 6 _ _ _ add srcq, ss3q %endif pshufb m3, m5, m6 ;H subpel_h_shuf4 4 ~ 5 ~ pshufb m0, m4, m6 ;H subpel_h_shuf4 6 ~ 6 ~ pmaddubsw m3, m7 ;H subpel_filters pmaddubsw m0, m7 ;H subpel_filters phaddw m3, m0 ;H 4 5 6 7 pmulhrsw m3, w8192reg ;H pw_8192 SAVELINE_W4 m3, 3, 0 ; upper shuffle mova m6, [base+subpel_h_shuf4+16] pshufb m3, m5, m6 ;H subpel_h_shuf4 4 ~ 5 ~ pshufb m0, m4, m6 ;H subpel_h_shuf4 6 ~ 6 ~ pmaddubsw m3, m7 ;H subpel_filters pmaddubsw m0, m7 ;H subpel_filters phaddw m3, m0 ;H 4 5 6 7 pmulhrsw m3, w8192reg ;H pw_8192 ;process high palignr m4, m3, 
m2, 4;V 1 2 3 4 punpcklwd m1, m2, m4 ; V 01 12 punpckhwd m2, m4 ; V 23 34 pshufd m0, m3, q2121;V 5 6 5 6 punpcklwd m3, m0 ; V 45 56 SAVELINE_W4 m0, 0, 1 SAVELINE_W4 m1, 1, 1 SAVELINE_W4 m2, 2, 1 SAVELINE_W4 m3, 3, 1 ;process low RESTORELINE_W4 m2, 2, 0 RESTORELINE_W4 m3, 3, 0 palignr m4, m3, m2, 4;V 1 2 3 4 punpcklwd m1, m2, m4 ; V 01 12 punpckhwd m2, m4 ; V 23 34 pshufd m0, m3, q2121;V 5 6 5 6 punpcklwd m3, m0 ; V 45 56 .hv_w4_loop: ;process low pmaddwd m5, m1, subpelv0 ; V a0 b0 mova m1, m2 pmaddwd m2, subpelv1; V a1 b1 paddd m5, m2 mova m2, m3 pmaddwd m3, subpelv2; V a2 b2 paddd m5, m3 mova m6, [base+subpel_h_shuf4] movq m4, [srcq+ssq*0] ; 7 movhps m4, [srcq+ssq*1] ; 7 _ 8 _ pshufb m4, m6 ;H subpel_h_shuf4 7 ~ 8 ~ pmaddubsw m4, m7 ;H subpel_filters phaddw m4, m4 ;H 7 8 7 8 pmulhrsw m4, w8192reg ;H pw_8192 palignr m3, m4, m0, 12 ; 6 7 8 7 mova m0, m4 punpcklwd m3, m4 ; 67 78 pmaddwd m4, m3, subpelv3; a3 b3 paddd m5, d512reg ; pd_512 paddd m5, m4 psrad m5, 10 SAVELINE_W4 m0, 0, 0 SAVELINE_W4 m1, 1, 0 SAVELINE_W4 m2, 2, 0 SAVELINE_W4 m3, 3, 0 SAVELINE_W4 m5, 5, 0 ;process high RESTORELINE_W4 m0, 0, 1 RESTORELINE_W4 m1, 1, 1 RESTORELINE_W4 m2, 2, 1 RESTORELINE_W4 m3, 3, 1 pmaddwd m5, m1, subpelv0; V a0 b0 mova m1, m2 pmaddwd m2, subpelv1; V a1 b1 paddd m5, m2 mova m2, m3 pmaddwd m3, subpelv2; V a2 b2 paddd m5, m3 mova m6, [base+subpel_h_shuf4+16] movq m4, [srcq+ssq*0] ; 7 movhps m4, [srcq+ssq*1] ; 7 _ 8 _ lea srcq, [srcq+ssq*2] pshufb m4, m6 ;H subpel_h_shuf4 7 ~ 8 ~ pmaddubsw m4, m7 ;H subpel_filters phaddw m4, m4 ;H 7 8 7 8 pmulhrsw m4, w8192reg ;H pw_8192 palignr m3, m4, m0, 12 ; 6 7 8 7 mova m0, m4 punpcklwd m3, m4 ; 67 78 pmaddwd m4, m3, subpelv3; a3 b3 paddd m5, d512reg ; pd_512 paddd m5, m4 psrad m4, m5, 10 RESTORELINE_W4 m5, 5, 0 packssdw m5, m4 ; d -> w packuswb m5, m5 ; w -> b pshuflw m5, m5, q3120 movd [dstq+dsq*0], m5 psrlq m5, 32 movd [dstq+dsq*1], m5 lea dstq, [dstq+dsq*2] sub hd, 2 SAVELINE_W4 m0, 0, 1 SAVELINE_W4 m1, 1, 1 SAVELINE_W4 m2, 2, 1 SAVELINE_W4 m3, 3, 1 RESTORELINE_W4 m0, 0, 0 RESTORELINE_W4 m1, 1, 0 RESTORELINE_W4 m2, 2, 0 RESTORELINE_W4 m3, 3, 0 jg .hv_w4_loop RET %undef subpelv0 %undef subpelv1 %undef subpelv2 %undef subpelv3 .hv_w8: %assign stack_offset org_stack_offset %define hv8_line_1 0 %define hv8_line_2 1 %define hv8_line_3 2 %define hv8_line_4 3 %define hv8_line_6 4 %macro SAVELINE_W8 2 mova [rsp+hv8_line_%1*mmsize], %2 %endmacro %macro RESTORELINE_W8 2 mova %2, [rsp+hv8_line_%1*mmsize] %endmacro shr mxd, 16 sub srcq, 3 %if ARCH_X86_32 %define base_reg r1 %define subpelh0 [rsp+mmsize*5] %define subpelh1 [rsp+mmsize*6] %define subpelv0 [rsp+mmsize*7] %define subpelv1 [rsp+mmsize*8] %define subpelv2 [rsp+mmsize*9] %define subpelv3 [rsp+mmsize*10] %define accuv0 [rsp+mmsize*11] %define accuv1 [rsp+mmsize*12] movq m1, [base_reg+mxq*8+subpel_filters-put_ssse3] movzx mxd, ssb shr ssd, 16 cmp hd, 6 cmovs ssd, mxd movq m5, [base_reg+ssq*8+subpel_filters-put_ssse3] mov ssq, ssmp ALLOC_STACK -mmsize*13 %if STACK_ALIGNMENT < 16 %define dstm [rsp+mmsize*13+gprsize*1] %define dsm [rsp+mmsize*13+gprsize*2] mov r6, [rstk+stack_offset+gprsize*2] mov dsm, r6 %endif pshufd m0, m1, q0000 pshufd m1, m1, q1111 punpcklbw m5, m5 psraw m5, 8 ; sign-extend pshufd m2, m5, q0000 pshufd m3, m5, q1111 pshufd m4, m5, q2222 pshufd m5, m5, q3333 mova subpelh0, m0 mova subpelh1, m1 mova subpelv0, m2 mova subpelv1, m3 mova subpelv2, m4 mova subpelv3, m5 lea r6, [ssq*3] mov dstm, dstq sub srcq, r6 %else ALLOC_STACK 16*5, 16 %define subpelh0 m10 %define subpelh1 m11 %define subpelv0 m12 
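; x86-64 register layout for the .hv_w8 path: subpelh0/1 (m10/m11) hold the two
; halves of the horizontal filter, subpelv0-3 (m12-m15) the four vertical tap
; pairs, and m8/m9 accumulate the two output rows of each iteration. The
; horizontal stage is rounded via pmulhrsw with pw_8192 (i.e. (x+2)>>2); the
; vertical sums add pd_512 before the final psrad by 10.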
%define subpelv1 m13 %define subpelv2 m14 %define subpelv3 m15 %define accuv0 m8 %define accuv1 m9 movq m0, [base_reg+mxq*8+subpel_filters-put_ssse3] movzx mxd, myb shr myd, 16 cmp hd, 6 cmovs myd, mxd movq m1, [base_reg+myq*8+subpel_filters-put_ssse3] pshufd subpelh0, m0, q0000 pshufd subpelh1, m0, q1111 punpcklbw m1, m1 psraw m1, 8 ; sign-extend pshufd subpelv0, m1, q0000 pshufd subpelv1, m1, q1111 pshufd subpelv2, m1, q2222 pshufd subpelv3, m1, q3333 lea ss3q, [ssq*3] mov r7, dstq sub srcq, ss3q %endif shl wd, 14 lea r6d, [hq+wq-(1<<16)] mov r4, srcq .hv_w8_loop0: movu m4, [srcq+ssq*0] ; 0 = _ _ movu m5, [srcq+ssq*1] ; 1 = _ _ %if ARCH_X86_32 lea srcq, [srcq+ssq*2] %endif %macro HV_H_W8 4-7 ; src/dst, tmp[1-3], shuf[1-3] %if ARCH_X86_32 pshufb %3, %1, [base+subpel_h_shufB] pshufb %4, %1, [base+subpel_h_shufC] pshufb %1, [base+subpel_h_shufA] %else pshufb %3, %1, %6 ; subpel_h_shufB pshufb %4, %1, %7 ; subpel_h_shufC pshufb %1, %5 ; subpel_h_shufA %endif pmaddubsw %2, %3, subpelh0 ; subpel +0 C0 pmaddubsw %4, subpelh1; subpel +4 B4 pmaddubsw %3, subpelh1; C4 pmaddubsw %1, subpelh0; A0 paddw %2, %4 ; C0+B4 paddw %1, %3 ; A0+C4 phaddw %1, %2 %endmacro %if ARCH_X86_64 mova m7, [base+subpel_h_shufA] mova m8, [base+subpel_h_shufB] mova m9, [base+subpel_h_shufC] %endif HV_H_W8 m4, m1, m2, m3, m7, m8, m9 ; 0 ~ ~ ~ HV_H_W8 m5, m1, m2, m3, m7, m8, m9 ; 1 ~ ~ ~ %if ARCH_X86_32 movu m6, [srcq+ssq*0] ; 2 = _ _ movu m0, [srcq+ssq*1] ; 3 = _ _ lea srcq, [srcq+ssq*2] %else movu m6, [srcq+ssq*2] ; 2 = _ _ add srcq, ss3q movu m0, [srcq+ssq*0] ; 3 = _ _ %endif HV_H_W8 m6, m1, m2, m3, m7, m8, m9 ; 2 ~ ~ ~ HV_H_W8 m0, m1, m2, m3, m7, m8, m9 ; 3 ~ ~ ~ mova m7, [base+pw_8192] pmulhrsw m4, m7 ; H pw_8192 pmulhrsw m5, m7 ; H pw_8192 pmulhrsw m6, m7 ; H pw_8192 pmulhrsw m0, m7 ; H pw_8192 punpcklwd m1, m4, m5 ; 0 1 ~ punpcklwd m2, m5, m6 ; 1 2 ~ punpcklwd m3, m6, m0 ; 2 3 ~ SAVELINE_W8 1, m1 SAVELINE_W8 2, m2 SAVELINE_W8 3, m3 mova m7, [base+subpel_h_shufA] %if ARCH_X86_32 movu m4, [srcq+ssq*0] ; 4 = _ _ movu m5, [srcq+ssq*1] ; 5 = _ _ lea srcq, [srcq+ssq*2] %else movu m4, [srcq+ssq*1] ; 4 = _ _ movu m5, [srcq+ssq*2] ; 5 = _ _ add srcq, ss3q %endif movu m6, [srcq+ssq*0] ; 6 = _ _ HV_H_W8 m4, m1, m2, m3, m7, m8, m9 ; 4 ~ ~ ~ HV_H_W8 m5, m1, m2, m3, m7, m8, m9 ; 5 ~ ~ ~ HV_H_W8 m6, m1, m2, m3, m7, m8, m9 ; 6 ~ ~ ~ mova m7, [base+pw_8192] pmulhrsw m1, m4, m7 ; H pw_8192 4 ~ pmulhrsw m2, m5, m7 ; H pw_8192 5 ~ pmulhrsw m3, m6, m7 ; H pw_8192 6 ~ punpcklwd m4, m0, m1 ; 3 4 ~ punpcklwd m5, m1, m2 ; 4 5 ~ punpcklwd m6, m2, m3 ; 5 6 ~ SAVELINE_W8 6, m3 RESTORELINE_W8 1, m1 RESTORELINE_W8 2, m2 RESTORELINE_W8 3, m3 .hv_w8_loop: ; m8 accu for V a ; m9 accu for V b SAVELINE_W8 1, m3 SAVELINE_W8 2, m4 SAVELINE_W8 3, m5 SAVELINE_W8 4, m6 %if ARCH_X86_32 pmaddwd m0, m1, subpelv0 ; a0 pmaddwd m7, m2, subpelv0 ; b0 pmaddwd m3, subpelv1 ; a1 pmaddwd m4, subpelv1 ; b1 paddd m0, m3 paddd m7, m4 pmaddwd m5, subpelv2 ; a2 pmaddwd m6, subpelv2 ; b2 paddd m0, m5 paddd m7, m6 mova m5, [base+pd_512] paddd m0, m5 ; pd_512 paddd m7, m5 ; pd_512 mova accuv0, m0 mova accuv1, m7 %else pmaddwd m8, m1, subpelv0 ; a0 pmaddwd m9, m2, subpelv0 ; b0 pmaddwd m3, subpelv1 ; a1 pmaddwd m4, subpelv1 ; b1 paddd m8, m3 paddd m9, m4 pmaddwd m5, subpelv2 ; a2 pmaddwd m6, subpelv2 ; b2 paddd m8, m5 paddd m9, m6 mova m7, [base+pd_512] paddd m8, m7 ; pd_512 paddd m9, m7 ; pd_512 mova m7, [base+subpel_h_shufB] mova m6, [base+subpel_h_shufC] mova m5, [base+subpel_h_shufA] %endif movu m0, [srcq+ssq*1] ; 7 movu m4, [srcq+ssq*2] ; 8 lea srcq, [srcq+ssq*2] HV_H_W8 
m0, m1, m2, m3, m5, m7, m6 HV_H_W8 m4, m1, m2, m3, m5, m7, m6 mova m5, [base+pw_8192] pmulhrsw m0, m5 ; H pw_8192 pmulhrsw m4, m5 ; H pw_8192 RESTORELINE_W8 6, m6 punpcklwd m5, m6, m0 ; 6 7 ~ punpcklwd m6, m0, m4 ; 7 8 ~ pmaddwd m1, m5, subpelv3 ; a3 paddd m2, m1, accuv0 pmaddwd m1, m6, subpelv3 ; b3 paddd m1, m1, accuv1 ; H + V psrad m2, 10 psrad m1, 10 packssdw m2, m1 ; d -> w packuswb m2, m1 ; w -> b movd [dstq+dsq*0], m2 psrlq m2, 32 %if ARCH_X86_32 add dstq, dsm movd [dstq+dsq*0], m2 add dstq, dsm %else movd [dstq+dsq*1], m2 lea dstq, [dstq+dsq*2] %endif sub hd, 2 jle .hv_w8_outer SAVELINE_W8 6, m4 RESTORELINE_W8 1, m1 RESTORELINE_W8 2, m2 RESTORELINE_W8 3, m3 RESTORELINE_W8 4, m4 jmp .hv_w8_loop .hv_w8_outer: %if ARCH_X86_32 mov dstq, dstm add r4, 4 movzx hd, r6w add dstq, 4 mov srcq, r4 mov dstm, dstq %else add r4, 4 add r7, 4 movzx hd, r6b mov srcq, r4 mov dstq, r7 %endif sub r6d, 1<<16 jg .hv_w8_loop0 RET %macro PSHUFB_SUBPEL_H_4 5 ; dst/src1, src2/mask, tmp1, tmp2, reset_mask %if cpuflag(ssse3) pshufb %1, %2 %else %if %5 == 1 pcmpeqd %2, %2 psrlq %2, 32 %endif psrldq %3, %1, 1 pshufd %3, %3, q2301 pand %1, %2 pandn %4, %2, %3 por %1, %4 %endif %endmacro %macro PSHUFB_SUBPEL_H_4a 6 ; dst, src1, src2/mask, tmp1, tmp2, reset_mask %ifnidn %1, %2 mova %1, %2 %endif PSHUFB_SUBPEL_H_4 %1, %3, %4, %5, %6 %endmacro %macro PSHUFB_SUBPEL_H_4b 6 ; dst, src1, src2/mask, tmp1, tmp2, reset_mask %if notcpuflag(ssse3) psrlq %1, %2, 16 %elifnidn %1, %2 mova %1, %2 %endif PSHUFB_SUBPEL_H_4 %1, %3, %4, %5, %6 %endmacro %macro PALIGNR 4-5 ; dst, src1, src2, shift[, tmp] %if cpuflag(ssse3) palignr %1, %2, %3, %4 %else %if %0 == 4 %assign %%i regnumof%+%1 + 1 %define %%tmp m %+ %%i %else %define %%tmp %5 %endif psrldq %1, %3, %4 pslldq %%tmp, %2, 16-%4 por %1, %%tmp %endif %endmacro %macro PHADDW 4 ; dst, src, pw_1/tmp, load_pw_1 %if cpuflag(ssse3) phaddw %1, %2 %elifnidn %1, %2 %if %4 == 1 mova %3, [base+pw_1] %endif pmaddwd %1, %3 pmaddwd %2, %3 packssdw %1, %2 %else %if %4 == 1 pmaddwd %1, [base+pw_1] %else pmaddwd %1, %3 %endif packssdw %1, %1 %endif %endmacro %macro PMULHRSW_POW2 4 ; dst, src1, src2, shift %if cpuflag(ssse3) pmulhrsw %1, %2, %3 %else paddw %1, %2, %3 psraw %1, %4 %endif %endmacro %macro PMULHRSW_8192 3 ; dst, src1, src2 PMULHRSW_POW2 %1, %2, %3, 2 %endmacro %macro PREP_8TAP_H_LOAD4 5 ; dst, src_memloc, tmp[1-2] movd %1, [%2+0] movd %3, [%2+1] movd %4, [%2+2] movd %5, [%2+3] punpckldq %1, %3 punpckldq %4, %5 punpcklqdq %1, %4 %endmacro %macro PREP_8TAP_H_LOAD 2 ; dst0, src_memloc %if cpuflag(ssse3) movu m%1, [%2] pshufb m2, m%1, m11 ; subpel_h_shufB pshufb m3, m%1, m9 ; subpel_h_shufC pshufb m%1, m10 ; subpel_h_shufA %else %if ARCH_X86_64 SWAP m12, m5 SWAP m13, m6 SWAP m14, m7 %define %%mx0 m%+%%i %define %%mx1 m%+%%j %assign %%i 0 %rep 12 movd %%mx0, [%2+%%i] %assign %%i %%i+1 %endrep %assign %%i 0 %rep 6 %assign %%j %%i+1 punpckldq %%mx0, %%mx1 %assign %%i %%i+2 %endrep %assign %%i 0 %rep 3 %assign %%j %%i+2 punpcklqdq %%mx0, %%mx1 %assign %%i %%i+4 %endrep SWAP m%1, m0 SWAP m2, m4 SWAP m3, m8 SWAP m5, m12 SWAP m6, m13 SWAP m7, m14 %else PREP_8TAP_H_LOAD4 m0, %2+0, m1, m4, m7 PREP_8TAP_H_LOAD4 m2, %2+4, m1, m4, m7 PREP_8TAP_H_LOAD4 m3, %2+8, m1, m4, m7 SWAP m%1, m0 %endif %endif %endmacro %macro PREP_8TAP_H 2 ; dst, src_memloc PREP_8TAP_H_LOAD %1, %2 %if ARCH_X86_64 && notcpuflag(ssse3) SWAP m8, m1 SWAP m9, m7 %endif %xdefine mX m%+%1 %assign %%i regnumof%+mX %define mX m%+%%i mova m4, m2 PMADDUBSW m4, m5, m1, m7, 1 ; subpel +0 B0 PMADDUBSW m2, m6, m1, m7, 0 ; subpel +4 B4 
PMADDUBSW m3, m6, m1, m7, 0 ; subpel +4 C4 PMADDUBSW mX, m5, m1, m7, 0 ; subpel +0 A0 %undef mX %if ARCH_X86_64 && notcpuflag(ssse3) SWAP m1, m8 SWAP m7, m9 %endif paddw m3, m4 paddw m%1, m2 PHADDW m%1, m3, m15, ARCH_X86_32 %if ARCH_X86_64 || cpuflag(ssse3) PMULHRSW_8192 m%1, m%1, m7 %else PMULHRSW_8192 m%1, m%1, [base+pw_2] %endif %endmacro %macro PREP_8TAP_HV 4 ; dst, src_memloc, tmp[1-2] %if cpuflag(ssse3) movu %1, [%2] pshufb m2, %1, shufB pshufb m3, %1, shufC pshufb %1, shufA %else PREP_8TAP_H_LOAD4 %1, %2+0, m1, %3, %4 PREP_8TAP_H_LOAD4 m2, %2+4, m1, %3, %4 PREP_8TAP_H_LOAD4 m3, %2+8, m1, %3, %4 %endif mova m1, m2 PMADDUBSW m1, subpelh0, %3, %4, 1 ; subpel +0 C0 PMADDUBSW m3, subpelh1, %3, %4, 0 ; subpel +4 B4 PMADDUBSW m2, subpelh1, %3, %4, 0 ; C4 PMADDUBSW %1, subpelh0, %3, %4, 0 ; A0 paddw m1, m3 ; C0+B4 paddw %1, m2 ; A0+C4 PHADDW %1, m1, %3, 1 %endmacro %macro PREP_8TAP 0 %if ARCH_X86_32 DECLARE_REG_TMP 1, 2 %elif WIN64 DECLARE_REG_TMP 6, 4 %else DECLARE_REG_TMP 6, 7 %endif FN prep_8tap, sharp, SHARP, SHARP FN prep_8tap, sharp_smooth, SHARP, SMOOTH FN prep_8tap, smooth_sharp, SMOOTH, SHARP FN prep_8tap, smooth, SMOOTH, SMOOTH FN prep_8tap, sharp_regular, SHARP, REGULAR FN prep_8tap, regular_sharp, REGULAR, SHARP FN prep_8tap, smooth_regular, SMOOTH, REGULAR FN prep_8tap, regular_smooth, REGULAR, SMOOTH FN prep_8tap, regular, REGULAR, REGULAR %if ARCH_X86_32 %define base_reg r2 %define base base_reg-prep%+SUFFIX %else %define base_reg r7 %define base 0 %endif cglobal prep_8tap_8bpc, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3 %assign org_stack_offset stack_offset imul mxd, mxm, 0x010101 add mxd, t0d ; 8tap_h, mx, 4tap_h imul myd, mym, 0x010101 add myd, t1d ; 8tap_v, my, 4tap_v mov wd, wm movifnidn srcd, srcm movifnidn hd, hm test mxd, 0xf00 jnz .h test myd, 0xf00 jnz .v LEA base_reg, prep_ssse3 tzcnt wd, wd movzx wd, word [base_reg-prep_ssse3+prep_ssse3_table+wq*2] pxor m4, m4 add wq, base_reg movifnidn strided, stridem lea r6, [strideq*3] %assign stack_offset org_stack_offset %if WIN64 pop r8 pop r7 %endif jmp wq .h: LEA base_reg, prep%+SUFFIX test myd, 0xf00 jnz .hv %if cpuflag(ssse3) WIN64_SPILL_XMM 12 %else WIN64_SPILL_XMM 16 %endif %if ARCH_X86_32 %define strideq r6 mov strideq, stridem %endif cmp wd, 4 je .h_w4 tzcnt wd, wd %if cpuflag(ssse3) %if ARCH_X86_64 mova m10, [base+subpel_h_shufA] mova m11, [base+subpel_h_shufB] mova m9, [base+subpel_h_shufC] %else %define m10 [base+subpel_h_shufA] %define m11 [base+subpel_h_shufB] %define m9 [base+subpel_h_shufC] %endif %endif shr mxd, 16 sub srcq, 3 movzx wd, word [base_reg+wq*2+table_offset(prep, _8tap_h)] movq m6, [base_reg+mxq*8+subpel_filters-prep%+SUFFIX] %if cpuflag(ssse3) mova m7, [base+pw_8192] pshufd m5, m6, q0000 pshufd m6, m6, q1111 %else punpcklbw m6, m6 psraw m6, 8 %if ARCH_X86_64 mova m7, [pw_2] mova m15, [pw_1] %else %define m15 m4 %endif pshufd m5, m6, q1010 punpckhqdq m6, m6 %endif add wq, base_reg jmp wq .h_w4: %if ARCH_X86_32 and mxd, 0x7f %else movzx mxd, mxb %endif dec srcq movd m4, [base_reg+mxq*8+subpel_filters-prep%+SUFFIX+2] %if cpuflag(ssse3) mova m6, [base+pw_8192] mova m5, [base+subpel_h_shufA] pshufd m4, m4, q0000 %else mova m6, [base+pw_2] %if ARCH_X86_64 mova m14, [pw_1] %else %define m14 m7 %endif punpcklbw m4, m4 psraw m4, 8 punpcklqdq m4, m4 %endif %if ARCH_X86_64 lea stride3q, [strideq*3] %endif .h_w4_loop: %if cpuflag(ssse3) movq m0, [srcq+strideq*0] ; 0 movq m1, [srcq+strideq*1] ; 1 %if ARCH_X86_32 lea srcq, [srcq+strideq*2] movq m2, [srcq+strideq*0] ; 2 movq m3, [srcq+strideq*1] ; 3 lea 
srcq, [srcq+strideq*2] %else movq m2, [srcq+strideq*2] ; 2 movq m3, [srcq+stride3q ] ; 3 lea srcq, [srcq+strideq*4] %endif pshufb m0, m5 pshufb m1, m5 pshufb m2, m5 pshufb m3, m5 %elif ARCH_X86_64 movd m0, [srcq+strideq*0+0] movd m12, [srcq+strideq*0+1] movd m1, [srcq+strideq*1+0] movd m5, [srcq+strideq*1+1] movd m2, [srcq+strideq*2+0] movd m13, [srcq+strideq*2+1] movd m3, [srcq+stride3q +0] movd m7, [srcq+stride3q +1] punpckldq m0, m12 punpckldq m1, m5 punpckldq m2, m13 punpckldq m3, m7 movd m12, [srcq+strideq*0+2] movd m8, [srcq+strideq*0+3] movd m5, [srcq+strideq*1+2] movd m9, [srcq+strideq*1+3] movd m13, [srcq+strideq*2+2] movd m10, [srcq+strideq*2+3] movd m7, [srcq+stride3q +2] movd m11, [srcq+stride3q +3] lea srcq, [srcq+strideq*4] punpckldq m12, m8 punpckldq m5, m9 punpckldq m13, m10 punpckldq m7, m11 punpcklqdq m0, m12 ; 0 punpcklqdq m1, m5 ; 1 punpcklqdq m2, m13 ; 2 punpcklqdq m3, m7 ; 3 %else movd m0, [srcq+strideq*0+0] movd m1, [srcq+strideq*0+1] movd m2, [srcq+strideq*0+2] movd m3, [srcq+strideq*0+3] punpckldq m0, m1 punpckldq m2, m3 punpcklqdq m0, m2 ; 0 movd m1, [srcq+strideq*1+0] movd m2, [srcq+strideq*1+1] movd m3, [srcq+strideq*1+2] movd m7, [srcq+strideq*1+3] lea srcq, [srcq+strideq*2] punpckldq m1, m2 punpckldq m3, m7 punpcklqdq m1, m3 ; 1 movd m2, [srcq+strideq*0+0] movd m3, [srcq+strideq*0+1] movd m7, [srcq+strideq*0+2] movd m5, [srcq+strideq*0+3] punpckldq m2, m3 punpckldq m7, m5 punpcklqdq m2, m7 ; 2 movd m3, [srcq+strideq*1+0] movd m7, [srcq+strideq*1+1] punpckldq m3, m7 movd m7, [srcq+strideq*1+2] movd m5, [srcq+strideq*1+3] lea srcq, [srcq+strideq*2] punpckldq m7, m5 punpcklqdq m3, m7 ; 3 %endif PMADDUBSW m0, m4, m5, m7, 1 ; subpel_filters + 2 PMADDUBSW m1, m4, m5, m7, 0 PMADDUBSW m2, m4, m5, m7, 0 PMADDUBSW m3, m4, m5, m7, 0 PHADDW m0, m1, m14, ARCH_X86_32 PHADDW m2, m3, m14, 0 PMULHRSW_8192 m0, m0, m6 PMULHRSW_8192 m2, m2, m6 mova [tmpq+16*0], m0 mova [tmpq+16*1], m2 add tmpq, 32 sub hd, 4 jg .h_w4_loop RET .h_w8: %if cpuflag(ssse3) PREP_8TAP_H 0, srcq+strideq*0 PREP_8TAP_H 1, srcq+strideq*1 mova [tmpq+16*0], m0 mova [tmpq+16*1], m1 lea srcq, [srcq+strideq*2] add tmpq, 32 sub hd, 2 %else PREP_8TAP_H 0, srcq mova [tmpq], m0 add srcq, strideq add tmpq, 16 dec hd %endif jg .h_w8 RET .h_w16: mov r3, -16*1 jmp .h_start .h_w32: mov r3, -16*2 jmp .h_start .h_w64: mov r3, -16*4 jmp .h_start .h_w128: mov r3, -16*8 .h_start: sub srcq, r3 mov r5, r3 .h_loop: %if cpuflag(ssse3) PREP_8TAP_H 0, srcq+r3+8*0 PREP_8TAP_H 1, srcq+r3+8*1 mova [tmpq+16*0], m0 mova [tmpq+16*1], m1 add tmpq, 32 add r3, 16 %else PREP_8TAP_H 0, srcq+r3 mova [tmpq], m0 add tmpq, 16 add r3, 8 %endif jl .h_loop add srcq, strideq mov r3, r5 dec hd jg .h_loop RET .v: LEA base_reg, prep%+SUFFIX %if ARCH_X86_32 mov mxd, myd and mxd, 0x7f %else %assign stack_offset org_stack_offset WIN64_SPILL_XMM 16 movzx mxd, myb %endif shr myd, 16 cmp hd, 6 cmovs myd, mxd movq m0, [base_reg+myq*8+subpel_filters-prep%+SUFFIX] %if cpuflag(ssse3) mova m2, [base+pw_512] mova m7, [base+pw_8192] punpcklwd m0, m0 %else punpcklbw m0, m0 psraw m0, 8 %endif %if ARCH_X86_32 %define subpel0 [rsp+mmsize*0] %define subpel1 [rsp+mmsize*1] %define subpel2 [rsp+mmsize*2] %define subpel3 [rsp+mmsize*3] %assign regs_used 6 ; use r5 (mx) as tmp for stack alignment if needed %if cpuflag(ssse3) ALLOC_STACK -mmsize*4 %else ALLOC_STACK -mmsize*5 %endif %assign regs_used 7 mov strideq, [rstk+stack_offset+gprsize*3] pshufd m1, m0, q0000 mova subpel0, m1 pshufd m1, m0, q1111 mova subpel1, m1 lea r5, [strideq*3] pshufd m1, m0, q2222 mova subpel2, m1 
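; x86-32 path: with only xmm0-xmm7 available, the four vertical tap pairs are
; broadcast one at a time and spilled to the stack slots subpel0-3, and srcq is
; rewound by 3 rows so the first 8-tap window covers the rows above the first
; output line.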
pshufd m1, m0, q3333 mova subpel3, m1 sub srcq, r5 %else %define subpel0 m8 %define subpel1 m9 %define subpel2 m10 %define subpel3 m11 pshufd m8, m0, q0000 pshufd m9, m0, q1111 lea stride3q, [strideq*3] pshufd m10, m0, q2222 pshufd m11, m0, q3333 sub srcq, stride3q cmp wd, 8 jns .v_w8 %endif .v_w4: %if notcpuflag(ssse3) pxor m6, m6 %if ARCH_X86_64 mova m7, [base+pw_2] %endif %endif %if ARCH_X86_32 %if STACK_ALIGNMENT < mmsize %define srcm [esp+stack_size+gprsize*1] %define tmpm [esp+stack_size+gprsize*2] %endif mov tmpm, tmpq mov srcm, srcq lea r5d, [wq - 4] ; horizontal loop shl r5d, (16 - 2) ; (wq / 4) << 16 mov r5w, hw .v_w4_loop0: %endif movd m1, [srcq+strideq*0] movd m0, [srcq+strideq*1] %if ARCH_X86_32 lea srcq, [srcq+strideq*2] movd m2, [srcq+strideq*0] movd m4, [srcq+strideq*1] lea srcq, [srcq+strideq*2] movd m3, [srcq+strideq*0] movd m5, [srcq+strideq*1] lea srcq, [srcq+strideq*2] %else movd m2, [srcq+strideq*2] add srcq, stride3q movd m4, [srcq+strideq*0] movd m3, [srcq+strideq*1] movd m5, [srcq+strideq*2] add srcq, stride3q %endif punpckldq m1, m0 ; 0 1 punpckldq m0, m2 ; 1 2 punpcklbw m1, m0 ; 01 12 movd m0, [srcq+strideq*0] punpckldq m2, m4 ; 2 3 punpckldq m4, m3 ; 3 4 punpckldq m3, m5 ; 4 5 punpckldq m5, m0 ; 5 6 punpcklbw m2, m4 ; 23 34 punpcklbw m3, m5 ; 45 56 .v_w4_loop: %if ARCH_X86_32 && notcpuflag(ssse3) mova m7, subpel0 %define subpel0 m7 %endif mova m5, m1 PMADDUBSW m5, subpel0, m6, m4, 0 ; a0 b0 %if ARCH_X86_32 && notcpuflag(ssse3) mova m7, subpel1 %define subpel1 m7 %endif mova m1, m2 PMADDUBSW m2, subpel1, m6, m4, 0 ; a1 b1 paddw m5, m2 %if ARCH_X86_32 && notcpuflag(ssse3) mova m7, subpel2 %define subpel2 m7 %endif mova m2, m3 PMADDUBSW m3, subpel2, m6, m4, 0 ; a2 b2 movd m4, [srcq+strideq*1] lea srcq, [srcq+strideq*2] paddw m5, m3 punpckldq m3, m0, m4 ; 6 7 _ _ movd m0, [srcq+strideq*0] punpckldq m4, m0 ; 7 8 _ _ punpcklbw m3, m4 ; 67 78 %if notcpuflag(ssse3) %if ARCH_X86_64 SWAP m12, m0 %else mova [esp+mmsize*4], m0 mova m7, subpel3 %define subpel3 m7 %endif %endif mova m4, m3 PMADDUBSW m4, subpel3, m6, m0, 0 ; a3 b3 paddw m5, m4 %if ARCH_X86_64 || cpuflag(ssse3) %if notcpuflag(ssse3) SWAP m0, m12 %endif PMULHRSW_8192 m5, m5, m7 %else mova m0, [esp+mmsize*4] PMULHRSW_8192 m5, m5, [base+pw_2] %endif movq [tmpq+wq*0], m5 movhps [tmpq+wq*2], m5 lea tmpq, [tmpq+wq*4] sub hd, 2 jg .v_w4_loop %if ARCH_X86_32 mov srcq, srcm mov tmpq, tmpm movzx hd, r5w add srcq, 4 add tmpq, 8 mov srcm, srcq mov tmpm, tmpq sub r5d, 1<<16 ; horizontal-- jg .v_w4_loop0 %endif RET %if ARCH_X86_64 .v_w8: lea r6d, [wq*8-64] mov r5, srcq mov r8, tmpq lea r6d, [hq+r6*4] .v_w8_loop0: movq m1, [srcq+strideq*0] movq m2, [srcq+strideq*1] movq m3, [srcq+strideq*2] add srcq, stride3q movq m4, [srcq+strideq*0] movq m5, [srcq+strideq*1] movq m6, [srcq+strideq*2] add srcq, stride3q movq m0, [srcq+strideq*0] punpcklbw m1, m2 ; 01 punpcklbw m2, m3 ; 12 punpcklbw m3, m4 ; 23 punpcklbw m4, m5 ; 34 punpcklbw m5, m6 ; 45 punpcklbw m6, m0 ; 56 .v_w8_loop: movq m13, [srcq+strideq*1] lea srcq, [srcq+strideq*2] %if cpuflag(ssse3) pmaddubsw m14, m1, subpel0 ; a0 pmaddubsw m15, m2, subpel0 ; b0 mova m1, m3 mova m2, m4 pmaddubsw m3, subpel1 ; a1 pmaddubsw m4, subpel1 ; b1 paddw m14, m3 paddw m15, m4 mova m3, m5 mova m4, m6 pmaddubsw m5, subpel2 ; a2 pmaddubsw m6, subpel2 ; b2 punpcklbw m12, m0, m13 ; 67 movq m0, [srcq+strideq*0] punpcklbw m13, m0 ; 78 paddw m14, m5 mova m5, m12 pmaddubsw m12, subpel3 ; a3 paddw m15, m6 mova m6, m13 pmaddubsw m13, subpel3 ; b3 paddw m14, m12 paddw m15, m13 pmulhrsw m14, m7 pmulhrsw 
m15, m7 %else mova m14, m1 PMADDUBSW m14, subpel0, m7, m12, 1 ; a0 mova m15, m2 PMADDUBSW m15, subpel0, m7, m12, 0 ; b0 mova m1, m3 PMADDUBSW m3, subpel1, m7, m12, 0 ; a1 mova m2, m4 PMADDUBSW m4, subpel1, m7, m12, 0 ; b1 paddw m14, m3 mova m3, m5 PMADDUBSW m5, subpel2, m7, m12, 0 ; a2 paddw m15, m4 mova m4, m6 PMADDUBSW m6, subpel2, m7, m12, 0 ; b2 paddw m15, m6 punpcklbw m12, m0, m13 ; 67 movq m0, [srcq+strideq*0] punpcklbw m13, m0 ; 78 paddw m14, m5 mova m5, m12 PMADDUBSW m12, subpel3, m7, m6, 0 ; a3 paddw m14, m12 mova m6, m13 PMADDUBSW m13, subpel3, m7, m12, 0 ; b3 paddw m15, m13 PMULHRSW_8192 m14, m14, [base+pw_2] PMULHRSW_8192 m15, m15, [base+pw_2] %endif movu [tmpq+wq*0], m14 movu [tmpq+wq*2], m15 lea tmpq, [tmpq+wq*4] sub hd, 2 jg .v_w8_loop add r5, 8 add r8, 16 movzx hd, r6b mov srcq, r5 mov tmpq, r8 sub r6d, 1<<8 jg .v_w8_loop0 RET %endif ;ARCH_X86_64 %undef subpel0 %undef subpel1 %undef subpel2 %undef subpel3 .hv: %assign stack_offset org_stack_offset cmp wd, 4 jg .hv_w8 and mxd, 0x7f movd m1, [base_reg+mxq*8+subpel_filters-prep%+SUFFIX+2] %if ARCH_X86_32 mov mxd, myd shr myd, 16 and mxd, 0x7f cmp hd, 6 cmovs myd, mxd movq m0, [base_reg+myq*8+subpel_filters-prep%+SUFFIX] mov strideq, stridem %assign regs_used 6 ALLOC_STACK -mmsize*14 %assign regs_used 7 lea r5, [strideq*3+1] sub srcq, r5 %define subpelv0 [rsp+mmsize*0] %define subpelv1 [rsp+mmsize*1] %define subpelv2 [rsp+mmsize*2] %define subpelv3 [rsp+mmsize*3] punpcklbw m0, m0 psraw m0, 8 pshufd m6, m0, q0000 mova subpelv0, m6 pshufd m6, m0, q1111 mova subpelv1, m6 pshufd m6, m0, q2222 mova subpelv2, m6 pshufd m6, m0, q3333 mova subpelv3, m6 %else movzx mxd, myb shr myd, 16 cmp hd, 6 cmovs myd, mxd movq m0, [base_reg+myq*8+subpel_filters-prep%+SUFFIX] %if cpuflag(ssse3) ALLOC_STACK mmsize*14, 14 %else ALLOC_STACK mmsize*14, 16 %endif lea stride3q, [strideq*3] sub srcq, stride3q dec srcq %define subpelv0 m10 %define subpelv1 m11 %define subpelv2 m12 %define subpelv3 m13 punpcklbw m0, m0 psraw m0, 8 %if cpuflag(ssse3) mova m8, [base+pw_8192] %else mova m8, [base+pw_2] %endif mova m9, [base+pd_32] pshufd m10, m0, q0000 pshufd m11, m0, q1111 pshufd m12, m0, q2222 pshufd m13, m0, q3333 %endif pshufd m7, m1, q0000 %if notcpuflag(ssse3) punpcklbw m7, m7 psraw m7, 8 %endif %define hv4_line_0_0 4 %define hv4_line_0_1 5 %define hv4_line_0_2 6 %define hv4_line_0_3 7 %define hv4_line_0_4 8 %define hv4_line_0_5 9 %define hv4_line_1_0 10 %define hv4_line_1_1 11 %define hv4_line_1_2 12 %define hv4_line_1_3 13 %if ARCH_X86_32 %if cpuflag(ssse3) %define w8192reg [base+pw_8192] %else %define w8192reg [base+pw_2] %endif %define d32reg [base+pd_32] %else %define w8192reg m8 %define d32reg m9 %endif ; lower shuffle 0 1 2 3 4 %if cpuflag(ssse3) mova m6, [base+subpel_h_shuf4] %else %if ARCH_X86_64 mova m15, [pw_1] %else %define m15 m1 %endif %endif movq m5, [srcq+strideq*0] ; 0 _ _ _ movhps m5, [srcq+strideq*1] ; 0 _ 1 _ %if ARCH_X86_32 lea srcq, [srcq+strideq*2] movq m4, [srcq+strideq*0] ; 2 _ _ _ movhps m4, [srcq+strideq*1] ; 2 _ 3 _ lea srcq, [srcq+strideq*2] %else movq m4, [srcq+strideq*2] ; 2 _ _ _ movhps m4, [srcq+stride3q ] ; 2 _ 3 _ lea srcq, [srcq+strideq*4] %endif PSHUFB_SUBPEL_H_4a m2, m5, m6, m1, m3, 1 ;H subpel_h_shuf4 0~1~ PSHUFB_SUBPEL_H_4a m0, m4, m6, m1, m3, 0 ;H subpel_h_shuf4 2~3~ PMADDUBSW m2, m7, m1, m3, 1 ;H subpel_filters PMADDUBSW m0, m7, m1, m3, 0 ;H subpel_filters PHADDW m2, m0, m15, ARCH_X86_32 ;H 0 1 2 3 PMULHRSW_8192 m2, m2, w8192reg SAVELINE_W4 m2, 2, 0 ; upper shuffle 2 3 4 5 6 %if cpuflag(ssse3) mova m6, 
[base+subpel_h_shuf4+16] %endif PSHUFB_SUBPEL_H_4b m2, m5, m6, m1, m3, 0 ;H subpel_h_shuf4 0~1~ PSHUFB_SUBPEL_H_4b m0, m4, m6, m1, m3, 0 ;H subpel_h_shuf4 2~3~ PMADDUBSW m2, m7, m1, m3, 1 ;H subpel_filters PMADDUBSW m0, m7, m1, m3, 0 ;H subpel_filters PHADDW m2, m0, m15, ARCH_X86_32 ;H 0 1 2 3 PMULHRSW_8192 m2, m2, w8192reg %if notcpuflag(ssse3) %if ARCH_X86_64 SWAP m14, m2 %else mova [esp+mmsize*4], m2 %endif %endif ; lower shuffle %if cpuflag(ssse3) mova m6, [base+subpel_h_shuf4] %endif movq m5, [srcq+strideq*0] ; 4 _ _ _ movhps m5, [srcq+strideq*1] ; 4 _ 5 _ %if ARCH_X86_32 lea srcq, [srcq+strideq*2] movq m4, [srcq+strideq*0] ; 6 _ _ _ add srcq, strideq %else movq m4, [srcq+strideq*2] ; 6 _ _ _ add srcq, stride3q %endif PSHUFB_SUBPEL_H_4a m3, m5, m6, m1, m2, 0 ;H subpel_h_shuf4 4~5~ PSHUFB_SUBPEL_H_4a m0, m4, m6, m1, m2, 0 ;H subpel_h_shuf4 6~6~ PMADDUBSW m3, m7, m1, m2, 1 ;H subpel_filters PMADDUBSW m0, m7, m1, m2, 0 ;H subpel_filters PHADDW m3, m0, m15, ARCH_X86_32 ;H 4 5 6 7 PMULHRSW_8192 m3, m3, w8192reg SAVELINE_W4 m3, 3, 0 ; upper shuffle %if cpuflag(ssse3) mova m6, [base+subpel_h_shuf4+16] %endif PSHUFB_SUBPEL_H_4b m3, m5, m6, m1, m2, 0 ;H subpel_h_shuf4 4~5~ PSHUFB_SUBPEL_H_4b m0, m4, m6, m1, m2, 0 ;H subpel_h_shuf4 6~6~ PMADDUBSW m3, m7, m1, m2, 1 ;H subpel_filters PMADDUBSW m0, m7, m1, m2, 0 ;H subpel_filters PHADDW m3, m0, m15, ARCH_X86_32 ;H 4 5 6 7 PMULHRSW_8192 m3, m3, w8192reg %if notcpuflag(ssse3) %if ARCH_X86_64 SWAP m2, m14 %else mova m2, [esp+mmsize*4] %endif %endif ;process high PALIGNR m4, m3, m2, 4;V 1 2 3 4 punpcklwd m1, m2, m4 ; V 01 12 punpckhwd m2, m4 ; V 23 34 pshufd m0, m3, q2121;V 5 6 5 6 punpcklwd m3, m0 ; V 45 56 SAVELINE_W4 m0, 0, 1 SAVELINE_W4 m1, 1, 1 SAVELINE_W4 m2, 2, 1 SAVELINE_W4 m3, 3, 1 ;process low RESTORELINE_W4 m2, 2, 0 RESTORELINE_W4 m3, 3, 0 PALIGNR m4, m3, m2, 4;V 1 2 3 4 punpcklwd m1, m2, m4 ; V 01 12 punpckhwd m2, m4 ; V 23 34 pshufd m0, m3, q2121;V 5 6 5 6 punpcklwd m3, m0 ; V 45 56 .hv_w4_loop: ;process low pmaddwd m5, m1, subpelv0 ; V a0 b0 mova m1, m2 pmaddwd m2, subpelv1; V a1 b1 paddd m5, m2 mova m2, m3 pmaddwd m3, subpelv2; V a2 b2 paddd m5, m3 %if notcpuflag(ssse3) %if ARCH_X86_64 SWAP m14, m5 %else mova [esp+mmsize*4], m5 %define m15 m3 %endif %endif %if cpuflag(ssse3) mova m6, [base+subpel_h_shuf4] %endif movq m4, [srcq+strideq*0] ; 7 movhps m4, [srcq+strideq*1] ; 7 _ 8 _ PSHUFB_SUBPEL_H_4a m4, m4, m6, m3, m5, 0 ; H subpel_h_shuf4 7~8~ PMADDUBSW m4, m7, m3, m5, 1 ; H subpel_filters PHADDW m4, m4, m15, ARCH_X86_32 ; H 7878 PMULHRSW_8192 m4, m4, w8192reg PALIGNR m3, m4, m0, 12, m5 ; 6787 mova m0, m4 punpcklwd m3, m4 ; 67 78 pmaddwd m4, m3, subpelv3; a3 b3 %if notcpuflag(ssse3) %if ARCH_X86_64 SWAP m5, m14 %else mova m5, [esp+mmsize*4] %endif %endif paddd m5, d32reg ; pd_32 paddd m5, m4 psrad m5, 6 SAVELINE_W4 m0, 0, 0 SAVELINE_W4 m1, 1, 0 SAVELINE_W4 m2, 2, 0 SAVELINE_W4 m3, 3, 0 SAVELINE_W4 m5, 5, 0 ;process high RESTORELINE_W4 m0, 0, 1 RESTORELINE_W4 m1, 1, 1 RESTORELINE_W4 m2, 2, 1 RESTORELINE_W4 m3, 3, 1 pmaddwd m5, m1, subpelv0; V a0 b0 mova m1, m2 pmaddwd m2, subpelv1; V a1 b1 paddd m5, m2 mova m2, m3 pmaddwd m3, subpelv2; V a2 b2 paddd m5, m3 %if notcpuflag(ssse3) %if ARCH_X86_64 SWAP m14, m5 %else mova [esp+0xA0], m5 %endif %endif %if cpuflag(ssse3) mova m6, [base+subpel_h_shuf4+16] %endif movq m4, [srcq+strideq*0] ; 7 movhps m4, [srcq+strideq*1] ; 7 _ 8 _ PSHUFB_SUBPEL_H_4b m4, m4, m6, m3, m5, 0 ; H subpel_h_shuf4 7~8~ PMADDUBSW m4, m7, m3, m5, 1 ; H subpel_filters PHADDW m4, m4, m15, ARCH_X86_32 ; H 7878 PMULHRSW_8192 m4, 
m4, w8192reg PALIGNR m3, m4, m0, 12, m5 ; 6787 mova m0, m4 punpcklwd m3, m4 ; 67 78 pmaddwd m4, m3, subpelv3; a3 b3 %if notcpuflag(ssse3) %if ARCH_X86_64 SWAP m5, m14 %else mova m5, [esp+0xA0] %endif %endif paddd m5, d32reg ; pd_32 paddd m5, m4 psrad m4, m5, 6 RESTORELINE_W4 m5, 5, 0 packssdw m5, m4 pshufd m5, m5, q3120 movu [tmpq], m5 lea srcq, [srcq+strideq*2] add tmpq, 16 sub hd, 2 SAVELINE_W4 m0, 0, 1 SAVELINE_W4 m1, 1, 1 SAVELINE_W4 m2, 2, 1 SAVELINE_W4 m3, 3, 1 RESTORELINE_W4 m0, 0, 0 RESTORELINE_W4 m1, 1, 0 RESTORELINE_W4 m2, 2, 0 RESTORELINE_W4 m3, 3, 0 jg .hv_w4_loop RET %undef subpelv0 %undef subpelv1 %undef subpelv2 %undef subpelv3 .hv_w8: %assign stack_offset org_stack_offset %define hv8_line_1 0 %define hv8_line_2 1 %define hv8_line_3 2 %define hv8_line_4 3 %define hv8_line_6 4 shr mxd, 16 %if ARCH_X86_32 %define subpelh0 [rsp+mmsize*5] %define subpelh1 [rsp+mmsize*6] %define subpelv0 [rsp+mmsize*7] %define subpelv1 [rsp+mmsize*8] %define subpelv2 [rsp+mmsize*9] %define subpelv3 [rsp+mmsize*10] %define accuv0 [rsp+mmsize*11] %define accuv1 [rsp+mmsize*12] movq m1, [base_reg+mxq*8+subpel_filters-prep%+SUFFIX] mov mxd, myd shr myd, 16 and mxd, 0x7f cmp hd, 6 cmovs myd, mxd movq m5, [base_reg+myq*8+subpel_filters-prep%+SUFFIX] mov strideq, stridem %assign regs_used 6 ALLOC_STACK -mmsize*14 %assign regs_used 7 %if STACK_ALIGNMENT < mmsize %define tmpm [rsp+mmsize*13+gprsize*1] %define srcm [rsp+mmsize*13+gprsize*2] %define stridem [rsp+mmsize*13+gprsize*3] mov tmpm, tmpq mov stridem, strideq %endif %if cpuflag(ssse3) pshufd m0, m1, q0000 pshufd m1, m1, q1111 %else punpcklbw m1, m1 psraw m1, 8 pshufd m0, m1, q1010 punpckhqdq m1, m1 %endif punpcklbw m5, m5 psraw m5, 8 pshufd m2, m5, q0000 pshufd m3, m5, q1111 pshufd m4, m5, q2222 pshufd m5, m5, q3333 mova subpelh0, m0 mova subpelh1, m1 mova subpelv0, m2 mova subpelv1, m3 mova subpelv2, m4 mova subpelv3, m5 lea r5, [strideq*3+3] sub srcq, r5 mov srcm, srcq %else ALLOC_STACK mmsize*5, 16 %define subpelh0 m10 %define subpelh1 m11 %define subpelv0 m12 %define subpelv1 m13 %define subpelv2 m14 %define subpelv3 m15 %define accuv0 m8 %define accuv1 m9 movq m0, [base_reg+mxq*8+subpel_filters-prep%+SUFFIX] movzx mxd, myb shr myd, 16 cmp hd, 6 cmovs myd, mxd movq m1, [base_reg+myq*8+subpel_filters-prep%+SUFFIX] %if cpuflag(ssse3) pshufd subpelh0, m0, q0000 pshufd subpelh1, m0, q1111 %else punpcklbw m0, m0 psraw m0, 8 pshufd subpelh0, m0, q1010 pshufd subpelh1, m0, q3232 mova m7, [base+pw_2] %endif punpcklbw m1, m1 psraw m1, 8 pshufd subpelv0, m1, q0000 pshufd subpelv1, m1, q1111 pshufd subpelv2, m1, q2222 pshufd subpelv3, m1, q3333 lea stride3q, [strideq*3] sub srcq, 3 sub srcq, stride3q mov r6, srcq mov r8, tmpq %endif lea r5d, [wq-4] shl r5d, 14 add r5d, hd .hv_w8_loop0: %if cpuflag(ssse3) %if ARCH_X86_64 mova m7, [base+subpel_h_shufA] mova m8, [base+subpel_h_shufB] mova m9, [base+subpel_h_shufC] %define shufA m7 %define shufB m8 %define shufC m9 %else %define shufA [base+subpel_h_shufA] %define shufB [base+subpel_h_shufB] %define shufC [base+subpel_h_shufC] %endif %endif PREP_8TAP_HV m4, srcq+strideq*0, m7, m0 PREP_8TAP_HV m5, srcq+strideq*1, m7, m0 %if ARCH_X86_64 PREP_8TAP_HV m6, srcq+strideq*2, m7, m0 add srcq, stride3q PREP_8TAP_HV m0, srcq+strideq*0, m7, m9 %else lea srcq, [srcq+strideq*2] %if notcpuflag(ssse3) mova [esp], m4 %endif PREP_8TAP_HV m6, srcq+strideq*0, m7, m4 PREP_8TAP_HV m0, srcq+strideq*1, m7, m4 lea srcq, [srcq+strideq*2] %endif %if cpuflag(ssse3) mova m7, [base+pw_8192] %else mova m7, [base+pw_2] %if ARCH_X86_32 mova 
m4, [esp] %endif %endif PMULHRSW_8192 m4, m4, m7 PMULHRSW_8192 m5, m5, m7 PMULHRSW_8192 m6, m6, m7 PMULHRSW_8192 m0, m0, m7 punpcklwd m1, m4, m5 ; 01 punpcklwd m2, m5, m6 ; 12 punpcklwd m3, m6, m0 ; 23 SAVELINE_W8 1, m1 SAVELINE_W8 2, m2 SAVELINE_W8 3, m3 %if cpuflag(ssse3) mova m7, [base+subpel_h_shufA] %endif %if ARCH_X86_64 PREP_8TAP_HV m4, srcq+strideq*1, m8, m9 PREP_8TAP_HV m5, srcq+strideq*2, m8, m9 add srcq, stride3q PREP_8TAP_HV m6, srcq+strideq*0, m8, m9 %else %if notcpuflag(ssse3) mova [esp+0x30], m0 %endif PREP_8TAP_HV m4, srcq+strideq*0, m7, m0 PREP_8TAP_HV m5, srcq+strideq*1, m7, m0 lea srcq, [srcq+strideq*2] PREP_8TAP_HV m6, srcq+strideq*0, m7, m0 %endif %if cpuflag(ssse3) mova m7, [base+pw_8192] %elif ARCH_X86_32 mova m0, [esp+0x30] mova m7, [base+pw_2] %endif PMULHRSW_8192 m1, m4, m7 PMULHRSW_8192 m2, m5, m7 PMULHRSW_8192 m3, m6, m7 punpcklwd m4, m0, m1 ; 34 punpcklwd m5, m1, m2 ; 45 punpcklwd m6, m2, m3 ; 56 SAVELINE_W8 6, m3 RESTORELINE_W8 1, m1 RESTORELINE_W8 2, m2 RESTORELINE_W8 3, m3 .hv_w8_loop: SAVELINE_W8 1, m3 SAVELINE_W8 2, m4 SAVELINE_W8 3, m5 SAVELINE_W8 4, m6 %if ARCH_X86_32 pmaddwd m0, m1, subpelv0 ; a0 pmaddwd m7, m2, subpelv0 ; b0 pmaddwd m3, subpelv1 ; a1 pmaddwd m4, subpelv1 ; b1 paddd m0, m3 paddd m7, m4 pmaddwd m5, subpelv2 ; a2 pmaddwd m6, subpelv2 ; b2 paddd m0, m5 paddd m7, m6 mova m5, [base+pd_32] paddd m0, m5 paddd m7, m5 mova accuv0, m0 mova accuv1, m7 %else pmaddwd accuv0, m1, subpelv0 ; a0 pmaddwd accuv1, m2, subpelv0 ; b0 pmaddwd m3, subpelv1 ; a1 pmaddwd m4, subpelv1 ; b1 paddd accuv0, m3 paddd accuv1, m4 pmaddwd m5, subpelv2 ; a2 pmaddwd m6, subpelv2 ; b2 paddd accuv0, m5 paddd accuv1, m6 mova m7, [base+pd_32] paddd accuv0, m7 paddd accuv1, m7 %if cpuflag(ssse3) mova m7, [base+subpel_h_shufB] mova m6, [base+subpel_h_shufC] mova m5, [base+subpel_h_shufA] %define shufA m5 %define shufB m7 %define shufC m6 %endif %endif PREP_8TAP_HV m0, srcq+strideq*1, m5, m6 lea srcq, [srcq+strideq*2] PREP_8TAP_HV m4, srcq+strideq*0, m5, m6 %if cpuflag(ssse3) mova m5, [base+pw_8192] %else mova m5, [base+pw_2] %endif PMULHRSW_8192 m0, m0, m5 PMULHRSW_8192 m4, m4, m5 RESTORELINE_W8 6, m6 punpcklwd m5, m6, m0 ; 67 punpcklwd m6, m0, m4 ; 78 pmaddwd m1, m5, subpelv3 ; a3 paddd m2, m1, accuv0 pmaddwd m1, m6, subpelv3 ; b3 paddd m1, m1, accuv1 psrad m2, 6 psrad m1, 6 packssdw m2, m1 movq [tmpq+wq*0], m2 movhps [tmpq+wq*2], m2 lea tmpq, [tmpq+wq*4] sub hd, 2 jle .hv_w8_outer SAVELINE_W8 6, m4 RESTORELINE_W8 1, m1 RESTORELINE_W8 2, m2 RESTORELINE_W8 3, m3 RESTORELINE_W8 4, m4 jmp .hv_w8_loop .hv_w8_outer: %if ARCH_X86_32 mov srcq, srcm mov tmpq, tmpm movzx hd, r5w add srcq, 4 add tmpq, 8 mov srcm, srcq mov tmpm, tmpq %else add r6, 4 add r8, 8 movzx hd, r5b mov srcq, r6 mov tmpq, r8 %endif sub r5d, 1<<16 jg .hv_w8_loop0 RET %endmacro %macro movifprep 2 %if isprep mov %1, %2 %endif %endmacro %macro SAVE_REG 1 %xdefine r%1_save r%1 %xdefine r%1q_save r%1q %xdefine r%1d_save r%1d %if ARCH_X86_32 %define r%1m_save [rstk+stack_offset+(%1+1)*4] %endif %endmacro %macro LOAD_REG 1 %xdefine r%1 r%1_save %xdefine r%1q r%1q_save %xdefine r%1d r%1d_save %if ARCH_X86_32 %define r%1m r%1m_save %endif %undef r%1d_save %undef r%1q_save %undef r%1_save %endmacro %macro REMAP_REG 2-3 %xdefine r%1 r%2 %xdefine r%1q r%2q %xdefine r%1d r%2d %if ARCH_X86_32 %if %3 == 0 %xdefine r%1m r%2m %else %define r%1m [rstk+stack_offset+(%1+1)*4] %endif %endif %endmacro %macro MCT_8TAP_SCALED_REMAP_REGS_TO_PREV 0 %if isprep %if ARCH_X86_64 SAVE_REG 14 %assign %%i 14 %rep 14 %assign %%j %%i-1 REMAP_REG 
%%i, %%j %assign %%i %%i-1 %endrep %else SAVE_REG 5 %assign %%i 5 %rep 5 %assign %%j %%i-1 REMAP_REG %%i, %%j, 0 %assign %%i %%i-1 %endrep %endif %endif %endmacro %macro MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT 0 %if isprep %assign %%i 1 %if ARCH_X86_64 %rep 13 %assign %%j %%i+1 REMAP_REG %%i, %%j %assign %%i %%i+1 %endrep LOAD_REG 14 %else %rep 4 %assign %%j %%i+1 REMAP_REG %%i, %%j, 1 %assign %%i %%i+1 %endrep LOAD_REG 5 %endif %endif %endmacro %macro MC_8TAP_SCALED_RET 0-1 1 ; leave_mapping_unchanged MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT RET %if %1 MCT_8TAP_SCALED_REMAP_REGS_TO_PREV %endif %endmacro %if ARCH_X86_64 %macro MC_8TAP_SCALED_H 12 ; dst[0-1], tmp[0-5], weights[0-3] SWAP m%2, m%5 movq m%1, [srcq+ r4] movq m%2, [srcq+ r6] movhps m%1, [srcq+ r7] movhps m%2, [srcq+ r9] movq m%3, [srcq+r10] movq m%4, [srcq+r11] movhps m%3, [srcq+r13] movhps m%4, [srcq+ rX] add srcq, ssq movq m%5, [srcq+ r4] movq m%6, [srcq+ r6] movhps m%5, [srcq+ r7] movhps m%6, [srcq+ r9] movq m%7, [srcq+r10] movq m%8, [srcq+r11] movhps m%7, [srcq+r13] movhps m%8, [srcq+ rX] add srcq, ssq pmaddubsw m%1, m%9 pmaddubsw m%5, m%9 pmaddubsw m%2, m%10 pmaddubsw m%6, m%10 pmaddubsw m%3, m%11 pmaddubsw m%7, m%11 pmaddubsw m%4, m%12 pmaddubsw m%8, m%12 phaddw m%1, m%2 phaddw m%5, m%6 phaddw m%3, m%4 phaddw m%7, m%8 phaddw m%1, m%3 phaddw m%5, m%7 pmulhrsw m%1, m12 pmulhrsw m%5, m12 SWAP m%2, m%5 %endmacro %else %macro MC_8TAP_SCALED_H 2-3 1 ; weights_mem_start, h_mem_start, load_fh_offsets %if %3 == 1 mov r0, [esp+ 0] mov rX, [esp+ 8] mov r4, [esp+ 4] mov r5, [esp+12] %endif movq m0, [srcq+r0] movq m1, [srcq+rX] movhps m0, [srcq+r4] movhps m1, [srcq+r5] add srcq, ssq movq m4, [srcq+r0] movq m5, [srcq+rX] movhps m4, [srcq+r4] movhps m5, [srcq+r5] mov r0, [esp+16] mov rX, [esp+24] mov r4, [esp+20] mov r5, [esp+28] sub srcq, ssq movq m2, [srcq+r0] movq m3, [srcq+rX] movhps m2, [srcq+r4] movhps m3, [srcq+r5] add srcq, ssq movq m6, [srcq+r0] movq m7, [srcq+rX] movhps m6, [srcq+r4] movhps m7, [srcq+r5] add srcq, ssq pmaddubsw m0, [esp+%1+ 0] pmaddubsw m4, [esp+%1+ 0] pmaddubsw m1, [esp+%1+16] pmaddubsw m5, [esp+%1+16] pmaddubsw m2, [esp+%1+32] pmaddubsw m6, [esp+%1+32] pmaddubsw m3, [esp+%1+48] pmaddubsw m7, [esp+%1+48] phaddw m0, m1 phaddw m4, m5 phaddw m2, m3 phaddw m6, m7 phaddw m0, m2 phaddw m4, m6 pmulhrsw m0, m12 pmulhrsw m4, m12 %if %2 != 0 mova [esp+%2+ 0], m0 mova [esp+%2+16], m4 %endif %endmacro %endif %macro MC_8TAP_SCALED 1 %ifidn %1, put %assign isprep 0 %if ARCH_X86_64 %if required_stack_alignment <= STACK_ALIGNMENT cglobal put_8tap_scaled_8bpc, 2, 15, 16, 0x180, dst, ds, src, ss, w, h, mx, my, dx, dy %else cglobal put_8tap_scaled_8bpc, 2, 14, 16, 0x180, dst, ds, src, ss, w, h, mx, my, dx, dy %endif %else ; ARCH_X86_32 %if required_stack_alignment <= STACK_ALIGNMENT cglobal put_8tap_scaled_8bpc, 0, 7, 8, 0x200, dst, ds, src, ss, w, h, mx, my, dx, dy %else cglobal put_8tap_scaled_8bpc, 0, 7, 8, -0x200-0x20, dst, ds, src, ss, w, h, mx, my, dx, dy %endif %endif %xdefine base_reg r12 %define rndshift 10 %else ; prep %assign isprep 1 %if ARCH_X86_64 %if required_stack_alignment <= STACK_ALIGNMENT cglobal prep_8tap_scaled_8bpc, 2, 15, 16, 0x180, tmp, src, ss, w, h, mx, my, dx, dy %xdefine tmp_stridem r14q %else cglobal prep_8tap_scaled_8bpc, 2, 14, 16, 0x180, tmp, src, ss, w, h, mx, my, dx, dy %define tmp_stridem qword [rsp+0x138] %endif %xdefine base_reg r11 %else ; ARCH_X86_32 %if required_stack_alignment <= STACK_ALIGNMENT cglobal prep_8tap_scaled_8bpc, 0, 7, 8, 0x200, tmp, src, ss, w, h, mx, my, dx, dy %else cglobal 
prep_8tap_scaled_8bpc, 0, 6, 8, 0x200, tmp, src, ss, w, h, mx, my, dx, dy %endif %define tmp_stridem dword [esp+0x138] %endif %define rndshift 6 %endif %if ARCH_X86_32 mov [esp+0x1f0], t0d mov [esp+0x1f4], t1d %if !isprep && required_stack_alignment > STACK_ALIGNMENT mov dstd, dstm mov dsd, dsm mov srcd, srcm mov ssd, ssm mov hd, hm mov r4, mxm %define r0m [esp+0x200] %define dsm [esp+0x204] %define dsmp dsm %define r1m dsm %define r2m [esp+0x208] %define ssm [esp+0x20c] %define r3m ssm %define hm [esp+0x210] %define mxm [esp+0x214] mov r0m, dstd mov dsm, dsd mov r2m, srcd mov ssm, ssd mov hm, hd mov r0, mym mov r1, dxm mov r2, dym %define mym [esp+0x218] %define dxm [esp+0x09c] %define dym [esp+0x21c] mov mxm, r4 mov mym, r0 mov dxm, r1 mov dym, r2 tzcnt wd, wm %endif %if isprep && required_stack_alignment > STACK_ALIGNMENT %xdefine base_reg r5 %else %xdefine base_reg r6 %endif mov ssd, ssm %endif LEA base_reg, %1_8tap_scaled_8bpc_ssse3 %xdefine base base_reg-%1_8tap_scaled_8bpc_ssse3 %if ARCH_X86_64 || isprep || required_stack_alignment <= STACK_ALIGNMENT tzcnt wd, wm %endif %if ARCH_X86_32 %define m8 m0 %define m9 m1 %define m14 m4 %define m15 m3 %endif movd m8, dxm movd m14, mxm pshufd m8, m8, q0000 pshufd m14, m14, q0000 %if isprep && UNIX64 mov r5d, t0d DECLARE_REG_TMP 5, 7 %endif %if ARCH_X86_64 mov dyd, dym %endif %ifidn %1, put %if WIN64 mov r8d, hm DEFINE_ARGS dst, ds, src, ss, w, _, _, my, h, dy, ss3 %define hm r5m %define dxm r8m %elif ARCH_X86_64 DEFINE_ARGS dst, ds, src, ss, w, h, _, my, dx, dy, ss3 %define hm r6m %endif %if ARCH_X86_64 %if required_stack_alignment > STACK_ALIGNMENT %define dsm [rsp+0x138] %define rX r1 %define rXd r1d %else %define dsm dsq %define rX r14 %define rXd r14d %endif %else %define rX r1 %endif %else ; prep %if WIN64 mov r7d, hm DEFINE_ARGS tmp, src, ss, w, _, _, my, h, dy, ss3 %define hm r4m %define dxm r7m %elif ARCH_X86_64 DEFINE_ARGS tmp, src, ss, w, h, _, my, dx, dy, ss3 %define hm [rsp+0x94] %endif MCT_8TAP_SCALED_REMAP_REGS_TO_PREV %if ARCH_X86_64 %define rX r14 %define rXd r14d %else %define rX r3 %endif %endif %if ARCH_X86_64 mova m10, [base+pd_0x3ff] mova m12, [base+pw_8192] %ifidn %1, put mova m13, [base+pd_512] %else mova m13, [base+pd_32] %endif %else %define m10 [base+pd_0x3ff] %define m12 [base+pw_8192] %ifidn %1, put %define m13 [base+pd_512] %else %define m13 [base+pd_32] %endif %endif pxor m9, m9 %if ARCH_X86_64 lea ss3q, [ssq*3] movzx r7d, t1b shr t1d, 16 cmp hd, 6 cmovs t1d, r7d sub srcq, ss3q %else MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT mov r1, [esp+0x1f4] lea r0, [ssq*3] movzx r2, r1b shr r1, 16 cmp dword hm, 6 cmovs r1, r2 mov [esp+0x1f4], r1 mov r1, r1m mov r2, r2m sub srcq, r0 MCT_8TAP_SCALED_REMAP_REGS_TO_PREV %define ss3q r0 %define myd r4 %define dyd dword dym %define hd dword hm %endif cmp dyd, 1024 je .dy1 cmp dyd, 2048 je .dy2 movzx wd, word [base+%1_8tap_scaled_ssse3_table+wq*2] add wq, base_reg jmp wq %ifidn %1, put .w2: %if ARCH_X86_64 mov myd, mym movzx t0d, t0b dec srcq movd m15, t0d %else movzx r4, byte [esp+0x1f0] dec srcq movd m15, r4 %endif punpckldq m9, m8 SWAP m8, m9 paddd m14, m8 ; mx+dx*[0-1] %if ARCH_X86_64 mova m11, [base+pd_0x4000] %else %define m11 [base+pd_0x4000] %endif pshufd m15, m15, q0000 pand m8, m14, m10 psrld m8, 6 paddd m15, m8 movd r4d, m15 psrldq m15, 4 %if ARCH_X86_64 movd r6d, m15 %else movd r3d, m15 %endif mova m5, [base+bdct_lb_dw] mova m6, [base+subpel_s_shuf2] movd m15, [base+subpel_filters+r4*8+2] %if ARCH_X86_64 movd m7, [base+subpel_filters+r6*8+2] %else movd m7, 
[base+subpel_filters+r3*8+2] %endif pxor m9, m9 pcmpeqd m8, m9 psrld m14, 10 %if ARCH_X86_32 mov r3, r3m pshufb m14, m5 paddb m14, m6 mova [rsp+0x180], m14 SWAP m5, m0 SWAP m6, m3 %define m8 m5 %define m15 m6 %endif movq m0, [srcq+ssq*0] movq m2, [srcq+ssq*2] movhps m0, [srcq+ssq*1] movhps m2, [srcq+ss3q ] lea srcq, [srcq+ssq*4] %if ARCH_X86_64 pshufb m14, m5 paddb m14, m6 %endif movq m1, [srcq+ssq*0] movq m3, [srcq+ssq*2] movhps m1, [srcq+ssq*1] movhps m3, [srcq+ss3q ] lea srcq, [srcq+ssq*4] punpckldq m15, m7 punpcklqdq m15, m15 %if ARCH_X86_64 pand m11, m8 pandn m8, m15 SWAP m15, m8 por m15, m11 %else pand m7, m8, m11 pandn m8, m15 %define m8 m6 %define m15 m5 por m15, m7 mova [rsp+0x190], m15 %endif pshufb m0, m14 pshufb m2, m14 pshufb m1, m14 pshufb m3, m14 pmaddubsw m0, m15 pmaddubsw m2, m15 pmaddubsw m1, m15 pmaddubsw m3, m15 phaddw m0, m2 phaddw m1, m3 pmulhrsw m0, m12 ; 0 1 2 3 pmulhrsw m1, m12 ; 4 5 6 7 palignr m2, m1, m0, 4 ; 1 2 3 4 punpcklwd m3, m0, m2 ; 01 12 punpckhwd m0, m2 ; 23 34 pshufd m5, m1, q0321 ; 5 6 7 _ punpcklwd m2, m1, m5 ; 45 56 punpckhwd m4, m1, m5 ; 67 __ %if ARCH_X86_32 mov myd, mym mov r0, r0m mova [rsp+0x1a0], m3 mova [rsp+0x1b0], m0 mova [rsp+0x1c0], m2 mova [rsp+0x1d0], m4 %endif .w2_loop: and myd, 0x3ff %if ARCH_X86_64 mov r6d, 64 << 24 mov r4d, myd shr r4d, 6 lea r4d, [t1+r4] cmovnz r6q, [base+subpel_filters+r4*8] movq m11, r6q punpcklbw m11, m11 psraw m11, 8 pshufd m8, m11, q0000 pshufd m9, m11, q1111 pshufd m10, m11, q2222 pshufd m11, m11, q3333 pmaddwd m5, m3, m8 pmaddwd m6, m0, m9 pmaddwd m7, m2, m10 pmaddwd m8, m4, m11 paddd m5, m6 paddd m7, m8 %else mov mym, myd mov r1, [esp+0x1f4] xor r3, r3 shr r4, 6 lea r1, [r1+r4] mov r4, 64 << 24 cmovnz r4, [base+subpel_filters+r1*8+0] cmovnz r3, [base+subpel_filters+r1*8+4] movd m7, r4 movd m6, r3 punpckldq m7, m6 punpcklbw m7, m7 psraw m7, 8 pshufd m5, m7, q0000 pshufd m6, m7, q1111 pmaddwd m3, m5 pmaddwd m0, m6 pshufd m5, m7, q2222 pshufd m7, m7, q3333 pmaddwd m2, m5 pmaddwd m4, m7 paddd m3, m0 paddd m2, m4 SWAP m5, m3 SWAP m7, m2 %endif paddd m5, m13 paddd m5, m7 psrad m5, 10 packssdw m5, m5 packuswb m5, m5 %if ARCH_X86_64 pextrw r6d, m5, 0 mov [dstq], r6w add dstq, dsq dec hd jz .ret add myd, dyd %else pextrw r3d, m5, 0 mov [dstq], r3w add dstq, dsm dec hd jz .ret mov myd, mym add myd, dym %endif test myd, ~0x3ff %if ARCH_X86_32 SWAP m3, m5 SWAP m2, m7 mova m3, [rsp+0x1a0] mova m0, [rsp+0x1b0] mova m2, [rsp+0x1c0] mova m4, [rsp+0x1d0] %define m14 [esp+0x180] %define m15 [esp+0x190] %endif jz .w2_loop %if ARCH_X86_32 mov r3, r3m %endif movq m5, [srcq] test myd, 0x400 jz .w2_skip_line add srcq, ssq shufps m3, m0, q1032 ; 01 12 shufps m0, m2, q1032 ; 23 34 shufps m2, m4, q1032 ; 45 56 pshufb m5, m14 pmaddubsw m5, m15 phaddw m5, m5 pmulhrsw m5, m12 palignr m4, m5, m1, 12 punpcklqdq m1, m4, m4 ; 6 7 6 7 punpcklwd m4, m1, m5 ; 67 __ %if ARCH_X86_32 mova [rsp+0x1a0], m3 mova [rsp+0x1b0], m0 mova [rsp+0x1c0], m2 mova [rsp+0x1d0], m4 %endif jmp .w2_loop .w2_skip_line: movhps m5, [srcq+ssq*1] lea srcq, [srcq+ssq*2] mova m3, m0 ; 01 12 mova m0, m2 ; 23 34 pshufb m5, m14 pmaddubsw m5, m15 phaddw m5, m5 pmulhrsw m5, m12 ; 6 7 6 7 palignr m4, m5, m1, 8 ; 4 5 6 7 pshufd m5, m4, q0321 ; 5 6 7 _ mova m1, m4 punpcklwd m2, m4, m5 ; 45 56 punpckhwd m4, m5 ; 67 __ %if ARCH_X86_32 mova [rsp+0x1a0], m3 mova [rsp+0x1b0], m0 mova [rsp+0x1c0], m2 mova [rsp+0x1d0], m4 %endif jmp .w2_loop %endif INIT_XMM ssse3 .w4: %if ARCH_X86_64 mov myd, mym movzx t0d, t0b dec srcq movd m15, t0d %else %define m8 m0 %xdefine m14 m4 %define m15 m3 
movzx r4, byte [esp+0x1f0] dec srcq movd m15, r4 %endif pmaddwd m8, [base+rescale_mul] %if ARCH_X86_64 mova m11, [base+pd_0x4000] %else %define m11 [base+pd_0x4000] %endif pshufd m15, m15, q0000 paddd m14, m8 ; mx+dx*[0-3] pand m0, m14, m10 psrld m0, 6 paddd m15, m0 psrldq m7, m15, 8 %if ARCH_X86_64 movd r4d, m15 movd r11d, m7 psrldq m15, 4 psrldq m7, 4 movd r6d, m15 movd r13d, m7 movd m15, [base+subpel_filters+ r4*8+2] movd m2, [base+subpel_filters+r11*8+2] movd m3, [base+subpel_filters+ r6*8+2] movd m4, [base+subpel_filters+r13*8+2] %else movd r0, m15 movd rX, m7 psrldq m15, 4 psrldq m7, 4 movd r4, m15 movd r5, m7 movd m1, [base+subpel_filters+r0*8+2] movd m2, [base+subpel_filters+rX*8+2] movd m3, [base+subpel_filters+r4*8+2] movd m7, [base+subpel_filters+r5*8+2] movifprep r3, r3m SWAP m4, m7 %define m15 m1 %endif mova m5, [base+bdct_lb_dw] movq m6, [base+subpel_s_shuf2] psrld m14, 10 punpckldq m15, m3 punpckldq m2, m4 punpcklqdq m15, m2 punpcklqdq m6, m6 pshufb m14, m5 paddb m14, m6 %if ARCH_X86_64 pcmpeqd m0, m9 pand m11, m0 %else mova [esp+0x180], m14 SWAP m7, m4 pxor m3, m3 pcmpeqd m0, m3 pand m2, m11, m0 %define m11 m2 %endif pandn m0, m15 %if ARCH_X86_64 SWAP m15, m0 %else %define m15 m0 %endif por m15, m11 %if ARCH_X86_64 movu m7, [srcq+ssq*0] movu m9, [srcq+ssq*1] movu m8, [srcq+ssq*2] movu m10, [srcq+ss3q ] lea srcq, [srcq+ssq*4] movu m2, [srcq+ssq*0] movu m4, [srcq+ssq*1] movu m3, [srcq+ssq*2] movu m5, [srcq+ss3q ] lea srcq, [srcq+ssq*4] pshufb m7, m14 pshufb m9, m14 pshufb m8, m14 pshufb m10, m14 pshufb m2, m14 pshufb m4, m14 pshufb m3, m14 pshufb m5, m14 pmaddubsw m7, m15 pmaddubsw m9, m15 pmaddubsw m8, m15 pmaddubsw m10, m15 pmaddubsw m2, m15 pmaddubsw m4, m15 pmaddubsw m3, m15 pmaddubsw m5, m15 phaddw m7, m9 phaddw m8, m10 phaddw m9, m2, m4 phaddw m3, m5 pmulhrsw m7, m12 ; 0 1 pmulhrsw m8, m12 ; 2 3 pmulhrsw m9, m12 ; 4 5 pmulhrsw m3, m12 ; 6 7 shufps m4, m7, m8, q1032 ; 1 2 shufps m5, m8, m9, q1032 ; 3 4 shufps m6, m9, m3, q1032 ; 5 6 psrldq m11, m3, 8 ; 7 _ punpcklwd m0, m7, m4 ; 01 punpckhwd m7, m4 ; 12 punpcklwd m1, m8, m5 ; 23 punpckhwd m8, m5 ; 34 punpcklwd m2, m9, m6 ; 45 punpckhwd m9, m6 ; 56 punpcklwd m3, m11 ; 67 mova [rsp+0x00], m7 mova [rsp+0x10], m8 mova [rsp+0x20], m9 %else mova [esp+0x190], m15 lea ss3q, [ssq*3] movu m2, [srcq+ssq*0] movu m3, [srcq+ssq*1] movu m7, [srcq+ssq*2] movu m6, [srcq+ss3q ] lea srcq, [srcq+ssq*4] pshufb m2, m14 pshufb m3, m14 pshufb m7, m14 pshufb m6, m14 pmaddubsw m2, m15 pmaddubsw m3, m15 pmaddubsw m7, m15 pmaddubsw m6, m15 phaddw m2, m3 phaddw m7, m6 movu m1, [srcq+ssq*0] movu m5, [srcq+ssq*1] movu m3, [srcq+ssq*2] movu m6, [srcq+ss3q ] lea srcq, [srcq+ssq*4] pshufb m1, m14 pshufb m5, m14 pshufb m3, m14 pshufb m6, m14 pmaddubsw m1, m15 pmaddubsw m5, m15 pmaddubsw m3, m15 pmaddubsw m6, m15 phaddw m1, m5 phaddw m3, m6 pmulhrsw m2, m12 pmulhrsw m7, m12 pmulhrsw m1, m12 pmulhrsw m3, m12 shufps m4, m2, m7, q1032 ; 1 2 shufps m5, m7, m1, q1032 ; 3 4 shufps m6, m1, m3, q1032 ; 5 6 psrldq m0, m3, 8 ; 7 _ mova [esp+0x1a0], m0 %define m11 [esp+0x1a0] punpcklwd m0, m2, m4 ; 01 punpckhwd m2, m4 ; 12 punpcklwd m4, m7, m5 ; 23 punpckhwd m7, m5 ; 34 punpcklwd m5, m1, m6 ; 45 punpckhwd m1, m6 ; 56 punpcklwd m3, [esp+0x1a0] ; 67 mov myd, mym mov r0, r0m mova [esp+0x1b0], m0 ; 01 mova [esp+0x1c0], m4 ; 23 mova [esp+0x1d0], m5 ; 45 mova [esp+0x1e0], m3 ; 67 mova [rsp+0x00], m2 ; 12 mova [rsp+0x10], m7 ; 34 mova [rsp+0x20], m1 ; 56 SWAP m1, m4 SWAP m2, m5 %endif .w4_loop: and myd, 0x3ff %if ARCH_X86_64 mov r6d, 64 << 24 mov r4d, myd shr r4d, 6 lea 
r4d, [t1+r4] cmovnz r6q, [base+subpel_filters+r4*8] movq m10, r6q punpcklbw m10, m10 psraw m10, 8 pshufd m7, m10, q0000 pshufd m8, m10, q1111 pshufd m9, m10, q2222 pshufd m10, m10, q3333 pmaddwd m4, m0, m7 pmaddwd m5, m1, m8 pmaddwd m6, m2, m9 pmaddwd m7, m3, m10 paddd m4, m5 paddd m6, m7 paddd m4, m13 paddd m4, m6 %else mov mym, myd mov r5, [esp+0x1f4] xor r3, r3 shr r4, 6 lea r5, [r5+r4] mov r4, 64 << 24 cmovnz r4, [base+subpel_filters+r5*8+0] cmovnz r3, [base+subpel_filters+r5*8+4] movd m7, r4 movd m6, r3 punpckldq m7, m6 punpcklbw m7, m7 psraw m7, 8 pshufd m4, m7, q0000 pshufd m5, m7, q1111 pshufd m6, m7, q2222 pshufd m7, m7, q3333 pmaddwd m0, m4 pmaddwd m1, m5 pmaddwd m2, m6 pmaddwd m3, m7 paddd m0, m1 paddd m2, m3 paddd m0, m13 paddd m0, m2 SWAP m4, m0 %endif psrad m4, rndshift packssdw m4, m4 %ifidn %1, put packuswb m4, m4 movd [dstq], m4 add dstq, dsmp %else movq [tmpq], m4 add tmpq, 8 %endif dec hd jz .ret %if ARCH_X86_64 add myd, dyd test myd, ~0x3ff jz .w4_loop %else SWAP m0, m4 mov myd, mym mov r3, r3m add myd, dym test myd, ~0x3ff jnz .w4_next_line mova m0, [esp+0x1b0] mova m1, [esp+0x1c0] mova m2, [esp+0x1d0] mova m3, [esp+0x1e0] jmp .w4_loop .w4_next_line: %define m14 [esp+0x180] %define m15 [esp+0x190] %endif movu m4, [srcq] test myd, 0x400 jz .w4_skip_line %if ARCH_X86_64 mova m0, [rsp+0x00] mova [rsp+0x00], m1 mova m1, [rsp+0x10] mova [rsp+0x10], m2 mova m2, [rsp+0x20] mova [rsp+0x20], m3 %else mova m5, [esp+0x1c0] mova m0, [rsp+0x000] mova [rsp+0x00], m5 mova [esp+0x1b0], m0 mova m6, [esp+0x1d0] mova m1, [rsp+0x010] mova [rsp+0x10], m6 mova [esp+0x1c0], m1 mova m7, [esp+0x1e0] mova m2, [rsp+0x020] mova [rsp+0x20], m7 mova [esp+0x1d0], m2 %endif pshufb m4, m14 pmaddubsw m4, m15 phaddw m4, m4 pmulhrsw m4, m12 punpcklwd m3, m11, m4 %if ARCH_X86_32 mova [esp+0x1e0], m3 %endif mova m11, m4 add srcq, ssq jmp .w4_loop .w4_skip_line: %if ARCH_X86_32 mova m0, [esp+0x1c0] mova m1, [esp+0x1d0] mova m2, [esp+0x1e0] %endif movu m5, [srcq+ssq*1] lea srcq, [srcq+ssq*2] mova m6, [rsp+0x10] mova m7, [rsp+0x20] pshufb m4, m14 pshufb m5, m14 pmaddubsw m4, m15 pmaddubsw m5, m15 phaddw m4, m5 pmulhrsw m4, m12 punpcklwd m5, m11, m4 mova [rsp+0x00], m6 mova [rsp+0x10], m7 mova [rsp+0x20], m5 %if ARCH_X86_64 psrldq m11, m4, 8 mova m0, m1 mova m1, m2 mova m2, m3 punpcklwd m3, m4, m11 %else psrldq m6, m4, 8 punpcklwd m3, m4, m6 mova [esp+0x1a0], m6 mova [esp+0x1b0], m0 mova [esp+0x1c0], m1 mova [esp+0x1d0], m2 mova [esp+0x1e0], m3 %endif jmp .w4_loop INIT_XMM ssse3 .w8: mov dword [rsp+0x90], 1 movifprep tmp_stridem, 16 jmp .w_start .w16: mov dword [rsp+0x90], 2 movifprep tmp_stridem, 32 jmp .w_start .w32: mov dword [rsp+0x90], 4 movifprep tmp_stridem, 64 jmp .w_start .w64: mov dword [rsp+0x90], 8 movifprep tmp_stridem, 128 jmp .w_start .w128: mov dword [rsp+0x90], 16 movifprep tmp_stridem, 256 .w_start: %ifidn %1, put movifnidn dsm, dsq %endif %if ARCH_X86_64 shr t0d, 16 movd m15, t0d %else %define m8 m0 %xdefine m14 m4 %define m15 m3 %if isprep %define ssq ssm %endif mov r4, [esp+0x1f0] shr r4, 16 movd m15, r4 mov r0, r0m mov myd, mym %endif sub srcq, 3 pslld m7, m8, 2 ; dx*4 pmaddwd m8, [base+rescale_mul] ; dx*[0-3] pshufd m15, m15, q0000 paddd m14, m8 ; mx+dx*[0-3] mova [rsp+0x100], m7 mova [rsp+0x120], m15 mov [rsp+0x098], srcq mov [rsp+0x130], r0q ; dstq / tmpq %if ARCH_X86_64 && UNIX64 mov hm, hd %elif ARCH_X86_32 mov r5, hm mov [esp+0x094], myd mov [esp+0x134], r5 %endif jmp .hloop .hloop_prep: dec dword [rsp+0x090] jz .ret %if ARCH_X86_64 add qword [rsp+0x130], 8*(isprep+1) mov hd, hm 
%else add dword [esp+0x130], 8*(isprep+1) mov myd, [esp+0x094] mov r5, [esp+0x134] mov r0, [esp+0x130] %endif mova m7, [rsp+0x100] mova m14, [rsp+0x110] %if ARCH_X86_64 mova m10, [base+pd_0x3ff] %endif mova m15, [rsp+0x120] pxor m9, m9 mov srcq, [rsp+0x098] %if ARCH_X86_64 mov r0q, [rsp+0x130] ; dstq / tmpq %else mov mym, myd mov hm, r5 mov r0m, r0 mov r3, r3m %endif paddd m14, m7 .hloop: %if ARCH_X86_64 mova m11, [base+pq_0x40000000] %else %define m11 [base+pq_0x40000000] %endif psrld m2, m14, 10 mova [rsp], m2 pand m6, m14, m10 psrld m6, 6 paddd m5, m15, m6 pcmpeqd m6, m9 psrldq m2, m5, 8 %if ARCH_X86_64 movd r4d, m5 movd r6d, m2 psrldq m5, 4 psrldq m2, 4 movd r7d, m5 movd r9d, m2 movq m0, [base+subpel_filters+r4*8] movq m1, [base+subpel_filters+r6*8] movhps m0, [base+subpel_filters+r7*8] movhps m1, [base+subpel_filters+r9*8] %else movd r0, m5 movd rX, m2 psrldq m5, 4 psrldq m2, 4 movd r4, m5 movd r5, m2 movq m0, [base+subpel_filters+r0*8] movq m1, [base+subpel_filters+rX*8] movhps m0, [base+subpel_filters+r4*8] movhps m1, [base+subpel_filters+r5*8] pxor m2, m2 %define m9 m2 %endif paddd m14, m7 ; mx+dx*[4-7] pand m5, m14, m10 psrld m5, 6 paddd m15, m5 pcmpeqd m5, m9 mova [rsp+0x110], m14 psrldq m4, m15, 8 %if ARCH_X86_64 movd r10d, m15 movd r11d, m4 psrldq m15, 4 psrldq m4, 4 movd r13d, m15 movd rXd, m4 movq m2, [base+subpel_filters+r10*8] movq m3, [base+subpel_filters+r11*8] movhps m2, [base+subpel_filters+r13*8] movhps m3, [base+subpel_filters+ rX*8] psrld m14, 10 psrldq m4, m14, 8 movd r10d, m14 movd r11d, m4 psrldq m14, 4 psrldq m4, 4 movd r13d, m14 movd rXd, m4 mov r4d, [rsp+ 0] mov r6d, [rsp+ 8] mov r7d, [rsp+ 4] mov r9d, [rsp+12] pshufd m4, m6, q1100 pshufd m6, m6, q3322 pshufd m14, m5, q1100 pshufd m5, m5, q3322 pand m7, m11, m4 pand m8, m11, m6 pand m15, m11, m14 pand m11, m11, m5 pandn m4, m0 pandn m6, m1 pandn m14, m2 pandn m5, m3 por m7, m4 por m8, m6 por m15, m14 por m11, m5 mova [rsp+0x10], m7 mova [rsp+0x20], m8 mova [rsp+0x30], m15 mova [rsp+0x40], m11 MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 9, 10, 7, 8, 15, 11 ; 0-1 mova [rsp+0x50], m1 mova [rsp+0x60], m2 MC_8TAP_SCALED_H 3, 4, 5, 6, 1, 2, 9, 10, 7, 8, 15, 11 ; 2-3 mova [rsp+0x70], m3 mova [rsp+0x80], m4 MC_8TAP_SCALED_H 5, 6, 1, 2, 3, 4, 9, 10, 7, 8, 15, 11 ; 4-5 MC_8TAP_SCALED_H 0,14, 1, 2, 3, 4, 9, 10, 7, 8, 15, 11 ; 6-7 SWAP m7, m0 SWAP m8, m14 mova m1, [rsp+0x50] mova m2, [rsp+0x60] mova m3, [rsp+0x70] mova m9, [rsp+0x80] mov myd, mym mov dyd, dym punpcklwd m4, m5, m6 ; 45a punpckhwd m5, m6 ; 45b punpcklwd m6, m7, m8 ; 67a punpckhwd m7, m8 ; 67b punpcklwd m0, m1, m2 ; 01a punpckhwd m1, m2 ; 01b punpcklwd m2, m3, m9 ; 23a punpckhwd m3, m9 ; 23b mova [rsp+0x50], m4 mova [rsp+0x60], m5 mova [rsp+0x70], m6 mova [rsp+0x80], m7 SWAP m14, m8 .vloop: and myd, 0x3ff mov r6d, 64 << 24 mov r4d, myd shr r4d, 6 lea r4d, [t1+r4] cmovnz r6q, [base+subpel_filters+r4*8] movq m11, r6q punpcklbw m11, m11 psraw m11, 8 pshufd m5, m11, q0000 pshufd m7, m11, q1111 pshufd m10, m11, q2222 pshufd m11, m11, q3333 pmaddwd m4, m5, m0 pmaddwd m5, m5, m1 pmaddwd m6, m7, m2 pmaddwd m7, m7, m3 paddd m4, m13 paddd m5, m13 paddd m4, m6 paddd m5, m7 pmaddwd m6, [rsp+0x50], m10 pmaddwd m7, [rsp+0x60], m10 pmaddwd m8, [rsp+0x70], m11 pmaddwd m9, [rsp+0x80], m11 paddd m4, m6 paddd m5, m7 paddd m4, m8 paddd m5, m9 %else movd r0, m15 movd rX, m4 psrldq m15, 4 psrldq m4, 4 movd r4, m15 movd r5, m4 mova m14, [esp+0x110] movq m2, [base+subpel_filters+r0*8] movq m3, [base+subpel_filters+rX*8] movhps m2, [base+subpel_filters+r4*8] movhps m3, 
[base+subpel_filters+r5*8] psrld m14, 10 mova [esp+16], m14 mov r0, [esp+ 0] mov rX, [esp+ 8] mov r4, [esp+ 4] mov r5, [esp+12] mova [esp+0x20], m0 mova [esp+0x30], m1 mova [esp+0x40], m2 mova [esp+0x50], m3 pshufd m4, m6, q1100 pshufd m6, m6, q3322 pshufd m7, m5, q1100 pshufd m5, m5, q3322 pand m0, m11, m4 pand m1, m11, m6 pand m2, m11, m7 pand m3, m11, m5 pandn m4, [esp+0x20] pandn m6, [esp+0x30] pandn m7, [esp+0x40] pandn m5, [esp+0x50] por m0, m4 por m1, m6 por m2, m7 por m3, m5 mova [esp+0x20], m0 mova [esp+0x30], m1 mova [esp+0x40], m2 mova [esp+0x50], m3 MC_8TAP_SCALED_H 0x20, 0x140, 0 ; 0-1 MC_8TAP_SCALED_H 0x20, 0x160 ; 2-3 MC_8TAP_SCALED_H 0x20, 0x180 ; 4-5 MC_8TAP_SCALED_H 0x20, 0x1a0 ; 6-7 mova m5, [esp+0x180] mova m6, [esp+0x190] mova m7, [esp+0x1a0] mova m0, [esp+0x1b0] mov myd, mym punpcklwd m4, m5, m6 ; 45a punpckhwd m5, m6 ; 45b punpcklwd m6, m7, m0 ; 67a punpckhwd m7, m0 ; 67b mova [esp+0x180], m4 mova [esp+0x190], m5 mova [esp+0x1a0], m6 mova [esp+0x1b0], m7 mova m1, [esp+0x140] mova m2, [esp+0x150] mova m3, [esp+0x160] mova m4, [esp+0x170] punpcklwd m0, m1, m2 ; 01a punpckhwd m1, m2 ; 01b punpcklwd m2, m3, m4 ; 23a punpckhwd m3, m4 ; 23b mova [esp+0x140], m0 mova [esp+0x150], m1 mova [esp+0x160], m2 mova [esp+0x170], m3 .vloop: mov r0, r0m mov r5, [esp+0x1f4] and myd, 0x3ff mov mym, myd xor r3, r3 shr r4, 6 lea r5, [r5+r4] mov r4, 64 << 24 cmovnz r4, [base+subpel_filters+r5*8+0] cmovnz r3, [base+subpel_filters+r5*8+4] movd m7, r4 movd m6, r3 punpckldq m7, m6 punpcklbw m7, m7 psraw m7, 8 pshufd m4, m7, q0000 pshufd m5, m7, q1111 pmaddwd m0, m4 pmaddwd m1, m4 pmaddwd m2, m5 pmaddwd m3, m5 pshufd m6, m7, q2222 pshufd m7, m7, q3333 paddd m0, m2 paddd m1, m3 pmaddwd m2, [esp+0x180], m6 pmaddwd m3, [esp+0x190], m6 pmaddwd m4, [esp+0x1a0], m7 pmaddwd m5, [esp+0x1b0], m7 paddd m0, m2 paddd m1, m3 paddd m0, m13 paddd m1, m13 paddd m4, m0 paddd m5, m1 %endif psrad m4, rndshift psrad m5, rndshift packssdw m4, m5 %ifidn %1, put packuswb m4, m4 movq [dstq], m4 add dstq, dsm %else mova [tmpq], m4 add tmpq, tmp_stridem %endif dec hd jz .hloop_prep %if ARCH_X86_64 add myd, dyd test myd, ~0x3ff jz .vloop test myd, 0x400 mov [rsp+0x140], myd mov r4d, [rsp+ 0] mov r6d, [rsp+ 8] mov r7d, [rsp+ 4] mov r9d, [rsp+12] jz .skip_line mova m14, [base+unpckw] movq m6, [srcq+r10] movq m7, [srcq+r11] movhps m6, [srcq+r13] movhps m7, [srcq+ rX] movq m4, [srcq+ r4] movq m5, [srcq+ r6] movhps m4, [srcq+ r7] movhps m5, [srcq+ r9] add srcq, ssq mov myd, [rsp+0x140] mov dyd, dym pshufd m9, m14, q1032 pshufb m0, m14 ; 0a 1a pshufb m1, m14 ; 0b 1b pshufb m2, m9 ; 3a 2a pshufb m3, m9 ; 3b 2b pmaddubsw m6, [rsp+0x30] pmaddubsw m7, [rsp+0x40] pmaddubsw m4, [rsp+0x10] pmaddubsw m5, [rsp+0x20] phaddw m6, m7 phaddw m4, m5 phaddw m4, m6 pmulhrsw m4, m12 pshufb m5, [rsp+0x50], m14 ; 4a 5a pshufb m6, [rsp+0x60], m14 ; 4b 5b pshufb m7, [rsp+0x70], m9 ; 7a 6a pshufb m8, [rsp+0x80], m9 ; 7b 6b punpckhwd m0, m2 ; 12a punpckhwd m1, m3 ; 12b punpcklwd m2, m5 ; 34a punpcklwd m3, m6 ; 34b punpckhwd m5, m7 ; 56a punpckhwd m6, m8 ; 56b punpcklwd m7, m4 ; 78a punpckhqdq m4, m4 punpcklwd m8, m4 ; 78b mova [rsp+0x50], m5 mova [rsp+0x60], m6 mova [rsp+0x70], m7 mova [rsp+0x80], m8 jmp .vloop .skip_line: mova m0, [rsp+0x10] mova m1, [rsp+0x20] mova m14, [rsp+0x30] mova m15, [rsp+0x40] MC_8TAP_SCALED_H 4, 8, 5, 6, 7, 9, 10, 11, 0, 1, 14, 15 mov myd, [rsp+0x140] mov dyd, dym mova m0, m2 ; 01a mova m1, m3 ; 01b mova m2, [rsp+0x50] ; 23a mova m3, [rsp+0x60] ; 23b mova m5, [rsp+0x70] ; 45a mova m6, [rsp+0x80] ; 45b punpcklwd m7, m4, m8 
; 67a punpckhwd m4, m8 ; 67b mova [rsp+0x50], m5 mova [rsp+0x60], m6 mova [rsp+0x70], m7 mova [rsp+0x80], m4 %else mov r0m, r0 mov myd, mym mov r3, r3m add myd, dym test myd, ~0x3ff mov mym, myd jnz .next_line mova m0, [esp+0x140] mova m1, [esp+0x150] mova m2, [esp+0x160] mova m3, [esp+0x170] jmp .vloop .next_line: test myd, 0x400 mov r0, [esp+ 0] mov rX, [esp+ 8] mov r4, [esp+ 4] mov r5, [esp+12] jz .skip_line mova m6, [base+unpckw] mova m0, [esp+0x140] mova m1, [esp+0x150] mova m7, [esp+0x180] movq m4, [srcq+r0] movq m5, [srcq+rX] movhps m4, [srcq+r4] movhps m5, [srcq+r5] pshufb m0, m6 ; 0a 1a pshufb m1, m6 ; 0b 1b pshufb m7, m6 ; 4a 5a mov r0, [esp+16] mov rX, [esp+24] mov r4, [esp+20] mov r5, [esp+28] movq m3, [srcq+r0] movq m2, [srcq+rX] movhps m3, [srcq+r4] movhps m2, [srcq+r5] add srcq, ssq pmaddubsw m4, [esp+0x20] pmaddubsw m5, [esp+0x30] pmaddubsw m3, [esp+0x40] pmaddubsw m2, [esp+0x50] phaddw m4, m5 phaddw m3, m2 mova m5, [esp+0x190] mova m2, [esp+0x160] phaddw m4, m3 mova m3, [esp+0x170] pmulhrsw m4, m12 ; 8a 8b mov myd, mym pshufb m5, m6 ; 4b 5b pshufd m6, m6, q1032 pshufb m2, m6 ; 3a 2a pshufb m3, m6 ; 3b 2b punpckhwd m0, m2 ; 12a punpckhwd m1, m3 ; 12b mova [esp+0x140], m0 mova [esp+0x150], m1 mova m0, [esp+0x1a0] mova m1, [esp+0x1b0] punpcklwd m2, m7 ; 34a punpcklwd m3, m5 ; 34b mova [esp+0x160], m2 mova [esp+0x170], m3 pshufb m0, m6 ; 7a 6a pshufb m1, m6 ; 7b 6b punpckhwd m7, m0 ; 56a punpckhwd m5, m1 ; 56b punpcklwd m0, m4 punpckhqdq m4, m4 punpcklwd m1, m4 mova [esp+0x180], m7 mova [esp+0x190], m5 mova [esp+0x1a0], m0 mova [esp+0x1b0], m1 mova m0, [esp+0x140] mova m1, [esp+0x150] jmp .vloop .skip_line: MC_8TAP_SCALED_H 0x20, 0x1c0, 0 mov myd, mym mova m0, [esp+0x160] mova m1, [esp+0x170] mova m2, [esp+0x180] mova m3, [esp+0x190] mova [esp+0x140], m0 mova [esp+0x150], m1 mova m4, [esp+0x1a0] mova m5, [esp+0x1b0] mova [esp+0x160], m2 mova [esp+0x170], m3 mova m6, [esp+0x1c0] mova m7, [esp+0x1d0] mova [esp+0x180], m4 mova [esp+0x190], m5 punpcklwd m4, m6, m7 punpckhwd m6, m7 mova [esp+0x1a0], m4 mova [esp+0x1b0], m6 %endif jmp .vloop INIT_XMM ssse3 .dy1: movzx wd, word [base+%1_8tap_scaled_ssse3_dy1_table+wq*2] add wq, base_reg jmp wq %ifidn %1, put .dy1_w2: %if ARCH_X86_64 mov myd, mym movzx t0d, t0b dec srcq movd m15, t0d %else %define m8 m0 %define m9 m1 %define m14 m4 %define m15 m3 movzx r5, byte [esp+0x1f0] dec srcd movd m15, r5 %endif punpckldq m9, m8 SWAP m8, m9 paddd m14, m8 ; mx+dx*[0-1] %if ARCH_X86_64 mova m11, [base+pd_0x4000] %else %define m11 [base+pd_0x4000] %endif pshufd m15, m15, q0000 pand m8, m14, m10 psrld m8, 6 paddd m15, m8 movd r4d, m15 psrldq m15, 4 %if ARCH_X86_64 movd r6d, m15 %else movd r3d, m15 %endif mova m5, [base+bdct_lb_dw] mova m6, [base+subpel_s_shuf2] movd m15, [base+subpel_filters+r4*8+2] %if ARCH_X86_64 movd m7, [base+subpel_filters+r6*8+2] %else movd m7, [base+subpel_filters+r3*8+2] %endif pxor m9, m9 pcmpeqd m8, m9 psrld m14, 10 %if ARCH_X86_32 mov r3, r3m pshufb m14, m5 paddb m14, m6 mova [esp+0x00], m14 %define m14 [esp+0x00] SWAP m5, m0 SWAP m6, m3 %define m8 m5 %define m15 m6 %endif movq m0, [srcq+ssq*0] movq m2, [srcq+ssq*2] movhps m0, [srcq+ssq*1] movhps m2, [srcq+ss3q ] lea srcq, [srcq+ssq*4] %if ARCH_X86_64 shr myd, 6 mov r4d, 64 << 24 lea myd, [t1+myq] cmovnz r4q, [base+subpel_filters+myq*8] pshufb m14, m5 paddb m14, m6 movq m10, r4 %else mov myd, mym mov r5, [esp+0x1f4] xor r3, r3 shr myd, 6 lea r5, [r5+myd] mov r4, 64 << 24 cmovnz r4, [base+subpel_filters+r5*8+0] cmovnz r3, [base+subpel_filters+r5*8+4] %define m10 m4 movd 
m10, r4 movd m3, r3 mov r3, r3m punpckldq m10, m3 %endif movq m1, [srcq+ssq*0] movq m3, [srcq+ssq*2] movhps m1, [srcq+ssq*1] add srcq, ss3q punpcklbw m10, m10 psraw m10, 8 punpckldq m15, m7 punpcklqdq m15, m15 %if ARCH_X86_64 pand m11, m8 %else pand m7, m11, m8 %define m11 m7 %endif pandn m8, m15 SWAP m15, m8 por m15, m11 %if ARCH_X86_64 pshufd m8, m10, q0000 pshufd m9, m10, q1111 pshufd m11, m10, q3333 pshufd m10, m10, q2222 %else mova [esp+0x10], m15 %define m15 [esp+0x10] mov r0, r0m pshufd m5, m4, q0000 pshufd m6, m4, q1111 pshufd m7, m4, q2222 pshufd m4, m4, q3333 %define m8 [esp+0x20] %define m9 [esp+0x30] %define m10 [esp+0x40] %define m11 [esp+0x50] mova m8, m5 mova m9, m6 mova m10, m7 mova m11, m4 %endif pshufb m0, m14 pshufb m2, m14 pshufb m1, m14 pshufb m3, m14 pmaddubsw m0, m15 pmaddubsw m2, m15 pmaddubsw m1, m15 pmaddubsw m3, m15 phaddw m0, m2 phaddw m1, m3 pmulhrsw m0, m12 pmulhrsw m1, m12 palignr m2, m1, m0, 4 pshufd m4, m1, q2121 punpcklwd m3, m0, m2 ; 01 12 punpckhwd m0, m2 ; 23 34 punpcklwd m2, m1, m4 ; 45 56 .dy1_w2_loop: movq m1, [srcq+ssq*0] movhps m1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] pmaddwd m5, m3, m8 pmaddwd m6, m0, m9 pmaddwd m7, m2, m10 mova m3, m0 mova m0, m2 paddd m5, m13 paddd m6, m7 pshufb m1, m14 pmaddubsw m1, m15 phaddw m1, m1 pmulhrsw m1, m12 palignr m7, m1, m4, 12 punpcklwd m2, m7, m1 ; 67 78 pmaddwd m7, m2, m11 mova m4, m1 paddd m5, m6 paddd m5, m7 psrad m5, rndshift packssdw m5, m5 packuswb m5, m5 movd r4d, m5 mov [dstq+dsq*0], r4w shr r4d, 16 mov [dstq+dsq*1], r4w lea dstq, [dstq+dsq*2] sub hd, 2 jg .dy1_w2_loop RET %endif INIT_XMM ssse3 .dy1_w4: %if ARCH_X86_64 mov myd, mym movzx t0d, t0b dec srcq movd m15, t0d %else %define m10 [base+pd_0x3ff] %define m11 [base+pd_0x4000] %define m8 m0 %xdefine m14 m4 %define m15 m3 %if isprep %define ssq r3 %endif movzx r4, byte [esp+0x1f0] dec srcq movd m15, r4 %endif pmaddwd m8, [base+rescale_mul] %if ARCH_X86_64 mova m11, [base+pd_0x4000] %endif pshufd m15, m15, q0000 paddd m14, m8 ; mx+dx*[0-3] pand m8, m14, m10 psrld m8, 6 paddd m15, m8 psrldq m7, m15, 8 %if ARCH_X86_64 movd r4d, m15 movd r11d, m7 psrldq m15, 4 psrldq m7, 4 movd r6d, m15 movd r13d, m7 movd m15, [base+subpel_filters+ r4*8+2] movd m2, [base+subpel_filters+r11*8+2] movd m3, [base+subpel_filters+ r6*8+2] movd m4, [base+subpel_filters+r13*8+2] shr myd, 6 mov r4d, 64 << 24 lea myd, [t1+myq] cmovnz r4q, [base+subpel_filters+myq*8] %else movd r1, m15 movd r3, m7 psrldq m15, 4 psrldq m7, 4 movd r4, m15 movd r5, m7 %define m15 m5 SWAP m4, m7 movd m15, [base+subpel_filters+r1*8+2] movd m2, [base+subpel_filters+r3*8+2] movd m3, [base+subpel_filters+r4*8+2] movd m4, [base+subpel_filters+r5*8+2] mov myd, mym mov rX, [esp+0x1f4] xor r5, r5 shr myd, 6 lea rX, [rX+myd] mov r4, 64 << 24 cmovnz r4, [base+subpel_filters+rX*8+0] cmovnz r5, [base+subpel_filters+rX*8+4] mov r3, r3m %if isprep lea ss3q, [ssq*3] %endif %endif punpckldq m15, m3 punpckldq m2, m4 punpcklqdq m15, m2 movq m6, [base+subpel_s_shuf2] %if ARCH_X86_64 pcmpeqd m8, m9 psrld m14, 10 pshufb m14, [base+bdct_lb_dw] movu m0, [srcq+ssq*0] movu m1, [srcq+ssq*1] movu m2, [srcq+ssq*2] movu m3, [srcq+ss3q ] lea srcq, [srcq+ssq*4] punpcklqdq m6, m6 movu m4, [srcq+ssq*0] movu m5, [srcq+ssq*1] movu m7, [srcq+ssq*2] add srcq, ss3q pand m11, m8 pandn m8, m15 SWAP m15, m8 por m15, m11 paddb m14, m6 movq m10, r4q punpcklbw m10, m10 psraw m10, 8 pshufb m0, m14 pshufb m1, m14 pshufb m2, m14 pshufb m3, m14 pshufb m4, m14 pshufb m5, m14 pshufb m7, m14 pmaddubsw m0, m15 pmaddubsw m1, m15 pmaddubsw m2, m15 pmaddubsw 
m3, m15 pmaddubsw m4, m15 pmaddubsw m5, m15 pmaddubsw m7, m15 phaddw m0, m1 phaddw m2, m3 phaddw m4, m5 phaddw m6, m7, m7 pmulhrsw m0, m12 ; 0 1 pmulhrsw m2, m12 ; 2 3 pmulhrsw m4, m12 ; 4 5 pmulhrsw m6, m12 ; 6 _ shufps m1, m0, m2, q1032 ; 1 2 shufps m3, m2, m4, q1032 ; 3 4 shufps m5, m4, m6, q1032 ; 5 6 punpcklwd m7, m0, m1 ; 01 punpckhwd m0, m1 ; 12 punpcklwd m8, m2, m3 ; 23 punpckhwd m2, m3 ; 34 punpcklwd m9, m4, m5 ; 45 punpckhwd m4, m5 ; 56 %else pxor m3, m3 pcmpeqd m8, m3 psrld m14, 10 pshufb m14, [base+bdct_lb_dw] movu m1, [srcq+ssq*0] movu m2, [srcq+ssq*1] movu m3, [srcq+ssq*2] add srcq, ss3q punpcklqdq m6, m6 SWAP m4, m7 pand m7, m11, m8 pandn m8, m15 SWAP m5, m0 por m15, m7 paddb m14, m6 movu m0, [srcq+ssq*0] movu m7, [srcq+ssq*1] movu m6, [srcq+ssq*2] pshufb m1, m14 pshufb m2, m14 pshufb m3, m14 pshufb m0, m14 pshufb m7, m14 pshufb m6, m14 pmaddubsw m1, m15 pmaddubsw m2, m15 pmaddubsw m3, m15 mova [esp+0x00], m14 mova [esp+0x10], m15 pmaddubsw m0, m15 pmaddubsw m7, m15 pmaddubsw m6, m15 phaddw m1, m2 movu m2, [srcq+ss3q ] lea srcq, [srcq+ssq*4] mov r0, r0m phaddw m3, m0 pshufb m2, m14 pmaddubsw m2, m15 %define m14 [esp+0x00] %define m15 [esp+0x10] phaddw m7, m6 phaddw m2, m2 movd m6, r4 movd m0, r5 punpckldq m6, m0 punpcklbw m6, m6 psraw m6, 8 mova [esp+0x20], m6 pmulhrsw m1, m12 ; 0 1 pmulhrsw m3, m12 ; 2 3 pmulhrsw m7, m12 ; 4 5 pmulhrsw m2, m12 ; 6 _ shufps m0, m1, m3, q1032 ; 1 2 shufps m4, m3, m7, q1032 ; 3 4 shufps m5, m7, m2, q1032 ; 5 6 punpcklwd m6, m1, m0 ; 01 punpckhwd m1, m0 ; 12 mova [esp+0x30], m1 punpcklwd m1, m3, m4 ; 23 punpckhwd m3, m4 ; 34 mova [esp+0x40], m3 punpcklwd m3, m7, m5 ; 45 punpckhwd m7, m5 ; 56 mova [esp+0x50], m7 mova [esp+0x60], m2 mova m0, [esp+0x20] %xdefine m8 m1 %xdefine m9 m3 %xdefine m10 m0 SWAP m7, m6 SWAP m1, m4 SWAP m3, m2 %endif pshufd m1, m10, q0000 pshufd m3, m10, q1111 pshufd m5, m10, q2222 pshufd m10, m10, q3333 %if ARCH_X86_64 mova [rsp+0x00], m8 mova [rsp+0x10], m2 mova [rsp+0x20], m9 mova [rsp+0x30], m4 %else mova [esp+0x70], m8 mova [esp+0x80], m9 mova [esp+0x90], m1 mova [esp+0xa0], m3 mova [esp+0xb0], m5 mova [esp+0xc0], m10 %ifidn %1, put mov dsd, dsm %endif %define m11 m6 %endif .dy1_w4_loop: %if ARCH_X86_64 movu m11, [srcq+ssq*0] pmaddwd m7, m1 pmaddwd m8, m3 pmaddwd m0, m1 pmaddwd m2, m3 pmaddwd m9, m5 pmaddwd m4, m5 paddd m7, m8 paddd m0, m2 movu m8, [srcq+ssq*1] lea srcq, [srcq+ssq*2] pshufb m11, m14 pmaddubsw m11, m15 paddd m7, m13 paddd m0, m13 paddd m7, m9 paddd m0, m4 pshufb m8, m14 pmaddubsw m8, m15 phaddw m11, m8 mova m8, [rsp+0x20] pmulhrsw m11, m12 punpcklwd m9, m6, m11 ; 67 psrldq m6, m11, 8 punpcklwd m4, m11, m6 ; 78 pmaddwd m2, m9, m10 pmaddwd m11, m4, m10 paddd m7, m2 mova m2, [rsp+0x30] paddd m0, m11 %else SWAP m7, m6 SWAP m1, m4 SWAP m3, m2 movu m5, [srcq+ssq*0] mova m0, [esp+0x30] mova m2, [esp+0x40] mova m4, [esp+0x50] pmaddwd m6, [esp+0x90] pmaddwd m1, [esp+0xa0] pmaddwd m0, [esp+0x90] pmaddwd m2, [esp+0xa0] pmaddwd m3, [esp+0xb0] pmaddwd m4, [esp+0xb0] paddd m6, m1 paddd m0, m2 movu m7, [srcq+ssq*1] lea srcq, [srcq+ssq*2] pshufb m5, m14 pmaddubsw m5, m15 paddd m6, m13 paddd m0, m13 paddd m6, m3 paddd m0, m4 pshufb m7, m14 pmaddubsw m7, m15 phaddw m5, m7 mova m7, [rsp+0x80] pmulhrsw m5, m12 punpcklwd m3, [esp+0x60], m5 ; 67 psrldq m1, m5, 8 punpcklwd m4, m5, m1 ; 78 pmaddwd m2, m3, [esp+0xc0] pmaddwd m5, m4, [esp+0xc0] mova [esp+0x60], m1 paddd m6, m2 mova m2, [esp+0x50] paddd m0, m5 SWAP m7, m6 %endif psrad m7, rndshift psrad m0, rndshift packssdw m7, m0 %if ARCH_X86_64 mova m0, [rsp+0x10] %else mova 
m0, [esp+0x40] %define m11 m5 %endif %ifidn %1, put packuswb m7, m7 psrldq m11, m7, 4 movd [dstq+dsq*0], m7 movd [dstq+dsq*1], m11 lea dstq, [dstq+dsq*2] %else mova [tmpq], m7 add tmpq, 16 %endif sub hd, 2 jz .ret %if ARCH_X86_64 mova m7, [rsp+0x00] mova [rsp+0x00], m8 mova [rsp+0x10], m2 mova [rsp+0x20], m9 mova [rsp+0x30], m4 %else mova m7, [esp+0x70] ; 01 mova m1, [esp+0x80] ; 23 mova m2, [esp+0x50] ; 34 mova [esp+0x30], m0 mova [esp+0x70], m1 mova [esp+0x40], m2 mova [esp+0x80], m3 mova [esp+0x50], m4 %endif jmp .dy1_w4_loop INIT_XMM ssse3 .dy1_w8: mov dword [rsp+0x90], 1 movifprep tmp_stridem, 16 jmp .dy1_w_start .dy1_w16: mov dword [rsp+0x90], 2 movifprep tmp_stridem, 32 jmp .dy1_w_start .dy1_w32: mov dword [rsp+0x90], 4 movifprep tmp_stridem, 64 jmp .dy1_w_start .dy1_w64: mov dword [rsp+0x90], 8 movifprep tmp_stridem, 128 jmp .dy1_w_start .dy1_w128: mov dword [rsp+0x90], 16 movifprep tmp_stridem, 256 .dy1_w_start: mov myd, mym %ifidn %1, put movifnidn dsm, dsq %endif %if ARCH_X86_64 shr t0d, 16 sub srcq, 3 shr myd, 6 mov r4d, 64 << 24 lea myd, [t1+myq] cmovnz r4q, [base+subpel_filters+myq*8] movd m15, t0d %else %define m8 m0 %define m9 m1 %xdefine m14 m4 %xdefine m15 m3 %if isprep %define ssq ssm %endif mov r5, [esp+0x1f0] mov r3, [esp+0x1f4] shr r5, 16 sub srcq, 3 movd m15, r5 xor r5, r5 shr myd, 6 lea r3, [r3+myd] mov r4, 64 << 24 cmovnz r4, [base+subpel_filters+r3*8+0] cmovnz r5, [base+subpel_filters+r3*8+4] mov r0, r0m mov r3, r3m %endif pslld m7, m8, 2 ; dx*4 pmaddwd m8, [base+rescale_mul] ; dx*[0-3] pshufd m15, m15, q0000 paddd m14, m8 ; mx+dx*[0-3] %if ARCH_X86_64 movq m3, r4q punpcklbw m3, m3 psraw m3, 8 %else movd m5, r4 movd m6, r5 punpckldq m5, m6 punpcklbw m5, m5 psraw m5, 8 SWAP m3, m5 %endif mova [rsp+0x100], m7 mova [rsp+0x120], m15 mov [rsp+0x098], srcq mov [rsp+0x130], r0q ; dstq / tmpq pshufd m0, m3, q0000 pshufd m1, m3, q1111 pshufd m2, m3, q2222 pshufd m3, m3, q3333 mova [rsp+0x140], m0 mova [rsp+0x150], m1 mova [rsp+0x160], m2 mova [rsp+0x170], m3 %if ARCH_X86_64 && UNIX64 mov hm, hd %elif ARCH_X86_32 SWAP m5, m3 mov r5, hm mov [esp+0x134], r5 %endif jmp .dy1_hloop .dy1_hloop_prep: dec dword [rsp+0x090] jz .ret %if ARCH_X86_64 add qword [rsp+0x130], 8*(isprep+1) mov hd, hm %else add dword [rsp+0x130], 8*(isprep+1) mov r5, [esp+0x134] mov r0, [esp+0x130] %endif mova m7, [rsp+0x100] mova m14, [rsp+0x110] %if ARCH_X86_64 mova m10, [base+pd_0x3ff] %else %define m10 [base+pd_0x3ff] %endif mova m15, [rsp+0x120] mov srcq, [rsp+0x098] %if ARCH_X86_64 mov r0q, [rsp+0x130] ; dstq / tmpq %else mov hm, r5 mov r0m, r0 mov r3, r3m %endif paddd m14, m7 .dy1_hloop: pxor m9, m9 %if ARCH_X86_64 mova m11, [base+pq_0x40000000] %else %define m11 [base+pq_0x40000000] %endif psrld m2, m14, 10 mova [rsp], m2 pand m6, m14, m10 psrld m6, 6 paddd m5, m15, m6 pcmpeqd m6, m9 psrldq m2, m5, 8 %if ARCH_X86_64 movd r4d, m5 movd r6d, m2 psrldq m5, 4 psrldq m2, 4 movd r7d, m5 movd r9d, m2 movq m0, [base+subpel_filters+r4*8] movq m1, [base+subpel_filters+r6*8] movhps m0, [base+subpel_filters+r7*8] movhps m1, [base+subpel_filters+r9*8] %else movd r0, m5 movd rX, m2 psrldq m5, 4 psrldq m2, 4 movd r4, m5 movd r5, m2 movq m0, [base+subpel_filters+r0*8] movq m1, [base+subpel_filters+rX*8] movhps m0, [base+subpel_filters+r4*8] movhps m1, [base+subpel_filters+r5*8] pxor m2, m2 %define m9 m2 %endif paddd m14, m7 ; mx+dx*[4-7] pand m5, m14, m10 psrld m5, 6 paddd m15, m5 pcmpeqd m5, m9 mova [rsp+0x110], m14 psrldq m4, m15, 8 %if ARCH_X86_64 movd r10d, m15 movd r11d, m4 psrldq m15, 4 psrldq m4, 4 movd r13d, 
m15 movd rXd, m4 movq m2, [base+subpel_filters+r10*8] movq m3, [base+subpel_filters+r11*8] movhps m2, [base+subpel_filters+r13*8] movhps m3, [base+subpel_filters+ rX*8] psrld m14, 10 psrldq m4, m14, 8 movd r10d, m14 movd r11d, m4 psrldq m14, 4 psrldq m4, 4 movd r13d, m14 movd rXd, m4 mov r4d, [rsp+ 0] mov r6d, [rsp+ 8] mov r7d, [rsp+ 4] mov r9d, [rsp+12] pshufd m4, m6, q1100 pshufd m6, m6, q3322 pshufd m7, m5, q1100 pshufd m5, m5, q3322 pand m8, m11, m4 pand m9, m11, m6 pand m15, m11, m7 pand m11, m11, m5 pandn m4, m0 pandn m6, m1 pandn m7, m2 pandn m5, m3 por m8, m4 por m9, m6 por m15, m7 por m11, m5 mova [rsp+0x10], m8 mova [rsp+0x20], m9 mova [rsp+0x30], m15 mova [rsp+0x40], m11 MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 10, 8, 9, 15, 11 ; 0-1 mova [rsp+0x50], m1 mova [rsp+0x60], m2 MC_8TAP_SCALED_H 3, 4, 5, 6, 1, 2, 7, 10, 8, 9, 15, 11 ; 2-3 mova [rsp+0x70], m3 mova [rsp+0x80], m4 MC_8TAP_SCALED_H 5, 6, 1, 2, 3, 4, 7, 10, 8, 9, 15, 11 ; 4-5 MC_8TAP_SCALED_H 0,14, 1, 2, 3, 4, 7, 10, 8, 9, 15, 11 ; 6-7 SWAP m7, m0 SWAP m8, m14 mova m1, [rsp+0x50] mova m2, [rsp+0x60] mova m3, [rsp+0x70] mova m15, [rsp+0x80] punpcklwd m4, m5, m6 ; 45a punpckhwd m5, m6 ; 45b punpcklwd m6, m7, m8 ; 67a punpckhwd m7, m8 ; 67b SWAP m14, m8 mova m8, [rsp+0x140] mova m9, [rsp+0x150] mova m10, [rsp+0x160] mova m11, [rsp+0x170] punpcklwd m0, m1, m2 ; 01a punpckhwd m1, m2 ; 01b punpcklwd m2, m3, m15; 23a punpckhwd m3, m15 ; 23b mova [rsp+0x50], m4 mova [rsp+0x60], m5 mova [rsp+0x70], m6 mova [rsp+0x80], m7 mova m14, [base+unpckw] %else movd r0, m15 movd rX, m4 psrldq m15, 4 psrldq m4, 4 movd r4, m15 movd r5, m4 mova m14, [esp+0x110] movq m2, [base+subpel_filters+r0*8] movq m3, [base+subpel_filters+rX*8] movhps m2, [base+subpel_filters+r4*8] movhps m3, [base+subpel_filters+r5*8] psrld m14, 10 mova [esp+16], m14 mov r0, [esp+ 0] mov rX, [esp+ 8] mov r4, [esp+ 4] mov r5, [esp+12] mova [esp+0x20], m0 mova [esp+0x30], m1 mova [esp+0x40], m2 mova [esp+0x50], m3 pshufd m4, m6, q1100 pshufd m6, m6, q3322 pshufd m7, m5, q1100 pshufd m5, m5, q3322 pand m0, m11, m4 pand m1, m11, m6 pand m2, m11, m7 pand m3, m11, m5 pandn m4, [esp+0x20] pandn m6, [esp+0x30] pandn m7, [esp+0x40] pandn m5, [esp+0x50] por m0, m4 por m1, m6 por m2, m7 por m3, m5 mova [esp+0x20], m0 mova [esp+0x30], m1 mova [esp+0x40], m2 mova [esp+0x50], m3 MC_8TAP_SCALED_H 0x20, 0x60, 0 ; 0-1 MC_8TAP_SCALED_H 0x20, 0x180 ; 2-3 MC_8TAP_SCALED_H 0x20, 0x1a0 ; 4-5 MC_8TAP_SCALED_H 0x20, 0x1c0 ; 6-7 mova m5, [esp+0x1a0] mova m6, [esp+0x1b0] mova m7, [esp+0x1c0] mova m0, [esp+0x1d0] punpcklwd m4, m5, m6 ; 45a punpckhwd m5, m6 ; 45b punpcklwd m6, m7, m0 ; 67a punpckhwd m7, m0 ; 67b mova [esp+0x1a0], m4 mova [esp+0x1b0], m5 mova [esp+0x1c0], m6 mova [esp+0x1d0], m7 mova m1, [esp+0x060] mova m2, [esp+0x070] mova m3, [esp+0x180] mova m4, [esp+0x190] punpcklwd m0, m1, m2 ; 01a punpckhwd m1, m2 ; 01b punpcklwd m2, m3, m4 ; 23a punpckhwd m3, m4 ; 23b mova [esp+0x060], m0 mova [esp+0x070], m1 mova [esp+0x180], m2 mova [esp+0x190], m3 %define m8 [esp+0x140] %define m9 [esp+0x150] %define m10 [esp+0x160] %define m11 [esp+0x170] %endif .dy1_vloop: %if ARCH_X86_32 mov r0, r0m %endif pmaddwd m4, m0, m8 pmaddwd m5, m1, m8 pmaddwd m6, m2, m9 pmaddwd m7, m3, m9 paddd m4, m13 paddd m5, m13 paddd m4, m6 paddd m5, m7 %if ARCH_X86_64 pmaddwd m6, [rsp+0x50], m10 pmaddwd m7, [rsp+0x60], m10 %else pmaddwd m6, [rsp+0x1a0], m10 pmaddwd m7, [rsp+0x1b0], m10 %endif paddd m4, m6 paddd m5, m7 %if ARCH_X86_64 pmaddwd m6, [rsp+0x70], m11 pmaddwd m7, [rsp+0x80], m11 %else pmaddwd m6, [rsp+0x1c0], m11 
pmaddwd m7, [rsp+0x1d0], m11 %endif paddd m4, m6 paddd m5, m7 psrad m4, rndshift psrad m5, rndshift packssdw m4, m5 %ifidn %1, put packuswb m4, m4 movq [dstq], m4 add dstq, dsm %else mova [tmpq], m4 add tmpq, tmp_stridem %endif %if ARCH_X86_32 mov r0m, r0 %endif dec hd jz .dy1_hloop_prep %if ARCH_X86_64 movq m4, [srcq+ r4] movq m5, [srcq+ r6] movhps m4, [srcq+ r7] movhps m5, [srcq+ r9] movq m6, [srcq+r10] movq m7, [srcq+r11] movhps m6, [srcq+r13] movhps m7, [srcq+ rX] add srcq, ssq pshufd m15, m14, q1032 pshufb m0, m14 ; 0a 1a pshufb m1, m14 ; 0b 1b pshufb m2, m15 ; 3a 2a pshufb m3, m15 ; 3b 2b pmaddubsw m4, [rsp+0x10] pmaddubsw m5, [rsp+0x20] pmaddubsw m6, [rsp+0x30] pmaddubsw m7, [rsp+0x40] phaddw m4, m5 phaddw m6, m7 phaddw m4, m6 pmulhrsw m4, m12 pshufb m5, [rsp+0x70], m15 ; 7a 6a pshufb m7, [rsp+0x80], m15 ; 7b 6b pshufb m6, [rsp+0x50], m14 ; 4a 5a pshufb m15, [rsp+0x60], m14 ; 4b 5b punpckhwd m0, m2 ; 12a punpckhwd m1, m3 ; 12b punpcklwd m2, m6 ; 34a punpcklwd m3, m15 ; 34b punpckhwd m6, m5 ; 56a punpckhwd m15, m7 ; 56b punpcklwd m5, m4 ; 78a psrldq m4, 8 punpcklwd m7, m4 ; 78b mova [rsp+0x50], m6 mova [rsp+0x60], m15 mova [rsp+0x70], m5 mova [rsp+0x80], m7 %else mov r0, [esp+ 0] mov rX, [esp+ 8] mov r4, [esp+ 4] mov r5, [esp+12] mova m6, [base+unpckw] mova m0, [esp+0x060] mova m1, [esp+0x070] mova m7, [esp+0x1a0] movq m4, [srcq+r0] movq m5, [srcq+rX] movhps m4, [srcq+r4] movhps m5, [srcq+r5] pshufb m0, m6 ; 0a 1a pshufb m1, m6 ; 0b 1b pshufb m7, m6 ; 4a 5a mov r0, [esp+16] mov rX, [esp+24] mov r4, [esp+20] mov r5, [esp+28] movq m3, [srcq+r0] movq m2, [srcq+rX] movhps m3, [srcq+r4] movhps m2, [srcq+r5] add srcq, ssq pmaddubsw m4, [esp+0x20] pmaddubsw m5, [esp+0x30] pmaddubsw m3, [esp+0x40] pmaddubsw m2, [esp+0x50] phaddw m4, m5 phaddw m3, m2 mova m5, [esp+0x1b0] mova m2, [esp+0x180] phaddw m4, m3 mova m3, [esp+0x190] pmulhrsw m4, m12 ; 8a 8b pshufb m5, m6 ; 4b 5b pshufd m6, m6, q1032 pshufb m2, m6 ; 3a 2a pshufb m3, m6 ; 3b 2b punpckhwd m0, m2 ; 12a punpckhwd m1, m3 ; 12b mova [esp+0x60], m0 mova [esp+0x70], m1 mova m0, [esp+0x1c0] mova m1, [esp+0x1d0] punpcklwd m2, m7 ; 34a punpcklwd m3, m5 ; 34b mova [esp+0x180], m2 mova [esp+0x190], m3 pshufb m0, m6 ; 7a 6a pshufb m1, m6 ; 7b 6b punpckhwd m7, m0 ; 56a punpckhwd m5, m1 ; 56b punpcklwd m0, m4 punpckhqdq m4, m4 punpcklwd m1, m4 mova [esp+0x1a0], m7 mova [esp+0x1b0], m5 mova [esp+0x1c0], m0 mova [esp+0x1d0], m1 mova m0, [esp+0x60] mova m1, [esp+0x70] %endif jmp .dy1_vloop INIT_XMM ssse3 .dy2: movzx wd, word [base+%1_8tap_scaled_ssse3_dy2_table+wq*2] add wq, base_reg jmp wq %ifidn %1, put .dy2_w2: %if ARCH_X86_64 mov myd, mym movzx t0d, t0b dec srcq movd m15, t0d %else %define m10 [base+pd_0x3ff] %define m11 [base+pd_0x4000] %define m8 m0 %define m9 m1 %define m14 m4 %define m15 m3 movzx r5, byte [esp+0x1f0] dec srcd movd m15, r5 %endif punpckldq m9, m8 SWAP m8, m9 paddd m14, m8 ; mx+dx*[0-1] %if ARCH_X86_64 mova m11, [base+pd_0x4000] %endif pshufd m15, m15, q0000 pand m8, m14, m10 psrld m8, 6 paddd m15, m8 movd r4d, m15 psrldq m15, 4 %if ARCH_X86_64 movd r6d, m15 %else movd r3d, m15 %endif mova m5, [base+bdct_lb_dw] mova m6, [base+subpel_s_shuf2] movd m15, [base+subpel_filters+r4*8+2] %if ARCH_X86_64 movd m7, [base+subpel_filters+r6*8+2] %else movd m7, [base+subpel_filters+r3*8+2] %endif pxor m9, m9 pcmpeqd m8, m9 psrld m14, 10 %if ARCH_X86_32 mov r3, r3m pshufb m14, m5 paddb m14, m6 mova [esp+0x00], m14 %define m14 [esp+0x00] SWAP m5, m0 SWAP m6, m3 %define m8 m5 %define m15 m6 %endif movq m0, [srcq+ssq*0] movq m1, [srcq+ssq*1] 
movhps m0, [srcq+ssq*2] movhps m1, [srcq+ss3q ] lea srcq, [srcq+ssq*4] %if ARCH_X86_64 shr myd, 6 mov r4d, 64 << 24 lea myd, [t1+myq] cmovnz r4q, [base+subpel_filters+myq*8] pshufb m14, m5 paddb m14, m6 movq m10, r4q %else mov myd, mym mov r3, [esp+0x1f4] xor r5, r5 shr myd, 6 lea r3, [r3+myd] mov r4, 64 << 24 cmovnz r4, [base+subpel_filters+r3*8+0] cmovnz r5, [base+subpel_filters+r3*8+4] mov r3, r3m %define m10 m4 movd m10, r4 movd m3, r5 punpckldq m10, m3 %endif movq m3, [srcq+ssq*0] movhps m3, [srcq+ssq*1] lea srcq, [srcq+ssq*2] punpcklbw m10, m10 psraw m10, 8 punpckldq m15, m7 punpcklqdq m15, m15 %if ARCH_X86_64 pand m11, m8 %else pand m7, m11, m8 %define m11 m7 %endif pandn m8, m15 SWAP m15, m8 por m15, m11 %if ARCH_X86_64 pshufd m8, m10, q0000 pshufd m9, m10, q1111 pshufd m11, m10, q3333 pshufd m10, m10, q2222 %else mova [esp+0x10], m15 %define m15 [esp+0x10] mov r5, r0m %define dstq r5 mov dsd, dsm pshufd m5, m4, q0000 pshufd m6, m4, q1111 pshufd m7, m4, q2222 pshufd m4, m4, q3333 %define m8 [esp+0x20] %define m9 [esp+0x30] %define m10 [esp+0x40] %define m11 [esp+0x50] mova m8, m5 mova m9, m6 mova m10, m7 mova m11, m4 %endif pshufb m0, m14 pshufb m1, m14 pshufb m3, m14 pmaddubsw m0, m15 pmaddubsw m1, m15 pmaddubsw m3, m15 pslldq m2, m3, 8 phaddw m0, m2 phaddw m1, m3 pmulhrsw m0, m12 ; 0 2 _ 4 pmulhrsw m1, m12 ; 1 3 _ 5 pshufd m2, m0, q3110 ; 0 2 2 4 pshufd m1, m1, q3110 ; 1 3 3 5 punpcklwd m3, m2, m1 ; 01 23 punpckhwd m2, m1 ; 23 45 .dy2_w2_loop: movq m6, [srcq+ssq*0] movq m7, [srcq+ssq*1] movhps m6, [srcq+ssq*2] movhps m7, [srcq+ss3q ] lea srcq, [srcq+ssq*4] pmaddwd m4, m3, m8 pmaddwd m5, m2, m9 pshufb m6, m14 pshufb m7, m14 pmaddubsw m6, m15 pmaddubsw m7, m15 phaddw m6, m7 pmulhrsw m6, m12 psrldq m7, m6, 8 palignr m6, m0, 8 palignr m7, m1, 8 mova m0, m6 mova m1, m7 pshufd m6, m6, q3221 pshufd m7, m7, q3221 punpcklwd m3, m6, m7 ; 45 67 punpckhwd m2, m6, m7 ; 67 89 pmaddwd m6, m3, m10 pmaddwd m7, m2, m11 paddd m4, m5 paddd m4, m13 paddd m6, m7 paddd m4, m6 psrad m4, rndshift packssdw m4, m4 packuswb m4, m4 movd r4d, m4 mov [dstq+dsq*0], r4w shr r4d, 16 mov [dstq+dsq*1], r4w lea dstq, [dstq+dsq*2] sub hd, 2 jg .dy2_w2_loop RET %endif INIT_XMM ssse3 .dy2_w4: %if ARCH_X86_64 mov myd, mym movzx t0d, t0b dec srcq movd m15, t0d %else %define m10 [base+pd_0x3ff] %define m11 [base+pd_0x4000] %define m8 m0 %xdefine m14 m4 %define m15 m3 %define dstq r0 %if isprep %define ssq r3 %endif movzx r4, byte [esp+0x1f0] dec srcq movd m15, r4 %endif pmaddwd m8, [base+rescale_mul] %if ARCH_X86_64 mova m11, [base+pd_0x4000] %endif pshufd m15, m15, q0000 paddd m14, m8 ; mx+dx*[0-3] pand m8, m14, m10 psrld m8, 6 paddd m15, m8 psrldq m7, m15, 8 %if ARCH_X86_64 movd r4d, m15 movd r11d, m7 psrldq m15, 4 psrldq m7, 4 movd r6d, m15 movd r13d, m7 movd m15, [base+subpel_filters+ r4*8+2] movd m2, [base+subpel_filters+r11*8+2] movd m3, [base+subpel_filters+ r6*8+2] movd m4, [base+subpel_filters+r13*8+2] movq m6, [base+subpel_s_shuf2] shr myd, 6 mov r4d, 64 << 24 lea myd, [t1+myq] cmovnz r4q, [base+subpel_filters+myq*8] %else movd r1, m15 movd r3, m7 psrldq m15, 4 psrldq m7, 4 movd r4, m15 movd r5, m7 %define m15 m5 SWAP m4, m7 movd m15, [base+subpel_filters+r1*8+2] movd m2, [base+subpel_filters+r3*8+2] movd m3, [base+subpel_filters+r4*8+2] movd m4, [base+subpel_filters+r5*8+2] movq m6, [base+subpel_s_shuf2] mov myd, mym mov r3, [esp+0x1f4] xor r5, r5 shr myd, 6 lea r3, [r3+myd] mov r4, 64 << 24 cmovnz r4, [base+subpel_filters+r3*8+0] cmovnz r5, [base+subpel_filters+r3*8+4] mov r3, r3m %if isprep lea ss3q, [ssq*3] 
%endif %endif punpckldq m15, m3 punpckldq m2, m4 punpcklqdq m15, m2 %if ARCH_X86_64 pcmpeqd m8, m9 psrld m14, 10 movu m0, [srcq+ssq*0] movu m2, [srcq+ssq*2] movu m1, [srcq+ssq*1] movu m3, [srcq+ss3q ] lea srcq, [srcq+ssq*4] punpcklqdq m6, m6 pshufb m14, [base+bdct_lb_dw] movu m4, [srcq+ssq*0] movu m5, [srcq+ssq*1] lea srcq, [srcq+ssq*2] pand m11, m8 pandn m8, m15 SWAP m15, m8 por m15, m11 paddb m14, m6 movq m11, r4q punpcklbw m11, m11 psraw m11, 8 pshufb m0, m14 pshufb m2, m14 pshufb m1, m14 pshufb m3, m14 pshufb m4, m14 pshufb m5, m14 pmaddubsw m0, m15 pmaddubsw m2, m15 pmaddubsw m1, m15 pmaddubsw m3, m15 pmaddubsw m4, m15 pmaddubsw m5, m15 phaddw m0, m2 phaddw m1, m3 phaddw m4, m5 pmulhrsw m0, m12 ; 0 2 pmulhrsw m1, m12 ; 1 3 pmulhrsw m4, m12 ; 4 5 pshufd m8, m11, q0000 pshufd m9, m11, q1111 pshufd m10, m11, q2222 pshufd m11, m11, q3333 %else pxor m3, m3 pcmpeqd m8, m3 psrld m14, 10 pshufb m14, [base+bdct_lb_dw] movu m1, [srcq+ssq*0] movu m2, [srcq+ssq*2] movu m3, [srcq+ssq*1] add srcq, ss3q punpcklqdq m6, m6 SWAP m4, m7 pand m7, m11, m8 pandn m8, m15 SWAP m15, m8 por m15, m7 paddb m14, m6 movu m0, [srcq+ssq*0] movu m7, [srcq+ssq*1] movu m6, [srcq+ssq*2] add srcq, ss3q pshufb m1, m14 pshufb m2, m14 pshufb m3, m14 pshufb m0, m14 pshufb m7, m14 pshufb m6, m14 pmaddubsw m1, m15 pmaddubsw m2, m15 pmaddubsw m3, m15 mova [esp+0x00], m14 mova [esp+0x10], m15 pmaddubsw m0, m15 pmaddubsw m7, m15 pmaddubsw m6, m15 %define m14 [esp+0x00] %define m15 [esp+0x10] phaddw m1, m2 phaddw m3, m0 phaddw m7, m6 %ifidn %1, put mov dsd, dsm %define dstq r5 %else %define tmpq r5 %endif movd m6, r4 movd m0, r5 punpckldq m6, m0 punpcklbw m6, m6 psraw m6, 8 mov r5, r0m pmulhrsw m1, m12 ; 0 2 pmulhrsw m3, m12 ; 1 3 pmulhrsw m7, m12 ; 4 5 SWAP m0, m1, m3 SWAP m4, m7 pshufd m2, m6, q0000 pshufd m3, m6, q1111 pshufd m7, m6, q2222 pshufd m6, m6, q3333 mova [esp+0x30], m2 mova [esp+0x40], m3 mova [esp+0x50], m7 mova [esp+0x60], m6 %define m8 [esp+0x30] %define m9 [esp+0x40] %define m10 [esp+0x50] %define m11 [esp+0x60] %endif psrldq m5, m4, 8 ; 5 _ punpckhwd m2, m0, m1 ; 23 punpcklwd m0, m1 ; 01 punpcklwd m4, m5 ; 45 .dy2_w4_loop: pmaddwd m0, m8 ; a0 pmaddwd m5, m2, m8 ; b0 pmaddwd m2, m9 ; a1 pmaddwd m7, m4, m9 ; b1 pmaddwd m3, m4, m10 ; a2 paddd m0, m13 paddd m5, m13 paddd m0, m2 paddd m5, m7 paddd m0, m3 movu m6, [srcq+ssq*0] movu m7, [srcq+ssq*1] movu m3, [srcq+ssq*2] movu m1, [srcq+ss3q ] lea srcq, [srcq+ssq*4] pshufb m6, m14 pshufb m7, m14 pshufb m3, m14 pshufb m1, m14 pmaddubsw m6, m15 pmaddubsw m7, m15 pmaddubsw m3, m15 pmaddubsw m1, m15 phaddw m6, m7 phaddw m3, m1 pmulhrsw m6, m12 ; 6 7 pmulhrsw m3, m12 ; 8 9 psrldq m7, m6, 8 psrldq m1, m3, 8 punpcklwd m6, m7 ; 67 punpcklwd m3, m1 ; 89 mova m2, m6 pmaddwd m1, m6, m10 ; b2 pmaddwd m6, m11 ; a3 pmaddwd m7, m3, m11 ; b3 paddd m5, m1 paddd m0, m6 paddd m5, m7 psrad m0, rndshift psrad m5, rndshift packssdw m0, m5 %ifidn %1, put packuswb m0, m0 psrldq m1, m0, 4 movd [dstq+dsq*0], m0 movd [dstq+dsq*1], m1 lea dstq, [dstq+dsq*2] %else mova [tmpq], m0 add tmpq, 16 %endif mova m0, m4 mova m4, m3 sub hd, 2 jg .dy2_w4_loop MC_8TAP_SCALED_RET INIT_XMM ssse3 .dy2_w8: mov dword [rsp+0x90], 1 movifprep tmp_stridem, 16 jmp .dy2_w_start .dy2_w16: mov dword [rsp+0x90], 2 movifprep tmp_stridem, 32 jmp .dy2_w_start .dy2_w32: mov dword [rsp+0x90], 4 movifprep tmp_stridem, 64 jmp .dy2_w_start .dy2_w64: mov dword [rsp+0x90], 8 movifprep tmp_stridem, 128 jmp .dy2_w_start .dy2_w128: mov dword [rsp+0x90], 16 movifprep tmp_stridem, 256 .dy2_w_start: mov myd, mym %ifidn %1, put movifnidn 
dsm, dsq %endif %if ARCH_X86_64 shr t0d, 16 sub srcq, 3 shr myd, 6 mov r4d, 64 << 24 lea myd, [t1+myq] cmovnz r4q, [base+subpel_filters+myq*8] movd m15, t0d %else %define m10 [base+pd_0x3ff] %define m11 [base+pd_0x4000] %define m8 m0 %define m9 m1 %xdefine m14 m4 %xdefine m15 m3 %if isprep %define tmpq r0 %define ssq ssm %else %define dstq r0 %endif mov r5, [esp+0x1f0] mov r3, [esp+0x1f4] shr r5, 16 sub srcq, 3 movd m15, r5 xor r5, r5 shr myd, 6 lea r3, [r3+myd] mov r4, 64 << 24 cmovnz r4, [base+subpel_filters+r3*8+0] cmovnz r5, [base+subpel_filters+r3*8+4] mov r0, r0m mov r3, r3m %endif pslld m7, m8, 2 ; dx*4 pmaddwd m8, [base+rescale_mul] ; dx*[0-3] pshufd m15, m15, q0000 paddd m14, m8 ; mx+dx*[0-3] %if ARCH_X86_64 movq m3, r4q punpcklbw m3, m3 psraw m3, 8 %else movd m5, r4 movd m6, r5 punpckldq m5, m6 punpcklbw m5, m5 psraw m5, 8 SWAP m3, m5 %endif mova [rsp+0x100], m7 mova [rsp+0x120], m15 mov [rsp+0x098], srcq mov [rsp+0x130], r0q ; dstq / tmpq pshufd m0, m3, q0000 pshufd m1, m3, q1111 pshufd m2, m3, q2222 pshufd m3, m3, q3333 mova [rsp+0x140], m0 mova [rsp+0x150], m1 mova [rsp+0x160], m2 mova [rsp+0x170], m3 %if ARCH_X86_64 && UNIX64 mov hm, hd %elif ARCH_X86_32 SWAP m5, m3 mov r5, hm mov [esp+0x134], r5 %endif jmp .dy2_hloop .dy2_hloop_prep: dec dword [rsp+0x090] jz .ret %if ARCH_X86_64 add qword [rsp+0x130], 8*(isprep+1) mov hd, hm %else add dword [rsp+0x130], 8*(isprep+1) mov r5, [esp+0x134] mov r0, [esp+0x130] %endif mova m7, [rsp+0x100] mova m14, [rsp+0x110] %if ARCH_X86_64 mova m10, [base+pd_0x3ff] %else %define m10 [base+pd_0x3ff] %endif mova m15, [rsp+0x120] mov srcq, [rsp+0x098] %if ARCH_X86_64 mov r0q, [rsp+0x130] ; dstq / tmpq %else mov hm, r5 mov r0m, r0 mov r3, r3m %endif paddd m14, m7 .dy2_hloop: pxor m9, m9 %if ARCH_X86_64 mova m11, [base+pq_0x40000000] %else %define m11 [base+pq_0x40000000] %endif psrld m2, m14, 10 mova [rsp], m2 pand m6, m14, m10 psrld m6, 6 paddd m5, m15, m6 pcmpeqd m6, m9 psrldq m2, m5, 8 %if ARCH_X86_64 movd r4d, m5 movd r6d, m2 psrldq m5, 4 psrldq m2, 4 movd r7d, m5 movd r9d, m2 movq m0, [base+subpel_filters+r4*8] movq m1, [base+subpel_filters+r6*8] movhps m0, [base+subpel_filters+r7*8] movhps m1, [base+subpel_filters+r9*8] %else movd r0, m5 movd rX, m2 psrldq m5, 4 psrldq m2, 4 movd r4, m5 movd r5, m2 movq m0, [base+subpel_filters+r0*8] movq m1, [base+subpel_filters+rX*8] movhps m0, [base+subpel_filters+r4*8] movhps m1, [base+subpel_filters+r5*8] pxor m2, m2 %define m9 m2 %endif paddd m14, m7 ; mx+dx*[4-7] pand m5, m14, m10 psrld m5, 6 paddd m15, m5 pcmpeqd m5, m9 mova [rsp+0x110], m14 psrldq m4, m15, 8 %if ARCH_X86_64 movd r10d, m15 movd r11d, m4 psrldq m15, 4 psrldq m4, 4 movd r13d, m15 movd rXd, m4 movq m2, [base+subpel_filters+r10*8] movq m3, [base+subpel_filters+r11*8] movhps m2, [base+subpel_filters+r13*8] movhps m3, [base+subpel_filters+ rX*8] psrld m14, 10 psrldq m4, m14, 8 movd r10d, m14 movd r11d, m4 psrldq m14, 4 psrldq m4, 4 movd r13d, m14 movd rXd, m4 mov r4d, [rsp+ 0] mov r6d, [rsp+ 8] mov r7d, [rsp+ 4] mov r9d, [rsp+12] pshufd m4, m6, q1100 pshufd m6, m6, q3322 pshufd m7, m5, q1100 pshufd m5, m5, q3322 pand m8, m11, m4 pand m9, m11, m6 pand m15, m11, m7 pand m11, m11, m5 pandn m4, m0 pandn m6, m1 pandn m7, m2 pandn m5, m3 por m8, m4 por m9, m6 por m15, m7 por m11, m5 mova [rsp+0x10], m8 mova [rsp+0x20], m9 mova [rsp+0x30], m15 mova [rsp+0x40], m11 MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 10, 8, 9, 15, 11 ; 0-1 mova [rsp+0x50], m1 mova [rsp+0x60], m2 MC_8TAP_SCALED_H 3, 4, 5, 6, 1, 2, 7, 10, 8, 9, 15, 11 ; 2-3 mova [rsp+0x70], m3 mova 
[rsp+0x80], m4 MC_8TAP_SCALED_H 5, 6, 1, 2, 3, 4, 7, 10, 8, 9, 15, 11 ; 4-5 MC_8TAP_SCALED_H 0,14, 1, 2, 3, 4, 7, 10, 8, 9, 15, 11 ; 6-7 SWAP m7, m0 SWAP m8, m14 mova m1, [rsp+0x50] mova m2, [rsp+0x60] mova m3, [rsp+0x70] mova m15, [rsp+0x80] punpcklwd m4, m5, m6 ; 45a punpckhwd m5, m6 ; 45b punpcklwd m6, m7, m8 ; 67a punpckhwd m7, m8 ; 67b SWAP m14, m8 mova m8, [rsp+0x140] mova m9, [rsp+0x150] mova m10, [rsp+0x160] mova m11, [rsp+0x170] punpcklwd m0, m1, m2 ; 01a punpckhwd m1, m2 ; 01b punpcklwd m2, m3, m15; 23a punpckhwd m3, m15 ; 23b mova [rsp+0x50], m4 mova [rsp+0x60], m5 mova [rsp+0x70], m6 mova [rsp+0x80], m7 %else movd r0, m15 movd rX, m4 psrldq m15, 4 psrldq m4, 4 movd r4, m15 movd r5, m4 mova m14, [esp+0x110] movq m2, [base+subpel_filters+r0*8] movq m3, [base+subpel_filters+rX*8] movhps m2, [base+subpel_filters+r4*8] movhps m3, [base+subpel_filters+r5*8] psrld m14, 10 mova [esp+16], m14 mov r0, [esp+ 0] mov rX, [esp+ 8] mov r4, [esp+ 4] mov r5, [esp+12] mova [esp+0x20], m0 mova [esp+0x30], m1 mova [esp+0x40], m2 mova [esp+0x50], m3 pshufd m4, m6, q1100 pshufd m6, m6, q3322 pshufd m7, m5, q1100 pshufd m5, m5, q3322 pand m0, m11, m4 pand m1, m11, m6 pand m2, m11, m7 pand m3, m11, m5 pandn m4, [esp+0x20] pandn m6, [esp+0x30] pandn m7, [esp+0x40] pandn m5, [esp+0x50] por m0, m4 por m1, m6 por m2, m7 por m3, m5 mova [esp+0x20], m0 mova [esp+0x30], m1 mova [esp+0x40], m2 mova [esp+0x50], m3 MC_8TAP_SCALED_H 0x20, 0x60, 0 ; 0-1 MC_8TAP_SCALED_H 0x20, 0x180 ; 2-3 MC_8TAP_SCALED_H 0x20, 0x1a0 ; 4-5 MC_8TAP_SCALED_H 0x20, 0x1c0 ; 6-7 mova m5, [esp+0x1a0] mova m6, [esp+0x1b0] mova m7, [esp+0x1c0] mova m0, [esp+0x1d0] punpcklwd m4, m5, m6 ; 45a punpckhwd m5, m6 ; 45b punpcklwd m6, m7, m0 ; 67a punpckhwd m7, m0 ; 67b mova [esp+0x1a0], m4 mova [esp+0x1b0], m5 mova [esp+0x1c0], m6 mova [esp+0x1d0], m7 mova m1, [esp+0x060] mova m2, [esp+0x070] mova m3, [esp+0x180] mova m4, [esp+0x190] punpcklwd m0, m1, m2 ; 01a punpckhwd m1, m2 ; 01b punpcklwd m2, m3, m4 ; 23a punpckhwd m3, m4 ; 23b mova [esp+0x180], m2 mova [esp+0x190], m3 %define m8 [esp+0x140] %define m9 [esp+0x150] %define m10 [esp+0x160] %define m11 [esp+0x170] %endif .dy2_vloop: %if ARCH_X86_32 mov r0, r0m %endif pmaddwd m4, m0, m8 pmaddwd m5, m1, m8 pmaddwd m6, m2, m9 pmaddwd m7, m3, m9 paddd m4, m13 paddd m5, m13 paddd m4, m6 paddd m5, m7 %if ARCH_X86_64 pmaddwd m6, [rsp+0x50], m10 pmaddwd m7, [rsp+0x60], m10 %else pmaddwd m6, [esp+0x1a0], m10 pmaddwd m7, [esp+0x1b0], m10 %endif paddd m4, m6 paddd m5, m7 %if ARCH_X86_64 pmaddwd m6, [rsp+0x70], m11 pmaddwd m7, [rsp+0x80], m11 %else pmaddwd m6, [esp+0x1c0], m11 pmaddwd m7, [esp+0x1d0], m11 %endif paddd m4, m6 paddd m5, m7 psrad m4, rndshift psrad m5, rndshift packssdw m4, m5 %ifidn %1, put packuswb m4, m4 movq [dstq], m4 add dstq, dsm %else mova [tmpq], m4 add tmpq, tmp_stridem %endif %if ARCH_X86_32 mov r0m, r0 %endif dec hd jz .dy2_hloop_prep %if ARCH_X86_64 mova m8, [rsp+0x10] mova m9, [rsp+0x20] mova m10, [rsp+0x30] mova m11, [rsp+0x40] mova m0, m2 ; 01a mova m1, m3 ; 01b MC_8TAP_SCALED_H 2, 6, 3, 4, 5, 7, 14, 15, 8, 9, 10, 11 mova m3, [rsp+0x50] ; 23a mova m4, [rsp+0x60] ; 23b mova m5, [rsp+0x70] ; 45a mova m7, [rsp+0x80] ; 45b mova m8, [rsp+0x140] mova m9, [rsp+0x150] mova m10, [rsp+0x160] mova m11, [rsp+0x170] punpcklwd m14, m2, m6 ; 67a punpckhwd m2, m6 ; 67b mova [rsp+0x50], m5 mova [rsp+0x60], m7 mova [rsp+0x70], m14 mova [rsp+0x80], m2 mova m2, m3 mova m3, m4 %else MC_8TAP_SCALED_H 0x20, 0 punpcklwd m6, m0, m4 punpckhwd m7, m0, m4 mova m0, [esp+0x180] ; 01a mova m1, [esp+0x190] ; 
01b mova m2, [rsp+0x1a0] ; 23a mova m3, [esp+0x1b0] ; 23b mova m4, [esp+0x1c0] ; 45a mova m5, [esp+0x1d0] ; 45b mova [esp+0x180], m2 mova [esp+0x190], m3 mova [esp+0x1a0], m4 mova [esp+0x1b0], m5 mova [esp+0x1c0], m6 ; 67a mova [esp+0x1d0], m7 ; 67b %endif jmp .dy2_vloop .ret: MC_8TAP_SCALED_RET 0 %if ARCH_X86_32 && !isprep && required_stack_alignment > STACK_ALIGNMENT %define r0m [rstk+stack_offset+ 4] %define r1m [rstk+stack_offset+ 8] %define r2m [rstk+stack_offset+12] %define r3m [rstk+stack_offset+16] %endif %undef isprep %endmacro %macro BILIN_SCALED_FN 1 cglobal %1_bilin_scaled_8bpc mov t0d, (5*15 << 16) | 5*15 mov t1d, (5*15 << 16) | 5*15 jmp mangle(private_prefix %+ _%1_8tap_scaled_8bpc %+ SUFFIX) %endmacro %if WIN64 DECLARE_REG_TMP 6, 5 %elif ARCH_X86_64 DECLARE_REG_TMP 6, 8 %else DECLARE_REG_TMP 1, 2 %endif BILIN_SCALED_FN put FN put_8tap_scaled, sharp, SHARP, SHARP FN put_8tap_scaled, sharp_smooth, SHARP, SMOOTH FN put_8tap_scaled, smooth_sharp, SMOOTH, SHARP FN put_8tap_scaled, smooth, SMOOTH, SMOOTH FN put_8tap_scaled, sharp_regular, SHARP, REGULAR FN put_8tap_scaled, regular_sharp, REGULAR, SHARP FN put_8tap_scaled, smooth_regular, SMOOTH, REGULAR FN put_8tap_scaled, regular_smooth, REGULAR, SMOOTH FN put_8tap_scaled, regular, REGULAR, REGULAR MC_8TAP_SCALED put %if WIN64 DECLARE_REG_TMP 5, 4 %elif ARCH_X86_64 DECLARE_REG_TMP 6, 7 %else DECLARE_REG_TMP 1, 2 %endif BILIN_SCALED_FN prep FN prep_8tap_scaled, sharp, SHARP, SHARP FN prep_8tap_scaled, sharp_smooth, SHARP, SMOOTH FN prep_8tap_scaled, smooth_sharp, SMOOTH, SHARP FN prep_8tap_scaled, smooth, SMOOTH, SMOOTH FN prep_8tap_scaled, sharp_regular, SHARP, REGULAR FN prep_8tap_scaled, regular_sharp, REGULAR, SHARP FN prep_8tap_scaled, smooth_regular, SMOOTH, REGULAR FN prep_8tap_scaled, regular_smooth, REGULAR, SMOOTH FN prep_8tap_scaled, regular, REGULAR, REGULAR MC_8TAP_SCALED prep %if ARCH_X86_32 %macro SAVE_ALPHA_BETA 0 mov alpham, alphad mov betam, betad %endmacro %macro SAVE_DELTA_GAMMA 0 mov deltam, deltad mov gammam, gammad %endmacro %macro LOAD_ALPHA_BETA_MX 0 mov mym, myd mov alphad, alpham mov betad, betam mov mxd, mxm %endmacro %macro LOAD_DELTA_GAMMA_MY 0 mov mxm, mxd mov deltad, deltam mov gammad, gammam mov myd, mym %endmacro %define PIC_reg r2 %define PIC_base_offset $$ %define PIC_sym(sym) (PIC_reg+(sym)-PIC_base_offset) %else %define SAVE_ALPHA_BETA %define SAVE_DELTA_GAMMA %define PIC_sym(sym) sym %endif %if ARCH_X86_32 %if STACK_ALIGNMENT < required_stack_alignment %assign copy_args 8*4 %else %assign copy_args 0 %endif %endif %macro RELOC_ARGS 0 %if copy_args mov r0, r0m mov r1, r1m mov r2, r2m mov r3, r3m mov r5, r5m mov dstm, r0 mov dsm, r1 mov srcm, r2 mov ssm, r3 mov mxm, r5 mov r0, r6m mov mym, r0 %endif %endmacro %macro BLENDHWDW 2 ; blend high words from dwords, src1, src2 %if cpuflag(sse4) pblendw %1, %2, 0xAA %else pand %2, m10 por %1, %2 %endif %endmacro %macro WARP_V 10 ; dst0, dst1, 0, 2, 4, 6, 1, 3, 5, 7 %if ARCH_X86_32 %define m8 m4 %define m9 m5 %define m14 m6 %define m15 m7 %define m11 m7 %endif %if notcpuflag(ssse3) || ARCH_X86_32 pxor m11, m11 %endif lea tmp1d, [myq+deltaq*4] lea tmp2d, [myq+deltaq*1] shr myd, 10 shr tmp1d, 10 movq m2, [filterq+myq *8] ; a movq m8, [filterq+tmp1q*8] ; e lea tmp1d, [tmp2q+deltaq*4] lea myd, [tmp2q+deltaq*1] shr tmp2d, 10 shr tmp1d, 10 movq m3, [filterq+tmp2q*8] ; b movq m0, [filterq+tmp1q*8] ; f punpcklwd m2, m3 punpcklwd m8, m0 lea tmp1d, [myq+deltaq*4] lea tmp2d, [myq+deltaq*1] shr myd, 10 shr tmp1d, 10 movq m0, [filterq+myq *8] ; c movq m9, 
[filterq+tmp1q*8] ; g lea tmp1d, [tmp2q+deltaq*4] lea myd, [tmp2q+gammaq] ; my += gamma shr tmp2d, 10 shr tmp1d, 10 movq m3, [filterq+tmp2q*8] ; d movq m1, [filterq+tmp1q*8] ; h punpcklwd m0, m3 punpcklwd m9, m1 punpckldq m1, m2, m0 punpckhdq m2, m0 punpcklbw m0, m11, m1 ; a0 a2 b0 b2 c0 c2 d0 d2 << 8 punpckhbw m3, m11, m1 ; a4 a6 b4 b6 c4 c6 d4 d6 << 8 punpcklbw m1, m11, m2 ; a1 a3 b1 b3 c1 c3 d1 d3 << 8 punpckhbw m14, m11, m2 ; a5 a7 b5 b7 c5 c7 d5 d7 << 8 pmaddwd m0, %3 pmaddwd m3, %5 pmaddwd m1, %7 pmaddwd m14, %9 paddd m0, m3 paddd m1, m14 paddd m0, m1 mova %1, m0 %if ARCH_X86_64 SWAP m3, m14 %endif punpckldq m0, m8, m9 punpckhdq m8, m9 punpcklbw m1, m11, m0 ; e0 e2 f0 f2 g0 g2 h0 h2 << 8 punpckhbw m14, m11, m0 ; e4 e6 f4 f6 g4 g6 h4 h6 << 8 punpcklbw m2, m11, m8 ; e1 e3 f1 f3 g1 g3 h1 h3 << 8 punpckhbw m15, m11, m8 ; e5 e7 f5 f7 g5 g7 h5 h7 << 8 pmaddwd m1, %4 pmaddwd m14, %6 pmaddwd m2, %8 pmaddwd m15, %10 paddd m1, m14 paddd m2, m15 paddd m1, m2 mova %2, m1 %if ARCH_X86_64 SWAP m14, m3 %endif %endmacro %if ARCH_X86_64 %define counterd r4d %else %if copy_args == 0 %define counterd dword r4m %else %define counterd dword [esp+stack_size-4*7] %endif %endif %macro WARP_AFFINE_8X8T 0 %if ARCH_X86_64 cglobal warp_affine_8x8t_8bpc, 6, 14, 16, 0x90, tmp, ts %else cglobal warp_affine_8x8t_8bpc, 0, 7, 16, -0x130-copy_args, tmp, ts %if copy_args %define tmpm [esp+stack_size-4*1] %define tsm [esp+stack_size-4*2] %endif %endif call mangle(private_prefix %+ _warp_affine_8x8_8bpc_%+cpuname).main .loop: %if ARCH_X86_32 %define m12 m4 %define m13 m5 %define m14 m6 %define m15 m7 mova m12, [esp+0xC0] mova m13, [esp+0xD0] mova m14, [esp+0xE0] mova m15, [esp+0xF0] %endif %if cpuflag(ssse3) psrad m12, 13 psrad m13, 13 psrad m14, 13 psrad m15, 13 packssdw m12, m13 packssdw m14, m15 mova m13, [PIC_sym(pw_8192)] pmulhrsw m12, m13 ; (x + (1 << 6)) >> 7 pmulhrsw m14, m13 %else %if ARCH_X86_32 %define m10 m0 %endif mova m10, [PIC_sym(pd_16384)] paddd m12, m10 paddd m13, m10 paddd m14, m10 paddd m15, m10 psrad m12, 15 psrad m13, 15 psrad m14, 15 psrad m15, 15 packssdw m12, m13 packssdw m14, m15 %endif mova [tmpq+tsq*0], m12 mova [tmpq+tsq*2], m14 dec counterd jz mangle(private_prefix %+ _warp_affine_8x8_8bpc_%+cpuname).end %if ARCH_X86_32 mov tmpm, tmpd mov r0, [esp+0x100] mov r1, [esp+0x104] %endif call mangle(private_prefix %+ _warp_affine_8x8_8bpc_%+cpuname).main2 lea tmpq, [tmpq+tsq*4] jmp .loop %endmacro %macro WARP_AFFINE_8X8 0 %if ARCH_X86_64 cglobal warp_affine_8x8_8bpc, 6, 14, 16, 0x90, \ dst, ds, src, ss, abcd, mx, tmp2, alpha, beta, \ filter, tmp1, delta, my, gamma %else cglobal warp_affine_8x8_8bpc, 0, 7, 16, -0x130-copy_args, \ dst, ds, src, ss, abcd, mx, tmp2, alpha, beta, \ filter, tmp1, delta, my, gamma %define alphaq r0 %define alphad r0 %define alpham [esp+gprsize+0x100] %define betaq r1 %define betad r1 %define betam [esp+gprsize+0x104] %define deltaq r0 %define deltad r0 %define deltam [esp+gprsize+0x108] %define gammaq r1 %define gammad r1 %define gammam [esp+gprsize+0x10C] %define filterq r3 %define tmp1q r4 %define tmp1d r4 %define tmp1m [esp+gprsize+0x110] %define myq r5 %define myd r5 %define mym r6m %if copy_args %define dstm [esp+stack_size-4*1] %define dsm [esp+stack_size-4*2] %define srcm [esp+stack_size-4*3] %define ssm [esp+stack_size-4*4] %define mxm [esp+stack_size-4*5] %define mym [esp+stack_size-4*6] %endif %endif call .main jmp .start .loop: %if ARCH_X86_32 mov dstm, dstd mov alphad, [esp+0x100] mov betad, [esp+0x104] %endif call .main2 lea dstq, [dstq+dsq*2] .start: %if 
notcpuflag(sse4) %if cpuflag(ssse3) %define roundval pw_8192 %else %define roundval pd_262144 %endif %if ARCH_X86_64 mova m10, [PIC_sym(roundval)] %else %define m10 [PIC_sym(roundval)] %endif %endif %if ARCH_X86_32 %define m12 m5 %define m13 m6 mova m12, [esp+0xC0] mova m13, [esp+0xD0] %endif %if cpuflag(sse4) %if ARCH_X86_32 %define m11 m4 pxor m11, m11 %endif psrad m12, 18 psrad m13, 18 packusdw m12, m13 pavgw m12, m11 ; (x + (1 << 10)) >> 11 %else %if cpuflag(ssse3) psrad m12, 17 psrad m13, 17 packssdw m12, m13 pmulhrsw m12, m10 %else paddd m12, m10 paddd m13, m10 psrad m12, 19 psrad m13, 19 packssdw m12, m13 %endif %endif %if ARCH_X86_32 %define m14 m6 %define m15 m7 mova m14, [esp+0xE0] mova m15, [esp+0xF0] %endif %if cpuflag(sse4) psrad m14, 18 psrad m15, 18 packusdw m14, m15 pavgw m14, m11 ; (x + (1 << 10)) >> 11 %else %if cpuflag(ssse3) psrad m14, 17 psrad m15, 17 packssdw m14, m15 pmulhrsw m14, m10 %else paddd m14, m10 paddd m15, m10 psrad m14, 19 psrad m15, 19 packssdw m14, m15 %endif %endif packuswb m12, m14 movq [dstq+dsq*0], m12 movhps [dstq+dsq*1], m12 dec counterd jg .loop .end: RET ALIGN function_align .main: %assign stack_offset stack_offset+gprsize %if ARCH_X86_32 %assign stack_size stack_size+4 %if copy_args %assign stack_offset stack_offset-4 %endif RELOC_ARGS LEA PIC_reg, $$ %define PIC_mem [esp+gprsize+0x114] mov abcdd, abcdm %if copy_args == 0 mov ssd, ssm mov mxd, mxm %endif mov PIC_mem, PIC_reg mov srcd, srcm %endif movsx deltad, word [abcdq+2*2] movsx gammad, word [abcdq+2*3] lea tmp1d, [deltaq*3] sub gammad, tmp1d ; gamma -= delta*3 SAVE_DELTA_GAMMA %if ARCH_X86_32 mov abcdd, abcdm %endif movsx alphad, word [abcdq+2*0] movsx betad, word [abcdq+2*1] lea tmp1q, [ssq*3+3] add mxd, 512+(64<<10) lea tmp2d, [alphaq*3] sub srcq, tmp1q ; src -= src_stride*3 + 3 %if ARCH_X86_32 mov srcm, srcd mov PIC_reg, PIC_mem %endif sub betad, tmp2d ; beta -= alpha*3 lea filterq, [PIC_sym(mc_warp_filter2)] %if ARCH_X86_64 mov myd, r6m %if cpuflag(ssse3) pxor m11, m11 %endif %endif call .h psrld m2, m0, 16 psrld m3, m1, 16 %if ARCH_X86_32 %if notcpuflag(ssse3) mova [esp+gprsize+0x00], m2 %endif mova [esp+gprsize+0x10], m3 %endif call .h psrld m4, m0, 16 psrld m5, m1, 16 %if ARCH_X86_32 mova [esp+gprsize+0x20], m4 mova [esp+gprsize+0x30], m5 %endif call .h %if ARCH_X86_64 %define blendmask [rsp+gprsize+0x80] %else %if notcpuflag(ssse3) mova m2, [esp+gprsize+0x00] %endif mova m3, [esp+gprsize+0x10] %define blendmask [esp+gprsize+0x120] %define m10 m7 %endif pcmpeqd m10, m10 pslld m10, 16 mova blendmask, m10 BLENDHWDW m2, m0 ; 0 BLENDHWDW m3, m1 ; 2 mova [rsp+gprsize+0x00], m2 mova [rsp+gprsize+0x10], m3 call .h %if ARCH_X86_32 mova m4, [esp+gprsize+0x20] mova m5, [esp+gprsize+0x30] %endif mova m10, blendmask BLENDHWDW m4, m0 ; 1 BLENDHWDW m5, m1 ; 3 mova [rsp+gprsize+0x20], m4 mova [rsp+gprsize+0x30], m5 call .h %if ARCH_X86_32 %if notcpuflag(ssse3) mova m2, [esp+gprsize+0x00] %endif mova m3, [esp+gprsize+0x10] %define m10 m5 %endif psrld m6, m2, 16 psrld m7, m3, 16 mova m10, blendmask BLENDHWDW m6, m0 ; 2 BLENDHWDW m7, m1 ; 4 mova [rsp+gprsize+0x40], m6 mova [rsp+gprsize+0x50], m7 call .h %if ARCH_X86_32 mova m4, [esp+gprsize+0x20] mova m5, [esp+gprsize+0x30] %endif psrld m2, m4, 16 psrld m3, m5, 16 mova m10, blendmask BLENDHWDW m2, m0 ; 3 BLENDHWDW m3, m1 ; 5 mova [rsp+gprsize+0x60], m2 mova [rsp+gprsize+0x70], m3 call .h %if ARCH_X86_32 mova m6, [esp+gprsize+0x40] mova m7, [esp+gprsize+0x50] %define m10 m7 %endif psrld m4, m6, 16 psrld m5, m7, 16 mova m10, blendmask BLENDHWDW m4, m0 ; 
4 BLENDHWDW m5, m1 ; 6 %if ARCH_X86_64 add myd, 512+(64<<10) mova m6, m2 mova m7, m3 %else mova [esp+gprsize+0x80], m4 mova [esp+gprsize+0x90], m5 add dword mym, 512+(64<<10) %endif mov counterd, 4 SAVE_ALPHA_BETA .main2: call .h %if ARCH_X86_32 mova m6, [esp+gprsize+0x60] mova m7, [esp+gprsize+0x70] %define m10 m5 %endif psrld m6, 16 psrld m7, 16 mova m10, blendmask BLENDHWDW m6, m0 ; 5 BLENDHWDW m7, m1 ; 7 %if ARCH_X86_64 WARP_V m12, m13, [rsp+gprsize+0x00], [rsp+gprsize+0x10], \ m4, m5, \ [rsp+gprsize+0x20], [rsp+gprsize+0x30], \ m6, m7 %else mova [esp+gprsize+0xA0], m6 mova [esp+gprsize+0xB0], m7 LOAD_DELTA_GAMMA_MY WARP_V [esp+gprsize+0xC0], [esp+gprsize+0xD0], \ [esp+gprsize+0x00], [esp+gprsize+0x10], \ [esp+gprsize+0x80], [esp+gprsize+0x90], \ [esp+gprsize+0x20], [esp+gprsize+0x30], \ [esp+gprsize+0xA0], [esp+gprsize+0xB0] LOAD_ALPHA_BETA_MX %endif call .h mova m2, [rsp+gprsize+0x40] mova m3, [rsp+gprsize+0x50] %if ARCH_X86_32 mova m4, [rsp+gprsize+0x80] mova m5, [rsp+gprsize+0x90] %define m10 m7 %endif mova [rsp+gprsize+0x00], m2 mova [rsp+gprsize+0x10], m3 mova [rsp+gprsize+0x40], m4 mova [rsp+gprsize+0x50], m5 psrld m4, 16 psrld m5, 16 mova m10, blendmask BLENDHWDW m4, m0 ; 6 BLENDHWDW m5, m1 ; 8 %if ARCH_X86_64 WARP_V m14, m15, [rsp+gprsize+0x20], [rsp+gprsize+0x30], \ m6, m7, \ [rsp+gprsize+0x00], [rsp+gprsize+0x10], \ m4, m5 %else mova [esp+gprsize+0x80], m4 mova [esp+gprsize+0x90], m5 LOAD_DELTA_GAMMA_MY WARP_V [esp+gprsize+0xE0], [esp+gprsize+0xF0], \ [esp+gprsize+0x20], [esp+gprsize+0x30], \ [esp+gprsize+0xA0], [esp+gprsize+0xB0], \ [esp+gprsize+0x00], [esp+gprsize+0x10], \ [esp+gprsize+0x80], [esp+gprsize+0x90] mov mym, myd mov dstd, dstm mov dsd, dsm mov mxd, mxm %endif mova m2, [rsp+gprsize+0x60] mova m3, [rsp+gprsize+0x70] %if ARCH_X86_32 mova m6, [esp+gprsize+0xA0] mova m7, [esp+gprsize+0xB0] %endif mova [rsp+gprsize+0x20], m2 mova [rsp+gprsize+0x30], m3 mova [rsp+gprsize+0x60], m6 mova [rsp+gprsize+0x70], m7 ret ALIGN function_align .h: %if ARCH_X86_32 %define m8 m3 %define m9 m4 %define m10 m5 %define m14 m6 %define m15 m7 %endif lea tmp1d, [mxq+alphaq*4] lea tmp2d, [mxq+alphaq*1] %if ARCH_X86_32 %assign stack_offset stack_offset+4 %assign stack_size stack_size+4 %define PIC_mem [esp+gprsize*2+0x114] mov PIC_mem, PIC_reg mov srcd, srcm %endif movu m10, [srcq] %if ARCH_X86_32 add srcd, ssm mov srcm, srcd mov PIC_reg, PIC_mem %else add srcq, ssq %endif shr mxd, 10 shr tmp1d, 10 movq m1, [filterq+mxq *8] ; 0 X movq m8, [filterq+tmp1q*8] ; 4 X lea tmp1d, [tmp2q+alphaq*4] lea mxd, [tmp2q+alphaq*1] shr tmp2d, 10 shr tmp1d, 10 movhps m1, [filterq+tmp2q*8] ; 0 1 movhps m8, [filterq+tmp1q*8] ; 4 5 lea tmp1d, [mxq+alphaq*4] lea tmp2d, [mxq+alphaq*1] shr mxd, 10 shr tmp1d, 10 %if cpuflag(ssse3) movq m14, [filterq+mxq *8] ; 2 X movq m9, [filterq+tmp1q*8] ; 6 X lea tmp1d, [tmp2q+alphaq*4] lea mxd, [tmp2q+betaq] ; mx += beta shr tmp2d, 10 shr tmp1d, 10 movhps m14, [filterq+tmp2q*8] ; 2 3 movhps m9, [filterq+tmp1q*8] ; 6 7 pshufb m0, m10, [PIC_sym(warp_8x8_shufA)] pmaddubsw m0, m1 pshufb m1, m10, [PIC_sym(warp_8x8_shufB)] pmaddubsw m1, m8 pshufb m15, m10, [PIC_sym(warp_8x8_shufC)] pmaddubsw m15, m14 pshufb m10, m10, [PIC_sym(warp_8x8_shufD)] pmaddubsw m10, m9 phaddw m0, m15 phaddw m1, m10 %else %if ARCH_X86_32 %define m11 m2 %endif pcmpeqw m0, m0 psrlw m14, m0, 8 psrlw m15, m10, 8 ; 01 03 05 07 09 11 13 15 pand m14, m10 ; 00 02 04 06 08 10 12 14 packuswb m14, m15 ; 00 02 04 06 08 10 12 14 01 03 05 07 09 11 13 15 psrldq m9, m0, 4 pshufd m0, m14, q0220 pand m0, m9 psrldq m14, 1 ; 02 
04 06 08 10 12 14 01 03 05 07 09 11 13 15 __ pslldq m15, m14, 12 por m0, m15 ; shufA psrlw m15, m0, 8 psraw m11, m1, 8 psllw m0, 8 psllw m1, 8 psrlw m0, 8 psraw m1, 8 pmullw m15, m11 pmullw m0, m1 paddw m0, m15 ; pmaddubsw m0, m1 pshufd m15, m14, q0220 pand m15, m9 psrldq m14, 1 ; 04 06 08 10 12 14 01 03 05 07 09 11 13 15 __ __ pslldq m1, m14, 12 por m15, m1 ; shufC pshufd m1, m14, q0220 pand m1, m9 psrldq m14, 1 ; 06 08 10 12 14 01 03 05 07 09 11 13 15 __ __ __ pslldq m11, m14, 12 por m1, m11 ; shufB pshufd m10, m14, q0220 pand m10, m9 psrldq m14, 1 ; 08 10 12 14 01 03 05 07 09 11 13 15 __ __ __ __ pslldq m14, m14, 12 por m10, m14 ; shufD psrlw m9, m1, 8 psraw m11, m8, 8 psllw m1, 8 psllw m8, 8 psrlw m1, 8 psraw m8, 8 pmullw m9, m11 pmullw m1, m8 paddw m1, m9 ; pmaddubsw m1, m8 movq m14, [filterq+mxq *8] ; 2 X movq m9, [filterq+tmp1q*8] ; 6 X lea tmp1d, [tmp2q+alphaq*4] lea mxd, [tmp2q+betaq] ; mx += beta shr tmp2d, 10 shr tmp1d, 10 movhps m14, [filterq+tmp2q*8] ; 2 3 movhps m9, [filterq+tmp1q*8] ; 6 7 psrlw m8, m15, 8 psraw m11, m14, 8 psllw m15, 8 psllw m14, 8 psrlw m15, 8 psraw m14, 8 pmullw m8, m11 pmullw m15, m14 paddw m15, m8 ; pmaddubsw m15, m14 psrlw m8, m10, 8 psraw m11, m9, 8 psllw m10, 8 psllw m9, 8 psrlw m10, 8 psraw m9, 8 pmullw m8, m11 pmullw m10, m9 paddw m10, m8 ; pmaddubsw m10, m9 pslld m8, m0, 16 pslld m9, m1, 16 pslld m14, m15, 16 pslld m11, m10, 16 paddw m0, m8 paddw m1, m9 paddw m15, m14 paddw m10, m11 psrad m0, 16 psrad m1, 16 psrad m15, 16 psrad m10, 16 packssdw m0, m15 ; phaddw m0, m15 packssdw m1, m10 ; phaddw m1, m10 %endif mova m14, [PIC_sym(pw_8192)] mova m9, [PIC_sym(pd_32768)] pmaddwd m0, m14 ; 17-bit intermediate, upshifted by 13 pmaddwd m1, m14 paddd m0, m9 ; rounded 14-bit result in upper 16 bits of dword paddd m1, m9 ret %endmacro %if WIN64 DECLARE_REG_TMP 6, 4 %else DECLARE_REG_TMP 6, 7 %endif %macro BIDIR_FN 1 ; op %1 0 lea stride3q, [strideq*3] jmp wq .w4_loop: %1_INC_PTR 2 %1 0 lea dstq, [dstq+strideq*4] .w4: ; tile 4x movd [dstq ], m0 ; copy dw[0] pshuflw m1, m0, q1032 ; swap dw[1] and dw[0] movd [dstq+strideq*1], m1 ; copy dw[1] punpckhqdq m0, m0 ; swap dw[3,2] with dw[1,0] movd [dstq+strideq*2], m0 ; dw[2] psrlq m0, 32 ; shift right in dw[3] movd [dstq+stride3q ], m0 ; copy sub hd, 4 jg .w4_loop RET .w8_loop: %1_INC_PTR 2 %1 0 lea dstq, [dstq+strideq*2] .w8: movq [dstq ], m0 movhps [dstq+strideq*1], m0 sub hd, 2 jg .w8_loop RET .w16_loop: %1_INC_PTR 2 %1 0 lea dstq, [dstq+strideq] .w16: mova [dstq ], m0 dec hd jg .w16_loop RET .w32_loop: %1_INC_PTR 4 %1 0 lea dstq, [dstq+strideq] .w32: mova [dstq ], m0 %1 2 mova [dstq + 16 ], m0 dec hd jg .w32_loop RET .w64_loop: %1_INC_PTR 8 %1 0 add dstq, strideq .w64: %assign i 0 %rep 4 mova [dstq + i*16 ], m0 %assign i i+1 %if i < 4 %1 2*i %endif %endrep dec hd jg .w64_loop RET .w128_loop: %1_INC_PTR 16 %1 0 add dstq, strideq .w128: %assign i 0 %rep 8 mova [dstq + i*16 ], m0 %assign i i+1 %if i < 8 %1 2*i %endif %endrep dec hd jg .w128_loop RET %endmacro %macro AVG 1 ; src_offset ; writes AVG of tmp1 tmp2 uint16 coeffs into uint8 pixel mova m0, [tmp1q+(%1+0)*mmsize] ; load 8 coef(2bytes) from tmp1 paddw m0, [tmp2q+(%1+0)*mmsize] ; load/add 8 coef(2bytes) tmp2 mova m1, [tmp1q+(%1+1)*mmsize] paddw m1, [tmp2q+(%1+1)*mmsize] pmulhrsw m0, m2 pmulhrsw m1, m2 packuswb m0, m1 ; pack/trunc 16 bits from m0 & m1 to 8 bit %endmacro %macro AVG_INC_PTR 1 add tmp1q, %1*mmsize add tmp2q, %1*mmsize %endmacro cglobal avg_8bpc, 4, 7, 3, dst, stride, tmp1, tmp2, w, h, stride3 LEA r6, avg_ssse3_table tzcnt wd, wm ; leading zeros 
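; A quick sketch of the jump-table dispatch used here: tzcnt counts
; *trailing* zeros, and because the supported block widths are powers of
; two (4..128), tzcnt(w) is simply log2(w).  For example w = 32 yields 5,
; which selects the table entry holding the relative offset of the .w32
; branch; the "add wq, r6" below turns that stored offset into an absolute
; address for the "jmp wq" performed inside BIDIR_FN.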
movifnidn hd, hm ; move h(stack) to h(register) if not already that register movsxd wq, dword [r6+wq*4] ; push table entry matching the tile width (tzcnt) in widen reg mova m2, [pw_1024+r6-avg_ssse3_table] ; fill m2 with shift/align add wq, r6 BIDIR_FN AVG %macro W_AVG 1 ; src_offset ; (a * weight + b * (16 - weight) + 128) >> 8 ; = ((a - b) * weight + (b << 4) + 128) >> 8 ; = ((((a - b) * ((weight-16) << 12)) >> 16) + a + 8) >> 4 ; = ((((b - a) * (-weight << 12)) >> 16) + b + 8) >> 4 mova m2, [tmp1q+(%1+0)*mmsize] mova m0, m2 psubw m2, [tmp2q+(%1+0)*mmsize] mova m3, [tmp1q+(%1+1)*mmsize] mova m1, m3 psubw m3, [tmp2q+(%1+1)*mmsize] pmulhw m2, m4 pmulhw m3, m4 paddw m0, m2 paddw m1, m3 pmulhrsw m0, m5 pmulhrsw m1, m5 packuswb m0, m1 %endmacro %define W_AVG_INC_PTR AVG_INC_PTR cglobal w_avg_8bpc, 4, 7, 6, dst, stride, tmp1, tmp2, w, h, stride3 LEA r6, w_avg_ssse3_table tzcnt wd, wm movd m4, r6m movifnidn hd, hm pxor m0, m0 movsxd wq, dword [r6+wq*4] mova m5, [pw_2048+r6-w_avg_ssse3_table] pshufb m4, m0 psllw m4, 12 ; (weight-16) << 12 when interpreted as signed add wq, r6 cmp dword r6m, 7 jg .weight_gt7 mov r6, tmp1q psubw m0, m4 mov tmp1q, tmp2q mova m4, m0 ; -weight mov tmp2q, r6 .weight_gt7: BIDIR_FN W_AVG %macro MASK 1 ; src_offset ; (a * m + b * (64 - m) + 512) >> 10 ; = ((a - b) * m + (b << 6) + 512) >> 10 ; = ((((b - a) * (-m << 10)) >> 16) + b + 8) >> 4 mova m3, [maskq+(%1+0)*(mmsize/2)] mova m0, [tmp2q+(%1+0)*mmsize] ; b psubw m1, m0, [tmp1q+(%1+0)*mmsize] ; b - a mova m6, m3 ; m psubb m3, m4, m6 ; -m paddw m1, m1 ; (b - a) << 1 paddb m3, m3 ; -m << 1 punpcklbw m2, m4, m3 ; -m << 9 (<< 8 when ext as uint16) pmulhw m1, m2 ; (-m * (b - a)) << 10 paddw m0, m1 ; + b mova m1, [tmp2q+(%1+1)*mmsize] ; b psubw m2, m1, [tmp1q+(%1+1)*mmsize] ; b - a paddw m2, m2 ; (b - a) << 1 mova m6, m3 ; (-m << 1) punpckhbw m3, m4, m6 ; (-m << 9) pmulhw m2, m3 ; (-m << 9) paddw m1, m2 ; (-m * (b - a)) << 10 pmulhrsw m0, m5 ; round pmulhrsw m1, m5 ; round packuswb m0, m1 ; interleave 16 -> 8 %endmacro %macro MASK_INC_PTR 1 add maskq, %1*mmsize/2 add tmp1q, %1*mmsize add tmp2q, %1*mmsize %endmacro %if ARCH_X86_64 cglobal mask_8bpc, 4, 8, 7, dst, stride, tmp1, tmp2, w, h, mask, stride3 movifnidn hd, hm %else cglobal mask_8bpc, 4, 7, 7, dst, stride, tmp1, tmp2, w, mask, stride3 %define hd dword r5m %endif %define base r6-mask_ssse3_table LEA r6, mask_ssse3_table tzcnt wd, wm movsxd wq, dword [r6+wq*4] pxor m4, m4 mova m5, [base+pw_2048] add wq, r6 mov maskq, r6m BIDIR_FN MASK %undef hd %macro W_MASK_420_END 1-* %rep %0 call .main paddw m2, [maskq+16*%1] mova [maskq+16*%1], m2 mova [dstq+strideq*1+16*(2*%1+0)], m0 call .main psubw m3, m7, m2 psubw m1, m7, [maskq+16*%1] psubw m3, [dstq+strideq*1+16*(2*%1+1)] psrlw m1, 2 psrlw m3, 2 packuswb m1, m3 mova [maskq+16*%1], m1 mova [dstq+strideq*1+16*(2*%1+1)], m0 %rotate 1 %endrep %endmacro %if UNIX64 DECLARE_REG_TMP 7 %else DECLARE_REG_TMP 5 %endif cglobal w_mask_420_8bpc, 4, 7, 9, dst, stride, tmp1, tmp2, w, h, mask %define base t0-w_mask_420_ssse3_table LEA t0, w_mask_420_ssse3_table tzcnt wd, wm mov r6d, r7m ; sign sub tmp2q, tmp1q movsxd wq, [t0+wq*4] mova m6, [base+pw_2048] movddup m7, [base+wm_420_sign+r6*8] ; 258 - sign add wq, t0 %if ARCH_X86_64 mova m8, [base+pw_6903] ; ((64 - 38) << 8) + 255 - 8 movifnidn hd, hm %else %define m8 [base+pw_6903] %define hd dword hm %endif mov maskq, maskmp call .main jmp wq .w4_loop: call .main add maskq, 4 lea dstq, [dstq+strideq*2] .w4: pshufd m3, m2, q2020 pshufd m2, m2, q3131 psubw m1, m7, m3 psubw m1, m2 psrlw m1, 2 
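; Rough sketch of the arithmetic in this .w4 tail (assuming, per the
; prologue comment, that the wm_420_sign words in m7 hold 258 - sign):
; .main leaves horizontal pair sums of (64 - m) in m2, so every word still
; carries a bias of 128.  Subtracting the two row pairs from 258 - sign
; gives  m1 = (m00 + m01 + m10 + m11) + 2 - sign,  and the psrlw by 2 just
; above produces the 2x2-subsampled 4:2:0 mask value
;     (m00 + m01 + m10 + m11 + 2 - sign) >> 2
; which is packed to bytes and stored to maskq below.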
packuswb m1, m1 movd [maskq], m1 movd [dstq+strideq*0], m0 pshuflw m1, m0, q1032 movd [dstq+strideq*1], m1 punpckhqdq m0, m0 lea dstq, [dstq+strideq*2] movd [dstq+strideq*0], m0 pshuflw m1, m0, q1032 movd [dstq+strideq*1], m1 sub hd, 4 jg .w4_loop RET .w8_loop: call .main add maskq, 4 lea dstq, [dstq+strideq*2] .w8: movhlps m3, m2 psubw m1, m7, m2 psubw m1, m3 psrlw m1, 2 packuswb m1, m1 movd [maskq], m1 movq [dstq+strideq*0], m0 movhps [dstq+strideq*1], m0 sub hd, 2 jg .w8_loop RET .w16_loop: call .main add maskq, 8 lea dstq, [dstq+strideq*2] .w16: mova [dstq+strideq*1], m2 mova [dstq+strideq*0], m0 call .main psubw m1, m7, [dstq+strideq*1] psubw m1, m2 psrlw m1, 2 packuswb m1, m1 movq [maskq], m1 mova [dstq+strideq*1], m0 sub hd, 2 jg .w16_loop RET .w32_loop: call .main add maskq, 16 lea dstq, [dstq+strideq*2] .w32: mova [maskq], m2 mova [dstq+strideq*0+16*0], m0 call .main mova [dstq+strideq*1+16*1], m2 mova [dstq+strideq*0+16*1], m0 W_MASK_420_END 0 sub hd, 2 jg .w32_loop RET .w64_loop: call .main add maskq, 16*2 lea dstq, [dstq+strideq*2] .w64: mova [maskq+16*0], m2 mova [dstq+strideq*0+16*0], m0 call .main mova [dstq+strideq*1+16*1], m2 mova [dstq+strideq*0+16*1], m0 call .main mova [maskq+16*1], m2 mova [dstq+strideq*0+16*2], m0 call .main mova [dstq+strideq*1+16*3], m2 mova [dstq+strideq*0+16*3], m0 W_MASK_420_END 0, 1 sub hd, 2 jg .w64_loop RET .w128_loop: call .main add maskq, 16*4 lea dstq, [dstq+strideq*2] .w128: mova [maskq+16*0], m2 mova [dstq+strideq*0+16*0], m0 call .main mova [dstq+strideq*1+16*1], m2 mova [dstq+strideq*0+16*1], m0 call .main mova [maskq+16*1], m2 mova [dstq+strideq*0+16*2], m0 call .main mova [dstq+strideq*1+16*3], m2 mova [dstq+strideq*0+16*3], m0 call .main mova [maskq+16*2], m2 mova [dstq+strideq*0+16*4], m0 call .main mova [dstq+strideq*1+16*5], m2 mova [dstq+strideq*0+16*5], m0 call .main mova [maskq+16*3], m2 mova [dstq+strideq*0+16*6], m0 call .main mova [dstq+strideq*1+16*7], m2 mova [dstq+strideq*0+16*7], m0 W_MASK_420_END 0, 1, 2, 3 sub hd, 2 jg .w128_loop RET ALIGN function_align .main: mova m0, [tmp1q +16*0] mova m3, [tmp1q+tmp2q+16*0] mova m1, [tmp1q +16*1] mova m4, [tmp1q+tmp2q+16*1] add tmp1q, 16*2 psubw m3, m0 psubw m4, m1 pabsw m5, m3 psubusw m2, m8, m5 psrlw m2, 8 ; 64 - m psllw m5, m2, 10 pmulhw m3, m5 pabsw m5, m4 paddw m0, m3 psubusw m3, m8, m5 psrlw m3, 8 phaddw m2, m3 psllw m3, 10 pmulhw m4, m3 paddw m1, m4 pmulhrsw m0, m6 pmulhrsw m1, m6 packuswb m0, m1 ret %macro W_MASK_422_BACKUP 1 ; mask_offset %if ARCH_X86_64 mova m10, m2 %else mova [maskq+16*%1], m2 %endif %endmacro %macro W_MASK_422_END 1 ; mask_offset %if ARCH_X86_64 packuswb m10, m2 psubb m1, m7, m10 pavgb m1, m9 %else mova m3, [maskq+16*%1] packuswb m3, m2 pxor m2, m2 psubb m1, m7, m3 pavgb m1, m2 %endif mova [maskq+16*%1], m1 %endmacro cglobal w_mask_422_8bpc, 4, 7, 11, dst, stride, tmp1, tmp2, w, h, mask %define base t0-w_mask_422_ssse3_table LEA t0, w_mask_422_ssse3_table tzcnt wd, wm mov r6d, r7m ; sign sub tmp2q, tmp1q movsxd wq, [t0+wq*4] mova m6, [base+pw_2048] movddup m7, [base+wm_422_sign+r6*8] ; 128 - sign add wq, t0 %if ARCH_X86_64 mova m8, [base+pw_6903] pxor m9, m9 movifnidn hd, hm %else add t0, w_mask_420_ssse3_table-w_mask_422_ssse3_table %define hd dword hm %endif mov maskq, maskmp call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main jmp wq .w4_loop: call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main add maskq, 8 lea dstq, [dstq+strideq*2] .w4: packuswb m2, m2 psubb m1, m7, m2 %if ARCH_X86_64 pavgb m1, m9 %else pxor m2, m2 pavgb m1, m2 %endif 
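; Rough sketch of the arithmetic in this .w4 tail (assuming, per the
; prologue comment, that the wm_422_sign bytes in m7 hold 128 - sign):
; .main leaves horizontal pair sums of (64 - m), i.e. 128 - (m0 + m1),
; which the packuswb above narrows to bytes, so
;     psubb            ->  (128 - sign) - (128 - (m0 + m1)) = m0 + m1 - sign
;     pavgb with zero  ->  (m0 + m1 + 1 - sign) >> 1
; i.e. the horizontally subsampled 4:2:2 mask value written to maskq below.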
movq [maskq], m1 movd [dstq+strideq*0], m0 pshuflw m1, m0, q1032 movd [dstq+strideq*1], m1 punpckhqdq m0, m0 lea dstq, [dstq+strideq*2] movd [dstq+strideq*0], m0 pshuflw m1, m0, q1032 movd [dstq+strideq*1], m1 sub hd, 4 jg .w4_loop RET .w8_loop: call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main add maskq, 16 lea dstq, [dstq+strideq*2] .w8: W_MASK_422_BACKUP 0 movq [dstq+strideq*0], m0 movhps [dstq+strideq*1], m0 call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main lea dstq, [dstq+strideq*2] W_MASK_422_END 0 movq [dstq+strideq*0], m0 movhps [dstq+strideq*1], m0 sub hd, 4 jg .w8_loop RET .w16_loop: call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main add maskq, 16 lea dstq, [dstq+strideq*2] .w16: W_MASK_422_BACKUP 0 mova [dstq+strideq*0], m0 call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main W_MASK_422_END 0 mova [dstq+strideq*1], m0 sub hd, 2 jg .w16_loop RET .w32_loop: call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main add maskq, 16 add dstq, strideq .w32: W_MASK_422_BACKUP 0 mova [dstq+16*0], m0 call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main W_MASK_422_END 0 mova [dstq+16*1], m0 dec hd jg .w32_loop RET .w64_loop: call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main add maskq, 16*2 add dstq, strideq .w64: W_MASK_422_BACKUP 0 mova [dstq+16*0], m0 call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main W_MASK_422_END 0 mova [dstq+16*1], m0 call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main W_MASK_422_BACKUP 1 mova [dstq+16*2], m0 call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main W_MASK_422_END 1 mova [dstq+16*3], m0 dec hd jg .w64_loop RET .w128_loop: call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main add maskq, 16*4 add dstq, strideq .w128: W_MASK_422_BACKUP 0 mova [dstq+16*0], m0 call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main W_MASK_422_END 0 mova [dstq+16*1], m0 call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main W_MASK_422_BACKUP 1 mova [dstq+16*2], m0 call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main W_MASK_422_END 1 mova [dstq+16*3], m0 call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main W_MASK_422_BACKUP 2 mova [dstq+16*4], m0 call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main W_MASK_422_END 2 mova [dstq+16*5], m0 call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main W_MASK_422_BACKUP 3 mova [dstq+16*6], m0 call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main W_MASK_422_END 3 mova [dstq+16*7], m0 dec hd jg .w128_loop RET cglobal w_mask_444_8bpc, 4, 7, 9, dst, stride, tmp1, tmp2, w, h, mask %define base t0-w_mask_444_ssse3_table LEA t0, w_mask_444_ssse3_table tzcnt wd, wm mov maskq, maskmp sub tmp2q, tmp1q movsxd wq, [t0+wq*4] mova m6, [base+pw_6903] mova m7, [base+pw_2048] add wq, t0 %if ARCH_X86_64 mova m8, [base+pb_64] movifnidn hd, hm %else %define m8 [base+pb_64] %define hd dword hm %endif call .main jmp wq .w4_loop: call .main lea dstq, [dstq+strideq*2] .w4: movd [dstq+strideq*0], m0 pshuflw m1, m0, q1032 movd [dstq+strideq*1], m1 punpckhqdq m0, m0 lea dstq, [dstq+strideq*2] movd [dstq+strideq*0], m0 pshuflw m1, m0, q1032 movd [dstq+strideq*1], m1 sub hd, 4 jg .w4_loop RET .w8_loop: call .main lea dstq, [dstq+strideq*2] .w8: movq [dstq+strideq*0], m0 movhps [dstq+strideq*1], m0 sub hd, 2 jg .w8_loop RET .w16_loop: call .main lea dstq, [dstq+strideq*2] .w16: mova [dstq+strideq*0], m0 call .main mova [dstq+strideq*1], m0 sub hd, 2 jg .w16_loop RET .w32_loop: call .main add dstq, strideq .w32: mova [dstq+16*0], m0 call .main mova [dstq+16*1], m0 
dec hd jg .w32_loop RET .w64_loop: call .main add dstq, strideq .w64: mova [dstq+16*0], m0 call .main mova [dstq+16*1], m0 call .main mova [dstq+16*2], m0 call .main mova [dstq+16*3], m0 dec hd jg .w64_loop RET .w128_loop: call .main add dstq, strideq .w128: mova [dstq+16*0], m0 call .main mova [dstq+16*1], m0 call .main mova [dstq+16*2], m0 call .main mova [dstq+16*3], m0 call .main mova [dstq+16*4], m0 call .main mova [dstq+16*5], m0 call .main mova [dstq+16*6], m0 call .main mova [dstq+16*7], m0 dec hd jg .w128_loop RET ALIGN function_align .main: mova m0, [tmp1q +16*0] mova m3, [tmp1q+tmp2q+16*0] mova m1, [tmp1q +16*1] mova m4, [tmp1q+tmp2q+16*1] add tmp1q, 16*2 psubw m3, m0 psubw m4, m1 pabsw m5, m3 psubusw m2, m6, m5 psrlw m2, 8 ; 64 - m psllw m5, m2, 10 pmulhw m3, m5 pabsw m5, m4 paddw m0, m3 psubusw m3, m6, m5 psrlw m3, 8 packuswb m2, m3 psllw m3, 10 pmulhw m4, m3 psubb m3, m8, m2 paddw m1, m4 pmulhrsw m0, m7 pmulhrsw m1, m7 mova [maskq], m3 add maskq, 16 packuswb m0, m1 ret %macro BLEND_64M 4; a, b, mask1, mask2 punpcklbw m0, %1, %2; {b;a}[7..0] punpckhbw %1, %2 ; {b;a}[15..8] pmaddubsw m0, %3 ; {b*m[0] + (64-m[0])*a}[7..0] u16 pmaddubsw %1, %4 ; {b*m[1] + (64-m[1])*a}[15..8] u16 pmulhrsw m0, m5 ; {((b*m[0] + (64-m[0])*a) + 1) / 32}[7..0] u16 pmulhrsw %1, m5 ; {((b*m[1] + (64-m[0])*a) + 1) / 32}[15..8] u16 packuswb m0, %1 ; {blendpx}[15..0] u8 %endmacro %macro BLEND 2; a, b psubb m3, m4, m0 ; m3 = (64 - m) punpcklbw m2, m3, m0 ; {m;(64-m)}[7..0] punpckhbw m3, m0 ; {m;(64-m)}[15..8] BLEND_64M %1, %2, m2, m3 %endmacro cglobal blend_8bpc, 3, 7, 7, dst, ds, tmp, w, h, mask %define base r6-blend_ssse3_table LEA r6, blend_ssse3_table tzcnt wd, wm movifnidn hd, hm movifnidn maskq, maskmp movsxd wq, dword [r6+wq*4] mova m4, [base+pb_64] mova m5, [base+pw_512] add wq, r6 lea r6, [dsq*3] jmp wq .w4: movq m0, [maskq]; m movd m1, [dstq+dsq*0] ; a movd m6, [dstq+dsq*1] punpckldq m1, m6 movq m6, [tmpq] ; b psubb m3, m4, m0 ; m3 = (64 - m) punpcklbw m2, m3, m0 ; {m;(64-m)}[7..0] punpcklbw m1, m6 ; {b;a}[7..0] pmaddubsw m1, m2 ; {b*m[0] + (64-m[0])*a}[7..0] u16 pmulhrsw m1, m5 ; {((b*m[0] + (64-m[0])*a) + 1) / 32}[7..0] u16 packuswb m1, m0 ; {blendpx}[15..0] u8 movd [dstq+dsq*0], m1 psrlq m1, 32 movd [dstq+dsq*1], m1 add maskq, 8 add tmpq, 8 lea dstq, [dstq+dsq*2] ; dst_stride * 2 sub hd, 2 jg .w4 RET .w8: mova m0, [maskq]; m movq m1, [dstq+dsq*0] ; a movhps m1, [dstq+dsq*1] mova m6, [tmpq] ; b BLEND m1, m6 movq [dstq+dsq*0], m0 movhps [dstq+dsq*1], m0 add maskq, 16 add tmpq, 16 lea dstq, [dstq+dsq*2] ; dst_stride * 2 sub hd, 2 jg .w8 RET .w16: mova m0, [maskq]; m mova m1, [dstq] ; a mova m6, [tmpq] ; b BLEND m1, m6 mova [dstq], m0 add maskq, 16 add tmpq, 16 add dstq, dsq ; dst_stride dec hd jg .w16 RET .w32: %assign i 0 %rep 2 mova m0, [maskq+16*i]; m mova m1, [dstq+16*i] ; a mova m6, [tmpq+16*i] ; b BLEND m1, m6 mova [dstq+i*16], m0 %assign i i+1 %endrep add maskq, 32 add tmpq, 32 add dstq, dsq ; dst_stride dec hd jg .w32 RET cglobal blend_v_8bpc, 3, 6, 6, dst, ds, tmp, w, h, mask %define base r5-blend_v_ssse3_table LEA r5, blend_v_ssse3_table tzcnt wd, wm movifnidn hd, hm movsxd wq, dword [r5+wq*4] mova m5, [base+pw_512] add wq, r5 add maskq, obmc_masks-blend_v_ssse3_table jmp wq .w2: movd m3, [maskq+4] punpckldq m3, m3 ; 2 mask blend is provided for 4 pixels / 2 lines .w2_loop: movd m1, [dstq+dsq*0] ; a {..;a;a} pinsrw m1, [dstq+dsq*1], 1 movd m2, [tmpq] ; b punpcklbw m0, m1, m2; {b;a}[7..0] pmaddubsw m0, m3 ; {b*m + (64-m)*a}[7..0] u16 pmulhrsw m0, m5 ; {((b*m + (64-m)*a) + 1) / 32}[7..0] 
u16 packuswb m0, m1 ; {blendpx}[8..0] u8 movd r3d, m0 mov [dstq+dsq*0], r3w shr r3d, 16 mov [dstq+dsq*1], r3w add tmpq, 2*2 lea dstq, [dstq + dsq * 2] sub hd, 2 jg .w2_loop RET .w4: movddup m3, [maskq+8] ; 4 mask blend is provided for 8 pixels / 2 lines .w4_loop: movd m1, [dstq+dsq*0] ; a movd m2, [dstq+dsq*1] ; punpckldq m1, m2 movq m2, [tmpq] ; b punpcklbw m1, m2 ; {b;a}[7..0] pmaddubsw m1, m3 ; {b*m + (64-m)*a}[7..0] u16 pmulhrsw m1, m5 ; {((b*m + (64-m)*a) + 1) / 32}[7..0] u16 packuswb m1, m1 ; {blendpx}[8..0] u8 movd [dstq], m1 psrlq m1, 32 movd [dstq+dsq*1], m1 add tmpq, 2*4 lea dstq, [dstq+dsq*2] sub hd, 2 jg .w4_loop RET .w8: mova m3, [maskq+16] ; 8 mask blend is provided for 16 pixels .w8_loop: movq m1, [dstq+dsq*0] ; a movhps m1, [dstq+dsq*1] mova m2, [tmpq]; b BLEND_64M m1, m2, m3, m3 movq [dstq+dsq*0], m0 movhps [dstq+dsq*1], m0 add tmpq, 16 lea dstq, [dstq+dsq*2] sub hd, 2 jg .w8_loop RET .w16: ; 16 mask blend is provided for 32 pixels mova m3, [maskq+32] ; obmc_masks_16[0] (64-m[0]) mova m4, [maskq+48] ; obmc_masks_16[1] (64-m[1]) .w16_loop: mova m1, [dstq] ; a mova m2, [tmpq] ; b BLEND_64M m1, m2, m3, m4 mova [dstq], m0 add tmpq, 16 add dstq, dsq dec hd jg .w16_loop RET .w32: %if WIN64 mova [rsp+8], xmm6 %endif mova m3, [maskq+64] ; obmc_masks_32[0] (64-m[0]) mova m4, [maskq+80] ; obmc_masks_32[1] (64-m[1]) mova m6, [maskq+96] ; obmc_masks_32[2] (64-m[2]) ; 16 mask blend is provided for 64 pixels .w32_loop: mova m1, [dstq+16*0] ; a mova m2, [tmpq+16*0] ; b BLEND_64M m1, m2, m3, m4 movq m1, [dstq+16*1] ; a punpcklbw m1, [tmpq+16*1] ; b pmaddubsw m1, m6 pmulhrsw m1, m5 packuswb m1, m1 mova [dstq+16*0], m0 movq [dstq+16*1], m1 add tmpq, 32 add dstq, dsq dec hd jg .w32_loop %if WIN64 mova xmm6, [rsp+8] %endif RET cglobal blend_h_8bpc, 3, 7, 6, dst, ds, tmp, w, h, mask %define base t0-blend_h_ssse3_table %if ARCH_X86_32 ; We need to keep the PIC pointer for w4, reload wd from stack instead DECLARE_REG_TMP 6 %else DECLARE_REG_TMP 5 mov r6d, wd %endif LEA t0, blend_h_ssse3_table tzcnt wd, wm mov hd, hm movsxd wq, dword [t0+wq*4] mova m5, [base+pw_512] add wq, t0 lea maskq, [base+obmc_masks+hq*2] lea hd, [hq*3] shr hd, 2 ; h * 3/4 lea maskq, [maskq+hq*2] neg hq jmp wq .w2: movd m0, [dstq+dsq*0] pinsrw m0, [dstq+dsq*1], 1 movd m2, [maskq+hq*2] movd m1, [tmpq] punpcklwd m2, m2 punpcklbw m0, m1 pmaddubsw m0, m2 pmulhrsw m0, m5 packuswb m0, m0 movd r3d, m0 mov [dstq+dsq*0], r3w shr r3d, 16 mov [dstq+dsq*1], r3w lea dstq, [dstq+dsq*2] add tmpq, 2*2 add hq, 2 jl .w2 RET .w4: %if ARCH_X86_32 mova m3, [base+blend_shuf] %else mova m3, [blend_shuf] %endif .w4_loop: movd m0, [dstq+dsq*0] movd m2, [dstq+dsq*1] punpckldq m0, m2 ; a movq m1, [tmpq] ; b movq m2, [maskq+hq*2] ; m pshufb m2, m3 punpcklbw m0, m1 pmaddubsw m0, m2 pmulhrsw m0, m5 packuswb m0, m0 movd [dstq+dsq*0], m0 psrlq m0, 32 movd [dstq+dsq*1], m0 lea dstq, [dstq+dsq*2] add tmpq, 4*2 add hq, 2 jl .w4_loop RET .w8: movd m4, [maskq+hq*2] punpcklwd m4, m4 pshufd m3, m4, q0000 pshufd m4, m4, q1111 movq m1, [dstq+dsq*0] ; a movhps m1, [dstq+dsq*1] mova m2, [tmpq] BLEND_64M m1, m2, m3, m4 movq [dstq+dsq*0], m0 movhps [dstq+dsq*1], m0 lea dstq, [dstq+dsq*2] add tmpq, 8*2 add hq, 2 jl .w8 RET ; w16/w32/w64/w128 .w16: %if ARCH_X86_32 mov r6d, wm %endif sub dsq, r6 .w16_loop0: movd m3, [maskq+hq*2] pshuflw m3, m3, q0000 punpcklqdq m3, m3 mov wd, r6d .w16_loop: mova m1, [dstq] ; a mova m2, [tmpq] ; b BLEND_64M m1, m2, m3, m3 mova [dstq], m0 add dstq, 16 add tmpq, 16 sub wd, 16 jg .w16_loop add dstq, dsq inc hq jl .w16_loop0 RET ; emu_edge 
args: ; const intptr_t bw, const intptr_t bh, const intptr_t iw, const intptr_t ih, ; const intptr_t x, const intptr_t y, pixel *dst, const ptrdiff_t dst_stride, ; const pixel *ref, const ptrdiff_t ref_stride ; ; bw, bh total filled size ; iw, ih, copied block -> fill bottom, right ; x, y, offset in bw/bh -> fill top, left cglobal emu_edge_8bpc, 10, 13, 2, bw, bh, iw, ih, x, \ y, dst, dstride, src, sstride, \ bottomext, rightext, blk ; we assume that the buffer (stride) is larger than width, so we can ; safely overwrite by a few bytes pxor m1, m1 %if ARCH_X86_64 %define reg_zero r12q %define reg_tmp r10 %define reg_src srcq %define reg_bottomext bottomextq %define reg_rightext rightextq %define reg_blkm r9m %else %define reg_zero r6 %define reg_tmp r0 %define reg_src r1 %define reg_bottomext r0 %define reg_rightext r1 %define reg_blkm r2m %endif ; ; ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride) xor reg_zero, reg_zero lea reg_tmp, [ihq-1] cmp yq, ihq cmovs reg_tmp, yq test yq, yq cmovs reg_tmp, reg_zero %if ARCH_X86_64 imul reg_tmp, sstrideq add srcq, reg_tmp %else imul reg_tmp, sstridem mov reg_src, srcm add reg_src, reg_tmp %endif ; ; ref += iclip(x, 0, iw - 1) lea reg_tmp, [iwq-1] cmp xq, iwq cmovs reg_tmp, xq test xq, xq cmovs reg_tmp, reg_zero add reg_src, reg_tmp %if ARCH_X86_32 mov srcm, reg_src %endif ; ; bottom_ext = iclip(y + bh - ih, 0, bh - 1) %if ARCH_X86_32 mov r1, r1m ; restore bh %endif lea reg_bottomext, [yq+bhq] sub reg_bottomext, ihq lea r3, [bhq-1] cmovs reg_bottomext, reg_zero ; DEFINE_ARGS bw, bh, iw, ih, x, \ topext, dst, dstride, src, sstride, \ bottomext, rightext, blk ; top_ext = iclip(-y, 0, bh - 1) neg topextq cmovs topextq, reg_zero cmp reg_bottomext, bhq cmovns reg_bottomext, r3 cmp topextq, bhq cmovg topextq, r3 %if ARCH_X86_32 mov r4m, reg_bottomext ; ; right_ext = iclip(x + bw - iw, 0, bw - 1) mov r0, r0m ; restore bw %endif lea reg_rightext, [xq+bwq] sub reg_rightext, iwq lea r2, [bwq-1] cmovs reg_rightext, reg_zero DEFINE_ARGS bw, bh, iw, ih, leftext, \ topext, dst, dstride, src, sstride, \ bottomext, rightext, blk ; left_ext = iclip(-x, 0, bw - 1) neg leftextq cmovs leftextq, reg_zero cmp reg_rightext, bwq cmovns reg_rightext, r2 %if ARCH_X86_32 mov r3m, r1 %endif cmp leftextq, bwq cmovns leftextq, r2 %undef reg_zero %undef reg_tmp %undef reg_src %undef reg_bottomext %undef reg_rightext DEFINE_ARGS bw, centerh, centerw, dummy, leftext, \ topext, dst, dstride, src, sstride, \ bottomext, rightext, blk ; center_h = bh - top_ext - bottom_ext %if ARCH_X86_64 lea r3, [bottomextq+topextq] sub centerhq, r3 %else mov r1, centerhm ; restore r1 sub centerhq, topextq sub centerhq, r4m mov r1m, centerhq %endif ; ; blk += top_ext * PXSTRIDE(dst_stride) mov r2, topextq %if ARCH_X86_64 imul r2, dstrideq %else mov r6, r6m ; restore dstq imul r2, dstridem %endif add dstq, r2 mov reg_blkm, dstq ; save pointer for ext ; ; center_w = bw - left_ext - right_ext mov centerwq, bwq %if ARCH_X86_64 lea r3, [rightextq+leftextq] sub centerwq, r3 %else sub centerwq, r3m sub centerwq, leftextq %endif ; vloop Macro %macro v_loop 3 ; need_left_ext, need_right_ext, suffix %if ARCH_X86_64 %define reg_tmp r12 %else %define reg_tmp r0 %endif .v_loop_%3: %if ARCH_X86_32 mov r0, r0m mov r1, r1m %endif %if %1 ; left extension %if ARCH_X86_64 movd m0, [srcq] %else mov r3, srcm movd m0, [r3] %endif pshufb m0, m1 xor r3, r3 .left_loop_%3: mova [dstq+r3], m0 add r3, mmsize cmp r3, leftextq jl .left_loop_%3 ; body lea reg_tmp, [dstq+leftextq] %endif xor r3, r3 .body_loop_%3: %if ARCH_X86_64 movu 
m0, [srcq+r3] %else mov r1, srcm movu m0, [r1+r3] %endif %if %1 movu [reg_tmp+r3], m0 %else movu [dstq+r3], m0 %endif add r3, mmsize cmp r3, centerwq jl .body_loop_%3 %if %2 ; right extension %if %1 add reg_tmp, centerwq %else lea reg_tmp, [dstq+centerwq] %endif %if ARCH_X86_64 movd m0, [srcq+centerwq-1] %else mov r3, srcm movd m0, [r3+centerwq-1] %endif pshufb m0, m1 xor r3, r3 .right_loop_%3: movu [reg_tmp+r3], m0 add r3, mmsize %if ARCH_X86_64 cmp r3, rightextq %else cmp r3, r3m %endif jl .right_loop_%3 %endif %if ARCH_X86_64 add dstq, dstrideq add srcq, sstrideq dec centerhq jg .v_loop_%3 %else add dstq, dstridem mov r0, sstridem add srcm, r0 sub dword centerhm, 1 jg .v_loop_%3 mov r0, r0m ; restore r0 %endif %endmacro ; vloop MACRO test leftextq, leftextq jnz .need_left_ext %if ARCH_X86_64 test rightextq, rightextq jnz .need_right_ext %else cmp leftextq, r3m ; leftextq == 0 jne .need_right_ext %endif v_loop 0, 0, 0 jmp .body_done ;left right extensions .need_left_ext: %if ARCH_X86_64 test rightextq, rightextq %else mov r3, r3m test r3, r3 %endif jnz .need_left_right_ext v_loop 1, 0, 1 jmp .body_done .need_left_right_ext: v_loop 1, 1, 2 jmp .body_done .need_right_ext: v_loop 0, 1, 3 .body_done: ; r0 ; bw ; r1 ;; x loop ; r4 ;; y loop ; r5 ; topextq ; r6 ;dstq ; r7 ;dstrideq ; r8 ; srcq %if ARCH_X86_64 %define reg_dstride dstrideq %else %define reg_dstride r2 %endif ; ; bottom edge extension %if ARCH_X86_64 test bottomextq, bottomextq jz .top %else xor r1, r1 cmp r1, r4m je .top %endif ; %if ARCH_X86_64 mov srcq, dstq sub srcq, dstrideq xor r1, r1 %else mov r3, dstq mov reg_dstride, dstridem sub r3, reg_dstride mov srcm, r3 %endif ; .bottom_x_loop: %if ARCH_X86_64 mova m0, [srcq+r1] lea r3, [dstq+r1] mov r4, bottomextq %else mov r3, srcm mova m0, [r3+r1] lea r3, [dstq+r1] mov r4, r4m %endif ; .bottom_y_loop: mova [r3], m0 add r3, reg_dstride dec r4 jg .bottom_y_loop add r1, mmsize cmp r1, bwq jl .bottom_x_loop .top: ; top edge extension test topextq, topextq jz .end %if ARCH_X86_64 mov srcq, reg_blkm %else mov r3, reg_blkm mov reg_dstride, dstridem %endif mov dstq, dstm xor r1, r1 ; .top_x_loop: %if ARCH_X86_64 mova m0, [srcq+r1] %else mov r3, reg_blkm mova m0, [r3+r1] %endif lea r3, [dstq+r1] mov r4, topextq ; .top_y_loop: mova [r3], m0 add r3, reg_dstride dec r4 jg .top_y_loop add r1, mmsize cmp r1, bwq jl .top_x_loop .end: RET %undef reg_dstride %undef reg_blkm %undef reg_tmp cextern resize_filter %macro SCRATCH 3 %if ARCH_X86_32 mova [rsp+%3*mmsize], m%1 %define m%2 [rsp+%3*mmsize] %else SWAP %1, %2 %endif %endmacro %if ARCH_X86_64 cglobal resize_8bpc, 0, 12, 14, dst, dst_stride, src, src_stride, \ dst_w, h, src_w, dx, mx0 %elif STACK_ALIGNMENT >= 16 cglobal resize_8bpc, 0, 7, 8, 3 * 16, dst, dst_stride, src, src_stride, \ dst_w, h, src_w, dx, mx0 %else cglobal resize_8bpc, 0, 6, 8, 3 * 16, dst, dst_stride, src, src_stride, \ dst_w, h, src_w, dx, mx0 %endif movifnidn dstq, dstmp movifnidn srcq, srcmp %if STACK_ALIGNMENT >= 16 movifnidn dst_wd, dst_wm %endif %if ARCH_X86_64 movifnidn hd, hm %endif sub dword mx0m, 4<<14 sub dword src_wm, 8 movd m7, dxm movd m6, mx0m movd m5, src_wm pshufd m7, m7, q0000 pshufd m6, m6, q0000 pshufd m5, m5, q0000 %if ARCH_X86_64 DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, h, x LEA r7, $$ %define base r7-$$ %else DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, x %define hd dword r5m %if STACK_ALIGNMENT >= 16 LEA r6, $$ %define base r6-$$ %else LEA r4, $$ %define base r4-$$ %endif %endif %if ARCH_X86_64 mova m10, [base+pw_m256] mova m9, 
[base+pd_63] mova m8, [base+pb_8x0_8x8] %else %define m10 [base+pw_m256] %define m9 [base+pd_63] %define m8 [base+pb_8x0_8x8] %endif pmaddwd m4, m7, [base+rescale_mul] ; dx*[0,1,2,3] pslld m7, 2 ; dx*4 pslld m5, 14 paddd m6, m4 ; mx+[0..3]*dx SCRATCH 7, 13, 0 SCRATCH 6, 12, 1 SCRATCH 5, 11, 2 ; m10 = pmulhrsw constant for x=(x+64)>>7 ; m12 = mx+[0..3]*dx, m13 = dx*4, m11 = src_w, m9 = 0x3f, m8=0,8 .loop_y: xor xd, xd mova m0, m12 ; per-line working version of mx .loop_x: pxor m1, m1 pcmpgtd m1, m0 pandn m1, m0 psrad m2, m0, 8 ; filter offset (unmasked) pcmpgtd m3, m11, m1 pand m1, m3 pandn m3, m11 por m1, m3 psubd m3, m0, m1 ; pshufb offset psrad m1, 14 ; clipped src_x offset psrad m3, 14 ; pshufb edge_emu offset pand m2, m9 ; filter offset (masked) ; load source pixels %if ARCH_X86_64 movd r8d, m1 pshuflw m1, m1, q3232 movd r9d, m1 punpckhqdq m1, m1 movd r10d, m1 psrlq m1, 32 movd r11d, m1 movq m4, [srcq+r8] movq m5, [srcq+r10] movhps m4, [srcq+r9] movhps m5, [srcq+r11] %else movd r3d, m1 pshufd m1, m1, q3312 movd r1d, m1 pshuflw m1, m1, q3232 movq m4, [srcq+r3] movq m5, [srcq+r1] movd r3d, m1 punpckhqdq m1, m1 movd r1d, m1 movhps m4, [srcq+r3] movhps m5, [srcq+r1] %endif ; if no emulation is required, we don't need to shuffle or emulate edges ; this also saves 2 quasi-vpgatherdqs pxor m6, m6 pcmpeqb m6, m3 %if ARCH_X86_64 pmovmskb r8d, m6 cmp r8d, 0xffff %else pmovmskb r3d, m6 cmp r3d, 0xffff %endif je .filter %if ARCH_X86_64 movd r8d, m3 pshuflw m3, m3, q3232 movd r9d, m3 punpckhqdq m3, m3 movd r10d, m3 psrlq m3, 32 movd r11d, m3 movsxd r8, r8d movsxd r9, r9d movsxd r10, r10d movsxd r11, r11d movq m6, [base+resize_shuf+4+r8] movq m7, [base+resize_shuf+4+r10] movhps m6, [base+resize_shuf+4+r9] movhps m7, [base+resize_shuf+4+r11] %else movd r3d, m3 pshufd m3, m3, q3312 movd r1d, m3 pshuflw m3, m3, q3232 movq m6, [base+resize_shuf+4+r3] movq m7, [base+resize_shuf+4+r1] movd r3d, m3 punpckhqdq m3, m3 movd r1d, m3 movhps m6, [base+resize_shuf+4+r3] movhps m7, [base+resize_shuf+4+r1] %endif paddb m6, m8 paddb m7, m8 pshufb m4, m6 pshufb m5, m7 .filter: %if ARCH_X86_64 movd r8d, m2 pshuflw m2, m2, q3232 movd r9d, m2 punpckhqdq m2, m2 movd r10d, m2 psrlq m2, 32 movd r11d, m2 movq m6, [base+resize_filter+r8*8] movq m7, [base+resize_filter+r10*8] movhps m6, [base+resize_filter+r9*8] movhps m7, [base+resize_filter+r11*8] %else movd r3d, m2 pshufd m2, m2, q3312 movd r1d, m2 pshuflw m2, m2, q3232 movq m6, [base+resize_filter+r3*8] movq m7, [base+resize_filter+r1*8] movd r3d, m2 punpckhqdq m2, m2 movd r1d, m2 movhps m6, [base+resize_filter+r3*8] movhps m7, [base+resize_filter+r1*8] %endif pmaddubsw m4, m6 pmaddubsw m5, m7 phaddw m4, m5 phaddsw m4, m4 pmulhrsw m4, m10 ; x=(x+64)>>7 packuswb m4, m4 movd [dstq+xq], m4 paddd m0, m13 add xd, 4 %if STACK_ALIGNMENT >= 16 cmp xd, dst_wd %else cmp xd, dst_wm %endif jl .loop_x add dstq, dst_stridemp add srcq, src_stridemp dec hd jg .loop_y RET INIT_XMM ssse3 PREP_BILIN PREP_8TAP WARP_AFFINE_8X8 WARP_AFFINE_8X8T INIT_XMM sse4 WARP_AFFINE_8X8 WARP_AFFINE_8X8T INIT_XMM sse2 PREP_BILIN PREP_8TAP WARP_AFFINE_8X8 WARP_AFFINE_8X8T av-scenechange-0.14.1/src/asm/x86/sad_avx.asm000064400000000000000000000140611046102023000167030ustar 00000000000000; ; Copyright (c) 2016, Alliance for Open Media. All rights reserved ; ; This source code is subject to the terms of the BSD 2 Clause License and ; the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License ; was not distributed with this source code in the LICENSE file, you can ; obtain it at www.aomedia.org/license/software. If the Alliance for Open ; Media Patent License 1.0 was not distributed with this source code in the ; PATENTS file, you can obtain it at www.aomedia.org/license/patent. ; ; %include "config.asm" %include "src/asm/x86/x86inc.asm" SECTION .text %macro SAD_FN 4 %if %4 == 0 %if %3 == 5 cglobal sad%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, n_rows %else ; %3 == 7 cglobal sad%1x%2, 4, %3, 6, src, src_stride, ref, ref_stride, \ src_stride3, ref_stride3, n_rows %endif ; %3 == 5/7 %else ; avg %if %3 == 5 cglobal sad%1x%2_avg, 5, 1 + %3, 5, src, src_stride, ref, ref_stride, \ second_pred, n_rows %else ; %3 == 7 cglobal sad%1x%2_avg, 5, ARCH_X86_64 + %3, 6, src, src_stride, \ ref, ref_stride, \ second_pred, \ src_stride3, ref_stride3 %if ARCH_X86_64 %define n_rowsd r7d %else ; x86-32 %define n_rowsd dword r0m %endif ; x86-32/64 %endif ; %3 == 5/7 %endif ; avg/sad movsxdifnidn src_strideq, src_strided movsxdifnidn ref_strideq, ref_strided %if %3 == 7 lea src_stride3q, [src_strideq*3] lea ref_stride3q, [ref_strideq*3] %endif ; %3 == 7 %endmacro ; unsigned int aom_sad128x128_avx2(uint8_t *src, int src_stride, ; uint8_t *ref, int ref_stride); %macro SAD128XN 1-2 0 SAD_FN 128, %1, 5, %2 mov n_rowsd, %1 pxor m0, m0 .loop: movu m1, [refq] movu m2, [refq+32] movu m3, [refq+64] movu m4, [refq+96] %if %2 == 1 vpavgb m1, [second_predq+mmsize*0] vpavgb m2, [second_predq+mmsize*1] vpavgb m3, [second_predq+mmsize*2] vpavgb m4, [second_predq+mmsize*3] lea second_predq, [second_predq+mmsize*4] %endif vpsadbw m1, [srcq] vpsadbw m2, [srcq+32] vpsadbw m3, [srcq+64] vpsadbw m4, [srcq+96] add refq, ref_strideq add srcq, src_strideq vpaddd m1, m2 vpaddd m3, m4 vpaddd m0, m1 vpaddd m0, m3 dec n_rowsd jg .loop vextracti128 xm1, m0, 1 paddd xm0, xm1 movhlps xm1, xm0 paddd xm0, xm1 movd eax, xm0 RET %endmacro INIT_YMM avx2 SAD128XN 128 ; sad128x128_avx2 SAD128XN 128, 1 ; sad128x128_avg_avx2 SAD128XN 64 ; sad128x64_avx2 SAD128XN 64, 1 ; sad128x64_avg_avx2 ; unsigned int aom_sad64x64_avx2(uint8_t *src, int src_stride, ; uint8_t *ref, int ref_stride); %macro SAD64XN 1-2 0 SAD_FN 64, %1, 5, %2 mov n_rowsd, %1/2 pxor m0, m0 .loop: movu m1, [refq] movu m2, [refq+32] movu m3, [refq+ref_strideq] movu m4, [refq+ref_strideq+32] %if %2 == 1 vpavgb m1, [second_predq+mmsize*0] vpavgb m2, [second_predq+mmsize*1] vpavgb m3, [second_predq+mmsize*2] vpavgb m4, [second_predq+mmsize*3] lea second_predq, [second_predq+mmsize*4] %endif vpsadbw m1, [srcq] vpsadbw m2, [srcq+32] vpsadbw m3, [srcq+src_strideq] vpsadbw m4, [srcq+src_strideq+32] vpaddd m1, m2 vpaddd m3, m4 lea refq, [refq+ref_strideq*2] vpaddd m0, m1 lea srcq, [srcq+src_strideq*2] vpaddd m0, m3 dec n_rowsd jg .loop vextracti128 xm1, m0, 1 paddd xm0, xm1 movhlps xm1, xm0 paddd xm0, xm1 movd eax, xm0 RET %endmacro INIT_YMM avx2 SAD64XN 128 ; sad64x128_avx2 SAD64XN 128, 1 ; sad64x128_avg_avx2 SAD64XN 64 ; sad64x64_avx2 SAD64XN 32 ; sad64x32_avx2 SAD64XN 64, 1 ; sad64x64_avg_avx2 SAD64XN 32, 1 ; sad64x32_avg_avx2 SAD64XN 16 ; sad64x16_avx2 SAD64XN 16, 1 ; sad64x16_avg_avx2 ; unsigned int aom_sad32x32_avx2(uint8_t *src, int src_stride, ; uint8_t *ref, int ref_stride); %macro SAD32XN 1-2 0 SAD_FN 32, %1, 7, %2 mov n_rowsd, %1/4 pxor m0, m0 .loop: movu m1, [refq] movu m2, [refq+ref_strideq] movu m3, [refq+ref_strideq*2] movu m4, [refq+ref_stride3q] %if %2 == 1 vpavgb m1, [second_predq+mmsize*0] vpavgb m2, [second_predq+mmsize*1] vpavgb 
m3, [second_predq+mmsize*2] vpavgb m4, [second_predq+mmsize*3] lea second_predq, [second_predq+mmsize*4] %endif psadbw m1, [srcq] psadbw m2, [srcq+src_strideq] psadbw m3, [srcq+src_strideq*2] psadbw m4, [srcq+src_stride3q] vpaddd m1, m2 vpaddd m3, m4 lea refq, [refq+ref_strideq*4] vpaddd m0, m1 lea srcq, [srcq+src_strideq*4] vpaddd m0, m3 dec n_rowsd jg .loop vextracti128 xm1, m0, 1 paddd xm0, xm1 movhlps xm1, xm0 paddd xm0, xm1 movd eax, xm0 RET %endmacro INIT_YMM avx2 SAD32XN 64 ; sad32x64_avx2 SAD32XN 32 ; sad32x32_avx2 SAD32XN 16 ; sad32x16_avx2 SAD32XN 64, 1 ; sad32x64_avg_avx2 SAD32XN 32, 1 ; sad32x32_avg_avx2 SAD32XN 16, 1 ; sad32x16_avg_avx2 SAD32XN 8 ; sad_32x8_avx2 SAD32XN 8, 1 ; sad_32x8_avg_avx2 av-scenechange-0.14.1/src/asm/x86/sad_plane.asm000064400000000000000000000163201046102023000172040ustar 00000000000000; Copyright (c) 2022, The rav1e contributors. All rights reserved ; ; This source code is subject to the terms of the BSD 2 Clause License and ; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License ; was not distributed with this source code in the LICENSE file, you can ; obtain it at www.aomedia.org/license/software. If the Alliance for Open ; Media Patent License 1.0 was not distributed with this source code in the ; PATENTS file, you can obtain it at www.aomedia.org/license/patent. %include "config.asm" %include "src/asm/x86/x86inc.asm" %if ARCH_X86_64 SECTION_RODATA align 32 mask_lut: db \ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \ -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \ -1,-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \ -1,-1,-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \ -1,-1,-1,-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \ -1,-1,-1,-1,-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \ -1,-1,-1,-1,-1,-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \ -1,-1,-1,-1,-1,-1,-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \ -1,-1,-1,-1,-1,-1,-1,-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \ -1,-1,-1,-1,-1,-1,-1,-1,-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \ 
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, \ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, 0, 0, 0, 0, 0, 0, 0, 0, \ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, 0, 0, 0, 0, 0, 0, 0, \ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, 0, 0, 0, 0, 0, 0, \ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, 0, 0, 0, 0, 0, \ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, 0, 0, 0, 0, \ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, 0, 0, 0, \ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, 0, 0, \ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, 0, %macro JMP_TABLE 3-* %xdefine %%func mangle(private_prefix %+ _%1_%2) %xdefine %%table %1_%2_table %%table: %rep %0 - 2 dd (%%func %+ .%3) - (%%table) %rotate 1 %endrep %endmacro JMP_TABLE sad_plane_8bpc, avx2, vec0, vec1, vec2, vec3 JMP_TABLE sad_plane_8bpc, sse2, vec0, vec1, vec2, vec3 %use ifunc SECTION .text %macro SAD_PLANE_FN 0 cglobal sad_plane_8bpc, 5, 9, 9, p1, p2, stride, width, rows, \ resid_simd, resid, width_unrll, tmp0 mov resid_simdq, widthq mov residd, widthd and residd, mmsize - 1 and resid_simdq, -(mmsize) and widthq, -(4*mmsize) ; LUT row size is always 32 regardless of mmsize (because the ; start of the rows would be the same, so we reuse the same LUT) shl residd, ilog2(32) pxor xm0, xm0 pxor xm1, xm1 pxor xm2, xm2 pxor xm3, xm3 ; load mask from lookup table into m8 lea tmp0q, [mask_lut] mova m8, [tmp0q + residq] DEFINE_ARGS p1, p2, stride, width, rows, \ resid_simd, resid, width_unrll, skip_ptr sub resid_simdq, widthq ; need to divide by mmsize to load skip pointer shr resid_simdq, ilog2(mmsize) %if mmsize == 32 %define jmp_table sad_plane_8bpc_avx2_table %elif mmsize == 16 %define jmp_table sad_plane_8bpc_sse2_table %endif lea r6, [jmp_table] movsxd skip_ptrq, [r6 + 4*resid_simdq] add skip_ptrq, r6 ; shift back (for residual to load correct number of bytes) shl resid_simdq, ilog2(mmsize) ; set pointer to point after end of width of first row add p1q, widthq add p2q, widthq mov width_unrllq, widthq neg widthq .loop_row: test widthq, widthq jz .skip .loop: mova m4, [p1q + widthq + 0*mmsize] mova m5, [p1q + widthq + 1*mmsize] mova m6, [p1q + widthq + 2*mmsize] mova m7, [p1q + widthq + 3*mmsize] psadbw m4, m4, [p2q + widthq + 0*mmsize] psadbw m5, m5, [p2q + widthq + 1*mmsize] psadbw m6, m6, [p2q + widthq + 2*mmsize] psadbw m7, m7, [p2q + widthq + 3*mmsize] paddq m0, m4 paddq m1, m5 paddq m2, m6 paddq m3, m7 add widthq, 4*mmsize jnz .loop .skip: jmp skip_ptrq .vec3: mova m6, [p1q + 2*mmsize] psadbw m6, m6, [p2q + 2*mmsize] paddq m2, m6 .vec2: mova m5, [p1q + 1*mmsize] psadbw m5, m5, [p2q + 1*mmsize] paddq m1, m5 .vec1: mova m4, [p1q + 0*mmsize] psadbw m4, m4, [p2q + 0*mmsize] paddq m0, m4 .vec0: ; skip residual element add if necessary test residd, residd jz .next_row ; load residual elements and mask out elements past the width pand m4, m8, [p1q + resid_simdq] pand m5, m8, [p2q + resid_simdq] psadbw m4, m4, m5 paddq m2, m4 .next_row: ; width is 0 after the unrolled loop, so subtracting is basically a mov + 
neg sub widthq, width_unrllq ; since we started with p1+width, adding stride will get the ; pointer at the end of the next row add p1q, strideq add p2q, strideq dec rowsd jnz .loop_row ; final horizontal reduction paddq m2, m3 paddq m0, m1 paddq m0, m2 %if mmsize == 32 vextracti128 xm1, ym0, 1 paddq xm0, xm1 %endif pshufd xm1, xm0, q0032 paddq xm0, xm1 movq rax, xm0 RET %endmacro INIT_XMM sse2 SAD_PLANE_FN INIT_YMM avx2 SAD_PLANE_FN %endif ; ARCH_X86_64 av-scenechange-0.14.1/src/asm/x86/sad_sse2.asm000064400000000000000000000252671046102023000167730ustar 00000000000000; ; Copyright (c) 2016, Alliance for Open Media. All rights reserved ; ; This source code is subject to the terms of the BSD 2 Clause License and ; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License ; was not distributed with this source code in the LICENSE file, you can ; obtain it at www.aomedia.org/license/software. If the Alliance for Open ; Media Patent License 1.0 was not distributed with this source code in the ; PATENTS file, you can obtain it at www.aomedia.org/license/patent. ; ; %include "config.asm" %include "src/asm/x86/x86inc.asm" SECTION .text %macro SAD_FN 4 %if %4 == 0 %if %3 == 5 cglobal sad%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, n_rows %else ; %3 == 7 cglobal sad%1x%2, 4, %3, 6, src, src_stride, ref, ref_stride, \ src_stride3, ref_stride3, n_rows %endif ; %3 == 5/7 %else ; avg %if %3 == 5 cglobal sad%1x%2_avg, 5, 1 + %3, 5, src, src_stride, ref, ref_stride, \ second_pred, n_rows %else ; %3 == 7 cglobal sad%1x%2_avg, 5, ARCH_X86_64 + %3, 6, src, src_stride, \ ref, ref_stride, \ second_pred, \ src_stride3, ref_stride3 %if ARCH_X86_64 %define n_rowsd r7d %else ; x86-32 %define n_rowsd dword r0m %endif ; x86-32/64 %endif ; %3 == 5/7 %endif ; avg/sad movsxdifnidn src_strideq, src_strided movsxdifnidn ref_strideq, ref_strided %if %3 == 7 lea src_stride3q, [src_strideq*3] lea ref_stride3q, [ref_strideq*3] %endif ; %3 == 7 %endmacro ; unsigned int aom_sad128x128_sse2(uint8_t *src, int src_stride, ; uint8_t *ref, int ref_stride); %macro SAD128XN 1-2 0 SAD_FN 128, %1, 5, %2 mov n_rowsd, %1 pxor m0, m0 .loop: movu m1, [refq] movu m2, [refq+16] movu m3, [refq+32] movu m4, [refq+48] %if %2 == 1 pavgb m1, [second_predq+mmsize*0] pavgb m2, [second_predq+mmsize*1] pavgb m3, [second_predq+mmsize*2] pavgb m4, [second_predq+mmsize*3] %endif psadbw m1, [srcq] psadbw m2, [srcq+16] psadbw m3, [srcq+32] psadbw m4, [srcq+48] paddd m1, m2 paddd m3, m4 paddd m0, m1 paddd m0, m3 movu m1, [refq+64] movu m2, [refq+80] movu m3, [refq+96] movu m4, [refq+112] %if %2 == 1 pavgb m1, [second_predq+mmsize*4] pavgb m2, [second_predq+mmsize*5] pavgb m3, [second_predq+mmsize*6] pavgb m4, [second_predq+mmsize*7] lea second_predq, [second_predq+mmsize*8] %endif psadbw m1, [srcq+64] psadbw m2, [srcq+80] psadbw m3, [srcq+96] psadbw m4, [srcq+112] add refq, ref_strideq add srcq, src_strideq paddd m1, m2 paddd m3, m4 paddd m0, m1 paddd m0, m3 sub n_rowsd, 1 jg .loop movhlps m1, m0 paddd m0, m1 movd eax, m0 RET %endmacro INIT_XMM sse2 SAD128XN 128 ; sad128x128_sse2 SAD128XN 128, 1 ; sad128x128_avg_sse2 SAD128XN 64 ; sad128x64_sse2 SAD128XN 64, 1 ; sad128x64_avg_sse2 ; unsigned int aom_sad64x64_sse2(uint8_t *src, int src_stride, ; uint8_t *ref, int ref_stride); %macro SAD64XN 1-2 0 SAD_FN 64, %1, 5, %2 mov n_rowsd, %1 pxor m0, m0 .loop: movu m1, [refq] movu m2, [refq+16] movu m3, [refq+32] movu m4, [refq+48] %if %2 == 1 pavgb m1, [second_predq+mmsize*0] pavgb m2, [second_predq+mmsize*1] pavgb m3, 
[second_predq+mmsize*2] pavgb m4, [second_predq+mmsize*3] lea second_predq, [second_predq+mmsize*4] %endif psadbw m1, [srcq] psadbw m2, [srcq+16] psadbw m3, [srcq+32] psadbw m4, [srcq+48] paddd m1, m2 paddd m3, m4 add refq, ref_strideq paddd m0, m1 add srcq, src_strideq paddd m0, m3 dec n_rowsd jg .loop movhlps m1, m0 paddd m0, m1 movd eax, m0 RET %endmacro INIT_XMM sse2 SAD64XN 128 ; sad64x128_sse2 SAD64XN 128, 1 ; sad64x128_avg_sse2 SAD64XN 64 ; sad64x64_sse2 SAD64XN 32 ; sad64x32_sse2 SAD64XN 64, 1 ; sad64x64_avg_sse2 SAD64XN 32, 1 ; sad64x32_avg_sse2 SAD64XN 16 ; sad64x16_sse2 SAD64XN 16, 1 ; sad64x16_avg_sse2 ; unsigned int aom_sad32x32_sse2(uint8_t *src, int src_stride, ; uint8_t *ref, int ref_stride); %macro SAD32XN 1-2 0 SAD_FN 32, %1, 5, %2 mov n_rowsd, %1/2 pxor m0, m0 .loop: movu m1, [refq] movu m2, [refq+16] movu m3, [refq+ref_strideq] movu m4, [refq+ref_strideq+16] %if %2 == 1 pavgb m1, [second_predq+mmsize*0] pavgb m2, [second_predq+mmsize*1] pavgb m3, [second_predq+mmsize*2] pavgb m4, [second_predq+mmsize*3] lea second_predq, [second_predq+mmsize*4] %endif psadbw m1, [srcq] psadbw m2, [srcq+16] psadbw m3, [srcq+src_strideq] psadbw m4, [srcq+src_strideq+16] paddd m1, m2 paddd m3, m4 lea refq, [refq+ref_strideq*2] paddd m0, m1 lea srcq, [srcq+src_strideq*2] paddd m0, m3 dec n_rowsd jg .loop movhlps m1, m0 paddd m0, m1 movd eax, m0 RET %endmacro INIT_XMM sse2 SAD32XN 64 ; sad32x64_sse2 SAD32XN 32 ; sad32x32_sse2 SAD32XN 16 ; sad32x16_sse2 SAD32XN 64, 1 ; sad32x64_avg_sse2 SAD32XN 32, 1 ; sad32x32_avg_sse2 SAD32XN 16, 1 ; sad32x16_avg_sse2 SAD32XN 8 ; sad_32x8_sse2 SAD32XN 8, 1 ; sad_32x8_avg_sse2 ; unsigned int aom_sad16x{8,16}_sse2(uint8_t *src, int src_stride, ; uint8_t *ref, int ref_stride); %macro SAD16XN 1-2 0 SAD_FN 16, %1, 7, %2 mov n_rowsd, %1/4 pxor m0, m0 .loop: movu m1, [refq] movu m2, [refq+ref_strideq] movu m3, [refq+ref_strideq*2] movu m4, [refq+ref_stride3q] %if %2 == 1 pavgb m1, [second_predq+mmsize*0] pavgb m2, [second_predq+mmsize*1] pavgb m3, [second_predq+mmsize*2] pavgb m4, [second_predq+mmsize*3] lea second_predq, [second_predq+mmsize*4] %endif psadbw m1, [srcq] psadbw m2, [srcq+src_strideq] psadbw m3, [srcq+src_strideq*2] psadbw m4, [srcq+src_stride3q] paddd m1, m2 paddd m3, m4 lea refq, [refq+ref_strideq*4] paddd m0, m1 lea srcq, [srcq+src_strideq*4] paddd m0, m3 dec n_rowsd jg .loop movhlps m1, m0 paddd m0, m1 movd eax, m0 RET %endmacro INIT_XMM sse2 SAD16XN 32 ; sad16x32_sse2 SAD16XN 16 ; sad16x16_sse2 SAD16XN 8 ; sad16x8_sse2 SAD16XN 32, 1 ; sad16x32_avg_sse2 SAD16XN 16, 1 ; sad16x16_avg_sse2 SAD16XN 8, 1 ; sad16x8_avg_sse2 SAD16XN 4 ; sad_16x4_sse2 SAD16XN 4, 1 ; sad_16x4_avg_sse2 SAD16XN 64 ; sad_16x64_sse2 SAD16XN 64, 1 ; sad_16x64_avg_sse2 ; unsigned int aom_sad8x{8,16}_sse2(uint8_t *src, int src_stride, ; uint8_t *ref, int ref_stride); %macro SAD8XN 1-2 0 SAD_FN 8, %1, 7, %2 mov n_rowsd, %1/4 pxor m0, m0 .loop: movh m1, [refq] movhps m1, [refq+ref_strideq] movh m2, [refq+ref_strideq*2] movhps m2, [refq+ref_stride3q] %if %2 == 1 pavgb m1, [second_predq+mmsize*0] pavgb m2, [second_predq+mmsize*1] lea second_predq, [second_predq+mmsize*2] %endif movh m3, [srcq] movhps m3, [srcq+src_strideq] movh m4, [srcq+src_strideq*2] movhps m4, [srcq+src_stride3q] psadbw m1, m3 psadbw m2, m4 lea refq, [refq+ref_strideq*4] paddd m0, m1 lea srcq, [srcq+src_strideq*4] paddd m0, m2 dec n_rowsd jg .loop movhlps m1, m0 paddd m0, m1 movd eax, m0 RET %endmacro INIT_XMM sse2 SAD8XN 16 ; sad8x16_sse2 SAD8XN 8 ; sad8x8_sse2 SAD8XN 4 ; sad8x4_sse2 SAD8XN 16, 1 ; 
sad8x16_avg_sse2 SAD8XN 8, 1 ; sad8x8_avg_sse2 SAD8XN 4, 1 ; sad8x4_avg_sse2 SAD8XN 32 ; sad_8x32_sse2 SAD8XN 32, 1 ; sad_8x32_avg_sse2 ; unsigned int aom_sad4x{4, 8}_sse2(uint8_t *src, int src_stride, ; uint8_t *ref, int ref_stride); %macro SAD4XN 1-2 0 SAD_FN 4, %1, 7, %2 mov n_rowsd, %1/4 pxor m0, m0 .loop: movd m1, [refq] movd m2, [refq+ref_strideq] movd m3, [refq+ref_strideq*2] movd m4, [refq+ref_stride3q] punpckldq m1, m2 punpckldq m3, m4 movlhps m1, m3 %if %2 == 1 pavgb m1, [second_predq+mmsize*0] lea second_predq, [second_predq+mmsize*1] %endif movd m2, [srcq] movd m5, [srcq+src_strideq] movd m4, [srcq+src_strideq*2] movd m3, [srcq+src_stride3q] punpckldq m2, m5 punpckldq m4, m3 movlhps m2, m4 psadbw m1, m2 lea refq, [refq+ref_strideq*4] paddd m0, m1 lea srcq, [srcq+src_strideq*4] dec n_rowsd jg .loop movhlps m1, m0 paddd m0, m1 movd eax, m0 RET %endmacro INIT_XMM sse2 SAD4XN 8 ; sad4x8_sse SAD4XN 4 ; sad4x4_sse SAD4XN 8, 1 ; sad4x8_avg_sse SAD4XN 4, 1 ; sad4x4_avg_sse SAD4XN 16 ; sad_4x16_sse2 SAD4XN 16, 1 ; sad_4x16_avg_sse2 av-scenechange-0.14.1/src/asm/x86/satd.asm000064400000000000000000000775421046102023000162260ustar 00000000000000; Copyright (c) 2019, The rav1e contributors. All rights reserved ; ; This source code is subject to the terms of the BSD 2 Clause License and ; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License ; was not distributed with this source code in the LICENSE file, you can ; obtain it at www.aomedia.org/license/software. If the Alliance for Open ; Media Patent License 1.0 was not distributed with this source code in the ; PATENTS file, you can obtain it at www.aomedia.org/license/patent. %include "config.asm" %include "src/asm/x86/x86inc.asm" %if ARCH_X86_64 SECTION_RODATA 32 maddubsw_hsub: times 16 db 1, -1 SECTION .text %define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX) ; Perform 4x4 hadamard transform on input with 2 rows per register. ; Rows 0 and 2 are in m0 and rows 1 and 3 are in m1. ; A second set of packed input can also be taken in m2 and m3. ; Ends with sums in every other entry (i.e. already reduced horizontally). 
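; As a scalar reference for what the packed code below computes (an
; illustrative sketch only), a single 4-point Hadamard butterfly over a
; column a0..a3 is:
;   t0 = a0 + a1    t1 = a0 - a1    t2 = a2 + a3    t3 = a2 - a3
;   b0 = t0 + t2    b1 = t1 + t3    b2 = t0 - t2    b3 = t1 - t3
; and the 2D transform applies this once along each axis of the 4x4 block
; of differences.  The macro merges the last butterfly stage with the
; absolute value using (abs(a+b) + abs(a-b)) / 2 == max(abs(a), abs(b));
; for example a = 3, b = -5 gives (2 + 8) / 2 = 5 = max(3, 5).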
%macro HADAMARD_4x4_PACKED 1 %if %1 == 1 %define tmp m2 ; 2->0, 1->2, 0->2 %define ROTATE SWAP 2, 1, 0 %elif %1 == 2 %define tmp m4 ; 4->0, 3->2, 2->3, 1->2, 0->1 %define ROTATE SWAP 4, 3, 2, 1, 0 %endif ; m0 d2 c2 b2 a2 d0 c0 b0 a0 ; m1 d3 c3 b3 a3 d1 c1 b1 a1 ; Stage 1 ; m0 d2+d3 c2+c3 b2+b3 a2+a3 d0+d1 c0+c1 b0+b1 a0+a1 ; m1 d2-d3 c2-c3 b2-b3 a2-a3 d0-d1 c0-c1 b0-b1 a0-a1 paddw tmp, m0, m1 psubw m0, m1 %if %1 == 2 paddw m1, m2, m3 psubw m2, m3 %endif ROTATE ; Stage 2 ; m0 d0-d1 d0+d1 c0-c1 c0+c1 b0-b1 b0+b1 a0-a1 a0+a1 ; m1 d2-d3 d2+d3 c2-c3 c2+c3 b2-b3 b2+b3 a2-a3 a2+a3 punpcklwd tmp, m0, m1 punpckhwd m0, m1 %if %1 == 2 punpcklwd m1, m2, m3 punpckhwd m2, m3 %endif ROTATE ; m0 d0-d1+d2-d3 d0+d1+d2+d3 c0-c1+c2-c3 c0+c1+c2+c3 ; b0-b1+b2-b3 b0+b1+b2+b3 a0-a1+a2-a3 a0+a1+a2+a3 ; m1 d0-d2-d2+d3 d0+d1-d2-d3 c0-c1-c2+c3 c0+c1-c2-c3 ; b0-b1-b2+b3 b0+b1-b2-b3 a0-a1-a2-a3 a0+a1-a2-a3 paddw tmp, m0, m1 psubw m0, m1 %if %1 == 2 paddw m1, m2, m3 psubw m2, m3 %endif ROTATE ; m0 s2 s0 r2 r0 q2 q0 p2 p0 ; m1 s3 s1 r3 r1 q3 q1 p3 p1 ; Stage 1 ; m0 q3 q1 q2 q0 p3 p1 p2 p0 ; m1 s3 s1 s2 s0 r3 r1 r2 r0 punpckldq tmp, m0, m1 punpckhdq m0, m1 %if %1 == 2 punpckldq m1, m2, m3 punpckhdq m2, m3 %endif ROTATE ; m0 q3+s3 q1+s1 q2+s2 q0+s0 p3+r3 p1+r1 p2+r2 p0+r0 ; m1 q3-s3 q1-s1 q2-s2 q0-s0 p3-r3 p1-r1 p2-r2 p0-r0 paddw tmp, m0, m1 psubw m0, m1 %if %1 == 2 paddw m1, m2, m3 psubw m2, m3 %endif ROTATE ; Stage 2 ; m0 p3-r3 p1-r1 p2-r2 p0-r0 p3+r3 p1+r1 p2+r2 p0+r0 ; m1 q3-s3 q1-s1 q2-s2 q0-s0 q3+s3 q1+s1 q2+s2 q0+s0 punpcklqdq tmp, m0, m1 punpckhqdq m0, m1 %if %1 == 2 punpcklqdq m1, m2, m3 punpckhqdq m2, m3 %endif ROTATE ; Use the fact that ; (abs(a+b)+abs(a-b))/2 = max(abs(a),abs(b)) ; to merge the final butterfly with the abs and the first stage of ; accumulation. ; Avoid pabsw by using max(a, b) + max(a + b + 0x7FFF, 0x7FFF) instead. ; Actually calculates (abs(a+b)+abs(a-b))/2-0x7FFF. ; The final sum must be offset to compensate for subtracting 0x7FFF. paddw tmp, m0, m1 pmaxsw m0, m1 ; m1 is free ; 0x7FFF pcmpeqb m1, m1 psrlw m1, 1 paddsw tmp, m1 psubw m0, tmp %if %1 == 2 paddw tmp, m2, m3 pmaxsw m2, m3 paddsw tmp, m1 psubw m2, tmp paddw m0, m2 %endif %endmacro ; Load diffs of 4 entries for 2 rows %macro LOAD_PACK_DIFF_Dx2 7 movd m%1, %2 movd m%6, %4 punpckldq m%1, m%6 pmovzxbw m%1, m%1 movd m%6, %3 movd m%7, %5 punpckldq m%6, m%7 pmovzxbw m%6, m%6 psubw m%1, m%6 %endmacro ; Can only use 128-bit vectors %macro SATD_4x4_FN 0 cglobal satd_4x4, 4, 6, 4, src, src_stride, dst, dst_stride, \ src_stride3, dst_stride3 lea src_stride3q, [src_strideq*3] lea dst_stride3q, [dst_strideq*3] ; Load rows 0 and 2 to m0 and 1 and 3 to m1 LOAD_PACK_DIFF_Dx2 0, [srcq], [dstq], \ [srcq+src_strideq*2], [dstq+dst_strideq*2], \ 2, 3 LOAD_PACK_DIFF_Dx2 1, [srcq+src_strideq*1], [dstq+dst_strideq*1], \ [srcq+src_stride3q], [dstq+dst_stride3q], \ 2, 3 HADAMARD_4x4_PACKED 1 ; Reduce horizontally pshufd m1, m0, q3232 paddw m0, m1 pshuflw m1, m0, q3232 paddw m0, m1 pshuflw m1, m0, q1111 ; Perform normalization during the final stage of accumulation pavgw m0, m1 movd eax, m0 movzx eax, ax ; Add an offset for how the final butterfly stage and the first stage of ; accumulation was done. Since this offset is an even number, this can ; safely be done after normalization using pavgw. 
sub ax, 4 RET %endmacro INIT_XMM sse4 SATD_4x4_FN INIT_XMM avx2 SATD_4x4_FN ; Load diffs of 8 entries for 2 row ; Each set of 4 columns share an 128-bit lane %macro LOAD_PACK_DIFF_Qx2 7 movq xm%1, %2 movq xm%6, %4 punpckldq xm%1, xm%6 pmovzxbw m%1, xm%1 movq xm%6, %3 movq xm%7, %5 punpckldq xm%6, xm%7 pmovzxbw m%6, xm%6 psubw m%1, m%6 %endmacro INIT_YMM avx2 cglobal satd_8x4, 4, 6, 4, src, src_stride, dst, dst_stride, \ src_stride3, dst_stride3 lea src_stride3q, [src_strideq*3] lea dst_stride3q, [dst_strideq*3] ; Load rows 0 and 2 to m0 and 1 and 3 to m1 ; Each set of 4 columns share 128-bit lanes LOAD_PACK_DIFF_Qx2 0, [srcq], [dstq], \ [srcq+src_strideq*2], [dstq+dst_strideq*2], \ 2, 3 LOAD_PACK_DIFF_Qx2 1, [srcq+src_strideq*1], [dstq+dst_strideq*1], \ [srcq+src_stride3q], [dstq+dst_stride3q], \ 2, 3 HADAMARD_4x4_PACKED 1 ; Reduce horizontally vextracti128 xm1, m0, 1 paddw xm0, xm1 pshufd xm1, xm0, q3232 paddw xm0, xm1 pshuflw xm1, xm0, q3232 paddw xm0, xm1 pshuflw xm1, xm0, q1111 ; Perform normalization during the final stage of accumulation pavgw xm0, xm1 movd eax, xm0 movzx eax, ax ; Add an offset for how the final butterfly stage and the first stage of ; accumulation was done. Since this offset is an even number, this can ; safely be done after normalization using pavgw. sub ax, 8 RET ; Load diffs of 4 entries for 4 rows ; Each set of two rows share 128-bit lanes %macro LOAD_PACK_DIFF_Dx4 12 movd xm%1, %2 movd xm%10, %4 punpckldq xm%1, xm%10 movd xm%10, %6 movd xm%11, %8 punpckldq xm%10, xm%11 punpcklqdq xm%1, xm%10 pmovzxbw m%1, xm%1 movd xm%10, %3 movd xm%11, %5 punpckldq xm%10, xm%11 movd xm%11, %7 movd xm%12, %9 punpckldq xm%11, xm%12 punpcklqdq xm%10, xm%11 pmovzxbw m%10, xm%10 psubw m%1, m%10 %endmacro INIT_YMM avx2 cglobal satd_4x8, 4, 8, 5, src, src_stride, dst, dst_stride, \ src4, dst4, src_stride3, dst_stride3 lea src_stride3q, [src_strideq*3] lea dst_stride3q, [dst_strideq*3] lea src4q, [srcq+src_strideq*4] lea dst4q, [dstq+dst_strideq*4] ; Load rows 0, 2, 4 and 6 to m0 and 1, 3, 5 and 7 to m1. ; Lanes split the low and high rows of m0 and m1. LOAD_PACK_DIFF_Dx4 0, [srcq], [dstq], \ [srcq+src_strideq*2], [dstq+dst_strideq*2], \ [src4q], [dst4q], \ [src4q+src_strideq*2], [dst4q+dst_strideq*2], \ 2, 3, 4 LOAD_PACK_DIFF_Dx4 1, [srcq+src_strideq*1], [dstq+dst_strideq*1], \ [srcq+src_stride3q], [dstq+dst_stride3q], \ [src4q+src_strideq*1], [dst4q+dst_strideq*1], \ [src4q+src_stride3q], [dst4q+dst_stride3q], \ 2, 3, 4 HADAMARD_4x4_PACKED 1 ; Reduce horizontally vextracti128 xm1, m0, 1 paddw xm0, xm1 pshufd xm1, xm0, q3232 paddw xm0, xm1 pshuflw xm1, xm0, q3232 paddw xm0, xm1 pshuflw xm1, xm0, q1111 ; Perform normalization during the final stage of accumulation. pavgw xm0, xm1 movd eax, xm0 movzx eax, ax sub ax, 8 RET ; Rudimentary fast hadamard transform ; Two Hadamard transforms share an 128-bit lane. %macro HADAMARD_4x4 0 ; 4->0, 3->2, 2->3, 1->2, 0->1 %define ROTATE SWAP 4, 3, 2, 1, 0 ; Stage 1 paddw m0, m1, m2 psubw m1, m2 paddw m2, m3, m4 psubw m3, m4 ROTATE ; Stage 2 paddw m0, m1, m3 psubw m1, m3 paddw m3, m2, m4 psubw m2, m4 SWAP 3, 2, 1 ROTATE ; Transpose ; Since two transforms share an 128-bit lane, unpacking results in a single ; transform's values on each register. This has to be resolved later. ; A and B indicate different 4x4 transforms. 
; Start ; m1 B (a3 a2 a1 a0) A (a3 a2 a1 a0) ; m2 B (b3 b2 b1 b0) A (b3 b2 b1 b0) ; m3 B (c3 c2 c1 c0) A (c3 c2 c1 c0) ; m4 B (d3 d2 d1 d0) A (d3 d2 d1 d0) ; Stage 1 ; m1 A (b3 a3 b2 a2 b1 a1 b0 a0) ; m2 B (b3 a3 b2 a2 b1 a1 b0 a0) ; m3 A (d3 c3 d2 c2 d1 c1 d0 c0) ; m4 B (d3 c3 d2 c2 d1 c1 d0 c0) punpcklwd m0, m1, m2 punpckhwd m1, m2 punpcklwd m2, m3, m4 punpckhwd m3, m4 ROTATE ; m1 A (d3 c3 b3 a3 d2 c2 b2 a2) ; m2 A (d1 c1 b1 a1 d0 c0 b0 a0) ; m3 B (d3 c3 b3 a3 d2 c2 b2 a2) ; m4 B (d1 c1 b1 a1 d0 c0 b0 a0) punpckldq m0, m1, m3 punpckhdq m1, m3 punpckldq m3, m2, m4 punpckhdq m2, m4 SWAP 3, 2, 1 ROTATE ; Make the transforms share 128-bit lanes again. ; m1 B (d0 c0 b0 a0) A (d0 c0 b0 a0) ; m2 B (d1 c1 b1 a1) A (d1 c1 b1 a1) ; m3 B (d2 c2 b2 a2) A (d2 c2 b2 a2) ; m4 B (d3 c3 b3 a3) A (d3 c3 b3 a3) punpcklqdq m0, m1, m2 punpckhqdq m1, m2 punpcklqdq m2, m3, m4 punpckhqdq m3, m4 ROTATE ; Stage 1 paddw m0, m1, m2 psubw m1, m2 paddw m2, m3, m4 psubw m3, m4 ROTATE ; Use the fact that ; (abs(a+b)+abs(a-b))/2 = max(abs(a),abs(b)) ; to merge the final butterfly with the abs and the first stage of ; accumulation. ; Avoid pabsw by using max(a, b) + max(a + b + 0x7FFF, 0x7FFF) instead. ; Actually calculates (abs(a+b)+abs(a-b))/2-0x7FFF. ; The final sum must be offset to compensate for subtracting 0x7FFF. paddw m0, m1, m3 pmaxsw m1, m3 ; m2 is free ; 0x7FFF pcmpeqb m3, m3 psrlw m3, 1 paddsw m0, m3 psubw m1, m0 paddw m0, m2, m4 pmaxsw m2, m4 paddsw m0, m3 psubw m2, m0 paddw m1, m2 SWAP 1, 0 %endmacro ; Load diffs of 16 entries for 1 row %macro LOAD_DIFF_DQ 4 movu xm%1, %2 movu xm%4, %3 vpmovzxbw m%1, xm%1 vpmovzxbw m%4, xm%4 psubw m%1, m%4 %endmacro INIT_YMM avx2 cglobal satd_16x4, 4, 6, 5, src, src_stride, dst, dst_stride, \ src_stride3, dst_stride3 lea src_stride3q, [src_strideq*3] lea dst_stride3q, [dst_strideq*3] LOAD_DIFF_DQ 1, [srcq], [dstq], 0 LOAD_DIFF_DQ 2, [srcq+src_strideq*1], [dstq+dst_strideq*1], 0 LOAD_DIFF_DQ 3, [srcq+src_strideq*2], [dstq+dst_strideq*2], 0 LOAD_DIFF_DQ 4, [srcq+src_stride3q], [dstq+dst_stride3q], 0 HADAMARD_4x4 ; Reduce horizontally vextracti128 xm1, m0, 1 paddw xm0, xm1 pshufd xm1, xm0, q3232 paddw xm0, xm1 pshuflw xm1, xm0, q3232 paddw xm0, xm1 pshuflw xm1, xm0, q1111 ; Perform normalization during the final stage of accumulation ; Avoids overflow in this case pavgw xm0, xm1 movd eax, xm0 movzx eax, ax ; Add an offset for how the final butterfly stage and the first stage of ; accumulation was done. Since this offset is an even number, this can ; safely be done after normalization using pavgw. 
sub ax, 16 RET INIT_YMM avx2 cglobal satd_4x16, 4, 8, 7, src, src_stride, dst, dst_stride, \ src4, dst4, src_stride3, dst_stride3 lea src_stride3q, [src_strideq*3] lea dst_stride3q, [dst_strideq*3] lea src4q, [srcq+src_strideq*4] lea dst4q, [dstq+dst_strideq*4] LOAD_PACK_DIFF_Dx4 0, [srcq], [dstq], \ [srcq+src_strideq*2], [dstq+dst_strideq*2], \ [src4q], [dst4q], \ [src4q+src_strideq*2], [dst4q+dst_strideq*2], \ 4, 5, 6 LOAD_PACK_DIFF_Dx4 1, [srcq+src_strideq*1], [dstq+dst_strideq*1], \ [srcq+src_stride3q], [dstq+dst_stride3q], \ [src4q+src_strideq*1], [dst4q+dst_strideq*1], \ [src4q+src_stride3q], [dst4q+dst_stride3q], \ 4, 5, 6 lea srcq, [srcq+src_strideq*8] lea dstq, [dstq+dst_strideq*8] lea src4q, [src4q+src_strideq*8] lea dst4q, [dst4q+dst_strideq*8] LOAD_PACK_DIFF_Dx4 2, [srcq], [dstq], \ [srcq+src_strideq*2], [dstq+dst_strideq*2], \ [src4q], [dst4q], \ [src4q+src_strideq*2], [dst4q+dst_strideq*2], \ 4, 5, 6 LOAD_PACK_DIFF_Dx4 3, [srcq+src_strideq*1], [dstq+dst_strideq*1], \ [srcq+src_stride3q], [dstq+dst_stride3q], \ [src4q+src_strideq*1], [dst4q+dst_strideq*1], \ [src4q+src_stride3q], [dst4q+dst_stride3q], \ 4, 5, 6 HADAMARD_4x4_PACKED 2 ; Reduce horizontally vextracti128 xm1, m0, 1 paddw xm0, xm1 pshufd xm1, xm0, q3232 paddw xm0, xm1 pshuflw xm1, xm0, q3232 paddw xm0, xm1 pshuflw xm1, xm0, q1111 ; Perform normalization during the final stage of accumulation pavgw xm0, xm1 movd eax, xm0 movzx eax, ax ; Add an offset for how the final butterfly stage and the first stage of ; accumulation was done. Since this offset is an even number, this can ; safely be done after normalization using pavgw. sub ax, 16 RET ; On x86-64 we can transpose in-place without spilling registers. ; By clever choices of the order to apply the butterflies and the order of ; their outputs, we can take the rows in order and output the columns in order ; without any extra operations and using just one temporary register. 
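; Orientation note (the generic 8x8 word-transpose idiom): the macro below runs
; three interleave rounds, first on 16-bit words (punpck*wd), then on 32-bit
; pairs (punpck*dq), then on 64-bit halves (punpck*qdq).  Each round doubles
; the size of the contiguous group taken from one source row, so after three
; rounds each register holds one column of the original 8x8 block of words.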
%macro TRANSPOSE8x8 9 punpckhwd m%9, m%5, m%6 punpcklwd m%5, m%6 ; m%6 is free punpckhwd m%6, m%1, m%2 punpcklwd m%1, m%2 ; m%2 is free punpckhwd m%2, m%7, m%8 punpcklwd m%7, m%8 ; m%8 is free punpckhwd m%8, m%3, m%4 punpcklwd m%3, m%4 ; m%4 is free punpckhdq m%4, m%1, m%3 punpckldq m%1, m%3 ; m%3 is free punpckldq m%3, m%5, m%7 punpckhdq m%5, m%7 ; m%7 is free punpckhdq m%7, m%6, m%8 punpckldq m%6, m%8 ; m%8 is free punpckldq m%8, m%9, m%2 punpckhdq m%9, m%2 ; m%2 is free punpckhqdq m%2, m%1, m%3 punpcklqdq m%1, m%3 ; m%3 is free punpcklqdq m%3, m%4, m%5 punpckhqdq m%4, m%5 ; m%5 is free punpcklqdq m%5, m%6, m%8 punpckhqdq m%6, m%8 ; m%8 is free punpckhqdq m%8, m%7, m%9 punpcklqdq m%7, m%9 %endmacro ; Load diff of 8 entries for 1 row %macro LOAD_DIFF_Q 4 movq %1, %2 movq %4, %3 punpcklbw %1, %4 pmaddubsw %1, hsub %endmacro %macro HADAMARD_8_STAGE_1 9 paddw m%9, m%1, m%2 psubw m%1, m%2 paddw m%2, m%3, m%4 psubw m%3, m%4 paddw m%4, m%5, m%6 psubw m%5, m%6 paddw m%6, m%7, m%8 psubw m%7, m%8 ; 8->9, 7->8, 6->7, 5->6, 4->5, 3->4, 2->3, 1->2, 9->1 SWAP %8, %7, %6, %5, %4, %3, %2, %1, %9 %endmacro %macro HADAMARD_8_STAGE_2 9 paddw m%9, m%1, m%3 ; 0 psubw m%1, m%3 ; 2 paddw m%3, m%2, m%4 ; 1 psubw m%2, m%4 ; 3 SWAP %3, %2, %1 paddw m%4, m%5, m%7 ; 4 psubw m%5, m%7 ; 6 paddw m%7, m%6, m%8 ; 5 psubw m%6, m%8 ; 7 SWAP %7, %6, %5 ; 8->9, 7->8, 6->7, 5->6, 4->5, 3->4, 2->3, 1->2, 9->1 SWAP %8, %7, %6, %5, %4, %3, %2, %1, %9 %endmacro %macro HADAMARD_8_STAGE_3 9 paddw m%9, m%1, m%5 ; 0 psubw m%1, m%5 ; 4 paddw m%5, m%2, m%6 ; 1 psubw m%2, m%6 ; 5 paddw m%6, m%3, m%7 ; 2 psubw m%3, m%7 ; 6 paddw m%7, m%4, m%8 ; 3 psubw m%4, m%8 ; 7 SWAP %5, %2, %6, %3, %7, %4, %1 ; 8->9, 7->8, 6->7, 5->6, 4->5, 3->4, 2->3, 1->2, 9->1 SWAP %8, %7, %6, %5, %4, %3, %2, %1, %9 %endmacro ; Rudimentary fast hadamard transform %macro HADAMARD_8x8 0 HADAMARD_8_STAGE_1 1, 2, 3, 4, 5, 6, 7, 8, 0 HADAMARD_8_STAGE_2 1, 2, 3, 4, 5, 6, 7, 8, 0 HADAMARD_8_STAGE_3 1, 2, 3, 4, 5, 6, 7, 8, 0 TRANSPOSE8x8 1, 2, 3, 4, 5, 6, 7, 8, 0 HADAMARD_8_STAGE_1 1, 2, 3, 4, 5, 6, 7, 8, 0 HADAMARD_8_STAGE_2 1, 2, 3, 4, 5, 6, 7, 8, 0 ; Stage 3 ; Use the fact that ; (abs(a+b)+abs(a-b))/2 = max(abs(a),abs(b)) ; to merge the final butterfly with the abs and the first stage of ; accumulation. ; Avoid pabsw by using max(a, b) + max(a + b + 0x7FFF, 0x7FFF) instead. ; Actually calculates (abs(a+b)+abs(a-b))/2-0x7FFF. ; The final sum must be offset to compensate for subtracting 0x7FFF. 
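; Worked example of the identity above: for a = 3, b = -5 the two final
; coefficients are a+b = -2 and a-b = 8, and (|-2| + |8|) / 2 = 5 = max(|3|, |-5|).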
paddw m0, m1, m5 pmaxsw m1, m5 ; m1 is free ; 0x7FFF pcmpeqb m5, m5 psrlw m5, 1 paddsw m0, m5 psubw m1, m0 paddw m0, m2, m6 pmaxsw m2, m6 paddsw m0, m5 psubw m2, m0 paddw m0, m3, m7 pmaxsw m3, m7 paddsw m0, m5 psubw m3, m0 paddw m0, m4, m8 pmaxsw m4, m8 paddsw m0, m5 psubw m4, m0 paddw m1, m2 paddw m3, m4 paddw m1, m3 SWAP 1, 0 %endmacro ; Only works with 128 bit vectors %macro SATD_8x8_FN 0 cglobal satd_8x8, 4, 6, 10, src, src_stride, dst, dst_stride, \ src_stride3, dst_stride3 %define hsub m0 mova hsub, [maddubsw_hsub] ; Load rows into m1-m8 lea src_stride3q, [src_strideq*3] lea dst_stride3q, [dst_strideq*3] LOAD_DIFF_Q m1, [srcq], [dstq], m2 LOAD_DIFF_Q m2, [srcq+src_strideq*1], [dstq+dst_strideq*1], m3 LOAD_DIFF_Q m3, [srcq+src_strideq*2], [dstq+dst_strideq*2], m4 LOAD_DIFF_Q m4, [srcq+src_stride3q], [dstq+dst_stride3q], m5 lea srcq, [srcq+src_strideq*4] lea dstq, [dstq+dst_strideq*4] LOAD_DIFF_Q m5, [srcq], [dstq], m6 LOAD_DIFF_Q m6, [srcq+src_strideq*1], [dstq+dst_strideq*1], m7 LOAD_DIFF_Q m7, [srcq+src_strideq*2], [dstq+dst_strideq*2], m8 LOAD_DIFF_Q m8, [srcq+src_stride3q], [dstq+dst_stride3q], m9 HADAMARD_8x8 ; Reduce horizontally and convert to 32 bits pxor m2, m2 punpcklwd m1, m0, m2 punpckhwd m0, m2 paddd m0, m1 pshufd m1, m0, q3232 paddd m0, m1 pshuflw m1, m0, q3232 paddd m0, m1 movd eax, m0 ; Normalize ; Add rounding offset and an offset for how the final butterfly stage and ; the first stage of accumulation was done. sub eax, 32-2 shr eax, 2 RET %endmacro INIT_XMM ssse3 SATD_8x8_FN INIT_XMM avx2 SATD_8x8_FN INIT_YMM avx2 cglobal satd_16x8, 4, 6, 9, src, src_stride, dst, dst_stride, \ src_stride3, dst_stride3 ; Load rows into m1-m8 lea src_stride3q, [src_strideq*3] lea dst_stride3q, [dst_strideq*3] LOAD_DIFF_DQ 1, [srcq], [dstq], 0 LOAD_DIFF_DQ 2, [srcq+src_strideq*1], [dstq+dst_strideq*1], 0 LOAD_DIFF_DQ 3, [srcq+src_strideq*2], [dstq+dst_strideq*2], 0 LOAD_DIFF_DQ 4, [srcq+src_stride3q], [dstq+dst_stride3q], 0 lea srcq, [srcq+src_strideq*4] lea dstq, [dstq+dst_strideq*4] LOAD_DIFF_DQ 5, [srcq], [dstq], 0 LOAD_DIFF_DQ 6, [srcq+src_strideq*1], [dstq+dst_strideq*1], 0 LOAD_DIFF_DQ 7, [srcq+src_strideq*2], [dstq+dst_strideq*2], 0 LOAD_DIFF_DQ 8, [srcq+src_stride3q], [dstq+dst_stride3q], 0 HADAMARD_8x8 ; Reduce horizontally and convert to 32 bits pxor m2, m2 punpcklwd m1, m0, m2 punpckhwd m0, m2 paddd m0, m1 vextracti128 xm1, m0, 1 paddd xm0, xm1 pshufd xm1, xm0, q3232 paddd xm0, xm1 pshuflw xm1, xm0, q3232 paddd xm0, xm1 movd eax, xm0 ; Normalize ; Add rounding offset and an offset for how the final butterfly stage and ; the first stage of accumulation was done. 
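; Note on the constant below: each of the 16 word lanes summed above carries a
; -4*0x7FFF bias (four merged maxima per lane), which reads as +4 once the
; lane is zero-extended, for a total of +64 after the 32-bit reduction.  The
; +2 folded into the constant is the rounding term for the following shift.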
sub eax, 64-2 shr eax, 2 RET %macro LOAD_DIFF_Qx2 7 movq xm%1, %2 movq xm%6, %3 punpcklbw xm%1, xm%6 movq xm%6, %4 movq xm%7, %5 punpcklbw xm%6, xm%7 vinserti128 m%1, xm%6, 1 pmaddubsw m%1, hsub %endmacro INIT_YMM avx2 cglobal satd_8x16, 4, 8, 11, src, src_stride, dst, dst_stride, \ src8, dst8, src_stride3, dst_stride3 %define hsub m0 mova hsub, [maddubsw_hsub] ; Load rows into m1-m8 lea src8q, [srcq+src_strideq*8] lea dst8q, [dstq+dst_strideq*8] lea src_stride3q, [src_strideq*3] lea dst_stride3q, [dst_strideq*3] LOAD_DIFF_Qx2 1, [srcq], [dstq], \ [src8q], [dst8q], \ 9, 10 LOAD_DIFF_Qx2 2, [srcq+src_strideq*1], [dstq+dst_strideq*1], \ [src8q+src_strideq*1], [dst8q+dst_strideq*1], \ 9, 10 LOAD_DIFF_Qx2 3, [srcq+src_strideq*2], [dstq+dst_strideq*2], \ [src8q+src_strideq*2], [dst8q+dst_strideq*2], \ 9, 10 LOAD_DIFF_Qx2 4, [srcq+src_stride3q], [dstq+dst_stride3q], \ [src8q+src_stride3q], [dst8q+dst_stride3q], \ 9, 10 lea srcq, [srcq+src_strideq*4] lea dstq, [dstq+dst_strideq*4] lea src8q, [src8q+src_strideq*4] lea dst8q, [dst8q+dst_strideq*4] LOAD_DIFF_Qx2 5, [srcq], [dstq], \ [src8q], [dst8q], \ 9, 10 LOAD_DIFF_Qx2 6, [srcq+src_strideq*1], [dstq+dst_strideq*1], \ [src8q+src_strideq*1], [dst8q+dst_strideq*1], \ 9, 10 LOAD_DIFF_Qx2 7, [srcq+src_strideq*2], [dstq+dst_strideq*2], \ [src8q+src_strideq*2], [dst8q+dst_strideq*2], \ 9, 10 LOAD_DIFF_Qx2 8, [srcq+src_stride3q], [dstq+dst_stride3q], \ [src8q+src_stride3q], [dst8q+dst_stride3q], \ 9, 10 HADAMARD_8x8 ; Reduce horizontally and convert to 32 bits pxor m2, m2 punpcklwd m1, m0, m2 punpckhwd m0, m2 paddd m0, m1 vextracti128 xm1, m0, 1 paddd xm0, xm1 pshufd xm1, xm0, q3232 paddd xm0, xm1 pshuflw xm1, xm0, q3232 paddd xm0, xm1 movd eax, xm0 ; Normalize ; Add rounding offset and an offset for how the final butterfly stage and ; the first stage of accumulation was done. 
sub eax, 64-2 shr eax, 2 RET ; Less optimized, boilerplate implementations INIT_YMM avx2 cglobal satd_8x32, 4, 9, 13, src, src_stride, dst, dst_stride, \ src8, dst8, src_stride3, dst_stride3, cnt ; ones for converting to 32-bit with pmaddwd pcmpeqw m11, m11 pabsw m11, m11 ; sum pxor m12, m12 mov cntd, 1 lea src_stride3q, [src_strideq*3] lea dst_stride3q, [dst_strideq*3] lea src8q, [srcq+src_strideq*8] lea dst8q, [dstq+dst_strideq*8] .loop: %define hsub m0 mova hsub, [maddubsw_hsub] ; Load rows into m1-m8 LOAD_DIFF_Qx2 1, [srcq], [dstq], \ [src8q], [dst8q], \ 9, 10 LOAD_DIFF_Qx2 2, [srcq+src_strideq*1], [dstq+dst_strideq*1], \ [src8q+src_strideq*1], [dst8q+dst_strideq*1], \ 9, 10 LOAD_DIFF_Qx2 3, [srcq+src_strideq*2], [dstq+dst_strideq*2], \ [src8q+src_strideq*2], [dst8q+dst_strideq*2], \ 9, 10 LOAD_DIFF_Qx2 4, [srcq+src_stride3q], [dstq+dst_stride3q], \ [src8q+src_stride3q], [dst8q+dst_stride3q], \ 9, 10 lea srcq, [srcq+src_strideq*4] lea dstq, [dstq+dst_strideq*4] lea src8q, [src8q+src_strideq*4] lea dst8q, [dst8q+dst_strideq*4] LOAD_DIFF_Qx2 5, [srcq], [dstq], \ [src8q], [dst8q], \ 9, 10 LOAD_DIFF_Qx2 6, [srcq+src_strideq*1], [dstq+dst_strideq*1], \ [src8q+src_strideq*1], [dst8q+dst_strideq*1], \ 9, 10 LOAD_DIFF_Qx2 7, [srcq+src_strideq*2], [dstq+dst_strideq*2], \ [src8q+src_strideq*2], [dst8q+dst_strideq*2], \ 9, 10 LOAD_DIFF_Qx2 8, [srcq+src_stride3q], [dstq+dst_stride3q], \ [src8q+src_stride3q], [dst8q+dst_stride3q], \ 9, 10 HADAMARD_8x8 ; Reduce horizontally and convert to 32 bits pmaddwd m0, m11 paddd m12, m0 lea srcq, [srcq+src_stride3q*4] lea dstq, [dstq+dst_stride3q*4] lea src8q, [src8q+src_stride3q*4] lea dst8q, [dst8q+dst_stride3q*4] dec cntd jge .loop vextracti128 xm0, m12, 1 paddd xm0, xm12 pshufd xm1, xm0, q3232 paddd xm0, xm1 pshuflw xm1, xm0, q3232 paddd xm0, xm1 movd eax, xm0 ; Normalize ; Add rounding offset and an offset for how the final butterfly stage and ; the first stage of accumulation was done. 
sub eax, 128-2 shr eax, 2 RET INIT_YMM avx2 cglobal satd_16x8_internal, 0, 0, 0, \ dummy1, src_stride, dummy2, dst_stride, \ src_stride3, dst_stride3, src, dst %define hadd m9 %define sum m10 ; Load rows into m1-m8 LOAD_DIFF_DQ 1, [srcq], [dstq], 0 LOAD_DIFF_DQ 2, [srcq+src_strideq*1], [dstq+dst_strideq*1], 0 LOAD_DIFF_DQ 3, [srcq+src_strideq*2], [dstq+dst_strideq*2], 0 LOAD_DIFF_DQ 4, [srcq+src_stride3q], [dstq+dst_stride3q], 0 lea srcq, [srcq+src_strideq*4] lea dstq, [dstq+dst_strideq*4] LOAD_DIFF_DQ 5, [srcq], [dstq], 0 LOAD_DIFF_DQ 6, [srcq+src_strideq*1], [dstq+dst_strideq*1], 0 LOAD_DIFF_DQ 7, [srcq+src_strideq*2], [dstq+dst_strideq*2], 0 LOAD_DIFF_DQ 8, [srcq+src_stride3q], [dstq+dst_stride3q], 0 HADAMARD_8x8 pmaddwd m0, hadd paddd sum, m0 ret %macro SATD_NXM 2 %if %1 > 16 %if %2 > 8 cglobal satd_%1x%2, 4, 10, 11, src, src_stride, dst, dst_stride, \ src_stride3, dst_stride3, call_src, call_dst, \ w, h %else cglobal satd_%1x%2, 4, 9, 11, src, src_stride, dst, dst_stride, \ src_stride3, dst_stride3, call_src, call_dst, \ w %endif %else ; %2 > 8 cglobal satd_%1x%2, 4, 9, 11, src, src_stride, dst, dst_stride, \ src_stride3, dst_stride3, call_src, call_dst, \ h %endif ; ones for converting to 32-bit with pmaddwd pcmpeqw m9, m9 pabsw m9, m9 ; sum pxor m10, m10 lea src_stride3q, [src_strideq*3] lea dst_stride3q, [dst_strideq*3] %if %2 > 8 mov hd, %2/8 - 1 .looph: %endif %if %1 > 16 mov wd, %1/16 - 1 .loopv: %endif mov call_srcq, srcq mov call_dstq, dstq call m(satd_16x8_internal) %if %1 > 16 add srcq, 16 add dstq, 16 dec wd jge .loopv sub srcq, %1 sub dstq, %1 %endif %if %2 > 8 lea srcq, [srcq+src_strideq*8] lea dstq, [dstq+dst_strideq*8] dec hd jge .looph %endif ; Reduce horizontally vextracti128 xm0, m10, 1 paddd xm0, xm10 pshufd xm1, xm0, q3232 paddd xm0, xm1 pshuflw xm1, xm0, q3232 paddd xm0, xm1 movd eax, xm0 ; Normalize ; Add rounding offset and an offset for how the final butterfly stage and ; the first stage of accumulation was done. sub eax, %1*%2/2 - 2 shr eax, 2 RET %endmacro INIT_YMM avx2 SATD_NXM 16, 16 SATD_NXM 32, 32 SATD_NXM 64, 64 SATD_NXM 128, 128 SATD_NXM 16, 32 SATD_NXM 32, 16 SATD_NXM 32, 64 SATD_NXM 64, 32 SATD_NXM 64, 128 SATD_NXM 128, 64 SATD_NXM 32, 8 SATD_NXM 16, 64 SATD_NXM 64, 16 %endif ; ARCH_X86_64 av-scenechange-0.14.1/src/asm/x86/satd16_avx2.asm000064400000000000000000000726351046102023000173330ustar 00000000000000; Copyright (c) 2022, The rav1e contributors. All rights reserved ; ; This source code is subject to the terms of the BSD 2 Clause License and ; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License ; was not distributed with this source code in the LICENSE file, you can ; obtain it at www.aomedia.org/license/software. If the Alliance for Open ; Media Patent License 1.0 was not distributed with this source code in the ; PATENTS file, you can obtain it at www.aomedia.org/license/patent. %include "config.asm" %include "src/asm/x86/x86inc.asm" %define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX) %if ARCH_X86_64 SECTION_RODATA 32 align 32 pw_1x16: times 16 dw 1 SECTION .text %macro NORMALIZE4PT 0 add eax, 2 shr eax, 2 %endmacro %macro NORMALIZE8PT 0 add eax, 4 shr eax, 3 %endmacro ; Add and subtract registers ; ; Takes m0 and m1 as both input and output. ; Requires m2 as a free register. 
; ; If we start with this permutation: ; ; m0 0 1 2 3 4 5 6 7 ; m1 8 9 10 11 12 13 14 15 ; ; Then the output will be as such: ; ; m0 [0+8][1+9][2+10][3+11] [4+12][5+13][6+14][7+15] ; m1 [0-8][1-9][2-10][3-11] [4-12][5-13][6-14][7-15] %macro BUTTERFLY 3 %define BIT_PRECISION %1 %define VEC_SIZE %2 ; use alternate registers 3,4,5 %define USE_ALT %3 %if USE_ALT == 1 SWAP 3, 0 SWAP 4, 1 SWAP 5, 2 %endif %if VEC_SIZE == 32 %define V ym %elif VEC_SIZE == 16 %define V xm %endif ; Use m2 as a temporary register, then swap ; so that m0 and m1 contain the output. %if BIT_PRECISION == 16 paddw V%+ 2, V%+ 0, V%+ 1 psubw V%+ 0, V%+ 1 %elif BIT_PRECISION == 32 paddd ym2, ym0, ym1 psubd ym0, ym1 %else %error Incorrect precision specified (16 or 32 expected) %endif SWAP 2, 1, 0 %if USE_ALT == 1 SWAP 3, 0 SWAP 4, 1 SWAP 5, 2 %endif %endmacro ; Interleave packed rows together (in m0 and m1). ; m2 should contain a free register. ; ; Macro argument takes size in bits of each element (where one ; element is the difference between two original source pixels). ; ; If we start with this permutation: ; ; m0 0 1 2 3 4 5 6 7 ; m1 8 9 10 11 12 13 14 15 ; ; Then, after INTERLEAVE, this will be the permutation: ; ; m0 0 8 1 9 2 10 3 11 ; m1 4 12 5 13 6 14 7 15 %macro INTERLEAVE 3 %define BIT_PRECISION %1 %define VEC_SIZE %2 %define USE_ALT %3 %if USE_ALT == 1 SWAP 3, 0 SWAP 4, 1 SWAP 5, 2 %endif %if VEC_SIZE == 16 %define V xm %elif VEC_SIZE == 32 %define V ym %else %error Invalid vector size (expected 16 or 32) %endif %if BIT_PRECISION == 16 punpcklwd V%+ 2, V%+ 0, V%+ 1 punpckhwd V%+ 0, V%+ 1 SWAP 2, 1, 0 %elif BIT_PRECISION == 32 punpckldq ym2, ym0, ym1 punpckhdq ym0, ym1 ; AVX2 shuffles operate over 128-bit halves of the full ymm register ; in parallel, so these shuffles are required to fix up the permutation. vperm2i128 ym1, ym2, ym0, 0x20 vperm2i128 ym0, ym2, ym0, 0x31 SWAP 0, 1 %else %error Incorrect precision specified (16 or 32 expected) %endif %if USE_ALT == 1 SWAP 3, 0 SWAP 4, 1 SWAP 5, 2 %endif %endmacro ; Interleave pairs of 2 elements (in m0 and m1) ; m2 should contain a free register. %macro INTERLEAVE_PAIRS 3 %define BIT_PRECISION %1 %define VEC_SIZE %2 %define USE_ALT %3 %if USE_ALT == 1 SWAP 3, 0 SWAP 4, 1 SWAP 5, 2 %endif %if VEC_SIZE == 16 %define V xm %elif VEC_SIZE == 32 %define V ym %else %error Invalid vector size (expected 16 or 32) %endif %if BIT_PRECISION == 16 punpckldq V%+ 2, V%+ 0, V%+ 1 punpckhdq V%+ 0, V%+ 1 %elif BIT_PRECISION == 32 punpcklqdq ym2, ym0, ym1 punpckhqdq ym0, ym1 %else %error Incorrect precision specified (16 or 32 expected) %endif SWAP 2, 1, 0 %if USE_ALT == 1 SWAP 3, 0 SWAP 4, 1 SWAP 5, 2 %endif %endmacro %macro HADAMARD_4X4_PACKED 2 %define BIT_PRECISION %1 ; Register size to use (in bytes) %define VEC_SIZE %2 %if VEC_SIZE == 16 %define V xm %elif VEC_SIZE == 32 %define V ym %else %error Invalid vector size (expected 16 or 32) %endif ; Starting registers: ; m0 0 1 2 3 ; m1 4 5 6 7 ; m2 8 9 10 11 ; m3 12 13 14 15 ; Where each number represents an index of the ; original block of differences. ; Pack rows 0,2 and 1,3 into m0 and m1 %if BIT_PRECISION == 16 %if VEC_SIZE == 16 ; In this case, each row only has 64 bits, so we use ; punpcklqdq only. The high 64 bits are always 0. 
punpcklqdq xm0, xm2 punpcklqdq xm1, xm3 %elif VEC_SIZE == 32 ; The upper 128 bits of all input registers are zeroed punpcklqdq m4, m0, m2 punpcklqdq m5, m1, m3 punpckhqdq m0, m0, m2 punpckhqdq m1, m1, m3 vinserti128 m0, m4, xm0, 1 vinserti128 m1, m5, xm1, 1 %endif %elif BIT_PRECISION == 32 vinserti128 ym0, ym0, xm2, 1 vinserti128 ym1, ym1, xm3, 1 %else %error Invalid bit precision (expected 16 or 32) %endif ; Now that we've packed rows 0-2 and 1-3 together, ; this is our permutation: ; m0 0 1 2 3 8 9 10 11 ; m1 4 5 6 7 12 13 14 15 ; For a 8x4 transform (with 16-bit coefficients), this pattern is ; extended for each 128-bit half but for the second block, and thus ; all comments also apply to the upper 128-bits for the 8x4 transform. BUTTERFLY %1, %2, 0 ; m0 [0+4][1+5][2+6][3+7] [8+12][9+13][10+14][11+15] ; m1 [0-4][1-5][2-6][3-7] [8-12][9-13][10-14][11-15] INTERLEAVE %1, %2, 0 ; m0 [ 0+4][ 0-4][ 1+5][ 1-5] [2 + 6][2 - 6][3 + 7][3 - 7] ; m1 [8+12][8-12][9+13][9-13] [10+14][10-14][11+15][11-15] BUTTERFLY %1, %2, 0 ; m0 [0+4+8+12][0-4+8-12][1+5+9+13][1-5+9-13] [2+6+10+14][2-6+10-14][3+7+11+15][3-7+11-15] ; m1 [0+4-8-12][0-4-8+12][1+5-9-13][1-5-9+13] [2+6-10-14][2-6-10+14][3+7-11-15][3-7-11+15] ; for one row: ; [0+1+2+3][0-1+2-3][0+1-2-3][0-1-2+3] ; For the vertical transform, these are packed into a new column. INTERLEAVE_PAIRS %1, %2, 0 ; p0 p1 p2 p3 ; m0 [0+4+ 8+12][0-4+ 8-12][0+4- 8-12][0-4- 8+12] [1+5+ 9+13][1-5+ 9-13][1+5- 9-13][1-5- 9+13] ; m1 [2+6+10+14][2-6+10-14][2+6-10-14][2-6-10+14] [3+7+11+15][3-7+11-15][3+7-11-15][3-7-11+15] ; According to this grid: ; p0 q0 r0 s0 ; p1 q1 r1 s1 ; p2 q2 r2 s2 ; p3 q3 r3 s3 ; Horizontal transform; since the output is transposed from the original order, ; we can do the same steps as the vertical transform and the result will be the same. BUTTERFLY %1, %2, 0 INTERLEAVE %1, %2, 0 BUTTERFLY %1, %2, 0 ; Finished horizontal transform except for the last step (interleaving pairs), ; which we skip, because after this we add up the absolute value of the ; coefficients, which is a commutative operation (order does not matter). %endmacro ; Horizontal sum of mm register ; ; Inputs: ; %1 = Element size in bits (16 or 32) ; %2 = Size of input register in bytes (16 or 32) ; You can e.g. pass 16 for this argument if you ; only want to sum up the bottom 128-bits of a ; ymm register. 
; %3 = Input register number ; %4 = Temporary register number ; %5 = Output register (e.g., eax) %macro HSUM 5 %define E_SIZE %1 %define REG_SIZE %2 %define INPUT %3 %define TMP %4 %define OUTPUT %5 %if REG_SIZE == 16 %define V xm %elif REG_SIZE == 32 %define V ym %else %error Invalid register size (expected 16 or 32) %endif %if E_SIZE == 16 ; Add adjacent pairs of 16-bit elements to produce 32-bit results, ; then proceed with 32-bit sum pmaddwd V%+INPUT, [pw_1x16] %endif %if mmsize == 32 && REG_SIZE == 32 ; Add upper half of ymm to xmm vextracti128 xm%+TMP, ym%+INPUT, 1 paddd xm%+INPUT, xm%+TMP %endif ; Reduce 32-bit results pshufd xm%+TMP, xm%+INPUT, q2323 paddd xm%+INPUT, xm%+TMP pshufd xm%+TMP, xm%+INPUT, q1111 paddd xm%+INPUT, xm%+TMP movd OUTPUT, xm%+INPUT %endmacro ; given m0-7, do butterfly as follows: ; (m0, m1) = butterfly(m0, m1) ; (m2, m3) = butterfly(m2, m3) ; (m4, m5) = butterfly(m4, m5) ; (m6, m7) = butterfly(m6, m7) %macro BUTTERFLY_8X8 0 ; m8 is free paddd m8, m0, m1 psubd m0, m1 SWAP 8, 1, 0 ; m8 is free paddd m8, m2, m3 psubd m2, m3 SWAP 8, 3, 2 paddd m8, m4, m5 psubd m4, m5 SWAP 8, 5, 4 paddd m8, m6, m7 psubd m6, m7 SWAP 8, 7, 6 %endmacro %macro HADAMARD_8X8_VERTICAL 0 BUTTERFLY_8X8 ; m0-7 contain a0-7 SWAP 2, 1 SWAP 6, 5 BUTTERFLY_8X8 SWAP 1, 4 SWAP 3, 6 BUTTERFLY_8X8 SWAP 2, 1 SWAP 2, 4 SWAP 3, 6 SWAP 5, 6 %endmacro ; Transpose rows m0-7. ; Output is also contained in m0-7. ; ; Uses m8, m10-15 as temporary registers (i.e. m9 is left unchanged.) %macro TRANSPOSE8X8D 0 SWAP 9, 0 SWAP 10, 1 SWAP 11, 2 SWAP 12, 3 SWAP 13, 4 SWAP 14, 5 SWAP 15, 6 SWAP 2, 7 punpckldq m6, m9, m10 punpckldq m1, m11, m12 punpckhdq m8, m9, m10 punpckldq m4, m13, m14 punpckldq m9, m15, m2 vshufps m3, m6, m1, 0x4e vpblendd m10, m6, m3, 0xcc vshufps m6, m4, m9, 0x4e punpckhdq m7, m11, m12 vpblendd m11, m4, m6, 0xcc vpblendd m12, m3, m1, 0xcc vperm2i128 m3, m10, m11, 0x20 punpckhdq m5, m13, m14 vpblendd m13, m6, m9, 0xcc punpckhdq m4, m15, m2 vperm2i128 m2, m12, m13, 0x20 vshufps m14, m8, m7, 0x4e vpblendd m15, m14, m7, 0xcc vshufps m7, m5, m4, 0x4e vpblendd m8, m8, m14, 0xcc vpblendd m5, m5, m7, 0xcc vperm2i128 m6, m8, m5, 0x20 vpblendd m4, m7, m4, 0xcc vperm2i128 m7, m15, m4, 0x20 vperm2i128 m1, m10, m11, 0x31 vperm2i128 m9, m12, m13, 0x31 vperm2i128 m5, m8, m5, 0x31 vperm2i128 m4, m15, m4, 0x31 SWAP 0, 9 ; Output order is as follows: ; 3 2 6 7 1 0 5 4 ; sort rows SWAP 3, 0 ; 0 2 6 7 1 3 5 4 SWAP 1, 2 ; 0 1 6 7 2 3 5 4 SWAP 6, 2 ; 0 1 2 7 6 3 5 4 SWAP 7, 3 ; 0 1 2 3 6 7 5 4 SWAP 6, 4 ; 0 1 2 3 4 7 5 6 SWAP 7, 5 ; 0 1 2 3 4 5 7 6 SWAP 6, 7 ; 0 1 2 3 4 5 6 7 %endmacro ; m0-7 as input; add coefficients to ymm9. 
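; (Clarifying note, inferred from the callers further down: this helper is
; reached with `call m(satd_8x8_hbd_internal)` after LOAD_DIFF_8X8 has filled
; m0-7 with 32-bit differences, and the tiled satd_NxM_hbd kernels keep the
; running total in m9 until a final HSUM in the caller.)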
INIT_YMM avx2 cglobal satd_8x8_hbd_internal, 0, 0, 0, src, src_stride, dst, dst_stride, bdmax, \ src_stride3, dst_stride3 HADAMARD_8X8_VERTICAL TRANSPOSE8X8D HADAMARD_8X8_VERTICAL REPX {pabsd x, x}, m0, m1, m2, m3, m4, m5, m6, m7 ; Add m0-7 paddd m0, m4 paddd m1, m5 paddd m2, m6 paddd m3, m7 paddd m0, m2 paddd m1, m3 paddd m0, m1 paddd ymm9, m0 ret %macro LOAD_DIFF_8X8 0 movu xm0, [srcq + 0*src_strideq] movu xm1, [srcq + 1*src_strideq] movu xm2, [srcq + 2*src_strideq] movu xm3, [srcq + src_stride3q ] lea srcq, [srcq + 4*src_strideq] movu xm4, [srcq + 0*src_strideq] movu xm5, [srcq + 1*src_strideq] movu xm6, [srcq + 2*src_strideq] movu xm7, [srcq + src_stride3q ] psubw xm0, [dstq + 0*dst_strideq] psubw xm1, [dstq + 1*dst_strideq] psubw xm2, [dstq + 2*dst_strideq] psubw xm3, [dstq + dst_stride3q ] lea dstq, [dstq + 4*dst_strideq] psubw xm4, [dstq + 0*dst_strideq] psubw xm5, [dstq + 1*dst_strideq] psubw xm6, [dstq + 2*dst_strideq] psubw xm7, [dstq + dst_stride3q ] pmovsxwd m0, xm0 pmovsxwd m1, xm1 pmovsxwd m2, xm2 pmovsxwd m3, xm3 pmovsxwd m4, xm4 pmovsxwd m5, xm5 pmovsxwd m6, xm6 pmovsxwd m7, xm7 %endmacro INIT_YMM avx2 cglobal satd_8x8_hbd, 5, 7, 16, src, src_stride, dst, dst_stride, bdmax, \ src_stride3, dst_stride3 lea src_stride3q, [3*src_strideq] lea dst_stride3q, [3*dst_strideq] LOAD_DIFF_8X8 ; m0-7 contain rows of 8x8 block to transform ; with 32-bit coefficients HADAMARD_8X8_VERTICAL TRANSPOSE8X8D HADAMARD_8X8_VERTICAL REPX {pabsd x, x}, m0, m1, m2, m3, m4, m5, m6, m7 ; Add m0-7 paddd m0, m4 paddd m1, m5 paddd m2, m6 paddd m3, m7 paddd m0, m2 paddd m1, m3 paddd m0, m1 HSUM 32, 32, 0, 1, eax NORMALIZE8PT RET INIT_YMM avx2 cglobal satd_4x4_hbd, 5, 7, 8, src, src_stride, dst, dst_stride, bdmax, \ src_stride3, dst_stride3 lea src_stride3q, [3*src_strideq] lea dst_stride3q, [3*dst_strideq] cmp bdmaxd, (1 << 10) - 1 jne .12bpc ; Load src rows movq xm0, [srcq + 0*src_strideq] movq xm1, [srcq + 1*src_strideq] movq xm2, [srcq + 2*src_strideq] movq xm3, [srcq + src_stride3q ] ; src -= dst psubw xm0, [dstq + 0*dst_strideq] psubw xm1, [dstq + 1*dst_strideq] psubw xm2, [dstq + 2*dst_strideq] psubw xm3, [dstq + dst_stride3q ] HADAMARD_4X4_PACKED 16, 16 ; Sum up absolute value of transform coefficients pabsw xm0, xm0 pabsw xm1, xm1 paddw xm0, xm1 HSUM 16, 16, 0, 1, eax NORMALIZE4PT RET .12bpc: ; this gives a nicer disassembly RESET_MM_PERMUTATION ; Load src rows pmovzxwd xm0, [srcq + 0*src_strideq] pmovzxwd xm1, [srcq + 1*src_strideq] pmovzxwd xm2, [srcq + 2*src_strideq] pmovzxwd xm3, [srcq + src_stride3q ] ; Load dst rows pmovzxwd xm4, [dstq + 0*dst_strideq] pmovzxwd xm5, [dstq + 1*dst_strideq] pmovzxwd xm6, [dstq + 2*dst_strideq] pmovzxwd xm7, [dstq + dst_stride3q ] ; src -= dst psubd xm0, xm4 psubd xm1, xm5 psubd xm2, xm6 psubd xm3, xm7 HADAMARD_4X4_PACKED 32, 32 pabsd m0, m0 pabsd m1, m1 paddd m0, m1 HSUM 32, 32, 0, 1, eax NORMALIZE4PT RET ; 32-bit input rows are in m0-3; result is in m0. ; Uses m0-5 as temporary registers. %macro HADAMARD_8X4_12BPC 0 vperm2i128 m4, m0, m2, 0x31 vperm2i128 m5, m1, m3, 0x31 vinserti128 m0, m0, xm2, 1 vinserti128 m1, m1, xm3, 1 ; Swap so m3,m4 are used as inputs. SWAP 3, 4, 5 ; instead of using HADAMARD_4X4_PACKED twice, we interleave ; 2 transforms operating over different registers for more ; opportunity for instruction level parallelism. 
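; (The third argument of BUTTERFLY/INTERLEAVE selects the register set: 0
; works on m0/m1 with m2 as scratch, while 1 swaps in the alternate set
; m3/m4 with m5 as scratch, so the two interleaved transforms stay in
; disjoint registers.)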
BUTTERFLY 32, 32, 0 BUTTERFLY 32, 32, 1 INTERLEAVE 32, 32, 0 INTERLEAVE 32, 32, 1 BUTTERFLY 32, 32, 0 BUTTERFLY 32, 32, 1 INTERLEAVE_PAIRS 32, 32, 0 INTERLEAVE_PAIRS 32, 32, 1 BUTTERFLY 32, 32, 0 BUTTERFLY 32, 32, 1 INTERLEAVE 32, 32, 0 INTERLEAVE 32, 32, 1 BUTTERFLY 32, 32, 0 BUTTERFLY 32, 32, 1 pabsd m0, m0 pabsd m1, m1 pabsd m3, m3 pabsd m4, m4 paddd m0, m1 paddd m3, m4 paddd m0, m3 %endmacro INIT_YMM avx2 cglobal satd_16x4_hbd, 5, 7, 12, src, src_stride, dst, dst_stride, bdmax, \ src_stride3, dst_stride3 lea src_stride3q, [3*src_strideq] lea dst_stride3q, [3*dst_strideq] cmp bdmaxd, (1 << 10) - 1 jne .12bpc ; Load src rows movu m0, [srcq + 0*src_strideq] movu m1, [srcq + 1*src_strideq] movu m2, [srcq + 2*src_strideq] movu m3, [srcq + src_stride3q ] ; src -= dst psubw m0, [dstq + 0*dst_strideq] psubw m1, [dstq + 1*dst_strideq] psubw m2, [dstq + 2*dst_strideq] psubw m3, [dstq + dst_stride3q ] .10bpc_main: ; Original permutation ; m0 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 ; m1 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 ; m2 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 ; m3 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 ; Two registers perform 2 4x4 transforms in parallel punpcklqdq m4, m0, m2 punpcklqdq m5, m1, m3 punpckhqdq m0, m0, m2 punpckhqdq m1, m1, m3 SWAP 4, 3 SWAP 5, 4 ; New permutation ; m0 0 1 2 3 32 33 34 35 8 9 10 11 40 41 42 43 ; m1 16 17 18 19 48 49 50 51 24 25 26 27 56 57 58 59 ; m3 4 5 6 7 36 37 38 39 12 13 14 15 44 45 46 47 ; m4 20 21 22 23 52 53 54 55 28 29 30 31 60 61 62 63 BUTTERFLY 16, 32, 0 BUTTERFLY 16, 32, 1 INTERLEAVE 16, 32, 0 INTERLEAVE 16, 32, 1 BUTTERFLY 16, 32, 0 BUTTERFLY 16, 32, 1 INTERLEAVE_PAIRS 16, 32, 0 INTERLEAVE_PAIRS 16, 32, 1 BUTTERFLY 16, 32, 0 BUTTERFLY 16, 32, 1 INTERLEAVE 16, 32, 0 INTERLEAVE 16, 32, 1 BUTTERFLY 16, 32, 0 BUTTERFLY 16, 32, 1 pabsw m0, m0 pabsw m1, m1 pabsw m3, m3 pabsw m4, m4 paddw m0, m1 paddw m3, m4 paddw m0, m3 HSUM 16, 32, 0, 1, eax NORMALIZE4PT RET .12bpc: RESET_MM_PERMUTATION mov bdmaxd, 2 pxor m6, m6 .12bpc_loop: movu xm0, [srcq + 0*src_strideq] movu xm1, [srcq + 1*src_strideq] movu xm2, [srcq + 2*src_strideq] movu xm3, [srcq + src_stride3q ] psubw xm0, [dstq + 0*dst_strideq] psubw xm1, [dstq + 1*dst_strideq] psubw xm2, [dstq + 2*dst_strideq] psubw xm3, [dstq + dst_stride3q ] pmovsxwd m0, xm0 pmovsxwd m1, xm1 pmovsxwd m2, xm2 pmovsxwd m3, xm3 add srcq, 16 add dstq, 16 HADAMARD_8X4_12BPC paddd m6, m0 dec bdmaxd jnz .12bpc_loop HSUM 32, 32, 6, 1, eax NORMALIZE4PT RET INIT_YMM avx2 cglobal satd_4x16_hbd, 5, 7, 12, src, src_stride, dst, dst_stride, bdmax, \ src_stride3, dst_stride3 lea src_stride3q, [3*src_strideq] lea dst_stride3q, [3*dst_strideq] cmp bdmaxd, (1 << 10) - 1 jne .12bpc ; BLOCK 1 movq xm0, [srcq + 0*src_strideq] movq xm1, [srcq + 1*src_strideq] movq xm2, [srcq + 2*src_strideq] movq xm3, [srcq + src_stride3q ] lea srcq, [srcq + 4*src_strideq] psubw xm0, [dstq + 0*dst_strideq] psubw xm1, [dstq + 1*dst_strideq] psubw xm2, [dstq + 2*dst_strideq] psubw xm3, [dstq + dst_stride3q ] lea dstq, [dstq + 4*dst_strideq] ; BLOCK 2 movq xm4, [srcq + 0*src_strideq] movq xm5, [srcq + 1*src_strideq] movq xm6, [srcq + 2*src_strideq] movq xm7, [srcq + src_stride3q ] lea srcq, [srcq + 4*src_strideq] psubw xm4, [dstq + 0*dst_strideq] psubw xm5, [dstq + 1*dst_strideq] psubw xm6, [dstq + 2*dst_strideq] psubw xm7, [dstq + dst_stride3q ] lea dstq, [dstq + 4*dst_strideq] vinserti128 m0, m0, xm4, 1 vinserti128 m1, m1, xm5, 1 vinserti128 m2, m2, xm6, 1 vinserti128 m3, m3, xm7, 1 ; BLOCK 3 movq xm4, [srcq + 0*src_strideq] movq xm5, 
[srcq + 1*src_strideq] movq xm6, [srcq + 2*src_strideq] movq xm7, [srcq + src_stride3q ] lea srcq, [srcq + 4*src_strideq] psubw xm4, [dstq + 0*dst_strideq] psubw xm5, [dstq + 1*dst_strideq] psubw xm6, [dstq + 2*dst_strideq] psubw xm7, [dstq + dst_stride3q ] lea dstq, [dstq + 4*dst_strideq] ; BLOCK 4 movq xm8, [srcq + 0*src_strideq] movq xm9, [srcq + 1*src_strideq] movq xm10, [srcq + 2*src_strideq] movq xm11, [srcq + src_stride3q ] psubw xm8, [dstq + 0*dst_strideq] psubw xm9, [dstq + 1*dst_strideq] psubw xm10, [dstq + 2*dst_strideq] psubw xm11, [dstq + dst_stride3q ] vinserti128 m4, m4, xm8, 1 vinserti128 m5, m5, xm9, 1 vinserti128 m6, m6, xm10, 1 vinserti128 m7, m7, xm11, 1 punpcklqdq m0, m0, m4 punpcklqdq m1, m1, m5 punpcklqdq m2, m2, m6 punpcklqdq m3, m3, m7 jmp m(satd_16x4_hbd).10bpc_main .12bpc: mov bdmaxd, 2 pxor m8, m8 .12bpc_loop: ; BLOCK 1 movq xm0, [srcq + 0*src_strideq] movq xm1, [srcq + 1*src_strideq] movq xm2, [srcq + 2*src_strideq] movq xm3, [srcq + src_stride3q ] lea srcq, [srcq + 4*src_strideq] psubw xm0, [dstq + 0*dst_strideq] psubw xm1, [dstq + 1*dst_strideq] psubw xm2, [dstq + 2*dst_strideq] psubw xm3, [dstq + dst_stride3q ] lea dstq, [dstq + 4*dst_strideq] pmovsxwd xm0, xm0 pmovsxwd xm1, xm1 pmovsxwd xm2, xm2 pmovsxwd xm3, xm3 ; BLOCK 2 movq xm4, [srcq + 0*src_strideq] movq xm5, [srcq + 1*src_strideq] movq xm6, [srcq + 2*src_strideq] movq xm7, [srcq + src_stride3q ] lea srcq, [srcq + 4*src_strideq] psubw xm4, [dstq + 0*dst_strideq] psubw xm5, [dstq + 1*dst_strideq] psubw xm6, [dstq + 2*dst_strideq] psubw xm7, [dstq + dst_stride3q ] lea dstq, [dstq + 4*dst_strideq] pmovsxwd xm4, xm4 pmovsxwd xm5, xm5 pmovsxwd xm6, xm6 pmovsxwd xm7, xm7 vinserti128 m0, m0, xm4, 1 vinserti128 m1, m1, xm5, 1 vinserti128 m2, m2, xm6, 1 vinserti128 m3, m3, xm7, 1 HADAMARD_8X4_12BPC paddd m8, m0 dec bdmaxd jnz .12bpc_loop HSUM 32, 32, 8, 0, eax NORMALIZE4PT RET INIT_YMM avx2 cglobal satd_8x4_hbd, 5, 7, 12, src, src_stride, dst, dst_stride, bdmax, \ src_stride3, dst_stride3 lea src_stride3q, [3*src_strideq] lea dst_stride3q, [3*dst_strideq] cmp bdmaxd, (1 << 10) - 1 jne .12bpc ; Load src rows movu xm0, [srcq + 0*src_strideq] movu xm1, [srcq + 1*src_strideq] movu xm2, [srcq + 2*src_strideq] movu xm3, [srcq + src_stride3q ] ; src -= dst psubw xm0, [dstq + 0*dst_strideq] psubw xm1, [dstq + 1*dst_strideq] psubw xm2, [dstq + 2*dst_strideq] psubw xm3, [dstq + dst_stride3q ] .10bpc_main: HADAMARD_4X4_PACKED 16, 32 pabsw m0, m0 pabsw m1, m1 paddw m0, m1 HSUM 16, 32, 0, 1, eax NORMALIZE4PT RET .12bpc: RESET_MM_PERMUTATION pmovzxwd m0, [srcq + 0*src_strideq] pmovzxwd m1, [srcq + 1*src_strideq] pmovzxwd m2, [srcq + 2*src_strideq] pmovzxwd m3, [srcq + src_stride3q ] pmovzxwd m4, [dstq + 0*dst_strideq] pmovzxwd m5, [dstq + 1*dst_strideq] pmovzxwd m6, [dstq + 2*dst_strideq] pmovzxwd m7, [dstq + dst_stride3q ] ; src -= dst psubd m0, m4 psubd m1, m5 psubd m2, m6 psubd m3, m7 .12bpc_main: HADAMARD_8X4_12BPC HSUM 32, 32, 0, 1, eax NORMALIZE4PT RET INIT_YMM avx2 cglobal satd_4x8_hbd, 5, 7, 12, src, src_stride, dst, dst_stride, bdmax, \ src_stride3, dst_stride3 lea src_stride3q, [3*src_strideq] lea dst_stride3q, [3*dst_strideq] cmp bdmaxd, (1 << 10) - 1 jne .12bpc movq xm0, [srcq + 0*src_strideq] movq xm1, [srcq + 1*src_strideq] movq xm2, [srcq + 2*src_strideq] movq xm3, [srcq + src_stride3q ] lea srcq, [srcq + 4*src_strideq] movq xm4, [srcq + 0*src_strideq] movq xm5, [srcq + 1*src_strideq] movq xm6, [srcq + 2*src_strideq] movq xm7, [srcq + src_stride3q ] ; This loads past the number of elements we are technically 
supposed ; to read, however, this should still be safe, since at least one ; valid element is in the memory address. psubw xm0, [dstq + 0*dst_strideq] psubw xm1, [dstq + 1*dst_strideq] psubw xm2, [dstq + 2*dst_strideq] psubw xm3, [dstq + dst_stride3q ] lea dstq, [dstq + 4*dst_strideq] psubw xm4, [dstq + 0*dst_strideq] psubw xm5, [dstq + 1*dst_strideq] psubw xm6, [dstq + 2*dst_strideq] psubw xm7, [dstq + dst_stride3q ] punpcklqdq xm0, xm0, xm4 punpcklqdq xm1, xm1, xm5 punpcklqdq xm2, xm2, xm6 punpcklqdq xm3, xm3, xm7 ; Jump to HADAMARD_4X4_PACKED in 8x4 satd, this saves us some binary size ; by deduplicating the shared code. jmp m(satd_8x4_hbd).10bpc_main ; No return; we return in the other function. .12bpc: RESET_MM_PERMUTATION pmovzxwd xm0, [srcq + 0*src_strideq] pmovzxwd xm1, [srcq + 1*src_strideq] pmovzxwd xm2, [srcq + 2*src_strideq] pmovzxwd xm3, [srcq + src_stride3q ] lea srcq, [srcq + 4*src_strideq] pmovzxwd xm4, [dstq + 0*dst_strideq] pmovzxwd xm5, [dstq + 1*dst_strideq] pmovzxwd xm6, [dstq + 2*dst_strideq] pmovzxwd xm7, [dstq + dst_stride3q ] lea dstq, [dstq + 4*dst_strideq] ; src -= dst psubd xm0, xm4 psubd xm1, xm5 psubd xm2, xm6 psubd xm3, xm7 pmovzxwd xm4, [srcq + 0*src_strideq] pmovzxwd xm5, [srcq + 1*src_strideq] pmovzxwd xm6, [srcq + 2*src_strideq] pmovzxwd xm7, [srcq + src_stride3q ] pmovzxwd xm8, [dstq + 0*dst_strideq] pmovzxwd xm9, [dstq + 1*dst_strideq] pmovzxwd xm10, [dstq + 2*dst_strideq] pmovzxwd xm11, [dstq + dst_stride3q ] ; src -= dst (second block) psubd xm4, xm8 psubd xm5, xm9 psubd xm6, xm10 psubd xm7, xm11 vinserti128 m0, m0, xm4, 1 vinserti128 m1, m1, xm5, 1 vinserti128 m2, m2, xm6, 1 vinserti128 m3, m3, xm7, 1 ; Jump to HADAMARD_4X4_PACKED in 8x4 satd, this saves us some binary size ; by deduplicating the shared code. jmp m(satd_8x4_hbd).12bpc_main ; No return; we return in the other function. ; , %macro SATD_NXM 2 INIT_YMM avx2 cglobal satd_%1x%2_hbd, 5, 10, 16, src, src_stride, dst, dst_stride, bdmax, \ src_stride3, dst_stride3, nsrc_stride4, ndst_stride4, rows lea nsrc_stride4q, [4*src_strideq] lea ndst_stride4q, [4*dst_strideq] lea src_stride3q, [3*src_strideq] lea dst_stride3q, [3*dst_strideq] neg nsrc_stride4q neg ndst_stride4q pxor m9, m9 ; Height contains the number of rows. mov rowsd, %2/8 .outer: mov bdmaxd, %1/8 ; Loop over blocks in same row. .loop: LOAD_DIFF_8X8 ; Fix up pointers and go to next block in same row. 
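; (nsrc_stride4q/ndst_stride4q hold the negated 4-row strides, so adding them
; undoes the 4-row advance performed inside LOAD_DIFF_8X8, while the +16
; bytes step to the next 8 columns of 16-bit pixels in the same row band.)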
lea srcq, [srcq + nsrc_stride4q + 16] lea dstq, [dstq + ndst_stride4q + 16] call m(satd_8x8_hbd_internal) dec bdmaxd jnz .loop lea srcq, [srcq + 8*src_strideq - (%1*16)/8] lea dstq, [dstq + 8*dst_strideq - (%1*16)/8] dec rowsd jnz .outer HSUM 32, 32, 9, 0, eax NORMALIZE8PT RET %endmacro %macro SATD_NX8 1 INIT_YMM avx2 cglobal satd_%1x8_hbd, 5, 9, 16, src, src_stride, dst, dst_stride, bdmax, \ src_stride3, dst_stride3, nsrc_stride4, ndst_stride4 lea nsrc_stride4q, [4*src_strideq] lea ndst_stride4q, [4*dst_strideq] lea src_stride3q, [3*src_strideq] lea dst_stride3q, [3*dst_strideq] neg nsrc_stride4q neg ndst_stride4q pxor m9, m9 mov bdmaxd, %1/8 .loop: LOAD_DIFF_8X8 lea srcq, [srcq + nsrc_stride4q + 16] lea dstq, [dstq + ndst_stride4q + 16] call m(satd_8x8_hbd_internal) dec bdmaxd jnz .loop HSUM 32, 32, 9, 0, eax NORMALIZE8PT RET %endmacro %macro SATD_8XM 1 INIT_YMM avx2 cglobal satd_8x%1_hbd, 5, 7, 16, src, src_stride, dst, dst_stride, bdmax, \ src_stride3, dst_stride3 lea src_stride3q, [3*src_strideq] lea dst_stride3q, [3*dst_strideq] pxor m9, m9 mov bdmaxd, %1/8 .loop: LOAD_DIFF_8X8 lea srcq, [srcq + 4*src_strideq] lea dstq, [dstq + 4*dst_strideq] call m(satd_8x8_hbd_internal) dec bdmaxd jnz .loop HSUM 32, 32, 9, 0, eax NORMALIZE8PT RET %endmacro SATD_NXM 16, 16 SATD_NXM 32, 32 SATD_NXM 64, 64 SATD_NXM 128, 128 SATD_NXM 16, 32 SATD_NXM 16, 64 SATD_NXM 32, 16 SATD_NXM 32, 64 SATD_NXM 64, 16 SATD_NXM 64, 32 SATD_NXM 64, 128 SATD_NXM 128, 64 SATD_NX8 16 SATD_NX8 32 SATD_8XM 16 SATD_8XM 32 %endif ; ARCH_X86_64 av-scenechange-0.14.1/src/asm/x86/tables.asm000064400000000000000000001317041046102023000165340ustar 00000000000000; Copyright (c) 2019-2022, The rav1e contributors. All rights reserved ; ; This source code is subject to the terms of the BSD 2 Clause License and ; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License ; was not distributed with this source code in the LICENSE file, you can ; obtain it at www.aomedia.org/license/software. If the Alliance for Open ; Media Patent License 1.0 was not distributed with this source code in the ; PATENTS file, you can obtain it at www.aomedia.org/license/patent. ; Copyright © 2018, VideoLAN and dav1d authors ; Copyright © 2018, Two Orioles, LLC ; All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions are met: ; ; 1. Redistributions of source code must retain the above copyright notice, this ; list of conditions and the following disclaimer. ; ; 2. Redistributions in binary form must reproduce the above copyright notice, ; this list of conditions and the following disclaimer in the documentation ; and/or other materials provided with the distribution. ; ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
%include "config.asm" %include "src/asm/x86/x86inc.asm" SECTION_RODATA 16 align 8, db 0 const mc_subpel_filters, db 0, 1, -3, 63, 4, -1, 0, 0, ; REGULAR db 0, 1, -5, 61, 9, -2, 0, 0, db 0, 1, -6, 58, 14, -4, 1, 0, db 0, 1, -7, 55, 19, -5, 1, 0, db 0, 1, -7, 51, 24, -6, 1, 0, db 0, 1, -8, 47, 29, -6, 1, 0, db 0, 1, -7, 42, 33, -6, 1, 0, db 0, 1, -7, 38, 38, -7, 1, 0, db 0, 1, -6, 33, 42, -7, 1, 0, db 0, 1, -6, 29, 47, -8, 1, 0, db 0, 1, -6, 24, 51, -7, 1, 0, db 0, 1, -5, 19, 55, -7, 1, 0, db 0, 1, -4, 14, 58, -6, 1, 0, db 0, 0, -2, 9, 61, -5, 1, 0, db 0, 0, -1, 4, 63, -3, 1, 0, db 0, 1, 14, 31, 17, 1, 0, 0, ; SMOOTH db 0, 0, 13, 31, 18, 2, 0, 0, db 0, 0, 11, 31, 20, 2, 0, 0, db 0, 0, 10, 30, 21, 3, 0, 0, db 0, 0, 9, 29, 22, 4, 0, 0, db 0, 0, 8, 28, 23, 5, 0, 0, db 0, -1, 8, 27, 24, 6, 0, 0, db 0, -1, 7, 26, 26, 7, -1, 0, db 0, 0, 6, 24, 27, 8, -1, 0, db 0, 0, 5, 23, 28, 8, 0, 0, db 0, 0, 4, 22, 29, 9, 0, 0, db 0, 0, 3, 21, 30, 10, 0, 0, db 0, 0, 2, 20, 31, 11, 0, 0, db 0, 0, 2, 18, 31, 13, 0, 0, db 0, 0, 1, 17, 31, 14, 1, 0, db -1, 1, -3, 63, 4, -1, 1, 0, ; SHARP db -1, 3, -6, 62, 8, -3, 2, -1, db -1, 4, -9, 60, 13, -5, 3, -1, db -2, 5, -11, 58, 19, -7, 3, -1, db -2, 5, -11, 54, 24, -9, 4, -1, db -2, 5, -12, 50, 30, -10, 4, -1, db -2, 5, -12, 45, 35, -11, 5, -1, db -2, 6, -12, 40, 40, -12, 6, -2, db -1, 5, -11, 35, 45, -12, 5, -2, db -1, 4, -10, 30, 50, -12, 5, -2, db -1, 4, -9, 24, 54, -11, 5, -2, db -1, 3, -7, 19, 58, -11, 5, -2, db -1, 3, -5, 13, 60, -9, 4, -1, db -1, 2, -3, 8, 62, -6, 3, -1, db 0, 1, -1, 4, 63, -3, 1, -1, db 0, 0, -2, 63, 4, -1, 0, 0, ; REGULAR 4 db 0, 0, -4, 61, 9, -2, 0, 0, db 0, 0, -5, 58, 14, -3, 0, 0, db 0, 0, -6, 55, 19, -4, 0, 0, db 0, 0, -6, 51, 24, -5, 0, 0, db 0, 0, -7, 47, 29, -5, 0, 0, db 0, 0, -6, 42, 33, -5, 0, 0, db 0, 0, -6, 38, 38, -6, 0, 0, db 0, 0, -5, 33, 42, -6, 0, 0, db 0, 0, -5, 29, 47, -7, 0, 0, db 0, 0, -5, 24, 51, -6, 0, 0, db 0, 0, -4, 19, 55, -6, 0, 0, db 0, 0, -3, 14, 58, -5, 0, 0, db 0, 0, -2, 9, 61, -4, 0, 0, db 0, 0, -1, 4, 63, -2, 0, 0, db 0, 0, 15, 31, 17, 1, 0, 0, ; SMOOTH 4 db 0, 0, 13, 31, 18, 2, 0, 0, db 0, 0, 11, 31, 20, 2, 0, 0, db 0, 0, 10, 30, 21, 3, 0, 0, db 0, 0, 9, 29, 22, 4, 0, 0, db 0, 0, 8, 28, 23, 5, 0, 0, db 0, 0, 7, 27, 24, 6, 0, 0, db 0, 0, 6, 26, 26, 6, 0, 0, db 0, 0, 6, 24, 27, 7, 0, 0, db 0, 0, 5, 23, 28, 8, 0, 0, db 0, 0, 4, 22, 29, 9, 0, 0, db 0, 0, 3, 21, 30, 10, 0, 0, db 0, 0, 2, 20, 31, 11, 0, 0, db 0, 0, 2, 18, 31, 13, 0, 0, db 0, 0, 1, 17, 31, 15, 0, 0, ; Bilin scaled being very rarely used, add a new table entry ; and use the put/prep_8tap_scaled code, thus acting as a ; scaled bilinear filter. 
db 0, 0, 0, 60, 4, 0, 0, 0, db 0, 0, 0, 56, 8, 0, 0, 0, db 0, 0, 0, 52, 12, 0, 0, 0, db 0, 0, 0, 48, 16, 0, 0, 0, db 0, 0, 0, 44, 20, 0, 0, 0, db 0, 0, 0, 40, 24, 0, 0, 0, db 0, 0, 0, 36, 28, 0, 0, 0, db 0, 0, 0, 32, 32, 0, 0, 0, db 0, 0, 0, 28, 36, 0, 0, 0, db 0, 0, 0, 24, 40, 0, 0, 0, db 0, 0, 0, 20, 44, 0, 0, 0, db 0, 0, 0, 16, 48, 0, 0, 0, db 0, 0, 0, 12, 52, 0, 0, 0, db 0, 0, 0, 8, 56, 0, 0, 0, db 0, 0, 0, 4, 60, 0, 0, 0 align 64, db 0 const filter_intra_taps, db -6, 10, -5, 2, -3, 1, -3, 1, ; 0 db -4, 6, -3, 2, -3, 2, -3, 1, db 0, 0, 10, 0, 1, 10, 1, 2, db 0, 0, 6, 0, 2, 6, 2, 2, db 0, 12, 0, 9, 0, 7, 10, 5, db 0, 2, 0, 2, 0, 2, 6, 3, db 0, 0, 0, 0, 0, 0, 0, 0, db 12, 0, 9, 0, 7, 0, 5, 0, db -10, 16, -6, 0, -4, 0, -2, 0, ; 1 db -10, 16, -6, 0, -4, 0, -2, 0, db 0, 0, 16, 0, 0, 16, 0, 0, db 0, 0, 16, 0, 0, 16, 0, 0, db 0, 10, 0, 6, 0, 4, 16, 2, db 0, 0, 0, 0, 0, 0, 16, 0, db 0, 0, 0, 0, 0, 0, 0, 0, db 10, 0, 6, 0, 4, 0, 2, 0, db -8, 8, -8, 0, -8, 0, -8, 0, ; 2 db -4, 4, -4, 0, -4, 0, -4, 0, db 0, 0, 8, 0, 0, 8, 0, 0, db 0, 0, 4, 0, 0, 4, 0, 0, db 0, 16, 0, 16, 0, 16, 8, 16, db 0, 0, 0, 0, 0, 0, 4, 0, db 0, 0, 0, 0, 0, 0, 0, 0, db 16, 0, 16, 0, 16, 0, 16, 0, db -2, 8, -1, 3, -1, 2, 0, 1, ; 3 db -1, 4, -1, 3, -1, 2, -1, 2, db 0, 0, 8, 0, 3, 8, 2, 3, db 0, 0, 4, 0, 3, 4, 2, 3, db 0, 10, 0, 6, 0, 4, 8, 2, db 0, 3, 0, 4, 0, 4, 4, 3, db 0, 0, 0, 0, 0, 0, 0, 0, db 10, 0, 6, 0, 4, 0, 3, 0, db -12, 14, -10, 0, -9, 0, -8, 0, ; 4 db -10, 12, -9, 1, -8, 0, -7, 0, db 0, 0, 14, 0, 0, 14, 0, 0, db 0, 0, 12, 0, 0, 12, 0, 1, db 0, 14, 0, 12, 0, 11, 14, 10, db 0, 0, 0, 0, 0, 1, 12, 1, db 0, 0, 0, 0, 0, 0, 0, 0, db 14, 0, 12, 0, 11, 0, 9, 0 align 64, db 0 const sgr_x_by_x, db 255, 128, 85, 64, 51, 43, 37, 32, 28, 26, 23, 21, 20, 18, 17, db 16, 15, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 9, 9, db 8, 8, 8, 8, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, db 6, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 4, 4, 4, db 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, db 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, db 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, db 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, db 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, db 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, db 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, db 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, db 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, db 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, db 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, db 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, db 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, db 0 align 8, db 0 const mc_warp_filter, db 0, 127, 0, 0, 0, 1, 0, 0, 0, 127, 0, 0, -1, 2, 0, 0, ; [-1, 0) db 1, 127, -1, 0, -3, 4, 0, 0, 1, 126, -2, 0, -4, 6, 1, 0, db 1, 126, -3, 0, -5, 8, 1, 0, 1, 125, -4, 0, -6, 11, 1, 0, db 1, 124, -4, 0, -7, 13, 1, 0, 2, 123, -5, 0, -8, 15, 1, 0, db 2, 122, -6, 0, -9, 18, 1, 0, 2, 121, -6, 0, -10, 20, 1, 0, db 2, 120, -7, 0, -11, 22, 2, 0, 2, 119, -8, 0, -12, 25, 2, 0, db 3, 117, -8, 0, -13, 27, 2, 0, 3, 116, -9, 0, -13, 29, 2, 0, db 3, 114, -10, 0, -14, 32, 3, 0, 3, 113, -10, 0, -15, 35, 2, 0, db 3, 111, -11, 0, -15, 37, 3, 0, 3, 109, -11, 0, -16, 40, 3, 0, db 3, 108, -12, 0, -16, 42, 3, 0, 4, 106, -13, 0, -17, 45, 3, 0, db 4, 104, -13, 0, -17, 47, 3, 0, 4, 102, -14, 0, -17, 50, 3, 0, db 4, 100, -14, 0, -17, 52, 3, 0, 4, 98, -15, 0, -18, 55, 4, 0, db 4, 96, -15, 0, -18, 58, 3, 0, 4, 94, -16, 0, -18, 60, 4, 0, db 4, 91, -16, 0, -18, 63, 4, 0, 4, 89, -16, 0, -18, 65, 4, 0, db 4, 87, -17, 0, -18, 68, 4, 0, 4, 85, -17, 0, -18, 70, 4, 0, db 4, 82, -17, 0, -18, 73, 4, 0, 4, 
80, -17, 0, -18, 75, 4, 0, db 4, 78, -18, 0, -18, 78, 4, 0, 4, 75, -18, 0, -17, 80, 4, 0, db 4, 73, -18, 0, -17, 82, 4, 0, 4, 70, -18, 0, -17, 85, 4, 0, db 4, 68, -18, 0, -17, 87, 4, 0, 4, 65, -18, 0, -16, 89, 4, 0, db 4, 63, -18, 0, -16, 91, 4, 0, 4, 60, -18, 0, -16, 94, 4, 0, db 3, 58, -18, 0, -15, 96, 4, 0, 4, 55, -18, 0, -15, 98, 4, 0, db 3, 52, -17, 0, -14, 100, 4, 0, 3, 50, -17, 0, -14, 102, 4, 0, db 3, 47, -17, 0, -13, 104, 4, 0, 3, 45, -17, 0, -13, 106, 4, 0, db 3, 42, -16, 0, -12, 108, 3, 0, 3, 40, -16, 0, -11, 109, 3, 0, db 3, 37, -15, 0, -11, 111, 3, 0, 2, 35, -15, 0, -10, 113, 3, 0, db 3, 32, -14, 0, -10, 114, 3, 0, 2, 29, -13, 0, -9, 116, 3, 0, db 2, 27, -13, 0, -8, 117, 3, 0, 2, 25, -12, 0, -8, 119, 2, 0, db 2, 22, -11, 0, -7, 120, 2, 0, 1, 20, -10, 0, -6, 121, 2, 0, db 1, 18, -9, 0, -6, 122, 2, 0, 1, 15, -8, 0, -5, 123, 2, 0, db 1, 13, -7, 0, -4, 124, 1, 0, 1, 11, -6, 0, -4, 125, 1, 0, db 1, 8, -5, 0, -3, 126, 1, 0, 1, 6, -4, 0, -2, 126, 1, 0, db 0, 4, -3, 0, -1, 127, 1, 0, 0, 2, -1, 0, 0, 127, 0, 0, db 0, 0, 1, 0, 0, 127, 0, 0, 0, -1, 2, 0, 0, 127, 0, 0, ; [0, 1) db 0, -3, 4, 1, 1, 127, -2, 0, 0, -5, 6, 1, 1, 127, -2, 0, db 0, -6, 8, 1, 2, 126, -3, 0, -1, -7, 11, 2, 2, 126, -4, -1, db -1, -8, 13, 2, 3, 125, -5, -1, -1, -10, 16, 3, 3, 124, -6, -1, db -1, -11, 18, 3, 4, 123, -7, -1, -1, -12, 20, 3, 4, 122, -7, -1, db -1, -13, 23, 3, 4, 121, -8, -1, -2, -14, 25, 4, 5, 120, -9, -1, db -1, -15, 27, 4, 5, 119, -10, -1, -1, -16, 30, 4, 5, 118, -11, -1, db -2, -17, 33, 5, 6, 116, -12, -1, -2, -17, 35, 5, 6, 114, -12, -1, db -2, -18, 38, 5, 6, 113, -13, -1, -2, -19, 41, 6, 7, 111, -14, -2, db -2, -19, 43, 6, 7, 110, -15, -2, -2, -20, 46, 6, 7, 108, -15, -2, db -2, -20, 49, 6, 7, 106, -16, -2, -2, -21, 51, 7, 7, 104, -16, -2, db -2, -21, 54, 7, 7, 102, -17, -2, -2, -21, 56, 7, 8, 100, -18, -2, db -2, -22, 59, 7, 8, 98, -18, -2, -2, -22, 62, 7, 8, 96, -19, -2, db -2, -22, 64, 7, 8, 94, -19, -2, -2, -22, 67, 8, 8, 91, -20, -2, db -2, -22, 69, 8, 8, 89, -20, -2, -2, -22, 72, 8, 8, 87, -21, -2, db -2, -21, 74, 8, 8, 84, -21, -2, -2, -22, 77, 8, 8, 82, -21, -2, db -2, -21, 79, 8, 8, 79, -21, -2, -2, -21, 82, 8, 8, 77, -22, -2, db -2, -21, 84, 8, 8, 74, -21, -2, -2, -21, 87, 8, 8, 72, -22, -2, db -2, -20, 89, 8, 8, 69, -22, -2, -2, -20, 91, 8, 8, 67, -22, -2, db -2, -19, 94, 8, 7, 64, -22, -2, -2, -19, 96, 8, 7, 62, -22, -2, db -2, -18, 98, 8, 7, 59, -22, -2, -2, -18, 100, 8, 7, 56, -21, -2, db -2, -17, 102, 7, 7, 54, -21, -2, -2, -16, 104, 7, 7, 51, -21, -2, db -2, -16, 106, 7, 6, 49, -20, -2, -2, -15, 108, 7, 6, 46, -20, -2, db -2, -15, 110, 7, 6, 43, -19, -2, -2, -14, 111, 7, 6, 41, -19, -2, db -1, -13, 113, 6, 5, 38, -18, -2, -1, -12, 114, 6, 5, 35, -17, -2, db -1, -12, 116, 6, 5, 33, -17, -2, -1, -11, 118, 5, 4, 30, -16, -1, db -1, -10, 119, 5, 4, 27, -15, -1, -1, -9, 120, 5, 4, 25, -14, -2, db -1, -8, 121, 4, 3, 23, -13, -1, -1, -7, 122, 4, 3, 20, -12, -1, db -1, -7, 123, 4, 3, 18, -11, -1, -1, -6, 124, 3, 3, 16, -10, -1, db -1, -5, 125, 3, 2, 13, -8, -1, -1, -4, 126, 2, 2, 11, -7, -1, db 0, -3, 126, 2, 1, 8, -6, 0, 0, -2, 127, 1, 1, 6, -5, 0, db 0, -2, 127, 1, 1, 4, -3, 0, 0, 0, 127, 0, 0, 2, -1, 0, db 0, 0, 127, 0, 0, 1, 0, 0, 0, 0, 127, 0, 0, -1, 2, 0, ; [1, 2) db 0, 1, 127, -1, 0, -3, 4, 0, 0, 1, 126, -2, 0, -4, 6, 1, db 0, 1, 126, -3, 0, -5, 8, 1, 0, 1, 125, -4, 0, -6, 11, 1, db 0, 1, 124, -4, 0, -7, 13, 1, 0, 2, 123, -5, 0, -8, 15, 1, db 0, 2, 122, -6, 0, -9, 18, 1, 0, 2, 121, -6, 0, -10, 20, 1, db 0, 2, 120, -7, 0, -11, 22, 2, 0, 2, 119, -8, 0, -12, 25, 2, db 0, 3, 117, -8, 0, 
-13, 27, 2, 0, 3, 116, -9, 0, -13, 29, 2, db 0, 3, 114, -10, 0, -14, 32, 3, 0, 3, 113, -10, 0, -15, 35, 2, db 0, 3, 111, -11, 0, -15, 37, 3, 0, 3, 109, -11, 0, -16, 40, 3, db 0, 3, 108, -12, 0, -16, 42, 3, 0, 4, 106, -13, 0, -17, 45, 3, db 0, 4, 104, -13, 0, -17, 47, 3, 0, 4, 102, -14, 0, -17, 50, 3, db 0, 4, 100, -14, 0, -17, 52, 3, 0, 4, 98, -15, 0, -18, 55, 4, db 0, 4, 96, -15, 0, -18, 58, 3, 0, 4, 94, -16, 0, -18, 60, 4, db 0, 4, 91, -16, 0, -18, 63, 4, 0, 4, 89, -16, 0, -18, 65, 4, db 0, 4, 87, -17, 0, -18, 68, 4, 0, 4, 85, -17, 0, -18, 70, 4, db 0, 4, 82, -17, 0, -18, 73, 4, 0, 4, 80, -17, 0, -18, 75, 4, db 0, 4, 78, -18, 0, -18, 78, 4, 0, 4, 75, -18, 0, -17, 80, 4, db 0, 4, 73, -18, 0, -17, 82, 4, 0, 4, 70, -18, 0, -17, 85, 4, db 0, 4, 68, -18, 0, -17, 87, 4, 0, 4, 65, -18, 0, -16, 89, 4, db 0, 4, 63, -18, 0, -16, 91, 4, 0, 4, 60, -18, 0, -16, 94, 4, db 0, 3, 58, -18, 0, -15, 96, 4, 0, 4, 55, -18, 0, -15, 98, 4, db 0, 3, 52, -17, 0, -14, 100, 4, 0, 3, 50, -17, 0, -14, 102, 4, db 0, 3, 47, -17, 0, -13, 104, 4, 0, 3, 45, -17, 0, -13, 106, 4, db 0, 3, 42, -16, 0, -12, 108, 3, 0, 3, 40, -16, 0, -11, 109, 3, db 0, 3, 37, -15, 0, -11, 111, 3, 0, 2, 35, -15, 0, -10, 113, 3, db 0, 3, 32, -14, 0, -10, 114, 3, 0, 2, 29, -13, 0, -9, 116, 3, db 0, 2, 27, -13, 0, -8, 117, 3, 0, 2, 25, -12, 0, -8, 119, 2, db 0, 2, 22, -11, 0, -7, 120, 2, 0, 1, 20, -10, 0, -6, 121, 2, db 0, 1, 18, -9, 0, -6, 122, 2, 0, 1, 15, -8, 0, -5, 123, 2, db 0, 1, 13, -7, 0, -4, 124, 1, 0, 1, 11, -6, 0, -4, 125, 1, db 0, 1, 8, -5, 0, -3, 126, 1, 0, 1, 6, -4, 0, -2, 126, 1, db 0, 0, 4, -3, 0, -1, 127, 1, 0, 0, 2, -1, 0, 0, 127, 0, db 0, 0, 2, -1, 0, 0, 127, 0 ; dummy (replicate row index 191) ; Values that are 0 will never be used align 2, db 0 const dr_intra_derivative, dw 0, ; Angles: dw 1023, 0, ; 3, 93, 183 dw 547, ; 6, 96, 186 dw 372, 0, 0, ; 9, 99, 189 dw 273, ; 14, 104, 194 dw 215, 0, ; 17, 107, 197 dw 178, ; 20, 110, 200 dw 151, 0, ; 23, 113, 203 (113 & 203 are base angles) dw 132, ; 26, 116, 206 dw 116, 0, ; 29, 119, 209 dw 102, 0, ; 32, 122, 212 dw 90, ; 36, 126, 216 dw 80, 0, ; 39, 129, 219 dw 71, ; 42, 132, 222 dw 64, 0, ; 45, 135, 225 (45 & 135 are base angles) dw 57, ; 48, 138, 228 dw 51, 0, ; 51, 141, 231 dw 45, 0, ; 54, 144, 234 dw 40, ; 58, 148, 238 dw 35, 0, ; 61, 151, 241 dw 31, ; 64, 154, 244 dw 27, 0, ; 67, 157, 247 (67 & 157 are base angles) dw 23, ; 70, 160, 250 dw 19, 0, ; 73, 163, 253 dw 15, 0, ; 76, 166, 256 dw 11, 0, ; 81, 171, 261 dw 7, ; 84, 174, 264 dw 3 ; 87, 177, 267 ; Taken from the spec. 
Range is [-2048, 2047], mean is 0 and stddev is 512 align 2, db 0 const gaussian_sequence, dw 56, 568, -180, 172, 124, -84, 172, -64, -900, 24, 820, dw 224, 1248, 996, 272, -8, -916, -388, -732, -104, -188, 800, dw 112, -652, -320, -376, 140, -252, 492, -168, 44, -788, 588, dw -584, 500, -228, 12, 680, 272, -476, 972, -100, 652, 368, dw 432, -196, -720, -192, 1000, -332, 652, -136, -552, -604, -4, dw 192, -220, -136, 1000, -52, 372, -96, -624, 124, -24, 396, dw 540, -12, -104, 640, 464, 244, -208, -84, 368, -528, -740, dw 248, -968, -848, 608, 376, -60, -292, -40, -156, 252, -292, dw 248, 224, -280, 400, -244, 244, -60, 76, -80, 212, 532, dw 340, 128, -36, 824, -352, -60, -264, -96, -612, 416, -704, dw 220, -204, 640, -160, 1220, -408, 900, 336, 20, -336, -96, dw -792, 304, 48, -28, -1232, -1172, -448, 104, -292, -520, 244, dw 60, -948, 0, -708, 268, 108, 356, -548, 488, -344, -136, dw 488, -196, -224, 656, -236, -1128, 60, 4, 140, 276, -676, dw -376, 168, -108, 464, 8, 564, 64, 240, 308, -300, -400, dw -456, -136, 56, 120, -408, -116, 436, 504, -232, 328, 844, dw -164, -84, 784, -168, 232, -224, 348, -376, 128, 568, 96, dw -1244, -288, 276, 848, 832, -360, 656, 464, -384, -332, -356, dw 728, -388, 160, -192, 468, 296, 224, 140, -776, -100, 280, dw 4, 196, 44, -36, -648, 932, 16, 1428, 28, 528, 808, dw 772, 20, 268, 88, -332, -284, 124, -384, -448, 208, -228, dw -1044, -328, 660, 380, -148, -300, 588, 240, 540, 28, 136, dw -88, -436, 256, 296, -1000, 1400, 0, -48, 1056, -136, 264, dw -528, -1108, 632, -484, -592, -344, 796, 124, -668, -768, 388, dw 1296, -232, -188, -200, -288, -4, 308, 100, -168, 256, -500, dw 204, -508, 648, -136, 372, -272, -120, -1004, -552, -548, -384, dw 548, -296, 428, -108, -8, -912, -324, -224, -88, -112, -220, dw -100, 996, -796, 548, 360, -216, 180, 428, -200, -212, 148, dw 96, 148, 284, 216, -412, -320, 120, -300, -384, -604, -572, dw -332, -8, -180, -176, 696, 116, -88, 628, 76, 44, -516, dw 240, -208, -40, 100, -592, 344, -308, -452, -228, 20, 916, dw -1752, -136, -340, -804, 140, 40, 512, 340, 248, 184, -492, dw 896, -156, 932, -628, 328, -688, -448, -616, -752, -100, 560, dw -1020, 180, -800, -64, 76, 576, 1068, 396, 660, 552, -108, dw -28, 320, -628, 312, -92, -92, -472, 268, 16, 560, 516, dw -672, -52, 492, -100, 260, 384, 284, 292, 304, -148, 88, dw -152, 1012, 1064, -228, 164, -376, -684, 592, -392, 156, 196, dw -524, -64, -884, 160, -176, 636, 648, 404, -396, -436, 864, dw 424, -728, 988, -604, 904, -592, 296, -224, 536, -176, -920, dw 436, -48, 1176, -884, 416, -776, -824, -884, 524, -548, -564, dw -68, -164, -96, 692, 364, -692, -1012, -68, 260, -480, 876, dw -1116, 452, -332, -352, 892, -1088, 1220, -676, 12, -292, 244, dw 496, 372, -32, 280, 200, 112, -440, -96, 24, -644, -184, dw 56, -432, 224, -980, 272, -260, 144, -436, 420, 356, 364, dw -528, 76, 172, -744, -368, 404, -752, -416, 684, -688, 72, dw 540, 416, 92, 444, 480, -72, -1416, 164, -1172, -68, 24, dw 424, 264, 1040, 128, -912, -524, -356, 64, 876, -12, 4, dw -88, 532, 272, -524, 320, 276, -508, 940, 24, -400, -120, dw 756, 60, 236, -412, 100, 376, -484, 400, -100, -740, -108, dw -260, 328, -268, 224, -200, -416, 184, -604, -564, -20, 296, dw 60, 892, -888, 60, 164, 68, -760, 216, -296, 904, -336, dw -28, 404, -356, -568, -208, -1480, -512, 296, 328, -360, -164, dw -1560, -776, 1156, -428, 164, -504, -112, 120, -216, -148, -264, dw 308, 32, 64, -72, 72, 116, 176, -64, -272, 460, -536, dw -784, -280, 348, 108, -752, -132, 524, -540, -776, 116, -296, dw -1196, -288, -560, 1040, -472, 116, 
-848, -1116, 116, 636, 696, dw 284, -176, 1016, 204, -864, -648, -248, 356, 972, -584, -204, dw 264, 880, 528, -24, -184, 116, 448, -144, 828, 524, 212, dw -212, 52, 12, 200, 268, -488, -404, -880, 824, -672, -40, dw 908, -248, 500, 716, -576, 492, -576, 16, 720, -108, 384, dw 124, 344, 280, 576, -500, 252, 104, -308, 196, -188, -8, dw 1268, 296, 1032, -1196, 436, 316, 372, -432, -200, -660, 704, dw -224, 596, -132, 268, 32, -452, 884, 104, -1008, 424, -1348, dw -280, 4, -1168, 368, 476, 696, 300, -8, 24, 180, -592, dw -196, 388, 304, 500, 724, -160, 244, -84, 272, -256, -420, dw 320, 208, -144, -156, 156, 364, 452, 28, 540, 316, 220, dw -644, -248, 464, 72, 360, 32, -388, 496, -680, -48, 208, dw -116, -408, 60, -604, -392, 548, -840, 784, -460, 656, -544, dw -388, -264, 908, -800, -628, -612, -568, 572, -220, 164, 288, dw -16, -308, 308, -112, -636, -760, 280, -668, 432, 364, 240, dw -196, 604, 340, 384, 196, 592, -44, -500, 432, -580, -132, dw 636, -76, 392, 4, -412, 540, 508, 328, -356, -36, 16, dw -220, -64, -248, -60, 24, -192, 368, 1040, 92, -24, -1044, dw -32, 40, 104, 148, 192, -136, -520, 56, -816, -224, 732, dw 392, 356, 212, -80, -424, -1008, -324, 588, -1496, 576, 460, dw -816, -848, 56, -580, -92, -1372, -112, -496, 200, 364, 52, dw -140, 48, -48, -60, 84, 72, 40, 132, -356, -268, -104, dw -284, -404, 732, -520, 164, -304, -540, 120, 328, -76, -460, dw 756, 388, 588, 236, -436, -72, -176, -404, -316, -148, 716, dw -604, 404, -72, -88, -888, -68, 944, 88, -220, -344, 960, dw 472, 460, -232, 704, 120, 832, -228, 692, -508, 132, -476, dw 844, -748, -364, -44, 1116, -1104, -1056, 76, 428, 552, -692, dw 60, 356, 96, -384, -188, -612, -576, 736, 508, 892, 352, dw -1132, 504, -24, -352, 324, 332, -600, -312, 292, 508, -144, dw -8, 484, 48, 284, -260, -240, 256, -100, -292, -204, -44, dw 472, -204, 908, -188, -1000, -256, 92, 1164, -392, 564, 356, dw 652, -28, -884, 256, 484, -192, 760, -176, 376, -524, -452, dw -436, 860, -736, 212, 124, 504, -476, 468, 76, -472, 552, dw -692, -944, -620, 740, -240, 400, 132, 20, 192, -196, 264, dw -668, -1012, -60, 296, -316, -828, 76, -156, 284, -768, -448, dw -832, 148, 248, 652, 616, 1236, 288, -328, -400, -124, 588, dw 220, 520, -696, 1032, 768, -740, -92, -272, 296, 448, -464, dw 412, -200, 392, 440, -200, 264, -152, -260, 320, 1032, 216, dw 320, -8, -64, 156, -1016, 1084, 1172, 536, 484, -432, 132, dw 372, -52, -256, 84, 116, -352, 48, 116, 304, -384, 412, dw 924, -300, 528, 628, 180, 648, 44, -980, -220, 1320, 48, dw 332, 748, 524, -268, -720, 540, -276, 564, -344, -208, -196, dw 436, 896, 88, -392, 132, 80, -964, -288, 568, 56, -48, dw -456, 888, 8, 552, -156, -292, 948, 288, 128, -716, -292, dw 1192, -152, 876, 352, -600, -260, -812, -468, -28, -120, -32, dw -44, 1284, 496, 192, 464, 312, -76, -516, -380, -456, -1012, dw -48, 308, -156, 36, 492, -156, -808, 188, 1652, 68, -120, dw -116, 316, 160, -140, 352, 808, -416, 592, 316, -480, 56, dw 528, -204, -568, 372, -232, 752, -344, 744, -4, 324, -416, dw -600, 768, 268, -248, -88, -132, -420, -432, 80, -288, 404, dw -316, -1216, -588, 520, -108, 92, -320, 368, -480, -216, -92, dw 1688, -300, 180, 1020, -176, 820, -68, -228, -260, 436, -904, dw 20, 40, -508, 440, -736, 312, 332, 204, 760, -372, 728, dw 96, -20, -632, -520, -560, 336, 1076, -64, -532, 776, 584, dw 192, 396, -728, -520, 276, -188, 80, -52, -612, -252, -48, dw 648, 212, -688, 228, -52, -260, 428, -412, -272, -404, 180, dw 816, -796, 48, 152, 484, -88, -216, 988, 696, 188, -528, dw 648, -116, -180, 316, 476, 12, -564, 96, 476, 
-252, -364, dw -376, -392, 556, -256, -576, 260, -352, 120, -16, -136, -260, dw -492, 72, 556, 660, 580, 616, 772, 436, 424, -32, -324, dw -1268, 416, -324, -80, 920, 160, 228, 724, 32, -516, 64, dw 384, 68, -128, 136, 240, 248, -204, -68, 252, -932, -120, dw -480, -628, -84, 192, 852, -404, -288, -132, 204, 100, 168, dw -68, -196, -868, 460, 1080, 380, -80, 244, 0, 484, -888, dw 64, 184, 352, 600, 460, 164, 604, -196, 320, -64, 588, dw -184, 228, 12, 372, 48, -848, -344, 224, 208, -200, 484, dw 128, -20, 272, -468, -840, 384, 256, -720, -520, -464, -580, dw 112, -120, 644, -356, -208, -608, -528, 704, 560, -424, 392, dw 828, 40, 84, 200, -152, 0, -144, 584, 280, -120, 80, dw -556, -972, -196, -472, 724, 80, 168, -32, 88, 160, -688, dw 0, 160, 356, 372, -776, 740, -128, 676, -248, -480, 4, dw -364, 96, 544, 232, -1032, 956, 236, 356, 20, -40, 300, dw 24, -676, -596, 132, 1120, -104, 532, -1096, 568, 648, 444, dw 508, 380, 188, -376, -604, 1488, 424, 24, 756, -220, -192, dw 716, 120, 920, 688, 168, 44, -460, 568, 284, 1144, 1160, dw 600, 424, 888, 656, -356, -320, 220, 316, -176, -724, -188, dw -816, -628, -348, -228, -380, 1012, -452, -660, 736, 928, 404, dw -696, -72, -268, -892, 128, 184, -344, -780, 360, 336, 400, dw 344, 428, 548, -112, 136, -228, -216, -820, -516, 340, 92, dw -136, 116, -300, 376, -244, 100, -316, -520, -284, -12, 824, dw 164, -548, -180, -128, 116, -924, -828, 268, -368, -580, 620, dw 192, 160, 0, -1676, 1068, 424, -56, -360, 468, -156, 720, dw 288, -528, 556, -364, 548, -148, 504, 316, 152, -648, -620, dw -684, -24, -376, -384, -108, -920, -1032, 768, 180, -264, -508, dw -1268, -260, -60, 300, -240, 988, 724, -376, -576, -212, -736, dw 556, 192, 1092, -620, -880, 376, -56, -4, -216, -32, 836, dw 268, 396, 1332, 864, -600, 100, 56, -412, -92, 356, 180, dw 884, -468, -436, 292, -388, -804, -704, -840, 368, -348, 140, dw -724, 1536, 940, 372, 112, -372, 436, -480, 1136, 296, -32, dw -228, 132, -48, -220, 868, -1016, -60, -1044, -464, 328, 916, dw 244, 12, -736, -296, 360, 468, -376, -108, -92, 788, 368, dw -56, 544, 400, -672, -420, 728, 16, 320, 44, -284, -380, dw -796, 488, 132, 204, -596, -372, 88, -152, -908, -636, -572, dw -624, -116, -692, -200, -56, 276, -88, 484, -324, 948, 864, dw 1000, -456, -184, -276, 292, -296, 156, 676, 320, 160, 908, dw -84, -1236, -288, -116, 260, -372, -644, 732, -756, -96, 84, dw 344, -520, 348, -688, 240, -84, 216, -1044, -136, -676, -396, dw -1500, 960, -40, 176, 168, 1516, 420, -504, -344, -364, -360, dw 1216, -940, -380, -212, 252, -660, -708, 484, -444, -152, 928, dw -120, 1112, 476, -260, 560, -148, -344, 108, -196, 228, -288, dw 504, 560, -328, -88, 288, -1008, 460, -228, 468, -836, -196, dw 76, 388, 232, 412, -1168, -716, -644, 756, -172, -356, -504, dw 116, 432, 528, 48, 476, -168, -608, 448, 160, -532, -272, dw 28, -676, -12, 828, 980, 456, 520, 104, -104, 256, -344, dw -4, -28, -368, -52, -524, -572, -556, -200, 768, 1124, -208, dw -512, 176, 232, 248, -148, -888, 604, -600, -304, 804, -156, dw -212, 488, -192, -804, -256, 368, -360, -916, -328, 228, -240, dw -448, -472, 856, -556, -364, 572, -12, -156, -368, -340, 432, dw 252, -752, -152, 288, 268, -580, -848, -592, 108, -76, 244, dw 312, -716, 592, -80, 436, 360, 4, -248, 160, 516, 584, dw 732, 44, -468, -280, -292, -156, -588, 28, 308, 912, 24, dw 124, 156, 180, -252, 944, -924, -772, -520, -428, -624, 300, dw -212, -1144, 32, -724, 800, -1128, -212, -1288, -848, 180, -416, dw 440, 192, -576, -792, -76, -1080, 80, -532, -352, -132, 380, dw -820, 148, 1112, 128, 164, 456, 
700, -924, 144, -668, -384, dw 648, -832, 508, 552, -52, -100, -656, 208, -568, 748, -88, dw 680, 232, 300, 192, -408, -1012, -152, -252, -268, 272, -876, dw -664, -648, -332, -136, 16, 12, 1152, -28, 332, -536, 320, dw -672, -460, -316, 532, -260, 228, -40, 1052, -816, 180, 88, dw -496, -556, -672, -368, 428, 92, 356, 404, -408, 252, 196, dw -176, -556, 792, 268, 32, 372, 40, 96, -332, 328, 120, dw 372, -900, -40, 472, -264, -592, 952, 128, 656, 112, 664, dw -232, 420, 4, -344, -464, 556, 244, -416, -32, 252, 0, dw -412, 188, -696, 508, -476, 324, -1096, 656, -312, 560, 264, dw -136, 304, 160, -64, -580, 248, 336, -720, 560, -348, -288, dw -276, -196, -500, 852, -544, -236, -1128, -992, -776, 116, 56, dw 52, 860, 884, 212, -12, 168, 1020, 512, -552, 924, -148, dw 716, 188, 164, -340, -520, -184, 880, -152, -680, -208, -1156, dw -300, -528, -472, 364, 100, -744, -1056, -32, 540, 280, 144, dw -676, -32, -232, -280, -224, 96, 568, -76, 172, 148, 148, dw 104, 32, -296, -32, 788, -80, 32, -16, 280, 288, 944, dw 428, -484 align 8, db 0 const resize_filter, db 0, 0, 0, -128, 0, 0, 0, 0, 0, 0, 1, -128, -2, 1, 0, 0, db 0, -1, 3, -127, -4, 2, -1, 0, 0, -1, 4, -127, -6, 3, -1, 0, db 0, -2, 6, -126, -8, 3, -1, 0, 0, -2, 7, -125, -11, 4, -1, 0, db 1, -2, 8, -125, -13, 5, -2, 0, 1, -3, 9, -124, -15, 6, -2, 0, db 1, -3, 10, -123, -18, 6, -2, 1, 1, -3, 11, -122, -20, 7, -3, 1, db 1, -4, 12, -121, -22, 8, -3, 1, 1, -4, 13, -120, -25, 9, -3, 1, db 1, -4, 14, -118, -28, 9, -3, 1, 1, -4, 15, -117, -30, 10, -4, 1, db 1, -5, 16, -116, -32, 11, -4, 1, 1, -5, 16, -114, -35, 12, -4, 1, db 1, -5, 17, -112, -38, 12, -4, 1, 1, -5, 18, -111, -40, 13, -5, 1, db 1, -5, 18, -109, -43, 14, -5, 1, 1, -6, 19, -107, -45, 14, -5, 1, db 1, -6, 19, -105, -48, 15, -5, 1, 1, -6, 19, -103, -51, 16, -5, 1, db 1, -6, 20, -101, -53, 16, -6, 1, 1, -6, 20, -99, -56, 17, -6, 1, db 1, -6, 20, -97, -58, 17, -6, 1, 1, -6, 20, -95, -61, 18, -6, 1, db 2, -7, 20, -93, -64, 18, -6, 2, 2, -7, 20, -91, -66, 19, -6, 1, db 2, -7, 20, -88, -69, 19, -6, 1, 2, -7, 20, -86, -71, 19, -6, 1, db 2, -7, 20, -84, -74, 20, -7, 2, 2, -7, 20, -81, -76, 20, -7, 1, db 2, -7, 20, -79, -79, 20, -7, 2, 1, -7, 20, -76, -81, 20, -7, 2, db 2, -7, 20, -74, -84, 20, -7, 2, 1, -6, 19, -71, -86, 20, -7, 2, db 1, -6, 19, -69, -88, 20, -7, 2, 1, -6, 19, -66, -91, 20, -7, 2, db 2, -6, 18, -64, -93, 20, -7, 2, 1, -6, 18, -61, -95, 20, -6, 1, db 1, -6, 17, -58, -97, 20, -6, 1, 1, -6, 17, -56, -99, 20, -6, 1, db 1, -6, 16, -53, -101, 20, -6, 1, 1, -5, 16, -51, -103, 19, -6, 1, db 1, -5, 15, -48, -105, 19, -6, 1, 1, -5, 14, -45, -107, 19, -6, 1, db 1, -5, 14, -43, -109, 18, -5, 1, 1, -5, 13, -40, -111, 18, -5, 1, db 1, -4, 12, -38, -112, 17, -5, 1, 1, -4, 12, -35, -114, 16, -5, 1, db 1, -4, 11, -32, -116, 16, -5, 1, 1, -4, 10, -30, -117, 15, -4, 1, db 1, -3, 9, -28, -118, 14, -4, 1, 1, -3, 9, -25, -120, 13, -4, 1, db 1, -3, 8, -22, -121, 12, -4, 1, 1, -3, 7, -20, -122, 11, -3, 1, db 1, -2, 6, -18, -123, 10, -3, 1, 0, -2, 6, -15, -124, 9, -3, 1, db 0, -2, 5, -13, -125, 8, -2, 1, 0, -1, 4, -11, -125, 7, -2, 0, db 0, -1, 3, -8, -126, 6, -2, 0, 0, -1, 3, -6, -127, 4, -1, 0, db 0, -1, 2, -4, -127, 3, -1, 0, 0, 0, 1, -2, -128, 1, 0, 0, align 16, db 0 ; Unused const obmc_masks, db 0, 0, ; 2 db 19, 0, ; 4 db 25, 14, 5, 0, ; 8 db 28, 22, 16, 11, 7, 3, 0, 0, ; 16 db 30, 27, 24, 21, 18, 15, 12, 10, 8, 6, 4, 3, 0, 0, 0, 0, ; 32 db 31, 29, 28, 26, 24, 23, 21, 20, 19, 17, 16, 14, 13, 12, 11, 9, db 8, 7, 6, 5, 4, 4, 3, 2, 0, 0, 0, 0, 0, 0, 0, 0, 
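; ----------------------------------------------------------------------------
; Editorial aside (hedged, not part of the upstream table file): the constants
; above are plain read-only data that the SIMD kernels index at run time. As a
; rough sketch of how gaussian_sequence is typically consumed -- it holds 2048
; signed words taken from the spec, so an 11-bit pseudo-random value selects
; one entry, which is then shifted toward the target bit depth. The symbol
; rnd_state and the shift amount below are hypothetical, and the code is
; commented out so it does not affect assembly:
;
;   movzx  eax, word [rnd_state]                  ; hypothetical 11-bit PRNG value
;   and    eax, 2047                              ; stay inside the 2048 entries
;   movsx  eax, word [gaussian_sequence + rax*2]  ; fetch one noise sample
;   sar    eax, 4                                 ; example shift; the real amount
;                                                 ; depends on bit depth/scale shift
; ----------------------------------------------------------------------------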
av-scenechange-0.14.1/src/asm/x86/x86inc.asm000064400000000000000000001571151046102023000164050ustar 00000000000000;***************************************************************************** ;* x86inc.asm: x86 abstraction layer ;***************************************************************************** ;* Copyright (C) 2005-2022 x264 project ;* ;* Authors: Loren Merritt ;* Henrik Gramner ;* Anton Mitrofanov ;* Fiona Glaser ;* ;* Permission to use, copy, modify, and/or distribute this software for any ;* purpose with or without fee is hereby granted, provided that the above ;* copyright notice and this permission notice appear in all copies. ;* ;* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES ;* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF ;* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ;* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES ;* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ;* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF ;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. ;***************************************************************************** ; This is a header file for the x86inc.asm assembly language, which uses ; NASM/YASM syntax combined with a large number of macros to provide easy ; abstraction between different calling conventions (x86_32, win64, linux64). ; It also has various other useful features to simplify writing the kind of ; DSP functions that are most often used. %ifndef private_prefix %error private_prefix not defined %endif %ifndef public_prefix %define public_prefix private_prefix %endif %ifndef STACK_ALIGNMENT %if ARCH_X86_64 %define STACK_ALIGNMENT 16 %else %define STACK_ALIGNMENT 4 %endif %endif %define WIN64 0 %define UNIX64 0 %if ARCH_X86_64 %ifidn __OUTPUT_FORMAT__,win32 %define WIN64 1 %elifidn __OUTPUT_FORMAT__,win64 %define WIN64 1 %elifidn __OUTPUT_FORMAT__,x64 %define WIN64 1 %else %define UNIX64 1 %endif %endif %define FORMAT_ELF 0 %define FORMAT_MACHO 0 %ifidn __OUTPUT_FORMAT__,elf %define FORMAT_ELF 1 %elifidn __OUTPUT_FORMAT__,elf32 %define FORMAT_ELF 1 %elifidn __OUTPUT_FORMAT__,elf64 %define FORMAT_ELF 1 %elifidn __OUTPUT_FORMAT__,macho %define FORMAT_MACHO 1 %elifidn __OUTPUT_FORMAT__,macho32 %define FORMAT_MACHO 1 %elifidn __OUTPUT_FORMAT__,macho64 %define FORMAT_MACHO 1 %endif %ifdef PREFIX %define mangle(x) _ %+ x %else %define mangle(x) x %endif ; Use VEX-encoding even in non-AVX functions %ifndef FORCE_VEX_ENCODING %define FORCE_VEX_ENCODING 0 %endif %macro SECTION_RODATA 0-1 16 %ifidn __OUTPUT_FORMAT__,win32 SECTION .rdata align=%1 %elif WIN64 SECTION .rdata align=%1 %else SECTION .rodata align=%1 %endif %endmacro %if ARCH_X86_64 %define PIC 1 ; always use PIC on x86-64 default rel %elifidn __OUTPUT_FORMAT__,win32 %define PIC 0 ; PIC isn't used on 32-bit Windows %elifndef PIC %define PIC 0 %endif %define HAVE_PRIVATE_EXTERN 1 %ifdef __NASM_VER__ %use smartalign %if __NASM_VERSION_ID__ < 0x020e0000 ; 2.14 %define HAVE_PRIVATE_EXTERN 0 %endif %endif ; Macros to eliminate most code duplication between x86_32 and x86_64: ; Currently this works only for leaf functions which load all their arguments ; into registers at the start, and make no other use of the stack. Luckily that ; covers most use cases. ; PROLOGUE: ; %1 = number of arguments. loads them from stack if needed. ; %2 = number of registers used. pushes callee-saved regs if needed. 
; %3 = number of xmm registers used. pushes callee-saved xmm regs if needed. ; %4 = (optional) stack size to be allocated. The stack will be aligned before ; allocating the specified stack size. If the required stack alignment is ; larger than the known stack alignment the stack will be manually aligned ; and an extra register will be allocated to hold the original stack ; pointer (to not invalidate r0m etc.). To prevent the use of an extra ; register as stack pointer, request a negative stack size. ; %4+/%5+ = list of names to define to registers ; PROLOGUE can also be invoked by adding the same options to cglobal ; e.g. ; cglobal foo, 2,3,7,0x40, dst, src, tmp ; declares a function (foo) that automatically loads two arguments (dst and ; src) into registers, uses one additional register (tmp) plus 7 vector ; registers (m0-m6) and allocates 0x40 bytes of stack space. ; TODO Some functions can use some args directly from the stack. If they're the ; last args then you can just not declare them, but if they're in the middle ; we need more flexible macro. ; RET: ; Pops anything that was pushed by PROLOGUE, and returns. ; REP_RET: ; Use this instead of RET if it's a branch target. ; registers: ; rN and rNq are the native-size register holding function argument N ; rNd, rNw, rNb are dword, word, and byte size ; rNh is the high 8 bits of the word size ; rNm is the original location of arg N (a register or on the stack), dword ; rNmp is native size %macro DECLARE_REG 2-3 %define r%1q %2 %define r%1d %2d %define r%1w %2w %define r%1b %2b %define r%1h %2h %define %2q %2 %if %0 == 2 %define r%1m %2d %define r%1mp %2 %elif ARCH_X86_64 ; memory %define r%1m [rstk + stack_offset + %3] %define r%1mp qword r %+ %1 %+ m %else %define r%1m [rstk + stack_offset + %3] %define r%1mp dword r %+ %1 %+ m %endif %define r%1 %2 %endmacro %macro DECLARE_REG_SIZE 3 %define r%1q r%1 %define e%1q r%1 %define r%1d e%1 %define e%1d e%1 %define r%1w %1 %define e%1w %1 %define r%1h %3 %define e%1h %3 %define r%1b %2 %define e%1b %2 %if ARCH_X86_64 == 0 %define r%1 e%1 %endif %endmacro DECLARE_REG_SIZE ax, al, ah DECLARE_REG_SIZE bx, bl, bh DECLARE_REG_SIZE cx, cl, ch DECLARE_REG_SIZE dx, dl, dh DECLARE_REG_SIZE si, sil, null DECLARE_REG_SIZE di, dil, null DECLARE_REG_SIZE bp, bpl, null ; t# defines for when per-arch register allocation is more complex than just function arguments %macro DECLARE_REG_TMP 1-* %assign %%i 0 %rep %0 CAT_XDEFINE t, %%i, r%1 %assign %%i %%i+1 %rotate 1 %endrep %endmacro %macro DECLARE_REG_TMP_SIZE 0-* %rep %0 %define t%1q t%1 %+ q %define t%1d t%1 %+ d %define t%1w t%1 %+ w %define t%1h t%1 %+ h %define t%1b t%1 %+ b %rotate 1 %endrep %endmacro DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14 %if ARCH_X86_64 %define gprsize 8 %else %define gprsize 4 %endif %macro LEA 2 %if ARCH_X86_64 lea %1, [%2] %elif PIC call $+5 ; special-cased to not affect the RSB on most CPU:s pop %1 add %1, (%2)-$+1 %else mov %1, %2 %endif %endmacro ; Repeats an instruction/operation for multiple arguments. 
; Example usage: "REPX {psrlw x, 8}, m0, m1, m2, m3" %macro REPX 2-* ; operation, args %xdefine %%f(x) %1 %rep %0 - 1 %rotate 1 %%f(%1) %endrep %endmacro %macro PUSH 1 push %1 %ifidn rstk, rsp %assign stack_offset stack_offset+gprsize %endif %endmacro %macro POP 1 pop %1 %ifidn rstk, rsp %assign stack_offset stack_offset-gprsize %endif %endmacro %macro PUSH_IF_USED 1-* %rep %0 %if %1 < regs_used PUSH r%1 %endif %rotate 1 %endrep %endmacro %macro POP_IF_USED 1-* %rep %0 %if %1 < regs_used pop r%1 %endif %rotate 1 %endrep %endmacro %macro LOAD_IF_USED 1-* %rep %0 %if %1 < num_args mov r%1, r %+ %1 %+ mp %endif %rotate 1 %endrep %endmacro %macro SUB 2 sub %1, %2 %ifidn %1, rstk %assign stack_offset stack_offset+(%2) %endif %endmacro %macro ADD 2 add %1, %2 %ifidn %1, rstk %assign stack_offset stack_offset-(%2) %endif %endmacro %macro movifnidn 2 %ifnidn %1, %2 mov %1, %2 %endif %endmacro %if ARCH_X86_64 == 0 %define movsxd movifnidn %endif %macro movsxdifnidn 2 %ifnidn %1, %2 movsxd %1, %2 %endif %endmacro %macro ASSERT 1 %if (%1) == 0 %error assertion ``%1'' failed %endif %endmacro %macro DEFINE_ARGS 0-* %ifdef n_arg_names %assign %%i 0 %rep n_arg_names CAT_UNDEF arg_name %+ %%i, q CAT_UNDEF arg_name %+ %%i, d CAT_UNDEF arg_name %+ %%i, w CAT_UNDEF arg_name %+ %%i, h CAT_UNDEF arg_name %+ %%i, b CAT_UNDEF arg_name %+ %%i, m CAT_UNDEF arg_name %+ %%i, mp CAT_UNDEF arg_name, %%i %assign %%i %%i+1 %endrep %endif %xdefine %%stack_offset stack_offset %undef stack_offset ; so that the current value of stack_offset doesn't get baked in by xdefine %assign %%i 0 %rep %0 %xdefine %1q r %+ %%i %+ q %xdefine %1d r %+ %%i %+ d %xdefine %1w r %+ %%i %+ w %xdefine %1h r %+ %%i %+ h %xdefine %1b r %+ %%i %+ b %xdefine %1m r %+ %%i %+ m %xdefine %1mp r %+ %%i %+ mp CAT_XDEFINE arg_name, %%i, %1 %assign %%i %%i+1 %rotate 1 %endrep %xdefine stack_offset %%stack_offset %assign n_arg_names %0 %endmacro %define required_stack_alignment ((mmsize + 15) & ~15) %define vzeroupper_required (mmsize > 16 && (ARCH_X86_64 == 0 || xmm_regs_used > 16 || notcpuflag(avx512))) %define high_mm_regs (16*cpuflag(avx512)) ; Large stack allocations on Windows need to use stack probing in order ; to guarantee that all stack memory is committed before accessing it. ; This is done by ensuring that the guard page(s) at the end of the ; currently committed pages are touched prior to any pages beyond that. %if WIN64 %assign STACK_PROBE_SIZE 8192 %elifidn __OUTPUT_FORMAT__, win32 %assign STACK_PROBE_SIZE 4096 %else %assign STACK_PROBE_SIZE 0 %endif %macro PROBE_STACK 1 ; stack_size %if STACK_PROBE_SIZE %assign %%i STACK_PROBE_SIZE %rep %1 / STACK_PROBE_SIZE mov eax, [rsp-%%i] %assign %%i %%i+STACK_PROBE_SIZE %endrep %endif %endmacro %macro ALLOC_STACK 0-2 0, 0 ; stack_size, n_xmm_regs (for win64 only) %ifnum %1 %if %1 != 0 %assign %%pad 0 %assign stack_size %1 %if stack_size < 0 %assign stack_size -stack_size %endif %if WIN64 %assign %%pad %%pad + 32 ; shadow space %if mmsize != 8 %assign xmm_regs_used %2 %if xmm_regs_used > 8 %assign %%pad %%pad + (xmm_regs_used-8)*16 ; callee-saved xmm registers %endif %endif %endif %if required_stack_alignment <= STACK_ALIGNMENT ; maintain the current stack alignment %assign stack_size_padded stack_size + %%pad + ((-%%pad-stack_offset-gprsize) & (STACK_ALIGNMENT-1)) PROBE_STACK stack_size_padded SUB rsp, stack_size_padded %else %assign %%reg_num (regs_used - 1) %xdefine rstk r %+ %%reg_num ; align stack, and save original stack location directly above ; it, i.e. 
in [rsp+stack_size_padded], so we can restore the ; stack in a single instruction (i.e. mov rsp, rstk or mov ; rsp, [rsp+stack_size_padded]) %if %1 < 0 ; need to store rsp on stack %xdefine rstkm [rsp + stack_size + %%pad] %assign %%pad %%pad + gprsize %else ; can keep rsp in rstk during whole function %xdefine rstkm rstk %endif %assign stack_size_padded stack_size + ((%%pad + required_stack_alignment-1) & ~(required_stack_alignment-1)) PROBE_STACK stack_size_padded mov rstk, rsp and rsp, ~(required_stack_alignment-1) sub rsp, stack_size_padded movifnidn rstkm, rstk %endif WIN64_PUSH_XMM %endif %endif %endmacro %macro SETUP_STACK_POINTER 0-1 0 %ifnum %1 %if %1 != 0 && required_stack_alignment > STACK_ALIGNMENT %if %1 > 0 ; Reserve an additional register for storing the original stack pointer, but avoid using ; eax/rax for this purpose since it can potentially get overwritten as a return value. %assign regs_used (regs_used + 1) %if ARCH_X86_64 && regs_used == 7 %assign regs_used 8 %elif ARCH_X86_64 == 0 && regs_used == 1 %assign regs_used 2 %endif %endif %if ARCH_X86_64 && regs_used < 5 + UNIX64 * 3 ; Ensure that we don't clobber any registers containing arguments. For UNIX64 we also preserve r6 (rax) ; since it's used as a hidden argument in vararg functions to specify the number of vector registers used. %assign regs_used 5 + UNIX64 * 3 %endif %endif %endif %endmacro %if WIN64 ; Windows x64 ;================================================= DECLARE_REG 0, rcx DECLARE_REG 1, rdx DECLARE_REG 2, R8 DECLARE_REG 3, R9 DECLARE_REG 4, R10, 40 DECLARE_REG 5, R11, 48 DECLARE_REG 6, rax, 56 DECLARE_REG 7, rdi, 64 DECLARE_REG 8, rsi, 72 DECLARE_REG 9, rbx, 80 DECLARE_REG 10, rbp, 88 DECLARE_REG 11, R14, 96 DECLARE_REG 12, R15, 104 DECLARE_REG 13, R12, 112 DECLARE_REG 14, R13, 120 %macro PROLOGUE 2-5+ 0, 0 ; #args, #regs, #xmm_regs, [stack_size,] arg_names... %assign num_args %1 %assign regs_used %2 ASSERT regs_used >= num_args SETUP_STACK_POINTER %4 ASSERT regs_used <= 15 PUSH_IF_USED 7, 8, 9, 10, 11, 12, 13, 14 ALLOC_STACK %4, %3 %if mmsize != 8 && stack_size == 0 WIN64_SPILL_XMM %3 %endif LOAD_IF_USED 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 %if %0 > 4 %ifnum %4 DEFINE_ARGS %5 %else DEFINE_ARGS %4, %5 %endif %elifnnum %4 DEFINE_ARGS %4 %endif %endmacro %macro WIN64_PUSH_XMM 0 ; Use the shadow space to store XMM6 and XMM7, the rest needs stack space allocated. %if xmm_regs_used > 6 + high_mm_regs movaps [rstk + stack_offset + 8], xmm6 %endif %if xmm_regs_used > 7 + high_mm_regs movaps [rstk + stack_offset + 24], xmm7 %endif %assign %%xmm_regs_on_stack xmm_regs_used - high_mm_regs - 8 %if %%xmm_regs_on_stack > 0 %assign %%i 8 %rep %%xmm_regs_on_stack movaps [rsp + (%%i-8)*16 + stack_size + 32], xmm %+ %%i %assign %%i %%i+1 %endrep %endif %endmacro %macro WIN64_SPILL_XMM 1 %assign xmm_regs_used %1 ASSERT xmm_regs_used <= 16 + high_mm_regs %assign %%xmm_regs_on_stack xmm_regs_used - high_mm_regs - 8 %if %%xmm_regs_on_stack > 0 ; Allocate stack space for callee-saved xmm registers plus shadow space and align the stack. 
%assign %%pad %%xmm_regs_on_stack*16 + 32 %assign stack_size_padded %%pad + ((-%%pad-stack_offset-gprsize) & (STACK_ALIGNMENT-1)) SUB rsp, stack_size_padded %endif WIN64_PUSH_XMM %endmacro %macro WIN64_RESTORE_XMM_INTERNAL 0 %assign %%pad_size 0 %assign %%xmm_regs_on_stack xmm_regs_used - high_mm_regs - 8 %if %%xmm_regs_on_stack > 0 %assign %%i xmm_regs_used - high_mm_regs %rep %%xmm_regs_on_stack %assign %%i %%i-1 movaps xmm %+ %%i, [rsp + (%%i-8)*16 + stack_size + 32] %endrep %endif %if stack_size_padded > 0 %if stack_size > 0 && required_stack_alignment > STACK_ALIGNMENT mov rsp, rstkm %else add rsp, stack_size_padded %assign %%pad_size stack_size_padded %endif %endif %if xmm_regs_used > 7 + high_mm_regs movaps xmm7, [rsp + stack_offset - %%pad_size + 24] %endif %if xmm_regs_used > 6 + high_mm_regs movaps xmm6, [rsp + stack_offset - %%pad_size + 8] %endif %endmacro %macro WIN64_RESTORE_XMM 0 WIN64_RESTORE_XMM_INTERNAL %assign stack_offset (stack_offset-stack_size_padded) %assign stack_size_padded 0 %assign xmm_regs_used 0 %endmacro %define has_epilogue regs_used > 7 || stack_size > 0 || vzeroupper_required || xmm_regs_used > 6+high_mm_regs %macro RET 0 WIN64_RESTORE_XMM_INTERNAL POP_IF_USED 14, 13, 12, 11, 10, 9, 8, 7 %if vzeroupper_required vzeroupper %endif AUTO_REP_RET %endmacro %elif ARCH_X86_64 ; *nix x64 ;============================================= DECLARE_REG 0, rdi DECLARE_REG 1, rsi DECLARE_REG 2, rdx DECLARE_REG 3, rcx DECLARE_REG 4, R8 DECLARE_REG 5, R9 DECLARE_REG 6, rax, 8 DECLARE_REG 7, R10, 16 DECLARE_REG 8, R11, 24 DECLARE_REG 9, rbx, 32 DECLARE_REG 10, rbp, 40 DECLARE_REG 11, R14, 48 DECLARE_REG 12, R15, 56 DECLARE_REG 13, R12, 64 DECLARE_REG 14, R13, 72 %macro PROLOGUE 2-5+ 0, 0 ; #args, #regs, #xmm_regs, [stack_size,] arg_names... %assign num_args %1 %assign regs_used %2 %assign xmm_regs_used %3 ASSERT regs_used >= num_args SETUP_STACK_POINTER %4 ASSERT regs_used <= 15 PUSH_IF_USED 9, 10, 11, 12, 13, 14 ALLOC_STACK %4 LOAD_IF_USED 6, 7, 8, 9, 10, 11, 12, 13, 14 %if %0 > 4 %ifnum %4 DEFINE_ARGS %5 %else DEFINE_ARGS %4, %5 %endif %elifnnum %4 DEFINE_ARGS %4 %endif %endmacro %define has_epilogue regs_used > 9 || stack_size > 0 || vzeroupper_required %macro RET 0 %if stack_size_padded > 0 %if required_stack_alignment > STACK_ALIGNMENT mov rsp, rstkm %else add rsp, stack_size_padded %endif %endif POP_IF_USED 14, 13, 12, 11, 10, 9 %if vzeroupper_required vzeroupper %endif AUTO_REP_RET %endmacro %else ; X86_32 ;============================================================== DECLARE_REG 0, eax, 4 DECLARE_REG 1, ecx, 8 DECLARE_REG 2, edx, 12 DECLARE_REG 3, ebx, 16 DECLARE_REG 4, esi, 20 DECLARE_REG 5, edi, 24 DECLARE_REG 6, ebp, 28 %define rsp esp %macro DECLARE_ARG 1-* %rep %0 %define r%1m [rstk + stack_offset + 4*%1 + 4] %define r%1mp dword r%1m %rotate 1 %endrep %endmacro DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14 %macro PROLOGUE 2-5+ 0, 0 ; #args, #regs, #xmm_regs, [stack_size,] arg_names... 
%assign num_args %1 %assign regs_used %2 ASSERT regs_used >= num_args %if num_args > 7 %assign num_args 7 %endif %if regs_used > 7 %assign regs_used 7 %endif SETUP_STACK_POINTER %4 ASSERT regs_used <= 7 PUSH_IF_USED 3, 4, 5, 6 ALLOC_STACK %4 LOAD_IF_USED 0, 1, 2, 3, 4, 5, 6 %if %0 > 4 %ifnum %4 DEFINE_ARGS %5 %else DEFINE_ARGS %4, %5 %endif %elifnnum %4 DEFINE_ARGS %4 %endif %endmacro %define has_epilogue regs_used > 3 || stack_size > 0 || vzeroupper_required %macro RET 0 %if stack_size_padded > 0 %if required_stack_alignment > STACK_ALIGNMENT mov rsp, rstkm %else add rsp, stack_size_padded %endif %endif POP_IF_USED 6, 5, 4, 3 %if vzeroupper_required vzeroupper %endif AUTO_REP_RET %endmacro %endif ;====================================================================== %if WIN64 == 0 %macro WIN64_SPILL_XMM 1 %assign xmm_regs_used %1 %endmacro %macro WIN64_RESTORE_XMM 0 %assign xmm_regs_used 0 %endmacro %macro WIN64_PUSH_XMM 0 %endmacro %endif ; On AMD cpus <=K10, an ordinary ret is slow if it immediately follows either ; a branch or a branch target. So switch to a 2-byte form of ret in that case. ; We can automatically detect "follows a branch", but not a branch target. ; (SSSE3 is a sufficient condition to know that your cpu doesn't have this problem.) %macro REP_RET 0 %if has_epilogue || cpuflag(ssse3) RET %else rep ret %endif annotate_function_size %endmacro %define last_branch_adr $$ %macro AUTO_REP_RET 0 %if notcpuflag(ssse3) times ((last_branch_adr-$)>>31)+1 rep ; times 1 iff $ == last_branch_adr. %endif ret annotate_function_size %endmacro %macro BRANCH_INSTR 0-* %rep %0 %macro %1 1-2 %1 %2 %1 %if notcpuflag(ssse3) %%branch_instr equ $ %xdefine last_branch_adr %%branch_instr %endif %endmacro %rotate 1 %endrep %endmacro BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae, jna, jnae, jb, jbe, jnb, jnbe, jc, jnc, js, jns, jo, jno, jp, jnp %macro TAIL_CALL 1-2 1 ; callee, is_nonadjacent %if has_epilogue call %1 RET %elif %2 jmp %1 %endif annotate_function_size %endmacro ;============================================================================= ; arch-independent part ;============================================================================= %assign function_align 16 ; Begin a function. ; Applies any symbol mangling needed for C linkage, and sets up a define such that ; subsequent uses of the function name automatically refer to the mangled version. ; Appends cpuflags to the function name if cpuflags has been specified. ; The "" empty default parameter is a workaround for nasm, which fails if SUFFIX ; is empty and we call cglobal_internal with just %1 %+ SUFFIX (without %2). 
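; Hedged usage sketch (added for clarity, commented out so nothing extra is
; assembled): functions are normally declared through INIT_* plus cglobal
; rather than a raw label. The function and argument names below are
; hypothetical; the point is that the emitted symbol receives the
; private_prefix mangling plus the cpuflags suffix (here "_avx2"), and that
; PROLOGUE loads the named arguments so they can be used as srcq/refq/strideq:
;
;   INIT_YMM avx2
;   cglobal plane_sad, 3, 5, 6, src, ref, stride
;       ; ...body would use srcq, refq, strideq and m0-m5 here...
;       RET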
%macro cglobal 1-2+ "" ; name, [PROLOGUE args] cglobal_internal 1, %1 %+ SUFFIX, %2 %endmacro %macro cvisible 1-2+ "" ; name, [PROLOGUE args] cglobal_internal 0, %1 %+ SUFFIX, %2 %endmacro %macro cglobal_internal 2-3+ annotate_function_size %ifndef cglobaled_%2 %if %1 %xdefine %2 mangle(private_prefix %+ _ %+ %2) %else %xdefine %2 mangle(public_prefix %+ _ %+ %2) %endif %xdefine %2.skip_prologue %2 %+ .skip_prologue CAT_XDEFINE cglobaled_, %2, 1 %endif %xdefine current_function %2 %xdefine current_function_section __SECT__ %if FORMAT_ELF %if %1 global %2:function hidden %else global %2:function %endif %elif FORMAT_MACHO && HAVE_PRIVATE_EXTERN && %1 global %2:private_extern %else global %2 %endif align function_align %2: RESET_MM_PERMUTATION ; needed for x86-64, also makes disassembly somewhat nicer %xdefine rstk rsp ; copy of the original stack pointer, used when greater alignment than the known stack alignment is required %assign stack_offset 0 ; stack pointer offset relative to the return address %assign stack_size 0 ; amount of stack space that can be freely used inside a function %assign stack_size_padded 0 ; total amount of allocated stack space, including space for callee-saved xmm registers on WIN64 and alignment padding %assign xmm_regs_used 0 ; number of XMM registers requested, used for dealing with callee-saved registers on WIN64 and vzeroupper %ifnidn %3, "" PROLOGUE %3 %endif %endmacro ; Create a global symbol from a local label with the correct name mangling and type %macro cglobal_label 1 %if FORMAT_ELF global current_function %+ %1:function hidden %elif FORMAT_MACHO && HAVE_PRIVATE_EXTERN global current_function %+ %1:private_extern %else global current_function %+ %1 %endif %1: %endmacro %macro cextern 1 %xdefine %1 mangle(private_prefix %+ _ %+ %1) CAT_XDEFINE cglobaled_, %1, 1 extern %1 %endmacro ; like cextern, but without the prefix %macro cextern_naked 1 %ifdef PREFIX %xdefine %1 mangle(%1) %endif CAT_XDEFINE cglobaled_, %1, 1 extern %1 %endmacro %macro const 1-2+ %xdefine %1 mangle(private_prefix %+ _ %+ %1) %if FORMAT_ELF global %1:data hidden %elif FORMAT_MACHO && HAVE_PRIVATE_EXTERN global %1:private_extern %else global %1 %endif %1: %2 %endmacro ; This is needed for ELF, otherwise the GNU linker assumes the stack is executable by default. %if FORMAT_ELF [SECTION .note.GNU-stack noalloc noexec nowrite progbits] %endif ; Tell debuggers how large the function was. ; This may be invoked multiple times per function; we rely on later instances overriding earlier ones. ; This is invoked by RET and similar macros, and also cglobal does it for the previous function, ; but if the last function in a source file doesn't use any of the standard macros for its epilogue, ; then its size might be unspecified. 
%macro annotate_function_size 0 %ifdef __YASM_VER__ %ifdef current_function %if FORMAT_ELF current_function_section %%ecf equ $ size current_function %%ecf - current_function __SECT__ %endif %endif %endif %endmacro ; cpuflags %assign cpuflags_mmx (1<<0) %assign cpuflags_mmx2 (1<<1) | cpuflags_mmx %assign cpuflags_3dnow (1<<2) | cpuflags_mmx %assign cpuflags_3dnowext (1<<3) | cpuflags_3dnow %assign cpuflags_sse (1<<4) | cpuflags_mmx2 %assign cpuflags_sse2 (1<<5) | cpuflags_sse %assign cpuflags_sse2slow (1<<6) | cpuflags_sse2 %assign cpuflags_lzcnt (1<<7) | cpuflags_sse2 %assign cpuflags_sse3 (1<<8) | cpuflags_sse2 %assign cpuflags_ssse3 (1<<9) | cpuflags_sse3 %assign cpuflags_sse4 (1<<10) | cpuflags_ssse3 %assign cpuflags_sse42 (1<<11) | cpuflags_sse4 %assign cpuflags_aesni (1<<12) | cpuflags_sse42 %assign cpuflags_gfni (1<<13) | cpuflags_sse42 %assign cpuflags_avx (1<<14) | cpuflags_sse42 %assign cpuflags_xop (1<<15) | cpuflags_avx %assign cpuflags_fma4 (1<<16) | cpuflags_avx %assign cpuflags_fma3 (1<<17) | cpuflags_avx %assign cpuflags_bmi1 (1<<18) | cpuflags_avx|cpuflags_lzcnt %assign cpuflags_bmi2 (1<<19) | cpuflags_bmi1 %assign cpuflags_avx2 (1<<20) | cpuflags_fma3|cpuflags_bmi2 %assign cpuflags_avx512 (1<<21) | cpuflags_avx2 ; F, CD, BW, DQ, VL %assign cpuflags_avx512icl (1<<22) | cpuflags_avx512|cpuflags_gfni ; VNNI, IFMA, VBMI, VBMI2, VPOPCNTDQ, BITALG, VAES, VPCLMULQDQ %assign cpuflags_cache32 (1<<23) %assign cpuflags_cache64 (1<<24) %assign cpuflags_aligned (1<<25) ; not a cpu feature, but a function variant %assign cpuflags_atom (1<<26) ; Returns a boolean value expressing whether or not the specified cpuflag is enabled. %define cpuflag(x) (((((cpuflags & (cpuflags_ %+ x)) ^ (cpuflags_ %+ x)) - 1) >> 31) & 1) %define notcpuflag(x) (cpuflag(x) ^ 1) ; Takes an arbitrary number of cpuflags from the above list. ; All subsequent functions (up to the next INIT_CPUFLAGS) is built for the specified cpu. ; You shouldn't need to invoke this macro directly, it's a subroutine for INIT_MMX &co. %macro INIT_CPUFLAGS 0-* %xdefine SUFFIX %undef cpuname %assign cpuflags 0 %if %0 >= 1 %rep %0 %ifdef cpuname %xdefine cpuname cpuname %+ _%1 %else %xdefine cpuname %1 %endif %assign cpuflags cpuflags | cpuflags_%1 %rotate 1 %endrep %xdefine SUFFIX _ %+ cpuname %if cpuflag(avx) %assign avx_enabled 1 %endif %if (mmsize == 16 && notcpuflag(sse2)) || (mmsize == 32 && notcpuflag(avx2)) %define mova movaps %define movu movups %define movnta movntps %endif %if cpuflag(aligned) %define movu mova %elif cpuflag(sse3) && notcpuflag(ssse3) %define movu lddqu %endif %endif %if ARCH_X86_64 || cpuflag(sse2) %ifdef __NASM_VER__ ALIGNMODE p6 %else CPU amdnop %endif %else %ifdef __NASM_VER__ ALIGNMODE nop %else CPU basicnop %endif %endif %endmacro ; Merge mmx, sse*, and avx* ; m# is a simd register of the currently selected size ; xm# is the corresponding xmm register if mmsize >= 16, otherwise the same as m# ; ym# is the corresponding ymm register if mmsize >= 32, otherwise the same as m# ; zm# is the corresponding zmm register if mmsize >= 64, otherwise the same as m# ; (All 4 remain in sync through SWAP.) 
%macro CAT_XDEFINE 3 %xdefine %1%2 %3 %endmacro %macro CAT_UNDEF 2 %undef %1%2 %endmacro %macro DEFINE_MMREGS 1 ; mmtype %assign %%prev_mmregs 0 %ifdef num_mmregs %assign %%prev_mmregs num_mmregs %endif %assign num_mmregs 8 %if ARCH_X86_64 && mmsize >= 16 %assign num_mmregs 16 %if cpuflag(avx512) || mmsize == 64 %assign num_mmregs 32 %endif %endif %assign %%i 0 %rep num_mmregs CAT_XDEFINE m, %%i, %1 %+ %%i CAT_XDEFINE nn%1, %%i, %%i %assign %%i %%i+1 %endrep %if %%prev_mmregs > num_mmregs %rep %%prev_mmregs - num_mmregs CAT_UNDEF m, %%i CAT_UNDEF nn %+ mmtype, %%i %assign %%i %%i+1 %endrep %endif %xdefine mmtype %1 %endmacro ; Prefer registers 16-31 over 0-15 to avoid having to use vzeroupper %macro AVX512_MM_PERMUTATION 0-1 0 ; start_reg %if ARCH_X86_64 && cpuflag(avx512) %assign %%i %1 %rep 16-%1 %assign %%i_high %%i+16 SWAP %%i, %%i_high %assign %%i %%i+1 %endrep %endif %endmacro %macro INIT_MMX 0-1+ %assign avx_enabled 0 %define RESET_MM_PERMUTATION INIT_MMX %1 %define mmsize 8 %define mova movq %define movu movq %define movh movd %define movnta movntq INIT_CPUFLAGS %1 DEFINE_MMREGS mm %endmacro %macro INIT_XMM 0-1+ %assign avx_enabled FORCE_VEX_ENCODING %define RESET_MM_PERMUTATION INIT_XMM %1 %define mmsize 16 %define mova movdqa %define movu movdqu %define movh movq %define movnta movntdq INIT_CPUFLAGS %1 DEFINE_MMREGS xmm %if WIN64 AVX512_MM_PERMUTATION 6 ; Swap callee-saved registers with volatile registers %endif %xdefine bcstd 1to4 %xdefine bcstq 1to2 %endmacro %macro INIT_YMM 0-1+ %assign avx_enabled 1 %define RESET_MM_PERMUTATION INIT_YMM %1 %define mmsize 32 %define mova movdqa %define movu movdqu %undef movh %define movnta movntdq INIT_CPUFLAGS %1 DEFINE_MMREGS ymm AVX512_MM_PERMUTATION %xdefine bcstd 1to8 %xdefine bcstq 1to4 %endmacro %macro INIT_ZMM 0-1+ %assign avx_enabled 1 %define RESET_MM_PERMUTATION INIT_ZMM %1 %define mmsize 64 %define mova movdqa %define movu movdqu %undef movh %define movnta movntdq INIT_CPUFLAGS %1 DEFINE_MMREGS zmm AVX512_MM_PERMUTATION %xdefine bcstd 1to16 %xdefine bcstq 1to8 %endmacro INIT_XMM %macro DECLARE_MMCAST 1 %define mmmm%1 mm%1 %define mmxmm%1 mm%1 %define mmymm%1 mm%1 %define mmzmm%1 mm%1 %define xmmmm%1 mm%1 %define xmmxmm%1 xmm%1 %define xmmymm%1 xmm%1 %define xmmzmm%1 xmm%1 %define ymmmm%1 mm%1 %define ymmxmm%1 xmm%1 %define ymmymm%1 ymm%1 %define ymmzmm%1 ymm%1 %define zmmmm%1 mm%1 %define zmmxmm%1 xmm%1 %define zmmymm%1 ymm%1 %define zmmzmm%1 zmm%1 %define xm%1 xmm %+ m%1 %define ym%1 ymm %+ m%1 %define zm%1 zmm %+ m%1 %endmacro %assign i 0 %rep 32 DECLARE_MMCAST i %assign i i+1 %endrep ; I often want to use macros that permute their arguments. e.g. there's no ; efficient way to implement butterfly or transpose or dct without swapping some ; arguments. ; ; I would like to not have to manually keep track of the permutations: ; If I insert a permutation in the middle of a function, it should automatically ; change everything that follows. For more complex macros I may also have multiple ; implementations, e.g. the SSE2 and SSSE3 versions may have different permutations. ; ; Hence these macros. Insert a PERMUTE or some SWAPs at the end of a macro that ; permutes its arguments. It's equivalent to exchanging the contents of the ; registers, except that this way you exchange the register names instead, so it ; doesn't cost any cycles. 
%macro PERMUTE 2-* ; takes a list of pairs to swap %rep %0/2 %xdefine %%tmp%2 m%2 %rotate 2 %endrep %rep %0/2 %xdefine m%1 %%tmp%2 CAT_XDEFINE nn, m%1, %1 %rotate 2 %endrep %endmacro %macro SWAP 2+ ; swaps a single chain (sometimes more concise than pairs) %ifnum %1 ; SWAP 0, 1, ... SWAP_INTERNAL_NUM %1, %2 %else ; SWAP m0, m1, ... SWAP_INTERNAL_NAME %1, %2 %endif %endmacro %macro SWAP_INTERNAL_NUM 2-* %rep %0-1 %xdefine %%tmp m%1 %xdefine m%1 m%2 %xdefine m%2 %%tmp CAT_XDEFINE nn, m%1, %1 CAT_XDEFINE nn, m%2, %2 %rotate 1 %endrep %endmacro %macro SWAP_INTERNAL_NAME 2-* %xdefine %%args nn %+ %1 %rep %0-1 %xdefine %%args %%args, nn %+ %2 %rotate 1 %endrep SWAP_INTERNAL_NUM %%args %endmacro ; If SAVE_MM_PERMUTATION is placed at the end of a function, then any later ; calls to that function will automatically load the permutation, so values can ; be returned in mmregs. %macro SAVE_MM_PERMUTATION 0-1 %if %0 %xdefine %%f %1_m %else %xdefine %%f current_function %+ _m %endif %assign %%i 0 %rep num_mmregs %xdefine %%tmp m %+ %%i CAT_XDEFINE %%f, %%i, regnumof %+ %%tmp %assign %%i %%i+1 %endrep %endmacro %macro LOAD_MM_PERMUTATION 0-1 ; name to load from %if %0 %xdefine %%f %1_m %else %xdefine %%f current_function %+ _m %endif %xdefine %%tmp %%f %+ 0 %ifnum %%tmp DEFINE_MMREGS mmtype %assign %%i 0 %rep num_mmregs %xdefine %%tmp %%f %+ %%i CAT_XDEFINE %%m, %%i, m %+ %%tmp %assign %%i %%i+1 %endrep %rep num_mmregs %assign %%i %%i-1 CAT_XDEFINE m, %%i, %%m %+ %%i CAT_XDEFINE nn, m %+ %%i, %%i %endrep %endif %endmacro ; Append cpuflags to the callee's name iff the appended name is known and the plain name isn't %macro call 1 %ifid %1 call_internal %1 %+ SUFFIX, %1 %else call %1 %endif %endmacro %macro call_internal 2 %xdefine %%i %2 %ifndef cglobaled_%2 %ifdef cglobaled_%1 %xdefine %%i %1 %endif %endif call %%i LOAD_MM_PERMUTATION %%i %endmacro ; Substitutions that reduce instruction size but are functionally equivalent %macro add 2 %ifnum %2 %if %2==128 sub %1, -128 %else add %1, %2 %endif %else add %1, %2 %endif %endmacro %macro sub 2 %ifnum %2 %if %2==128 add %1, -128 %else sub %1, %2 %endif %else sub %1, %2 %endif %endmacro ;============================================================================= ; AVX abstraction layer ;============================================================================= %assign i 0 %rep 32 %if i < 8 CAT_XDEFINE sizeofmm, i, 8 CAT_XDEFINE regnumofmm, i, i %endif CAT_XDEFINE sizeofxmm, i, 16 CAT_XDEFINE sizeofymm, i, 32 CAT_XDEFINE sizeofzmm, i, 64 CAT_XDEFINE regnumofxmm, i, i CAT_XDEFINE regnumofymm, i, i CAT_XDEFINE regnumofzmm, i, i %assign i i+1 %endrep %undef i %macro CHECK_AVX_INSTR_EMU 3-* %xdefine %%opcode %1 %xdefine %%dst %2 %rep %0-2 %ifidn %%dst, %3 %error non-avx emulation of ``%%opcode'' is not supported %endif %rotate 1 %endrep %endmacro ;%1 == instruction ;%2 == minimal instruction set ;%3 == 1 if float, 0 if int ;%4 == 1 if 4-operand emulation, 0 if 3-operand emulation, 255 otherwise (no emulation) ;%5 == 1 if commutative (i.e. 
doesn't matter which src arg is which), 0 if not ;%6+: operands %macro RUN_AVX_INSTR 6-9+ %ifnum sizeof%7 %assign __sizeofreg sizeof%7 %elifnum sizeof%6 %assign __sizeofreg sizeof%6 %else %assign __sizeofreg mmsize %endif %assign __emulate_avx 0 %if avx_enabled && __sizeofreg >= 16 %xdefine __instr v%1 %else %xdefine __instr %1 %if %0 >= 8+%4 %assign __emulate_avx 1 %endif %endif %ifnidn %2, fnord %ifdef cpuname %if notcpuflag(%2) %error use of ``%1'' %2 instruction in cpuname function: current_function %elif %3 == 0 && __sizeofreg == 16 && notcpuflag(sse2) %error use of ``%1'' sse2 instruction in cpuname function: current_function %elif %3 == 0 && __sizeofreg == 32 && notcpuflag(avx2) %error use of ``%1'' avx2 instruction in cpuname function: current_function %elif __sizeofreg == 16 && notcpuflag(sse) %error use of ``%1'' sse instruction in cpuname function: current_function %elif __sizeofreg == 32 && notcpuflag(avx) %error use of ``%1'' avx instruction in cpuname function: current_function %elif __sizeofreg == 64 && notcpuflag(avx512) %error use of ``%1'' avx512 instruction in cpuname function: current_function %elifidn %1, pextrw ; special case because the base instruction is mmx2, %ifnid %6 ; but sse4 is required for memory operands %if notcpuflag(sse4) %error use of ``%1'' sse4 instruction in cpuname function: current_function %endif %endif %endif %endif %endif %if __emulate_avx %xdefine __src1 %7 %xdefine __src2 %8 %if %5 && %4 == 0 %ifnidn %6, %7 %ifidn %6, %8 %xdefine __src1 %8 %xdefine __src2 %7 %elifnnum sizeof%8 ; 3-operand AVX instructions with a memory arg can only have it in src2, ; whereas SSE emulation prefers to have it in src1 (i.e. the mov). ; So, if the instruction is commutative with a memory arg, swap them. %xdefine __src1 %8 %xdefine __src2 %7 %endif %endif %endif %ifnidn %6, __src1 %if %0 >= 9 CHECK_AVX_INSTR_EMU {%1 %6, %7, %8, %9}, %6, __src2, %9 %else CHECK_AVX_INSTR_EMU {%1 %6, %7, %8}, %6, __src2 %endif %if __sizeofreg == 8 MOVQ %6, __src1 %elif %3 MOVAPS %6, __src1 %else MOVDQA %6, __src1 %endif %endif %if %0 >= 9 %1 %6, __src2, %9 %else %1 %6, __src2 %endif %elif %0 >= 9 %if avx_enabled && __sizeofreg >= 16 && %4 == 1 %ifnnum regnumof%7 %if %3 vmovaps %6, %7 %else vmovdqa %6, %7 %endif __instr %6, %6, %8, %9 %else __instr %6, %7, %8, %9 %endif %else __instr %6, %7, %8, %9 %endif %elif %0 == 8 %if avx_enabled && __sizeofreg >= 16 && %4 == 0 %xdefine __src1 %7 %xdefine __src2 %8 %if %5 %ifnum regnumof%7 %ifnum regnumof%8 %if regnumof%7 < 8 && regnumof%8 >= 8 && regnumof%8 < 16 && sizeof%8 <= 32 ; Most VEX-encoded instructions require an additional byte to encode when ; src2 is a high register (e.g. m8..15). If the instruction is commutative ; we can swap src1 and src2 when doing so reduces the instruction length. %xdefine __src1 %8 %xdefine __src2 %7 %endif %endif %elifnum regnumof%8 ; put memory operands in src2 when possible %xdefine __src1 %8 %xdefine __src2 %7 %else %assign __emulate_avx 1 %endif %elifnnum regnumof%7 ; EVEX allows imm8 shift instructions to be used with memory operands, ; but VEX does not. This handles those special cases. 
%ifnnum %8 %assign __emulate_avx 1 %elif notcpuflag(avx512) %assign __emulate_avx 1 %endif %endif %if __emulate_avx ; a separate load is required %if %3 vmovaps %6, %7 %else vmovdqa %6, %7 %endif __instr %6, %6, %8 %else __instr %6, __src1, __src2 %endif %else __instr %6, %7, %8 %endif %elif %0 == 7 %if avx_enabled && __sizeofreg >= 16 && %5 %xdefine __src1 %6 %xdefine __src2 %7 %ifnum regnumof%6 %ifnum regnumof%7 %if regnumof%6 < 8 && regnumof%7 >= 8 && regnumof%7 < 16 && sizeof%7 <= 32 %xdefine __src1 %7 %xdefine __src2 %6 %endif %endif %endif __instr %6, __src1, __src2 %else __instr %6, %7 %endif %else __instr %6 %endif %endmacro ;%1 == instruction ;%2 == minimal instruction set ;%3 == 1 if float, 0 if int ;%4 == 1 if 4-operand emulation, 0 if 3-operand emulation, 255 otherwise (no emulation) ;%5 == 1 if commutative (i.e. doesn't matter which src arg is which), 0 if not %macro AVX_INSTR 1-5 fnord, 0, 255, 0 %macro %1 1-10 fnord, fnord, fnord, fnord, %1, %2, %3, %4, %5 %ifidn %2, fnord RUN_AVX_INSTR %6, %7, %8, %9, %10, %1 %elifidn %3, fnord RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2 %elifidn %4, fnord RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2, %3 %elifidn %5, fnord RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2, %3, %4 %else RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2, %3, %4, %5 %endif %endmacro %endmacro ; Instructions with both VEX/EVEX and legacy encodings ; Non-destructive instructions are written without parameters AVX_INSTR addpd, sse2, 1, 0, 1 AVX_INSTR addps, sse, 1, 0, 1 AVX_INSTR addsd, sse2, 1, 0, 0 AVX_INSTR addss, sse, 1, 0, 0 AVX_INSTR addsubpd, sse3, 1, 0, 0 AVX_INSTR addsubps, sse3, 1, 0, 0 AVX_INSTR aesdec, aesni, 0, 0, 0 AVX_INSTR aesdeclast, aesni, 0, 0, 0 AVX_INSTR aesenc, aesni, 0, 0, 0 AVX_INSTR aesenclast, aesni, 0, 0, 0 AVX_INSTR aesimc, aesni AVX_INSTR aeskeygenassist, aesni AVX_INSTR andnpd, sse2, 1, 0, 0 AVX_INSTR andnps, sse, 1, 0, 0 AVX_INSTR andpd, sse2, 1, 0, 1 AVX_INSTR andps, sse, 1, 0, 1 AVX_INSTR blendpd, sse4, 1, 1, 0 AVX_INSTR blendps, sse4, 1, 1, 0 AVX_INSTR blendvpd, sse4, 1, 1, 0 ; last operand must be xmm0 with legacy encoding AVX_INSTR blendvps, sse4, 1, 1, 0 ; last operand must be xmm0 with legacy encoding AVX_INSTR cmpeqpd, sse2, 1, 0, 1 AVX_INSTR cmpeqps, sse, 1, 0, 1 AVX_INSTR cmpeqsd, sse2, 1, 0, 0 AVX_INSTR cmpeqss, sse, 1, 0, 0 AVX_INSTR cmplepd, sse2, 1, 0, 0 AVX_INSTR cmpleps, sse, 1, 0, 0 AVX_INSTR cmplesd, sse2, 1, 0, 0 AVX_INSTR cmpless, sse, 1, 0, 0 AVX_INSTR cmpltpd, sse2, 1, 0, 0 AVX_INSTR cmpltps, sse, 1, 0, 0 AVX_INSTR cmpltsd, sse2, 1, 0, 0 AVX_INSTR cmpltss, sse, 1, 0, 0 AVX_INSTR cmpneqpd, sse2, 1, 0, 1 AVX_INSTR cmpneqps, sse, 1, 0, 1 AVX_INSTR cmpneqsd, sse2, 1, 0, 0 AVX_INSTR cmpneqss, sse, 1, 0, 0 AVX_INSTR cmpnlepd, sse2, 1, 0, 0 AVX_INSTR cmpnleps, sse, 1, 0, 0 AVX_INSTR cmpnlesd, sse2, 1, 0, 0 AVX_INSTR cmpnless, sse, 1, 0, 0 AVX_INSTR cmpnltpd, sse2, 1, 0, 0 AVX_INSTR cmpnltps, sse, 1, 0, 0 AVX_INSTR cmpnltsd, sse2, 1, 0, 0 AVX_INSTR cmpnltss, sse, 1, 0, 0 AVX_INSTR cmpordpd, sse2 1, 0, 1 AVX_INSTR cmpordps, sse 1, 0, 1 AVX_INSTR cmpordsd, sse2 1, 0, 0 AVX_INSTR cmpordss, sse 1, 0, 0 AVX_INSTR cmppd, sse2, 1, 1, 0 AVX_INSTR cmpps, sse, 1, 1, 0 AVX_INSTR cmpsd, sse2, 1, 1, 0 AVX_INSTR cmpss, sse, 1, 1, 0 AVX_INSTR cmpunordpd, sse2, 1, 0, 1 AVX_INSTR cmpunordps, sse, 1, 0, 1 AVX_INSTR cmpunordsd, sse2, 1, 0, 0 AVX_INSTR cmpunordss, sse, 1, 0, 0 AVX_INSTR comisd, sse2, 1 AVX_INSTR comiss, sse, 1 AVX_INSTR cvtdq2pd, sse2, 1 AVX_INSTR cvtdq2ps, sse2, 1 AVX_INSTR cvtpd2dq, sse2, 1 AVX_INSTR cvtpd2ps, sse2, 1 AVX_INSTR cvtps2dq, 
sse2, 1 AVX_INSTR cvtps2pd, sse2, 1 AVX_INSTR cvtsd2si, sse2, 1 AVX_INSTR cvtsd2ss, sse2, 1, 0, 0 AVX_INSTR cvtsi2sd, sse2, 1, 0, 0 AVX_INSTR cvtsi2ss, sse, 1, 0, 0 AVX_INSTR cvtss2sd, sse2, 1, 0, 0 AVX_INSTR cvtss2si, sse, 1 AVX_INSTR cvttpd2dq, sse2, 1 AVX_INSTR cvttps2dq, sse2, 1 AVX_INSTR cvttsd2si, sse2, 1 AVX_INSTR cvttss2si, sse, 1 AVX_INSTR divpd, sse2, 1, 0, 0 AVX_INSTR divps, sse, 1, 0, 0 AVX_INSTR divsd, sse2, 1, 0, 0 AVX_INSTR divss, sse, 1, 0, 0 AVX_INSTR dppd, sse4, 1, 1, 0 AVX_INSTR dpps, sse4, 1, 1, 0 AVX_INSTR extractps, sse4, 1 AVX_INSTR gf2p8affineinvqb, gfni, 0, 1, 0 AVX_INSTR gf2p8affineqb, gfni, 0, 1, 0 AVX_INSTR gf2p8mulb, gfni, 0, 0, 0 AVX_INSTR haddpd, sse3, 1, 0, 0 AVX_INSTR haddps, sse3, 1, 0, 0 AVX_INSTR hsubpd, sse3, 1, 0, 0 AVX_INSTR hsubps, sse3, 1, 0, 0 AVX_INSTR insertps, sse4, 1, 1, 0 AVX_INSTR lddqu, sse3 AVX_INSTR ldmxcsr, sse, 1 AVX_INSTR maskmovdqu, sse2 AVX_INSTR maxpd, sse2, 1, 0, 1 AVX_INSTR maxps, sse, 1, 0, 1 AVX_INSTR maxsd, sse2, 1, 0, 0 AVX_INSTR maxss, sse, 1, 0, 0 AVX_INSTR minpd, sse2, 1, 0, 1 AVX_INSTR minps, sse, 1, 0, 1 AVX_INSTR minsd, sse2, 1, 0, 0 AVX_INSTR minss, sse, 1, 0, 0 AVX_INSTR movapd, sse2, 1 AVX_INSTR movaps, sse, 1 AVX_INSTR movd, mmx AVX_INSTR movddup, sse3, 1 AVX_INSTR movdqa, sse2 AVX_INSTR movdqu, sse2 AVX_INSTR movhlps, sse, 1, 0, 0 AVX_INSTR movhpd, sse2, 1, 0, 0 AVX_INSTR movhps, sse, 1, 0, 0 AVX_INSTR movlhps, sse, 1, 0, 0 AVX_INSTR movlpd, sse2, 1, 0, 0 AVX_INSTR movlps, sse, 1, 0, 0 AVX_INSTR movmskpd, sse2, 1 AVX_INSTR movmskps, sse, 1 AVX_INSTR movntdq, sse2 AVX_INSTR movntdqa, sse4 AVX_INSTR movntpd, sse2, 1 AVX_INSTR movntps, sse, 1 AVX_INSTR movq, mmx AVX_INSTR movsd, sse2, 1, 0, 0 AVX_INSTR movshdup, sse3, 1 AVX_INSTR movsldup, sse3, 1 AVX_INSTR movss, sse, 1, 0, 0 AVX_INSTR movupd, sse2, 1 AVX_INSTR movups, sse, 1 AVX_INSTR mpsadbw, sse4, 0, 1, 0 AVX_INSTR mulpd, sse2, 1, 0, 1 AVX_INSTR mulps, sse, 1, 0, 1 AVX_INSTR mulsd, sse2, 1, 0, 0 AVX_INSTR mulss, sse, 1, 0, 0 AVX_INSTR orpd, sse2, 1, 0, 1 AVX_INSTR orps, sse, 1, 0, 1 AVX_INSTR pabsb, ssse3 AVX_INSTR pabsd, ssse3 AVX_INSTR pabsw, ssse3 AVX_INSTR packssdw, mmx, 0, 0, 0 AVX_INSTR packsswb, mmx, 0, 0, 0 AVX_INSTR packusdw, sse4, 0, 0, 0 AVX_INSTR packuswb, mmx, 0, 0, 0 AVX_INSTR paddb, mmx, 0, 0, 1 AVX_INSTR paddd, mmx, 0, 0, 1 AVX_INSTR paddq, sse2, 0, 0, 1 AVX_INSTR paddsb, mmx, 0, 0, 1 AVX_INSTR paddsw, mmx, 0, 0, 1 AVX_INSTR paddusb, mmx, 0, 0, 1 AVX_INSTR paddusw, mmx, 0, 0, 1 AVX_INSTR paddw, mmx, 0, 0, 1 AVX_INSTR palignr, ssse3, 0, 1, 0 AVX_INSTR pand, mmx, 0, 0, 1 AVX_INSTR pandn, mmx, 0, 0, 0 AVX_INSTR pavgb, mmx2, 0, 0, 1 AVX_INSTR pavgw, mmx2, 0, 0, 1 AVX_INSTR pblendvb, sse4, 0, 1, 0 ; last operand must be xmm0 with legacy encoding AVX_INSTR pblendw, sse4, 0, 1, 0 AVX_INSTR pclmulhqhqdq, fnord, 0, 0, 0 AVX_INSTR pclmulhqlqdq, fnord, 0, 0, 0 AVX_INSTR pclmullqhqdq, fnord, 0, 0, 0 AVX_INSTR pclmullqlqdq, fnord, 0, 0, 0 AVX_INSTR pclmulqdq, fnord, 0, 1, 0 AVX_INSTR pcmpeqb, mmx, 0, 0, 1 AVX_INSTR pcmpeqd, mmx, 0, 0, 1 AVX_INSTR pcmpeqq, sse4, 0, 0, 1 AVX_INSTR pcmpeqw, mmx, 0, 0, 1 AVX_INSTR pcmpestri, sse42 AVX_INSTR pcmpestrm, sse42 AVX_INSTR pcmpgtb, mmx, 0, 0, 0 AVX_INSTR pcmpgtd, mmx, 0, 0, 0 AVX_INSTR pcmpgtq, sse42, 0, 0, 0 AVX_INSTR pcmpgtw, mmx, 0, 0, 0 AVX_INSTR pcmpistri, sse42 AVX_INSTR pcmpistrm, sse42 AVX_INSTR pextrb, sse4 AVX_INSTR pextrd, sse4 AVX_INSTR pextrq, sse4 AVX_INSTR pextrw, mmx2 AVX_INSTR phaddd, ssse3, 0, 0, 0 AVX_INSTR phaddsw, ssse3, 0, 0, 0 AVX_INSTR phaddw, ssse3, 0, 0, 0 AVX_INSTR phminposuw, sse4 AVX_INSTR 
phsubd, ssse3, 0, 0, 0 AVX_INSTR phsubsw, ssse3, 0, 0, 0 AVX_INSTR phsubw, ssse3, 0, 0, 0 AVX_INSTR pinsrb, sse4, 0, 1, 0 AVX_INSTR pinsrd, sse4, 0, 1, 0 AVX_INSTR pinsrq, sse4, 0, 1, 0 AVX_INSTR pinsrw, mmx2, 0, 1, 0 AVX_INSTR pmaddubsw, ssse3, 0, 0, 0 AVX_INSTR pmaddwd, mmx, 0, 0, 1 AVX_INSTR pmaxsb, sse4, 0, 0, 1 AVX_INSTR pmaxsd, sse4, 0, 0, 1 AVX_INSTR pmaxsw, mmx2, 0, 0, 1 AVX_INSTR pmaxub, mmx2, 0, 0, 1 AVX_INSTR pmaxud, sse4, 0, 0, 1 AVX_INSTR pmaxuw, sse4, 0, 0, 1 AVX_INSTR pminsb, sse4, 0, 0, 1 AVX_INSTR pminsd, sse4, 0, 0, 1 AVX_INSTR pminsw, mmx2, 0, 0, 1 AVX_INSTR pminub, mmx2, 0, 0, 1 AVX_INSTR pminud, sse4, 0, 0, 1 AVX_INSTR pminuw, sse4, 0, 0, 1 AVX_INSTR pmovmskb, mmx2 AVX_INSTR pmovsxbd, sse4 AVX_INSTR pmovsxbq, sse4 AVX_INSTR pmovsxbw, sse4 AVX_INSTR pmovsxdq, sse4 AVX_INSTR pmovsxwd, sse4 AVX_INSTR pmovsxwq, sse4 AVX_INSTR pmovzxbd, sse4 AVX_INSTR pmovzxbq, sse4 AVX_INSTR pmovzxbw, sse4 AVX_INSTR pmovzxdq, sse4 AVX_INSTR pmovzxwd, sse4 AVX_INSTR pmovzxwq, sse4 AVX_INSTR pmuldq, sse4, 0, 0, 1 AVX_INSTR pmulhrsw, ssse3, 0, 0, 1 AVX_INSTR pmulhuw, mmx2, 0, 0, 1 AVX_INSTR pmulhw, mmx, 0, 0, 1 AVX_INSTR pmulld, sse4, 0, 0, 1 AVX_INSTR pmullw, mmx, 0, 0, 1 AVX_INSTR pmuludq, sse2, 0, 0, 1 AVX_INSTR por, mmx, 0, 0, 1 AVX_INSTR psadbw, mmx2, 0, 0, 1 AVX_INSTR pshufb, ssse3, 0, 0, 0 AVX_INSTR pshufd, sse2 AVX_INSTR pshufhw, sse2 AVX_INSTR pshuflw, sse2 AVX_INSTR psignb, ssse3, 0, 0, 0 AVX_INSTR psignd, ssse3, 0, 0, 0 AVX_INSTR psignw, ssse3, 0, 0, 0 AVX_INSTR pslld, mmx, 0, 0, 0 AVX_INSTR pslldq, sse2, 0, 0, 0 AVX_INSTR psllq, mmx, 0, 0, 0 AVX_INSTR psllw, mmx, 0, 0, 0 AVX_INSTR psrad, mmx, 0, 0, 0 AVX_INSTR psraw, mmx, 0, 0, 0 AVX_INSTR psrld, mmx, 0, 0, 0 AVX_INSTR psrldq, sse2, 0, 0, 0 AVX_INSTR psrlq, mmx, 0, 0, 0 AVX_INSTR psrlw, mmx, 0, 0, 0 AVX_INSTR psubb, mmx, 0, 0, 0 AVX_INSTR psubd, mmx, 0, 0, 0 AVX_INSTR psubq, sse2, 0, 0, 0 AVX_INSTR psubsb, mmx, 0, 0, 0 AVX_INSTR psubsw, mmx, 0, 0, 0 AVX_INSTR psubusb, mmx, 0, 0, 0 AVX_INSTR psubusw, mmx, 0, 0, 0 AVX_INSTR psubw, mmx, 0, 0, 0 AVX_INSTR ptest, sse4 AVX_INSTR punpckhbw, mmx, 0, 0, 0 AVX_INSTR punpckhdq, mmx, 0, 0, 0 AVX_INSTR punpckhqdq, sse2, 0, 0, 0 AVX_INSTR punpckhwd, mmx, 0, 0, 0 AVX_INSTR punpcklbw, mmx, 0, 0, 0 AVX_INSTR punpckldq, mmx, 0, 0, 0 AVX_INSTR punpcklqdq, sse2, 0, 0, 0 AVX_INSTR punpcklwd, mmx, 0, 0, 0 AVX_INSTR pxor, mmx, 0, 0, 1 AVX_INSTR rcpps, sse, 1 AVX_INSTR rcpss, sse, 1, 0, 0 AVX_INSTR roundpd, sse4, 1 AVX_INSTR roundps, sse4, 1 AVX_INSTR roundsd, sse4, 1, 1, 0 AVX_INSTR roundss, sse4, 1, 1, 0 AVX_INSTR rsqrtps, sse, 1 AVX_INSTR rsqrtss, sse, 1, 0, 0 AVX_INSTR shufpd, sse2, 1, 1, 0 AVX_INSTR shufps, sse, 1, 1, 0 AVX_INSTR sqrtpd, sse2, 1 AVX_INSTR sqrtps, sse, 1 AVX_INSTR sqrtsd, sse2, 1, 0, 0 AVX_INSTR sqrtss, sse, 1, 0, 0 AVX_INSTR stmxcsr, sse, 1 AVX_INSTR subpd, sse2, 1, 0, 0 AVX_INSTR subps, sse, 1, 0, 0 AVX_INSTR subsd, sse2, 1, 0, 0 AVX_INSTR subss, sse, 1, 0, 0 AVX_INSTR ucomisd, sse2, 1 AVX_INSTR ucomiss, sse, 1 AVX_INSTR unpckhpd, sse2, 1, 0, 0 AVX_INSTR unpckhps, sse, 1, 0, 0 AVX_INSTR unpcklpd, sse2, 1, 0, 0 AVX_INSTR unpcklps, sse, 1, 0, 0 AVX_INSTR xorpd, sse2, 1, 0, 1 AVX_INSTR xorps, sse, 1, 0, 1 ; 3DNow instructions, for sharing code between AVX, SSE and 3DN AVX_INSTR pfadd, 3dnow, 1, 0, 1 AVX_INSTR pfmul, 3dnow, 1, 0, 1 AVX_INSTR pfsub, 3dnow, 1, 0, 0 ;%1 == instruction ;%2 == minimal instruction set %macro GPR_INSTR 2 %macro %1 2-5 fnord, %1, %2 %ifdef cpuname %if notcpuflag(%5) %error use of ``%4'' %5 instruction in cpuname function: current_function %endif %endif %ifidn 
%3, fnord %4 %1, %2 %else %4 %1, %2, %3 %endif %endmacro %endmacro GPR_INSTR andn, bmi1 GPR_INSTR bextr, bmi1 GPR_INSTR blsi, bmi1 GPR_INSTR blsmsk, bmi1 GPR_INSTR blsr, bmi1 GPR_INSTR bzhi, bmi2 GPR_INSTR mulx, bmi2 GPR_INSTR pdep, bmi2 GPR_INSTR pext, bmi2 GPR_INSTR popcnt, sse42 GPR_INSTR rorx, bmi2 GPR_INSTR sarx, bmi2 GPR_INSTR shlx, bmi2 GPR_INSTR shrx, bmi2 ; base-4 constants for shuffles %assign i 0 %rep 256 %assign j ((i>>6)&3)*1000 + ((i>>4)&3)*100 + ((i>>2)&3)*10 + (i&3) %if j < 10 CAT_XDEFINE q000, j, i %elif j < 100 CAT_XDEFINE q00, j, i %elif j < 1000 CAT_XDEFINE q0, j, i %else CAT_XDEFINE q, j, i %endif %assign i i+1 %endrep %undef i %undef j %macro FMA_INSTR 3 %macro %1 4-7 %1, %2, %3 %if cpuflag(xop) v%5 %1, %2, %3, %4 %elifnidn %1, %4 %6 %1, %2, %3 %7 %1, %4 %else %error non-xop emulation of ``%5 %1, %2, %3, %4'' is not supported %endif %endmacro %endmacro FMA_INSTR pmacsdd, pmulld, paddd ; sse4 emulation FMA_INSTR pmacsdql, pmuldq, paddq ; sse4 emulation FMA_INSTR pmacsww, pmullw, paddw FMA_INSTR pmadcswd, pmaddwd, paddd ; Macros for consolidating FMA3 and FMA4 using 4-operand (dst, src1, src2, src3) syntax. ; FMA3 is only possible if dst is the same as one of the src registers. ; Either src2 or src3 can be a memory operand. %macro FMA4_INSTR 2-* %push fma4_instr %xdefine %$prefix %1 %rep %0 - 1 %macro %$prefix%2 4-6 %$prefix, %2 %if notcpuflag(fma3) && notcpuflag(fma4) %error use of ``%5%6'' fma instruction in cpuname function: current_function %elif cpuflag(fma4) v%5%6 %1, %2, %3, %4 %elifidn %1, %2 ; If %3 or %4 is a memory operand it needs to be encoded as the last operand. %ifnum sizeof%3 v%{5}213%6 %2, %3, %4 %else v%{5}132%6 %2, %4, %3 %endif %elifidn %1, %3 v%{5}213%6 %3, %2, %4 %elifidn %1, %4 v%{5}231%6 %4, %2, %3 %else %error fma3 emulation of ``%5%6 %1, %2, %3, %4'' is not supported %endif %endmacro %rotate 1 %endrep %pop %endmacro FMA4_INSTR fmadd, pd, ps, sd, ss FMA4_INSTR fmaddsub, pd, ps FMA4_INSTR fmsub, pd, ps, sd, ss FMA4_INSTR fmsubadd, pd, ps FMA4_INSTR fnmadd, pd, ps, sd, ss FMA4_INSTR fnmsub, pd, ps, sd, ss ; Macros for converting VEX instructions to equivalent EVEX ones. 
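; Illustrative note (added commentary, not from the original source): after
; `EVEX_INSTR vmovdqa, vmovdqa32` below, a plain `vmovdqa` keeps its shorter VEX
; encoding when every operand fits in xmm/ymm0-15, but is emitted as the
; EVEX-only `vmovdqa32` when an operand is a zmm register or xmm/ymm16-31, or
; when the optional third argument marks the EVEX form as preferred on AVX-512.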
%macro EVEX_INSTR 2-3 0 ; vex, evex, prefer_evex %macro %1 2-7 fnord, fnord, %1, %2, %3 %ifidn %3, fnord %define %%args %1, %2 %elifidn %4, fnord %define %%args %1, %2, %3 %else %define %%args %1, %2, %3, %4 %endif %assign %%evex_required cpuflag(avx512) & %7 %ifnum regnumof%1 %if regnumof%1 >= 16 || sizeof%1 > 32 %assign %%evex_required 1 %endif %endif %ifnum regnumof%2 %if regnumof%2 >= 16 || sizeof%2 > 32 %assign %%evex_required 1 %endif %endif %ifnum regnumof%3 %if regnumof%3 >= 16 || sizeof%3 > 32 %assign %%evex_required 1 %endif %endif %if %%evex_required %6 %%args %else %5 %%args ; Prefer VEX over EVEX due to shorter instruction length %endif %endmacro %endmacro EVEX_INSTR vbroadcastf128, vbroadcastf32x4 EVEX_INSTR vbroadcasti128, vbroadcasti32x4 EVEX_INSTR vextractf128, vextractf32x4 EVEX_INSTR vextracti128, vextracti32x4 EVEX_INSTR vinsertf128, vinsertf32x4 EVEX_INSTR vinserti128, vinserti32x4 EVEX_INSTR vmovdqa, vmovdqa32 EVEX_INSTR vmovdqu, vmovdqu32 EVEX_INSTR vpand, vpandd EVEX_INSTR vpandn, vpandnd EVEX_INSTR vpor, vpord EVEX_INSTR vpxor, vpxord EVEX_INSTR vrcpps, vrcp14ps, 1 ; EVEX versions have higher precision EVEX_INSTR vrcpss, vrcp14ss, 1 EVEX_INSTR vrsqrtps, vrsqrt14ps, 1 EVEX_INSTR vrsqrtss, vrsqrt14ss, 1 av-scenechange-0.14.1/src/cpu.rs000064400000000000000000000254461046102023000145150ustar 00000000000000#[cfg(asm_neon)] pub use neon::*; #[cfg(not(any(asm_x86_64, asm_neon)))] pub use rust::*; #[cfg(asm_x86_64)] pub use x86::*; #[cfg(not(any(asm_x86_64, asm_neon)))] mod rust { use arg_enum_proc_macro::ArgEnum; #[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, ArgEnum, Default)] #[allow(clippy::upper_case_acronyms)] pub enum CpuFeatureLevel { #[default] RUST, } impl CpuFeatureLevel { #[cfg(test)] #[allow(unused)] #[inline] pub const fn all() -> &'static [Self] { use CpuFeatureLevel::*; &[RUST] } } } #[cfg(asm_x86_64)] #[macro_use] mod x86 { use std::{env, str::FromStr}; use arg_enum_proc_macro::ArgEnum; #[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, ArgEnum)] #[allow(clippy::upper_case_acronyms)] pub enum CpuFeatureLevel { RUST, SSE2, SSSE3, #[arg_enum(alias = "sse4.1")] SSE4_1, AVX2, AVX512, #[arg_enum(alias = "avx512vpclmulqdq")] AVX512ICL, } impl CpuFeatureLevel { #[cfg(test)] pub const fn all() -> &'static [Self] { &[ CpuFeatureLevel::RUST, CpuFeatureLevel::SSE2, CpuFeatureLevel::SSSE3, CpuFeatureLevel::SSE4_1, CpuFeatureLevel::AVX2, CpuFeatureLevel::AVX512, CpuFeatureLevel::AVX512ICL, ] } #[inline] pub const fn len() -> usize { CpuFeatureLevel::AVX512ICL as usize + 1 } #[inline] pub const fn as_index(self) -> usize { self as usize } } impl Default for CpuFeatureLevel { #[inline] fn default() -> CpuFeatureLevel { fn avx512_detected() -> bool { is_x86_feature_detected!("avx512bw") && is_x86_feature_detected!("avx512cd") && is_x86_feature_detected!("avx512dq") && is_x86_feature_detected!("avx512f") && is_x86_feature_detected!("avx512vl") } #[allow(deprecated)] // Until MSRV >= 1.69.0 fn avx512icl_detected() -> bool { // Per dav1d, these are the flags needed. 
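// (Added note: these correspond to the Ice Lake-era AVX-512 extensions that
// dav1d groups under "AVX512ICL", layered on top of the baseline AVX-512
// features checked by `avx512_detected` above.)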
avx512_detected() && is_x86_feature_detected!("avx512vnni") && is_x86_feature_detected!("avx512ifma") && is_x86_feature_detected!("avx512vbmi") && is_x86_feature_detected!("avx512vbmi2") && is_x86_feature_detected!("avx512vpopcntdq") && is_x86_feature_detected!("avx512bitalg") && is_x86_feature_detected!("avx512gfni") && is_x86_feature_detected!("avx512vaes") && is_x86_feature_detected!("avx512vpclmulqdq") } let detected: CpuFeatureLevel = if avx512icl_detected() { CpuFeatureLevel::AVX512ICL } else if avx512_detected() { CpuFeatureLevel::AVX512 } else if is_x86_feature_detected!("avx2") { CpuFeatureLevel::AVX2 } else if is_x86_feature_detected!("sse4.1") { CpuFeatureLevel::SSE4_1 } else if is_x86_feature_detected!("ssse3") { CpuFeatureLevel::SSSE3 } else if is_x86_feature_detected!("sse2") { CpuFeatureLevel::SSE2 } else { CpuFeatureLevel::RUST }; let manual: CpuFeatureLevel = match env::var("CPU_TARGET") { Ok(feature) => CpuFeatureLevel::from_str(&feature).unwrap_or(detected), Err(_e) => detected, }; if manual > detected { detected } else { manual } } } // Create a static lookup table for CPUFeatureLevel enums // Note: keys are CpuFeatureLevels without any prefix (no CpuFeatureLevel::) macro_rules! cpu_function_lookup_table { // version for default visibility ($name:ident: [$type:ty], default: $empty:expr, [$(($key:ident, $value:expr)),*]) => { static $name: [$type; crate::cpu::CpuFeatureLevel::len()] = { use crate::cpu::CpuFeatureLevel; #[allow(unused_mut)] let mut out: [$type; CpuFeatureLevel::len()] = [$empty; CpuFeatureLevel::len()]; // Can't use out[0][.] == $empty in static as of rust 1.40 #[allow(unused_mut)] let mut set: [bool; CpuFeatureLevel::len()] = [false; CpuFeatureLevel::len()]; #[allow(unused_imports)] use CpuFeatureLevel::*; $( out[$key as usize] = $value; set[$key as usize] = true; )* cpu_function_lookup_table!(waterfall_cpu_features(out, set, [SSE2, SSSE3, SSE4_1, AVX2, AVX512, AVX512ICL])); out }; }; ($pub:vis, $name:ident: [$type:ty], default: $empty:expr, [$(($key:ident, $value:expr)),*]) => { $pub cpu_function_lookup_table!($name: [$type], default: $empty, [$(($key, $value)),*]); }; // Fill empty output functions with the existent functions they support. // cpus should be in order of lowest cpu level to highest // Used like an internal function // Put in here to avoid adding more public macros (waterfall_cpu_features($out:ident, $set:ident, [$($cpu:ident),*])) => { // Use an array to emulate if statements (not supported in static as of // rust 1.40). Setting best[0] (false) and best[1] (true) is equivalent to // doing nothing and overriding our value respectively. #[allow(unused_assignments)] let mut best = [$out[0], $out[0]]; $( // If the current entry has a function, update out best function. best[$set[$cpu as usize] as usize] = $out[$cpu as usize]; // Update our current entry. Does nothing if it already had a function. 
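// (Added example: if a lookup table defines only SSSE3 and AVX2 entries, the
// SSE4_1 slot inherits the SSSE3 function and the AVX512/AVX512ICL slots
// inherit the AVX2 function, while SSE2 keeps the default entry.)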
$out[$cpu as usize] = best[1]; )* }; // use $name_$key as our values ($pub:vis, $name:ident: [$type:ty], default: $empty:expr, [$($key:ident),*]) => { pastey::item!{ cpu_function_lookup_table!( $pub, $name: [$type], default: $empty, [$(($key, [<$name _$key>])),*] ); } }; // version for default visibility ($name:ident: [$type:ty], default: $empty:expr, [$($key:ident),*]) => { pastey::item!{ cpu_function_lookup_table!( $name: [$type], default: $empty, [$(($key, [<$name _$key>])),*] ); } }; } } #[cfg(asm_neon)] #[macro_use] mod neon { use std::{env, str::FromStr}; use arg_enum_proc_macro::ArgEnum; #[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, ArgEnum)] #[allow(clippy::upper_case_acronyms)] pub enum CpuFeatureLevel { RUST, NEON, } impl CpuFeatureLevel { #[cfg(test)] #[inline] pub const fn all() -> &'static [Self] { use CpuFeatureLevel::*; &[RUST, NEON] } #[inline] pub const fn len() -> usize { CpuFeatureLevel::NEON as usize + 1 } #[inline] pub fn as_index(self) -> usize { self as usize } } impl Default for CpuFeatureLevel { #[inline] fn default() -> CpuFeatureLevel { let detected = CpuFeatureLevel::NEON; let manual: CpuFeatureLevel = match env::var("CPU_TARGET") { Ok(feature) => CpuFeatureLevel::from_str(&feature).unwrap_or(detected), Err(_e) => detected, }; if manual > detected { detected } else { manual } } } // Create a static lookup table for CPUFeatureLevel enums // Note: keys are CpuFeatureLevels without any prefix (no CpuFeatureLevel::) macro_rules! cpu_function_lookup_table { // version for default visibility ($name:ident: [$type:ty], default: $empty:expr, [$(($key:ident, $value:expr)),*]) => { static $name: [$type; crate::cpu::CpuFeatureLevel::len()] = { use crate::cpu::CpuFeatureLevel; #[allow(unused_mut)] let mut out: [$type; CpuFeatureLevel::len()] = [$empty; CpuFeatureLevel::len()]; // Can't use out[0][.] == $empty in static as of rust 1.40 #[allow(unused_mut)] let mut set: [bool; CpuFeatureLevel::len()] = [false; CpuFeatureLevel::len()]; #[allow(unused_imports)] use CpuFeatureLevel::*; $( out[$key as usize] = $value; set[$key as usize] = true; )* cpu_function_lookup_table!(waterfall_cpu_features(out, set, [NEON])); out }; }; ($pub:vis, $name:ident: [$type:ty], default: $empty:expr, [$(($key:ident, $value:expr)),*]) => { $pub cpu_function_lookup_table!($name: [$type], default: $empty, [$(($key, $value)),*]); }; // Fill empty output functions with the existent functions they support. // cpus should be in order of lowest cpu level to highest // Used like an internal function // Put in here to avoid adding more public macros (waterfall_cpu_features($out:ident, $set:ident, [$($cpu:ident),*])) => { // Use an array to emulate if statements (not supported in static as of // rust 1.40). Setting best[0] (false) and best[1] (true) is equivalent to // doing nothing and overriding our value respectively. #[allow(unused_assignments)] let mut best = [$out[0], $out[0]]; $( // If the current entry has a function, update out best function. best[$set[$cpu as usize] as usize] = $out[$cpu as usize]; // Update our current entry. Does nothing if it already had a function. 
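// (Added note: with NEON as the only asm level in this table, an unset NEON
// slot simply falls back to the default/RUST entry.)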
$out[$cpu as usize] = best[1]; )* }; // use $name_$key as our values ($pub:vis, $name:ident: [$type:ty], default: $empty:expr, [$($key:ident),*]) => { pastey::item!{ cpu_function_lookup_table!( $pub, $name: [$type], default: $empty, [$(($key, [<$name _$key>])),*] ); } }; // version for default visibility ($name:ident: [$type:ty], default: $empty:expr, [$($key:ident),*]) => { pastey::item!{ cpu_function_lookup_table!( $name: [$type], default: $empty, [$(($key, [<$name _$key>])),*] ); } }; } } av-scenechange-0.14.1/src/data/block.rs000064400000000000000000000171241046102023000157230ustar 00000000000000use std::{ fmt, fmt::Display, ops::{Index, IndexMut}, }; use thiserror::Error; use v_frame::plane::PlaneOffset; use crate::data::{ plane::PlaneBlockOffset, superblock::{MI_SIZE_LOG2, SB_SIZE_LOG2}, }; pub const MAX_TX_SIZE: usize = 64; pub const BLOCK_TO_PLANE_SHIFT: usize = MI_SIZE_LOG2; pub const MIB_SIZE_LOG2: usize = SB_SIZE_LOG2 - MI_SIZE_LOG2; #[derive(Debug, Copy, Clone, PartialEq, Eq)] #[cfg_attr(test, derive(Default))] #[allow(non_camel_case_types)] pub enum BlockSize { BLOCK_4X4, BLOCK_4X8, BLOCK_8X4, BLOCK_8X8, BLOCK_8X16, BLOCK_16X8, BLOCK_16X16, BLOCK_16X32, BLOCK_32X16, BLOCK_32X32, BLOCK_32X64, BLOCK_64X32, #[cfg_attr(test, default)] BLOCK_64X64, BLOCK_64X128, BLOCK_128X64, BLOCK_128X128, BLOCK_4X16, BLOCK_16X4, BLOCK_8X32, BLOCK_32X8, BLOCK_16X64, BLOCK_64X16, } impl BlockSize { /// # Errors /// /// - Returns `InvalidBlockSize` if the given `w` and `h` do not produce a /// valid block size. pub fn from_width_and_height_opt(w: usize, h: usize) -> Result { use crate::data::block::BlockSize::*; match (w, h) { (4, 4) => Ok(BLOCK_4X4), (4, 8) => Ok(BLOCK_4X8), (4, 16) => Ok(BLOCK_4X16), (8, 4) => Ok(BLOCK_8X4), (8, 8) => Ok(BLOCK_8X8), (8, 16) => Ok(BLOCK_8X16), (8, 32) => Ok(BLOCK_8X32), (16, 4) => Ok(BLOCK_16X4), (16, 8) => Ok(BLOCK_16X8), (16, 16) => Ok(BLOCK_16X16), (16, 32) => Ok(BLOCK_16X32), (16, 64) => Ok(BLOCK_16X64), (32, 8) => Ok(BLOCK_32X8), (32, 16) => Ok(BLOCK_32X16), (32, 32) => Ok(BLOCK_32X32), (32, 64) => Ok(BLOCK_32X64), (64, 16) => Ok(BLOCK_64X16), (64, 32) => Ok(BLOCK_64X32), (64, 64) => Ok(BLOCK_64X64), (64, 128) => Ok(BLOCK_64X128), (128, 64) => Ok(BLOCK_128X64), (128, 128) => Ok(BLOCK_128X128), _ => Err(InvalidBlockSize), } } /// # Panics /// /// - If the given `w` and `h` do not produce a valid block size. 
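///
/// For illustration (added, not part of the original docs):
/// `BlockSize::from_width_and_height(16, 16)` returns `BLOCK_16X16`, while an
/// unsupported pair such as `(16, 128)` panics.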
pub fn from_width_and_height(w: usize, h: usize) -> BlockSize { Self::from_width_and_height_opt(w, h).unwrap() } pub const fn width(self) -> usize { 1 << self.width_log2() } pub const fn width_log2(self) -> usize { use crate::data::block::BlockSize::*; match self { BLOCK_4X4 | BLOCK_4X8 | BLOCK_4X16 => 2, BLOCK_8X4 | BLOCK_8X8 | BLOCK_8X16 | BLOCK_8X32 => 3, BLOCK_16X4 | BLOCK_16X8 | BLOCK_16X16 | BLOCK_16X32 | BLOCK_16X64 => 4, BLOCK_32X8 | BLOCK_32X16 | BLOCK_32X32 | BLOCK_32X64 => 5, BLOCK_64X16 | BLOCK_64X32 | BLOCK_64X64 | BLOCK_64X128 => 6, BLOCK_128X64 | BLOCK_128X128 => 7, } } pub const fn height(self) -> usize { 1 << self.height_log2() } pub const fn height_log2(self) -> usize { use crate::data::block::BlockSize::*; match self { BLOCK_4X4 | BLOCK_8X4 | BLOCK_16X4 => 2, BLOCK_4X8 | BLOCK_8X8 | BLOCK_16X8 | BLOCK_32X8 => 3, BLOCK_4X16 | BLOCK_8X16 | BLOCK_16X16 | BLOCK_32X16 | BLOCK_64X16 => 4, BLOCK_8X32 | BLOCK_16X32 | BLOCK_32X32 | BLOCK_64X32 => 5, BLOCK_16X64 | BLOCK_32X64 | BLOCK_64X64 | BLOCK_128X64 => 6, BLOCK_64X128 | BLOCK_128X128 => 7, } } pub const fn tx_size(self) -> TxSize { use crate::data::block::{BlockSize::*, TxSize::*}; match self { BLOCK_4X4 => TX_4X4, BLOCK_4X8 => TX_4X8, BLOCK_8X4 => TX_8X4, BLOCK_8X8 => TX_8X8, BLOCK_8X16 => TX_8X16, BLOCK_16X8 => TX_16X8, BLOCK_16X16 => TX_16X16, BLOCK_16X32 => TX_16X32, BLOCK_32X16 => TX_32X16, BLOCK_32X32 => TX_32X32, BLOCK_32X64 => TX_32X64, BLOCK_64X32 => TX_64X32, BLOCK_4X16 => TX_4X16, BLOCK_16X4 => TX_16X4, BLOCK_8X32 => TX_8X32, BLOCK_32X8 => TX_32X8, BLOCK_16X64 => TX_16X64, BLOCK_64X16 => TX_64X16, _ => TX_64X64, } } } #[derive(Debug, Copy, Clone, Error, Eq, PartialEq)] pub struct InvalidBlockSize; impl Display for InvalidBlockSize { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { f.write_str("invalid block size") } } /// Transform Size #[derive(Copy, Clone, Debug, PartialEq, PartialOrd, Eq, Ord)] #[allow(non_camel_case_types)] pub enum TxSize { TX_4X4, TX_8X8, TX_16X16, TX_32X32, TX_64X64, TX_4X8, TX_8X4, TX_8X16, TX_16X8, TX_16X32, TX_32X16, TX_32X64, TX_64X32, TX_4X16, TX_16X4, TX_8X32, TX_32X8, TX_16X64, TX_64X16, } impl TxSize { pub const fn width(self) -> usize { 1 << self.width_log2() } pub const fn width_log2(self) -> usize { use crate::data::block::TxSize::*; match self { TX_4X4 | TX_4X8 | TX_4X16 => 2, TX_8X8 | TX_8X4 | TX_8X16 | TX_8X32 => 3, TX_16X16 | TX_16X8 | TX_16X32 | TX_16X4 | TX_16X64 => 4, TX_32X32 | TX_32X16 | TX_32X64 | TX_32X8 => 5, TX_64X64 | TX_64X32 | TX_64X16 => 6, } } pub const fn height(self) -> usize { 1 << self.height_log2() } pub const fn height_log2(self) -> usize { use crate::data::block::TxSize::*; match self { TX_4X4 | TX_8X4 | TX_16X4 => 2, TX_8X8 | TX_4X8 | TX_16X8 | TX_32X8 => 3, TX_16X16 | TX_8X16 | TX_32X16 | TX_4X16 | TX_64X16 => 4, TX_32X32 | TX_16X32 | TX_64X32 | TX_8X32 => 5, TX_64X64 | TX_32X64 | TX_16X64 => 6, } } } /// Absolute offset in blocks, where a block is defined /// to be an `N*N` square where `N == (1 << BLOCK_TO_PLANE_SHIFT)`. #[derive(Clone, Copy, Debug, Default, PartialEq, Eq)] pub struct BlockOffset { pub x: usize, pub y: usize, } impl BlockOffset { /// Convert to plane offset without decimation. 
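///
/// For illustration (added, not part of the original docs):
/// `BlockOffset { x: 1, y: 2 }` maps to
/// `PlaneOffset { x: 1 << BLOCK_TO_PLANE_SHIFT, y: 2 << BLOCK_TO_PLANE_SHIFT }`.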
pub const fn to_luma_plane_offset(self) -> PlaneOffset { PlaneOffset { x: (self.x as isize) << BLOCK_TO_PLANE_SHIFT, y: (self.y as isize) << BLOCK_TO_PLANE_SHIFT, } } pub fn with_offset(self, col_offset: isize, row_offset: isize) -> BlockOffset { let x = self.x as isize + col_offset; let y = self.y as isize + row_offset; debug_assert!(x >= 0); debug_assert!(y >= 0); BlockOffset { x: x as usize, y: y as usize, } } } #[derive(Clone)] pub struct FrameBlocks { blocks: Box<[Block]>, pub cols: usize, } impl Index for FrameBlocks { type Output = [Block]; fn index(&self, index: usize) -> &Self::Output { &self.blocks[index * self.cols..(index + 1) * self.cols] } } impl IndexMut for FrameBlocks { fn index_mut(&mut self, index: usize) -> &mut Self::Output { &mut self.blocks[index * self.cols..(index + 1) * self.cols] } } // for convenience, also index by BlockOffset impl Index for FrameBlocks { type Output = Block; fn index(&self, bo: PlaneBlockOffset) -> &Self::Output { &self[bo.0.y][bo.0.x] } } impl IndexMut for FrameBlocks { fn index_mut(&mut self, bo: PlaneBlockOffset) -> &mut Self::Output { &mut self[bo.0.y][bo.0.x] } } #[derive(Copy, Clone, Default)] pub struct Block {} av-scenechange-0.14.1/src/data/frame.rs000064400000000000000000000137211046102023000157220ustar 00000000000000use std::sync::Arc; use v_frame::{frame::Frame, math::Fixed, pixel::Pixel, plane::Plane}; use crate::data::motion::{RefMEStats, ReferenceFramesSet}; pub const MAX_PLANES: usize = 3; pub const ALLOWED_REF_FRAMES: &[RefType] = &[RefType::LAST_FRAME]; pub const INTER_REFS_PER_FRAME: usize = 7; #[derive(Debug, Clone)] pub struct FrameState { pub input: Arc>, pub input_hres: Arc>, // half-resolution version of input luma pub input_qres: Arc>, // quarter-resolution version of input luma pub frame_me_stats: RefMEStats, } impl FrameState { /// Similar to [`FrameState::new_with_frame`], but takes an `me_stats` /// and `rec` to enable reusing the same underlying allocations to create /// a `FrameState` /// /// This function primarily exists for [`estimate_inter_costs`], and so /// it does not create hres or qres versions of `frame` as downscaling is /// somewhat expensive and are not needed for [`estimate_inter_costs`]. pub fn new_with_frame_and_me_stats_and_rec(frame: Arc>, me_stats: RefMEStats) -> Self { let hres = Plane::new(0, 0, 0, 0, 0, 0); let qres = Plane::new(0, 0, 0, 0, 0, 0); Self { input: frame, input_hres: Arc::new(hres), input_qres: Arc::new(qres), frame_me_stats: me_stats, // enc_stats: Default::default(), } } } // LAST_FRAME through ALTREF_FRAME correspond to slots 0-6. 
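// (Added example: `RefType::LAST_FRAME.to_index()` is 0 and
// `RefType::ALTREF_FRAME.to_index()` is 6; `INTRA_FRAME` and `NONE_FRAME` have
// no ref-list slot and `to_index` panics for them.)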
#[allow(non_camel_case_types)] #[allow(dead_code)] #[derive(PartialEq, Eq, PartialOrd, Copy, Clone, Debug)] pub enum RefType { INTRA_FRAME = 0, LAST_FRAME = 1, LAST2_FRAME = 2, LAST3_FRAME = 3, GOLDEN_FRAME = 4, BWDREF_FRAME = 5, ALTREF2_FRAME = 6, ALTREF_FRAME = 7, NONE_FRAME = 8, } impl RefType { /// convert to a ref list index, 0-6 (`INTER_REFS_PER_FRAME`) /// /// # Panics /// /// - If the ref type is a None or Intra frame pub fn to_index(self) -> usize { use self::RefType::*; match self { NONE_FRAME => { panic!("Tried to get slot of NONE_FRAME"); } INTRA_FRAME => { panic!("Tried to get slot of INTRA_FRAME"); } _ => (self as usize) - 1, } } } // Frame Invariants are invariant inside a frame #[allow(dead_code)] #[derive(Debug, Clone)] pub struct FrameInvariants { pub w_in_b: usize, pub h_in_b: usize, pub ref_frames: [u8; INTER_REFS_PER_FRAME], pub rec_buffer: ReferenceFramesSet, } impl FrameInvariants { pub fn new(width: usize, height: usize) -> Self { let w_in_b = 2 * width.align_power_of_two_and_shift(3); // MiCols, ((width+7)/8)<<3 >> MI_SIZE_LOG2 let h_in_b = 2 * height.align_power_of_two_and_shift(3); // MiRows, ((height+7)/8)<<3 >> MI_SIZE_LOG2 Self { w_in_b, h_in_b, ref_frames: [0; INTER_REFS_PER_FRAME], rec_buffer: ReferenceFramesSet::new(), } } pub fn new_key_frame(width: usize, height: usize) -> Self { Self::new(width, height) } /// Returns the created `FrameInvariants`, or `None` if this should be /// a placeholder frame. pub(crate) fn new_inter_frame( previous_coded_fi: &Self, output_frameno_in_gop: u64, ) -> Option { let mut fi = previous_coded_fi.clone(); let idx_in_group_output = get_idx_in_group_output(output_frameno_in_gop); let order_hint = get_order_hint(output_frameno_in_gop, idx_in_group_output); // this is the slot that the current frame is going to be saved into let slot_idx = get_slot_idx(0, order_hint); // level 0 has no forward references // default to last P frame // // calculations done relative to the slot_idx for this frame. // the last four frames can be found by subtracting from the current // // add 4 to prevent underflow // TODO: maybe use order_hint here like in get_slot_idx? // this is the previous P frame fi.ref_frames = [(slot_idx + 4 - 1) as u8 % 4; INTER_REFS_PER_FRAME]; Some(fi) } } // All the stuff below is ripped from InterCfg but assumes // reordering and multiref are off, so pyramid depth is always 0 const fn get_slot_idx(level: u64, order_hint: u32) -> u32 { // Frames with level == 0 are stored in slots 0..4, and frames with higher // values of level in slots 4..8 if level == 0 { order_hint & 3 } else { // This only works with pyramid_depth <= 4. 3 + level as u32 } } /// Get the index of an output frame in its re-ordering group given the output /// frame number of the frame in the current keyframe gop. /// When re-ordering is disabled, this always returns 0. fn get_idx_in_group_output(output_frameno_in_gop: u64) -> u64 { // The first frame in the GOP should be a keyframe and is not re-ordered, // so we should not be calling this function on it. debug_assert!(output_frameno_in_gop > 0); output_frameno_in_gop - 1 } /// Get the order-hint of an output frame given the output frame number of the /// frame in the current keyframe gop and the index of that output frame /// in its re-ordering gorup. fn get_order_hint(output_frameno_in_gop: u64, idx_in_group_output: u64) -> u32 { // The first frame in the GOP should be a keyframe, but currently this // function only handles inter frames. 
// We could return 0 for keyframes if keyframe support is needed. debug_assert!(output_frameno_in_gop > 0); // Which P-frame group in the current gop is this output frame in? // Subtract 1 because the first frame in the gop is always a keyframe. let group_idx = output_frameno_in_gop - 1; // Get the offset to the corresponding input frame. let offset = idx_in_group_output + 1; // Construct the final order hint relative to the start of the group. (group_idx + offset) as u32 } av-scenechange-0.14.1/src/data/hadamard.rs000064400000000000000000000050171046102023000163700ustar 00000000000000pub unsafe fn hadamard4x4(data: &mut [i32]) { hadamard2d::<{ 4 * 4 }, 4, 4>(&mut *(data.as_mut_ptr() as *mut [i32; 16])); } // SAFETY: The length of data must be 64. pub unsafe fn hadamard8x8(data: &mut [i32]) { hadamard2d::<{ 8 * 8 }, 8, 8>(&mut *(data.as_mut_ptr() as *mut [i32; 64])); } fn hadamard2d(data: &mut [i32; LEN]) { // Vertical transform. let vert_func = if H == 4 { hadamard4_1d:: } else { hadamard8_1d:: }; vert_func(data); // Horizontal transform. let horz_func = if W == 4 { hadamard4_1d:: } else { hadamard8_1d:: }; horz_func(data); } #[allow(clippy::erasing_op)] #[allow(clippy::identity_op)] fn hadamard4_1d( data: &mut [i32; LEN], ) { for i in 0..N { let sub: &mut [i32] = &mut data[i * STRIDE0..]; let (a0, a1) = butterfly(sub[0 * STRIDE1], sub[1 * STRIDE1]); let (a2, a3) = butterfly(sub[2 * STRIDE1], sub[3 * STRIDE1]); let (b0, b2) = butterfly(a0, a2); let (b1, b3) = butterfly(a1, a3); sub[0 * STRIDE1] = b0; sub[1 * STRIDE1] = b1; sub[2 * STRIDE1] = b2; sub[3 * STRIDE1] = b3; } } #[allow(clippy::erasing_op)] #[allow(clippy::identity_op)] fn hadamard8_1d( data: &mut [i32; LEN], ) { for i in 0..N { let sub: &mut [i32] = &mut data[i * STRIDE0..]; let (a0, a1) = butterfly(sub[0 * STRIDE1], sub[1 * STRIDE1]); let (a2, a3) = butterfly(sub[2 * STRIDE1], sub[3 * STRIDE1]); let (a4, a5) = butterfly(sub[4 * STRIDE1], sub[5 * STRIDE1]); let (a6, a7) = butterfly(sub[6 * STRIDE1], sub[7 * STRIDE1]); let (b0, b2) = butterfly(a0, a2); let (b1, b3) = butterfly(a1, a3); let (b4, b6) = butterfly(a4, a6); let (b5, b7) = butterfly(a5, a7); let (c0, c4) = butterfly(b0, b4); let (c1, c5) = butterfly(b1, b5); let (c2, c6) = butterfly(b2, b6); let (c3, c7) = butterfly(b3, b7); sub[0 * STRIDE1] = c0; sub[1 * STRIDE1] = c1; sub[2 * STRIDE1] = c2; sub[3 * STRIDE1] = c3; sub[4 * STRIDE1] = c4; sub[5 * STRIDE1] = c5; sub[6 * STRIDE1] = c6; sub[7 * STRIDE1] = c7; } } const fn butterfly(a: i32, b: i32) -> (i32, i32) { ((a + b), (a - b)) } av-scenechange-0.14.1/src/data/mc/simd_neon.rs000064400000000000000000000132151046102023000172000ustar 00000000000000use v_frame::{ pixel::{Pixel, PixelType}, plane::PlaneSlice, }; use crate::{ cpu::CpuFeatureLevel, data::{ mc::{FilterMode, FilterMode::*}, plane::PlaneRegionMut, }, }; #[allow(clippy::too_many_arguments)] pub fn put_8tap_internal( dst: &mut PlaneRegionMut<'_, T>, src: PlaneSlice<'_, T>, width: usize, height: usize, col_frac: i32, row_frac: i32, bit_depth: usize, cpu: CpuFeatureLevel, ) { let call_rust = |dst: &mut PlaneRegionMut<'_, T>| { super::rust::put_8tap_internal(dst, src, width, height, col_frac, row_frac, bit_depth, cpu); }; unsafe { // SAFETY: The assembly only supports even heights and valid uncropped // widths assert_eq!(height & 1, 0); assert!(width.is_power_of_two() && (2..=128).contains(&width)); // SAFETY: Check bounds of dst assert!(dst.rect().width >= width && dst.rect().height >= height); // SAFETY: Check bounds of src assert!(src.accessible(width + 4, height 
+ 4)); assert!(src.accessible_neg(3, 3)); match T::type_enum() { PixelType::U8 => match PUT_FNS[cpu.as_index()] [get_2d_mode_idx(FilterMode::REGULAR, FilterMode::REGULAR)] { Some(func) => (func)( dst.data_ptr_mut() as *mut _, T::to_asm_stride(dst.plane_cfg.stride), src.as_ptr() as *const _, T::to_asm_stride(src.plane.cfg.stride), width as i32, height as i32, col_frac, row_frac, ), None => call_rust(dst), }, PixelType::U16 if bit_depth > 8 => { match PUT_HBD_FNS[cpu.as_index()] [get_2d_mode_idx(FilterMode::REGULAR, FilterMode::REGULAR)] { Some(func) => (func)( dst.data_ptr_mut() as *mut _, T::to_asm_stride(dst.plane_cfg.stride), src.as_ptr() as *const _, T::to_asm_stride(src.plane.cfg.stride), width as i32, height as i32, col_frac, row_frac, (1 << bit_depth) - 1, ), None => call_rust(dst), } } _ => call_rust(dst), } } } // gets an index that can be mapped to a function for a pair of filter modes const fn get_2d_mode_idx(mode_x: FilterMode, mode_y: FilterMode) -> usize { (mode_x as usize + 4 * (mode_y as usize)) & 15 } type PutFn = unsafe extern "C" fn( dst: *mut u8, dst_stride: isize, src: *const u8, src_stride: isize, width: i32, height: i32, col_frac: i32, row_frac: i32, ); type PutHBDFn = unsafe extern "C" fn( dst: *mut u16, dst_stride: isize, src: *const u16, src_stride: isize, width: i32, height: i32, col_frac: i32, row_frac: i32, bitdepth_max: i32, ); macro_rules! decl_mc_fns { ($(($mode_x:expr, $mode_y:expr, $func_name:ident)),+) => { extern "C" { $( fn $func_name( dst: *mut u8, dst_stride: isize, src: *const u8, src_stride: isize, w: i32, h: i32, mx: i32, my: i32, ); )* } static PUT_FNS_NEON: [Option; 16] = { let mut out: [Option; 16] = [None; 16]; $( out[get_2d_mode_idx($mode_x, $mode_y)] = Some($func_name); )* out }; } } decl_mc_fns!( (REGULAR, REGULAR, avsc_put_8tap_regular_8bpc_neon), (REGULAR, SMOOTH, avsc_put_8tap_regular_smooth_8bpc_neon), (REGULAR, SHARP, avsc_put_8tap_regular_sharp_8bpc_neon), (SMOOTH, REGULAR, avsc_put_8tap_smooth_regular_8bpc_neon), (SMOOTH, SMOOTH, avsc_put_8tap_smooth_8bpc_neon), (SMOOTH, SHARP, avsc_put_8tap_smooth_sharp_8bpc_neon), (SHARP, REGULAR, avsc_put_8tap_sharp_regular_8bpc_neon), (SHARP, SMOOTH, avsc_put_8tap_sharp_smooth_8bpc_neon), (SHARP, SHARP, avsc_put_8tap_sharp_8bpc_neon), (BILINEAR, BILINEAR, avsc_put_bilin_8bpc_neon) ); cpu_function_lookup_table!( PUT_FNS: [[Option; 16]], default: [None; 16], [NEON] ); macro_rules! 
decl_mc_hbd_fns { ($(($mode_x:expr, $mode_y:expr, $func_name:ident)),+) => { extern "C" { $( fn $func_name( dst: *mut u16, dst_stride: isize, src: *const u16, src_stride: isize, w: i32, h: i32, mx: i32, my: i32, bitdepth_max: i32, ); )* } static PUT_HBD_FNS_NEON: [Option; 16] = { let mut out: [Option; 16] = [None; 16]; $( out[get_2d_mode_idx($mode_x, $mode_y)] = Some($func_name); )* out }; } } decl_mc_hbd_fns!( (REGULAR, REGULAR, avsc_put_8tap_regular_16bpc_neon), (REGULAR, SMOOTH, avsc_put_8tap_regular_smooth_16bpc_neon), (REGULAR, SHARP, avsc_put_8tap_regular_sharp_16bpc_neon), (SMOOTH, REGULAR, avsc_put_8tap_smooth_regular_16bpc_neon), (SMOOTH, SMOOTH, avsc_put_8tap_smooth_16bpc_neon), (SMOOTH, SHARP, avsc_put_8tap_smooth_sharp_16bpc_neon), (SHARP, REGULAR, avsc_put_8tap_sharp_regular_16bpc_neon), (SHARP, SMOOTH, avsc_put_8tap_sharp_smooth_16bpc_neon), (SHARP, SHARP, avsc_put_8tap_sharp_16bpc_neon), (BILINEAR, BILINEAR, avsc_put_bilin_16bpc_neon) ); cpu_function_lookup_table!( PUT_HBD_FNS: [[Option; 16]], default: [None; 16], [NEON] ); av-scenechange-0.14.1/src/data/mc/simd_x86.rs000064400000000000000000000156531046102023000166760ustar 00000000000000use v_frame::{ pixel::{Pixel, PixelType}, plane::PlaneSlice, }; use crate::{ cpu::CpuFeatureLevel, data::{ mc::{FilterMode, FilterMode::*}, plane::PlaneRegionMut, }, }; #[allow(clippy::too_many_arguments)] pub fn put_8tap_internal( dst: &mut PlaneRegionMut<'_, T>, src: PlaneSlice<'_, T>, width: usize, height: usize, col_frac: i32, row_frac: i32, bit_depth: usize, cpu: CpuFeatureLevel, ) { let call_rust = |dst: &mut PlaneRegionMut<'_, T>| { super::rust::put_8tap_internal(dst, src, width, height, col_frac, row_frac, bit_depth, cpu); }; // SAFETY: The assembly only supports even heights and valid uncropped // widths unsafe { assert_eq!(height & 1, 0); assert!(width.is_power_of_two() && (2..=128).contains(&width)); // SAFETY: Check bounds of dst assert!(dst.rect().width >= width && dst.rect().height >= height); // SAFETY: Check bounds of src assert!(src.accessible(width + 4, height + 4)); assert!(src.accessible_neg(3, 3)); match T::type_enum() { PixelType::U8 => match PUT_FNS[cpu.as_index()][get_2d_mode_idx(REGULAR, REGULAR)] { Some(func) => func( dst.data_ptr_mut() as *mut _, T::to_asm_stride(dst.plane_cfg.stride), src.as_ptr() as *const _, T::to_asm_stride(src.plane.cfg.stride), width as i32, height as i32, col_frac, row_frac, ), None => call_rust(dst), }, PixelType::U16 => { match PUT_HBD_FNS[cpu.as_index()][get_2d_mode_idx(REGULAR, REGULAR)] { Some(func) => func( dst.data_ptr_mut() as *mut _, T::to_asm_stride(dst.plane_cfg.stride), src.as_ptr() as *const _, T::to_asm_stride(src.plane.cfg.stride), width as i32, height as i32, col_frac, row_frac, (1 << bit_depth) - 1, ), None => call_rust(dst), } } } } } type PutFn = unsafe extern "C" fn( dst: *mut u8, dst_stride: isize, src: *const u8, src_stride: isize, width: i32, height: i32, col_frac: i32, row_frac: i32, ); type PutHBDFn = unsafe extern "C" fn( dst: *mut u16, dst_stride: isize, src: *const u16, src_stride: isize, width: i32, height: i32, col_frac: i32, row_frac: i32, bitdepth_max: i32, ); // gets an index that can be mapped to a function for a pair of filter modes const fn get_2d_mode_idx(mode_x: FilterMode, mode_y: FilterMode) -> usize { (mode_x as usize + 4 * (mode_y as usize)) & 15 } macro_rules! decl_mc_fns { ($(($mode_x:expr, $mode_y:expr, $func_name:ident)),+) => { pastey::item! 
{ extern "C" { $( fn [<$func_name _ssse3>]( dst: *mut u8, dst_stride: isize, src: *const u8, src_stride: isize, w: i32, h: i32, mx: i32, my: i32 ); fn [<$func_name _avx2>]( dst: *mut u8, dst_stride: isize, src: *const u8, src_stride: isize, w: i32, h: i32, mx: i32, my: i32 ); fn [<$func_name _avx512icl>]( dst: *mut u8, dst_stride: isize, src: *const u8, src_stride: isize, w: i32, h: i32, mx: i32, my: i32 ); )* } static PUT_FNS_SSSE3: [Option; 16] = { let mut out: [Option; 16] = [None; 16]; $( out[get_2d_mode_idx($mode_x, $mode_y)] = Some([<$func_name _ssse3>]); )* out }; static PUT_FNS_AVX2: [Option; 16] = { let mut out: [Option; 16] = [None; 16]; $( out[get_2d_mode_idx($mode_x, $mode_y)] = Some([<$func_name _avx2>]); )* out }; static PUT_FNS_AVX512ICL: [Option; 16] = { let mut out: [Option; 16] = [None; 16]; $( out[get_2d_mode_idx($mode_x, $mode_y)] = Some([<$func_name _avx512icl>]); )* out }; } } } decl_mc_fns!( (REGULAR, REGULAR, avsc_put_8tap_regular_8bpc), (REGULAR, SMOOTH, avsc_put_8tap_regular_smooth_8bpc), (REGULAR, SHARP, avsc_put_8tap_regular_sharp_8bpc), (SMOOTH, REGULAR, avsc_put_8tap_smooth_regular_8bpc), (SMOOTH, SMOOTH, avsc_put_8tap_smooth_8bpc), (SMOOTH, SHARP, avsc_put_8tap_smooth_sharp_8bpc), (SHARP, REGULAR, avsc_put_8tap_sharp_regular_8bpc), (SHARP, SMOOTH, avsc_put_8tap_sharp_smooth_8bpc), (SHARP, SHARP, avsc_put_8tap_sharp_8bpc), (BILINEAR, BILINEAR, avsc_put_bilin_8bpc) ); cpu_function_lookup_table!( PUT_FNS: [[Option; 16]], default: [None; 16], [SSSE3, AVX2, AVX512ICL] ); macro_rules! decl_mc_hbd_fns { ($(($mode_x:expr, $mode_y:expr, $func_name:ident)),+) => { pastey::item! { extern "C" { $( fn [<$func_name _ssse3>]( dst: *mut u16, dst_stride: isize, src: *const u16, src_stride: isize, w: i32, h: i32, mx: i32, my: i32, bitdepth_max: i32, ); fn [<$func_name _avx2>]( dst: *mut u16, dst_stride: isize, src: *const u16, src_stride: isize, w: i32, h: i32, mx: i32, my: i32, bitdepth_max: i32, ); )* } static PUT_HBD_FNS_SSSE3: [Option; 16] = { let mut out: [Option; 16] = [None; 16]; $( out[get_2d_mode_idx($mode_x, $mode_y)] = Some([<$func_name _ssse3>]); )* out }; static PUT_HBD_FNS_AVX2: [Option; 16] = { let mut out: [Option; 16] = [None; 16]; $( out[get_2d_mode_idx($mode_x, $mode_y)] = Some([<$func_name _avx2>]); )* out }; } } } decl_mc_hbd_fns!( (REGULAR, REGULAR, avsc_put_8tap_regular_16bpc), (REGULAR, SMOOTH, avsc_put_8tap_regular_smooth_16bpc), (REGULAR, SHARP, avsc_put_8tap_regular_sharp_16bpc), (SMOOTH, REGULAR, avsc_put_8tap_smooth_regular_16bpc), (SMOOTH, SMOOTH, avsc_put_8tap_smooth_16bpc), (SMOOTH, SHARP, avsc_put_8tap_smooth_sharp_16bpc), (SHARP, REGULAR, avsc_put_8tap_sharp_regular_16bpc), (SHARP, SMOOTH, avsc_put_8tap_sharp_smooth_16bpc), (SHARP, SHARP, avsc_put_8tap_sharp_16bpc), (BILINEAR, BILINEAR, avsc_put_bilin_16bpc) ); cpu_function_lookup_table!( PUT_HBD_FNS: [[Option; 16]], default: [None; 16], [SSSE3, AVX2] ); av-scenechange-0.14.1/src/data/mc.rs000064400000000000000000000250441046102023000152300ustar 00000000000000#[cfg(asm_neon)] mod simd_neon; #[cfg(asm_x86_64)] mod simd_x86; use v_frame::{pixel::Pixel, plane::PlaneSlice}; #[cfg(not(any(asm_x86_64, asm_neon)))] use self::rust::*; #[cfg(asm_neon)] use self::simd_neon::*; #[cfg(asm_x86_64)] use self::simd_x86::*; use crate::{cpu::CpuFeatureLevel, data::plane::PlaneRegionMut}; #[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd)] #[allow(clippy::upper_case_acronyms)] #[allow(dead_code)] pub enum FilterMode { REGULAR = 0, SMOOTH = 1, SHARP = 2, BILINEAR = 3, SWITCHABLE = 4, } pub const 
SUBPEL_FILTER_SIZE: usize = 8; const SUBPEL_FILTERS: [[[i32; SUBPEL_FILTER_SIZE]; 16]; 6] = [ [ [0, 0, 0, 128, 0, 0, 0, 0], [0, 2, -6, 126, 8, -2, 0, 0], [0, 2, -10, 122, 18, -4, 0, 0], [0, 2, -12, 116, 28, -8, 2, 0], [0, 2, -14, 110, 38, -10, 2, 0], [0, 2, -14, 102, 48, -12, 2, 0], [0, 2, -16, 94, 58, -12, 2, 0], [0, 2, -14, 84, 66, -12, 2, 0], [0, 2, -14, 76, 76, -14, 2, 0], [0, 2, -12, 66, 84, -14, 2, 0], [0, 2, -12, 58, 94, -16, 2, 0], [0, 2, -12, 48, 102, -14, 2, 0], [0, 2, -10, 38, 110, -14, 2, 0], [0, 2, -8, 28, 116, -12, 2, 0], [0, 0, -4, 18, 122, -10, 2, 0], [0, 0, -2, 8, 126, -6, 2, 0], ], [ [0, 0, 0, 128, 0, 0, 0, 0], [0, 2, 28, 62, 34, 2, 0, 0], [0, 0, 26, 62, 36, 4, 0, 0], [0, 0, 22, 62, 40, 4, 0, 0], [0, 0, 20, 60, 42, 6, 0, 0], [0, 0, 18, 58, 44, 8, 0, 0], [0, 0, 16, 56, 46, 10, 0, 0], [0, -2, 16, 54, 48, 12, 0, 0], [0, -2, 14, 52, 52, 14, -2, 0], [0, 0, 12, 48, 54, 16, -2, 0], [0, 0, 10, 46, 56, 16, 0, 0], [0, 0, 8, 44, 58, 18, 0, 0], [0, 0, 6, 42, 60, 20, 0, 0], [0, 0, 4, 40, 62, 22, 0, 0], [0, 0, 4, 36, 62, 26, 0, 0], [0, 0, 2, 34, 62, 28, 2, 0], ], [ [0, 0, 0, 128, 0, 0, 0, 0], [-2, 2, -6, 126, 8, -2, 2, 0], [-2, 6, -12, 124, 16, -6, 4, -2], [-2, 8, -18, 120, 26, -10, 6, -2], [-4, 10, -22, 116, 38, -14, 6, -2], [-4, 10, -22, 108, 48, -18, 8, -2], [-4, 10, -24, 100, 60, -20, 8, -2], [-4, 10, -24, 90, 70, -22, 10, -2], [-4, 12, -24, 80, 80, -24, 12, -4], [-2, 10, -22, 70, 90, -24, 10, -4], [-2, 8, -20, 60, 100, -24, 10, -4], [-2, 8, -18, 48, 108, -22, 10, -4], [-2, 6, -14, 38, 116, -22, 10, -4], [-2, 6, -10, 26, 120, -18, 8, -2], [-2, 4, -6, 16, 124, -12, 6, -2], [0, 2, -2, 8, 126, -6, 2, -2], ], [ [0, 0, 0, 128, 0, 0, 0, 0], [0, 0, 0, 120, 8, 0, 0, 0], [0, 0, 0, 112, 16, 0, 0, 0], [0, 0, 0, 104, 24, 0, 0, 0], [0, 0, 0, 96, 32, 0, 0, 0], [0, 0, 0, 88, 40, 0, 0, 0], [0, 0, 0, 80, 48, 0, 0, 0], [0, 0, 0, 72, 56, 0, 0, 0], [0, 0, 0, 64, 64, 0, 0, 0], [0, 0, 0, 56, 72, 0, 0, 0], [0, 0, 0, 48, 80, 0, 0, 0], [0, 0, 0, 40, 88, 0, 0, 0], [0, 0, 0, 32, 96, 0, 0, 0], [0, 0, 0, 24, 104, 0, 0, 0], [0, 0, 0, 16, 112, 0, 0, 0], [0, 0, 0, 8, 120, 0, 0, 0], ], [ [0, 0, 0, 128, 0, 0, 0, 0], [0, 0, -4, 126, 8, -2, 0, 0], [0, 0, -8, 122, 18, -4, 0, 0], [0, 0, -10, 116, 28, -6, 0, 0], [0, 0, -12, 110, 38, -8, 0, 0], [0, 0, -12, 102, 48, -10, 0, 0], [0, 0, -14, 94, 58, -10, 0, 0], [0, 0, -12, 84, 66, -10, 0, 0], [0, 0, -12, 76, 76, -12, 0, 0], [0, 0, -10, 66, 84, -12, 0, 0], [0, 0, -10, 58, 94, -14, 0, 0], [0, 0, -10, 48, 102, -12, 0, 0], [0, 0, -8, 38, 110, -12, 0, 0], [0, 0, -6, 28, 116, -10, 0, 0], [0, 0, -4, 18, 122, -8, 0, 0], [0, 0, -2, 8, 126, -4, 0, 0], ], [ [0, 0, 0, 128, 0, 0, 0, 0], [0, 0, 30, 62, 34, 2, 0, 0], [0, 0, 26, 62, 36, 4, 0, 0], [0, 0, 22, 62, 40, 4, 0, 0], [0, 0, 20, 60, 42, 6, 0, 0], [0, 0, 18, 58, 44, 8, 0, 0], [0, 0, 16, 56, 46, 10, 0, 0], [0, 0, 14, 54, 48, 12, 0, 0], [0, 0, 12, 52, 52, 12, 0, 0], [0, 0, 12, 48, 54, 14, 0, 0], [0, 0, 10, 46, 56, 16, 0, 0], [0, 0, 8, 44, 58, 18, 0, 0], [0, 0, 6, 42, 60, 20, 0, 0], [0, 0, 4, 40, 62, 22, 0, 0], [0, 0, 4, 36, 62, 26, 0, 0], [0, 0, 2, 34, 62, 30, 0, 0], ], ]; mod rust { use num_traits::AsPrimitive; use v_frame::{math::round_shift, pixel::Pixel, plane::PlaneSlice}; use crate::{ cpu::CpuFeatureLevel, data::{ mc::{FilterMode, SUBPEL_FILTERS, SUBPEL_FILTER_SIZE}, plane::PlaneRegionMut, }, }; #[cfg_attr( all(asm_x86_64, any(target_feature = "ssse3", target_feature = "avx2")), cold )] #[cfg_attr(asm_neon, cold)] #[allow(clippy::too_many_arguments)] pub fn put_8tap_internal( dst: &mut PlaneRegionMut<'_, T>, src: PlaneSlice<'_, 
T>, width: usize, height: usize, col_frac: i32, row_frac: i32, bit_depth: usize, _cpu: CpuFeatureLevel, ) { // The assembly only supports even heights and valid uncropped widths assert_eq!(height & 1, 0); assert!(width.is_power_of_two() && (2..=128).contains(&width)); let ref_stride = src.plane.cfg.stride; let y_filter = get_filter(row_frac, height); let x_filter = get_filter(col_frac, width); let max_sample_val = (1 << bit_depth) - 1; let intermediate_bits = 4 - if bit_depth == 12 { 2 } else { 0 }; match (col_frac, row_frac) { (0, 0) => { for r in 0..height { let src_slice = &src[r]; let dst_slice = &mut dst[r]; dst_slice[..width].copy_from_slice(&src_slice[..width]); } } (0, _) => { let offset_slice = src.go_up(3); for r in 0..height { let src_slice = &offset_slice[r]; let dst_slice = &mut dst[r]; for c in 0..width { dst_slice[c] = T::cast_from( round_shift( // SAFETY: We pass this a raw pointer, but it's created from a // checked slice, so we are safe. unsafe { run_filter(src_slice[c..].as_ptr(), ref_stride, y_filter) }, 7, ) .clamp(0, max_sample_val), ); } } } (_, 0) => { let offset_slice = src.go_left(3); for r in 0..height { let src_slice = &offset_slice[r]; let dst_slice = &mut dst[r]; for c in 0..width { dst_slice[c] = T::cast_from( round_shift( round_shift( // SAFETY: We pass this a raw pointer, but it's created from a // checked slice, so we are safe. unsafe { run_filter(src_slice[c..].as_ptr(), 1, x_filter) }, 7 - intermediate_bits, ), intermediate_bits, ) .clamp(0, max_sample_val), ); } } } (_, _) => { let mut intermediate: [i16; 8 * (128 + 7)] = [0; 8 * (128 + 7)]; let offset_slice = src.go_left(3).go_up(3); for cg in (0..width).step_by(8) { for r in 0..height + 7 { let src_slice = &offset_slice[r]; for c in cg..(cg + 8).min(width) { intermediate[8 * r + (c - cg)] = round_shift( // SAFETY: We pass this a raw pointer, but it's created from a // checked slice, so we are safe. unsafe { run_filter(src_slice[c..].as_ptr(), 1, x_filter) }, 7 - intermediate_bits, ) as i16; } } for r in 0..height { let dst_slice = &mut dst[r]; for c in cg..(cg + 8).min(width) { dst_slice[c] = T::cast_from( round_shift( // SAFETY: We pass this a raw pointer, but it's created from a // checked slice, so we are safe. 
unsafe { run_filter( intermediate[8 * r + c - cg..].as_ptr(), 8, y_filter, ) }, 7 + intermediate_bits, ) .clamp(0, max_sample_val), ); } } } } } } fn get_filter(frac: i32, length: usize) -> [i32; SUBPEL_FILTER_SIZE] { const MODE: FilterMode = FilterMode::REGULAR; let filter_idx = if MODE == FilterMode::BILINEAR || length > 4 { MODE as usize } else { (MODE as usize).min(1) + 4 }; SUBPEL_FILTERS[filter_idx][frac as usize] } unsafe fn run_filter>( src: *const T, stride: usize, filter: [i32; 8], ) -> i32 { filter .iter() .enumerate() .map(|(i, f)| { let p = src.add(i * stride); f * (*p).as_() }) .sum::() } } #[allow(clippy::too_many_arguments)] pub fn put_8tap( dst: &mut PlaneRegionMut<'_, T>, src: PlaneSlice<'_, T>, width: usize, height: usize, col_frac: i32, row_frac: i32, bit_depth: usize, cpu: CpuFeatureLevel, ) { put_8tap_internal(dst, src, width, height, col_frac, row_frac, bit_depth, cpu); } av-scenechange-0.14.1/src/data/mod.rs000064400000000000000000000007331046102023000154060ustar 00000000000000use std::mem::MaybeUninit; pub(crate) mod block; pub(crate) mod frame; pub(crate) mod hadamard; pub(crate) mod mc; pub(crate) mod motion; pub(crate) mod plane; pub(crate) mod prediction; pub(crate) mod sad; pub(crate) mod satd; pub(crate) mod superblock; pub(crate) mod tile; /// Assume all the elements are initialized. pub unsafe fn slice_assume_init_mut(slice: &'_ mut [MaybeUninit]) -> &'_ mut [T] { &mut *(slice as *mut [MaybeUninit] as *mut [T]) } av-scenechange-0.14.1/src/data/motion.rs000064400000000000000000000173451046102023000161430ustar 00000000000000use std::{ marker::PhantomData, ops, ops::{Index, IndexMut}, slice, sync::{Arc, RwLock, RwLockReadGuard, RwLockWriteGuard}, }; use arrayvec::ArrayVec; use v_frame::{frame::Frame, pixel::Pixel, plane::Plane}; const MV_IN_USE_BITS: usize = 14; pub const MV_UPP: i32 = 1 << MV_IN_USE_BITS; pub const MV_LOW: i32 = -(1 << MV_IN_USE_BITS); #[derive(Clone, Copy, Debug, Default, PartialEq, Eq)] pub struct MotionVector { pub row: i16, pub col: i16, } impl MotionVector { pub const fn quantize_to_fullpel(self) -> Self { Self { row: (self.row / 8) * 8, col: (self.col / 8) * 8, } } } impl ops::Mul for MotionVector { type Output = MotionVector; fn mul(self, rhs: i16) -> MotionVector { MotionVector { row: self.row * rhs, col: self.col * rhs, } } } impl ops::Mul for MotionVector { type Output = MotionVector; fn mul(self, rhs: u16) -> MotionVector { MotionVector { row: self.row * rhs as i16, col: self.col * rhs as i16, } } } impl ops::Shr for MotionVector { type Output = MotionVector; fn shr(self, rhs: u8) -> MotionVector { MotionVector { row: self.row >> rhs, col: self.col >> rhs, } } } impl ops::Shl for MotionVector { type Output = MotionVector; fn shl(self, rhs: u8) -> MotionVector { MotionVector { row: self.row << rhs, col: self.col << rhs, } } } impl ops::Add for MotionVector { type Output = MotionVector; fn add(self, rhs: MotionVector) -> MotionVector { MotionVector { row: self.row + rhs.row, col: self.col + rhs.col, } } } #[derive(Debug, Copy, Clone, Default)] pub struct MEStats { pub mv: MotionVector, /// SAD value on the scale of a 128x128 block pub normalized_sad: u32, } #[derive(Debug, Clone)] pub struct FrameMEStats { stats: Box<[MEStats]>, pub cols: usize, pub rows: usize, } pub const REF_FRAMES_LOG2: usize = 3; pub const REF_FRAMES: usize = 1 << REF_FRAMES_LOG2; pub type RefMEStats = Arc>; pub type ReadGuardMEStats<'a> = RwLockReadGuard<'a, [FrameMEStats; REF_FRAMES]>; pub type WriteGuardMEStats<'a> = RwLockWriteGuard<'a, [FrameMEStats; 
REF_FRAMES]>; impl FrameMEStats { pub fn new(cols: usize, rows: usize) -> Self { Self { // dynamic allocation: once per frame stats: vec![MEStats::default(); cols * rows].into_boxed_slice(), cols, rows, } } pub fn new_arc_array(cols: usize, rows: usize) -> RefMEStats { Arc::new(RwLock::new([ FrameMEStats::new(cols, rows), FrameMEStats::new(cols, rows), FrameMEStats::new(cols, rows), FrameMEStats::new(cols, rows), FrameMEStats::new(cols, rows), FrameMEStats::new(cols, rows), FrameMEStats::new(cols, rows), FrameMEStats::new(cols, rows), ])) } } impl Index for FrameMEStats { type Output = [MEStats]; fn index(&self, index: usize) -> &Self::Output { &self.stats[index * self.cols..(index + 1) * self.cols] } } impl IndexMut for FrameMEStats { fn index_mut(&mut self, index: usize) -> &mut Self::Output { &mut self.stats[index * self.cols..(index + 1) * self.cols] } } /// Tiled view of `FrameMEStats` #[derive(Debug)] pub struct TileMEStats<'a> { data: *const MEStats, // expressed in mi blocks // private to guarantee borrowing rules x: usize, y: usize, cols: usize, rows: usize, /// number of cols in the underlying `FrameMEStats` stride: usize, phantom: PhantomData<&'a MotionVector>, } /// Mutable tiled view of `FrameMEStats` #[derive(Debug)] pub struct TileMEStatsMut<'a> { data: *mut MEStats, // expressed in mi blocks // private to guarantee borrowing rules x: usize, y: usize, cols: usize, rows: usize, /// number of cols in the underlying `FrameMEStats` stride: usize, phantom: PhantomData<&'a mut MotionVector>, } // common impl for TileMotionVectors and TileMotionVectorsMut macro_rules! tile_me_stats_common { // $name: TileMEStats or TileMEStatsMut // $opt_mut: nothing or mut ($name:ident $(,$opt_mut:tt)?) => { impl<'a> $name<'a> { /// # Panics /// /// - If the requested dimensions are larger than the frame MV size #[allow(dead_code)] pub fn new( frame_mvs: &'a $($opt_mut)? FrameMEStats, x: usize, y: usize, cols: usize, rows: usize, ) -> Self { assert!(x + cols <= frame_mvs.cols); assert!(y + rows <= frame_mvs.rows); Self { data: & $($opt_mut)? frame_mvs[y][x], x, y, cols, rows, stride: frame_mvs.cols, phantom: PhantomData, } } #[allow(dead_code)] pub const fn x(&self) -> usize { self.x } #[allow(dead_code)] pub const fn y(&self) -> usize { self.y } #[allow(dead_code)] pub const fn cols(&self) -> usize { self.cols } #[allow(dead_code)] pub const fn rows(&self) -> usize { self.rows } } unsafe impl Send for $name<'_> {} unsafe impl Sync for $name<'_> {} impl Index for $name<'_> { type Output = [MEStats]; fn index(&self, index: usize) -> &Self::Output { assert!(index < self.rows); // SAFETY: The above assert ensures we do not access OOB data. unsafe { let ptr = self.data.add(index * self.stride); slice::from_raw_parts(ptr, self.cols) } } } } } tile_me_stats_common!(TileMEStats); tile_me_stats_common!(TileMEStatsMut, mut); impl TileMEStatsMut<'_> { pub const fn as_const(&self) -> TileMEStats<'_> { TileMEStats { data: self.data, x: self.x, y: self.y, cols: self.cols, rows: self.rows, stride: self.stride, phantom: PhantomData, } } } impl IndexMut for TileMEStatsMut<'_> { fn index_mut(&mut self, index: usize) -> &mut Self::Output { assert!(index < self.rows); // SAFETY: The above assert ensures we do not access OOB data. 
unsafe { let ptr = self.data.add(index * self.stride); slice::from_raw_parts_mut(ptr, self.cols) } } } #[derive(Debug, Copy, Clone, Eq, PartialEq)] #[allow(clippy::upper_case_acronyms)] pub enum MVSamplingMode { INIT, CORNER { right: bool, bottom: bool }, } #[derive(Debug, Clone)] pub struct ReferenceFrame { pub frame: Arc>, pub input_hres: Arc>, pub input_qres: Arc>, pub frame_me_stats: RefMEStats, } #[derive(Debug, Clone, Default)] pub struct ReferenceFramesSet { pub frames: [Option>>; REF_FRAMES], } impl ReferenceFramesSet { pub fn new() -> Self { Self { frames: Default::default(), // deblock: Default::default() } } } pub struct MotionEstimationSubsets { pub min_sad: u32, pub median: Option, pub subset_b: ArrayVec, pub subset_c: ArrayVec, } impl MotionEstimationSubsets { pub fn all_mvs(&self) -> ArrayVec { let mut all = ArrayVec::new(); if let Some(median) = self.median { all.push(median); } all.extend(self.subset_b.iter().copied()); all.extend(self.subset_c.iter().copied()); all } } av-scenechange-0.14.1/src/data/plane.rs000064400000000000000000000413661046102023000157350ustar 00000000000000use std::{ iter::FusedIterator, marker::PhantomData, ops::{Index, IndexMut}, slice, }; use v_frame::{ pixel::Pixel, plane::{Plane, PlaneConfig, PlaneOffset}, }; use super::block::{BlockOffset, BLOCK_TO_PLANE_SHIFT}; /// Bounded region of a plane /// /// This allows giving access to a rectangular area of a plane without /// giving access to the whole plane. #[derive(Debug)] pub struct PlaneRegion<'a, T: Pixel> { data: *const T, // points to (plane_cfg.x, plane_cfg.y) pub plane_cfg: &'a PlaneConfig, // private to guarantee borrowing rules rect: Rect, phantom: PhantomData<&'a T>, } /// Mutable bounded region of a plane /// /// This allows to give mutable access to a rectangular area of the plane /// without giving access to the whole plane. #[derive(Debug)] pub struct PlaneRegionMut<'a, T: Pixel> { data: *mut T, // points to (plane_cfg.x, plane_cfg.y) pub plane_cfg: &'a PlaneConfig, rect: Rect, phantom: PhantomData<&'a mut T>, } // common impl for PlaneRegion and PlaneRegionMut macro_rules! plane_region_common { // $name: PlaneRegion or PlaneRegionMut // $as_ptr: as_ptr or as_mut_ptr // $opt_mut: nothing or mut ($name:ident, $as_ptr:ident $(,$opt_mut:tt)?) => { impl<'a, T: Pixel> $name<'a, T> { #[cold] pub fn empty(plane_cfg : &'a PlaneConfig) -> Self { return Self { // SAFETY: This is actually pretty unsafe. // This means we need to ensure that no other method on this struct // can access data if the dimensions are 0. data: std::ptr::null_mut::(), plane_cfg, rect: Rect::default(), phantom: PhantomData, } } /// # Panics /// /// - If the configured dimensions are invalid pub fn from_slice(data: &'a $($opt_mut)? [T], cfg: &'a PlaneConfig, rect: Rect) -> Self { if cfg.width == 0 || cfg.height == 0 { return Self::empty(&cfg); } assert!(rect.x >= -(cfg.xorigin as isize)); assert!(rect.y >= -(cfg.yorigin as isize)); assert!(cfg.xorigin as isize + rect.x + rect.width as isize <= cfg.stride as isize); assert!(cfg.yorigin as isize + rect.y + rect.height as isize <= cfg.alloc_height as isize); // SAFETY: The above asserts ensure we do not go OOB. unsafe { Self::from_slice_unsafe(data, cfg, rect)} } unsafe fn from_slice_unsafe(data: &'a $($opt_mut)? 
[T], cfg: &'a PlaneConfig, rect: Rect) -> Self { let origin = (cfg.yorigin as isize + rect.y) * cfg.stride as isize + cfg.xorigin as isize + rect.x; Self { data: data.$as_ptr().offset(origin), plane_cfg: cfg, rect, phantom: PhantomData, } } pub fn new(plane: &'a $($opt_mut)? Plane, rect: Rect) -> Self { Self::from_slice(& $($opt_mut)? plane.data, &plane.cfg, rect) } #[allow(dead_code)] pub fn new_from_plane(plane: &'a $($opt_mut)? Plane) -> Self { let rect = Rect { x: 0, y: 0, width: plane.cfg.stride - plane.cfg.xorigin, height: plane.cfg.alloc_height - plane.cfg.yorigin, }; // SAFETY: Area::StartingAt{}.to_rect is guaranteed to be the entire plane unsafe { Self::from_slice_unsafe(& $($opt_mut)? plane.data, &plane.cfg, rect) } } #[allow(dead_code)] pub fn data_ptr(&self) -> *const T { self.data } pub fn rect(&self) -> &Rect { &self.rect } #[allow(dead_code)] pub fn rows_iter(&self) -> PlaneRegionRowsIter<'_, T> { PlaneRegionRowsIter { data: self.data, stride: self.plane_cfg.stride, width: self.rect.width, remaining: self.rect.height, phantom: PhantomData, } } #[allow(dead_code)] pub fn vert_windows(&self, h: usize) -> VertWindows<'_, T> { VertWindows { data: self.data, plane_cfg: self.plane_cfg, remaining: (self.rect.height as isize - h as isize + 1).max(0) as usize, output_rect: Rect { x: self.rect.x, y: self.rect.y, width: self.rect.width, height: h } } } #[allow(dead_code)] pub fn horz_windows(&self, w: usize) -> HorzWindows<'_, T> { HorzWindows { data: self.data, plane_cfg: self.plane_cfg, remaining: (self.rect.width as isize - w as isize + 1).max(0) as usize, output_rect: Rect { x: self.rect.x, y: self.rect.y, width: w, height: self.rect.height } } } /// Return a view to a subregion of the plane /// /// The subregion must be included in (i.e. must not exceed) this region. /// /// It is described by an `Area`, relative to this region. /// /// # Panics /// /// - If the requested dimensions are larger than the plane region size /// /// # Example /// /// ``` ignore /// # use rav1e::tiling::*; /// # fn f(region: &PlaneRegion<'_, u16>) { /// // a subregion from (10, 8) to the end of the region /// let subregion = region.subregion(Area::StartingAt { x: 10, y: 8 }); /// # } /// ``` /// /// ``` ignore /// # use rav1e::context::*; /// # use rav1e::tiling::*; /// # fn f(region: &PlaneRegion<'_, u16>) { /// // a subregion from the top-left of block (2, 3) having size (64, 64) /// let bo = BlockOffset { x: 2, y: 3 }; /// let subregion = region.subregion(Area::BlockRect { bo, width: 64, height: 64 }); /// # } /// ``` #[allow(dead_code)] pub fn subregion(&self, area: Area) -> PlaneRegion<'_, T> { if self.data.is_null() { return PlaneRegion::empty(&self.plane_cfg); } let rect = area.to_rect( self.plane_cfg.xdec, self.plane_cfg.ydec, self.rect.width, self.rect.height, ); assert!(rect.x >= 0 && rect.x as usize <= self.rect.width); assert!(rect.y >= 0 && rect.y as usize <= self.rect.height); // SAFETY: The above asserts ensure we do not go outside the original rectangle. 
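// (Added note: the subregion's data pointer is offset from this region's by
// `rect.y` rows of `plane_cfg.stride` pixels plus `rect.x` columns.)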
let data = unsafe { self.data.add(rect.y as usize * self.plane_cfg.stride + rect.x as usize) }; let absolute_rect = Rect { x: self.rect.x + rect.x, y: self.rect.y + rect.y, width: rect.width, height: rect.height, }; PlaneRegion { data, plane_cfg: &self.plane_cfg, rect: absolute_rect, phantom: PhantomData, } } } unsafe impl Send for $name<'_, T> {} unsafe impl Sync for $name<'_, T> {} impl Index for $name<'_, T> { type Output = [T]; fn index(&self, index: usize) -> &Self::Output { assert!(index < self.rect.height); // SAFETY: The above assert ensures we do not access OOB data. unsafe { let ptr = self.data.add(index * self.plane_cfg.stride); slice::from_raw_parts(ptr, self.rect.width) } } } } } plane_region_common!(PlaneRegion, as_ptr); plane_region_common!(PlaneRegionMut, as_mut_ptr, mut); impl PlaneRegionMut<'_, T> { pub fn data_ptr_mut(&mut self) -> *mut T { self.data } pub fn rows_iter_mut(&mut self) -> PlaneRegionRowsIterMut<'_, T> { PlaneRegionRowsIterMut { data: self.data, stride: self.plane_cfg.stride, width: self.rect.width, remaining: self.rect.height, phantom: PhantomData, } } pub fn as_const(&self) -> PlaneRegion<'_, T> { PlaneRegion { data: self.data, plane_cfg: self.plane_cfg, rect: self.rect, phantom: PhantomData, } } } impl IndexMut for PlaneRegionMut<'_, T> { fn index_mut(&mut self, index: usize) -> &mut Self::Output { assert!(index < self.rect.height); // SAFETY: The above assert ensures we do not access OOB data. unsafe { let ptr = self.data.add(index * self.plane_cfg.stride); slice::from_raw_parts_mut(ptr, self.rect.width) } } } /// Iterator over plane region rows pub struct PlaneRegionRowsIter<'a, T: Pixel> { data: *const T, stride: usize, width: usize, remaining: usize, phantom: PhantomData<&'a T>, } impl<'a, T: Pixel> Iterator for PlaneRegionRowsIter<'a, T> { type Item = &'a [T]; fn next(&mut self) -> Option { if self.remaining > 0 { // SAFETY: We verified that we have enough data left to not go OOB, // assuming that `self.stride` and `self.width` are set correctly. let row = unsafe { let ptr = self.data; self.data = self.data.add(self.stride); slice::from_raw_parts(ptr, self.width) }; self.remaining -= 1; Some(row) } else { None } } fn size_hint(&self) -> (usize, Option) { (self.remaining, Some(self.remaining)) } } /// Mutable iterator over plane region rows pub struct PlaneRegionRowsIterMut<'a, T: Pixel> { data: *mut T, stride: usize, width: usize, remaining: usize, phantom: PhantomData<&'a mut T>, } impl<'a, T: Pixel> Iterator for PlaneRegionRowsIterMut<'a, T> { type Item = &'a mut [T]; fn next(&mut self) -> Option { if self.remaining > 0 { // SAFETY: We verified that we have enough data left to not go OOB, // assuming that `self.stride` and `self.width` are set correctly. 
let row = unsafe { let ptr = self.data; self.data = self.data.add(self.stride); slice::from_raw_parts_mut(ptr, self.width) }; self.remaining -= 1; Some(row) } else { None } } fn size_hint(&self) -> (usize, Option) { (self.remaining, Some(self.remaining)) } } impl ExactSizeIterator for PlaneRegionRowsIter<'_, T> { } impl FusedIterator for PlaneRegionRowsIter<'_, T> { } impl ExactSizeIterator for PlaneRegionRowsIterMut<'_, T> { } impl FusedIterator for PlaneRegionRowsIterMut<'_, T> { } pub struct VertWindows<'a, T: Pixel> { data: *const T, plane_cfg: &'a PlaneConfig, remaining: usize, output_rect: Rect, } pub struct HorzWindows<'a, T: Pixel> { data: *const T, plane_cfg: &'a PlaneConfig, remaining: usize, output_rect: Rect, } impl<'a, T: Pixel> Iterator for VertWindows<'a, T> { type Item = PlaneRegion<'a, T>; fn next(&mut self) -> Option { self.nth(0) } fn size_hint(&self) -> (usize, Option) { (self.remaining, Some(self.remaining)) } fn nth(&mut self, n: usize) -> Option { if self.remaining > n { // SAFETY: We verified that we have enough data left to not go OOB. self.data = unsafe { self.data.add(self.plane_cfg.stride * n) }; self.output_rect.y += n as isize; let output = PlaneRegion { data: self.data, plane_cfg: self.plane_cfg, rect: self.output_rect, phantom: PhantomData, }; // SAFETY: We verified that we have enough data left to not go OOB. self.data = unsafe { self.data.add(self.plane_cfg.stride) }; self.output_rect.y += 1; self.remaining -= n + 1; Some(output) } else { None } } } impl<'a, T: Pixel> Iterator for HorzWindows<'a, T> { type Item = PlaneRegion<'a, T>; fn next(&mut self) -> Option { self.nth(0) } fn size_hint(&self) -> (usize, Option) { (self.remaining, Some(self.remaining)) } fn nth(&mut self, n: usize) -> Option { if self.remaining > n { // SAFETY: We verified that we have enough data left to not go OOB. self.data = unsafe { self.data.add(n) }; self.output_rect.x += n as isize; let output = PlaneRegion { data: self.data, plane_cfg: self.plane_cfg, rect: self.output_rect, phantom: PhantomData, }; // SAFETY: We verified that we have enough data left to not go OOB. self.data = unsafe { self.data.add(1) }; self.output_rect.x += 1; self.remaining -= n + 1; Some(output) } else { None } } } impl ExactSizeIterator for VertWindows<'_, T> { } impl FusedIterator for VertWindows<'_, T> { } impl ExactSizeIterator for HorzWindows<'_, T> { } impl FusedIterator for HorzWindows<'_, T> { } /// Rectangle of a plane region, in pixels #[derive(Debug, Clone, Copy, Default, PartialEq, Eq)] pub struct Rect { // coordinates relative to the plane origin (xorigin, yorigin) pub x: isize, pub y: isize, pub width: usize, pub height: usize, } // Structure to describe a rectangle area in several ways // // To retrieve a subregion from a region, we need to provide the subregion // bounds, relative to its parent region. The subregion must always be included // in its parent region. // // For that purpose, we could just use a rectangle (x, y, width, height), but // this would be too cumbersome to use in practice. For example, we often need // to pass a subregion from an offset, using the same bottom-right corner as // its parent, or to pass a subregion expressed in block offset instead of // pixel offset. // // Area provides a flexible way to describe a subregion. 
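/// # Example
///
/// A minimal sketch of how the variants relate (assuming a 64x64 parent
/// region and no chroma decimation): `StartingAt` is shorthand for a `Rect`
/// that runs to the parent's bottom-right corner.
///
/// ``` ignore
/// # use crate::data::plane::{Area, Rect};
/// let a = Area::StartingAt { x: 16, y: 8 };
/// let b = Area::Rect(Rect { x: 16, y: 8, width: 48, height: 56 });
/// assert_eq!(a.to_rect(0, 0, 64, 64), b.to_rect(0, 0, 64, 64));
/// ```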
#[derive(Debug, Clone, Copy)] pub enum Area { /// A well-defined rectangle Rect(Rect), /// A rectangle starting at offset (x, y) and ending at the bottom-right /// corner of the parent StartingAt { x: isize, y: isize }, /// a rectangle starting at given block offset until the bottom-right corner /// of the parent BlockStartingAt { bo: BlockOffset }, } impl Area { /// Convert to a rectangle of pixels. /// For a `BlockRect` and `BlockStartingAt`, for subsampled chroma planes, /// the returned rect will be aligned to a 4x4 chroma block. /// This is necessary for `compute_distortion` and `rdo_cfl_alpha` as /// the subsampled chroma block covers multiple luma blocks. pub const fn to_rect( self, xdec: usize, ydec: usize, parent_width: usize, parent_height: usize, ) -> Rect { match self { Area::Rect(rect) => rect, Area::StartingAt { x, y } => Rect { x, y, width: (parent_width as isize - x) as usize, height: (parent_height as isize - y) as usize, }, Area::BlockStartingAt { bo } => { let x = (bo.x >> xdec << BLOCK_TO_PLANE_SHIFT) as isize; let y = (bo.y >> ydec << BLOCK_TO_PLANE_SHIFT) as isize; Rect { x, y, width: (parent_width as isize - x) as usize, height: (parent_height as isize - y) as usize, } } } } } pub trait AsRegion { fn as_region(&self) -> PlaneRegion<'_, T>; fn region(&self, area: Area) -> PlaneRegion<'_, T>; fn region_mut(&mut self, area: Area) -> PlaneRegionMut<'_, T>; } impl AsRegion for Plane { fn as_region(&self) -> PlaneRegion<'_, T> { PlaneRegion::new_from_plane(self) } fn region(&self, area: Area) -> PlaneRegion<'_, T> { let rect = area.to_rect( self.cfg.xdec, self.cfg.ydec, self.cfg.stride - self.cfg.xorigin, self.cfg.alloc_height - self.cfg.yorigin, ); PlaneRegion::new(self, rect) } fn region_mut(&mut self, area: Area) -> PlaneRegionMut<'_, T> { let rect = area.to_rect( self.cfg.xdec, self.cfg.ydec, self.cfg.stride - self.cfg.xorigin, self.cfg.alloc_height - self.cfg.yorigin, ); PlaneRegionMut::new(self, rect) } } /// Absolute offset in blocks inside a plane, where a block is defined /// to be an `N*N` square where `N == (1 << BLOCK_TO_PLANE_SHIFT)`. #[derive(Clone, Copy, Debug, PartialEq, Eq)] pub struct PlaneBlockOffset(pub BlockOffset); impl PlaneBlockOffset { /// Convert to plane offset without decimation. pub const fn to_luma_plane_offset(self) -> PlaneOffset { self.0.to_luma_plane_offset() } } av-scenechange-0.14.1/src/data/prediction.rs000064400000000000000000000077221046102023000167740ustar 00000000000000use v_frame::{ pixel::Pixel, plane::{Plane, PlaneConfig, PlaneOffset, PlaneSlice}, }; use crate::{ cpu::CpuFeatureLevel, data::{ frame::{FrameInvariants, RefType}, mc::put_8tap, motion::MotionVector, plane::PlaneRegionMut, tile::TileRect, }, }; // There are more modes than in the spec because every allowed // drl index for NEAR modes is considered its own mode. 
#[allow(non_camel_case_types)] #[allow(clippy::upper_case_acronyms)] #[allow(dead_code)] #[derive(Copy, Clone, Debug, PartialEq, PartialOrd, Eq, Ord, Default)] pub enum PredictionMode { #[default] DC_PRED, // Average of above and left pixels V_PRED, // Vertical H_PRED, // Horizontal D45_PRED, // Directional 45 degree D135_PRED, // Directional 135 degree D113_PRED, // Directional 113 degree D157_PRED, // Directional 157 degree D203_PRED, // Directional 203 degree D67_PRED, // Directional 67 degree SMOOTH_PRED, // Combination of horizontal and vertical interpolation SMOOTH_V_PRED, SMOOTH_H_PRED, PAETH_PRED, UV_CFL_PRED, NEARESTMV, NEAR0MV, NEAR1MV, NEAR2MV, GLOBALMV, NEWMV, // Compound ref compound modes NEAREST_NEARESTMV, NEAR_NEAR0MV, NEAR_NEAR1MV, NEAR_NEAR2MV, NEAREST_NEWMV, NEW_NEARESTMV, NEAR_NEW0MV, NEAR_NEW1MV, NEAR_NEW2MV, NEW_NEAR0MV, NEW_NEAR1MV, NEW_NEAR2MV, GLOBAL_GLOBALMV, NEW_NEWMV, } impl PredictionMode { pub fn is_intra(self) -> bool { self < PredictionMode::NEARESTMV } /// Inter prediction with a single reference (i.e. not compound mode) /// /// # Panics /// /// - If called on an intra `PredictionMode` #[allow(clippy::too_many_arguments)] pub fn predict_inter_single( self, fi: &FrameInvariants, tile_rect: TileRect, p: usize, po: PlaneOffset, dst: &mut PlaneRegionMut<'_, T>, width: usize, height: usize, ref_frame: RefType, mv: MotionVector, bit_depth: usize, cpu_feature_level: CpuFeatureLevel, ) { assert!(!self.is_intra()); let frame_po = tile_rect.to_frame_plane_offset(po); if let Some(ref rec) = fi.rec_buffer.frames[fi.ref_frames[ref_frame.to_index()] as usize] { let (row_frac, col_frac, src) = PredictionMode::get_mv_params(&rec.frame.planes[p], frame_po, mv); put_8tap( dst, src, width, height, col_frac, row_frac, bit_depth, cpu_feature_level, ); } } // Used by inter prediction to extract the fractional component of a mv and // obtain the correct PlaneSlice to operate on. fn get_mv_params( rec_plane: &Plane, po: PlaneOffset, mv: MotionVector, ) -> (i32, i32, PlaneSlice) { let &PlaneConfig { xdec, ydec, .. } = &rec_plane.cfg; let row_offset = mv.row as i32 >> (3 + ydec); let col_offset = mv.col as i32 >> (3 + xdec); let row_frac = ((mv.row as i32) << (1 - ydec)) & 0xf; let col_frac = ((mv.col as i32) << (1 - xdec)) & 0xf; let qo = PlaneOffset { x: po.x + col_offset as isize - 3, y: po.y + row_offset as isize - 3, }; ( row_frac, col_frac, rec_plane.slice(qo).clamp().subslice(3, 3), ) } } #[derive(Copy, Clone, Debug)] #[allow(clippy::upper_case_acronyms)] pub enum PredictionVariant { NONE, LEFT, TOP, BOTH, } impl PredictionVariant { pub const fn new(x: usize, y: usize) -> Self { match (x, y) { (0, 0) => PredictionVariant::NONE, (_, 0) => PredictionVariant::LEFT, (0, _) => PredictionVariant::TOP, _ => PredictionVariant::BOTH, } } } av-scenechange-0.14.1/src/data/sad/simd_x86.rs000064400000000000000000000037031046102023000170370ustar 00000000000000use v_frame::{ pixel::{Pixel, PixelType}, plane::Plane, }; use crate::CpuFeatureLevel; macro_rules! 
decl_sad_plane_fn { ($($f:ident),+) => { extern "C" { $( fn $f( src: *const u8, dst: *const u8, stride: libc::size_t, width: libc::size_t, rows: libc::size_t ) -> u64; )* } }; } decl_sad_plane_fn!(avsc_sad_plane_8bpc_sse2, avsc_sad_plane_8bpc_avx2); pub(super) fn sad_plane_internal( src: &Plane, dst: &Plane, cpu: CpuFeatureLevel, ) -> u64 { use std::mem; assert_eq!(src.cfg.width, dst.cfg.width); assert_eq!(src.cfg.stride, dst.cfg.stride); assert_eq!(src.cfg.height, dst.cfg.height); assert!(src.cfg.width <= src.cfg.stride); match T::type_enum() { PixelType::U8 => { // helper macro to reduce boilerplate macro_rules! call_asm { ($func:ident, $src:expr, $dst:expr, $cpu:expr) => { // SAFETY: Calls Assembly code. unsafe { let result = $func( mem::transmute::<*const T, *const u8>(src.data_origin().as_ptr()), mem::transmute::<*const T, *const u8>(dst.data_origin().as_ptr()), src.cfg.stride, src.cfg.width, src.cfg.height, ); result } }; } if cpu >= CpuFeatureLevel::AVX2 { call_asm!(avsc_sad_plane_8bpc_avx2, src, dst, cpu) } else if cpu >= CpuFeatureLevel::SSE2 { call_asm!(avsc_sad_plane_8bpc_sse2, src, dst, cpu) } else { super::rust::sad_plane_internal(src, dst, cpu) } } PixelType::U16 => super::rust::sad_plane_internal(src, dst, cpu), } } av-scenechange-0.14.1/src/data/sad.rs000064400000000000000000000047601046102023000154020ustar 00000000000000#[cfg(asm_x86_64)] mod simd_x86; use rust::get_sad_internal; #[cfg(not(asm_x86_64))] use rust::*; #[cfg(asm_x86_64)] use simd_x86::*; use v_frame::{pixel::Pixel, plane::Plane}; use super::plane::PlaneRegion; use crate::cpu::CpuFeatureLevel; mod rust { use v_frame::{ pixel::{CastFromPrimitive, Pixel}, plane::Plane, }; use crate::{ data::plane::{Area, PlaneRegion, Rect}, CpuFeatureLevel, }; pub(super) fn sad_plane_internal( src: &Plane, dst: &Plane, _cpu: CpuFeatureLevel, ) -> u64 { assert_eq!(src.cfg.width, dst.cfg.width); assert_eq!(src.cfg.height, dst.cfg.height); src.rows_iter() .zip(dst.rows_iter()) .map(|(src, dst)| { src.iter() .zip(dst.iter()) .map(|(&p1, &p2)| i32::cast_from(p1).abs_diff(i32::cast_from(p2))) .sum::() as u64 }) .sum() } pub fn get_sad_internal( plane_org: &PlaneRegion<'_, T>, plane_ref: &PlaneRegion<'_, T>, w: usize, h: usize, _bit_depth: usize, _cpu: CpuFeatureLevel, ) -> u32 { debug_assert!(w <= 128 && h <= 128); let plane_org = plane_org.subregion(Area::Rect(Rect { x: 0, y: 0, width: w, height: h, })); let plane_ref = plane_ref.subregion(Area::Rect(Rect { x: 0, y: 0, width: w, height: h, })); plane_org .rows_iter() .zip(plane_ref.rows_iter()) .map(|(src, dst)| { src.iter() .zip(dst) .map(|(&p1, &p2)| i32::cast_from(p1).abs_diff(i32::cast_from(p2))) .sum::() }) .sum() } } /// Compute the sum of absolute differences (SADs) on 2 rows of pixels /// /// This differs from other SAD functions in that it operates over a row /// (or line) of unknown length rather than a `PlaneRegion`. 
pub(crate) fn sad_plane(src: &Plane, dst: &Plane, cpu: CpuFeatureLevel) -> u64 { sad_plane_internal(src, dst, cpu) } pub(crate) fn get_sad( plane_org: &PlaneRegion<'_, T>, plane_ref: &PlaneRegion<'_, T>, w: usize, h: usize, bit_depth: usize, cpu: CpuFeatureLevel, ) -> u32 { get_sad_internal(plane_org, plane_ref, w, h, bit_depth, cpu) } av-scenechange-0.14.1/src/data/satd/simd_neon.rs000064400000000000000000000156561046102023000175470ustar 00000000000000use v_frame::pixel::{Pixel, PixelType}; use super::{to_index, DIST_FNS_LENGTH}; use crate::{ cpu::CpuFeatureLevel, data::{block::BlockSize, plane::PlaneRegion}, }; type SatdFn = unsafe extern "C" fn( src: *const u8, src_stride: isize, dst: *const u8, dst_stride: isize, ) -> u32; type SatdHbdFn = unsafe extern "C" fn( src: *const u16, src_stride: isize, dst: *const u16, dst_stride: isize, ) -> u32; macro_rules! declare_asm_dist_fn { ($(($name: ident, $T: ident)),+) => ( $( extern "C" { fn $name ( src: *const $T, src_stride: isize, dst: *const $T, dst_stride: isize ) -> u32; } )+ ) } declare_asm_dist_fn![ // SATD (avsc_satd4x4_neon, u8), (avsc_satd4x8_neon, u8), (avsc_satd4x16_neon, u8), (avsc_satd8x4_neon, u8), (avsc_satd8x8_neon, u8), (avsc_satd8x16_neon, u8), (avsc_satd8x32_neon, u8), (avsc_satd16x4_neon, u8), (avsc_satd16x8_neon, u8), (avsc_satd16x16_neon, u8), (avsc_satd16x32_neon, u8), (avsc_satd16x64_neon, u8), (avsc_satd32x8_neon, u8), (avsc_satd32x16_neon, u8), (avsc_satd32x32_neon, u8), (avsc_satd32x64_neon, u8), (avsc_satd64x16_neon, u8), (avsc_satd64x32_neon, u8), (avsc_satd64x64_neon, u8), (avsc_satd64x128_neon, u8), (avsc_satd128x64_neon, u8), (avsc_satd128x128_neon, u8), // SATD HBD (avsc_satd4x4_hbd_neon, u16), (avsc_satd4x8_hbd_neon, u16), (avsc_satd4x16_hbd_neon, u16), (avsc_satd8x4_hbd_neon, u16), (avsc_satd8x8_hbd_neon, u16), (avsc_satd8x16_hbd_neon, u16), (avsc_satd8x32_hbd_neon, u16), (avsc_satd16x4_hbd_neon, u16), (avsc_satd16x8_hbd_neon, u16), (avsc_satd16x16_hbd_neon, u16), (avsc_satd16x32_hbd_neon, u16), (avsc_satd16x64_hbd_neon, u16), (avsc_satd32x8_hbd_neon, u16), (avsc_satd32x16_hbd_neon, u16), (avsc_satd32x32_hbd_neon, u16), (avsc_satd32x64_hbd_neon, u16), (avsc_satd64x16_hbd_neon, u16), (avsc_satd64x32_hbd_neon, u16), (avsc_satd64x64_hbd_neon, u16), (avsc_satd64x128_hbd_neon, u16), (avsc_satd128x64_hbd_neon, u16), (avsc_satd128x128_hbd_neon, u16) ]; static SATD_FNS_NEON: [Option; DIST_FNS_LENGTH] = { let mut out: [Option; DIST_FNS_LENGTH] = [None; DIST_FNS_LENGTH]; use crate::data::block::BlockSize::*; out[BLOCK_4X4 as usize] = Some(avsc_satd4x4_neon); out[BLOCK_4X8 as usize] = Some(avsc_satd4x8_neon); out[BLOCK_4X16 as usize] = Some(avsc_satd4x16_neon); out[BLOCK_8X4 as usize] = Some(avsc_satd8x4_neon); out[BLOCK_16X4 as usize] = Some(avsc_satd16x4_neon); out[BLOCK_8X8 as usize] = Some(avsc_satd8x8_neon); out[BLOCK_8X16 as usize] = Some(avsc_satd8x16_neon); out[BLOCK_8X32 as usize] = Some(avsc_satd8x32_neon); out[BLOCK_16X8 as usize] = Some(avsc_satd16x8_neon); out[BLOCK_16X16 as usize] = Some(avsc_satd16x16_neon); out[BLOCK_16X32 as usize] = Some(avsc_satd16x32_neon); out[BLOCK_16X64 as usize] = Some(avsc_satd16x64_neon); out[BLOCK_32X8 as usize] = Some(avsc_satd32x8_neon); out[BLOCK_32X16 as usize] = Some(avsc_satd32x16_neon); out[BLOCK_32X32 as usize] = Some(avsc_satd32x32_neon); out[BLOCK_32X64 as usize] = Some(avsc_satd32x64_neon); out[BLOCK_64X16 as usize] = Some(avsc_satd64x16_neon); out[BLOCK_64X32 as usize] = Some(avsc_satd64x32_neon); out[BLOCK_64X64 as usize] = Some(avsc_satd64x64_neon); 
out[BLOCK_64X128 as usize] = Some(avsc_satd64x128_neon); out[BLOCK_128X64 as usize] = Some(avsc_satd128x64_neon); out[BLOCK_128X128 as usize] = Some(avsc_satd128x128_neon); out }; static SATD_HBD_FNS_NEON: [Option; DIST_FNS_LENGTH] = { let mut out: [Option; DIST_FNS_LENGTH] = [None; DIST_FNS_LENGTH]; use crate::data::block::BlockSize::*; out[BLOCK_4X4 as usize] = Some(avsc_satd4x4_hbd_neon); out[BLOCK_4X8 as usize] = Some(avsc_satd4x8_hbd_neon); out[BLOCK_4X16 as usize] = Some(avsc_satd4x16_hbd_neon); out[BLOCK_8X4 as usize] = Some(avsc_satd8x4_hbd_neon); out[BLOCK_16X4 as usize] = Some(avsc_satd16x4_hbd_neon); out[BLOCK_8X8 as usize] = Some(avsc_satd8x8_hbd_neon); out[BLOCK_8X16 as usize] = Some(avsc_satd8x16_hbd_neon); out[BLOCK_8X32 as usize] = Some(avsc_satd8x32_hbd_neon); out[BLOCK_16X8 as usize] = Some(avsc_satd16x8_hbd_neon); out[BLOCK_16X16 as usize] = Some(avsc_satd16x16_hbd_neon); out[BLOCK_16X32 as usize] = Some(avsc_satd16x32_hbd_neon); out[BLOCK_16X64 as usize] = Some(avsc_satd16x64_hbd_neon); out[BLOCK_32X8 as usize] = Some(avsc_satd32x8_hbd_neon); out[BLOCK_32X16 as usize] = Some(avsc_satd32x16_hbd_neon); out[BLOCK_32X32 as usize] = Some(avsc_satd32x32_hbd_neon); out[BLOCK_32X64 as usize] = Some(avsc_satd32x64_hbd_neon); out[BLOCK_64X16 as usize] = Some(avsc_satd64x16_hbd_neon); out[BLOCK_64X32 as usize] = Some(avsc_satd64x32_hbd_neon); out[BLOCK_64X64 as usize] = Some(avsc_satd64x64_hbd_neon); out[BLOCK_64X128 as usize] = Some(avsc_satd64x128_hbd_neon); out[BLOCK_128X64 as usize] = Some(avsc_satd128x64_hbd_neon); out[BLOCK_128X128 as usize] = Some(avsc_satd128x128_hbd_neon); out }; cpu_function_lookup_table!( SATD_FNS: [[Option; DIST_FNS_LENGTH]], default: [None; DIST_FNS_LENGTH], [NEON] ); cpu_function_lookup_table!( SATD_HBD_FNS: [[Option; DIST_FNS_LENGTH]], default: [None; DIST_FNS_LENGTH], [NEON] ); pub(super) fn get_satd_internal( src: &PlaneRegion<'_, T>, dst: &PlaneRegion<'_, T>, w: usize, h: usize, bit_depth: usize, cpu: CpuFeatureLevel, ) -> u32 { let bsize_opt = BlockSize::from_width_and_height_opt(w, h); let call_rust = || -> u32 { super::rust::get_satd_internal(src, dst, w, h, bit_depth, cpu) }; match (bsize_opt, T::type_enum()) { (Err(_), _) => call_rust(), (Ok(bsize), PixelType::U8) => { match SATD_FNS[cpu.as_index()][to_index(bsize)] { // SAFETY: Calls Assembly code. Some(func) => unsafe { (func)( src.data_ptr() as *const _, T::to_asm_stride(src.plane_cfg.stride), dst.data_ptr() as *const _, T::to_asm_stride(dst.plane_cfg.stride), ) }, None => call_rust(), } } (Ok(bsize), PixelType::U16) => { match SATD_HBD_FNS[cpu.as_index()][to_index(bsize)] { // SAFETY: Calls Assembly code. Some(func) => unsafe { (func)( src.data_ptr() as *const _, T::to_asm_stride(src.plane_cfg.stride), dst.data_ptr() as *const _, T::to_asm_stride(dst.plane_cfg.stride), ) }, None => call_rust(), } } } } av-scenechange-0.14.1/src/data/satd/simd_x86.rs000064400000000000000000000174221046102023000172260ustar 00000000000000use v_frame::pixel::{Pixel, PixelType}; use super::{to_index, DIST_FNS_LENGTH}; use crate::{ cpu::CpuFeatureLevel, data::{block::BlockSize, plane::PlaneRegion}, }; type SatdFn = unsafe extern "C" fn( src: *const u8, src_stride: isize, dst: *const u8, dst_stride: isize, ) -> u32; type SatdHBDFn = unsafe extern "C" fn( src: *const u16, src_stride: isize, dst: *const u16, dst_stride: isize, bdmax: u32, ) -> u32; macro_rules! 
declare_asm_dist_fn { ($(($name: ident, $T: ident)),+) => ( $( extern "C" { fn $name ( src: *const $T, src_stride: isize, dst: *const $T, dst_stride: isize ) -> u32; } )+ ) } macro_rules! declare_asm_satd_hbd_fn { ($($name: ident),+) => ( $( extern "C" { pub(crate) fn $name ( src: *const u16, src_stride: isize, dst: *const u16, dst_stride: isize, bdmax: u32 ) -> u32; } )+ ) } declare_asm_dist_fn![ // SSSE3 (avsc_satd_8x8_ssse3, u8), // SSE4 (avsc_satd_4x4_sse4, u8), // AVX (avsc_satd_4x4_avx2, u8), (avsc_satd_8x8_avx2, u8), (avsc_satd_16x16_avx2, u8), (avsc_satd_32x32_avx2, u8), (avsc_satd_64x64_avx2, u8), (avsc_satd_128x128_avx2, u8), (avsc_satd_4x8_avx2, u8), (avsc_satd_8x4_avx2, u8), (avsc_satd_8x16_avx2, u8), (avsc_satd_16x8_avx2, u8), (avsc_satd_16x32_avx2, u8), (avsc_satd_32x16_avx2, u8), (avsc_satd_32x64_avx2, u8), (avsc_satd_64x32_avx2, u8), (avsc_satd_64x128_avx2, u8), (avsc_satd_128x64_avx2, u8), (avsc_satd_4x16_avx2, u8), (avsc_satd_16x4_avx2, u8), (avsc_satd_8x32_avx2, u8), (avsc_satd_32x8_avx2, u8), (avsc_satd_16x64_avx2, u8), (avsc_satd_64x16_avx2, u8) ]; declare_asm_satd_hbd_fn![ avsc_satd_4x4_hbd_avx2, avsc_satd_8x4_hbd_avx2, avsc_satd_4x8_hbd_avx2, avsc_satd_8x8_hbd_avx2, avsc_satd_16x8_hbd_avx2, avsc_satd_16x16_hbd_avx2, avsc_satd_32x32_hbd_avx2, avsc_satd_64x64_hbd_avx2, avsc_satd_128x128_hbd_avx2, avsc_satd_16x32_hbd_avx2, avsc_satd_16x64_hbd_avx2, avsc_satd_32x16_hbd_avx2, avsc_satd_32x64_hbd_avx2, avsc_satd_64x16_hbd_avx2, avsc_satd_64x32_hbd_avx2, avsc_satd_64x128_hbd_avx2, avsc_satd_128x64_hbd_avx2, avsc_satd_32x8_hbd_avx2, avsc_satd_8x16_hbd_avx2, avsc_satd_8x32_hbd_avx2, avsc_satd_16x4_hbd_avx2, avsc_satd_4x16_hbd_avx2 ]; static SATD_FNS_SSSE3: [Option; DIST_FNS_LENGTH] = { let mut out: [Option; DIST_FNS_LENGTH] = [None; DIST_FNS_LENGTH]; use BlockSize::*; out[BLOCK_8X8 as usize] = Some(avsc_satd_8x8_ssse3); out }; static SATD_FNS_SSE4_1: [Option; DIST_FNS_LENGTH] = { let mut out: [Option; DIST_FNS_LENGTH] = [None; DIST_FNS_LENGTH]; use BlockSize::*; out[BLOCK_4X4 as usize] = Some(avsc_satd_4x4_sse4); out[BLOCK_8X8 as usize] = Some(avsc_satd_8x8_ssse3); out }; static SATD_FNS_AVX2: [Option; DIST_FNS_LENGTH] = { let mut out: [Option; DIST_FNS_LENGTH] = [None; DIST_FNS_LENGTH]; use BlockSize::*; out[BLOCK_4X4 as usize] = Some(avsc_satd_4x4_avx2); out[BLOCK_8X8 as usize] = Some(avsc_satd_8x8_avx2); out[BLOCK_16X16 as usize] = Some(avsc_satd_16x16_avx2); out[BLOCK_32X32 as usize] = Some(avsc_satd_32x32_avx2); out[BLOCK_64X64 as usize] = Some(avsc_satd_64x64_avx2); out[BLOCK_128X128 as usize] = Some(avsc_satd_128x128_avx2); out[BLOCK_4X8 as usize] = Some(avsc_satd_4x8_avx2); out[BLOCK_8X4 as usize] = Some(avsc_satd_8x4_avx2); out[BLOCK_8X16 as usize] = Some(avsc_satd_8x16_avx2); out[BLOCK_16X8 as usize] = Some(avsc_satd_16x8_avx2); out[BLOCK_16X32 as usize] = Some(avsc_satd_16x32_avx2); out[BLOCK_32X16 as usize] = Some(avsc_satd_32x16_avx2); out[BLOCK_32X64 as usize] = Some(avsc_satd_32x64_avx2); out[BLOCK_64X32 as usize] = Some(avsc_satd_64x32_avx2); out[BLOCK_64X128 as usize] = Some(avsc_satd_64x128_avx2); out[BLOCK_128X64 as usize] = Some(avsc_satd_128x64_avx2); out[BLOCK_4X16 as usize] = Some(avsc_satd_4x16_avx2); out[BLOCK_16X4 as usize] = Some(avsc_satd_16x4_avx2); out[BLOCK_8X32 as usize] = Some(avsc_satd_8x32_avx2); out[BLOCK_32X8 as usize] = Some(avsc_satd_32x8_avx2); out[BLOCK_16X64 as usize] = Some(avsc_satd_16x64_avx2); out[BLOCK_64X16 as usize] = Some(avsc_satd_64x16_avx2); out }; cpu_function_lookup_table!( SATD_FNS: [[Option; DIST_FNS_LENGTH]], default: 
[None; DIST_FNS_LENGTH], [SSSE3, SSE4_1, AVX2] ); static SATD_HBD_FNS_AVX2: [Option; DIST_FNS_LENGTH] = { let mut out: [Option; DIST_FNS_LENGTH] = [None; DIST_FNS_LENGTH]; use BlockSize::*; out[BLOCK_4X4 as usize] = Some(avsc_satd_4x4_hbd_avx2); out[BLOCK_8X8 as usize] = Some(avsc_satd_8x8_hbd_avx2); out[BLOCK_16X16 as usize] = Some(avsc_satd_16x16_hbd_avx2); out[BLOCK_32X32 as usize] = Some(avsc_satd_32x32_hbd_avx2); out[BLOCK_64X64 as usize] = Some(avsc_satd_64x64_hbd_avx2); out[BLOCK_128X128 as usize] = Some(avsc_satd_128x128_hbd_avx2); out[BLOCK_4X8 as usize] = Some(avsc_satd_4x8_hbd_avx2); out[BLOCK_8X4 as usize] = Some(avsc_satd_8x4_hbd_avx2); out[BLOCK_8X16 as usize] = Some(avsc_satd_8x16_hbd_avx2); out[BLOCK_16X8 as usize] = Some(avsc_satd_16x8_hbd_avx2); out[BLOCK_16X32 as usize] = Some(avsc_satd_16x32_hbd_avx2); out[BLOCK_32X16 as usize] = Some(avsc_satd_32x16_hbd_avx2); out[BLOCK_32X64 as usize] = Some(avsc_satd_32x64_hbd_avx2); out[BLOCK_64X32 as usize] = Some(avsc_satd_64x32_hbd_avx2); out[BLOCK_64X128 as usize] = Some(avsc_satd_64x128_hbd_avx2); out[BLOCK_128X64 as usize] = Some(avsc_satd_128x64_hbd_avx2); out[BLOCK_4X16 as usize] = Some(avsc_satd_4x16_hbd_avx2); out[BLOCK_16X4 as usize] = Some(avsc_satd_16x4_hbd_avx2); out[BLOCK_8X32 as usize] = Some(avsc_satd_8x32_hbd_avx2); out[BLOCK_32X8 as usize] = Some(avsc_satd_32x8_hbd_avx2); out[BLOCK_16X64 as usize] = Some(avsc_satd_16x64_hbd_avx2); out[BLOCK_64X16 as usize] = Some(avsc_satd_64x16_hbd_avx2); out }; cpu_function_lookup_table!( SATD_HBD_FNS: [[Option; DIST_FNS_LENGTH]], default: [None; DIST_FNS_LENGTH], [AVX2] ); pub(super) fn get_satd_internal( src: &PlaneRegion<'_, T>, dst: &PlaneRegion<'_, T>, w: usize, h: usize, bit_depth: usize, cpu: CpuFeatureLevel, ) -> u32 { let bsize_opt = BlockSize::from_width_and_height_opt(w, h); let call_rust = || -> u32 { super::rust::get_satd_internal(dst, src, w, h, bit_depth, cpu) }; match (bsize_opt, T::type_enum()) { (Err(_), _) => call_rust(), (Ok(bsize), PixelType::U8) => { match SATD_FNS[cpu.as_index()][to_index(bsize)] { // SAFETY: Calls Assembly code. Some(func) => unsafe { func( src.data_ptr() as *const _, T::to_asm_stride(src.plane_cfg.stride), dst.data_ptr() as *const _, T::to_asm_stride(dst.plane_cfg.stride), ) }, None => call_rust(), } } (Ok(bsize), PixelType::U16) => { match SATD_HBD_FNS[cpu.as_index()][to_index(bsize)] { // SAFETY: Calls Assembly code. 
Some(func) => unsafe { func( src.data_ptr() as *const _, T::to_asm_stride(src.plane_cfg.stride), dst.data_ptr() as *const _, T::to_asm_stride(dst.plane_cfg.stride), (1 << bit_depth) - 1, ) }, None => call_rust(), } } } } av-scenechange-0.14.1/src/data/satd/tests.rs000064400000000000000000000050711046102023000167240ustar 00000000000000use v_frame::{pixel::Pixel, plane::Plane}; use crate::{ data::{ plane::{Area, AsRegion}, satd::get_satd, }, CpuFeatureLevel, }; // Generate plane data for get_sad_same() fn setup_planes() -> (Plane, Plane) { // Two planes with different strides let mut input_plane = Plane::new(640, 480, 0, 0, 128 + 8, 128 + 8); let mut rec_plane = Plane::new(640, 480, 0, 0, 2 * 128 + 8, 2 * 128 + 8); // Make the test pattern robust to data alignment let xpad_off = (input_plane.cfg.xorigin - input_plane.cfg.xpad) as i32 - 8i32; for (i, row) in input_plane .data .chunks_mut(input_plane.cfg.stride) .enumerate() { for (j, pixel) in row.iter_mut().enumerate() { let val = ((j + i) as i32 - xpad_off) & 255i32; assert!(val >= u8::MIN.into() && val <= u8::MAX.into()); *pixel = T::cast_from(val); } } for (i, row) in rec_plane.data.chunks_mut(rec_plane.cfg.stride).enumerate() { for (j, pixel) in row.iter_mut().enumerate() { let val = (j as i32 - i as i32 - xpad_off) & 255i32; assert!(val >= u8::MIN.into() && val <= u8::MAX.into()); *pixel = T::cast_from(val); } } (input_plane, rec_plane) } fn get_satd_same_inner() { let blocks: Vec<(usize, usize, u32)> = vec![ (4, 4, 1408), (4, 8, 2016), (8, 4, 1816), (8, 8, 3984), (8, 16, 5136), (16, 8, 4864), (16, 16, 9984), (16, 32, 13824), (32, 16, 13760), (32, 32, 27952), (32, 64, 37168), (64, 32, 45104), (64, 64, 84176), (64, 128, 127920), (128, 64, 173680), (128, 128, 321456), (4, 16, 3136), (16, 4, 2632), (8, 32, 7056), (32, 8, 6624), (16, 64, 18432), (64, 16, 21312), ]; let bit_depth: usize = 8; let (input_plane, rec_plane) = setup_planes::(); for (w, h, distortion) in blocks { let area = Area::StartingAt { x: 32, y: 40 }; let input_region = input_plane.region(area); let rec_region = rec_plane.region(area); assert_eq!( distortion, get_satd( &input_region, &rec_region, w, h, bit_depth, CpuFeatureLevel::default() ) ); } } #[test] fn get_satd_same_u8() { get_satd_same_inner::(); } #[test] fn get_satd_same_u16() { get_satd_same_inner::(); } av-scenechange-0.14.1/src/data/satd.rs000064400000000000000000000101631046102023000155600ustar 00000000000000#[cfg(asm_neon)] mod simd_neon; #[cfg(asm_x86_64)] mod simd_x86; #[cfg(test)] mod tests; #[cfg(not(any(asm_x86_64, asm_neon)))] use rust::*; #[cfg(asm_neon)] use simd_neon::*; #[cfg(asm_x86_64)] use simd_x86::*; use v_frame::pixel::Pixel; use super::{block::BlockSize, plane::PlaneRegion}; use crate::cpu::CpuFeatureLevel; mod rust { use v_frame::{ math::msb, pixel::{CastFromPrimitive, Pixel}, }; use crate::{ cpu::CpuFeatureLevel, data::{ hadamard::{hadamard4x4, hadamard8x8}, plane::{Area, PlaneRegion, Rect}, sad::get_sad, }, }; /// Sum of absolute transformed differences over a block. /// w and h can be at most 128, the size of the largest block. /// Use the sum of 4x4 and 8x8 hadamard transforms for the transform, but /// revert to sad on edges when these transforms do not fit into w and h. /// 4x4 transforms instead of 8x8 transforms when width or height < 8. 
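  ///
  /// As a worked example of the chunking below: a 24x12 call picks the 8x8
  /// transform (`min(24, 12, 8) == 8`), so the 24x8 top strip is covered by
  /// three 8x8 Hadamard transforms, while the remaining 24x4 bottom strip
  /// does not fit a full 8x8 chunk and falls back to SAD.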
#[cfg_attr(all(asm_x86_64, target_feature = "avx2"), cold)] #[cfg_attr(asm_neon, cold)] pub(super) fn get_satd_internal( plane_org: &PlaneRegion<'_, T>, plane_ref: &PlaneRegion<'_, T>, w: usize, h: usize, bit_depth: usize, cpu: CpuFeatureLevel, ) -> u32 { assert!(w <= 128 && h <= 128); assert!(plane_org.rect().width >= w && plane_org.rect().height >= h); assert!(plane_ref.rect().width >= w && plane_ref.rect().height >= h); // Size of hadamard transform should be 4x4 or 8x8 // 4x* and *x4 use 4x4 and all other use 8x8 let size: usize = w.min(h).min(8); let tx2d = if size == 4 { hadamard4x4 } else { hadamard8x8 }; let mut sum: u64 = 0; // Loop over chunks the size of the chosen transform for chunk_y in (0..h).step_by(size) { let chunk_h = (h - chunk_y).min(size); for chunk_x in (0..w).step_by(size) { let chunk_w = (w - chunk_x).min(size); let chunk_area = Area::Rect(Rect { x: chunk_x as isize, y: chunk_y as isize, width: chunk_w, height: chunk_h, }); let chunk_org = plane_org.subregion(chunk_area); let chunk_ref = plane_ref.subregion(chunk_area); // Revert to sad on edge blocks (frame edges) if chunk_w != size || chunk_h != size { sum += get_sad(&chunk_org, &chunk_ref, chunk_w, chunk_h, bit_depth, cpu) as u64; continue; } let buf: &mut [i32] = &mut [0; 8 * 8][..size * size]; // Move the difference of the transforms to a buffer for (row_diff, (row_org, row_ref)) in buf .chunks_mut(size) .zip(chunk_org.rows_iter().zip(chunk_ref.rows_iter())) { for (diff, (a, b)) in row_diff.iter_mut().zip(row_org.iter().zip(row_ref.iter())) { *diff = i32::cast_from(*a) - i32::cast_from(*b); } } // Perform the hadamard transform on the differences // SAFETY: A sufficient number elements exist for the size of the transform. unsafe { tx2d(buf); } // Sum the absolute values of the transformed differences sum += buf.iter().map(|a| a.unsigned_abs() as u64).sum::(); } } // Normalize the results let ln = msb(size as i32) as u64; ((sum + (1 << ln >> 1)) >> ln) as u32 } } // BlockSize::BLOCK_SIZES.next_power_of_two() const DIST_FNS_LENGTH: usize = 32; const fn to_index(bsize: BlockSize) -> usize { bsize as usize & (DIST_FNS_LENGTH - 1) } pub(crate) fn get_satd( src: &PlaneRegion<'_, T>, dst: &PlaneRegion<'_, T>, w: usize, h: usize, bit_depth: usize, cpu: CpuFeatureLevel, ) -> u32 { get_satd_internal(src, dst, w, h, bit_depth, cpu) } av-scenechange-0.14.1/src/data/superblock.rs000064400000000000000000000032341046102023000167770ustar 00000000000000use crate::data::{ block::{BlockOffset, MIB_SIZE_LOG2}, tile::TileBlockOffset, }; pub const MAX_SB_SIZE_LOG2: usize = 7; pub const SUPERBLOCK_TO_BLOCK_SHIFT: usize = MIB_SIZE_LOG2; pub const SB_SIZE_LOG2: usize = 6; pub const SB_SIZE: usize = 1 << SB_SIZE_LOG2; pub const MI_SIZE_LOG2: usize = 2; pub const MI_SIZE: usize = 1 << MI_SIZE_LOG2; /// Absolute offset in superblocks inside a tile, where a superblock is defined /// to be an `N*N` square where `N == (1 << SUPERBLOCK_TO_PLANE_SHIFT)`. #[derive(Clone, Copy, Debug, PartialEq, Eq)] pub struct TileSuperBlockOffset(pub SuperBlockOffset); impl TileSuperBlockOffset { /// Offset of a block inside the current superblock. pub const fn block_offset(self, block_x: usize, block_y: usize) -> TileBlockOffset { TileBlockOffset(self.0.block_offset(block_x, block_y)) } } /// Absolute offset in superblocks inside a plane, where a superblock is defined /// to be an `N*N` square where `N == (1 << SUPERBLOCK_TO_PLANE_SHIFT)`. 
#[derive(Clone, Copy, Debug, PartialEq, Eq)] pub struct PlaneSuperBlockOffset(pub SuperBlockOffset); /// Absolute offset in superblocks, where a superblock is defined /// to be an `N*N` square where `N == (1 << SUPERBLOCK_TO_PLANE_SHIFT)`. #[derive(Clone, Copy, Debug, PartialEq, Eq)] pub struct SuperBlockOffset { pub x: usize, pub y: usize, } impl SuperBlockOffset { /// Offset of a block inside the current superblock. const fn block_offset(self, block_x: usize, block_y: usize) -> BlockOffset { BlockOffset { x: (self.x << SUPERBLOCK_TO_BLOCK_SHIFT) + block_x, y: (self.y << SUPERBLOCK_TO_BLOCK_SHIFT) + block_y, } } } av-scenechange-0.14.1/src/data/tile.rs000064400000000000000000000341351046102023000155670ustar 00000000000000use std::iter::FusedIterator; use v_frame::{ frame::Frame, math::Fixed, pixel::Pixel, plane::{Plane, PlaneOffset}, }; use crate::data::{ block::BlockOffset, frame::{FrameState, MAX_PLANES}, motion::{FrameMEStats, TileMEStatsMut, WriteGuardMEStats}, plane::{PlaneBlockOffset, PlaneRegion, Rect}, superblock::{PlaneSuperBlockOffset, SuperBlockOffset, MI_SIZE, MI_SIZE_LOG2, SB_SIZE_LOG2}, }; pub const MAX_TILE_WIDTH: usize = 4096; pub const MAX_TILE_AREA: usize = 4096 * 2304; pub const MAX_TILE_COLS: usize = 64; pub const MAX_TILE_ROWS: usize = 64; pub const MAX_TILE_RATE: f64 = 4096f64 * 2176f64 * 60f64 * 1.1; /// Tiled view of a frame #[derive(Debug)] pub struct Tile<'a, T: Pixel> { pub planes: [PlaneRegion<'a, T>; MAX_PLANES], } // common impl for Tile and TileMut macro_rules! tile_common { // $name: Tile or TileMut // $pr_type: PlaneRegion or PlaneRegionMut // $iter: iter or iter_mut //opt_mut: nothing or mut ($name:ident, $pr_type:ident, $iter:ident $(,$opt_mut:tt)?) => { impl<'a, T: Pixel> $name<'a, T> { pub fn new( frame: &'a $($opt_mut)? Frame, luma_rect: TileRect, ) -> Self { let mut planes_iter = frame.planes.$iter(); Self { planes: [ { let plane = planes_iter.next().unwrap(); $pr_type::new(plane, luma_rect.into()) }, { let plane = planes_iter.next().unwrap(); let rect = luma_rect.decimated(plane.cfg.xdec, plane.cfg.ydec); $pr_type::new(plane, rect.into()) }, { let plane = planes_iter.next().unwrap(); let rect = luma_rect.decimated(plane.cfg.xdec, plane.cfg.ydec); $pr_type::new(plane, rect.into()) }, ], } } } } } tile_common!(Tile, PlaneRegion, iter); /// Rectangle of a tile, in pixels /// /// This is similar to Rect, but with unsigned (x, y) for convenience. #[derive(Debug, Clone, Copy)] pub struct TileRect { pub x: usize, pub y: usize, pub width: usize, pub height: usize, } impl TileRect { pub const fn decimated(self, xdec: usize, ydec: usize) -> Self { Self { x: self.x >> xdec, y: self.y >> ydec, width: self.width >> xdec, height: self.height >> ydec, } } pub const fn to_frame_plane_offset(self, tile_po: PlaneOffset) -> PlaneOffset { PlaneOffset { x: self.x as isize + tile_po.x, y: self.y as isize + tile_po.y, } } } impl From for Rect { fn from(tile_rect: TileRect) -> Rect { Rect { x: tile_rect.x as isize, y: tile_rect.y as isize, width: tile_rect.width, height: tile_rect.height, } } } /// Tiled view of `FrameState` /// /// Contrary to `PlaneRegionMut` and `TileMut`, there is no const version: /// - in practice, we don't need it; /// - it would require to instantiate a const version of every of its inner /// tiled views recursively. /// /// # `TileState` fields /// /// The way the `FrameState` fields are mapped depend on how they are accessed /// tile-wise and frame-wise. 
/// /// Some fields (like `qc`) are only used during tile-encoding, so they are only /// stored in `TileState`. /// /// Some other fields (like `input` or `segmentation`) are not written /// tile-wise, so they just reference the matching field in `FrameState`. /// /// Some others (like `rec`) are written tile-wise, but must be accessible /// frame-wise once the tile views vanish (e.g. for deblocking). #[derive(Debug)] pub struct TileStateMut<'a, T: Pixel> { pub sbo: PlaneSuperBlockOffset, pub sb_width: usize, pub sb_height: usize, pub mi_width: usize, pub mi_height: usize, pub width: usize, pub height: usize, pub input_tile: Tile<'a, T>, // the current tile pub input_hres: &'a Plane, pub input_qres: &'a Plane, pub me_stats: Vec>, } impl<'a, T: Pixel> TileStateMut<'a, T> { pub fn new( fs: &'a mut FrameState, sbo: PlaneSuperBlockOffset, width: usize, height: usize, frame_me_stats: &'a mut [FrameMEStats], ) -> Self { debug_assert!( width % MI_SIZE == 0, "Tile width must be a multiple of MI_SIZE" ); debug_assert!( height % MI_SIZE == 0, "Tile width must be a multiple of MI_SIZE" ); let sb_rounded_width = width.align_power_of_two(SB_SIZE_LOG2); let sb_rounded_height = height.align_power_of_two(SB_SIZE_LOG2); let luma_rect = TileRect { x: sbo.0.x << SB_SIZE_LOG2, y: sbo.0.y << SB_SIZE_LOG2, width: sb_rounded_width, height: sb_rounded_height, }; let sb_width = width.align_power_of_two_and_shift(SB_SIZE_LOG2); let sb_height = height.align_power_of_two_and_shift(SB_SIZE_LOG2); Self { sbo, sb_width, sb_height, mi_width: width >> MI_SIZE_LOG2, mi_height: height >> MI_SIZE_LOG2, width, height, input_tile: Tile::new(&fs.input, luma_rect), input_hres: &fs.input_hres, input_qres: &fs.input_qres, me_stats: frame_me_stats .iter_mut() .map(|fmvs| { TileMEStatsMut::new( fmvs, sbo.0.x << (SB_SIZE_LOG2 - MI_SIZE_LOG2), sbo.0.y << (SB_SIZE_LOG2 - MI_SIZE_LOG2), width >> MI_SIZE_LOG2, height >> MI_SIZE_LOG2, ) }) .collect(), } } pub fn to_frame_block_offset(&self, tile_bo: TileBlockOffset) -> PlaneBlockOffset { let bx = self.sbo.0.x << (SB_SIZE_LOG2 - MI_SIZE_LOG2); let by = self.sbo.0.y << (SB_SIZE_LOG2 - MI_SIZE_LOG2); PlaneBlockOffset(BlockOffset { x: bx + tile_bo.0.x, y: by + tile_bo.0.y, }) } } /// Absolute offset in blocks inside a tile, where a block is defined /// to be an `N*N` square where `N == (1 << BLOCK_TO_PLANE_SHIFT)`. #[derive(Clone, Copy, Debug, Default, PartialEq, Eq)] pub struct TileBlockOffset(pub BlockOffset); impl TileBlockOffset { /// Convert to plane offset without decimation. pub const fn to_luma_plane_offset(self) -> PlaneOffset { self.0.to_luma_plane_offset() } pub fn with_offset(self, col_offset: isize, row_offset: isize) -> TileBlockOffset { Self(self.0.with_offset(col_offset, row_offset)) } } /// Tiling information /// /// This stores everything necessary to split a frame into tiles, and write /// headers fields into the bitstream. /// /// The method `tile_iter_mut()` actually provides tiled views of `FrameState` /// and `FrameBlocks`. #[derive(Debug, Clone, Copy)] pub struct TilingInfo { pub frame_width: usize, pub frame_height: usize, pub tile_width_sb: usize, pub tile_height_sb: usize, pub cols: usize, // number of columns of tiles within the whole frame pub rows: usize, // number of rows of tiles within the whole frame } impl TilingInfo { /// # Panics /// /// Panics if the resulting tile sizes would be too large. 
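    /// # Example
    ///
    /// A minimal sketch (marked `ignore`, not a doctest); the 1080p/30fps,
    /// non-4:2:2 parameters are arbitrary, and the requested
    /// `tile_cols_log2`/`tile_rows_log2` may be adjusted to satisfy the
    /// tile-size limits (`MAX_TILE_WIDTH`, `MAX_TILE_AREA`, `MAX_TILE_RATE`).
    ///
    /// ``` ignore
    /// // request a 2x1 tile layout (2^1 columns, 2^0 rows)
    /// let ti = TilingInfo::from_target_tiles(1920, 1080, 30.0, 1, 0, false);
    /// assert!(ti.cols >= 1 && ti.rows >= 1);
    /// ```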
pub fn from_target_tiles( frame_width: usize, frame_height: usize, frame_rate: f64, tile_cols_log2: usize, tile_rows_log2: usize, is_422_p: bool, ) -> Self { // // Frame::new() aligns to the next multiple of 8 let frame_width = frame_width.align_power_of_two(3); let frame_height = frame_height.align_power_of_two(3); let frame_width_sb = frame_width.align_power_of_two_and_shift(SB_SIZE_LOG2); let frame_height_sb = frame_height.align_power_of_two_and_shift(SB_SIZE_LOG2); let sb_cols = frame_width.align_power_of_two_and_shift(SB_SIZE_LOG2); let sb_rows = frame_height.align_power_of_two_and_shift(SB_SIZE_LOG2); // these are bitstream-defined values and must not be changed let max_tile_width_sb = MAX_TILE_WIDTH >> SB_SIZE_LOG2; let max_tile_area_sb = MAX_TILE_AREA >> (2 * SB_SIZE_LOG2); let min_tile_cols_log2 = Self::tile_log2(max_tile_width_sb, sb_cols).unwrap(); let max_tile_cols_log2 = Self::tile_log2(1, sb_cols.min(MAX_TILE_COLS)).unwrap(); let max_tile_rows_log2 = Self::tile_log2(1, sb_rows.min(MAX_TILE_ROWS)).unwrap(); let min_tiles_log2 = min_tile_cols_log2.max(Self::tile_log2(max_tile_area_sb, sb_cols * sb_rows).unwrap()); // Implements restriction in Annex A of the spec. // Unlike the other restrictions, this one does not change // the header coding of the tile rows/cols. let min_tiles_ratelimit_log2 = min_tiles_log2.max( ((frame_width * frame_height) as f64 * frame_rate / MAX_TILE_RATE) .ceil() .log2() .ceil() as usize, ); let tile_cols_log2 = tile_cols_log2.clamp(min_tile_cols_log2, max_tile_cols_log2); let tile_width_sb_pre = sb_cols.align_power_of_two_and_shift(tile_cols_log2); // If this is 4:2:2, our UV horizontal is subsampled but not our // vertical. Loop Restoration Units must be square, so they // will always have an even number of horizontal superblocks. For // tiles and LRUs to align, tile_width_sb must be even in 4:2:2 // video. // This is only relevant when doing loop restoration RDO inline // with block/superblock encoding, that is, where tiles are // relevant. If (when) we introduce optionally delaying loop-filter // encode to after the partitioning loop, we won't need to make // any 4:2:2 adjustment. let tile_width_sb = if is_422_p { (tile_width_sb_pre + 1) >> 1 << 1 } else { tile_width_sb_pre }; let cols = frame_width_sb.div_ceil(tile_width_sb); // Adjust tile_cols_log2 in case of rounding tile_width_sb to even. let tile_cols_log2 = Self::tile_log2(1, cols).unwrap(); assert!(tile_cols_log2 >= min_tile_cols_log2); let min_tile_rows_log2 = min_tiles_log2.saturating_sub(tile_cols_log2); let min_tile_rows_ratelimit_log2 = min_tiles_ratelimit_log2.saturating_sub(tile_cols_log2); let tile_rows_log2 = tile_rows_log2 .max(min_tile_rows_log2) .clamp(min_tile_rows_ratelimit_log2, max_tile_rows_log2); let tile_height_sb = sb_rows.align_power_of_two_and_shift(tile_rows_log2); let rows = frame_height_sb.div_ceil(tile_height_sb); Self { frame_width, frame_height, tile_width_sb, tile_height_sb, cols, rows, } } /// Return the smallest value for `k` such that `blkSize << k` is greater /// than or equal to `target`. /// /// pub fn tile_log2(blk_size: usize, target: usize) -> Option { let mut k = 0; while (blk_size.checked_shl(k)?) < target { k += 1; } Some(k as usize) } /// Split frame-level structures into tiles /// /// Provide mutable tiled views of frame-level structures. 
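    /// # Example
    ///
    /// A minimal sketch (marked `ignore`, not a doctest); `ti` is a
    /// `TilingInfo` and `fs` an existing `FrameState` for the same frame
    /// dimensions.
    ///
    /// ``` ignore
    /// for ctx in ti.tile_iter_mut(&mut fs) {
    ///     // each tile state covers a non-overlapping region of the frame
    ///     let ts = ctx.ts;
    ///     let _ = (ts.width, ts.height);
    /// }
    /// ```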
pub fn tile_iter_mut<'a, T: Pixel>( &self, fs: &'a mut FrameState, ) -> TileContextIterMut<'a, T> { let afs = fs as *mut _; let frame_me_stats = fs.frame_me_stats.write().expect("poisoned lock"); TileContextIterMut { ti: *self, fs: afs, next: 0, frame_me_stats, } } } /// Iterator over tiled views pub struct TileContextIterMut<'a, T: Pixel> { ti: TilingInfo, fs: *mut FrameState, frame_me_stats: WriteGuardMEStats<'a>, next: usize, } impl<'a, T: Pixel> Iterator for TileContextIterMut<'a, T> { type Item = TileContextMut<'a, T>; fn next(&mut self) -> Option { if self.next < self.ti.rows * self.ti.cols { let tile_col = self.next % self.ti.cols; let tile_row = self.next / self.ti.cols; let ctx = TileContextMut { ts: { // SAFETY: Multiple tiles mutably access this struct. // The dimensions must be configured correctly to ensure // the tiles do not overlap. let fs = unsafe { &mut *self.fs }; // SAFETY: ditto let frame_me_stats = unsafe { let len = self.frame_me_stats.len(); let ptr = self.frame_me_stats.as_mut_ptr(); std::slice::from_raw_parts_mut(ptr, len) }; let sbo = PlaneSuperBlockOffset(SuperBlockOffset { x: tile_col * self.ti.tile_width_sb, y: tile_row * self.ti.tile_height_sb, }); let x = sbo.0.x << SB_SIZE_LOG2; let y = sbo.0.y << SB_SIZE_LOG2; let tile_width = self.ti.tile_width_sb << SB_SIZE_LOG2; let tile_height = self.ti.tile_height_sb << SB_SIZE_LOG2; let width = tile_width.min(self.ti.frame_width - x); let height = tile_height.min(self.ti.frame_height - y); TileStateMut::new(fs, sbo, width, height, frame_me_stats) }, }; self.next += 1; Some(ctx) } else { None } } fn size_hint(&self) -> (usize, Option) { let remaining = self.ti.cols * self.ti.rows - self.next; (remaining, Some(remaining)) } } impl ExactSizeIterator for TileContextIterMut<'_, T> { } impl FusedIterator for TileContextIterMut<'_, T> { } /// Container for all tiled views pub struct TileContextMut<'a, T: Pixel> { pub ts: TileStateMut<'a, T>, } av-scenechange-0.14.1/src/decoder.rs000064400000000000000000000040111046102023000153140ustar 00000000000000use std::io::Read; use num_rational::Rational32; use v_frame::{ frame::Frame, pixel::{ChromaSampling, Pixel}, }; #[cfg(feature = "ffmpeg")] use crate::ffmpeg::FfmpegDecoder; #[cfg(feature = "vapoursynth")] use crate::vapoursynth::VapoursynthDecoder; pub enum Decoder { Y4m(y4m::Decoder), #[cfg(feature = "vapoursynth")] Vapoursynth(VapoursynthDecoder), #[cfg(feature = "ffmpeg")] Ffmpeg(FfmpegDecoder), } impl Decoder { /// # Errors /// /// - If using a Vapoursynth script that contains an unsupported video /// format. #[inline] pub fn get_video_details(&self) -> anyhow::Result { match self { Decoder::Y4m(dec) => Ok(crate::y4m::get_video_details(dec)), #[cfg(feature = "vapoursynth")] Decoder::Vapoursynth(dec) => dec.get_video_details(), #[cfg(feature = "ffmpeg")] Decoder::Ffmpeg(dec) => Ok(dec.video_details), } } /// # Errors /// /// - If a frame cannot be read. 
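    /// # Example
    ///
    /// A minimal sketch (marked `ignore`, not a doctest) of draining a
    /// decoder; it assumes an 8-bit y4m source behind some `reader`
    /// implementing `Read`.
    ///
    /// ``` ignore
    /// let mut dec = Decoder::Y4m(y4m::Decoder::new(&mut reader)?);
    /// let details = dec.get_video_details()?;
    /// while let Ok(frame) = dec.read_video_frame::<u8>(&details) {
    ///     // process `frame`
    /// }
    /// ```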
#[inline] pub fn read_video_frame( &mut self, video_details: &VideoDetails, ) -> anyhow::Result> { match self { Decoder::Y4m(dec) => crate::y4m::read_video_frame::(dec, video_details), #[cfg(feature = "vapoursynth")] Decoder::Vapoursynth(dec) => dec.read_video_frame::(video_details), #[cfg(feature = "ffmpeg")] Decoder::Ffmpeg(dec) => dec.read_video_frame::(), } } } #[derive(Debug, Clone, Copy)] pub struct VideoDetails { pub width: usize, pub height: usize, pub bit_depth: usize, pub chroma_sampling: ChromaSampling, pub time_base: Rational32, } impl Default for VideoDetails { #[inline] fn default() -> Self { VideoDetails { width: 640, height: 480, bit_depth: 8, chroma_sampling: ChromaSampling::Cs420, time_base: Rational32::new(1, 30), } } } av-scenechange-0.14.1/src/ffmpeg.rs000064400000000000000000000162761046102023000151730ustar 00000000000000extern crate ffmpeg_the_third as ffmpeg; use std::path::Path; use anyhow::bail; use ffmpeg::{ codec::{decoder, packet}, format, format::context, frame, media::Type, }; use ffmpeg_the_third::threading; use num_rational::Rational32; use v_frame::{ frame::Frame, pixel::{ChromaSampling, Pixel}, }; use crate::decoder::VideoDetails; /// An interface that is used for decoding a video stream using ffmpeg /// /// There have been desync issue reported with this decoder /// on some video files. Use at your own risk! pub struct FfmpegDecoder { input_ctx: context::Input, decoder: decoder::Video, pub video_details: VideoDetails, frameno: usize, stream_index: usize, end_of_stream: bool, eof_sent: bool, } impl FfmpegDecoder { /// Initialize a new ffmpeg decoder for a given input file /// /// # Errors /// /// - If ffmpeg is not available or not working on the system /// - If the source contains no video /// - If the source has an unsupported pixel format #[inline] pub fn new>(input: P) -> anyhow::Result { ffmpeg::init()?; let input_ctx = format::input(&input)?; let input = input_ctx .streams() .best(Type::Video) .ok_or_else(|| anyhow::anyhow!("Could not find video stream"))?; let stream_index = input.index(); let mut context = ffmpeg::codec::context::Context::from_parameters(input.parameters())?; context.set_threading(threading::Config::kind(threading::Type::Frame)); let mut decoder = context.decoder().video()?; decoder.set_parameters(input.parameters())?; let frame_rate = input.avg_frame_rate(); Ok(Self { video_details: VideoDetails { width: decoder.width() as usize, height: decoder.height() as usize, bit_depth: match decoder.format() { format::pixel::Pixel::YUV420P | format::pixel::Pixel::YUV422P | format::pixel::Pixel::YUV444P | format::pixel::Pixel::YUVJ420P | format::pixel::Pixel::YUVJ422P | format::pixel::Pixel::YUVJ444P => 8, format::pixel::Pixel::YUV420P10LE | format::pixel::Pixel::YUV422P10LE | format::pixel::Pixel::YUV444P10LE => 10, format::pixel::Pixel::YUV420P12LE | format::pixel::Pixel::YUV422P12LE | format::pixel::Pixel::YUV444P12LE => 12, _ => { bail!("Unsupported pixel format {:?}", decoder.format()); } }, chroma_sampling: match decoder.format() { format::pixel::Pixel::YUV420P | format::pixel::Pixel::YUVJ420P | format::pixel::Pixel::YUV420P10LE | format::pixel::Pixel::YUV420P12LE => ChromaSampling::Cs420, format::pixel::Pixel::YUV422P | format::pixel::Pixel::YUVJ422P | format::pixel::Pixel::YUV422P10LE | format::pixel::Pixel::YUV422P12LE => ChromaSampling::Cs422, format::pixel::Pixel::YUV444P | format::pixel::Pixel::YUVJ444P | format::pixel::Pixel::YUV444P10LE | format::pixel::Pixel::YUV444P12LE => ChromaSampling::Cs444, _ => { bail!("Unsupported 
pixel format {:?}", decoder.format()); } }, time_base: Rational32::new(frame_rate.denominator(), frame_rate.numerator()), }, decoder, input_ctx, frameno: 0, stream_index, end_of_stream: false, eof_sent: false, }) } fn decode_frame(&self, decoded: &frame::Video) -> Frame { const SB_SIZE_LOG2: usize = 6; const SB_SIZE: usize = 1 << SB_SIZE_LOG2; const SUBPEL_FILTER_SIZE: usize = 8; const FRAME_MARGIN: usize = 16 + SUBPEL_FILTER_SIZE; const LUMA_PADDING: usize = SB_SIZE + FRAME_MARGIN; let mut f: Frame = Frame::new_with_padding( self.video_details.width, self.video_details.height, self.video_details.chroma_sampling, LUMA_PADDING, ); let width = self.video_details.width; let height = self.video_details.height; let bit_depth = self.video_details.bit_depth; let bytes = if bit_depth > 8 { 2 } else { 1 }; let (chroma_width, _) = self .video_details .chroma_sampling .get_chroma_dimensions(width, height); f.planes[0].copy_from_raw_u8(decoded.data(0), width * bytes, bytes); f.planes[1].copy_from_raw_u8(decoded.data(1), chroma_width * bytes, bytes); f.planes[2].copy_from_raw_u8(decoded.data(2), chroma_width * bytes, bytes); f } /// # Errors /// /// - If there are no frames remaining #[inline] pub fn read_video_frame(&mut self) -> anyhow::Result> { // For some reason there's a crap ton of work needed to get ffmpeg to do // something simple, because each codec has it's own stupid way of doing // things and they don't all decode the same way. // // Maybe ffmpeg could have made a simple, singular interface that does this for // us, but noooooo. // // Reference: https://ffmpeg.org/doxygen/trunk/api-h264-test_8c_source.html#l00110 loop { // This iterator is actually really stupid... it doesn't reset itself after each // `new`. But that solves our lifetime hell issues, ironically. let packet = self .input_ctx .packets() .next() .and_then(Result::ok) .map(|(_, packet)| packet); let mut packet = if let Some(packet) = packet { packet } else { self.end_of_stream = true; packet::Packet::empty() }; if self.end_of_stream && !self.eof_sent { let _ = self.decoder.send_eof(); self.eof_sent = true; } if self.end_of_stream || packet.stream() == self.stream_index { let mut decoded = frame::Video::new( self.decoder.format(), self.video_details.width as u32, self.video_details.height as u32, ); packet.set_pts(Some(self.frameno as i64)); packet.set_dts(Some(self.frameno as i64)); if !self.end_of_stream { let _ = self.decoder.send_packet(&packet); } if self.decoder.receive_frame(&mut decoded).is_ok() { let f = self.decode_frame(&decoded); self.frameno += 1; return Ok(f); } else if self.end_of_stream { bail!("No frames left"); } } } } } av-scenechange-0.14.1/src/lib.rs000064400000000000000000000146611046102023000144710ustar 00000000000000// Documentation lints // FIXME: add docs and turn this to warn #![allow(missing_docs)] #![warn(clippy::doc_link_with_quotes)] #![warn(clippy::doc_markdown)] #![warn(clippy::missing_errors_doc)] #![warn(clippy::missing_panics_doc)] pub mod decoder; mod analyze; #[macro_use] mod cpu; mod data; #[cfg(feature = "ffmpeg")] pub mod ffmpeg; #[cfg(feature = "vapoursynth")] pub mod vapoursynth; mod y4m; use std::{ collections::{BTreeMap, BTreeSet}, io::Read, sync::Arc, time::Instant, }; pub use ::y4m::Decoder as Y4mDecoder; use decoder::Decoder; pub use num_rational::Rational32; use v_frame::pixel::Pixel; pub use crate::{analyze::SceneChangeDetector, cpu::CpuFeatureLevel}; /// Options determining how to run scene change detection. 
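/// # Example
///
/// A minimal sketch of overriding a couple of fields while keeping the
/// defaults for the rest.
///
/// ``` ignore
/// let opts = DetectionOptions {
///     detect_flashes: false,
///     min_scenecut_distance: Some(24),
///     ..DetectionOptions::default()
/// };
/// ```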
#[derive(Debug, Clone, Copy)] pub struct DetectionOptions { /// The speed of detection algorithm to use. /// Slower algorithms are more accurate/better for use in encoders. pub analysis_speed: SceneDetectionSpeed, /// Enabling this will utilize heuristics to avoid scenecuts /// that are too close to each other. /// This is generally useful if you want scenecut detection /// for use in an encoder. /// If you want a raw list of scene changes, you should disable this. pub detect_flashes: bool, /// The minimum distance between two scene changes. pub min_scenecut_distance: Option, /// The maximum distance between two scene changes. pub max_scenecut_distance: Option, /// The distance to look ahead in the video /// for scene flash detection. /// /// Not used if `detect_flashes` is `false`. pub lookahead_distance: usize, } impl Default for DetectionOptions { #[inline] fn default() -> Self { DetectionOptions { analysis_speed: SceneDetectionSpeed::Standard, detect_flashes: true, lookahead_distance: 5, min_scenecut_distance: None, max_scenecut_distance: None, } } } /// Results from a scene change detection pass. #[derive(Debug, Clone)] #[cfg_attr(feature = "serialize", derive(serde::Serialize))] pub struct DetectionResults { /// The 0-indexed frame numbers where scene changes were detected. pub scene_changes: Vec, /// The total number of frames read. pub frame_count: usize, /// Average speed (FPS) pub speed: f64, } /// # Errors /// /// - If using a Vapoursynth script that contains an unsupported video format. #[inline] pub fn new_detector( dec: &mut Decoder, opts: DetectionOptions, ) -> anyhow::Result> { let video_details = dec.get_video_details()?; Ok(SceneChangeDetector::new( (video_details.width, video_details.height), video_details.bit_depth, video_details.time_base.recip(), video_details.chroma_sampling, if opts.detect_flashes { opts.lookahead_distance } else { 1 }, opts.analysis_speed, opts.min_scenecut_distance.map_or(0, |val| val), opts.max_scenecut_distance .map_or_else(|| u32::MAX as usize, |val| val), CpuFeatureLevel::default(), )) } /// Runs through a y4m video clip, /// detecting where scene changes occur. /// This is adjustable based on the `opts` parameters. /// /// This is the preferred, simplified interface /// for analyzing a whole clip for scene changes. /// /// # Arguments /// /// - `progress_callback`: An optional callback that will fire after each frame /// is analyzed. Arguments passed in will be, in order, the number of frames /// analyzed, and the number of keyframes detected. This is generally useful /// for displaying progress, etc. /// /// # Errors /// /// - If using a Vapoursynth script that contains an unsupported video format. /// /// # Panics /// /// - If `opts.lookahead_distance` is 0. 
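/// # Example
///
/// A minimal sketch (marked `ignore`, not a doctest) mirroring the CLI usage:
/// an 8-bit y4m stream read from stdin, with no frame limit and no progress
/// callback.
///
/// ``` ignore
/// let mut reader = std::io::stdin();
/// let mut dec = Decoder::Y4m(y4m::Decoder::new(&mut reader)?);
/// let results = detect_scene_changes::<_, u8>(
///     &mut dec,
///     DetectionOptions::default(),
///     None,
///     None,
/// )?;
/// println!("{:?}", results.scene_changes);
/// ```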
#[inline] pub fn detect_scene_changes( dec: &mut Decoder, opts: DetectionOptions, frame_limit: Option, progress_callback: Option<&dyn Fn(usize, usize)>, ) -> anyhow::Result { assert!(opts.lookahead_distance >= 1); let mut detector = new_detector::(dec, opts)?; let video_details = dec.get_video_details()?; let mut frame_queue = BTreeMap::new(); let mut keyframes = BTreeSet::new(); keyframes.insert(0); let start_time = Instant::now(); let mut frameno = 0; loop { let mut next_input_frameno = frame_queue.keys().last().copied().map_or(0, |key| key + 1); while next_input_frameno < (frameno + opts.lookahead_distance + 1).min(frame_limit.unwrap_or(usize::MAX)) { let frame = dec.read_video_frame(&video_details); if let Ok(frame) = frame { frame_queue.insert(next_input_frameno, Arc::new(frame)); next_input_frameno += 1; } else { // End of input break; } } // The frame_queue should start at whatever the previous frame was let frame_set = frame_queue .values() .take(opts.lookahead_distance + 2) .collect::>(); if frame_set.len() < 2 { // End of video break; } if frameno == 0 || detector.analyze_next_frame( &frame_set, frameno, *keyframes .iter() .last() .expect("at least 1 keyframe should exist"), ) { keyframes.insert(frameno); }; if frameno > 0 { frame_queue.remove(&(frameno - 1)); } frameno += 1; if let Some(progress_fn) = progress_callback { progress_fn(frameno, keyframes.len()); } if let Some(frame_limit) = frame_limit { if frameno == frame_limit { break; } } } Ok(DetectionResults { scene_changes: keyframes.into_iter().collect(), frame_count: frameno, speed: frameno as f64 / start_time.elapsed().as_secs_f64(), }) } #[derive(Clone, Copy, Debug, PartialOrd, PartialEq, Eq)] pub enum SceneDetectionSpeed { /// Fastest scene detection using pixel-wise comparison Fast, /// Scene detection using frame costs and motion vectors Standard, /// Do not perform scenecut detection, only place keyframes at fixed /// intervals None, } av-scenechange-0.14.1/src/main.rs000064400000000000000000000077041046102023000146470ustar 00000000000000use std::{ fs::File, io::{self, BufReader, Read, Write}, }; use anyhow::Result; use av_scenechange::{ decoder::Decoder, detect_scene_changes, DetectionOptions, SceneDetectionSpeed, }; use clap::Parser; #[derive(Parser, Debug)] struct Args { /// Sets the input file to use #[clap(value_parser)] pub input: String, /// Optional file to write results to #[clap(long, short, value_parser)] pub output: Option, /// Speed level for scene-change detection, 0: best quality, 1: fastest mode #[clap(long, short, value_parser, default_value_t = 0)] pub speed: u8, /// Do not detect short scene flashes and exclude them as scene cuts #[clap(long)] pub no_flash_detection: bool, /// Sets a minimum interval between two consecutive scenecuts #[clap(long, value_parser)] pub min_scenecut: Option, /// Sets a maximum interval between two consecutive scenecuts, /// after which a scenecut will be forced #[clap(long, value_parser)] pub max_scenecut: Option, } fn main() -> Result<()> { init_logger(); #[cfg(feature = "tracing")] let (chrome_layer, _guard) = tracing_chrome::ChromeLayerBuilder::new().build(); #[cfg(feature = "tracing")] { use tracing_subscriber::layer::SubscriberExt; tracing::subscriber::set_global_default(tracing_subscriber::registry().with(chrome_layer)) .expect("Could not initialize tracing subscriber"); } let matches = Args::parse(); let input = match matches.input.as_str() { "-" => Box::new(io::stdin()) as Box, f => Box::new(File::open(f)?) 
#[derive(Clone, Copy, Debug, PartialOrd, PartialEq, Eq)]
pub enum SceneDetectionSpeed {
    /// Fastest scene detection, using pixel-wise comparison
    Fast,
    /// Scene detection using frame costs and motion vectors
    Standard,
    /// Do not perform scenecut detection; only place keyframes at fixed
    /// intervals
    None,
}
av-scenechange-0.14.1/src/main.rs000064400000000000000000000077041046102023000146470ustar 00000000000000
use std::{
    fs::File,
    io::{self, BufReader, Read, Write},
};

use anyhow::Result;
use av_scenechange::{
    decoder::Decoder, detect_scene_changes, DetectionOptions, SceneDetectionSpeed,
};
use clap::Parser;

#[derive(Parser, Debug)]
struct Args {
    /// Sets the input file to use
    #[clap(value_parser)]
    pub input: String,
    /// Optional file to write results to
    #[clap(long, short, value_parser)]
    pub output: Option<String>,
    /// Speed level for scene-change detection (0: best quality, 1: fastest mode)
    #[clap(long, short, value_parser, default_value_t = 0)]
    pub speed: u8,
    /// Do not detect short scene flashes and exclude them as scene cuts
    #[clap(long)]
    pub no_flash_detection: bool,
    /// Sets a minimum interval between two consecutive scenecuts
    #[clap(long, value_parser)]
    pub min_scenecut: Option<usize>,
    /// Sets a maximum interval between two consecutive scenecuts,
    /// after which a scenecut will be forced
    #[clap(long, value_parser)]
    pub max_scenecut: Option<usize>,
}

fn main() -> Result<()> {
    init_logger();

    #[cfg(feature = "tracing")]
    let (chrome_layer, _guard) = tracing_chrome::ChromeLayerBuilder::new().build();

    #[cfg(feature = "tracing")]
    {
        use tracing_subscriber::layer::SubscriberExt;

        tracing::subscriber::set_global_default(tracing_subscriber::registry().with(chrome_layer))
            .expect("Could not initialize tracing subscriber");
    }

    let matches = Args::parse();
    let input = match matches.input.as_str() {
        "-" => Box::new(io::stdin()) as Box<dyn Read>,
        f => Box::new(File::open(f)?) as Box<dyn Read>,
    };
    let mut reader = BufReader::new(input);
    let mut opts = DetectionOptions {
        detect_flashes: !matches.no_flash_detection,
        min_scenecut_distance: matches.min_scenecut,
        max_scenecut_distance: matches.max_scenecut,
        ..DetectionOptions::default()
    };
    opts.analysis_speed = match matches.speed {
        0 => SceneDetectionSpeed::Standard,
        1 => SceneDetectionSpeed::Fast,
        _ => panic!("Speed mode must be in range [0; 1]"),
    };

    let mut dec = Decoder::Y4m(y4m::Decoder::new(&mut reader)?);
    let bit_depth = dec.get_video_details()?.bit_depth;
    let results = if bit_depth == 8 {
        detect_scene_changes::<_, u8>(&mut dec, opts, None, None)?
    } else {
        detect_scene_changes::<_, u16>(&mut dec, opts, None, None)?
    };
    print!("{}", serde_json::to_string(&results)?);

    if let Some(output_file) = matches.output {
        let mut file = File::create(output_file)?;
        let output = serde_json::to_string_pretty(&results)?;
        file.write_all(&output.into_bytes())?;
    }

    Ok(())
}

#[cfg(not(feature = "devel"))]
const fn init_logger() {
    // Do nothing
}

#[cfg(feature = "devel")]
fn init_logger() {
    use std::str::FromStr;

    fn level_colored(l: log::Level) -> console::StyledObject<&'static str> {
        use console::style;
        use log::Level;
        match l {
            Level::Trace => style("??").dim(),
            Level::Debug => style("? ").dim(),
            Level::Info => style("> ").green(),
            Level::Warn => style("! ").yellow(),
            Level::Error => style("!!").red(),
        }
    }

    let level = std::env::var("LOG")
        .ok()
        .and_then(|l| log::LevelFilter::from_str(&l).ok())
        .unwrap_or(log::LevelFilter::Warn);

    fern::Dispatch::new()
        .format(move |out, message, record| {
            out.finish(format_args!(
                "{level} {message}",
                level = level_colored(record.level()),
                message = message,
            ));
        })
        // Set the default log level. To filter out verbose log messages from dependencies,
        // set this to Warn and overwrite the log level for your crate.
        .level(level)
        // Output to stderr
        .chain(std::io::stderr())
        .apply()
        .unwrap();
}
av-scenechange-0.14.1/src/vapoursynth.rs000064400000000000000000000144501046102023000163210ustar 00000000000000
use std::{mem::size_of, path::Path, slice};

use anyhow::{bail, ensure};
use num_rational::Rational32;
use v_frame::{
    frame::Frame,
    pixel::{ChromaSampling, Pixel},
};
use vapoursynth::{
    video_info::{Property, VideoInfo},
    vsscript::{Environment, EvalFlags},
};

use crate::decoder::VideoDetails;

const OUTPUT_INDEX: i32 = 0;

pub struct VapoursynthDecoder {
    env: Environment,
    frames_read: usize,
    total_frames: usize,
}

impl VapoursynthDecoder {
    /// # Errors
    ///
    /// - If sourcing an invalid Vapoursynth script.
    /// - If using a Vapoursynth script that contains an unsupported video
    ///   format.
    #[inline]
    pub fn new(source: &Path) -> anyhow::Result<Self> {
        let env = Environment::from_file(source, EvalFlags::SetWorkingDir)?;
        let total_frames = {
            let (node, _) = env.get_output(OUTPUT_INDEX)?;
            get_num_frames(node.info())?
        };
        Ok(Self {
            env,
            frames_read: 0,
            total_frames,
        })
    }

    /// # Errors
    ///
    /// - If sourcing an invalid Vapoursynth script.
    /// - If using a Vapoursynth script that contains an unsupported video
    ///   format.
    #[inline]
    pub fn get_video_details(&self) -> anyhow::Result<VideoDetails> {
        let (node, _) = self.env.get_output(OUTPUT_INDEX)?;
        let info = node.info();
        let (width, height) = get_resolution(info)?;
        Ok(VideoDetails {
            width,
            height,
            bit_depth: get_bit_depth(info)?,
            chroma_sampling: get_chroma_sampling(info)?,
            time_base: get_time_base(info)?,
        })
    }
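    // Sketch of driving this decoder on its own, assuming an existing
    // `input.vpy` script and an 8-bit clip (both hypothetical):
    //
    //     let mut dec = VapoursynthDecoder::new(Path::new("input.vpy"))?;
    //     let details = dec.get_video_details()?;
    //     let frame = dec.read_video_frame::<u8>(&details)?;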
    /// # Errors
    ///
    /// - If sourcing an invalid Vapoursynth script.
    /// - If using a Vapoursynth script that contains an unsupported video
    ///   format.
    /// - If a frame cannot be read.
    #[allow(clippy::transmute_ptr_to_ptr)]
    #[inline]
    pub fn read_video_frame<T: Pixel>(&mut self, cfg: &VideoDetails) -> anyhow::Result<Frame<T>> {
        const SB_SIZE_LOG2: usize = 6;
        const SB_SIZE: usize = 1 << SB_SIZE_LOG2;
        const SUBPEL_FILTER_SIZE: usize = 8;
        const FRAME_MARGIN: usize = 16 + SUBPEL_FILTER_SIZE;
        const LUMA_PADDING: usize = SB_SIZE + FRAME_MARGIN;

        if self.frames_read >= self.total_frames {
            bail!("No frames left");
        }
        let (node, _) = self.env.get_output(OUTPUT_INDEX)?;
        let vs_frame = node.get_frame(self.frames_read)?;
        self.frames_read += 1;

        let bytes = size_of::<T>();
        let mut f: Frame<T> =
            Frame::new_with_padding(cfg.width, cfg.height, cfg.chroma_sampling, LUMA_PADDING);

        // SAFETY: We are using the stride to compute the length of the data slice
        unsafe {
            f.planes[0].copy_from_raw_u8(
                slice::from_raw_parts(
                    vs_frame.data_ptr(0),
                    vs_frame.stride(0) * vs_frame.height(0),
                ),
                vs_frame.stride(0),
                bytes,
            );
            f.planes[1].copy_from_raw_u8(
                slice::from_raw_parts(
                    vs_frame.data_ptr(1),
                    vs_frame.stride(1) * vs_frame.height(1),
                ),
                vs_frame.stride(1),
                bytes,
            );
            f.planes[2].copy_from_raw_u8(
                slice::from_raw_parts(
                    vs_frame.data_ptr(2),
                    vs_frame.stride(2) * vs_frame.height(2),
                ),
                vs_frame.stride(2),
                bytes,
            );
        }

        Ok(f)
    }
}

/// Get the number of frames from a Vapoursynth `VideoInfo` struct.
fn get_num_frames(info: VideoInfo) -> anyhow::Result<usize> {
    let num_frames = {
        if Property::Variable == info.format {
            bail!("Cannot output clips with varying format");
        }
        if Property::Variable == info.resolution {
            bail!("Cannot output clips with varying dimensions");
        }
        if Property::Variable == info.framerate {
            bail!("Cannot output clips with varying framerate");
        }

        info.num_frames
    };

    ensure!(num_frames != 0, "vapoursynth reported 0 frames");

    Ok(num_frames)
}

/// Get the bit depth from a Vapoursynth `VideoInfo` struct.
fn get_bit_depth(info: VideoInfo) -> anyhow::Result<usize> {
    let bits_per_sample = {
        match info.format {
            Property::Variable => {
                bail!("Cannot output clips with variable format");
            }
            Property::Constant(x) => x.bits_per_sample(),
        }
    };

    Ok(bits_per_sample as usize)
}

/// Get the resolution from a Vapoursynth `VideoInfo` struct.
fn get_resolution(info: VideoInfo) -> anyhow::Result<(usize, usize)> {
    let resolution = {
        match info.resolution {
            Property::Variable => {
                bail!("Cannot output clips with variable resolution");
            }
            Property::Constant(x) => x,
        }
    };

    Ok((resolution.width, resolution.height))
}

/// Get the time base (inverse of frame rate) from a Vapoursynth `VideoInfo`
/// struct.
fn get_time_base(info: VideoInfo) -> anyhow::Result<Rational32> {
    match info.framerate {
        Property::Variable => bail!("Cannot output clips with varying framerate"),
        Property::Constant(fps) => Ok(Rational32::new(
            fps.denominator as i32,
            fps.numerator as i32,
        )),
    }
}
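// Worked example (illustrative): a clip reported as 30000/1001 fps yields a
// time base of `Rational32::new(1001, 30000)`, i.e. 1001/30000 seconds per
// frame, since the time base is the reciprocal of the frame rate.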
/// Get the chroma sampling from a Vapoursynth `VideoInfo` struct.
fn get_chroma_sampling(info: VideoInfo) -> anyhow::Result<ChromaSampling> {
    match info.format {
        Property::Variable => bail!("Variable pixel format not supported"),
        Property::Constant(x) => match x.color_family() {
            vapoursynth::format::ColorFamily::YUV => {
                let ss = (x.sub_sampling_w(), x.sub_sampling_h());
                match ss {
                    (1, 1) => Ok(ChromaSampling::Cs420),
                    (1, 0) => Ok(ChromaSampling::Cs422),
                    (0, 0) => Ok(ChromaSampling::Cs444),
                    _ => bail!("Unrecognized chroma subsampling"),
                }
            }
            vapoursynth::format::ColorFamily::Gray => Ok(ChromaSampling::Cs400),
            _ => bail!("Currently only YUV and Gray input is supported"),
        },
    }
}
av-scenechange-0.14.1/src/y4m.rs000064400000000000000000000046211046102023000144270ustar 00000000000000
use std::io::Read;

use num_rational::Rational32;
use v_frame::{
    frame::Frame,
    pixel::{ChromaSampling, Pixel},
};

use crate::decoder::VideoDetails;

pub fn get_video_details<R: Read>(dec: &y4m::Decoder<R>) -> VideoDetails {
    let width = dec.get_width();
    let height = dec.get_height();
    let color_space = dec.get_colorspace();
    let bit_depth = color_space.get_bit_depth();
    let chroma_sampling = map_y4m_color_space(color_space);
    let framerate = dec.get_framerate();
    let time_base = Rational32::new(framerate.den as i32, framerate.num as i32);

    VideoDetails {
        width,
        height,
        bit_depth,
        chroma_sampling,
        time_base,
    }
}

const fn map_y4m_color_space(color_space: y4m::Colorspace) -> ChromaSampling {
    use y4m::Colorspace::{
        C420jpeg, C420mpeg2, C420p10, C420p12, C420paldv, C422p10, C422p12, C444p10, C444p12,
        Cmono, Cmono12, C420, C422, C444,
    };
    use ChromaSampling::{Cs400, Cs420, Cs422, Cs444};
    match color_space {
        Cmono | Cmono12 => Cs400,
        C420jpeg | C420paldv => Cs420,
        C420mpeg2 => Cs420,
        C420 | C420p10 | C420p12 => Cs420,
        C422 | C422p10 | C422p12 => Cs422,
        C444 | C444p10 | C444p12 => Cs444,
        _ => unimplemented!(),
    }
}

pub fn read_video_frame<R: Read, T: Pixel>(
    dec: &mut y4m::Decoder<R>,
    cfg: &VideoDetails,
) -> anyhow::Result<Frame<T>> {
    const SB_SIZE_LOG2: usize = 6;
    const SB_SIZE: usize = 1 << SB_SIZE_LOG2;
    const SUBPEL_FILTER_SIZE: usize = 8;
    const FRAME_MARGIN: usize = 16 + SUBPEL_FILTER_SIZE;
    const LUMA_PADDING: usize = SB_SIZE + FRAME_MARGIN;

    let bytes = dec.get_bytes_per_sample();
    dec.read_frame()
        .map(|frame| {
            let mut f: Frame<T> =
                Frame::new_with_padding(cfg.width, cfg.height, cfg.chroma_sampling, LUMA_PADDING);
            let (chroma_width, _) = cfg
                .chroma_sampling
                .get_chroma_dimensions(cfg.width, cfg.height);
            f.planes[0].copy_from_raw_u8(frame.get_y_plane(), cfg.width * bytes, bytes);
            f.planes[1].copy_from_raw_u8(frame.get_u_plane(), chroma_width * bytes, bytes);
            f.planes[2].copy_from_raw_u8(frame.get_v_plane(), chroma_width * bytes, bytes);
            f
        })
        .map_err(|e| e.into())
}
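// Sketch of using these y4m helpers directly, assuming an 8-bit clip at a
// hypothetical path (the CLI in src/main.rs constructs the same
// `y4m::Decoder` before wrapping it in the `Decoder` enum):
//
//     let file = std::fs::File::open("input.y4m")?;
//     let mut dec = y4m::Decoder::new(std::io::BufReader::new(file))?;
//     let details = get_video_details(&dec);
//     let frame = read_video_frame::<_, u8>(&mut dec, &details)?;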