simsimd-6.5.12/.cargo_vcs_info.json
{
"git": {
"sha1": "3e18990b79bd09aeb4a28fa1972f2d069bd47fdf"
},
"path_in_vcs": ""
}
simsimd-6.5.12/Cargo.lock
# This file is automatically @generated by Cargo.
# It is not intended for manual editing.
version = 3
[[package]]
name = "aho-corasick"
version = "1.1.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ddd31a130427c27518df266943a5308ed92d4b226cc639f5a8f1002816174301"
dependencies = [
"memchr",
]
[[package]]
name = "anes"
version = "0.1.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299"
[[package]]
name = "anstyle"
version = "1.0.13"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5192cca8006f1fd4f7237516f40fa183bb07f8fbdfedaa0036de5ea9b0b45e78"
[[package]]
name = "autocfg"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8"
[[package]]
name = "bumpalo"
version = "3.19.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5dd9dc738b7a8311c7ade152424974d8115f2cdad61e8dab8dac9f2362298510"
[[package]]
name = "cast"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5"
[[package]]
name = "cc"
version = "1.2.50"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9f50d563227a1c37cc0a263f64eca3334388c01c5e4c4861a9def205c614383c"
dependencies = [
"find-msvc-tools",
"shlex",
]
[[package]]
name = "cfg-if"
version = "1.0.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801"
[[package]]
name = "ciborium"
version = "0.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "42e69ffd6f0917f5c029256a24d0161db17cea3997d185db0d35926308770f0e"
dependencies = [
"ciborium-io",
"ciborium-ll",
"serde",
]
[[package]]
name = "ciborium-io"
version = "0.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "05afea1e0a06c9be33d539b876f1ce3692f4afea2cb41f740e7743225ed1c757"
[[package]]
name = "ciborium-ll"
version = "0.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "57663b653d948a338bfb3eeba9bb2fd5fcfaecb9e199e87e1eda4d9e8b240fd9"
dependencies = [
"ciborium-io",
"half",
]
[[package]]
name = "clap"
version = "4.5.53"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c9e340e012a1bf4935f5282ed1436d1489548e8f72308207ea5df0e23d2d03f8"
dependencies = [
"clap_builder",
]
[[package]]
name = "clap_builder"
version = "4.5.53"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d76b5d13eaa18c901fd2f7fca939fefe3a0727a953561fefdf3b2922b8569d00"
dependencies = [
"anstyle",
"clap_lex",
]
[[package]]
name = "clap_lex"
version = "0.7.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a1d728cc89cf3aee9ff92b05e62b19ee65a02b5702cff7d5a377e32c6ae29d8d"
[[package]]
name = "criterion"
version = "0.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e1c047a62b0cc3e145fa84415a3191f628e980b194c2755aa12300a4e6cbd928"
dependencies = [
"anes",
"cast",
"ciborium",
"clap",
"criterion-plot",
"itertools",
"num-traits",
"oorandom",
"plotters",
"rayon",
"regex",
"serde",
"serde_json",
"tinytemplate",
"walkdir",
]
[[package]]
name = "criterion-plot"
version = "0.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9b1bcc0dc7dfae599d84ad0b1a55f80cde8af3725da8313b528da95ef783e338"
dependencies = [
"cast",
"itertools",
]
[[package]]
name = "crossbeam-deque"
version = "0.8.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51"
dependencies = [
"crossbeam-epoch",
"crossbeam-utils",
]
[[package]]
name = "crossbeam-epoch"
version = "0.9.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e"
dependencies = [
"crossbeam-utils",
]
[[package]]
name = "crossbeam-utils"
version = "0.8.21"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28"
[[package]]
name = "crunchy"
version = "0.2.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "460fbee9c2c2f33933d720630a6a0bac33ba7053db5344fac858d4b8952d77d5"
[[package]]
name = "either"
version = "1.15.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719"
[[package]]
name = "find-msvc-tools"
version = "0.1.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3a3076410a55c90011c298b04d0cfa770b00fa04e1e3c97d3f6c9de105a03844"
[[package]]
name = "getrandom"
version = "0.3.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "899def5c37c4fd7b2664648c28120ecec138e4d395b459e5ca34f9cce2dd77fd"
dependencies = [
"cfg-if",
"libc",
"r-efi",
"wasip2",
]
[[package]]
name = "half"
version = "2.7.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6ea2d84b969582b4b1864a92dc5d27cd2b77b622a8d79306834f1be5ba20d84b"
dependencies = [
"cfg-if",
"crunchy",
"zerocopy",
]
[[package]]
name = "itertools"
version = "0.13.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "413ee7dfc52ee1a4949ceeb7dbc8a33f2d6c088194d9f922fb8318faf1f01186"
dependencies = [
"either",
]
[[package]]
name = "itoa"
version = "1.0.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c"
[[package]]
name = "js-sys"
version = "0.3.83"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "464a3709c7f55f1f721e5389aa6ea4e3bc6aba669353300af094b29ffbdde1d8"
dependencies = [
"once_cell",
"wasm-bindgen",
]
[[package]]
name = "libc"
version = "0.2.178"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "37c93d8daa9d8a012fd8ab92f088405fb202ea0b6ab73ee2482ae66af4f42091"
[[package]]
name = "memchr"
version = "2.7.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f52b00d39961fc5b2736ea853c9cc86238e165017a493d1d5c8eac6bdc4cc273"
[[package]]
name = "num-traits"
version = "0.2.19"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841"
dependencies = [
"autocfg",
]
[[package]]
name = "once_cell"
version = "1.21.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d"
[[package]]
name = "oorandom"
version = "11.1.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d6790f58c7ff633d8771f42965289203411a5e5c68388703c06e14f24770b41e"
[[package]]
name = "plotters"
version = "0.3.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5aeb6f403d7a4911efb1e33402027fc44f29b5bf6def3effcc22d7bb75f2b747"
dependencies = [
"num-traits",
"plotters-backend",
"plotters-svg",
"wasm-bindgen",
"web-sys",
]
[[package]]
name = "plotters-backend"
version = "0.3.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "df42e13c12958a16b3f7f4386b9ab1f3e7933914ecea48da7139435263a4172a"
[[package]]
name = "plotters-svg"
version = "0.3.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "51bae2ac328883f7acdfea3d66a7c35751187f870bc81f94563733a154d7a670"
dependencies = [
"plotters-backend",
]
[[package]]
name = "ppv-lite86"
version = "0.2.21"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "85eae3c4ed2f50dcfe72643da4befc30deadb458a9b590d720cde2f2b1e97da9"
dependencies = [
"zerocopy",
]
[[package]]
name = "proc-macro2"
version = "1.0.103"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5ee95bc4ef87b8d5ba32e8b7714ccc834865276eab0aed5c9958d00ec45f49e8"
dependencies = [
"unicode-ident",
]
[[package]]
name = "quote"
version = "1.0.42"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a338cc41d27e6cc6dce6cefc13a0729dfbb81c262b1f519331575dd80ef3067f"
dependencies = [
"proc-macro2",
]
[[package]]
name = "r-efi"
version = "5.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f"
[[package]]
name = "rand"
version = "0.9.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6db2770f06117d490610c7488547d543617b21bfa07796d7a12f6f1bd53850d1"
dependencies = [
"rand_chacha",
"rand_core",
]
[[package]]
name = "rand_chacha"
version = "0.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d3022b5f1df60f26e1ffddd6c66e8aa15de382ae63b3a0c1bfc0e4d3e3f325cb"
dependencies = [
"ppv-lite86",
"rand_core",
]
[[package]]
name = "rand_core"
version = "0.9.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "99d9a13982dcf210057a8a78572b2217b667c3beacbf3a0d8b454f6f82837d38"
dependencies = [
"getrandom",
]
[[package]]
name = "rayon"
version = "1.11.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "368f01d005bf8fd9b1206fb6fa653e6c4a81ceb1466406b81792d87c5677a58f"
dependencies = [
"either",
"rayon-core",
]
[[package]]
name = "rayon-core"
version = "1.13.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "22e18b0f0062d30d4230b2e85ff77fdfe4326feb054b9783a3460d8435c8ab91"
dependencies = [
"crossbeam-deque",
"crossbeam-utils",
]
[[package]]
name = "regex"
version = "1.12.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "843bc0191f75f3e22651ae5f1e72939ab2f72a4bc30fa80a066bd66edefc24d4"
dependencies = [
"aho-corasick",
"memchr",
"regex-automata",
"regex-syntax",
]
[[package]]
name = "regex-automata"
version = "0.4.13"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5276caf25ac86c8d810222b3dbb938e512c55c6831a10f3e6ed1c93b84041f1c"
dependencies = [
"aho-corasick",
"memchr",
"regex-syntax",
]
[[package]]
name = "regex-syntax"
version = "0.8.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7a2d987857b319362043e95f5353c0535c1f58eec5336fdfcf626430af7def58"
[[package]]
name = "rustversion"
version = "1.0.22"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d"
[[package]]
name = "ryu"
version = "1.0.21"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "62049b2877bf12821e8f9ad256ee38fdc31db7387ec2d3b3f403024de2034aea"
[[package]]
name = "same-file"
version = "1.0.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502"
dependencies = [
"winapi-util",
]
[[package]]
name = "serde"
version = "1.0.228"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e"
dependencies = [
"serde_core",
"serde_derive",
]
[[package]]
name = "serde_core"
version = "1.0.228"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad"
dependencies = [
"serde_derive",
]
[[package]]
name = "serde_derive"
version = "1.0.228"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "serde_json"
version = "1.0.145"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "402a6f66d8c709116cf22f558eab210f5a50187f702eb4d7e5ef38d9a7f1c79c"
dependencies = [
"itoa",
"memchr",
"ryu",
"serde",
"serde_core",
]
[[package]]
name = "shlex"
version = "1.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64"
[[package]]
name = "simsimd"
version = "6.5.12"
dependencies = [
"cc",
"criterion",
"half",
"rand",
]
[[package]]
name = "syn"
version = "2.0.111"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "390cc9a294ab71bdb1aa2e99d13be9c753cd2d7bd6560c77118597410c4d2e87"
dependencies = [
"proc-macro2",
"quote",
"unicode-ident",
]
[[package]]
name = "tinytemplate"
version = "1.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "be4d6b5f19ff7664e8c98d03e2139cb510db9b0a60b55f8e8709b689d939b6bc"
dependencies = [
"serde",
"serde_json",
]
[[package]]
name = "unicode-ident"
version = "1.0.22"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9312f7c4f6ff9069b165498234ce8be658059c6728633667c526e27dc2cf1df5"
[[package]]
name = "walkdir"
version = "2.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "29790946404f91d9c5d06f9874efddea1dc06c5efe94541a7d6863108e3a5e4b"
dependencies = [
"same-file",
"winapi-util",
]
[[package]]
name = "wasip2"
version = "1.0.1+wasi-0.2.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0562428422c63773dad2c345a1882263bbf4d65cf3f42e90921f787ef5ad58e7"
dependencies = [
"wit-bindgen",
]
[[package]]
name = "wasm-bindgen"
version = "0.2.106"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0d759f433fa64a2d763d1340820e46e111a7a5ab75f993d1852d70b03dbb80fd"
dependencies = [
"cfg-if",
"once_cell",
"rustversion",
"wasm-bindgen-macro",
"wasm-bindgen-shared",
]
[[package]]
name = "wasm-bindgen-macro"
version = "0.2.106"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "48cb0d2638f8baedbc542ed444afc0644a29166f1595371af4fecf8ce1e7eeb3"
dependencies = [
"quote",
"wasm-bindgen-macro-support",
]
[[package]]
name = "wasm-bindgen-macro-support"
version = "0.2.106"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cefb59d5cd5f92d9dcf80e4683949f15ca4b511f4ac0a6e14d4e1ac60c6ecd40"
dependencies = [
"bumpalo",
"proc-macro2",
"quote",
"syn",
"wasm-bindgen-shared",
]
[[package]]
name = "wasm-bindgen-shared"
version = "0.2.106"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cbc538057e648b67f72a982e708d485b2efa771e1ac05fec311f9f63e5800db4"
dependencies = [
"unicode-ident",
]
[[package]]
name = "web-sys"
version = "0.3.83"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9b32828d774c412041098d182a8b38b16ea816958e07cf40eec2bc080ae137ac"
dependencies = [
"js-sys",
"wasm-bindgen",
]
[[package]]
name = "winapi-util"
version = "0.1.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c2a7b1c03c876122aa43f3020e6c3c3ee5c05081c9a00739faf7503aeba10d22"
dependencies = [
"windows-sys",
]
[[package]]
name = "windows-link"
version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5"
[[package]]
name = "windows-sys"
version = "0.61.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ae137229bcbd6cdf0f7b80a31df61766145077ddf49416a728b02cb3921ff3fc"
dependencies = [
"windows-link",
]
[[package]]
name = "wit-bindgen"
version = "0.46.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f17a85883d4e6d00e8a97c586de764dabcc06133f7f1d55dce5cdc070ad7fe59"
[[package]]
name = "zerocopy"
version = "0.8.31"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fd74ec98b9250adb3ca554bdde269adf631549f51d8a8f8f0a10b50f1cb298c3"
dependencies = [
"zerocopy-derive",
]
[[package]]
name = "zerocopy-derive"
version = "0.8.31"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d8a8d209fdf45cf5138cbb5a506f6b52522a25afccc534d1475dad8e31105c6a"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
simsimd-6.5.12/Cargo.toml
# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO
#
# When uploading crates to the registry Cargo will automatically
# "normalize" Cargo.toml files for maximal compatibility
# with all versions of Cargo and also rewrite `path` dependencies
# to registry (e.g., crates.io) dependencies.
#
# If you are reading this file be aware that the original Cargo.toml
# will likely look very different (and much more reasonable).
# See Cargo.toml.orig for the original contents.
[package]
edition = "2021"
rust-version = "1.64"
name = "simsimd"
version = "6.5.12"
authors = ["Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>"]
build = "build.rs"
include = [
"rust/**",
"c/**",
"include/**",
"build.rs",
]
autolib = false
autobins = false
autoexamples = false
autotests = false
autobenches = false
description = "Portable mixed-precision BLAS-like vector math library for x86 and ARM"
homepage = "https://ashvardanian.com/posts/simsimd-faster-scipy"
documentation = "https://docs.rs/simsimd"
readme = "README.md"
keywords = [
"simd",
"search",
"linear-algebra",
"vector",
]
categories = [
"mathematics",
"hardware-support",
"no-std",
"wasm",
"external-ffi-bindings",
]
license = "Apache-2.0"
repository = "https://github.com/ashvardanian/SimSIMD"
[features]
default = []
std = []
[lib]
name = "simsimd"
path = "rust/lib.rs"
[dev-dependencies.criterion]
version = "0.7.0"
[dev-dependencies.half]
version = "2.6.0"
[dev-dependencies.rand]
version = "0.9.1"
[build-dependencies.cc]
version = "1.2.36"
[profile.bench]
opt-level = 3
lto = true
codegen-units = 1
rpath = false
simsimd-6.5.12/Cargo.toml.orig 0000644 0000000 0000000 00000002400 10461020230 0014247 0 ustar 0000000 0000000 [package]
name = "simsimd"
description = "Portable mixed-precision BLAS-like vector math library for x86 and ARM"
version = "6.5.12"
edition = "2021"
license = "Apache-2.0"
authors = ["Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>"]
repository = "https://github.com/ashvardanian/SimSIMD"
documentation = "https://docs.rs/simsimd"
homepage = "https://ashvardanian.com/posts/simsimd-faster-scipy"
keywords = ["simd", "search", "linear-algebra", "vector"]
categories = [
"mathematics",
"hardware-support",
"no-std",
"wasm",
"external-ffi-bindings",
]
rust-version = "1.64" # Introduced Core C FFI in stable Rust
include = ["rust/**", "c/**", "include/**", "build.rs"]
[lib]
name = "simsimd"
path = "rust/lib.rs"
[build-dependencies]
cc = "1.2.36"
[[bench]]
name = "bench"
harness = false
path = "scripts/bench.rs"
[profile.bench]
opt-level = 3 # Corresponds to -O3
lto = true # Enables Link Time Optimization for further optimizations
codegen-units = 1 # May increase compilation time but optimizes further
rpath = false # On some systems, setting this to false can help with optimizations
[features]
default = []
std = []
[dev-dependencies]
criterion = { version = "0.7.0" }
rand = { version = "0.9.1" }
half = { version = "2.6.0" }
simsimd-6.5.12/README.md
Computing dot-products, similarity measures, and distances between low- and high-dimensional vectors is ubiquitous in Machine Learning, Scientific Computing, Geospatial Analysis, and Information Retrieval.
These algorithms generally have linear complexity in time, constant or linear complexity in space, and are data-parallel.
In other words, they are easily parallelizable and vectorizable, and are often available in packages like BLAS (level 1) and LAPACK, as well as the higher-level `numpy` and `scipy` Python libraries.
Ironically, even with decades of evolution in compilers and numerical computing, [most libraries can be 3-200x slower than hardware potential][benchmarks] even on the most popular hardware, like 64-bit x86 and Arm CPUs.
Moreover, most lack mixed-precision support, which is crucial for modern AI!
The rare few that offer even minimal mixed-precision support run on only one platform and are vendor-locked by companies like Intel and Nvidia.
SimSIMD provides an alternative.
1️⃣ SimSIMD functions are practically as fast as `memcpy`.
2️⃣ Unlike BLAS, most kernels are designed for mixed-precision and bit-level operations.
3️⃣ SimSIMD often [ships more binaries than NumPy][compatibility] and has more backends than most BLAS implementations, and more high-level interfaces than most libraries.
[benchmarks]: https://ashvardanian.com/posts/simsimd-faster-scipy
[compatibility]: https://pypi.org/project/simsimd/#files
## Features
__SimSIMD__ (Arabic: "سيمسيم دي") is a mixed-precision math library of __over 350 SIMD-optimized kernels__ extensively used in AI, Search, and DBMS workloads.
Named after the iconic ["Open Sesame"](https://en.wikipedia.org/wiki/Open_sesame) command that opened doors to treasure in _Ali Baba and the Forty Thieves_, SimSIMD can help you 10x the cost-efficiency of your computational pipelines.
Implemented distance functions include:
- Euclidean (L2) and Cosine (Angular) spatial distances for Vector Search. _[docs][docs-spatial]_
- Dot-Products for real & complex vectors for DSP & Quantum computing. _[docs][docs-dot]_
- Hamming (~ Manhattan) and Jaccard (~ Tanimoto) bit-level distances. _[docs][docs-binary]_
- Set Intersections for Sparse Vectors and Text Analysis. _[docs][docs-sparse]_
- Mahalanobis distance and Quadratic forms for Scientific Computing. _[docs][docs-curved]_
- Kullback-Leibler and Jensen–Shannon divergences for probability distributions. _[docs][docs-probability]_
- Fused-Multiply-Add (FMA) and Weighted Sums to replace BLAS level 1 functions. _[docs][docs-fma]_
- For Levenshtein, Needleman–Wunsch, and Smith-Waterman, check [StringZilla][stringzilla].
- 🔜 Haversine and Vincenty's formulae for Geospatial Analysis.
[docs-spatial]: #cosine-similarity-reciprocal-square-root-and-newton-raphson-iteration
[docs-curved]: #curved-spaces-mahalanobis-distance-and-bilinear-quadratic-forms
[docs-sparse]: #set-intersection-galloping-and-binary-search
[docs-binary]: https://github.com/ashvardanian/SimSIMD/pull/138
[docs-dot]: #complex-dot-products-conjugate-dot-products-and-complex-numbers
[docs-probability]: #logarithms-in-kullback-leibler--jensenshannon-divergences
[docs-fma]: #mixed-precision-in-fused-multiply-add-and-weighted-sums
[scipy]: https://docs.scipy.org/doc/scipy/reference/spatial.distance.html#module-scipy.spatial.distance
[numpy]: https://numpy.org/doc/stable/reference/generated/numpy.inner.html
[stringzilla]: https://github.com/ashvardanian/stringzilla
Moreover, SimSIMD...
- handles `float64`, `float32`, `float16`, and `bfloat16` real & complex vectors.
- handles `int8` integral, `int4` sub-byte, and `b8` binary vectors.
- handles sparse `uint32` and `uint16` sets, and weighted sparse vectors.
- is a zero-dependency [header-only C 99](#using-simsimd-in-c) library.
- has [Python](#using-simsimd-in-python), [Rust](#using-simsimd-in-rust), [JS](#using-simsimd-in-javascript), and [Swift](#using-simsimd-in-swift) bindings.
- has Arm backends for NEON, Scalable Vector Extensions (SVE), and SVE2.
- has x86 backends for Haswell, Skylake, Ice Lake, Genoa, and Sapphire Rapids.
- with both compile-time and runtime CPU feature detection easily integrates anywhere!
Due to the high level of fragmentation of SIMD support across x86 CPUs, SimSIMD generally names its backends after select Intel CPU generations.
They, however, also work on AMD CPUs.
The Haswell backend is compatible with AMD Zen 1/2/3, while the Genoa backend, targeting AMD Zen 4, covers the AVX-512 instructions added in Intel Skylake and Ice Lake.
You can learn more about the technical implementation details in the following blog-posts:
- [Uses Horner's method for polynomial approximations, beating GCC 12 by 119x](https://ashvardanian.com/posts/gcc-12-vs-avx512fp16/).
- [Uses Arm SVE and x86 AVX-512's masked loads to eliminate tail `for`-loops](https://ashvardanian.com/posts/simsimd-faster-scipy/#tails-of-the-past-the-significance-of-masked-loads).
- [Substitutes libc's `sqrt` with Newton Raphson iterations](https://github.com/ashvardanian/SimSIMD/releases/tag/v5.4.0).
- [Uses Galloping and SVE2 histograms to intersect sparse vectors](https://ashvardanian.com/posts/simd-set-intersections-sve2-avx512/).
- For Python: [avoids slow PyBind11, SWIG, & `PyArg_ParseTuple`](https://ashvardanian.com/posts/pybind11-cpython-tutorial/) [using faster calling convention](https://ashvardanian.com/posts/discount-on-keyword-arguments-in-python/).
- For JavaScript: [uses typed arrays and NAPI for zero-copy calls](https://ashvardanian.com/posts/javascript-ai-vector-search/).
## Benchmarks
| | NumPy | C 99 | SimSIMD |
| :--- | :--- | :--- | :--- |
| cosine distances between 1536d vectors in int8 | 🚧 overflows | x86: 10,548,600 ops/s <br/> arm: 11,379,300 ops/s | x86: 16,151,800 ops/s <br/> arm: 13,524,000 ops/s |
| cosine distances between 1536d vectors in bfloat16 | 🚧 not supported | x86: 119,835 ops/s <br/> arm: 403,909 ops/s | x86: 9,738,540 ops/s <br/> arm: 4,881,900 ops/s |
| cosine distances between 1536d vectors in float16 | x86: 40,481 ops/s <br/> arm: 21,451 ops/s | x86: 501,310 ops/s <br/> arm: 871,963 ops/s | x86: 7,627,600 ops/s <br/> arm: 3,316,810 ops/s |
| cosine distances between 1536d vectors in float32 | x86: 253,902 ops/s <br/> arm: 46,394 ops/s | x86: 882,484 ops/s <br/> arm: 399,661 ops/s | x86: 8,202,910 ops/s <br/> arm: 3,400,620 ops/s |
| cosine distances between 1536d vectors in float64 | x86: 212,421 ops/s <br/> arm: 52,904 ops/s | x86: 839,301 ops/s <br/> arm: 837,126 ops/s | x86: 1,538,530 ops/s <br/> arm: 1,678,920 ops/s |
| euclidean distance between 1536d vectors in int8 | x86: 252,113 ops/s <br/> arm: 177,443 ops/s | x86: 6,690,110 ops/s <br/> arm: 4,114,160 ops/s | x86: 18,989,000 ops/s <br/> arm: 18,878,200 ops/s |
| euclidean distance between 1536d vectors in bfloat16 | 🚧 not supported | x86: 119,842 ops/s <br/> arm: 1,049,230 ops/s | x86: 9,727,210 ops/s <br/> arm: 4,233,420 ops/s |
| euclidean distance between 1536d vectors in float16 | x86: 54,621 ops/s <br/> arm: 71,793 ops/s | x86: 196,413 ops/s <br/> arm: 911,370 ops/s | x86: 19,466,800 ops/s <br/> arm: 3,522,760 ops/s |
| euclidean distance between 1536d vectors in float32 | x86: 424,944 ops/s <br/> arm: 292,629 ops/s | x86: 1,295,210 ops/s <br/> arm: 1,055,940 ops/s | x86: 8,924,100 ops/s <br/> arm: 3,602,650 ops/s |
| euclidean distance between 1536d vectors in float64 | x86: 334,929 ops/s <br/> arm: 237,505 ops/s | x86: 1,215,190 ops/s <br/> arm: 905,782 ops/s | x86: 1,701,740 ops/s <br/> arm: 1,735,840 ops/s |
> For benchmarks we mostly use 1536-dimensional vectors, like the embeddings produced by the OpenAI Ada API.
> The code was compiled with GCC 12, using glibc v2.35.
> The benchmarks were performed on Arm-based AWS Graviton3 `c7g` instances and on Intel Sapphire Rapids `r7iz` instances.
> Most modern Arm-based 64-bit CPUs will have similar relative speedups.
> Variance within x86 CPUs will be larger.
Similar speedups are often observed even against the BLAS and LAPACK implementations underlying most numerical computing packages, including NumPy and SciPy in Python.
Broader benchmarking results:
- [Apple M2 Pro](https://ashvardanian.com/posts/simsimd-faster-scipy/#appendix-1-performance-on-apple-m2-pro).
- [Intel Sapphire Rapids](https://ashvardanian.com/posts/simsimd-faster-scipy/#appendix-2-performance-on-4th-gen-intel-xeon-platinum-8480).
- [AWS Graviton 3](https://ashvardanian.com/posts/simsimd-faster-scipy/#appendix-3-performance-on-aws-graviton-3).
## Using SimSIMD in Python
The package is intended to replace the usage of `numpy.inner`, `numpy.dot`, and `scipy.spatial.distance`.
Aside from drastic performance improvements, SimSIMD significantly improves accuracy in mixed precision setups.
NumPy and SciPy, processing `int8`, `uint8` or `float16` vectors, will use the same types for accumulators, while SimSIMD can combine `int8` enumeration, `int16` multiplication, and `int32` accumulation to avoid overflows entirely.
The same applies to processing `float16` and `bfloat16` values with `float32` precision.
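As a quick illustration of that claim, here is a minimal sketch relying only on the behaviors described above (NumPy's same-width accumulators versus SimSIMD's widened intermediates):

```py
import numpy as np
import simsimd

# 1536 products of 100 * 100 = 10,000 each: far outside the int8 range
vec1 = np.full(1536, 100, dtype=np.int8)
vec2 = np.full(1536, 100, dtype=np.int8)

print(np.inner(vec1, vec2))                # NumPy keeps int8 accumulators and wraps around
print(simsimd.cosine(vec1, vec2, "int8"))  # SimSIMD widens intermediates; identical vectors give ~0.0
```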
### Installation
Use the following snippet to install SimSIMD and list hardware acceleration options available on your machine:
```sh
pip install simsimd
python -c "import simsimd; print(simsimd.get_capabilities())" # for hardware introspection
python -c "import simsimd; help(simsimd)" # for documentation
```
With precompiled binaries, SimSIMD ships `.pyi` interface files for type hinting and static analysis.
You can check all the available functions in [`python/annotations/__init__.pyi`](https://github.com/ashvardanian/SimSIMD/blob/main/python/annotations/__init__.pyi).
### One-to-One Distance
```py
import simsimd
import numpy as np
vec1 = np.random.randn(1536).astype(np.float32)
vec2 = np.random.randn(1536).astype(np.float32)
dist = simsimd.cosine(vec1, vec2)
```
Supported functions include `cosine`, `inner`, `sqeuclidean`, `hamming`, `jaccard`, `kullbackleibler`, `jensenshannon`, and `intersect`.
Dot products are supported for both real and complex numbers:
```py
vec1 = np.random.randn(768).astype(np.float64) + 1j * np.random.randn(768).astype(np.float64)
vec2 = np.random.randn(768).astype(np.float64) + 1j * np.random.randn(768).astype(np.float64)
dist = simsimd.dot(vec1.astype(np.complex128), vec2.astype(np.complex128))
dist = simsimd.dot(vec1.astype(np.complex64), vec2.astype(np.complex64))
dist = simsimd.vdot(vec1.astype(np.complex64), vec2.astype(np.complex64)) # conjugate, same as `np.vdot`
```
Unlike SciPy, SimSIMD allows explicitly stating the precision of the input vectors, which is especially useful for mixed-precision setups.
The `dtype` argument can be passed both by name and as a positional argument:
```py
dist = simsimd.cosine(vec1, vec2, "int8")
dist = simsimd.cosine(vec1, vec2, "float16")
dist = simsimd.cosine(vec1, vec2, "float32")
dist = simsimd.cosine(vec1, vec2, "float64")
dist = simsimd.hamming(vec1, vec2, "bin8")
```
Binary distance functions are computed at a bit-level.
Meaning a vector of 10x 8-bit integers will be treated as a sequence of 80 individual bits or dimensions.
This differs from NumPy, which can't handle smaller-than-byte types, but you can still avoid the `bin8` argument by reinterpreting the vector as booleans:
```py
vec1 = np.packbits(np.random.randint(2, size=80).astype(np.uint8)).view(np.bool_)
vec2 = np.packbits(np.random.randint(2, size=80).astype(np.uint8)).view(np.bool_)
hamming_distance = simsimd.hamming(vec1, vec2)
jaccard_distance = simsimd.jaccard(vec1, vec2)
```
With other frameworks, like PyTorch, one can get a richer type-system than NumPy, but the lack of good CPython interoperability makes it hard to pass data without copies.
Here is an example of using SimSIMD with PyTorch to compute the cosine similarity between two `bfloat16` vectors:
```py
import numpy as np
import simsimd
buf1 = np.empty(8, dtype=np.uint16)
buf2 = np.empty(8, dtype=np.uint16)
# View the same memory region with PyTorch and randomize it
import torch
vec1 = torch.asarray(memoryview(buf1), copy=False).view(torch.bfloat16)
vec2 = torch.asarray(memoryview(buf2), copy=False).view(torch.bfloat16)
torch.randn(8, out=vec1)
torch.randn(8, out=vec2)
# Both libs will look into the same memory buffers and report the same results
dist_slow = 1 - torch.nn.functional.cosine_similarity(vec1, vec2, dim=0)
dist_fast = simsimd.cosine(buf1, buf2, "bfloat16")
```
It also allows using SimSIMD for half-precision complex numbers, which NumPy does not support.
For that, view the data as contiguous even-length `np.float16` vectors and override the type resolution with the `complex32` string.
```py
vec1 = np.random.randn(1536).astype(np.float16)
vec2 = np.random.randn(1536).astype(np.float16)
simsimd.dot(vec1, vec2, "complex32")
simsimd.vdot(vec1, vec2, "complex32")
```
When dealing with sparse representations and integer sets, you can apply the `intersect` function to two 1-dimensional arrays of `uint16` or `uint32` integers:
```py
from random import randint
import numpy as np
import simsimd as simd
length1, length2 = randint(1, 100), randint(1, 100)
vec1 = np.sort(np.random.randint(0, 1000, length1).astype(np.uint16))
vec2 = np.sort(np.random.randint(0, 1000, length2).astype(np.uint16))
slow_result = len(np.intersect1d(vec1, vec2))
fast_result = simd.intersect(vec1, vec2)
assert slow_result == fast_result
```
### One-to-Many Distances
Every distance function can be used not only for one-to-one but also one-to-many and many-to-many distance calculations.
For one-to-many:
```py
vec1 = np.random.randn(1536).astype(np.float32) # rank 1 tensor
batch1 = np.random.randn(1, 1536).astype(np.float32) # rank 2 tensor
batch2 = np.random.randn(100, 1536).astype(np.float32)
dist_rank1 = simsimd.cosine(vec1, batch2)
dist_rank2 = simsimd.cosine(batch1, batch2)
```
### Many-to-Many Distances
All distance functions in SimSIMD can be used to compute many-to-many distances.
For two batches of 100 vectors to compute 100 distances, one would call it like this:
```py
batch1 = np.random.randn(100, 1536).astype(np.float32)
batch2 = np.random.randn(100, 1536).astype(np.float32)
dist = simsimd.cosine(batch1, batch2)
```
Input matrices must have identical shapes.
This functionality isn't natively present in NumPy or SciPy; replicating it generally requires creating intermediate arrays, which is inefficient and memory-consuming.
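For comparison, a rough NumPy-only sketch of the same paired (row-wise) cosine distances, which has to materialize several temporary arrays along the way:

```py
import numpy as np

batch1 = np.random.randn(100, 1536).astype(np.float32)
batch2 = np.random.randn(100, 1536).astype(np.float32)

dots = np.einsum("ij,ij->i", batch1, batch2)                             # row-wise dot products, shape (100,)
norms = np.linalg.norm(batch1, axis=1) * np.linalg.norm(batch2, axis=1)  # products of row norms, shape (100,)
numpy_dist = 1.0 - dots / norms  # should closely match simsimd.cosine(batch1, batch2)
```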
### Many-to-Many All-Pairs Distances
One can use SimSIMD to compute distances between all possible pairs of rows across two matrices (akin to [`scipy.spatial.distance.cdist`](https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.cdist.html)).
The resulting object will have a type `DistancesTensor`, zero-copy compatible with NumPy and other libraries.
For two arrays of 10 and 1,000 entries, the resulting tensor will have 10,000 cells:
```py
import numpy as np
from simsimd import cdist, DistancesTensor
matrix1 = np.random.randn(1000, 1536).astype(np.float32)
matrix2 = np.random.randn(10, 1536).astype(np.float32)
distances: DistancesTensor = cdist(matrix1, matrix2, metric="cosine") # zero-copy, managed by SimSIMD
distances_array: np.ndarray = np.array(distances, copy=True) # now managed by NumPy
```
### Element-wise Kernels
SimSIMD also provides mixed-precision element-wise kernels, where the input vectors and the output have the same numeric type, but the intermediate accumulators are of a higher precision.
```py
import numpy as np
from simsimd import fma, wsum
# Let's take two FullHD video frames
first_frame = np.random.randint(0, 256, 1920 * 1080, dtype=np.uint8)
second_frame = np.random.randint(0, 256, 1920 * 1080, dtype=np.uint8)
average_frame = np.empty_like(first_frame)
wsum(first_frame, second_frame, alpha=0.5, beta=0.5, out=average_frame)
# Slow analog with NumPy:
slow_average_frame = (0.5 * first_frame + 0.5 * second_frame).astype(np.uint8)
```
Similarly, the `fma` takes three arguments and computes the fused multiply-add operation.
In applications like Machine Learning you may also benefit from using the "brain-float" format not natively supported by NumPy.
In 3D Graphics, for example, we can use FMA to compute the [Phong shading model](https://en.wikipedia.org/wiki/Phong_shading):
```py
# Assume a FullHD frame with random values for simplicity
light_intensity = np.random.rand(1920 * 1080).astype(np.float16) # Intensity of light on each pixel
diffuse_component = np.random.rand(1920 * 1080).astype(np.float16) # Diffuse reflectance on the surface
specular_component = np.random.rand(1920 * 1080).astype(np.float16) # Specular reflectance for highlights
output_color = np.empty_like(light_intensity) # Array to store the resulting color intensity
# Define the scaling factors for diffuse and specular contributions
alpha = 0.7 # Weight for the diffuse component
beta = 0.3 # Weight for the specular component
# Formula: color = alpha * light_intensity * diffuse_component + beta * specular_component
fma(light_intensity, diffuse_component, specular_component,
dtype="float16", # Optional, unless it can't be inferred from the input
alpha=alpha, beta=beta, out=output_color)
# Slow analog with NumPy for comparison
slow_output_color = (alpha * light_intensity * diffuse_component + beta * specular_component).astype(np.float16)
```
### Multithreading and Memory Usage
By default, computations use a single CPU core.
To override this behavior, use the `threads` argument.
Set it to `0` to use all available CPU cores and let the underlying C library manage the thread pool.
Here is an example of dealing with large sets of binary vectors:
```py
ndim = 1536 # OpenAI Ada embeddings
matrix1 = np.packbits(np.random.randint(2, size=(10_000, ndim)).astype(np.uint8))
matrix2 = np.packbits(np.random.randint(2, size=(1_000, ndim)).astype(np.uint8))
distances = simsimd.cdist(matrix1, matrix2,
metric="hamming", # Unlike SciPy, SimSIMD doesn't divide by the number of dimensions
out_dtype="uint8", # so we can use `uint8` instead of `float64` to save memory.
threads=0, # Use all CPU cores with OpenMP.
dtype="bin8", # Override input argument type to `bin8` eight-bit words.
)
```
Alternatively, when using free-threading Python 3.13t builds, one can combine single-threaded SimSIMD operations with Python's `concurrent.futures.ThreadPoolExecutor` to parallelize the computations.
By default, the output distances will be stored in double-precision `float64` floating-point numbers.
That behavior may not be space-efficient, especially if you are computing the Hamming distance between short binary vectors, which will generally fit into 8x smaller `uint8` or `uint16` types.
To override this behavior, use the `out_dtype` argument, or consider pre-allocating the output array and passing it to the `out` argument.
A more complete example may look like this:
```py
from multiprocessing import cpu_count
from concurrent.futures import ThreadPoolExecutor
from simsimd import cosine
import numpy as np
# Generate large dataset
vectors_a = np.random.rand(100_000, 1536).astype(np.float32)
vectors_b = np.random.rand(100_000, 1536).astype(np.float32)
distances = np.zeros((100_000,), dtype=np.float32)
def compute_batch(start_idx, end_idx):
batch_a = vectors_a[start_idx:end_idx]
batch_b = vectors_b[start_idx:end_idx]
cosine(batch_a, batch_b, out=distances[start_idx:end_idx])
# Use all CPU cores with true parallelism (no GIL!)
num_threads = cpu_count()
chunk_size = len(vectors_a) // num_threads
with ThreadPoolExecutor(max_workers=num_threads) as executor:
futures = []
for i in range(num_threads):
start_idx = i * chunk_size
end_idx = (i + 1) * chunk_size if i < num_threads - 1 else len(vectors_a)
futures.append(executor.submit(compute_batch, start_idx, end_idx))
# Collect results from all threads
results = [future.result() for future in futures]
```
### Half-Precision Brain-Float Numbers
The "brain-float-16" is a popular machine learning format.
It's broadly supported in hardware and is very machine-friendly, but software support is still lagging behind.
[Unlike NumPy](https://github.com/numpy/numpy/issues/19808), you can already use `bf16` datatype in SimSIMD.
Luckily, to downcast `f32` to `bf16` you only have to drop the last 16 bits:
```py
import numpy as np
import simsimd as simd
ndim = 1536
a = np.random.randn(ndim).astype(np.float32)
b = np.random.randn(ndim).astype(np.float32)
# NumPy doesn't natively support brain-float, so we need a trick!
# Luckily, it's very easy to reduce the representation accuracy
# by simply masking the low 16-bits of our 32-bit single-precision
# numbers. We can also add `0x8000` to round the numbers.
a_f32rounded = ((a.view(np.uint32) + 0x8000) & 0xFFFF0000).view(np.float32)
b_f32rounded = ((b.view(np.uint32) + 0x8000) & 0xFFFF0000).view(np.float32)
# To represent them as brain-floats, we need to drop the second half
a_bf16 = np.right_shift(a_f32rounded.view(np.uint32), 16).astype(np.uint16)
b_bf16 = np.right_shift(b_f32rounded.view(np.uint32), 16).astype(np.uint16)
# Now we can compare the results
expected = np.inner(a_f32rounded, b_f32rounded)
result = simd.inner(a_bf16, b_bf16, "bf16")
```
### Helper Functions
You can turn specific backends on or off depending on the exact environment.
A common case may be avoiding AVX-512 on older AMD CPUs and [Intel Ice Lake](https://travisdowns.github.io/blog/2020/08/19/icl-avx512-freq.html) CPUs to ensure the CPU doesn't change the frequency license and throttle performance.
```py
$ simsimd.get_capabilities()
> {'serial': True, 'neon': False, 'sve': False, 'neon_f16': False, 'sve_f16': False, 'neon_bf16': False, 'sve_bf16': False, 'neon_i8': False, 'sve_i8': False, 'haswell': True, 'skylake': True, 'ice': True, 'genoa': True, 'sapphire': True, 'turin': True}
$ simsimd.disable_capability("sapphire")
$ simsimd.enable_capability("sapphire")
```
### Using Python API with USearch
Want to use it in Python with [USearch](https://github.com/unum-cloud/usearch)?
You can wrap the raw C function pointers SimSIMD backends into a `CompiledMetric` and pass it to USearch, similar to how it handles Numba's JIT-compiled code.
```py
from usearch.index import Index, CompiledMetric, MetricKind, MetricSignature
from simsimd import pointer_to_sqeuclidean, pointer_to_cosine, pointer_to_inner
metric = CompiledMetric(
pointer=pointer_to_cosine("f16"),
kind=MetricKind.Cos,
signature=MetricSignature.ArrayArraySize,
)
index = Index(256, metric=metric)
```
## Using SimSIMD in Rust
To install, add the following to your `Cargo.toml`:
```toml
[dependencies]
simsimd = "..."
```
Before using the SimSIMD library, ensure you have imported the necessary traits and types into your Rust source file.
The library provides several traits for different distance/similarity kinds - `SpatialSimilarity`, `BinarySimilarity`, and `ProbabilitySimilarity`.
### Spatial Similarity: Cosine and Euclidean Distances
```rust
use simsimd::SpatialSimilarity;
fn main() {
let vector_a: Vec<f32> = vec![1.0, 2.0, 3.0];
let vector_b: Vec<f32> = vec![4.0, 5.0, 6.0];
// Compute the cosine distance between vectors
let cosine_distance = f32::cosine(&vector_a, &vector_b)
.expect("Vectors must be of the same length");
println!("Cosine Distance: {}", cosine_distance);
// Compute the squared Euclidean distance between vectors
let sq_euclidean_distance = f32::sqeuclidean(&vector_a, &vector_b)
.expect("Vectors must be of the same length");
println!("Squared Euclidean Distance: {}", sq_euclidean_distance);
}
```
Spatial similarity functions are available for `f64`, `f32`, `f16`, and `i8` types.
### Dot-Products: Inner and Complex Inner Products
```rust
use simsimd::SpatialSimilarity;
use simsimd::ComplexProducts;
fn main() {
// Complex vectors have interleaved real & imaginary components
let vector_a: Vec<f32> = vec![1.0, 2.0, 3.0, 4.0];
let vector_b: Vec<f32> = vec![5.0, 6.0, 7.0, 8.0];
// Compute the inner product between vectors
let inner_product = SpatialSimilarity::dot(&vector_a, &vector_b)
.expect("Vectors must be of the same length");
println!("Inner Product: {}", inner_product);
// Compute the complex inner product between vectors
let complex_inner_product = ComplexProducts::dot(&vector_a, &vector_b)
.expect("Vectors must be of the same length");
let complex_conjugate_inner_product = ComplexProducts::vdot(&vector_a, &vector_b)
.expect("Vectors must be of the same length");
println!("Complex Inner Product: {:?}", complex_inner_product); // -18, 69
println!("Complex C. Inner Product: {:?}", complex_conjugate_inner_product); // 70, -8
}
```
Complex inner products are available for `f64`, `f32`, and `f16` types.
### Probability Distributions: Jensen-Shannon and Kullback-Leibler Divergences
```rust
use simsimd::ProbabilitySimilarity;
fn main() {
let vector_a: Vec<f32> = vec![1.0, 2.0, 3.0];
let vector_b: Vec<f32> = vec![4.0, 5.0, 6.0];
let jensen_shannon = f32::jensenshannon(&vector_a, &vector_b)
.expect("Vectors must be of the same length");
println!("Jensen-Shannon Divergence: {}", jensen_shannon);
let kullback_leibler = f32::kullbackleibler(&vector_a, &vector_b)
.expect("Vectors must be of the same length");
println!("Kullback-Leibler Divergence: {}", kullback_leibler);
}
```
Probability similarity functions are available for `f64`, `f32`, and `f16` types.
### Binary Similarity: Hamming and Jaccard Distances
Similar to spatial distances, one can compute bit-level distance functions between slices of unsigned integers:
```rust
use simsimd::BinarySimilarity;
fn main() {
let vector_a = &[0b11110000, 0b00001111, 0b10101010];
let vector_b = &[0b11110000, 0b00001111, 0b01010101];
// Compute the Hamming distance between vectors
let hamming_distance = u8::hamming(&vector_a, &vector_b)
.expect("Vectors must be of the same length");
println!("Hamming Distance: {}", hamming_distance);
// Compute the Jaccard distance between vectors
let jaccard_distance = u8::jaccard(&vector_a, &vector_b)
.expect("Vectors must be of the same length");
println!("Jaccard Distance: {}", jaccard_distance);
}
```
Binary similarity functions are available only for `u8` types.
### Half-Precision Floating-Point Numbers
Rust has no native support for half-precision floating-point numbers, but SimSIMD provides a `f16` type with built-in conversion methods.
The underlying `u16` representation is publicly accessible for direct bit manipulation.
```rust
use simsimd::{SpatialSimilarity, f16};
fn main() {
// Create f16 vectors using built-in conversion methods
let vector_a: Vec<f16> = vec![1.0, 2.0, 3.0].iter().map(|&x| f16::from_f32(x)).collect();
let vector_b: Vec<f16> = vec![4.0, 5.0, 6.0].iter().map(|&x| f16::from_f32(x)).collect();
// Compute the cosine distance
let cosine_distance = f16::cosine(&vector_a, &vector_b)
.expect("Vectors must be of the same length");
println!("Cosine Distance: {}", cosine_distance);
// Direct bit manipulation
let half = f16::from_f32(3.14159);
let bits = half.0; // Access raw u16 representation
let reconstructed = f16(bits);
// Convert back to f32
let float_value = half.to_f32();
}
```
For interoperability with the `half` crate:
```rust
use simsimd::{SpatialSimilarity, f16 as SimF16};
use half::f16 as HalfF16;
fn main() {
let vector_a: Vec<HalfF16> = vec![1.0, 2.0, 3.0].iter().map(|&x| HalfF16::from_f32(x)).collect();
let vector_b: Vec<HalfF16> = vec![4.0, 5.0, 6.0].iter().map(|&x| HalfF16::from_f32(x)).collect();
// Safe reinterpret cast due to identical memory layout
let buffer_a: &[SimF16] = unsafe { std::slice::from_raw_parts(vector_a.as_ptr() as *const SimF16, vector_a.len()) };
let buffer_b: &[SimF16] = unsafe { std::slice::from_raw_parts(vector_b.as_ptr() as *const SimF16, vector_b.len()) };
let cosine_distance = SimF16::cosine(buffer_a, buffer_b)
.expect("Vectors must be of the same length");
println!("Cosine Distance: {}", cosine_distance);
}
```
### Half-Precision Brain-Float Numbers
The "brain-float-16" is a popular machine learning format.
It's broadly supported in hardware and is very machine-friendly, but software support is still lagging behind.
[Unlike NumPy](https://github.com/numpy/numpy/issues/19808), you can already use `bf16` datatype in SimSIMD.
SimSIMD provides a `bf16` type with built-in conversion methods and direct bit access.
```rust
use simsimd::{SpatialSimilarity, bf16, f16};
fn main() {
// Create bf16 vectors using built-in conversion methods
let vector_a: Vec<bf16> = vec![1.0, 2.0, 3.0].iter().map(|&x| bf16::from_f32(x)).collect();
let vector_b: Vec<bf16> = vec![4.0, 5.0, 6.0].iter().map(|&x| bf16::from_f32(x)).collect();
// Compute the cosine distance
let cosine_distance = bf16::cosine(&vector_a, &vector_b)
.expect("Vectors must be of the same length");
println!("Cosine Distance: {}", cosine_distance);
// Direct bit manipulation
let brain_half = bf16::from_f32(3.14159);
let bits = brain_half.0; // Access raw u16 representation
let reconstructed = bf16(bits);
// Convert back to f32
let float_value = brain_half.to_f32();
// Compare precision differences
let original = 3.14159_f32;
let f16_roundtrip = f16::from_f32(original).to_f32();
let bf16_roundtrip = bf16::from_f32(original).to_f32();
println!("Original: {}", original);
println!("f16 roundtrip: {}", f16_roundtrip);
println!("bf16 roundtrip: {}", bf16_roundtrip);
}
```
### Dynamic Dispatch in Rust
SimSIMD provides a [dynamic dispatch](#dynamic-dispatch) mechanism to select the most advanced micro-kernel for the current CPU.
You can query the supported backends using the functions in the `capabilities` module and select the best one.
```rust
println!("uses neon: {}", capabilities::uses_neon());
println!("uses sve: {}", capabilities::uses_sve());
println!("uses haswell: {}", capabilities::uses_haswell());
println!("uses skylake: {}", capabilities::uses_skylake());
println!("uses ice: {}", capabilities::uses_ice());
println!("uses genoa: {}", capabilities::uses_genoa());
println!("uses sapphire: {}", capabilities::uses_sapphire());
println!("uses turin: {}", capabilities::uses_turin());
println!("uses sierra: {}", capabilities::uses_sierra());
```
## Using SimSIMD in JavaScript
To install, choose one of the following options depending on your environment:
- `npm install --save simsimd`
- `yarn add simsimd`
- `pnpm add simsimd`
- `bun install simsimd`
The package is distributed with prebuilt binaries, but if your platform is not supported, you can build the package from the source via `npm run build`.
This will automatically happen unless you install the package with the `--ignore-scripts` flag or use Bun.
After you install it, you will be able to call the SimSIMD functions on various `TypedArray` variants:
```js
const { sqeuclidean, cosine, inner, hamming, jaccard } = require("simsimd");
const vectorA = new Float32Array([1.0, 2.0, 3.0]);
const vectorB = new Float32Array([4.0, 5.0, 6.0]);
const distance = sqeuclidean(vectorA, vectorB);
console.log("Squared Euclidean Distance:", distance);
```
Other numeric types and precision levels are supported as well.
For double-precision floating-point numbers, use `Float64Array`:
```js
const vectorA = new Float64Array([1.0, 2.0, 3.0]);
const vectorB = new Float64Array([4.0, 5.0, 6.0]);
const distance = cosine(vectorA, vectorB);
```
When doing machine learning and vector search with high-dimensional vectors you may want to quantize them to 8-bit integers.
You may want to project values from the $[-1, 1]$ range to the $[-127, 127]$ range and then cast them to `Int8Array`:
```js
const quantizedVectorA = new Int8Array(vectorA.map((v) => v * 127));
const quantizedVectorB = new Int8Array(vectorB.map((v) => v * 127));
const distance = cosine(quantizedVectorA, quantizedVectorB);
```
A more extreme quantization case would be to use binary vectors.
You can map all positive values to `1` and all negative values and zero to `0`, packing eight values into a single byte.
After that, Hamming and Jaccard distances can be computed.
```js
const { toBinary, hamming } = require("simsimd");
const binaryVectorA = toBinary(vectorA);
const binaryVectorB = toBinary(vectorB);
const distance = hamming(binaryVectorA, binaryVectorB);
```
## Using SimSIMD in Swift
To install, simply add the following dependency to your `Package.swift`:
```swift
dependencies: [
.package(url: "https://github.com/ashvardanian/simsimd")
]
```
The package provides the most common spatial metrics for `Int8`, `Float16`, `Float32`, and `Float64` vectors.
```swift
import SimSIMD
let vectorA: [Int8] = [1, 2, 3]
let vectorB: [Int8] = [4, 5, 6]
let dotProduct = vectorA.dot(vectorB) // Computes the dot product
let cosineDistance = vectorA.cosine(vectorB) // Computes the cosine distance
let sqEuclidean = vectorA.sqeuclidean(vectorB) // Computes the squared Euclidean distance
```
## Using SimSIMD in C
For integration within a CMake-based project, add the following segment to your `CMakeLists.txt`:
```cmake
FetchContent_Declare(
simsimd
GIT_REPOSITORY https://github.com/ashvardanian/simsimd.git
GIT_SHALLOW TRUE
)
FetchContent_MakeAvailable(simsimd)
```
After that, you can use the SimSIMD library in your C code in several ways.
Simplest of all, you can include the headers, and the compiler will automatically select the most recent CPU extensions that SimSIMD will use.
```c
#include <simsimd/simsimd.h>
int main() {
simsimd_flush_denormals(); // Optional, to avoid performance penalties on denormal numbers
simsimd_f32_t vector_a[1536];
simsimd_f32_t vector_b[1536];
simsimd_kernel_punned_t metric_punned = simsimd_metric_punned(
simsimd_metric_cos_k, // Metric kind, like the angular cosine distance
simsimd_datatype_f32_k, // Data type, like: f16, f32, f64, i8, b8, and complex variants
simsimd_cap_any_k); // Which CPU capabilities are we allowed to use
simsimd_distance_t distance;
simsimd_metric_dense_punned_t metric = (simsimd_metric_dense_punned_t)metric_punned;
metric(vector_a, vector_b, 1536, &distance);
return 0;
}
```
### Dynamic Dispatch in C
To avoid hard-coding the backend, you can rely on `c/lib.c` to prepackage all possible backends in one binary, and select the most recent CPU features at runtime.
That feature of the C library is called [dynamic dispatch](#dynamic-dispatch) and is extensively used in the Python, JavaScript, and Rust bindings.
To test which CPU features are available on the machine at runtime, use the following APIs:
```c
int uses_dynamic_dispatch = simsimd_uses_dynamic_dispatch(); // Check if dynamic dispatch was enabled
simsimd_capability_t capabilities = simsimd_capabilities(); // Returns a bitmask
int uses_neon = simsimd_uses_neon();
int uses_sve = simsimd_uses_sve();
int uses_haswell = simsimd_uses_haswell();
int uses_skylake = simsimd_uses_skylake();
int uses_ice = simsimd_uses_ice();
int uses_genoa = simsimd_uses_genoa();
int uses_sapphire = simsimd_uses_sapphire();
```
To override compilation settings and switch between runtime and compile-time dispatch, define the following macro:
```c
#define SIMSIMD_DYNAMIC_DISPATCH 1 // or 0
```
### Spatial Distances: Cosine and Euclidean Distances
```c
#include <simsimd/simsimd.h>
int main() {
simsimd_i8_t i8s[1536];
simsimd_u8_t u8s[1536];
simsimd_f64_t f64s[1536];
simsimd_f32_t f32s[1536];
simsimd_f16_t f16s[1536];
simsimd_bf16_t bf16s[1536];
simsimd_distance_t distance;
// Cosine distance between two vectors
simsimd_cos_i8(i8s, i8s, 1536, &distance);
simsimd_cos_u8(u8s, u8s, 1536, &distance);
simsimd_cos_f16(f16s, f16s, 1536, &distance);
simsimd_cos_f32(f32s, f32s, 1536, &distance);
simsimd_cos_f64(f64s, f64s, 1536, &distance);
simsimd_cos_bf16(bf16s, bf16s, 1536, &distance);
// Euclidean distance between two vectors
simsimd_l2sq_i8(i8s, i8s, 1536, &distance);
simsimd_l2sq_u8(u8s, u8s, 1536, &distance);
simsimd_l2sq_f16(f16s, f16s, 1536, &distance);
simsimd_l2sq_f32(f32s, f32s, 1536, &distance);
simsimd_l2sq_f64(f64s, f64s, 1536, &distance);
simsimd_l2sq_bf16(bf16s, bf16s, 1536, &distance);
return 0;
}
```
### Dot-Products: Inner and Complex Inner Products
```c
#include <simsimd/simsimd.h>
int main() {
// SimSIMD provides "sized" type-aliases without relying on `stdint.h`
simsimd_i8_t i8s[1536];
simsimd_u8_t u8s[1536];
simsimd_f16_t f16s[1536];
simsimd_f32_t f32s[1536];
simsimd_f64_t f64s[1536];
simsimd_bf16_t bf16s[1536];
simsimd_distance_t product;
// Inner product between two real vectors
simsimd_dot_i8(i8s, i8s, 1536, &product);
simsimd_dot_u8(u8s, u8s, 1536, &product);
simsimd_dot_f16(f16s, f16s, 1536, &product);
simsimd_dot_f32(f32s, f32s, 1536, &product);
simsimd_dot_f64(f64s, f64s, 1536, &product);
simsimd_dot_bf16(bf16s, bf16s, 1536, &product);
// SimSIMD provides complex types with `real` and `imag` fields
simsimd_f64c_t f64cs[768];
simsimd_f32c_t f32cs[768];
simsimd_f16c_t f16cs[768];
simsimd_bf16c_t bf16cs[768];
simsimd_distance_t products[2]; // real and imaginary parts
// Complex inner product between two vectors
simsimd_dot_f16c(f16cs, f16cs, 768, &products[0]);
simsimd_dot_f32c(f32cs, f32cs, 768, &products[0]);
simsimd_dot_f64c(f64cs, f64cs, 768, &products[0]);
simsimd_dot_bf16c(bf16cs, bf16cs, 768, &products[0]);
// Complex conjugate inner product between two vectors
simsimd_vdot_f16c(f16cs, f16cs, 768, &products[0]);
simsimd_vdot_f32c(f32cs, f32cs, 768, &products[0]);
simsimd_vdot_f64c(f64cs, f64cs, 768, &products[0]);
simsimd_vdot_bf16c(bf16cs, bf16cs, 768, &products[0]);
return 0;
}
```
### Binary Distances: Hamming and Jaccard Distances
```c
#include <simsimd/simsimd.h>
int main() {
simsimd_b8_t b8s[1536 / 8]; // 8 bits per word
simsimd_distance_t distance;
simsimd_hamming_b8(b8s, b8s, 1536 / 8, &distance);
simsimd_jaccard_b8(b8s, b8s, 1536 / 8, &distance);
return 0;
}
```
### Probability Distributions: Jensen-Shannon and Kullback-Leibler Divergences
```c
#include <simsimd/simsimd.h>
int main() {
simsimd_f64_t f64s[1536];
simsimd_f32_t f32s[1536];
simsimd_f16_t f16s[1536];
simsimd_distance_t divergence;
// Jensen-Shannon divergence between two vectors
simsimd_js_f16(f16s, f16s, 1536, &divergence);
simsimd_js_f32(f32s, f32s, 1536, &divergence);
simsimd_js_f64(f64s, f64s, 1536, &divergence);
// Kullback-Leibler divergence between two vectors
simsimd_kl_f16(f16s, f16s, 1536, &divergence);
simsimd_kl_f32(f32s, f32s, 1536, &divergence);
simsimd_kl_f64(f64s, f64s, 1536, &divergence);
return 0;
}
```
### Half-Precision Floating-Point Numbers
If you aim to use the `_Float16` functionality in SimSIMD, make sure your development environment is compatible with C11.
For other SimSIMD functionality, C99 compatibility will suffice.
To explicitly disable half-precision support, define the following macros before including the header:
```c
#define SIMSIMD_NATIVE_F16 0 // or 1
#define SIMSIMD_NATIVE_BF16 0 // or 1
#include <simsimd/simsimd.h>
```
### Compilation Settings and Debugging
`SIMSIMD_DYNAMIC_DISPATCH`:
> By default, SimSIMD is a header-only library.
> But if you are running on different generations of devices, it makes sense to pre-compile the library for all supported generations at once, and dispatch at runtime.
> This flag does just that and is used to produce the `simsimd.so` shared library, as well as the Python and other bindings.
For Arm: `SIMSIMD_TARGET_NEON`, `SIMSIMD_TARGET_SVE`, `SIMSIMD_TARGET_SVE2`, `SIMSIMD_TARGET_NEON_F16`, `SIMSIMD_TARGET_SVE_F16`, `SIMSIMD_TARGET_NEON_BF16`, `SIMSIMD_TARGET_SVE_BF16`.
For x86: `SIMSIMD_TARGET_HASWELL`, `SIMSIMD_TARGET_SKYLAKE`, `SIMSIMD_TARGET_ICE`, `SIMSIMD_TARGET_GENOA`, `SIMSIMD_TARGET_SAPPHIRE`, `SIMSIMD_TARGET_TURIN`, `SIMSIMD_TARGET_SIERRA`.
> By default, SimSIMD automatically infers the target architecture and pre-compiles as many kernels as possible.
> In some cases, you may want to explicitly disable some of the kernels.
> Most often it's due to compiler support issues, like the lack of some recent intrinsics or low-precision numeric types.
> In other cases, you may want to disable some kernels to speed up the compilation process and trim the binary size.
`SIMSIMD_SQRT`, `SIMSIMD_RSQRT`, `SIMSIMD_LOG`:
> By default, for __non__-SIMD backends, SimSIMD may use `libc` functions like `sqrt` and `log`.
> Those are generally very accurate, but slow, and introduce a dependency on the C standard library.
> To avoid that, you can override those definitions with your own implementations, for example: `#define SIMSIMD_RSQRT(x) (1 / sqrt(x))`.
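As a sketch of how such an override might look when you want to drop the `libc` dependency and trade some accuracy for speed, the classic bit-trick reciprocal square root can be plugged in before the header is included. The `fast_rsqrt_f32` helper below is purely illustrative and not part of SimSIMD:

```c
#include <stdint.h>

// Illustrative helper (not part of SimSIMD): bit-level initial guess
// followed by a single Newton-Raphson refinement step.
static inline float fast_rsqrt_f32(float x) {
    union { float f; uint32_t u; } v = {x};
    v.u = 0x5F3759DFu - (v.u >> 1);
    return v.f * (1.5f - 0.5f * x * v.f * v.f);
}

#define SIMSIMD_RSQRT(x) fast_rsqrt_f32((float)(x))
#include <simsimd/simsimd.h>
```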
## Algorithms & Design Decisions 📚
In general there are a few principles that SimSIMD follows:
- Avoid loop unrolling.
- Never allocate memory.
- Never throw exceptions or set `errno`.
- Keep all function arguments the size of the pointer.
- Avoid returning from public interfaces, use out-arguments instead.
- Don't over-optimize for old CPUs and single- and double-precision floating-point numbers.
- Prioritize mixed-precision and integer operations, and new ISA extensions.
- Prefer saturated arithmetic and avoid overflows.
Possibly, in the future:
- Best effort computation silencing `NaN` components in low-precision inputs.
- Detect overflows and report the distance with a "signaling" `NaN`.
Last but not least: don't build something unless there is demand for it.
So if you have a specific use case, please open an issue or a pull request, and ideally bring in more users with similar needs.
### Cosine Similarity, Reciprocal Square Root, and Newton-Raphson Iteration
The cosine similarity is the most common and straightforward metric used in machine learning and information retrieval.
Interestingly, there are multiple ways to shoot yourself in the foot when computing it.
The cosine similarity is the cosine of the angle between two vectors, and the cosine distance is its complement: one minus the similarity.
```math
\text{CosineSimilarity}(a, b) = \frac{a \cdot b}{\|a\| \cdot \|b\|}
```
```math
\text{CosineDistance}(a, b) = 1 - \frac{a \cdot b}{\|a\| \cdot \|b\|}
```
In NumPy terms, the SimSIMD implementation is similar to:
```python
import numpy as np
def cos_numpy(a: np.ndarray, b: np.ndarray) -> float:
    ab, a2, b2 = np.dot(a, b), np.dot(a, a), np.dot(b, b)  # Fused in SimSIMD
    if a2 == 0 and b2 == 0: result = 0                      # Same in SciPy
    elif ab == 0: result = 1                                 # Division by zero error in SciPy
    else: result = 1 - ab / (np.sqrt(a2) * np.sqrt(b2))      # Bigger rounding error in SciPy
    return result
```
In SciPy, however, the cosine distance is computed as `1 - ab / np.sqrt(a2 * b2)`.
It handles the edge case of a zero and non-zero argument pair differently, resulting in a division by zero error.
It's not only less efficient, but also less accurate, given how the reciprocal square roots are computed.
The C standard library provides the `sqrt` function, which is generally very accurate, but slow.
The `rsqrt` in-hardware implementations are faster, but have different accuracy characteristics.
- SSE `rsqrtps` and AVX `vrsqrtps`: $1.5 \times 2^{-12}$ maximal relative error.
- AVX-512 `vrsqrt14pd` instruction: $2^{-14}$ maximal relative error.
- The NEON `frsqrte` instruction has no documented error bounds, but [can be][arm-rsqrt] as coarse as $2^{-3}$.
[arm-rsqrt]: https://gist.github.com/ashvardanian/5e5cf585d63f8ab6d240932313c75411
To overcome the limitations of the `rsqrt` instruction, SimSIMD uses the Newton-Raphson iteration to refine the initial estimate for high-precision floating-point numbers.
For an input $d$ and an initial estimate $x_0 \approx 1 / \sqrt{d}$, a single iteration can be defined as:
```math
x_{n+1} = x_n \cdot \frac{3 - d \cdot x_n^2}{2}
```
On 1536-dimensional inputs on an Intel Sapphire Rapids CPU, a single such iteration can reduce the relative error by 2-3 orders of magnitude:
| Datatype | NumPy Error | SimSIMD w/out Iteration | SimSIMD |
| :--------- | ------------------: | ----------------------: | ------------------: |
| `bfloat16` | 1.89e-08 ± 1.59e-08 | 3.07e-07 ± 3.09e-07 | 3.53e-09 ± 2.70e-09 |
| `float16` | 1.67e-02 ± 1.44e-02 | 2.68e-05 ± 1.95e-05 | 2.02e-05 ± 1.39e-05 |
| `float32` | 2.21e-08 ± 1.65e-08 | 3.47e-07 ± 3.49e-07 | 3.77e-09 ± 2.84e-09 |
| `float64` | 0.00e+00 ± 0.00e+00 | 3.80e-07 ± 4.50e-07 | 1.35e-11 ± 1.85e-11 |
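As a rough sketch of what that refinement looks like with AVX-512 intrinsics (assuming AVX-512F is available; the helper name is illustrative and not SimSIMD's internal routine), one step takes the `vrsqrt14pd` estimate and applies the formula above:

```c
#include <immintrin.h>

// One Newton-Raphson refinement of the AVX-512 reciprocal square root estimate.
// `d` holds the values whose inverse square roots we need; the refinement roughly
// doubles the number of correct bits of the initial `vrsqrt14pd` guess.
static inline __m512d refined_rsqrt_pd(__m512d d) {
    __m512d y = _mm512_rsqrt14_pd(d);                       // ~2^-14 relative error
    __m512d half_d = _mm512_mul_pd(d, _mm512_set1_pd(0.5)); // 0.5 * d
    __m512d y2 = _mm512_mul_pd(y, y);                       // y * y
    // y' = y * (1.5 - 0.5 * d * y * y)
    return _mm512_mul_pd(y, _mm512_sub_pd(_mm512_set1_pd(1.5), _mm512_mul_pd(half_d, y2)));
}
```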
### Curved Spaces, Mahalanobis Distance, and Bilinear Quadratic Forms
The Mahalanobis distance is a generalization of the Euclidean distance, which takes into account the covariance of the data.
It's very similar in its form to the bilinear form, which is a generalization of the dot product.
```math
\text{BilinearForm}(a, b, M) = a^T M b
```
```math
\text{Mahalanobis}(a, b, M) = \sqrt{(a - b)^T M^{-1} (a - b)}
```
Bilinear Forms can be seen as one of the most important linear algebraic operations, surprisingly missing in BLAS and LAPACK.
They are versatile and appear in various domains:
- In Quantum Mechanics, the expectation value of an observable $A$ in a state $\psi$ is given by $\langle \psi | A | \psi \rangle$, which is a bilinear form.
- In Machine Learning, in Support Vector Machines (SVMs), bilinear forms define kernel functions that measure similarity between data points.
- In Differential Geometry, the metric tensor, which defines distances and angles on a manifold, is a bilinear form on the tangent space.
- In Economics, payoff functions in certain Game Theoretic problems can be modeled as bilinear forms of players' strategies.
- In Physics, interactions between electric and magnetic fields can be expressed using bilinear forms.
Broad applications aside, the lack of a specialized primitive for bilinear forms in BLAS and LAPACK means significant performance overhead.
A vector-matrix-vector product is a scalar, whereas its constituent parts (the $a^T M$ and $M b$ products) are vectors:
- They need memory to be stored in: $O(n)$ allocation.
- The data will be written to memory and read back, wasting CPU cycles.
SimSIMD doesn't materialize intermediate vector results, like the `a @ M` in `a @ M @ b`, but computes the bilinear form directly.
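A plain serial sketch of that idea follows; it is illustrative rather than the SIMD kernel SimSIMD actually ships, but it shows how a scalar accumulator replaces the $O(n)$ temporary:

```c
#include <stddef.h>

// Evaluate a^T M b directly: the running (M b)_i value lives in a register,
// so no intermediate vector is ever allocated or written back to memory.
double bilinear_form_serial(double const *a, double const *M, double const *b, size_t n) {
    double sum = 0;
    for (size_t i = 0; i != n; ++i) {
        double row_dot = 0; // (M b)_i, accumulated on the fly
        for (size_t j = 0; j != n; ++j) row_dot += M[i * n + j] * b[j];
        sum += a[i] * row_dot;
    }
    return sum;
}
```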
### Set Intersection, Galloping, and Binary Search
The set intersection operation is generally defined as the number of elements that are common between two sets, represented as sorted arrays of integers.
The most common way to compute it is a linear scan:
```c
#include <stddef.h>

size_t intersection_size(int const *a, int const *b, size_t n, size_t m) {
    size_t i = 0, j = 0, count = 0;
    while (i < n && j < m) {
        if (a[i] < b[j]) i++;
        else if (a[i] > b[j]) j++;
        else i++, j++, count++;
    }
    return count;
}
```
Alternatively, one can use binary search to look up the elements of the shorter array in the longer one.
If the probe step is first grown geometrically and the binary search then runs inside the bracketed range, the approach is known as the _galloping search_, sketched below.
It's faster, but only when large arrays of very different sizes are intersected.
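A minimal sketch of that strategy, assuming a plain `int` element type; the `lower_bound` and `intersection_size_galloping` names are illustrative only:

```c
#include <stddef.h>

// First index in the sorted range [lo, hi) whose value is not less than `key`.
static size_t lower_bound(int const *arr, size_t lo, size_t hi, int key) {
    while (lo < hi) {
        size_t mid = lo + (hi - lo) / 2;
        if (arr[mid] < key) lo = mid + 1;
        else hi = mid;
    }
    return lo;
}

// For every element of the shorter array, gallop through the longer one:
// double the probe step until we overshoot, then binary-search the bracket.
size_t intersection_size_galloping(int const *small, size_t n, int const *large, size_t m) {
    size_t count = 0, j = 0;
    for (size_t i = 0; i != n && j != m; ++i) {
        size_t step = 1, hi = j;
        while (hi < m && large[hi] < small[i]) hi = j + step, step *= 2;
        if (hi > m) hi = m;
        j = lower_bound(large, j, hi, small[i]);
        if (j < m && large[j] == small[i]) ++count, ++j;
    }
    return count;
}
```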
A third approach is to use SIMD instructions to compare multiple elements at once:
- Using string-intersection instructions on x86, like `pcmpestrm`.
- Using integer-intersection instructions in AVX-512, like `vp2intersectd`.
- Using vanilla equality checks present in all SIMD instruction sets.
After benchmarking, the last approach was chosen, as it's the most flexible and often the fastest.
### Complex Dot Products, Conjugate Dot Products, and Complex Numbers
Complex dot products are a generalization of the dot product to complex numbers.
They are supported by most BLAS packages, but almost never in mixed precision.
SimSIMD defines `dot` and `vdot` kernels as:
```math
\text{dot}(a, b) = \sum_{i=0}^{n-1} a_i \cdot b_i
```
```math
\text{vdot}(a, b) = \sum_{i=0}^{n-1} a_i \cdot \bar{b_i}
```
Where $\bar{b_i}$ is the complex conjugate of $b_i$.
Putting that into Python code for scalar arrays:
```python
from typing import List, Tuple

Number = float  # scalars of the interleaved [real, imag, real, imag, ...] layout

def dot(a: List[Number], b: List[Number]) -> Tuple[Number, Number]:
    a_real, a_imaginary = a[0::2], a[1::2]
    b_real, b_imaginary = b[0::2], b[1::2]
    ab_real, ab_imaginary = 0, 0
    for ar, ai, br, bi in zip(a_real, a_imaginary, b_real, b_imaginary):
        ab_real += ar * br - ai * bi
        ab_imaginary += ar * bi + ai * br
    return ab_real, ab_imaginary

def vdot(a: List[Number], b: List[Number]) -> Tuple[Number, Number]:
    a_real, a_imaginary = a[0::2], a[1::2]
    b_real, b_imaginary = b[0::2], b[1::2]
    ab_real, ab_imaginary = 0, 0
    for ar, ai, br, bi in zip(a_real, a_imaginary, b_real, b_imaginary):
        ab_real += ar * br + ai * bi
        ab_imaginary += ar * bi - ai * br
    return ab_real, ab_imaginary
```
### Logarithms in Kullback-Leibler & Jensen–Shannon Divergences
The Kullback-Leibler divergence is a measure of how one probability distribution diverges from a second, expected probability distribution.
Jensen-Shannon divergence is a symmetrized and smoothed version of the Kullback-Leibler divergence, which can be used as a distance metric between probability distributions.
```math
\text{KL}(P || Q) = \sum_{i} P(i) \log \frac{P(i)}{Q(i)}
```
```math
\text{JS}(P, Q) = \frac{1}{2} \text{KL}(P || M) + \frac{1}{2} \text{KL}(Q || M), M = \frac{P + Q}{2}
```
Both functions are defined for non-negative numbers, and the logarithm is a key part of their computation.
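For reference, a plain serial sketch of both divergences; the `kl_serial` and `js_serial` names and the epsilon guard against zero entries are illustrative, while SimSIMD's SIMD kernels also replace the `libc` logarithm with vectorized approximations:

```c
#include <math.h>
#include <stddef.h>

// Kullback-Leibler divergence KL(P || Q) over probability vectors of length n.
double kl_serial(double const *p, double const *q, size_t n) {
    double const epsilon = 1e-9; // illustrative guard against zero entries
    double sum = 0;
    for (size_t i = 0; i != n; ++i) sum += p[i] * log((p[i] + epsilon) / (q[i] + epsilon));
    return sum;
}

// Jensen-Shannon divergence: average of the two KL terms against the midpoint M.
double js_serial(double const *p, double const *q, size_t n) {
    double const epsilon = 1e-9;
    double sum = 0;
    for (size_t i = 0; i != n; ++i) {
        double m = 0.5 * (p[i] + q[i]) + epsilon;
        sum += 0.5 * p[i] * log((p[i] + epsilon) / m);
        sum += 0.5 * q[i] * log((q[i] + epsilon) / m);
    }
    return sum;
}
```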
### Mixed Precision in Fused-Multiply-Add and Weighted Sums
The Fused-Multiply-Add (FMA) operation is a single operation that combines element-wise multiplication and addition with different scaling factors.
The Weighted Sum is its simplified variant without element-wise multiplication.
```math
\text{FMA}_i(A, B, C, \alpha, \beta) = \alpha \cdot A_i \cdot B_i + \beta \cdot C_i
```
```math
\text{WSum}_i(A, B, \alpha, \beta) = \alpha \cdot A_i + \beta \cdot B_i
```
In NumPy terms, the implementation may look like:
```py
import numpy as np
def wsum(A: np.ndarray, B: np.ndarray, /, Alpha: float, Beta: float) -> np.ndarray:
    assert A.dtype == B.dtype, "Input types must match and define the output type"
    return (Alpha * A + Beta * B).astype(A.dtype)

def fma(A: np.ndarray, B: np.ndarray, C: np.ndarray, /, Alpha: float, Beta: float) -> np.ndarray:
    assert A.dtype == B.dtype and A.dtype == C.dtype, "Input types must match and define the output type"
    return (Alpha * A * B + Beta * C).astype(A.dtype)
```
The tricky part is implementing those operations in mixed precision, where the scaling factors are of different precision than the input and output vectors.
SimSIMD uses double-precision floating-point scaling factors for any input and output precision, including `i8` and `u8` integers and `f16` and `bf16` floats.
On CPU generations with native support for `f16` addition and multiplication, `f16` temporaries are used for `i8` and `u8` multiplication, scaling, and addition.
For `bf16`, native support is generally limited to dot-products with subsequent partial accumulation, which is not enough for the FMA and WSum operations, so `f32` is used as a temporary.
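A scalar sketch of that recipe for `bf16`, assuming the dynamic-dispatch build where the `simsimd_bf16_to_f32` and `simsimd_f32_to_bf16` converters are exported; the function name is illustrative and not one of the library's kernels:

```c
#include <simsimd/simsimd.h>

// Serial WSum over bf16: widen to f32 temporaries, scale with f64 coefficients,
// then round the sum back down to bf16.
void wsum_bf16_sketch(simsimd_bf16_t const *a, simsimd_bf16_t const *b, simsimd_size_t n,
                      simsimd_distance_t alpha, simsimd_distance_t beta, simsimd_bf16_t *out) {
    for (simsimd_size_t i = 0; i != n; ++i) {
        simsimd_f32_t ai = simsimd_bf16_to_f32(a + i); // widen to f32 temporaries
        simsimd_f32_t bi = simsimd_bf16_to_f32(b + i);
        simsimd_f32_t sum = (simsimd_f32_t)(alpha * ai + beta * bi); // f64 math, f32 rounding
        simsimd_f32_to_bf16(sum, out + i);                           // narrow back to bf16
    }
}
```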
### Auto-Vectorization & Loop Unrolling
On the Intel Sapphire Rapids platform, SimSIMD was benchmarked against auto-vectorized code using GCC 12.
GCC handles single-precision `float` but might not be the best choice for `int8` and `_Float16` arrays, which have been part of the C language since 2011.
| Kind | GCC 12 `f32` | GCC 12 `f16` | SimSIMD `f16` | `f16` improvement |
| :------------------------ | -----------: | -----------: | ------------: | ----------------: |
| Inner Product | 3,810 K/s | 192 K/s | 5,990 K/s | __31 x__ |
| Cosine Distance | 3,280 K/s | 336 K/s | 6,880 K/s | __20 x__ |
| Euclidean Distance ² | 4,620 K/s | 147 K/s | 5,320 K/s | __36 x__ |
| Jensen-Shannon Divergence | 1,180 K/s | 18 K/s | 2,140 K/s | __118 x__ |
### Dynamic Dispatch
Most popular software is precompiled and distributed with fairly conservative CPU optimizations, to ensure compatibility with older hardware.
Database management platforms like ClickHouse and web browsers like Google Chrome need to run on billions of devices, and they can't afford to be picky about CPU features.
For such users, SimSIMD provides a dynamic dispatch mechanism, which selects the most advanced micro-kernel for the current CPU at runtime.
You can compile SimSIMD on an old CPU, like Intel Haswell, and run it on a new one, like AMD Genoa, and it will automatically use the most advanced instructions available.
The reverse is also true: you can compile on a new CPU and run on an old one, and it will automatically fall back to the most basic instructions.
Moreover, the very first time you probe the CPU capabilities with `simsimd_capabilities()`, the dynamic dispatch mechanism is initialized, so all subsequent calls are faster and won't face race conditions in multi-threaded environments.
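A minimal sketch of that warm-up in a multi-threaded application:

```c
#include <simsimd/simsimd.h>

int main() {
    // Probe once at startup, before spawning worker threads: this resolves all
    // kernel pointers, so later calls from any thread skip the lookup and never
    // race on initialization.
    simsimd_capability_t caps = simsimd_capabilities();
    (void)caps; // the bitmask can also be logged or inspected here
    // ... launch the thread pool and call simsimd_* kernels from any thread ...
    return 0;
}
```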
## Target Specific Backends
SimSIMD exposes all kernels for all backends, and you can select the most advanced one for the current CPU without relying on built-in dispatch mechanisms.
That's handy for testing and benchmarking, but also when you want to dispatch a very specific kernel to a very specific CPU, bypassing SimSIMD's own assignment logic.
All of the function names follow the same pattern: `simsimd_{function}_{type}_{backend}`.
- The backend can be `serial`, `haswell`, `skylake`, `ice`, `genoa`, `sapphire`, `turin`, `neon`, or `sve`.
- The type can be `f64`, `f32`, `f16`, `bf16`, `f64c`, `f32c`, `f16c`, `bf16c`, `i8`, or `b8`.
- The function can be `dot`, `vdot`, `cos`, `l2sq`, `hamming`, `jaccard`, `kl`, `js`, or `intersect`.
To avoid hard-coding the backend, you can use the `simsimd_kernel_punned_t` to pun the function pointer and the `simsimd_capabilities` function to get the available backends at runtime.
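A hedged sketch of that flow is below; the punned signatures are declared in `simsimd/simsimd.h`, and casting the resolved pointer back to the dense form mirrors what the dynamic-dispatch wrappers do internally:

```c
#include <simsimd/simsimd.h>

int main() {
    // Ask the dispatcher for the best cosine kernel over `f32` among the
    // capabilities this CPU reports, instead of calling `simsimd_cos_f32`.
    simsimd_kernel_punned_t kernel = 0;
    simsimd_capability_t used = simsimd_cap_any_k;
    simsimd_find_kernel_punned(simsimd_metric_cos_k, simsimd_datatype_f32_k, simsimd_capabilities(),
                               simsimd_cap_any_k, &kernel, &used);
    if (kernel) {
        simsimd_f32_t a[1536], b[1536];
        simsimd_distance_t distance;
        // All dense kernels share one signature, so the punned pointer can be
        // cast back to the dense form before the call.
        simsimd_metric_dense_punned_t cos_f32 = (simsimd_metric_dense_punned_t)kernel;
        cos_f32(a, b, 1536, &distance);
    }
    return 0;
}
```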
To match all the function names, consider a RegEx:
```regex
SIMSIMD_PUBLIC void simsimd_\w+_\w+_\w+\(
```
On Linux, you can use the following command to list all unique functions:
```sh
$ grep -oP 'SIMSIMD_PUBLIC void simsimd_\w+_\w+_\w+\(' include/simsimd/*.h | sort | uniq
> include/simsimd/binary.h:SIMSIMD_PUBLIC void simsimd_hamming_b8_haswell(
> include/simsimd/binary.h:SIMSIMD_PUBLIC void simsimd_hamming_b8_ice(
> include/simsimd/binary.h:SIMSIMD_PUBLIC void simsimd_hamming_b8_neon(
> include/simsimd/binary.h:SIMSIMD_PUBLIC void simsimd_hamming_b8_serial(
> include/simsimd/binary.h:SIMSIMD_PUBLIC void simsimd_hamming_b8_sve(
> include/simsimd/binary.h:SIMSIMD_PUBLIC void simsimd_jaccard_b8_haswell(
> include/simsimd/binary.h:SIMSIMD_PUBLIC void simsimd_jaccard_b8_ice(
> include/simsimd/binary.h:SIMSIMD_PUBLIC void simsimd_jaccard_b8_neon(
> include/simsimd/binary.h:SIMSIMD_PUBLIC void simsimd_jaccard_b8_serial(
> include/simsimd/binary.h:SIMSIMD_PUBLIC void simsimd_jaccard_b8_sve(
```
## License
Feel free to use the project under Apache 2.0 or the Three-clause BSD license at your preference.
simsimd-6.5.12/build.rs 0000644 0000000 0000000 00000012177 10461020230 0013041 0 ustar 0000000 0000000 use std::collections::HashMap;
use std::env;
fn main() {
build_simsimd();
}
/// Build SimSIMD with dynamic SIMD dispatching.
/// Returns a HashMap of enabled compilation flags for potential reuse.
fn build_simsimd() -> HashMap<String, bool> {
let mut flags = HashMap::<String, bool>::new();
let mut build = cc::Build::new();
build
// Prefer portable flags to support MSVC and older toolchains
.std("c99") // Enforce C99 standard when supported
.file("c/lib.c")
.include("include")
.define("SIMSIMD_NATIVE_F16", "0")
.define("SIMSIMD_NATIVE_BF16", "0")
.define("SIMSIMD_DYNAMIC_DISPATCH", "1")
.opt_level(3)
.flag_if_supported("-pedantic") // Strict compliance when supported
.warnings(false);
// On 32-bit x86, ensure proper stack alignment for floating-point operations
// See: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=38534
let target_arch = std::env::var("CARGO_CFG_TARGET_ARCH").unwrap_or_default();
if target_arch == "x86" {
build.flag_if_supported("-mstackrealign");
build.flag_if_supported("-mpreferred-stack-boundary=4");
}
// Set architecture-specific macros explicitly (like StringZilla)
let target_bits = env::var("CARGO_CFG_TARGET_POINTER_WIDTH").unwrap_or_default();
if target_arch == "x86_64" && target_bits == "64" {
build.define("SIMSIMD_IS_64BIT_X86", "1");
build.define("SIMSIMD_IS_64BIT_ARM", "0");
flags.insert("SIMSIMD_IS_64BIT_X86".to_string(), true);
flags.insert("SIMSIMD_IS_64BIT_ARM".to_string(), false);
} else if target_arch == "aarch64" && target_bits == "64" {
build.define("SIMSIMD_IS_64BIT_X86", "0");
build.define("SIMSIMD_IS_64BIT_ARM", "1");
flags.insert("SIMSIMD_IS_64BIT_X86".to_string(), false);
flags.insert("SIMSIMD_IS_64BIT_ARM".to_string(), true);
} else {
build.define("SIMSIMD_IS_64BIT_X86", "0");
build.define("SIMSIMD_IS_64BIT_ARM", "0");
flags.insert("SIMSIMD_IS_64BIT_X86".to_string(), false);
flags.insert("SIMSIMD_IS_64BIT_ARM".to_string(), false);
}
// Determine which backends to try based on target architecture.
// The fallback mechanism will disable unsupported targets one by one.
let flags_to_try = match target_arch.as_str() {
"arm" | "aarch64" => vec![
"SIMSIMD_TARGET_SVE2",
"SIMSIMD_TARGET_SVE_BF16",
"SIMSIMD_TARGET_SVE_F16",
"SIMSIMD_TARGET_SVE_I8",
"SIMSIMD_TARGET_SVE",
"SIMSIMD_TARGET_NEON_BF16",
"SIMSIMD_TARGET_NEON_F16",
"SIMSIMD_TARGET_NEON_I8",
"SIMSIMD_TARGET_NEON",
],
"x86_64" => vec![
"SIMSIMD_TARGET_SIERRA",
"SIMSIMD_TARGET_TURIN",
"SIMSIMD_TARGET_SAPPHIRE",
"SIMSIMD_TARGET_GENOA",
"SIMSIMD_TARGET_ICE",
"SIMSIMD_TARGET_SKYLAKE",
"SIMSIMD_TARGET_HASWELL",
],
_ => vec![],
};
// Check environment variables to allow users to disable specific backends.
// Usage: SIMSIMD_TARGET_NEON=0 SIMSIMD_TARGET_SVE=0 cargo build
for flag in flags_to_try.iter() {
let enabled = match env::var(flag) {
Ok(val) => val != "0" && val.to_lowercase() != "false",
Err(_) => true, // Default to enabled if not specified
};
if enabled {
build.define(flag, "1");
flags.insert(flag.to_string(), true);
} else {
build.define(flag, "0");
flags.insert(flag.to_string(), false);
println!("cargo:warning=Disabled {} via environment variable", flag);
}
}
// Try compilation with all enabled backends
if build.try_compile("simsimd").is_err() {
println!("cargo:warning=Failed to compile SimSIMD with all SIMD backends...");
// Fallback: disable backends one by one until compilation succeeds
for flag in flags_to_try.iter() {
build.define(flag, "0");
flags.insert(flag.to_string(), false);
if build.try_compile("simsimd").is_ok() {
println!(
"cargo:warning=Successfully compiled after disabling {}",
flag
);
break;
}
println!(
"cargo:warning=Failed to compile after disabling {}, trying next configuration...",
flag
);
}
}
// Declare file dependencies
println!("cargo:rerun-if-changed=c/lib.c");
println!("cargo:rerun-if-changed=rust/lib.rs");
println!("cargo:rerun-if-changed=include/simsimd/simsimd.h");
println!("cargo:rerun-if-changed=include/simsimd/dot.h");
println!("cargo:rerun-if-changed=include/simsimd/spatial.h");
println!("cargo:rerun-if-changed=include/simsimd/probability.h");
println!("cargo:rerun-if-changed=include/simsimd/binary.h");
println!("cargo:rerun-if-changed=include/simsimd/types.h");
// Rerun if environment variables change
for flag in flags_to_try.iter() {
println!("cargo:rerun-if-env-changed={}", flag);
}
flags
}
simsimd-6.5.12/c/lib.c 0000644 0000000 0000000 00000054402 10461020230 0012525 0 ustar 0000000 0000000 /**
* @brief Dynamic dispatch library for SimSIMD.
* @note Compile with the most recent compiler available.
* @file lib.c
*/
#define SIMSIMD_DYNAMIC_DISPATCH 1
#define SIMSIMD_NATIVE_F16 0
#define SIMSIMD_NATIVE_BF16 0
/* Override the primary serial operations to avoid the LibC dependency.
*/
#define SIMSIMD_SQRT(x) simsimd_approximate_square_root(x)
#define SIMSIMD_RSQRT(x) simsimd_approximate_inverse_square_root(x)
#define SIMSIMD_LOG(x) simsimd_approximate_log(x)
/* Depending on the Operating System, the following intrinsics are available
* on recent compiler toolchains:
*
* - Linux: everything is available in GCC 12+ and Clang 16+.
* - Windows - MSVC: everything except Sapphire Rapids and ARM SVE.
* - macOS - Apple Clang: only Arm NEON and x86 AVX2 Haswell extensions are available.
*/
#if !defined(SIMSIMD_TARGET_NEON) && (defined(__APPLE__) || defined(__linux__))
#define SIMSIMD_TARGET_NEON 1
#endif
#if !defined(SIMSIMD_TARGET_SVE) && (defined(__linux__))
#define SIMSIMD_TARGET_SVE 1
#endif
#if !defined(SIMSIMD_TARGET_SVE2) && (defined(__linux__))
#define SIMSIMD_TARGET_SVE2 1
#endif
#if !defined(SIMSIMD_TARGET_HASWELL) && (defined(_MSC_VER) || defined(__APPLE__) || defined(__linux__))
#define SIMSIMD_TARGET_HASWELL 1
#endif
#if !defined(SIMSIMD_TARGET_SKYLAKE) && (defined(_MSC_VER) || defined(__linux__))
#define SIMSIMD_TARGET_SKYLAKE 1
#endif
#if !defined(SIMSIMD_TARGET_ICE) && (defined(_MSC_VER) || defined(__linux__))
#define SIMSIMD_TARGET_ICE 1
#endif
#if !defined(SIMSIMD_TARGET_GENOA) && (defined(__linux__))
#define SIMSIMD_TARGET_GENOA 1
#endif
#if !defined(SIMSIMD_TARGET_SAPPHIRE) && (defined(__linux__))
#define SIMSIMD_TARGET_SAPPHIRE 1
#endif
#if !defined(SIMSIMD_TARGET_TURIN) && (defined(__linux__))
#define SIMSIMD_TARGET_TURIN 1
#endif
#if !defined(SIMSIMD_TARGET_SIERRA) && (defined(__linux__)) && 0 // TODO: Add target spec to GCC & Clang
#define SIMSIMD_TARGET_SIERRA 1
#endif
#include <simsimd/simsimd.h>
#ifdef __cplusplus
extern "C" {
#endif
// Every time a function is called, it checks if the metric is already loaded. If not, it fetches it.
// If no metric is found, it returns NaN. We can obtain NaN by dividing 0.0 by 0.0, but that annoys
// the MSVC compiler. Instead we can directly write-in the signaling NaN (0x7FF0000000000001)
// or the qNaN (0x7FF8000000000000).
#define SIMSIMD_DECLARATION_DENSE(name, extension) \
SIMSIMD_DYNAMIC void simsimd_##name##_##extension(simsimd_##extension##_t const *a, \
simsimd_##extension##_t const *b, simsimd_size_t n, \
simsimd_distance_t *results) { \
static simsimd_metric_dense_punned_t metric = 0; \
if (metric == 0) { \
simsimd_capability_t used_capability; \
simsimd_find_kernel_punned(simsimd_metric_##name##_k, simsimd_datatype_##extension##_k, \
simsimd_capabilities(), simsimd_cap_any_k, (simsimd_kernel_punned_t *)&metric, \
&used_capability); \
if (!metric) { \
*(simsimd_u64_t *)results = 0x7FF0000000000001ull; \
return; \
} \
} \
metric(a, b, n, results); \
}
#define SIMSIMD_DECLARATION_SPARSE(name, extension, type) \
SIMSIMD_DYNAMIC void simsimd_##name##_##extension(simsimd_##type##_t const *a, simsimd_##type##_t const *b, \
simsimd_size_t a_length, simsimd_size_t b_length, \
simsimd_distance_t *result) { \
static simsimd_metric_sparse_punned_t metric = 0; \
if (metric == 0) { \
simsimd_capability_t used_capability; \
simsimd_find_kernel_punned(simsimd_metric_##name##_k, simsimd_datatype_##extension##_k, \
simsimd_capabilities(), simsimd_cap_any_k, \
(simsimd_kernel_punned_t *)(&metric), &used_capability); \
if (!metric) { \
*(simsimd_u64_t *)result = 0x7FF0000000000001ull; \
return; \
} \
} \
metric(a, b, a_length, b_length, result); \
}
#define SIMSIMD_DECLARATION_CURVED(name, extension) \
SIMSIMD_DYNAMIC void simsimd_##name##_##extension( \
simsimd_##extension##_t const *a, simsimd_##extension##_t const *b, simsimd_##extension##_t const *c, \
simsimd_size_t n, simsimd_distance_t *result) { \
static simsimd_metric_curved_punned_t metric = 0; \
if (metric == 0) { \
simsimd_capability_t used_capability; \
simsimd_find_kernel_punned(simsimd_metric_##name##_k, simsimd_datatype_##extension##_k, \
simsimd_capabilities(), simsimd_cap_any_k, \
(simsimd_kernel_punned_t *)(&metric), &used_capability); \
if (!metric) { \
*(simsimd_u64_t *)result = 0x7FF0000000000001ull; \
return; \
} \
} \
metric(a, b, c, n, result); \
}
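// Unlike the dense, sparse, and curved wrappers above, the element-wise FMA and
// WSum wrappers below don't write a NaN placeholder when the kernel lookup fails:
// their outputs are typed vectors rather than `simsimd_distance_t`, and a serial
// fallback kernel is assumed to always be compiled in.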
#define SIMSIMD_DECLARATION_FMA(name, extension) \
SIMSIMD_DYNAMIC void simsimd_##name##_##extension( \
simsimd_##extension##_t const *a, simsimd_##extension##_t const *b, simsimd_##extension##_t const *c, \
simsimd_size_t n, simsimd_distance_t alpha, simsimd_distance_t beta, simsimd_##extension##_t *result) { \
static simsimd_kernel_fma_punned_t metric = 0; \
if (metric == 0) { \
simsimd_capability_t used_capability; \
simsimd_find_kernel_punned(simsimd_metric_##name##_k, simsimd_datatype_##extension##_k, \
simsimd_capabilities(), simsimd_cap_any_k, \
(simsimd_kernel_punned_t *)(&metric), &used_capability); \
} \
metric(a, b, c, n, alpha, beta, result); \
}
#define SIMSIMD_DECLARATION_WSUM(name, extension) \
SIMSIMD_DYNAMIC void simsimd_##name##_##extension( \
simsimd_##extension##_t const *a, simsimd_##extension##_t const *b, simsimd_size_t n, \
simsimd_distance_t alpha, simsimd_distance_t beta, simsimd_##extension##_t *result) { \
static simsimd_kernel_wsum_punned_t metric = 0; \
if (metric == 0) { \
simsimd_capability_t used_capability; \
simsimd_find_kernel_punned(simsimd_metric_##name##_k, simsimd_datatype_##extension##_k, \
simsimd_capabilities(), simsimd_cap_any_k, \
(simsimd_kernel_punned_t *)(&metric), &used_capability); \
} \
metric(a, b, n, alpha, beta, result); \
}
// Dot products
SIMSIMD_DECLARATION_DENSE(dot, i8)
SIMSIMD_DECLARATION_DENSE(dot, u8)
SIMSIMD_DECLARATION_DENSE(dot, f16)
SIMSIMD_DECLARATION_DENSE(dot, bf16)
SIMSIMD_DECLARATION_DENSE(dot, f32)
SIMSIMD_DECLARATION_DENSE(dot, f64)
SIMSIMD_DECLARATION_DENSE(dot, f16c)
SIMSIMD_DECLARATION_DENSE(dot, bf16c)
SIMSIMD_DECLARATION_DENSE(dot, f32c)
SIMSIMD_DECLARATION_DENSE(dot, f64c)
SIMSIMD_DECLARATION_DENSE(vdot, f16c)
SIMSIMD_DECLARATION_DENSE(vdot, bf16c)
SIMSIMD_DECLARATION_DENSE(vdot, f32c)
SIMSIMD_DECLARATION_DENSE(vdot, f64c)
// Spatial distances
SIMSIMD_DECLARATION_DENSE(cos, i8)
SIMSIMD_DECLARATION_DENSE(cos, u8)
SIMSIMD_DECLARATION_DENSE(cos, f16)
SIMSIMD_DECLARATION_DENSE(cos, bf16)
SIMSIMD_DECLARATION_DENSE(cos, f32)
SIMSIMD_DECLARATION_DENSE(cos, f64)
SIMSIMD_DECLARATION_DENSE(l2sq, i8)
SIMSIMD_DECLARATION_DENSE(l2sq, u8)
SIMSIMD_DECLARATION_DENSE(l2sq, f16)
SIMSIMD_DECLARATION_DENSE(l2sq, bf16)
SIMSIMD_DECLARATION_DENSE(l2sq, f32)
SIMSIMD_DECLARATION_DENSE(l2sq, f64)
SIMSIMD_DECLARATION_DENSE(l2, i8)
SIMSIMD_DECLARATION_DENSE(l2, u8)
SIMSIMD_DECLARATION_DENSE(l2, f16)
SIMSIMD_DECLARATION_DENSE(l2, bf16)
SIMSIMD_DECLARATION_DENSE(l2, f32)
SIMSIMD_DECLARATION_DENSE(l2, f64)
// Binary distances
SIMSIMD_DECLARATION_DENSE(hamming, b8)
SIMSIMD_DECLARATION_DENSE(jaccard, b8)
// Probability distributions
SIMSIMD_DECLARATION_DENSE(kl, f16)
SIMSIMD_DECLARATION_DENSE(kl, bf16)
SIMSIMD_DECLARATION_DENSE(kl, f32)
SIMSIMD_DECLARATION_DENSE(kl, f64)
SIMSIMD_DECLARATION_DENSE(js, f16)
SIMSIMD_DECLARATION_DENSE(js, bf16)
SIMSIMD_DECLARATION_DENSE(js, f32)
SIMSIMD_DECLARATION_DENSE(js, f64)
// Sparse sets
SIMSIMD_DECLARATION_SPARSE(intersect, u16, u16)
SIMSIMD_DECLARATION_SPARSE(intersect, u32, u32)
// Curved spaces
SIMSIMD_DECLARATION_CURVED(bilinear, f64)
SIMSIMD_DECLARATION_CURVED(bilinear, f64c)
SIMSIMD_DECLARATION_CURVED(mahalanobis, f64)
SIMSIMD_DECLARATION_CURVED(bilinear, f32)
SIMSIMD_DECLARATION_CURVED(bilinear, f32c)
SIMSIMD_DECLARATION_CURVED(mahalanobis, f32)
SIMSIMD_DECLARATION_CURVED(bilinear, f16)
SIMSIMD_DECLARATION_CURVED(bilinear, f16c)
SIMSIMD_DECLARATION_CURVED(mahalanobis, f16)
SIMSIMD_DECLARATION_CURVED(bilinear, bf16)
SIMSIMD_DECLARATION_CURVED(bilinear, bf16c)
SIMSIMD_DECLARATION_CURVED(mahalanobis, bf16)
// Element-wise operations
SIMSIMD_DECLARATION_FMA(fma, f64)
SIMSIMD_DECLARATION_FMA(fma, f32)
SIMSIMD_DECLARATION_FMA(fma, f16)
SIMSIMD_DECLARATION_FMA(fma, bf16)
SIMSIMD_DECLARATION_FMA(fma, i8)
SIMSIMD_DECLARATION_FMA(fma, u8)
SIMSIMD_DECLARATION_WSUM(wsum, f64)
SIMSIMD_DECLARATION_WSUM(wsum, f32)
SIMSIMD_DECLARATION_WSUM(wsum, f16)
SIMSIMD_DECLARATION_WSUM(wsum, bf16)
SIMSIMD_DECLARATION_WSUM(wsum, i8)
SIMSIMD_DECLARATION_WSUM(wsum, u8)
SIMSIMD_DYNAMIC int simsimd_uses_neon(void) { return (simsimd_capabilities() & simsimd_cap_neon_k) != 0; }
SIMSIMD_DYNAMIC int simsimd_uses_neon_f16(void) { return (simsimd_capabilities() & simsimd_cap_neon_f16_k) != 0; }
SIMSIMD_DYNAMIC int simsimd_uses_neon_bf16(void) { return (simsimd_capabilities() & simsimd_cap_neon_bf16_k) != 0; }
SIMSIMD_DYNAMIC int simsimd_uses_neon_i8(void) { return (simsimd_capabilities() & simsimd_cap_neon_i8_k) != 0; }
SIMSIMD_DYNAMIC int simsimd_uses_sve(void) { return (simsimd_capabilities() & simsimd_cap_sve_k) != 0; }
SIMSIMD_DYNAMIC int simsimd_uses_sve_f16(void) { return (simsimd_capabilities() & simsimd_cap_sve_f16_k) != 0; }
SIMSIMD_DYNAMIC int simsimd_uses_sve_bf16(void) { return (simsimd_capabilities() & simsimd_cap_sve_bf16_k) != 0; }
SIMSIMD_DYNAMIC int simsimd_uses_sve_i8(void) { return (simsimd_capabilities() & simsimd_cap_sve_i8_k) != 0; }
SIMSIMD_DYNAMIC int simsimd_uses_haswell(void) { return (simsimd_capabilities() & simsimd_cap_haswell_k) != 0; }
SIMSIMD_DYNAMIC int simsimd_uses_skylake(void) { return (simsimd_capabilities() & simsimd_cap_skylake_k) != 0; }
SIMSIMD_DYNAMIC int simsimd_uses_ice(void) { return (simsimd_capabilities() & simsimd_cap_ice_k) != 0; }
SIMSIMD_DYNAMIC int simsimd_uses_genoa(void) { return (simsimd_capabilities() & simsimd_cap_genoa_k) != 0; }
SIMSIMD_DYNAMIC int simsimd_uses_sapphire(void) { return (simsimd_capabilities() & simsimd_cap_sapphire_k) != 0; }
SIMSIMD_DYNAMIC int simsimd_uses_turin(void) { return (simsimd_capabilities() & simsimd_cap_turin_k) != 0; }
SIMSIMD_DYNAMIC int simsimd_uses_sierra(void) { return (simsimd_capabilities() & simsimd_cap_sierra_k) != 0; }
SIMSIMD_DYNAMIC int simsimd_uses_dynamic_dispatch(void) { return 1; }
SIMSIMD_DYNAMIC int simsimd_flush_denormals(void) { return _simsimd_flush_denormals(); }
SIMSIMD_DYNAMIC simsimd_f32_t simsimd_f16_to_f32(simsimd_f16_t const *x_ptr) {
return simsimd_f16_to_f32_implementation(x_ptr);
}
SIMSIMD_DYNAMIC void simsimd_f32_to_f16(simsimd_f32_t x, simsimd_f16_t *result_ptr) {
simsimd_f32_to_f16_implementation(x, result_ptr);
}
SIMSIMD_DYNAMIC simsimd_f32_t simsimd_bf16_to_f32(simsimd_bf16_t const *x_ptr) {
return simsimd_bf16_to_f32_implementation(x_ptr);
}
SIMSIMD_DYNAMIC void simsimd_f32_to_bf16(simsimd_f32_t x, simsimd_bf16_t *result_ptr) {
simsimd_f32_to_bf16_implementation(x, result_ptr);
}
SIMSIMD_DYNAMIC simsimd_capability_t simsimd_capabilities(void) {
//! The latency of the CPUID instruction can be over 100 cycles, so we cache the result.
static simsimd_capability_t static_capabilities = simsimd_cap_any_k;
if (static_capabilities != simsimd_cap_any_k) return static_capabilities;
static_capabilities = _simsimd_capabilities_implementation();
// In multithreaded applications we need to ensure that the function pointers are pre-initialized,
// so the first time we are probing for capabilities, we should also probe all of our metrics
// with dummy inputs:
simsimd_distance_t dummy_results_buffer[2];
simsimd_distance_t *dummy_results = &dummy_results_buffer[0];
// Passing `NULL` as `x` will trigger all kinds of `nonull` warnings on GCC.
typedef double largest_scalar_t;
largest_scalar_t dummy_input[1];
void *x = &dummy_input[0];
// Dense:
simsimd_dot_i8((simsimd_i8_t *)x, (simsimd_i8_t *)x, 0, dummy_results);
simsimd_dot_u8((simsimd_u8_t *)x, (simsimd_u8_t *)x, 0, dummy_results);
simsimd_dot_f16((simsimd_f16_t *)x, (simsimd_f16_t *)x, 0, dummy_results);
simsimd_dot_bf16((simsimd_bf16_t *)x, (simsimd_bf16_t *)x, 0, dummy_results);
simsimd_dot_f32((simsimd_f32_t *)x, (simsimd_f32_t *)x, 0, dummy_results);
simsimd_dot_f64((simsimd_f64_t *)x, (simsimd_f64_t *)x, 0, dummy_results);
simsimd_dot_f16c((simsimd_f16c_t *)x, (simsimd_f16c_t *)x, 0, dummy_results);
simsimd_dot_bf16c((simsimd_bf16c_t *)x, (simsimd_bf16c_t *)x, 0, dummy_results);
simsimd_dot_f32c((simsimd_f32c_t *)x, (simsimd_f32c_t *)x, 0, dummy_results);
simsimd_dot_f64c((simsimd_f64c_t *)x, (simsimd_f64c_t *)x, 0, dummy_results);
simsimd_vdot_f16c((simsimd_f16c_t *)x, (simsimd_f16c_t *)x, 0, dummy_results);
simsimd_vdot_bf16c((simsimd_bf16c_t *)x, (simsimd_bf16c_t *)x, 0, dummy_results);
simsimd_vdot_f32c((simsimd_f32c_t *)x, (simsimd_f32c_t *)x, 0, dummy_results);
simsimd_vdot_f64c((simsimd_f64c_t *)x, (simsimd_f64c_t *)x, 0, dummy_results);
simsimd_cos_i8((simsimd_i8_t *)x, (simsimd_i8_t *)x, 0, dummy_results);
simsimd_cos_u8((simsimd_u8_t *)x, (simsimd_u8_t *)x, 0, dummy_results);
simsimd_cos_f16((simsimd_f16_t *)x, (simsimd_f16_t *)x, 0, dummy_results);
simsimd_cos_bf16((simsimd_bf16_t *)x, (simsimd_bf16_t *)x, 0, dummy_results);
simsimd_cos_f32((simsimd_f32_t *)x, (simsimd_f32_t *)x, 0, dummy_results);
simsimd_cos_f64((simsimd_f64_t *)x, (simsimd_f64_t *)x, 0, dummy_results);
simsimd_l2sq_i8((simsimd_i8_t *)x, (simsimd_i8_t *)x, 0, dummy_results);
simsimd_l2sq_u8((simsimd_u8_t *)x, (simsimd_u8_t *)x, 0, dummy_results);
simsimd_l2sq_f16((simsimd_f16_t *)x, (simsimd_f16_t *)x, 0, dummy_results);
simsimd_l2sq_bf16((simsimd_bf16_t *)x, (simsimd_bf16_t *)x, 0, dummy_results);
simsimd_l2sq_f32((simsimd_f32_t *)x, (simsimd_f32_t *)x, 0, dummy_results);
simsimd_l2sq_f64((simsimd_f64_t *)x, (simsimd_f64_t *)x, 0, dummy_results);
simsimd_l2_i8((simsimd_i8_t *)x, (simsimd_i8_t *)x, 0, dummy_results);
simsimd_l2_u8((simsimd_u8_t *)x, (simsimd_u8_t *)x, 0, dummy_results);
simsimd_l2_f16((simsimd_f16_t *)x, (simsimd_f16_t *)x, 0, dummy_results);
simsimd_l2_bf16((simsimd_bf16_t *)x, (simsimd_bf16_t *)x, 0, dummy_results);
simsimd_l2_f32((simsimd_f32_t *)x, (simsimd_f32_t *)x, 0, dummy_results);
simsimd_l2_f64((simsimd_f64_t *)x, (simsimd_f64_t *)x, 0, dummy_results);
simsimd_hamming_b8((simsimd_b8_t *)x, (simsimd_b8_t *)x, 0, dummy_results);
simsimd_jaccard_b8((simsimd_b8_t *)x, (simsimd_b8_t *)x, 0, dummy_results);
simsimd_kl_f16((simsimd_f16_t *)x, (simsimd_f16_t *)x, 0, dummy_results);
simsimd_kl_bf16((simsimd_bf16_t *)x, (simsimd_bf16_t *)x, 0, dummy_results);
simsimd_kl_f32((simsimd_f32_t *)x, (simsimd_f32_t *)x, 0, dummy_results);
simsimd_kl_f64((simsimd_f64_t *)x, (simsimd_f64_t *)x, 0, dummy_results);
simsimd_js_f16((simsimd_f16_t *)x, (simsimd_f16_t *)x, 0, dummy_results);
simsimd_js_bf16((simsimd_bf16_t *)x, (simsimd_bf16_t *)x, 0, dummy_results);
simsimd_js_f32((simsimd_f32_t *)x, (simsimd_f32_t *)x, 0, dummy_results);
simsimd_js_f64((simsimd_f64_t *)x, (simsimd_f64_t *)x, 0, dummy_results);
// Sparse
simsimd_intersect_u16((simsimd_u16_t *)x, (simsimd_u16_t *)x, 0, 0, dummy_results);
simsimd_intersect_u32((simsimd_u32_t *)x, (simsimd_u32_t *)x, 0, 0, dummy_results);
// Curved:
simsimd_bilinear_f64((simsimd_f64_t *)x, (simsimd_f64_t *)x, (simsimd_f64_t *)x, 0, dummy_results);
simsimd_mahalanobis_f64((simsimd_f64_t *)x, (simsimd_f64_t *)x, (simsimd_f64_t *)x, 0, dummy_results);
simsimd_bilinear_f32((simsimd_f32_t *)x, (simsimd_f32_t *)x, (simsimd_f32_t *)x, 0, dummy_results);
simsimd_mahalanobis_f32((simsimd_f32_t *)x, (simsimd_f32_t *)x, (simsimd_f32_t *)x, 0, dummy_results);
simsimd_bilinear_f16((simsimd_f16_t *)x, (simsimd_f16_t *)x, (simsimd_f16_t *)x, 0, dummy_results);
simsimd_mahalanobis_f16((simsimd_f16_t *)x, (simsimd_f16_t *)x, (simsimd_f16_t *)x, 0, dummy_results);
simsimd_bilinear_bf16((simsimd_bf16_t *)x, (simsimd_bf16_t *)x, (simsimd_bf16_t *)x, 0, dummy_results);
simsimd_mahalanobis_bf16((simsimd_bf16_t *)x, (simsimd_bf16_t *)x, (simsimd_bf16_t *)x, 0, dummy_results);
// Elementwise
simsimd_wsum_f64((simsimd_f64_t *)x, (simsimd_f64_t *)x, 0, 0, 0, (simsimd_f64_t *)x);
simsimd_wsum_f32((simsimd_f32_t *)x, (simsimd_f32_t *)x, 0, 0, 0, (simsimd_f32_t *)x);
simsimd_wsum_f16((simsimd_f16_t *)x, (simsimd_f16_t *)x, 0, 0, 0, (simsimd_f16_t *)x);
simsimd_wsum_bf16((simsimd_bf16_t *)x, (simsimd_bf16_t *)x, 0, 0, 0, (simsimd_bf16_t *)x);
simsimd_wsum_i8((simsimd_i8_t *)x, (simsimd_i8_t *)x, 0, 0, 0, (simsimd_i8_t *)x);
simsimd_wsum_u8((simsimd_u8_t *)x, (simsimd_u8_t *)x, 0, 0, 0, (simsimd_u8_t *)x);
simsimd_fma_f64((simsimd_f64_t *)x, (simsimd_f64_t *)x, (simsimd_f64_t *)x, 0, 0, 0, (simsimd_f64_t *)x);
simsimd_fma_f32((simsimd_f32_t *)x, (simsimd_f32_t *)x, (simsimd_f32_t *)x, 0, 0, 0, (simsimd_f32_t *)x);
simsimd_fma_f16((simsimd_f16_t *)x, (simsimd_f16_t *)x, (simsimd_f16_t *)x, 0, 0, 0, (simsimd_f16_t *)x);
simsimd_fma_bf16((simsimd_bf16_t *)x, (simsimd_bf16_t *)x, (simsimd_bf16_t *)x, 0, 0, 0, (simsimd_bf16_t *)x);
simsimd_fma_i8((simsimd_i8_t *)x, (simsimd_i8_t *)x, (simsimd_i8_t *)x, 0, 0, 0, (simsimd_i8_t *)x);
simsimd_fma_u8((simsimd_u8_t *)x, (simsimd_u8_t *)x, (simsimd_u8_t *)x, 0, 0, 0, (simsimd_u8_t *)x);
return static_capabilities;
}
SIMSIMD_DYNAMIC void simsimd_find_kernel_punned( //
simsimd_metric_kind_t kind, //
simsimd_datatype_t datatype, //
simsimd_capability_t supported, //
simsimd_capability_t allowed, //
simsimd_kernel_punned_t *kernel_output, //
simsimd_capability_t *capability_output) {
_simsimd_find_kernel_punned_implementation(kind, datatype, supported, allowed, kernel_output, capability_output);
}
#ifdef __cplusplus
}
#endif
simsimd-6.5.12/include/module.modulemap 0000644 0000000 0000000 00000000100 10461020230 0016170 0 ustar 0000000 0000000 module CSimSIMD {
header "simsimd/simsimd.h"
export *
}
simsimd-6.5.12/include/simsimd/binary.h 0000644 0000000 0000000 00000061522 10461020230 0016117 0 ustar 0000000 0000000 /**
* @file binary.h
* @brief SIMD-accelerated Binary Similarity Measures.
* @author Ash Vardanian
* @date July 1, 2023
*
* Contains:
* - Bit-level Hamming distance
* - Bit-level Jaccard distance (Tanimoto coefficient)
* - TODO: Hamming distance for integer vectors - `u32`
* - TODO: Jaccard distance for integer vectors - `u32` and `u32u32` count-min-sketches from StringZilla
*
* For hardware architectures:
* - Arm: NEON, SVE
* - x86: Haswell, Ice Lake
*
* The hardest part of optimizing binary similarity measures is the population count operation.
* It's natively supported by almost every instruction set, but the throughput and latency can
* be suboptimal. There are several ways to optimize this operation:
*
* - Lookup tables, mostly using nibbles (4-bit lookups)
* - Harley-Seal population counts: https://arxiv.org/pdf/1611.07612
*
* On binary vectors, when computing Jaccard distance we can clearly see how the CPU struggles
* to compute that many population counts. There are several instructions we should keep in mind
* for future optimizations:
*
* - `_mm512_popcnt_epi64` maps to `VPOPCNTQ (ZMM, K, ZMM)`:
* - On Ice Lake: 3 cycles latency, ports: 1*p5
* - On Genoa: 2 cycles latency, ports: 1*FP01
* - `_mm512_shuffle_epi8` maps to `VPSHUFB (ZMM, ZMM, ZMM)`:
* - On Ice Lake: 1 cycles latency, ports: 1*p5
* - On Genoa: 2 cycles latency, ports: 1*FP12
* - `_mm512_sad_epu8` maps to `VPSADBW (ZMM, ZMM, ZMM)`:
* - On Ice Lake: 3 cycles latency, ports: 1*p5
* - On Zen4: 3 cycles latency, ports: 1*FP01
* - `_mm512_ternarylogic_epi64` maps to `VPTERNLOGQ (ZMM, ZMM, ZMM, I8)`:
* - On Ice Lake: 1 cycles latency, ports: 1*p05
* - On Zen4: 1 cycles latency, ports: 1*FP0123
* - `_mm512_gf2p8mul_epi8` maps to `VPGF2P8AFFINEQB (ZMM, ZMM, ZMM)`:
* - On Ice Lake: 5 cycles latency, ports: 1*p0
* - On Zen4: 3 cycles latency, ports: 1*FP01
*
* x86 intrinsics: https://www.intel.com/content/www/us/en/docs/intrinsics-guide/
* Arm intrinsics: https://developer.arm.com/architectures/instruction-sets/intrinsics/
* SSE POPCOUNT experiments by Wojciech Muła: https://github.com/WojciechMula/sse-popcount
* R&D progress tracker: https://github.com/ashvardanian/SimSIMD/pull/138
*/
#ifndef SIMSIMD_BINARY_H
#define SIMSIMD_BINARY_H
#include "types.h"
#ifdef __cplusplus
extern "C" {
#endif
// clang-format off
/* Serial backends for bitsets and integers. */
SIMSIMD_PUBLIC void simsimd_hamming_b8_serial(simsimd_b8_t const* a, simsimd_b8_t const* b, simsimd_size_t n_words, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_jaccard_b8_serial(simsimd_b8_t const* a, simsimd_b8_t const* b, simsimd_size_t n_words, simsimd_distance_t* result);
/* Arm NEON backend for bitsets and integers. */
SIMSIMD_PUBLIC void simsimd_hamming_b8_neon(simsimd_b8_t const* a, simsimd_b8_t const* b, simsimd_size_t n_words, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_jaccard_b8_neon(simsimd_b8_t const* a, simsimd_b8_t const* b, simsimd_size_t n_words, simsimd_distance_t* result);
/* Arm SVE backend for bitsets and integers. */
SIMSIMD_PUBLIC void simsimd_hamming_b8_sve(simsimd_b8_t const* a, simsimd_b8_t const* b, simsimd_size_t n_words, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_jaccard_b8_sve(simsimd_b8_t const* a, simsimd_b8_t const* b, simsimd_size_t n_words, simsimd_distance_t* result);
/* x86 AVX2 backend for bitsets and integers for Intel Haswell CPUs and newer, needs only POPCNT extensions. */
SIMSIMD_PUBLIC void simsimd_hamming_b8_haswell(simsimd_b8_t const* a, simsimd_b8_t const* b, simsimd_size_t n_words, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_jaccard_b8_haswell(simsimd_b8_t const* a, simsimd_b8_t const* b, simsimd_size_t n_words, simsimd_distance_t* result);
/* x86 AVX512 backend for bitsets and integers for Intel Ice Lake CPUs and newer, using VPOPCNTDQ extensions. */
SIMSIMD_PUBLIC void simsimd_hamming_b8_ice(simsimd_b8_t const* a, simsimd_b8_t const* b, simsimd_size_t n_words, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_jaccard_b8_ice(simsimd_b8_t const* a, simsimd_b8_t const* b, simsimd_size_t n_words, simsimd_distance_t* result);
// clang-format on
SIMSIMD_PUBLIC unsigned char simsimd_popcount_b8(simsimd_b8_t x) {
static unsigned char lookup_table[] = {
0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, //
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8};
return lookup_table[x];
}
SIMSIMD_PUBLIC void simsimd_hamming_b8_serial(simsimd_b8_t const *a, simsimd_b8_t const *b, simsimd_size_t n_words,
simsimd_distance_t *result) {
simsimd_u32_t differences = 0;
for (simsimd_size_t i = 0; i != n_words; ++i) differences += simsimd_popcount_b8(a[i] ^ b[i]);
*result = differences;
}
SIMSIMD_PUBLIC void simsimd_jaccard_b8_serial(simsimd_b8_t const *a, simsimd_b8_t const *b, simsimd_size_t n_words,
simsimd_distance_t *result) {
simsimd_u32_t intersection = 0, union_ = 0;
for (simsimd_size_t i = 0; i != n_words; ++i)
intersection += simsimd_popcount_b8(a[i] & b[i]), union_ += simsimd_popcount_b8(a[i] | b[i]);
*result = (union_ != 0) ? 1 - (simsimd_f64_t)intersection / (simsimd_f64_t)union_ : 1;
}
#if _SIMSIMD_TARGET_ARM
#if SIMSIMD_TARGET_NEON
#pragma GCC push_options
#pragma GCC target("arch=armv8-a+simd")
#pragma clang attribute push(__attribute__((target("arch=armv8-a+simd"))), apply_to = function)
SIMSIMD_INTERNAL simsimd_u32_t _simsimd_reduce_u8x16_neon(uint8x16_t vec) {
// Split the vector into two halves and widen to `uint16x8_t`
uint16x8_t low_half = vmovl_u8(vget_low_u8(vec)); // widen lower 8 elements
uint16x8_t high_half = vmovl_u8(vget_high_u8(vec)); // widen upper 8 elements
// Sum the widened halves
uint16x8_t sum16 = vaddq_u16(low_half, high_half);
// Now reduce the `uint16x8_t` to a single `simsimd_u32_t`
uint32x4_t sum32 = vpaddlq_u16(sum16); // pairwise add into 32-bit integers
uint64x2_t sum64 = vpaddlq_u32(sum32); // pairwise add into 64-bit integers
simsimd_u32_t final_sum = vaddvq_u64(sum64); // final horizontal add to 32-bit result
return final_sum;
}
SIMSIMD_PUBLIC void simsimd_hamming_b8_neon(simsimd_b8_t const *a, simsimd_b8_t const *b, simsimd_size_t n_words,
simsimd_distance_t *result) {
simsimd_i32_t differences = 0;
simsimd_size_t i = 0;
// In each 8-bit word we may have up to 8 differences.
// So for up to 31 cycles (31 * 16 = 496 word-dimensions = 3968 bits)
// we can aggregate the differences into a `uint8x16_t` vector,
// where each component will be up to 255.
while (i + 16 <= n_words) {
uint8x16_t differences_cycle_vec = vdupq_n_u8(0);
for (simsimd_size_t cycle = 0; cycle < 31 && i + 16 <= n_words; ++cycle, i += 16) {
uint8x16_t a_vec = vld1q_u8(a + i);
uint8x16_t b_vec = vld1q_u8(b + i);
uint8x16_t xor_count_vec = vcntq_u8(veorq_u8(a_vec, b_vec));
differences_cycle_vec = vaddq_u8(differences_cycle_vec, xor_count_vec);
}
differences += _simsimd_reduce_u8x16_neon(differences_cycle_vec);
}
// Handle the tail
for (; i != n_words; ++i) differences += simsimd_popcount_b8(a[i] ^ b[i]);
*result = differences;
}
SIMSIMD_PUBLIC void simsimd_jaccard_b8_neon(simsimd_b8_t const *a, simsimd_b8_t const *b, simsimd_size_t n_words,
simsimd_distance_t *result) {
simsimd_i32_t intersection = 0, union_ = 0;
simsimd_size_t i = 0;
// In each 8-bit word we may have up to 8 intersections/unions.
// So for up to 31 cycles (31 * 16 = 496 word-dimensions = 3968 bits)
// we can aggregate the intersections/unions into a `uint8x16_t` vector,
// where each component will be up to 255.
while (i + 16 <= n_words) {
uint8x16_t intersections_cycle_vec = vdupq_n_u8(0);
uint8x16_t unions_cycle_vec = vdupq_n_u8(0);
for (simsimd_size_t cycle = 0; cycle < 31 && i + 16 <= n_words; ++cycle, i += 16) {
uint8x16_t a_vec = vld1q_u8(a + i);
uint8x16_t b_vec = vld1q_u8(b + i);
uint8x16_t and_count_vec = vcntq_u8(vandq_u8(a_vec, b_vec));
uint8x16_t or_count_vec = vcntq_u8(vorrq_u8(a_vec, b_vec));
intersections_cycle_vec = vaddq_u8(intersections_cycle_vec, and_count_vec);
unions_cycle_vec = vaddq_u8(unions_cycle_vec, or_count_vec);
}
intersection += _simsimd_reduce_u8x16_neon(intersections_cycle_vec);
union_ += _simsimd_reduce_u8x16_neon(unions_cycle_vec);
}
// Handle the tail
for (; i != n_words; ++i)
intersection += simsimd_popcount_b8(a[i] & b[i]), union_ += simsimd_popcount_b8(a[i] | b[i]);
*result = (union_ != 0) ? 1 - (simsimd_f64_t)intersection / (simsimd_f64_t)union_ : 1;
}
#pragma clang attribute pop
#pragma GCC pop_options
#endif // SIMSIMD_TARGET_NEON
#if SIMSIMD_TARGET_SVE
#pragma GCC push_options
#pragma GCC target("arch=armv8.2-a+sve")
#pragma clang attribute push(__attribute__((target("arch=armv8.2-a+sve"))), apply_to = function)
SIMSIMD_PUBLIC void simsimd_hamming_b8_sve(simsimd_b8_t const *a, simsimd_b8_t const *b, simsimd_size_t n_words,
simsimd_distance_t *result) {
// On very small register sizes, NEON is at least as fast as SVE.
simsimd_size_t const words_per_register = svcntb();
if (words_per_register <= 32) {
simsimd_hamming_b8_neon(a, b, n_words, result);
return;
}
// On larger register sizes, SVE is faster.
simsimd_size_t i = 0, cycle = 0;
simsimd_i32_t differences = 0;
svuint8_t differences_cycle_vec = svdup_n_u8(0);
svbool_t const all_vec = svptrue_b8();
while (i < n_words) {
do {
svbool_t pg_vec = svwhilelt_b8((unsigned int)i, (unsigned int)n_words);
svuint8_t a_vec = svld1_u8(pg_vec, a + i);
svuint8_t b_vec = svld1_u8(pg_vec, b + i);
differences_cycle_vec =
svadd_u8_z(all_vec, differences_cycle_vec, svcnt_u8_x(all_vec, sveor_u8_m(all_vec, a_vec, b_vec)));
i += words_per_register;
++cycle;
} while (i < n_words && cycle < 31);
differences += svaddv_u8(all_vec, differences_cycle_vec);
differences_cycle_vec = svdup_n_u8(0);
cycle = 0; // Reset the cycle counter.
}
*result = differences;
}
SIMSIMD_PUBLIC void simsimd_jaccard_b8_sve(simsimd_b8_t const *a, simsimd_b8_t const *b, simsimd_size_t n_words,
simsimd_distance_t *result) {
// On very small register sizes, NEON is at least as fast as SVE.
simsimd_size_t const words_per_register = svcntb();
if (words_per_register <= 32) {
simsimd_jaccard_b8_neon(a, b, n_words, result);
return;
}
// On larger register sizes, SVE is faster.
simsimd_size_t i = 0, cycle = 0;
simsimd_i32_t intersection = 0, union_ = 0;
svuint8_t intersection_cycle_vec = svdup_n_u8(0);
svuint8_t union_cycle_vec = svdup_n_u8(0);
svbool_t const all_vec = svptrue_b8();
while (i < n_words) {
do {
svbool_t pg_vec = svwhilelt_b8((unsigned int)i, (unsigned int)n_words);
svuint8_t a_vec = svld1_u8(pg_vec, a + i);
svuint8_t b_vec = svld1_u8(pg_vec, b + i);
intersection_cycle_vec =
svadd_u8_z(all_vec, intersection_cycle_vec, svcnt_u8_x(all_vec, svand_u8_m(all_vec, a_vec, b_vec)));
union_cycle_vec =
svadd_u8_z(all_vec, union_cycle_vec, svcnt_u8_x(all_vec, svorr_u8_m(all_vec, a_vec, b_vec)));
i += words_per_register;
++cycle;
} while (i < n_words && cycle < 31);
intersection += svaddv_u8(all_vec, intersection_cycle_vec);
intersection_cycle_vec = svdup_n_u8(0);
union_ += svaddv_u8(all_vec, union_cycle_vec);
union_cycle_vec = svdup_n_u8(0);
cycle = 0; // Reset the cycle counter.
}
*result = (union_ != 0) ? 1 - (simsimd_f64_t)intersection / (simsimd_f64_t)union_ : 1;
}
#pragma clang attribute pop
#pragma GCC pop_options
#endif // SIMSIMD_TARGET_SVE
#endif // _SIMSIMD_TARGET_ARM
#if _SIMSIMD_TARGET_X86
#if SIMSIMD_TARGET_ICE
#pragma GCC push_options
#pragma GCC target("avx2", "avx512f", "avx512vl", "bmi2", "avx512bw", "avx512vpopcntdq")
#pragma clang attribute push(__attribute__((target("avx2,avx512f,avx512vl,bmi2,avx512bw,avx512vpopcntdq"))), \
apply_to = function)
SIMSIMD_PUBLIC void simsimd_hamming_b8_ice(simsimd_b8_t const *a, simsimd_b8_t const *b, simsimd_size_t n_words,
simsimd_distance_t *result) {
simsimd_size_t xor_count;
// It's harder to squeeze out performance from tiny representations, so we unroll the loops for binary metrics.
if (n_words <= 64) { // Up to 512 bits.
__mmask64 mask = (__mmask64)_bzhi_u64(0xFFFFFFFFFFFFFFFF, n_words);
__m512i a_vec = _mm512_maskz_loadu_epi8(mask, a);
__m512i b_vec = _mm512_maskz_loadu_epi8(mask, b);
__m512i xor_count_vec = _mm512_popcnt_epi64(_mm512_xor_si512(a_vec, b_vec));
xor_count = _mm512_reduce_add_epi64(xor_count_vec);
}
else if (n_words <= 128) { // Up to 1024 bits.
__mmask64 mask = (__mmask64)_bzhi_u64(0xFFFFFFFFFFFFFFFF, n_words - 64);
__m512i a1_vec = _mm512_loadu_epi8(a);
__m512i b1_vec = _mm512_loadu_epi8(b);
__m512i a2_vec = _mm512_maskz_loadu_epi8(mask, a + 64);
__m512i b2_vec = _mm512_maskz_loadu_epi8(mask, b + 64);
__m512i xor1_count_vec = _mm512_popcnt_epi64(_mm512_xor_si512(a1_vec, b1_vec));
__m512i xor2_count_vec = _mm512_popcnt_epi64(_mm512_xor_si512(a2_vec, b2_vec));
xor_count = _mm512_reduce_add_epi64(_mm512_add_epi64(xor2_count_vec, xor1_count_vec));
}
else if (n_words <= 192) { // Up to 1536 bits.
__mmask64 mask = (__mmask64)_bzhi_u64(0xFFFFFFFFFFFFFFFF, n_words - 128);
__m512i a1_vec = _mm512_loadu_epi8(a);
__m512i b1_vec = _mm512_loadu_epi8(b);
__m512i a2_vec = _mm512_loadu_epi8(a + 64);
__m512i b2_vec = _mm512_loadu_epi8(b + 64);
__m512i a3_vec = _mm512_maskz_loadu_epi8(mask, a + 128);
__m512i b3_vec = _mm512_maskz_loadu_epi8(mask, b + 128);
__m512i xor1_count_vec = _mm512_popcnt_epi64(_mm512_xor_si512(a1_vec, b1_vec));
__m512i xor2_count_vec = _mm512_popcnt_epi64(_mm512_xor_si512(a2_vec, b2_vec));
__m512i xor3_count_vec = _mm512_popcnt_epi64(_mm512_xor_si512(a3_vec, b3_vec));
xor_count =
_mm512_reduce_add_epi64(_mm512_add_epi64(xor3_count_vec, _mm512_add_epi64(xor2_count_vec, xor1_count_vec)));
}
else if (n_words <= 256) { // Up to 2048 bits.
__mmask64 mask = (__mmask64)_bzhi_u64(0xFFFFFFFFFFFFFFFF, n_words - 192);
__m512i a1_vec = _mm512_loadu_epi8(a);
__m512i b1_vec = _mm512_loadu_epi8(b);
__m512i a2_vec = _mm512_loadu_epi8(a + 64);
__m512i b2_vec = _mm512_loadu_epi8(b + 64);
__m512i a3_vec = _mm512_loadu_epi8(a + 128);
__m512i b3_vec = _mm512_loadu_epi8(b + 128);
__m512i a4_vec = _mm512_maskz_loadu_epi8(mask, a + 192);
__m512i b4_vec = _mm512_maskz_loadu_epi8(mask, b + 192);
__m512i xor1_count_vec = _mm512_popcnt_epi64(_mm512_xor_si512(a1_vec, b1_vec));
__m512i xor2_count_vec = _mm512_popcnt_epi64(_mm512_xor_si512(a2_vec, b2_vec));
__m512i xor3_count_vec = _mm512_popcnt_epi64(_mm512_xor_si512(a3_vec, b3_vec));
__m512i xor4_count_vec = _mm512_popcnt_epi64(_mm512_xor_si512(a4_vec, b4_vec));
xor_count = _mm512_reduce_add_epi64(_mm512_add_epi64(_mm512_add_epi64(xor4_count_vec, xor3_count_vec),
_mm512_add_epi64(xor2_count_vec, xor1_count_vec)));
}
else {
__m512i xor_count_vec = _mm512_setzero_si512();
__m512i a_vec, b_vec;
simsimd_hamming_b8_ice_cycle:
if (n_words < 64) {
__mmask64 mask = (__mmask64)_bzhi_u64(0xFFFFFFFFFFFFFFFF, n_words);
a_vec = _mm512_maskz_loadu_epi8(mask, a);
b_vec = _mm512_maskz_loadu_epi8(mask, b);
n_words = 0;
}
else {
a_vec = _mm512_loadu_epi8(a);
b_vec = _mm512_loadu_epi8(b);
a += 64, b += 64, n_words -= 64;
}
__m512i xor_vec = _mm512_xor_si512(a_vec, b_vec);
xor_count_vec = _mm512_add_epi64(xor_count_vec, _mm512_popcnt_epi64(xor_vec));
if (n_words) goto simsimd_hamming_b8_ice_cycle;
xor_count = _mm512_reduce_add_epi64(xor_count_vec);
}
*result = xor_count;
}
SIMSIMD_PUBLIC void simsimd_jaccard_b8_ice(simsimd_b8_t const *a, simsimd_b8_t const *b, simsimd_size_t n_words,
simsimd_distance_t *result) {
simsimd_size_t intersection = 0, union_ = 0;
// It's harder to squeeze out performance from tiny representations, so we unroll the loops for binary metrics.
if (n_words <= 64) { // Up to 512 bits.
__mmask64 mask = (__mmask64)_bzhi_u64(0xFFFFFFFFFFFFFFFF, n_words);
__m512i a_vec = _mm512_maskz_loadu_epi8(mask, a);
__m512i b_vec = _mm512_maskz_loadu_epi8(mask, b);
__m512i and_count_vec = _mm512_popcnt_epi64(_mm512_and_si512(a_vec, b_vec));
__m512i or_count_vec = _mm512_popcnt_epi64(_mm512_or_si512(a_vec, b_vec));
intersection = _mm512_reduce_add_epi64(and_count_vec);
union_ = _mm512_reduce_add_epi64(or_count_vec);
}
else if (n_words <= 128) { // Up to 1024 bits.
__mmask64 mask = (__mmask64)_bzhi_u64(0xFFFFFFFFFFFFFFFF, n_words - 64);
__m512i a1_vec = _mm512_loadu_epi8(a);
__m512i b1_vec = _mm512_loadu_epi8(b);
__m512i a2_vec = _mm512_maskz_loadu_epi8(mask, a + 64);
__m512i b2_vec = _mm512_maskz_loadu_epi8(mask, b + 64);
__m512i and1_count_vec = _mm512_popcnt_epi64(_mm512_and_si512(a1_vec, b1_vec));
__m512i or1_count_vec = _mm512_popcnt_epi64(_mm512_or_si512(a1_vec, b1_vec));
__m512i and2_count_vec = _mm512_popcnt_epi64(_mm512_and_si512(a2_vec, b2_vec));
__m512i or2_count_vec = _mm512_popcnt_epi64(_mm512_or_si512(a2_vec, b2_vec));
intersection = _mm512_reduce_add_epi64(_mm512_add_epi64(and2_count_vec, and1_count_vec));
union_ = _mm512_reduce_add_epi64(_mm512_add_epi64(or2_count_vec, or1_count_vec));
}
else if (n_words <= 192) { // Up to 1536 bits.
__mmask64 mask = (__mmask64)_bzhi_u64(0xFFFFFFFFFFFFFFFF, n_words - 128);
__m512i a1_vec = _mm512_loadu_epi8(a);
__m512i b1_vec = _mm512_loadu_epi8(b);
__m512i a2_vec = _mm512_loadu_epi8(a + 64);
__m512i b2_vec = _mm512_loadu_epi8(b + 64);
__m512i a3_vec = _mm512_maskz_loadu_epi8(mask, a + 128);
__m512i b3_vec = _mm512_maskz_loadu_epi8(mask, b + 128);
__m512i and1_count_vec = _mm512_popcnt_epi64(_mm512_and_si512(a1_vec, b1_vec));
__m512i or1_count_vec = _mm512_popcnt_epi64(_mm512_or_si512(a1_vec, b1_vec));
__m512i and2_count_vec = _mm512_popcnt_epi64(_mm512_and_si512(a2_vec, b2_vec));
__m512i or2_count_vec = _mm512_popcnt_epi64(_mm512_or_si512(a2_vec, b2_vec));
__m512i and3_count_vec = _mm512_popcnt_epi64(_mm512_and_si512(a3_vec, b3_vec));
__m512i or3_count_vec = _mm512_popcnt_epi64(_mm512_or_si512(a3_vec, b3_vec));
intersection = _mm512_reduce_add_epi64( //
_mm512_add_epi64(and3_count_vec, _mm512_add_epi64(and2_count_vec, and1_count_vec)));
union_ = _mm512_reduce_add_epi64( //
_mm512_add_epi64(or3_count_vec, _mm512_add_epi64(or2_count_vec, or1_count_vec)));
}
else if (n_words <= 256) { // Up to 2048 bits.
__mmask64 mask = (__mmask64)_bzhi_u64(0xFFFFFFFFFFFFFFFF, n_words - 192);
__m512i a1_vec = _mm512_loadu_epi8(a);
__m512i b1_vec = _mm512_loadu_epi8(b);
__m512i a2_vec = _mm512_loadu_epi8(a + 64);
__m512i b2_vec = _mm512_loadu_epi8(b + 64);
__m512i a3_vec = _mm512_loadu_epi8(a + 128);
__m512i b3_vec = _mm512_loadu_epi8(b + 128);
__m512i a4_vec = _mm512_maskz_loadu_epi8(mask, a + 192);
__m512i b4_vec = _mm512_maskz_loadu_epi8(mask, b + 192);
__m512i and1_count_vec = _mm512_popcnt_epi64(_mm512_and_si512(a1_vec, b1_vec));
__m512i or1_count_vec = _mm512_popcnt_epi64(_mm512_or_si512(a1_vec, b1_vec));
__m512i and2_count_vec = _mm512_popcnt_epi64(_mm512_and_si512(a2_vec, b2_vec));
__m512i or2_count_vec = _mm512_popcnt_epi64(_mm512_or_si512(a2_vec, b2_vec));
__m512i and3_count_vec = _mm512_popcnt_epi64(_mm512_and_si512(a3_vec, b3_vec));
__m512i or3_count_vec = _mm512_popcnt_epi64(_mm512_or_si512(a3_vec, b3_vec));
__m512i and4_count_vec = _mm512_popcnt_epi64(_mm512_and_si512(a4_vec, b4_vec));
__m512i or4_count_vec = _mm512_popcnt_epi64(_mm512_or_si512(a4_vec, b4_vec));
intersection = _mm512_reduce_add_epi64(_mm512_add_epi64(_mm512_add_epi64(and4_count_vec, and3_count_vec),
_mm512_add_epi64(and2_count_vec, and1_count_vec)));
union_ = _mm512_reduce_add_epi64(_mm512_add_epi64(_mm512_add_epi64(or4_count_vec, or3_count_vec),
_mm512_add_epi64(or2_count_vec, or1_count_vec)));
}
else {
__m512i and_count_vec = _mm512_setzero_si512(), or_count_vec = _mm512_setzero_si512();
__m512i a_vec, b_vec;
simsimd_jaccard_b8_ice_cycle:
if (n_words < 64) {
__mmask64 mask = (__mmask64)_bzhi_u64(0xFFFFFFFFFFFFFFFF, n_words);
a_vec = _mm512_maskz_loadu_epi8(mask, a);
b_vec = _mm512_maskz_loadu_epi8(mask, b);
n_words = 0;
}
else {
a_vec = _mm512_loadu_epi8(a);
b_vec = _mm512_loadu_epi8(b);
a += 64, b += 64, n_words -= 64;
}
__m512i and_vec = _mm512_and_si512(a_vec, b_vec);
__m512i or_vec = _mm512_or_si512(a_vec, b_vec);
and_count_vec = _mm512_add_epi64(and_count_vec, _mm512_popcnt_epi64(and_vec));
or_count_vec = _mm512_add_epi64(or_count_vec, _mm512_popcnt_epi64(or_vec));
if (n_words) goto simsimd_jaccard_b8_ice_cycle;
intersection = _mm512_reduce_add_epi64(and_count_vec);
union_ = _mm512_reduce_add_epi64(or_count_vec);
}
*result = (union_ != 0) ? 1 - (simsimd_f64_t)intersection / (simsimd_f64_t)union_ : 1;
}
#pragma clang attribute pop
#pragma GCC pop_options
#endif // SIMSIMD_TARGET_ICE
#if SIMSIMD_TARGET_HASWELL
#pragma GCC push_options
#pragma GCC target("popcnt")
#pragma clang attribute push(__attribute__((target("popcnt"))), apply_to = function)
SIMSIMD_PUBLIC void simsimd_hamming_b8_haswell(simsimd_b8_t const *a, simsimd_b8_t const *b, simsimd_size_t n_words,
simsimd_distance_t *result) {
// x86 supports unaligned loads and works just fine with the scalar version for small vectors.
simsimd_size_t differences = 0;
for (; n_words >= 8; n_words -= 8, a += 8, b += 8)
differences += _mm_popcnt_u64(*(simsimd_u64_t const *)a ^ *(simsimd_u64_t const *)b);
for (; n_words; --n_words, ++a, ++b) differences += _mm_popcnt_u32(*a ^ *b);
*result = differences;
}
SIMSIMD_PUBLIC void simsimd_jaccard_b8_haswell(simsimd_b8_t const *a, simsimd_b8_t const *b, simsimd_size_t n_words,
simsimd_distance_t *result) {
// x86 supports unaligned loads and works just fine with the scalar version for small vectors.
simsimd_size_t intersection = 0, union_ = 0;
for (; n_words >= 8; n_words -= 8, a += 8, b += 8)
intersection += _mm_popcnt_u64(*(simsimd_u64_t const *)a & *(simsimd_u64_t const *)b),
union_ += _mm_popcnt_u64(*(simsimd_u64_t const *)a | *(simsimd_u64_t const *)b);
for (; n_words; --n_words, ++a, ++b) intersection += _mm_popcnt_u32(*a & *b), union_ += _mm_popcnt_u32(*a | *b);
*result = (union_ != 0) ? 1 - (simsimd_f64_t)intersection / (simsimd_f64_t)union_ : 1;
}
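/*  A minimal usage sketch for the binary kernels above (illustrative only; assumes this header
 *  is reachable as <simsimd/binary.h>, the translation unit is built with the Haswell target
 *  enabled, and the CPU supports POPCNT). Each `simsimd_b8_t` word packs 8 bits:
 *
 *      #include <stdio.h>
 *      #include <simsimd/binary.h>
 *
 *      int main(void) {
 *          simsimd_b8_t x[16] = {0x0F}; // bits 0..3 set, the remaining 124 bits are zero
 *          simsimd_b8_t y[16] = {0x03}; // bits 0..1 set
 *          simsimd_distance_t hamming, jaccard;
 *          simsimd_hamming_b8_haswell(x, y, 16, &hamming); // popcount(x ^ y) = 2
 *          simsimd_jaccard_b8_haswell(x, y, 16, &jaccard); // 1 - |x & y| / |x | y| = 0.5
 *          printf("hamming = %f, jaccard = %f\n", hamming, jaccard);
 *          return 0;
 *      }
 */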
#pragma clang attribute pop
#pragma GCC pop_options
#endif // SIMSIMD_TARGET_HASWELL
#endif // _SIMSIMD_TARGET_X86
#ifdef __cplusplus
}
#endif
#endif
simsimd-6.5.12/include/simsimd/curved.h 0000644 0000000 0000000 00000246734 10461020230 0016135 0 ustar 0000000 0000000 /**
* @file curved.h
* @brief SIMD-accelerated Similarity Measures for curved spaces.
* @author Ash Vardanian
* @date August 27, 2024
*
* Contains:
* - Mahalanobis distance
* - Bilinear form multiplication
* - Bilinear form multiplication over complex numbers
*
* For datatypes:
* - 64-bit floating point numbers
* - 32-bit floating point numbers
* - 16-bit floating point numbers
* - 16-bit brain-floating point numbers
*
* For hardware architectures:
* - Arm: NEON
* - x86: Haswell, Ice Lake, Skylake, Genoa, Sapphire
*
* Most kernels in this file are designed for BLAS level 2 operations, where the operands are
* a combination of matrices and vectors, generally forming a chain of multiplications.
 * They exploit the fact that matrix multiplication is associative, so the order of operations
 * can be rearranged to minimize the number of operations: `(A * B) * C = A * (B * C)`.
 * To optimize performance, we minimize the number of memory accesses and maximize the amount
 * of arithmetic performed per loaded element by using SIMD instructions.
*
 * When A and C are vectors and B is a matrix, every element of B needs to be loaded just once,
 * while the elements of A and C are broadcast and reused across entire rows.
*
* x86 intrinsics: https://www.intel.com/content/www/us/en/docs/intrinsics-guide/
* Arm intrinsics: https://developer.arm.com/architectures/instruction-sets/intrinsics/
*/
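/*  A quick reminder of the quantities computed below. For vectors `a`, `b` and an `n x n`
 *  matrix `c` (stored row-major), the kernels evaluate:
 *
 *      bilinear(a, b, c)    = a^T * c * b = sum_i sum_j a[i] * c[i*n + j] * b[j]
 *      mahalanobis(a, b, c) = sqrt( (a - b)^T * c * (a - b) )
 *
 *  Evaluating the form row by row as sum_i a[i] * (sum_j c[i*n + j] * b[j]) streams every
 *  cell of `c` exactly once, which is the ordering all kernels in this file follow.
 */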
#ifndef SIMSIMD_CURVED_H
#define SIMSIMD_CURVED_H
#include "types.h"
#include "dot.h" // `_simsimd_partial_load_f16x4_neon` and friends
#include "spatial.h" // `_simsimd_substract_bf16x32_genoa`
#ifdef __cplusplus
extern "C" {
#endif
// clang-format off
/* Serial backends for all numeric types.
* By default they use 32-bit arithmetic, unless the arguments themselves contain 64-bit floats.
* For double-precision computation check out the "*_accurate" variants of those "*_serial" functions.
*/
SIMSIMD_PUBLIC void simsimd_bilinear_f64_serial(simsimd_f64_t const* a, simsimd_f64_t const* b, simsimd_f64_t const* c, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_bilinear_f64c_serial(simsimd_f64c_t const* a, simsimd_f64c_t const* b, simsimd_f64c_t const* c, simsimd_size_t n, simsimd_distance_t* results);
SIMSIMD_PUBLIC void simsimd_mahalanobis_f64_serial(simsimd_f64_t const* a, simsimd_f64_t const* b, simsimd_f64_t const* c, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_bilinear_f32_serial(simsimd_f32_t const* a, simsimd_f32_t const* b, simsimd_f32_t const* c, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_bilinear_f32c_serial(simsimd_f32c_t const* a, simsimd_f32c_t const* b, simsimd_f32c_t const* c, simsimd_size_t n, simsimd_distance_t* results);
SIMSIMD_PUBLIC void simsimd_mahalanobis_f32_serial(simsimd_f32_t const* a, simsimd_f32_t const* b, simsimd_f32_t const* c, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_bilinear_f16_serial(simsimd_f16_t const* a, simsimd_f16_t const* b, simsimd_f16_t const* c, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_bilinear_f16c_serial(simsimd_f16c_t const* a, simsimd_f16c_t const* b, simsimd_f16c_t const* c, simsimd_size_t n, simsimd_distance_t* results);
SIMSIMD_PUBLIC void simsimd_mahalanobis_f16_serial(simsimd_f16_t const* a, simsimd_f16_t const* b, simsimd_f16_t const* c, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_bilinear_bf16_serial(simsimd_bf16_t const* a, simsimd_bf16_t const* b, simsimd_bf16_t const* c, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_bilinear_bf16c_serial(simsimd_bf16c_t const* a, simsimd_bf16c_t const* b, simsimd_bf16c_t const* c, simsimd_size_t n, simsimd_distance_t* results);
SIMSIMD_PUBLIC void simsimd_mahalanobis_bf16_serial(simsimd_bf16_t const* a, simsimd_bf16_t const* b, simsimd_bf16_t const* c, simsimd_size_t n, simsimd_distance_t* result);
/* Double-precision serial backends for all numeric types.
* For single-precision computation check out the "*_serial" counterparts of those "*_accurate" functions.
*/
SIMSIMD_PUBLIC void simsimd_bilinear_f32_accurate(simsimd_f32_t const* a, simsimd_f32_t const* b, simsimd_f32_t const* c, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_bilinear_f32c_accurate(simsimd_f32c_t const* a, simsimd_f32c_t const* b, simsimd_f32c_t const* c, simsimd_size_t n, simsimd_distance_t* results);
SIMSIMD_PUBLIC void simsimd_mahalanobis_f32_accurate(simsimd_f32_t const* a, simsimd_f32_t const* b, simsimd_f32_t const* c, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_bilinear_f16_accurate(simsimd_f16_t const* a, simsimd_f16_t const* b, simsimd_f16_t const* c, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_bilinear_f16c_accurate(simsimd_f16c_t const* a, simsimd_f16c_t const* b, simsimd_f16c_t const* c, simsimd_size_t n, simsimd_distance_t* results);
SIMSIMD_PUBLIC void simsimd_mahalanobis_f16_accurate(simsimd_f16_t const* a, simsimd_f16_t const* b, simsimd_f16_t const* c, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_bilinear_bf16_accurate(simsimd_bf16_t const* a, simsimd_bf16_t const* b, simsimd_bf16_t const* c, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_bilinear_bf16c_accurate(simsimd_bf16c_t const* a, simsimd_bf16c_t const* b, simsimd_bf16c_t const* c, simsimd_size_t n, simsimd_distance_t* results);
SIMSIMD_PUBLIC void simsimd_mahalanobis_bf16_accurate(simsimd_bf16_t const* a, simsimd_bf16_t const* b, simsimd_bf16_t const* c, simsimd_size_t n, simsimd_distance_t* result);
/* SIMD-powered backends for Arm NEON, mostly using 32-bit arithmetic over 128-bit words.
* By far the most portable backend, covering most Arm v8 devices, over a billion phones, and almost all
 * Arm server CPUs produced before 2023.
*/
SIMSIMD_PUBLIC void simsimd_bilinear_f32_neon(simsimd_f32_t const* a, simsimd_f32_t const* b, simsimd_f32_t const* c, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_bilinear_f32c_neon(simsimd_f32c_t const* a, simsimd_f32c_t const* b, simsimd_f32c_t const* c, simsimd_size_t n, simsimd_distance_t* results);
SIMSIMD_PUBLIC void simsimd_mahalanobis_f32_neon(simsimd_f32_t const* a, simsimd_f32_t const* b, simsimd_f32_t const* c, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_bilinear_f16_neon(simsimd_f16_t const* a, simsimd_f16_t const* b, simsimd_f16_t const* c, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_bilinear_f16c_neon(simsimd_f16c_t const* a, simsimd_f16c_t const* b, simsimd_f16c_t const* c, simsimd_size_t n, simsimd_distance_t* results);
SIMSIMD_PUBLIC void simsimd_mahalanobis_f16_neon(simsimd_f16_t const* a, simsimd_f16_t const* b, simsimd_f16_t const* c, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_bilinear_bf16_neon(simsimd_bf16_t const* a, simsimd_bf16_t const* b, simsimd_bf16_t const* c, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_bilinear_bf16c_neon(simsimd_bf16c_t const* a, simsimd_bf16c_t const* b, simsimd_bf16c_t const* c, simsimd_size_t n, simsimd_distance_t* results);
SIMSIMD_PUBLIC void simsimd_mahalanobis_bf16_neon(simsimd_bf16_t const* a, simsimd_bf16_t const* b, simsimd_bf16_t const* c, simsimd_size_t n, simsimd_distance_t* result);
/* SIMD-powered backends for AVX2 CPUs of Haswell generation and newer, using 32-bit arithmetic over 256-bit words.
* First demonstrated in 2011, at least one Haswell-based processor was still being sold in 2022 — the Pentium G3420.
* Practically all modern x86 CPUs support AVX2, FMA, and F16C, making it a perfect baseline for SIMD algorithms.
 * On the other hand, there is no need to implement AVX2 versions of `f32` and `f64` functions, as those are
* properly vectorized by recent compilers.
*/
SIMSIMD_PUBLIC void simsimd_bilinear_f16_haswell(simsimd_f16_t const* a, simsimd_f16_t const* b, simsimd_f16_t const* c, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_mahalanobis_f16_haswell(simsimd_f16_t const* a, simsimd_f16_t const* b, simsimd_f16_t const* c, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_bilinear_bf16_haswell(simsimd_bf16_t const* a, simsimd_bf16_t const* b, simsimd_bf16_t const* c, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_mahalanobis_bf16_haswell(simsimd_bf16_t const* a, simsimd_bf16_t const* b, simsimd_bf16_t const* c, simsimd_size_t n, simsimd_distance_t* result);
/* SIMD-powered backends for various generations of AVX512 CPUs.
* Skylake is handy, as it supports masked loads and other operations, avoiding the need for the tail loop.
* Ice Lake added VNNI, VPOPCNTDQ, IFMA, VBMI, VAES, GFNI, VBMI2, BITALG, VPCLMULQDQ, and other extensions for integral operations.
* Sapphire Rapids added tiled matrix operations, but we are most interested in the new mixed-precision FMA instructions.
*/
SIMSIMD_PUBLIC void simsimd_bilinear_f64_skylake(simsimd_f64_t const* a, simsimd_f64_t const* b, simsimd_f64_t const* c, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_bilinear_f64c_skylake(simsimd_f64c_t const* a, simsimd_f64c_t const* b, simsimd_f64c_t const* c, simsimd_size_t n, simsimd_distance_t* results);
SIMSIMD_PUBLIC void simsimd_mahalanobis_f64_skylake(simsimd_f64_t const* a, simsimd_f64_t const* b, simsimd_f64_t const* c, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_bilinear_f32_skylake(simsimd_f32_t const* a, simsimd_f32_t const* b, simsimd_f32_t const* c, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_bilinear_f32c_skylake(simsimd_f32c_t const* a, simsimd_f32c_t const* b, simsimd_f32c_t const* c, simsimd_size_t n, simsimd_distance_t* results);
SIMSIMD_PUBLIC void simsimd_mahalanobis_f32_skylake(simsimd_f32_t const* a, simsimd_f32_t const* b, simsimd_f32_t const* c, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_bilinear_bf16_genoa(simsimd_bf16_t const* a, simsimd_bf16_t const* b, simsimd_bf16_t const* c, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_bilinear_bf16c_genoa(simsimd_bf16c_t const* a, simsimd_bf16c_t const* b, simsimd_bf16c_t const* c, simsimd_size_t n, simsimd_distance_t* results);
SIMSIMD_PUBLIC void simsimd_mahalanobis_bf16_genoa(simsimd_bf16_t const* a, simsimd_bf16_t const* b, simsimd_bf16_t const* c, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_bilinear_f16_sapphire(simsimd_f16_t const* a, simsimd_f16_t const* b, simsimd_f16_t const* c, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_bilinear_f16c_sapphire(simsimd_f16c_t const* a, simsimd_f16c_t const* b, simsimd_f16c_t const* c, simsimd_size_t n, simsimd_distance_t* results);
SIMSIMD_PUBLIC void simsimd_mahalanobis_f16_sapphire(simsimd_f16_t const* a, simsimd_f16_t const* b, simsimd_f16_t const* c, simsimd_size_t n, simsimd_distance_t* result);
// clang-format on
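/*  The Skylake and later kernels below avoid scalar tail loops entirely by using masked loads:
 *  a k-mask with only the lowest `tail_length` bits set zero-fills the lanes past the end of
 *  the row. A minimal sketch of the pattern (illustrative only, not part of the public API):
 *
 *      __mmask16 tail_mask = (__mmask16)_bzhi_u32(0xFFFFFFFF, tail_length); // lowest bits set
 *      __m512 tail_vec = _mm512_maskz_loadu_ps(tail_mask, row + tail_start); // rest are zeros
 *
 *  The zero-filled lanes contribute nothing to the FMA accumulators, so no separate cleanup
 *  pass is needed.
 */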
#define SIMSIMD_MAKE_BILINEAR(name, input_type, accumulator_type, load_and_convert) \
SIMSIMD_PUBLIC void simsimd_bilinear_##input_type##_##name( \
simsimd_##input_type##_t const *a, simsimd_##input_type##_t const *b, simsimd_##input_type##_t const *c, \
simsimd_size_t n, simsimd_distance_t *result) { \
simsimd_##accumulator_type##_t sum = 0; \
for (simsimd_size_t i = 0; i != n; ++i) { \
simsimd_##accumulator_type##_t cb_j = 0; \
simsimd_##accumulator_type##_t a_i = load_and_convert(a + i); \
for (simsimd_size_t j = 0; j != n; ++j) { \
simsimd_##accumulator_type##_t b_j = load_and_convert(b + j); \
simsimd_##accumulator_type##_t c_ij = load_and_convert(c + i * n + j); \
cb_j += c_ij * b_j; \
} \
sum += a_i * cb_j; \
} \
*result = (simsimd_distance_t)sum; \
}
#define SIMSIMD_MAKE_COMPLEX_BILINEAR(name, input_type, accumulator_type, load_and_convert) \
SIMSIMD_PUBLIC void simsimd_bilinear_##input_type##_##name( \
simsimd_##input_type##_t const *a_pairs, simsimd_##input_type##_t const *b_pairs, \
simsimd_##input_type##_t const *c_pairs, simsimd_size_t n, simsimd_distance_t *results) { \
simsimd_##accumulator_type##_t sum_real = 0; \
simsimd_##accumulator_type##_t sum_imag = 0; \
for (simsimd_size_t i = 0; i != n; ++i) { \
simsimd_##accumulator_type##_t cb_j_real = 0; \
simsimd_##accumulator_type##_t cb_j_imag = 0; \
simsimd_##accumulator_type##_t a_i_real = load_and_convert(&(a_pairs + i)->real); \
simsimd_##accumulator_type##_t a_i_imag = load_and_convert(&(a_pairs + i)->imag); \
for (simsimd_size_t j = 0; j != n; ++j) { \
simsimd_##accumulator_type##_t b_j_real = load_and_convert(&(b_pairs + j)->real); \
simsimd_##accumulator_type##_t b_j_imag = load_and_convert(&(b_pairs + j)->imag); \
simsimd_##accumulator_type##_t c_ij_real = load_and_convert(&(c_pairs + i * n + j)->real); \
simsimd_##accumulator_type##_t c_ij_imag = load_and_convert(&(c_pairs + i * n + j)->imag); \
/* Complex multiplication: (c_ij * b_j) */ \
cb_j_real += c_ij_real * b_j_real - c_ij_imag * b_j_imag; \
cb_j_imag += c_ij_real * b_j_imag + c_ij_imag * b_j_real; \
} \
/* Complex multiplication: (a_i * cb_j) */ \
sum_real += a_i_real * cb_j_real - a_i_imag * cb_j_imag; \
sum_imag += a_i_real * cb_j_imag + a_i_imag * cb_j_real; \
} \
results[0] = (simsimd_distance_t)sum_real; \
results[1] = (simsimd_distance_t)sum_imag; \
}
#define SIMSIMD_MAKE_MAHALANOBIS(name, input_type, accumulator_type, load_and_convert) \
SIMSIMD_PUBLIC void simsimd_mahalanobis_##input_type##_##name( \
simsimd_##input_type##_t const *a, simsimd_##input_type##_t const *b, simsimd_##input_type##_t const *c, \
simsimd_size_t n, simsimd_distance_t *result) { \
simsimd_##accumulator_type##_t sum = 0; \
for (simsimd_size_t i = 0; i != n; ++i) { \
simsimd_##accumulator_type##_t cdiff_j = 0; \
simsimd_##accumulator_type##_t diff_i = load_and_convert(a + i) - load_and_convert(b + i); \
for (simsimd_size_t j = 0; j != n; ++j) { \
simsimd_##accumulator_type##_t diff_j = load_and_convert(a + j) - load_and_convert(b + j); \
simsimd_##accumulator_type##_t c_ij = load_and_convert(c + i * n + j); \
cdiff_j += c_ij * diff_j; \
} \
sum += diff_i * cdiff_j; \
} \
*result = (simsimd_distance_t)SIMSIMD_SQRT(sum); \
}
SIMSIMD_MAKE_BILINEAR(serial, f64, f64, SIMSIMD_DEREFERENCE) // simsimd_bilinear_f64_serial
SIMSIMD_MAKE_COMPLEX_BILINEAR(serial, f64c, f64, SIMSIMD_DEREFERENCE) // simsimd_bilinear_f64c_serial
SIMSIMD_MAKE_MAHALANOBIS(serial, f64, f64, SIMSIMD_DEREFERENCE) // simsimd_mahalanobis_f64_serial
SIMSIMD_MAKE_BILINEAR(serial, f32, f32, SIMSIMD_DEREFERENCE) // simsimd_bilinear_f32_serial
SIMSIMD_MAKE_COMPLEX_BILINEAR(serial, f32c, f32, SIMSIMD_DEREFERENCE) // simsimd_bilinear_f32c_serial
SIMSIMD_MAKE_MAHALANOBIS(serial, f32, f32, SIMSIMD_DEREFERENCE) // simsimd_mahalanobis_f32_serial
SIMSIMD_MAKE_BILINEAR(serial, f16, f32, SIMSIMD_F16_TO_F32) // simsimd_bilinear_f16_serial
SIMSIMD_MAKE_COMPLEX_BILINEAR(serial, f16c, f32, SIMSIMD_F16_TO_F32) // simsimd_bilinear_f16c_serial
SIMSIMD_MAKE_MAHALANOBIS(serial, f16, f32, SIMSIMD_F16_TO_F32) // simsimd_mahalanobis_f16_serial
SIMSIMD_MAKE_BILINEAR(serial, bf16, f32, SIMSIMD_BF16_TO_F32) // simsimd_bilinear_bf16_serial
SIMSIMD_MAKE_COMPLEX_BILINEAR(serial, bf16c, f32, SIMSIMD_BF16_TO_F32) // simsimd_bilinear_bf16c_serial
SIMSIMD_MAKE_MAHALANOBIS(serial, bf16, f32, SIMSIMD_BF16_TO_F32) // simsimd_mahalanobis_bf16_serial
SIMSIMD_MAKE_BILINEAR(accurate, f32, f64, SIMSIMD_DEREFERENCE) // simsimd_bilinear_f32_accurate
SIMSIMD_MAKE_COMPLEX_BILINEAR(accurate, f32c, f64, SIMSIMD_DEREFERENCE) // simsimd_bilinear_f32c_accurate
SIMSIMD_MAKE_MAHALANOBIS(accurate, f32, f64, SIMSIMD_DEREFERENCE) // simsimd_mahalanobis_f32_accurate
SIMSIMD_MAKE_BILINEAR(accurate, f16, f64, SIMSIMD_F16_TO_F32) // simsimd_bilinear_f16_accurate
SIMSIMD_MAKE_COMPLEX_BILINEAR(accurate, f16c, f64, SIMSIMD_F16_TO_F32) // simsimd_bilinear_f16c_accurate
SIMSIMD_MAKE_MAHALANOBIS(accurate, f16, f64, SIMSIMD_F16_TO_F32) // simsimd_mahalanobis_f16_accurate
SIMSIMD_MAKE_BILINEAR(accurate, bf16, f64, SIMSIMD_BF16_TO_F32) // simsimd_bilinear_bf16_accurate
SIMSIMD_MAKE_COMPLEX_BILINEAR(accurate, bf16c, f64, SIMSIMD_BF16_TO_F32) // simsimd_bilinear_bf16c_accurate
SIMSIMD_MAKE_MAHALANOBIS(accurate, bf16, f64, SIMSIMD_BF16_TO_F32) // simsimd_mahalanobis_bf16_accurate
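/*  A minimal usage sketch for the portable kernels generated above (illustrative only; assumes
 *  the `include/` directory is on the include path). The `f32` "serial" variant accumulates in
 *  `f32`, while the "accurate" one accumulates in `f64`:
 *
 *      #include <stdio.h>
 *      #include <simsimd/curved.h>
 *
 *      int main(void) {
 *          simsimd_f32_t a[2] = {1.f, 2.f}, b[2] = {3.f, 4.f};
 *          simsimd_f32_t c[4] = {1.f, 0.f, 0.f, 1.f}; // 2 x 2 identity matrix
 *          simsimd_distance_t fast, precise;
 *          simsimd_bilinear_f32_serial(a, b, c, 2, &fast);      // f32 accumulation
 *          simsimd_bilinear_f32_accurate(a, b, c, 2, &precise); // f64 accumulation
 *          printf("%f %f\n", fast, precise);                    // both print 11.000000
 *          return 0;
 *      }
 */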
#if _SIMSIMD_TARGET_ARM
#if SIMSIMD_TARGET_NEON
#pragma GCC push_options
#pragma GCC target("arch=armv8.2-a+simd")
#pragma clang attribute push(__attribute__((target("arch=armv8.2-a+simd"))), apply_to = function)
SIMSIMD_PUBLIC void simsimd_bilinear_f32_neon(simsimd_f32_t const *a, simsimd_f32_t const *b, simsimd_f32_t const *c,
simsimd_size_t n, simsimd_distance_t *result) {
float32x4_t sum_vec = vdupq_n_f32(0);
for (simsimd_size_t i = 0; i != n; ++i) {
float32x4_t a_vec = vdupq_n_f32(a[i]);
float32x4_t cb_j_vec = vdupq_n_f32(0);
for (simsimd_size_t j = 0; j + 4 <= n; j += 4) {
float32x4_t b_vec = vld1q_f32(b + j);
float32x4_t c_vec = vld1q_f32(c + i * n + j);
cb_j_vec = vmlaq_f32(cb_j_vec, b_vec, c_vec);
}
sum_vec = vmlaq_f32(sum_vec, a_vec, cb_j_vec);
}
// Handle the tail of every row
simsimd_f64_t sum = vaddvq_f32(sum_vec);
simsimd_size_t const tail_length = n % 4;
simsimd_size_t const tail_start = n - tail_length;
if (tail_length) {
for (simsimd_size_t i = 0; i != n; ++i) {
simsimd_f32_t a_i = a[i];
simsimd_f32_t cb_j = 0;
for (simsimd_size_t j = tail_start; j != n; ++j) cb_j += b[j] * c[i * n + j];
            sum += a_i * cb_j;
}
}
*result = sum;
}
SIMSIMD_PUBLIC void simsimd_mahalanobis_f32_neon(simsimd_f32_t const *a, simsimd_f32_t const *b, simsimd_f32_t const *c,
simsimd_size_t n, simsimd_distance_t *result) {
float32x4_t sum_vec = vdupq_n_f32(0);
for (simsimd_size_t i = 0; i != n; ++i) {
float32x4_t diff_i_vec = vdupq_n_f32(a[i] - b[i]);
float32x4_t cdiff_j_vec = vdupq_n_f32(0);
for (simsimd_size_t j = 0; j + 4 <= n; j += 4) {
float32x4_t diff_j_vec = vsubq_f32(vld1q_f32(a + j), vld1q_f32(b + j));
float32x4_t c_vec = vld1q_f32(c + i * n + j);
cdiff_j_vec = vmlaq_f32(cdiff_j_vec, diff_j_vec, c_vec);
}
sum_vec = vmlaq_f32(sum_vec, diff_i_vec, cdiff_j_vec);
}
// Handle the tail of every row
simsimd_f64_t sum = vaddvq_f32(sum_vec);
simsimd_size_t const tail_length = n % 4;
simsimd_size_t const tail_start = n - tail_length;
if (tail_length) {
for (simsimd_size_t i = 0; i != n; ++i) {
simsimd_f32_t diff_i = a[i] - b[i];
simsimd_f32_t cdiff_j = 0;
for (simsimd_size_t j = tail_start; j != n; ++j) {
simsimd_f32_t diff_j = a[j] - b[j];
cdiff_j += diff_j * c[i * n + j];
}
sum += diff_i * cdiff_j;
}
}
*result = _simsimd_sqrt_f64_neon(sum);
}
SIMSIMD_PUBLIC void simsimd_bilinear_f32c_neon(simsimd_f32c_t const *a, simsimd_f32c_t const *b,
simsimd_f32c_t const *c, simsimd_size_t n, simsimd_distance_t *results) {
simsimd_f32_t sum_real = 0;
simsimd_f32_t sum_imag = 0;
for (simsimd_size_t i = 0; i != n; ++i) {
simsimd_f32c_t a_i = a[i];
simsimd_f32c_t cb_j;
float32x4_t cb_j_real_vec = vdupq_n_f32(0);
float32x4_t cb_j_imag_vec = vdupq_n_f32(0);
for (simsimd_size_t j = 0; j + 4 <= n; j += 4) {
// Unpack the input arrays into real and imaginary parts:
float32x4x2_t b_vec = vld2q_f32((simsimd_f32_t const *)(b + j));
float32x4x2_t c_vec = vld2q_f32((simsimd_f32_t const *)(c + i * n + j));
float32x4_t b_real_vec = b_vec.val[0];
float32x4_t b_imag_vec = b_vec.val[1];
float32x4_t c_real_vec = c_vec.val[0];
float32x4_t c_imag_vec = c_vec.val[1];
// Compute the dot product:
cb_j_real_vec = vfmaq_f32(cb_j_real_vec, c_real_vec, b_real_vec);
cb_j_real_vec = vfmsq_f32(cb_j_real_vec, c_imag_vec, b_imag_vec);
cb_j_imag_vec = vfmaq_f32(cb_j_imag_vec, c_real_vec, b_imag_vec);
cb_j_imag_vec = vfmaq_f32(cb_j_imag_vec, c_imag_vec, b_real_vec);
}
cb_j.real = vaddvq_f32(cb_j_real_vec);
cb_j.imag = vaddvq_f32(cb_j_imag_vec);
sum_real += a_i.real * cb_j.real - a_i.imag * cb_j.imag;
sum_imag += a_i.real * cb_j.imag + a_i.imag * cb_j.real;
}
// Handle the tail of every row
simsimd_size_t const tail_length = n % 4;
simsimd_size_t const tail_start = n - tail_length;
if (tail_length) {
for (simsimd_size_t i = 0; i != n; ++i) {
simsimd_f32c_t a_i = a[i];
simsimd_f32c_t cb_j = {0, 0};
for (simsimd_size_t j = tail_start; j != n; ++j) {
simsimd_f32c_t b_j = b[j];
simsimd_f32c_t c_ij = c[i * n + j];
cb_j.real += b_j.real * c_ij.real - b_j.imag * c_ij.imag;
cb_j.imag += b_j.real * c_ij.imag + b_j.imag * c_ij.real;
}
sum_real += a_i.real * cb_j.real - a_i.imag * cb_j.imag;
sum_imag += a_i.real * cb_j.imag + a_i.imag * cb_j.real;
}
}
results[0] = sum_real;
results[1] = sum_imag;
}
#pragma clang attribute pop
#pragma GCC pop_options
#endif // SIMSIMD_TARGET_NEON
#if SIMSIMD_TARGET_NEON_F16
#pragma GCC push_options
#pragma GCC target("arch=armv8.2-a+simd+fp16")
#pragma clang attribute push(__attribute__((target("arch=armv8.2-a+simd+fp16"))), apply_to = function)
SIMSIMD_PUBLIC void simsimd_bilinear_f16_neon(simsimd_f16_t const *a, simsimd_f16_t const *b, simsimd_f16_t const *c,
simsimd_size_t n, simsimd_distance_t *result) {
float32x4_t sum_vec = vdupq_n_f32(0);
for (simsimd_size_t i = 0; i != n; ++i) {
// MSVC doesn't recognize `vdup_n_f16` as a valid intrinsic
float32x4_t a_vec = vcvt_f32_f16(vreinterpret_f16_s16(vdup_n_s16(*(short const *)(a + i))));
float32x4_t cb_j_vec = vdupq_n_f32(0);
for (simsimd_size_t j = 0; j + 4 <= n; j += 4) {
float32x4_t b_vec = vcvt_f32_f16(vld1_f16((simsimd_f16_for_arm_simd_t const *)(b + j)));
float32x4_t c_vec = vcvt_f32_f16(vld1_f16((simsimd_f16_for_arm_simd_t const *)(c + i * n + j)));
cb_j_vec = vmlaq_f32(cb_j_vec, b_vec, c_vec);
}
sum_vec = vmlaq_f32(sum_vec, a_vec, cb_j_vec);
}
// Handle the tail of every row
simsimd_f64_t sum = vaddvq_f32(sum_vec);
simsimd_size_t const tail_length = n % 4;
simsimd_size_t const tail_start = n - tail_length;
if (tail_length) {
for (simsimd_size_t i = 0; i != n; ++i) {
simsimd_f32_t a_i = vaddvq_f32(vcvt_f32_f16(_simsimd_partial_load_f16x4_neon(a + i, 1)));
float32x4_t b_vec = vcvt_f32_f16(_simsimd_partial_load_f16x4_neon(b + tail_start, tail_length));
float32x4_t c_vec = vcvt_f32_f16(_simsimd_partial_load_f16x4_neon(c + i * n + tail_start, tail_length));
simsimd_f32_t cb_j = vaddvq_f32(vmulq_f32(b_vec, c_vec));
sum += a_i * cb_j;
}
}
*result = sum;
}
SIMSIMD_PUBLIC void simsimd_mahalanobis_f16_neon(simsimd_f16_t const *a, simsimd_f16_t const *b, simsimd_f16_t const *c,
simsimd_size_t n, simsimd_distance_t *result) {
float32x4_t sum_vec = vdupq_n_f32(0);
for (simsimd_size_t i = 0; i != n; ++i) {
// MSVC doesn't recognize `vdup_n_f16` as a valid intrinsic
float32x4_t a_i_vec = vcvt_f32_f16(vreinterpret_f16_s16(vdup_n_s16(*(short const *)(a + i))));
float32x4_t b_i_vec = vcvt_f32_f16(vreinterpret_f16_s16(vdup_n_s16(*(short const *)(b + i))));
float32x4_t diff_i_vec = vsubq_f32(a_i_vec, b_i_vec);
float32x4_t cdiff_j_vec = vdupq_n_f32(0);
for (simsimd_size_t j = 0; j + 4 <= n; j += 4) {
float32x4_t a_j_vec = vcvt_f32_f16(vld1_f16((simsimd_f16_for_arm_simd_t const *)(a + j)));
float32x4_t b_j_vec = vcvt_f32_f16(vld1_f16((simsimd_f16_for_arm_simd_t const *)(b + j)));
float32x4_t diff_j_vec = vsubq_f32(a_j_vec, b_j_vec);
float32x4_t c_vec = vcvt_f32_f16(vld1_f16((simsimd_f16_for_arm_simd_t const *)(c + i * n + j)));
cdiff_j_vec = vmlaq_f32(cdiff_j_vec, diff_j_vec, c_vec);
}
sum_vec = vmlaq_f32(sum_vec, diff_i_vec, cdiff_j_vec);
}
// Handle the tail of every row
simsimd_f32_t sum = vaddvq_f32(sum_vec);
simsimd_size_t const tail_length = n % 4;
simsimd_size_t const tail_start = n - tail_length;
if (tail_length) {
for (simsimd_size_t i = 0; i != n; ++i) {
simsimd_f32_t a_i = vaddvq_f32(vcvt_f32_f16(_simsimd_partial_load_f16x4_neon(a + i, 1)));
simsimd_f32_t b_i = vaddvq_f32(vcvt_f32_f16(_simsimd_partial_load_f16x4_neon(b + i, 1)));
simsimd_f32_t diff_i = a_i - b_i;
float32x4_t a_j_vec = vcvt_f32_f16(_simsimd_partial_load_f16x4_neon(a + tail_start, tail_length));
float32x4_t b_j_vec = vcvt_f32_f16(_simsimd_partial_load_f16x4_neon(b + tail_start, tail_length));
float32x4_t diff_j_vec = vsubq_f32(a_j_vec, b_j_vec);
float32x4_t c_vec = vcvt_f32_f16(_simsimd_partial_load_f16x4_neon(c + i * n + tail_start, tail_length));
simsimd_f32_t cdiff_j = vaddvq_f32(vmulq_f32(diff_j_vec, c_vec));
sum += diff_i * cdiff_j;
}
}
*result = _simsimd_sqrt_f32_neon(sum);
}
SIMSIMD_INTERNAL simsimd_f32_t _simsimd_reduce_f16x8_neon(float16x8_t vec) {
// Split the 8-element vector into two 4-element vectors
float16x4_t low = vget_low_f16(vec); // Lower 4 elements
float16x4_t high = vget_high_f16(vec); // Upper 4 elements
// Add the lower and upper parts
float16x4_t sum = vadd_f16(low, high);
// Perform pairwise addition to reduce 4 elements to 2, then to 1
sum = vpadd_f16(sum, sum); // First reduction: 4 -> 2
sum = vpadd_f16(sum, sum); // Second reduction: 2 -> 1
// Convert the remaining half-precision value to single-precision and return
return vgetq_lane_f32(vcvt_f32_f16(sum), 0);
}
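// Loads up to 8 complex `f16` pairs, de-interleaving them into separate real and imaginary
// halves and zero-filling the remaining lanes; it is the partial-load counterpart of `vld2q_f16`.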
SIMSIMD_INTERNAL float16x8x2_t _simsimd_partial_load_f16x8x2_neon(simsimd_f16c_t const *x, simsimd_size_t n) {
union {
float16x8x2_t vecs;
simsimd_f16_t scalars[2][8];
} result;
simsimd_size_t i = 0;
for (; i < n; ++i) result.scalars[0][i] = x[i].real, result.scalars[1][i] = x[i].imag;
for (; i < 8; ++i) result.scalars[0][i] = 0, result.scalars[1][i] = 0;
return result.vecs;
}
SIMSIMD_PUBLIC void simsimd_bilinear_f16c_neon(simsimd_f16c_t const *a, simsimd_f16c_t const *b,
simsimd_f16c_t const *c, simsimd_size_t n, simsimd_distance_t *results) {
simsimd_f32_t sum_real = 0;
simsimd_f32_t sum_imag = 0;
simsimd_size_t const tail_length = n % 8;
simsimd_size_t const tail_start = n - tail_length;
for (simsimd_size_t i = 0; i != n; ++i) {
simsimd_f32c_t a_i = {simsimd_f16_to_f32(&a[i].real), simsimd_f16_to_f32(&a[i].imag)};
float16x8_t cb_j_real_vec = vdupq_n_f16(0);
float16x8_t cb_j_imag_vec = vdupq_n_f16(0);
for (simsimd_size_t j = 0; j + 8 <= n; j += 8) {
// Unpack the input arrays into real and imaginary parts:
float16x8x2_t b_vec = vld2q_f16((float16_t const *)(b + j));
float16x8x2_t c_vec = vld2q_f16((float16_t const *)(c + i * n + j));
float16x8_t b_real_vec = b_vec.val[0];
float16x8_t b_imag_vec = b_vec.val[1];
float16x8_t c_real_vec = c_vec.val[0];
float16x8_t c_imag_vec = c_vec.val[1];
// Compute the dot product:
cb_j_real_vec = vfmaq_f16(cb_j_real_vec, c_real_vec, b_real_vec);
cb_j_real_vec = vfmsq_f16(cb_j_real_vec, c_imag_vec, b_imag_vec);
cb_j_imag_vec = vfmaq_f16(cb_j_imag_vec, c_real_vec, b_imag_vec);
cb_j_imag_vec = vfmaq_f16(cb_j_imag_vec, c_imag_vec, b_real_vec);
}
// Handle row tails
if (tail_length) {
// Unpack the input arrays into real and imaginary parts:
float16x8x2_t b_vec = _simsimd_partial_load_f16x8x2_neon(b + tail_start, tail_length);
float16x8x2_t c_vec = _simsimd_partial_load_f16x8x2_neon(c + i * n + tail_start, tail_length);
float16x8_t b_real_vec = b_vec.val[0];
float16x8_t b_imag_vec = b_vec.val[1];
float16x8_t c_real_vec = c_vec.val[0];
float16x8_t c_imag_vec = c_vec.val[1];
// Compute the dot product:
cb_j_real_vec = vfmaq_f16(cb_j_real_vec, c_real_vec, b_real_vec);
cb_j_real_vec = vfmsq_f16(cb_j_real_vec, c_imag_vec, b_imag_vec);
cb_j_imag_vec = vfmaq_f16(cb_j_imag_vec, c_real_vec, b_imag_vec);
cb_j_imag_vec = vfmaq_f16(cb_j_imag_vec, c_imag_vec, b_real_vec);
}
simsimd_f32c_t cb_j;
cb_j.real = _simsimd_reduce_f16x8_neon(cb_j_real_vec);
cb_j.imag = _simsimd_reduce_f16x8_neon(cb_j_imag_vec);
sum_real += a_i.real * cb_j.real - a_i.imag * cb_j.imag;
sum_imag += a_i.real * cb_j.imag + a_i.imag * cb_j.real;
}
results[0] = sum_real;
results[1] = sum_imag;
}
#pragma clang attribute pop
#pragma GCC pop_options
#endif // SIMSIMD_TARGET_NEON_F16
#if SIMSIMD_TARGET_NEON_BF16
#pragma GCC push_options
#pragma GCC target("arch=armv8.6-a+simd+bf16")
#pragma clang attribute push(__attribute__((target("arch=armv8.6-a+simd+bf16"))), apply_to = function)
SIMSIMD_PUBLIC void simsimd_bilinear_bf16_neon(simsimd_bf16_t const *a, simsimd_bf16_t const *b,
simsimd_bf16_t const *c, simsimd_size_t n, simsimd_distance_t *result) {
float32x4_t sum_vec = vdupq_n_f32(0);
for (simsimd_size_t i = 0; i != n; ++i) {
float32x4_t a_vec = vdupq_n_f32(simsimd_bf16_to_f32(a + i));
float32x4_t cb_j_vec = vdupq_n_f32(0);
for (simsimd_size_t j = 0; j + 8 <= n; j += 8) {
bfloat16x8_t b_vec = vld1q_bf16((simsimd_bf16_for_arm_simd_t const *)(b + j));
bfloat16x8_t c_vec = vld1q_bf16((simsimd_bf16_for_arm_simd_t const *)(c + i * n + j));
cb_j_vec = vbfdotq_f32(cb_j_vec, b_vec, c_vec);
}
sum_vec = vmlaq_f32(sum_vec, a_vec, cb_j_vec);
}
// Handle the tail of every row
simsimd_f64_t sum = vaddvq_f32(sum_vec);
simsimd_size_t const tail_length = n % 8;
simsimd_size_t const tail_start = n - tail_length;
if (tail_length) {
for (simsimd_size_t i = 0; i != n; ++i) {
simsimd_f32_t a_i = simsimd_bf16_to_f32(a + i);
bfloat16x8_t b_vec = _simsimd_partial_load_bf16x8_neon(b + tail_start, tail_length);
bfloat16x8_t c_vec = _simsimd_partial_load_bf16x8_neon(c + i * n + tail_start, tail_length);
simsimd_f32_t cb_j = vaddvq_f32(vbfdotq_f32(vdupq_n_f32(0), b_vec, c_vec));
sum += a_i * cb_j;
}
}
*result = sum;
}
SIMSIMD_PUBLIC void simsimd_mahalanobis_bf16_neon(simsimd_bf16_t const *a, simsimd_bf16_t const *b,
simsimd_bf16_t const *c, simsimd_size_t n,
simsimd_distance_t *result) {
float32x4_t sum_vec = vdupq_n_f32(0);
for (simsimd_size_t i = 0; i != n; ++i) {
simsimd_f32_t a_i = simsimd_bf16_to_f32(a + i);
simsimd_f32_t b_i = simsimd_bf16_to_f32(b + i);
float32x4_t diff_i_vec = vdupq_n_f32(a_i - b_i);
float32x4_t cdiff_j_vec = vdupq_n_f32(0);
for (simsimd_size_t j = 0; j + 8 <= n; j += 8) {
bfloat16x8_t a_j_vec = vld1q_bf16((simsimd_bf16_for_arm_simd_t const *)(a + j));
bfloat16x8_t b_j_vec = vld1q_bf16((simsimd_bf16_for_arm_simd_t const *)(b + j));
// Arm NEON does not have a native subtraction instruction for `bf16`,
// so we need to convert to `f32` first, subtract, and only then get back to `bf16`
// for multiplication.
float32x4_t a_j_vec_high = vcvt_f32_bf16(vget_high_bf16(a_j_vec));
float32x4_t a_j_vec_low = vcvt_f32_bf16(vget_low_bf16(a_j_vec));
float32x4_t b_j_vec_high = vcvt_f32_bf16(vget_high_bf16(b_j_vec));
float32x4_t b_j_vec_low = vcvt_f32_bf16(vget_low_bf16(b_j_vec));
float32x4_t diff_j_vec_high = vsubq_f32(a_j_vec_high, b_j_vec_high);
float32x4_t diff_j_vec_low = vsubq_f32(a_j_vec_low, b_j_vec_low);
bfloat16x8_t diff_j_vec = vcombine_bf16(vcvt_bf16_f32(diff_j_vec_low), vcvt_bf16_f32(diff_j_vec_high));
bfloat16x8_t c_vec = vld1q_bf16((simsimd_bf16_for_arm_simd_t const *)(c + i * n + j));
cdiff_j_vec = vbfdotq_f32(cdiff_j_vec, diff_j_vec, c_vec);
}
sum_vec = vmlaq_f32(sum_vec, diff_i_vec, cdiff_j_vec);
}
// Handle the tail of every row
simsimd_f32_t sum = vaddvq_f32(sum_vec);
simsimd_size_t const tail_length = n % 8;
simsimd_size_t const tail_start = n - tail_length;
if (tail_length) {
for (simsimd_size_t i = 0; i != n; ++i) {
simsimd_f32_t a_i = simsimd_bf16_to_f32(a + i);
simsimd_f32_t b_i = simsimd_bf16_to_f32(b + i);
simsimd_f32_t diff_i = a_i - b_i;
bfloat16x8_t a_j_vec = _simsimd_partial_load_bf16x8_neon(a + tail_start, tail_length);
bfloat16x8_t b_j_vec = _simsimd_partial_load_bf16x8_neon(b + tail_start, tail_length);
// Again, upcast for subtraction
float32x4_t a_j_vec_high = vcvt_f32_bf16(vget_high_bf16(a_j_vec));
float32x4_t a_j_vec_low = vcvt_f32_bf16(vget_low_bf16(a_j_vec));
float32x4_t b_j_vec_high = vcvt_f32_bf16(vget_high_bf16(b_j_vec));
float32x4_t b_j_vec_low = vcvt_f32_bf16(vget_low_bf16(b_j_vec));
float32x4_t diff_j_vec_high = vsubq_f32(a_j_vec_high, b_j_vec_high);
float32x4_t diff_j_vec_low = vsubq_f32(a_j_vec_low, b_j_vec_low);
bfloat16x8_t diff_j_vec = vcombine_bf16(vcvt_bf16_f32(diff_j_vec_low), vcvt_bf16_f32(diff_j_vec_high));
bfloat16x8_t c_vec = _simsimd_partial_load_bf16x8_neon(c + i * n + tail_start, tail_length);
simsimd_f32_t cdiff_j = vaddvq_f32(vbfdotq_f32(vdupq_n_f32(0), diff_j_vec, c_vec));
sum += diff_i * cdiff_j;
}
}
*result = _simsimd_sqrt_f32_neon(sum);
}
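// Loads up to 4 complex `bf16` pairs, de-interleaving them into separate real and imaginary
// halves and zero-filling the remaining lanes. The result is kept as `int16x4x2_t`, so callers
// can reinterpret it with `vreinterpret_bf16_s16` just like the full `vld2_s16` loads below.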
SIMSIMD_INTERNAL int16x4x2_t _simsimd_partial_load_bf16x4x2_neon(simsimd_bf16c_t const *x, simsimd_size_t n) {
union {
int16x4x2_t vec;
simsimd_bf16_t scalars[2][4];
} result;
simsimd_size_t i = 0;
for (; i < n; ++i) result.scalars[0][i] = x[i].real, result.scalars[1][i] = x[i].imag;
    for (; i < 4; ++i) result.scalars[0][i] = 0, result.scalars[1][i] = 0;
return result.vec;
}
SIMSIMD_PUBLIC void simsimd_bilinear_bf16c_neon(simsimd_bf16c_t const *a, simsimd_bf16c_t const *b,
simsimd_bf16c_t const *c, simsimd_size_t n,
simsimd_distance_t *results) {
simsimd_f32_t sum_real = 0;
simsimd_f32_t sum_imag = 0;
simsimd_size_t const tail_length = n % 4;
simsimd_size_t const tail_start = n - tail_length;
for (simsimd_size_t i = 0; i != n; ++i) {
simsimd_f32c_t a_i = {simsimd_bf16_to_f32(&a[i].real), simsimd_bf16_to_f32(&a[i].imag)};
// A nicer approach is to use `bf16` arithmetic for the dot product, but that requires
        // FMLA extensions available on Arm v8.3 and later. That way we could also process 16 entries
// at once. That's how the original implementation worked, but compiling it was a nightmare :)
float32x4_t cb_j_real_vec = vdupq_n_f32(0);
float32x4_t cb_j_imag_vec = vdupq_n_f32(0);
for (simsimd_size_t j = 0; j + 4 <= n; j += 4) {
// Unpack the input arrays into real and imaginary parts.
// MSVC sadly doesn't recognize the `vld2_bf16`, so we load the data as signed
// integers of the same size and reinterpret with `vreinterpret_bf16_s16` afterwards.
int16x4x2_t b_vec = vld2_s16((short const *)(b + j));
int16x4x2_t c_vec = vld2_s16((short const *)(c + i * n + j));
float32x4_t b_real_vec = vcvt_f32_bf16(vreinterpret_bf16_s16(b_vec.val[0]));
float32x4_t b_imag_vec = vcvt_f32_bf16(vreinterpret_bf16_s16(b_vec.val[1]));
float32x4_t c_real_vec = vcvt_f32_bf16(vreinterpret_bf16_s16(c_vec.val[0]));
float32x4_t c_imag_vec = vcvt_f32_bf16(vreinterpret_bf16_s16(c_vec.val[1]));
// Compute the dot product:
cb_j_real_vec = vfmaq_f32(cb_j_real_vec, c_real_vec, b_real_vec);
cb_j_real_vec = vfmsq_f32(cb_j_real_vec, c_imag_vec, b_imag_vec);
cb_j_imag_vec = vfmaq_f32(cb_j_imag_vec, c_real_vec, b_imag_vec);
cb_j_imag_vec = vfmaq_f32(cb_j_imag_vec, c_imag_vec, b_real_vec);
}
// Handle row tails
if (tail_length) {
// Unpack the input arrays into real and imaginary parts:
int16x4x2_t b_vec = _simsimd_partial_load_bf16x4x2_neon(b + tail_start, tail_length);
int16x4x2_t c_vec = _simsimd_partial_load_bf16x4x2_neon(c + i * n + tail_start, tail_length);
float32x4_t b_real_vec = vcvt_f32_bf16(vreinterpret_bf16_s16(b_vec.val[0]));
float32x4_t b_imag_vec = vcvt_f32_bf16(vreinterpret_bf16_s16(b_vec.val[1]));
float32x4_t c_real_vec = vcvt_f32_bf16(vreinterpret_bf16_s16(c_vec.val[0]));
float32x4_t c_imag_vec = vcvt_f32_bf16(vreinterpret_bf16_s16(c_vec.val[1]));
// Compute the dot product:
cb_j_real_vec = vfmaq_f32(cb_j_real_vec, c_real_vec, b_real_vec);
cb_j_real_vec = vfmsq_f32(cb_j_real_vec, c_imag_vec, b_imag_vec);
cb_j_imag_vec = vfmaq_f32(cb_j_imag_vec, c_real_vec, b_imag_vec);
cb_j_imag_vec = vfmaq_f32(cb_j_imag_vec, c_imag_vec, b_real_vec);
}
simsimd_f32c_t cb_j;
cb_j.real = vaddvq_f32(cb_j_real_vec);
cb_j.imag = vaddvq_f32(cb_j_imag_vec);
sum_real += a_i.real * cb_j.real - a_i.imag * cb_j.imag;
sum_imag += a_i.real * cb_j.imag + a_i.imag * cb_j.real;
}
results[0] = sum_real;
results[1] = sum_imag;
}
#pragma clang attribute pop
#pragma GCC pop_options
#endif // SIMSIMD_TARGET_NEON_BF16
#endif // _SIMSIMD_TARGET_ARM
#if _SIMSIMD_TARGET_X86
#if SIMSIMD_TARGET_HASWELL
#pragma GCC push_options
#pragma GCC target("avx2", "f16c", "fma")
#pragma clang attribute push(__attribute__((target("avx2,f16c,fma"))), apply_to = function)
SIMSIMD_PUBLIC void simsimd_bilinear_f16_haswell(simsimd_f16_t const *a, simsimd_f16_t const *b, simsimd_f16_t const *c,
simsimd_size_t n, simsimd_distance_t *result) {
__m256 sum_vec = _mm256_setzero_ps();
for (simsimd_size_t i = 0; i != n; ++i) {
__m256 a_vec = _mm256_cvtph_ps(_mm_set1_epi16(*(short const *)(a + i)));
__m256 cb_j_vec = _mm256_setzero_ps();
for (simsimd_size_t j = 0; j + 8 <= n; j += 8) {
__m256 b_vec = _mm256_cvtph_ps(_mm_lddqu_si128((__m128i const *)(b + j)));
__m256 c_vec = _mm256_cvtph_ps(_mm_lddqu_si128((__m128i const *)(c + i * n + j)));
cb_j_vec = _mm256_fmadd_ps(b_vec, c_vec, cb_j_vec);
}
sum_vec = _mm256_fmadd_ps(a_vec, cb_j_vec, sum_vec);
}
// Handle the tail of every row
simsimd_f32_t sum = _simsimd_reduce_f32x8_haswell(sum_vec);
simsimd_size_t const tail_length = n % 8;
simsimd_size_t const tail_start = n - tail_length;
if (tail_length) {
for (simsimd_size_t i = 0; i != n; ++i) {
simsimd_f32_t a_i = _mm256_cvtss_f32(_mm256_cvtph_ps(_mm_set1_epi16(*(short const *)(a + i))));
__m256 b_vec = _simsimd_partial_load_f16x8_haswell(b + tail_start, tail_length);
__m256 c_vec = _simsimd_partial_load_f16x8_haswell(c + i * n + tail_start, tail_length);
simsimd_f32_t cb_j = _simsimd_reduce_f32x8_haswell(_mm256_mul_ps(b_vec, c_vec));
sum += a_i * cb_j;
}
}
*result = sum;
}
SIMSIMD_PUBLIC void simsimd_mahalanobis_f16_haswell(simsimd_f16_t const *a, simsimd_f16_t const *b,
simsimd_f16_t const *c, simsimd_size_t n,
simsimd_distance_t *result) {
__m256 sum_vec = _mm256_setzero_ps();
for (simsimd_size_t i = 0; i != n; ++i) {
__m256 diff_i_vec = _mm256_sub_ps( //
_mm256_cvtph_ps(_mm_set1_epi16(*(short const *)(a + i))), //
_mm256_cvtph_ps(_mm_set1_epi16(*(short const *)(b + i))));
__m256 cdiff_j_vec = _mm256_setzero_ps();
for (simsimd_size_t j = 0; j + 8 <= n; j += 8) {
__m256 diff_j_vec = _mm256_sub_ps( //
_mm256_cvtph_ps(_mm_lddqu_si128((__m128i const *)(a + j))),
_mm256_cvtph_ps(_mm_lddqu_si128((__m128i const *)(b + j))));
__m256 c_vec = _mm256_cvtph_ps(_mm_lddqu_si128((__m128i const *)(c + i * n + j)));
cdiff_j_vec = _mm256_fmadd_ps(diff_j_vec, c_vec, cdiff_j_vec);
}
sum_vec = _mm256_fmadd_ps(diff_i_vec, cdiff_j_vec, sum_vec);
}
// Handle the tail of every row
simsimd_f32_t sum = _simsimd_reduce_f32x8_haswell(sum_vec);
simsimd_size_t const tail_length = n % 8;
simsimd_size_t const tail_start = n - tail_length;
if (tail_length) {
for (simsimd_size_t i = 0; i != n; ++i) {
simsimd_f32_t diff_i = _mm256_cvtss_f32(_mm256_sub_ps( //
_mm256_cvtph_ps(_mm_set1_epi16(*(short const *)(a + i))), //
_mm256_cvtph_ps(_mm_set1_epi16(*(short const *)(b + i)))));
__m256 diff_j_vec = _mm256_sub_ps( //
_simsimd_partial_load_f16x8_haswell(a + tail_start, tail_length),
_simsimd_partial_load_f16x8_haswell(b + tail_start, tail_length));
__m256 c_vec = _simsimd_partial_load_f16x8_haswell(c + i * n + tail_start, tail_length);
simsimd_f32_t cdiff_j = _simsimd_reduce_f32x8_haswell(_mm256_mul_ps(diff_j_vec, c_vec));
sum += diff_i * cdiff_j;
}
}
*result = _simsimd_sqrt_f32_haswell(sum);
}
SIMSIMD_PUBLIC void simsimd_bilinear_bf16_haswell(simsimd_bf16_t const *a, simsimd_bf16_t const *b,
simsimd_bf16_t const *c, simsimd_size_t n,
simsimd_distance_t *result) {
__m256 sum_vec = _mm256_setzero_ps();
for (simsimd_size_t i = 0; i != n; ++i) {
// The `simsimd_bf16_to_f32` is cheaper than `_simsimd_bf16x8_to_f32x8_haswell`
__m256 a_vec = _mm256_set1_ps(simsimd_bf16_to_f32(a + i));
__m256 cb_j_vec = _mm256_setzero_ps();
for (simsimd_size_t j = 0; j + 8 <= n; j += 8) {
__m256 b_vec = _simsimd_bf16x8_to_f32x8_haswell(_mm_lddqu_si128((__m128i const *)(b + j)));
__m256 c_vec = _simsimd_bf16x8_to_f32x8_haswell(_mm_lddqu_si128((__m128i const *)(c + i * n + j)));
cb_j_vec = _mm256_fmadd_ps(b_vec, c_vec, cb_j_vec);
}
sum_vec = _mm256_fmadd_ps(a_vec, cb_j_vec, sum_vec);
}
// Handle the tail of every row
simsimd_f32_t sum = _simsimd_reduce_f32x8_haswell(sum_vec);
simsimd_size_t const tail_length = n % 8;
simsimd_size_t const tail_start = n - tail_length;
if (tail_length) {
for (simsimd_size_t i = 0; i != n; ++i) {
simsimd_f32_t a_i = simsimd_bf16_to_f32(a + i);
__m256 b_vec = _simsimd_bf16x8_to_f32x8_haswell( //
_simsimd_partial_load_bf16x8_haswell(b + tail_start, tail_length));
__m256 c_vec = _simsimd_bf16x8_to_f32x8_haswell( //
_simsimd_partial_load_bf16x8_haswell(c + i * n + tail_start, tail_length));
simsimd_f32_t cb_j = _simsimd_reduce_f32x8_haswell(_mm256_mul_ps(b_vec, c_vec));
sum += a_i * cb_j;
}
}
*result = sum;
}
SIMSIMD_PUBLIC void simsimd_mahalanobis_bf16_haswell(simsimd_bf16_t const *a, simsimd_bf16_t const *b,
simsimd_bf16_t const *c, simsimd_size_t n,
simsimd_distance_t *result) {
__m256 sum_vec = _mm256_setzero_ps();
for (simsimd_size_t i = 0; i != n; ++i) {
__m256 diff_i_vec = _mm256_sub_ps( //
_mm256_set1_ps(simsimd_bf16_to_f32(a + i)), //
_mm256_set1_ps(simsimd_bf16_to_f32(b + i)));
__m256 cdiff_j_vec = _mm256_setzero_ps();
for (simsimd_size_t j = 0; j + 8 <= n; j += 8) {
__m256 diff_j_vec = _mm256_sub_ps( //
_simsimd_bf16x8_to_f32x8_haswell(_mm_lddqu_si128((__m128i const *)(a + j))), //
_simsimd_bf16x8_to_f32x8_haswell(_mm_lddqu_si128((__m128i const *)(b + j))));
__m256 c_vec = _simsimd_bf16x8_to_f32x8_haswell(_mm_lddqu_si128((__m128i const *)(c + i * n + j)));
cdiff_j_vec = _mm256_fmadd_ps(diff_j_vec, c_vec, cdiff_j_vec);
}
sum_vec = _mm256_fmadd_ps(diff_i_vec, cdiff_j_vec, sum_vec);
}
// Handle the tail of every row
simsimd_f32_t sum = _simsimd_reduce_f32x8_haswell(sum_vec);
simsimd_size_t const tail_length = n % 8;
simsimd_size_t const tail_start = n - tail_length;
if (tail_length) {
for (simsimd_size_t i = 0; i != n; ++i) {
simsimd_f32_t diff_i = simsimd_bf16_to_f32(a + i) - simsimd_bf16_to_f32(b + i);
__m256 diff_j_vec = _mm256_sub_ps( //
_simsimd_bf16x8_to_f32x8_haswell(_simsimd_partial_load_bf16x8_haswell(a + tail_start, tail_length)),
_simsimd_bf16x8_to_f32x8_haswell(_simsimd_partial_load_bf16x8_haswell(b + tail_start, tail_length)));
__m256 c_vec = _simsimd_bf16x8_to_f32x8_haswell(
_simsimd_partial_load_bf16x8_haswell(c + i * n + tail_start, tail_length));
simsimd_f32_t cdiff_j = _simsimd_reduce_f32x8_haswell(_mm256_mul_ps(diff_j_vec, c_vec));
sum += diff_i * cdiff_j;
}
}
*result = _simsimd_sqrt_f32_haswell(sum);
}
#pragma clang attribute pop
#pragma GCC pop_options
#endif // SIMSIMD_TARGET_HASWELL
#if SIMSIMD_TARGET_SKYLAKE
#pragma GCC push_options
#pragma GCC target("avx2", "avx512f", "avx512vl", "bmi2")
#pragma clang attribute push(__attribute__((target("avx2,avx512f,avx512vl,bmi2"))), apply_to = function)
SIMSIMD_PUBLIC void simsimd_bilinear_f32_skylake_under16unrolled(simsimd_f32_t const *a, simsimd_f32_t const *b,
simsimd_f32_t const *c, simsimd_size_t n,
simsimd_distance_t *result) {
// The goal of this optimization is to avoid horizontal accumulation of the cb_j sums
// until the very end of the computation.
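    // Using four independent accumulators also breaks the FMA dependency chain, letting
    // consecutive rows issue without stalling on each other's latency.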
__mmask16 const mask = (__mmask16)_bzhi_u32(0xFFFFFFFF, n);
__m512 const b_vec = _mm512_maskz_loadu_ps(mask, b);
__m512 cb_j1 = _mm512_setzero_ps();
__m512 cb_j2 = _mm512_setzero_ps();
__m512 cb_j3 = _mm512_setzero_ps();
__m512 cb_j4 = _mm512_setzero_ps();
// Unroll the loop to process 4x ZMM registers at a time.
simsimd_size_t i = 0;
for (; i + 4 <= n; i += 4) {
cb_j1 = _mm512_fmadd_ps(_mm512_maskz_loadu_ps(mask, c + n * (i + 0)),
_mm512_mul_ps(b_vec, _mm512_set1_ps(a[i + 0])), cb_j1);
cb_j2 = _mm512_fmadd_ps(_mm512_maskz_loadu_ps(mask, c + n * (i + 1)),
_mm512_mul_ps(b_vec, _mm512_set1_ps(a[i + 1])), cb_j2);
cb_j3 = _mm512_fmadd_ps(_mm512_maskz_loadu_ps(mask, c + n * (i + 2)),
_mm512_mul_ps(b_vec, _mm512_set1_ps(a[i + 2])), cb_j3);
cb_j4 = _mm512_fmadd_ps(_mm512_maskz_loadu_ps(mask, c + n * (i + 3)),
_mm512_mul_ps(b_vec, _mm512_set1_ps(a[i + 3])), cb_j4);
}
if (i + 0 < n)
cb_j1 = _mm512_fmadd_ps(_mm512_maskz_loadu_ps(mask, c + n * (i + 0)),
_mm512_mul_ps(b_vec, _mm512_set1_ps(a[i + 0])), cb_j1);
if (i + 1 < n)
cb_j2 = _mm512_fmadd_ps(_mm512_maskz_loadu_ps(mask, c + n * (i + 1)),
_mm512_mul_ps(b_vec, _mm512_set1_ps(a[i + 1])), cb_j2);
if (i + 2 < n)
cb_j3 = _mm512_fmadd_ps(_mm512_maskz_loadu_ps(mask, c + n * (i + 2)),
_mm512_mul_ps(b_vec, _mm512_set1_ps(a[i + 2])), cb_j3);
if (i + 3 < n)
cb_j4 = _mm512_fmadd_ps(_mm512_maskz_loadu_ps(mask, c + n * (i + 3)),
_mm512_mul_ps(b_vec, _mm512_set1_ps(a[i + 3])), cb_j4);
// Combine cb_j sums
__m512 sum_vec = _mm512_add_ps( //
_mm512_add_ps(cb_j1, cb_j2), //
_mm512_add_ps(cb_j3, cb_j4));
*result = _mm512_reduce_add_ps(sum_vec);
}
SIMSIMD_PUBLIC void simsimd_bilinear_f32_skylake(simsimd_f32_t const *a, simsimd_f32_t const *b, simsimd_f32_t const *c,
simsimd_size_t n, simsimd_distance_t *result) {
// On modern x86 CPUs we have enough register space to load fairly large matrices with up to 16 cells
// per row and 16 rows at a time, still keeping enough register space for temporaries.
if (n <= 16) {
simsimd_bilinear_f32_skylake_under16unrolled(a, b, c, n, result);
return;
}
// Default case for arbitrary size `n`
simsimd_size_t const tail_length = n % 16;
simsimd_size_t const tail_start = n - tail_length;
__m512 sum_vec = _mm512_setzero_ps();
__mmask16 const tail_mask = (__mmask16)_bzhi_u32(0xFFFFFFFF, tail_length);
for (simsimd_size_t i = 0; i != n; ++i) {
__m512 a_vec = _mm512_set1_ps(a[i]);
__m512 cb_j_vec = _mm512_setzero_ps();
__m512 b_vec, c_vec;
simsimd_size_t j = 0;
simsimd_bilinear_f32_skylake_cycle:
if (j + 16 <= n) {
b_vec = _mm512_loadu_ps(b + j);
c_vec = _mm512_loadu_ps(c + i * n + j);
}
else {
b_vec = _mm512_maskz_loadu_ps(tail_mask, b + tail_start);
c_vec = _mm512_maskz_loadu_ps(tail_mask, c + i * n + tail_start);
}
cb_j_vec = _mm512_fmadd_ps(b_vec, c_vec, cb_j_vec);
j += 16;
if (j < n) goto simsimd_bilinear_f32_skylake_cycle;
sum_vec = _mm512_fmadd_ps(a_vec, cb_j_vec, sum_vec);
}
*result = _mm512_reduce_add_ps(sum_vec);
}
SIMSIMD_PUBLIC void simsimd_mahalanobis_f32_skylake(simsimd_f32_t const *a, simsimd_f32_t const *b,
simsimd_f32_t const *c, simsimd_size_t n,
simsimd_distance_t *result) {
simsimd_size_t const tail_length = n % 16;
simsimd_size_t const tail_start = n - tail_length;
__m512 sum_vec = _mm512_setzero_ps();
__mmask16 const tail_mask = (__mmask16)_bzhi_u32(0xFFFFFFFF, tail_length);
for (simsimd_size_t i = 0; i != n; ++i) {
__m512 diff_i_vec = _mm512_set1_ps(a[i] - b[i]);
        __m512 cdiff_j_vec = _mm512_setzero_ps();
        __m512 a_j_vec, b_j_vec, diff_j_vec, c_vec;
        simsimd_size_t j = 0;
        // The nested loop is cleaner to implement with a `goto` in this case:
    simsimd_mahalanobis_f32_skylake_cycle:
if (j + 16 <= n) {
a_j_vec = _mm512_loadu_ps(a + j);
b_j_vec = _mm512_loadu_ps(b + j);
c_vec = _mm512_loadu_ps(c + i * n + j);
}
else {
a_j_vec = _mm512_maskz_loadu_ps(tail_mask, a + tail_start);
b_j_vec = _mm512_maskz_loadu_ps(tail_mask, b + tail_start);
c_vec = _mm512_maskz_loadu_ps(tail_mask, c + i * n + tail_start);
}
diff_j_vec = _mm512_sub_ps(a_j_vec, b_j_vec);
cdiff_j_vec = _mm512_fmadd_ps(diff_j_vec, c_vec, cdiff_j_vec);
j += 16;
        if (j < n) goto simsimd_mahalanobis_f32_skylake_cycle;
sum_vec = _mm512_fmadd_ps(diff_i_vec, cdiff_j_vec, sum_vec);
}
*result = _simsimd_sqrt_f64_haswell(_mm512_reduce_add_ps(sum_vec));
}
SIMSIMD_PUBLIC void simsimd_bilinear_f32c_skylake(simsimd_f32c_t const *a, simsimd_f32c_t const *b,
simsimd_f32c_t const *c, simsimd_size_t n,
simsimd_distance_t *results) {
    // We take into account that FMS is the same as FMA with a negative multiplier.
// To multiply a floating-point value by -1, we can use the `XOR` instruction to flip the sign bit.
// This way we can avoid the shuffling and the need for separate real and imaginary parts.
// For the imaginary part of the product, we would need to swap the real and imaginary parts of
// one of the vectors.
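    // A scalar sketch of the trick (illustrative only): if a pair of adjacent lanes holds
    //     lane0 = c.real * b.real, lane1 = c.imag * b.imag,
    // then flipping the sign bit of lane1 (XOR with 0x80000000) negates it, so the horizontal
    // reduction lane0 + lane1 yields the real part c.real * b.real - c.imag * b.imag.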
__m512i const sign_flip_vec = _mm512_set1_epi64(0x8000000000000000);
// Default case for arbitrary size `n`
simsimd_size_t const tail_length = n % 8;
simsimd_size_t const tail_start = n - tail_length;
__mmask16 const tail_mask = (__mmask16)_bzhi_u32(0xFFFFFFFF, tail_length * 2);
simsimd_f32_t sum_real = 0;
simsimd_f32_t sum_imag = 0;
for (simsimd_size_t i = 0; i != n; ++i) {
simsimd_f32_t const a_i_real = a[i].real;
simsimd_f32_t const a_i_imag = a[i].imag;
__m512 cb_j_real_vec = _mm512_setzero_ps();
__m512 cb_j_imag_vec = _mm512_setzero_ps();
__m512 b_vec, c_vec;
simsimd_size_t j = 0;
simsimd_bilinear_f32c_skylake_cycle:
if (j + 8 <= n) {
b_vec = _mm512_loadu_ps((simsimd_f32_t const *)(b + j));
c_vec = _mm512_loadu_ps((simsimd_f32_t const *)(c + i * n + j));
}
else {
b_vec = _mm512_maskz_loadu_ps(tail_mask, (simsimd_f32_t const *)(b + tail_start));
c_vec = _mm512_maskz_loadu_ps(tail_mask, (simsimd_f32_t const *)(c + i * n + tail_start));
}
// The real part of the product: b.real * c.real - b.imag * c.imag.
// The subtraction will be performed later with a sign flip.
cb_j_real_vec = _mm512_fmadd_ps(c_vec, b_vec, cb_j_real_vec);
// The imaginary part of the product: b.real * c.imag + b.imag * c.real.
// Swap the imaginary and real parts of `c` before multiplication:
c_vec = _mm512_permute_ps(c_vec, 0xB1); //? Swap adjacent entries within each pair
cb_j_imag_vec = _mm512_fmadd_ps(c_vec, b_vec, cb_j_imag_vec);
j += 8;
if (j < n) goto simsimd_bilinear_f32c_skylake_cycle;
// Flip the sign bit in every second scalar before accumulation:
cb_j_real_vec = _mm512_castsi512_ps(_mm512_xor_si512(_mm512_castps_si512(cb_j_real_vec), sign_flip_vec));
// Horizontal sums are the expensive part of the computation:
simsimd_f32_t const cb_j_real = _mm512_reduce_add_ps(cb_j_real_vec);
simsimd_f32_t const cb_j_imag = _mm512_reduce_add_ps(cb_j_imag_vec);
sum_real += a_i_real * cb_j_real - a_i_imag * cb_j_imag;
sum_imag += a_i_real * cb_j_imag + a_i_imag * cb_j_real;
}
// Reduce horizontal sums:
results[0] = sum_real;
results[1] = sum_imag;
}
SIMSIMD_PUBLIC void simsimd_bilinear_f64_skylake_under8unrolled(simsimd_f64_t const *a, simsimd_f64_t const *b,
simsimd_f64_t const *c, simsimd_size_t n,
simsimd_distance_t *result) {
// The goal of this optimization is to avoid horizontal accumulation of the cb_j sums
// until the very end of the computation.
__mmask8 const row_mask = (__mmask8)_bzhi_u32(0xFFFFFFFF, n);
__m512d const b_vec = _mm512_maskz_loadu_pd(row_mask, b);
__m512d cb_j1 = _mm512_setzero_pd();
__m512d cb_j2 = _mm512_setzero_pd();
__m512d cb_j3 = _mm512_setzero_pd();
__m512d cb_j4 = _mm512_setzero_pd();
// clang-format off
if (n > 0) cb_j1 = _mm512_fmadd_pd(_mm512_maskz_loadu_pd(row_mask, c + n * 0), _mm512_mul_pd(b_vec, _mm512_set1_pd(a[0])), cb_j1);
if (n > 1) cb_j2 = _mm512_fmadd_pd(_mm512_maskz_loadu_pd(row_mask, c + n * 1), _mm512_mul_pd(b_vec, _mm512_set1_pd(a[1])), cb_j2);
if (n > 2) cb_j3 = _mm512_fmadd_pd(_mm512_maskz_loadu_pd(row_mask, c + n * 2), _mm512_mul_pd(b_vec, _mm512_set1_pd(a[2])), cb_j3);
if (n > 3) cb_j4 = _mm512_fmadd_pd(_mm512_maskz_loadu_pd(row_mask, c + n * 3), _mm512_mul_pd(b_vec, _mm512_set1_pd(a[3])), cb_j4);
if (n > 4) cb_j1 = _mm512_fmadd_pd(_mm512_maskz_loadu_pd(row_mask, c + n * 4), _mm512_mul_pd(b_vec, _mm512_set1_pd(a[4])), cb_j1);
if (n > 5) cb_j2 = _mm512_fmadd_pd(_mm512_maskz_loadu_pd(row_mask, c + n * 5), _mm512_mul_pd(b_vec, _mm512_set1_pd(a[5])), cb_j2);
if (n > 6) cb_j3 = _mm512_fmadd_pd(_mm512_maskz_loadu_pd(row_mask, c + n * 6), _mm512_mul_pd(b_vec, _mm512_set1_pd(a[6])), cb_j3);
if (n > 7) cb_j4 = _mm512_fmadd_pd(_mm512_maskz_loadu_pd(row_mask, c + n * 7), _mm512_mul_pd(b_vec, _mm512_set1_pd(a[7])), cb_j4);
// clang-format on
// Combine cb_j sums
__m512d sum_vec = _mm512_add_pd( //
_mm512_add_pd(cb_j1, cb_j2), //
_mm512_add_pd(cb_j3, cb_j4));
*result = _mm512_reduce_add_pd(sum_vec);
}
SIMSIMD_PUBLIC void simsimd_bilinear_f64_skylake(simsimd_f64_t const *a, simsimd_f64_t const *b, simsimd_f64_t const *c,
simsimd_size_t n, simsimd_distance_t *result) {
    // On modern x86 CPUs we have enough register space to load fairly large matrices with up to 8 cells
    // per row and 8 rows at a time, still keeping enough register space for temporaries.
if (n <= 8) {
simsimd_bilinear_f64_skylake_under8unrolled(a, b, c, n, result);
return;
}
// Default case for arbitrary size `n`
simsimd_size_t const tail_length = n % 8;
simsimd_size_t const tail_start = n - tail_length;
__m512d sum_vec = _mm512_setzero_pd();
__mmask8 const tail_mask = (__mmask8)_bzhi_u32(0xFFFFFFFF, tail_length);
for (simsimd_size_t i = 0; i != n; ++i) {
__m512d a_vec = _mm512_set1_pd(a[i]);
__m512d cb_j_vec = _mm512_setzero_pd();
__m512d b_vec, c_vec;
simsimd_size_t j = 0;
simsimd_bilinear_f64_skylake_cycle:
if (j + 8 <= n) {
b_vec = _mm512_loadu_pd(b + j);
c_vec = _mm512_loadu_pd(c + i * n + j);
}
else {
b_vec = _mm512_maskz_loadu_pd(tail_mask, b + tail_start);
c_vec = _mm512_maskz_loadu_pd(tail_mask, c + i * n + tail_start);
}
cb_j_vec = _mm512_fmadd_pd(b_vec, c_vec, cb_j_vec);
j += 8;
if (j < n) goto simsimd_bilinear_f64_skylake_cycle;
sum_vec = _mm512_fmadd_pd(a_vec, cb_j_vec, sum_vec);
}
*result = _mm512_reduce_add_pd(sum_vec);
}
SIMSIMD_PUBLIC void simsimd_mahalanobis_f64_skylake(simsimd_f64_t const *a, simsimd_f64_t const *b,
simsimd_f64_t const *c, simsimd_size_t n,
simsimd_distance_t *result) {
simsimd_size_t const tail_length = n % 8;
simsimd_size_t const tail_start = n - tail_length;
__mmask8 const tail_mask = (__mmask8)_bzhi_u32(0xFFFFFFFF, tail_length);
__m512d sum_vec = _mm512_setzero_pd();
for (simsimd_size_t i = 0; i != n; ++i) {
__m512d diff_i_vec = _mm512_set1_pd(a[i] - b[i]);
__m512d cdiff_j_vec = _mm512_setzero_pd();
__m512d a_j_vec, b_j_vec, diff_j_vec, c_vec;
simsimd_size_t j = 0;
// The nested loop is cleaner to implement with a `goto` in this case:
    simsimd_mahalanobis_f64_skylake_cycle:
if (j + 8 <= n) {
a_j_vec = _mm512_loadu_pd(a + j);
b_j_vec = _mm512_loadu_pd(b + j);
c_vec = _mm512_loadu_pd(c + i * n + j);
}
else {
a_j_vec = _mm512_maskz_loadu_pd(tail_mask, a + tail_start);
b_j_vec = _mm512_maskz_loadu_pd(tail_mask, b + tail_start);
c_vec = _mm512_maskz_loadu_pd(tail_mask, c + i * n + tail_start);
}
diff_j_vec = _mm512_sub_pd(a_j_vec, b_j_vec);
cdiff_j_vec = _mm512_fmadd_pd(diff_j_vec, c_vec, cdiff_j_vec);
j += 8;
        if (j < n) goto simsimd_mahalanobis_f64_skylake_cycle;
sum_vec = _mm512_fmadd_pd(diff_i_vec, cdiff_j_vec, sum_vec);
}
*result = _simsimd_sqrt_f64_haswell(_mm512_reduce_add_pd(sum_vec));
}
SIMSIMD_PUBLIC void simsimd_bilinear_f64c_skylake(simsimd_f64c_t const *a, simsimd_f64c_t const *b,
simsimd_f64c_t const *c, simsimd_size_t n,
simsimd_distance_t *results) {
    // We take into account that FMS is the same as FMA with a negative multiplier.
// To multiply a floating-point value by -1, we can use the `XOR` instruction to flip the sign bit.
// This way we can avoid the shuffling and the need for separate real and imaginary parts.
// For the imaginary part of the product, we would need to swap the real and imaginary parts of
// one of the vectors.
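    // For reference, the complex product expands as:
    //
    //      (b.re + i*b.im) * (c.re + i*c.im) = (b.re*c.re - b.im*c.im) + i*(b.re*c.im + b.im*c.re)
    //
    // With the interleaved [re, im, re, im, ...] layout an element-wise multiply already produces
    // both kinds of terms; the XOR below negates the odd (imaginary-slot) lanes before the
    // horizontal reduction of the real part, and a `permute` swaps `re`/`im` within each pair
    // for the imaginary part.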
__m512i const sign_flip_vec = _mm512_set_epi64( //
0x8000000000000000, 0x0000000000000000, 0x8000000000000000, 0x0000000000000000, //
0x8000000000000000, 0x0000000000000000, 0x8000000000000000, 0x0000000000000000 //
);
// Default case for arbitrary size `n`
simsimd_size_t const tail_length = n % 4;
simsimd_size_t const tail_start = n - tail_length;
__mmask8 const tail_mask = (__mmask8)_bzhi_u32(0xFFFFFFFF, tail_length * 2);
simsimd_f64_t sum_real = 0;
simsimd_f64_t sum_imag = 0;
for (simsimd_size_t i = 0; i != n; ++i) {
simsimd_f64_t const a_i_real = a[i].real;
simsimd_f64_t const a_i_imag = a[i].imag;
__m512d cb_j_real_vec = _mm512_setzero_pd();
__m512d cb_j_imag_vec = _mm512_setzero_pd();
__m512d b_vec, c_vec;
simsimd_size_t j = 0;
simsimd_bilinear_f64c_skylake_cycle:
if (j + 4 <= n) {
b_vec = _mm512_loadu_pd((simsimd_f64_t const *)(b + j));
c_vec = _mm512_loadu_pd((simsimd_f64_t const *)(c + i * n + j));
}
else {
b_vec = _mm512_maskz_loadu_pd(tail_mask, (simsimd_f64_t const *)(b + tail_start));
c_vec = _mm512_maskz_loadu_pd(tail_mask, (simsimd_f64_t const *)(c + i * n + tail_start));
}
// The real part of the product: b.real * c.real - b.imag * c.imag.
// The subtraction will be performed later with a sign flip.
cb_j_real_vec = _mm512_fmadd_pd(c_vec, b_vec, cb_j_real_vec);
// The imaginary part of the product: b.real * c.imag + b.imag * c.real.
// Swap the imaginary and real parts of `c` before multiplication:
c_vec = _mm512_permute_pd(c_vec, 0x55); //? Same as 0b01010101.
cb_j_imag_vec = _mm512_fmadd_pd(c_vec, b_vec, cb_j_imag_vec);
j += 4;
if (j < n) goto simsimd_bilinear_f64c_skylake_cycle;
// Flip the sign bit in every second scalar before accumulation:
cb_j_real_vec = _mm512_castsi512_pd(_mm512_xor_si512(_mm512_castpd_si512(cb_j_real_vec), sign_flip_vec));
// Horizontal sums are the expensive part of the computation:
simsimd_f64_t const cb_j_real = _mm512_reduce_add_pd(cb_j_real_vec);
simsimd_f64_t const cb_j_imag = _mm512_reduce_add_pd(cb_j_imag_vec);
sum_real += a_i_real * cb_j_real - a_i_imag * cb_j_imag;
sum_imag += a_i_real * cb_j_imag + a_i_imag * cb_j_real;
}
    // Export the accumulated sums:
results[0] = sum_real;
results[1] = sum_imag;
}
#pragma clang attribute pop
#pragma GCC pop_options
#endif // SIMSIMD_TARGET_SKYLAKE
#if SIMSIMD_TARGET_GENOA
#pragma GCC push_options
#pragma GCC target("avx2", "avx512f", "avx512vl", "bmi2", "avx512bw", "avx512bf16")
#pragma clang attribute push(__attribute__((target("avx2,avx512f,avx512vl,bmi2,avx512bw,avx512bf16"))), \
apply_to = function)
SIMSIMD_PUBLIC void simsimd_bilinear_bf16_genoa(simsimd_bf16_t const *a, simsimd_bf16_t const *b,
simsimd_bf16_t const *c, simsimd_size_t n, simsimd_distance_t *result) {
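    // `_mm512_dpbf16_ps(acc, x, y)` treats `x` and `y` as 32 `bf16` scalars and accumulates
    // pairwise products into 16 `f32` lanes: acc[k] += x[2k]*y[2k] + x[2k+1]*y[2k+1].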
simsimd_size_t const tail_length = n % 32;
simsimd_size_t const tail_start = n - tail_length;
__mmask32 const tail_mask = (__mmask32)_bzhi_u32(0xFFFFFFFF, tail_length);
__m512 sum_vec = _mm512_setzero_ps();
for (simsimd_size_t i = 0; i != n; ++i) {
__m512 a_vec = _mm512_set1_ps(simsimd_bf16_to_f32(a + i));
__m512 cb_j_vec = _mm512_setzero_ps();
__m512i b_vec, c_vec;
simsimd_size_t j = 0;
simsimd_bilinear_bf16_genoa_cycle:
if (j + 32 <= n) {
b_vec = _mm512_loadu_epi16(b + j);
c_vec = _mm512_loadu_epi16(c + i * n + j);
}
else {
b_vec = _mm512_maskz_loadu_epi16(tail_mask, b + tail_start);
c_vec = _mm512_maskz_loadu_epi16(tail_mask, c + i * n + tail_start);
}
cb_j_vec = _mm512_dpbf16_ps(cb_j_vec, (__m512bh)(b_vec), (__m512bh)(c_vec));
j += 32;
if (j < n) goto simsimd_bilinear_bf16_genoa_cycle;
sum_vec = _mm512_fmadd_ps(a_vec, cb_j_vec, sum_vec);
}
*result = _mm512_reduce_add_ps(sum_vec);
}
SIMSIMD_PUBLIC void simsimd_mahalanobis_bf16_genoa(simsimd_bf16_t const *a, simsimd_bf16_t const *b,
simsimd_bf16_t const *c, simsimd_size_t n,
simsimd_distance_t *result) {
simsimd_size_t const tail_length = n % 32;
simsimd_size_t const tail_start = n - tail_length;
__mmask32 const tail_mask = (__mmask32)_bzhi_u32(0xFFFFFFFF, tail_length);
__m512 sum_vec = _mm512_setzero_ps();
for (simsimd_size_t i = 0; i != n; ++i) {
__m512 diff_i_vec = _mm512_set1_ps(simsimd_bf16_to_f32(a + i) - simsimd_bf16_to_f32(b + i));
__m512 cdiff_j_vec = _mm512_setzero_ps();
__m512i a_j_vec, b_j_vec, diff_j_vec, c_vec;
simsimd_size_t j = 0;
// The nested loop is cleaner to implement with a `goto` in this case:
simsimd_mahalanobis_bf16_genoa_cycle:
if (j + 32 <= n) {
a_j_vec = _mm512_loadu_epi16(a + j);
b_j_vec = _mm512_loadu_epi16(b + j);
c_vec = _mm512_loadu_epi16(c + i * n + j);
}
else {
a_j_vec = _mm512_maskz_loadu_epi16(tail_mask, a + tail_start);
b_j_vec = _mm512_maskz_loadu_epi16(tail_mask, b + tail_start);
c_vec = _mm512_maskz_loadu_epi16(tail_mask, c + i * n + tail_start);
}
diff_j_vec = _simsimd_substract_bf16x32_genoa(a_j_vec, b_j_vec);
cdiff_j_vec = _mm512_dpbf16_ps(cdiff_j_vec, (__m512bh)(diff_j_vec), (__m512bh)(c_vec));
j += 32;
if (j < n) goto simsimd_mahalanobis_bf16_genoa_cycle;
sum_vec = _mm512_fmadd_ps(diff_i_vec, cdiff_j_vec, sum_vec);
}
*result = _simsimd_sqrt_f32_haswell(_mm512_reduce_add_ps(sum_vec));
}
SIMSIMD_PUBLIC void simsimd_bilinear_bf16c_genoa(simsimd_bf16c_t const *a, simsimd_bf16c_t const *b,
simsimd_bf16c_t const *c, simsimd_size_t n,
simsimd_distance_t *results) {
    // We take into account that FMS is the same as FMA with a negative multiplier.
// To multiply a floating-point value by -1, we can use the `XOR` instruction to flip the sign bit.
// This way we can avoid the shuffling and the need for separate real and imaginary parts.
// For the imaginary part of the product, we would need to swap the real and imaginary parts of
// one of the vectors.
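    // Each `bf16` complex number occupies one 32-bit lane: the real part in the low half-word and
    // the imaginary part in the high one. `_mm512_dpbf16_ps` therefore computes `b.re*c.re + b.im*c.im`
    // per lane; flipping the sign of the high half of `c` turns that into the real part
    // `b.re*c.re - b.im*c.im`, while swapping the halves of `c` yields the imaginary part
    // `b.re*c.im + b.im*c.re`.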
__m512i const sign_flip_vec = _mm512_set1_epi32(0x80000000);
__m512i const swap_adjacent_vec = _mm512_set_epi8( //
61, 60, 63, 62, 57, 56, 59, 58, 53, 52, 55, 54, 49, 48, 51, 50, // 4th 128-bit lane
45, 44, 47, 46, 41, 40, 43, 42, 37, 36, 39, 38, 33, 32, 35, 34, // 3rd 128-bit lane
29, 28, 31, 30, 25, 24, 27, 26, 21, 20, 23, 22, 17, 16, 19, 18, // 2nd 128-bit lane
13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2 // 1st 128-bit lane
);
// Default case for arbitrary size `n`
simsimd_size_t const tail_length = n % 16;
simsimd_size_t const tail_start = n - tail_length;
__mmask32 const tail_mask = (__mmask32)_bzhi_u32(0xFFFFFFFF, tail_length * 2);
simsimd_f64_t sum_real = 0;
simsimd_f64_t sum_imag = 0;
for (simsimd_size_t i = 0; i != n; ++i) {
simsimd_f32_t const a_i_real = a[i].real;
simsimd_f32_t const a_i_imag = a[i].imag;
__m512 cb_j_real_vec = _mm512_setzero_ps();
__m512 cb_j_imag_vec = _mm512_setzero_ps();
__m512i b_vec, c_vec;
simsimd_size_t j = 0;
    simsimd_bilinear_bf16c_genoa_cycle:
if (j + 16 <= n) {
b_vec = _mm512_loadu_epi16((simsimd_i16_t const *)(b + j));
c_vec = _mm512_loadu_epi16((simsimd_i16_t const *)(c + i * n + j));
}
else {
b_vec = _mm512_maskz_loadu_epi16(tail_mask, (simsimd_i16_t const *)(b + tail_start));
c_vec = _mm512_maskz_loadu_epi16(tail_mask, (simsimd_i16_t const *)(c + i * n + tail_start));
}
cb_j_real_vec = _mm512_dpbf16_ps( //
cb_j_real_vec, //
(__m512bh)(_mm512_xor_si512(c_vec, sign_flip_vec)), //
(__m512bh)b_vec);
cb_j_imag_vec = _mm512_dpbf16_ps( //
cb_j_imag_vec, //
(__m512bh)(_mm512_shuffle_epi8(c_vec, swap_adjacent_vec)), //
(__m512bh)b_vec);
j += 16;
        if (j < n) goto simsimd_bilinear_bf16c_genoa_cycle;
// Horizontal sums are the expensive part of the computation:
simsimd_f64_t const cb_j_real = _simsimd_reduce_f32x16_skylake(cb_j_real_vec);
simsimd_f64_t const cb_j_imag = _simsimd_reduce_f32x16_skylake(cb_j_imag_vec);
sum_real += a_i_real * cb_j_real - a_i_imag * cb_j_imag;
sum_imag += a_i_real * cb_j_imag + a_i_imag * cb_j_real;
}
    // Export the accumulated sums:
results[0] = sum_real;
results[1] = sum_imag;
}
#pragma clang attribute pop
#pragma GCC pop_options
#endif // SIMSIMD_TARGET_GENOA
#if SIMSIMD_TARGET_SAPPHIRE
#pragma GCC push_options
#pragma GCC target("avx2", "avx512f", "avx512vl", "bmi2", "avx512bw", "avx512fp16")
#pragma clang attribute push(__attribute__((target("avx2,avx512f,avx512vl,bmi2,avx512bw,avx512fp16"))), \
apply_to = function)
SIMSIMD_PUBLIC void simsimd_bilinear_f16_sapphire_under32unrolled(simsimd_f16_t const *a, simsimd_f16_t const *b,
simsimd_f16_t const *c, simsimd_size_t const n,
simsimd_distance_t *result) {
// The goal of this optimization is to avoid horizontal accumulation of the cb_j sums
// until the very end of the computation.
__mmask32 const mask = (__mmask32)_bzhi_u32(0xFFFFFFFF, n);
__m512h const b_vec = _mm512_castsi512_ph(_mm512_maskz_loadu_epi16(mask, b));
// Independently accumulate the partial sums into separate variables to avoid data-dependencies.
__m512h cb_j1 = _mm512_setzero_ph();
__m512h cb_j2 = _mm512_setzero_ph();
__m512h cb_j3 = _mm512_setzero_ph();
__m512h cb_j4 = _mm512_setzero_ph();
// Unroll the loop to process 4x ZMM registers at a time.
simsimd_size_t i = 0;
for (; i + 4 <= n; i += 4) {
// If the code is compiled without native support for `_Float16`, we need a workaround
        // to avoid implicit casts from our `simsimd_f16_t` to `_Float16`.
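        // Reading `a[i]` through a 16-bit integer pointer and broadcasting the raw bit pattern
        // with `_mm512_set1_epi16`, then reinterpreting the register as `__m512h`, keeps the
        // broadcast purely integral and sidesteps any scalar `_Float16` conversion.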
cb_j1 = _mm512_fmadd_ph(
_mm512_castsi512_ph(_mm512_maskz_loadu_epi16(mask, c + n * (i + 0))),
_mm512_mul_ph(b_vec, _mm512_castsi512_ph(_mm512_set1_epi16(((simsimd_i16_t const *)a)[i + 0]))), cb_j1);
cb_j2 = _mm512_fmadd_ph(
_mm512_castsi512_ph(_mm512_maskz_loadu_epi16(mask, c + n * (i + 1))),
_mm512_mul_ph(b_vec, _mm512_castsi512_ph(_mm512_set1_epi16(((simsimd_i16_t const *)a)[i + 1]))), cb_j2);
cb_j3 = _mm512_fmadd_ph(
_mm512_castsi512_ph(_mm512_maskz_loadu_epi16(mask, c + n * (i + 2))),
_mm512_mul_ph(b_vec, _mm512_castsi512_ph(_mm512_set1_epi16(((simsimd_i16_t const *)a)[i + 2]))), cb_j3);
cb_j4 = _mm512_fmadd_ph(
_mm512_castsi512_ph(_mm512_maskz_loadu_epi16(mask, c + n * (i + 3))),
_mm512_mul_ph(b_vec, _mm512_castsi512_ph(_mm512_set1_epi16(((simsimd_i16_t const *)a)[i + 3]))), cb_j4);
}
// Handle the tail of the loop:
if (i + 0 < n)
cb_j1 = _mm512_fmadd_ph(
_mm512_castsi512_ph(_mm512_maskz_loadu_epi16(mask, c + n * (i + 0))),
_mm512_mul_ph(b_vec, _mm512_castsi512_ph(_mm512_set1_epi16(((simsimd_i16_t const *)a)[i + 0]))), cb_j1);
if (i + 1 < n)
cb_j2 = _mm512_fmadd_ph(
_mm512_castsi512_ph(_mm512_maskz_loadu_epi16(mask, c + n * (i + 1))),
_mm512_mul_ph(b_vec, _mm512_castsi512_ph(_mm512_set1_epi16(((simsimd_i16_t const *)a)[i + 1]))), cb_j2);
if (i + 2 < n)
cb_j3 = _mm512_fmadd_ph(
_mm512_castsi512_ph(_mm512_maskz_loadu_epi16(mask, c + n * (i + 2))),
_mm512_mul_ph(b_vec, _mm512_castsi512_ph(_mm512_set1_epi16(((simsimd_i16_t const *)a)[i + 2]))), cb_j3);
if (i + 3 < n)
cb_j4 = _mm512_fmadd_ph(
_mm512_castsi512_ph(_mm512_maskz_loadu_epi16(mask, c + n * (i + 3))),
_mm512_mul_ph(b_vec, _mm512_castsi512_ph(_mm512_set1_epi16(((simsimd_i16_t const *)a)[i + 3]))), cb_j4);
// Combine cb_j sums
__m512h sum_vec = _mm512_add_ph( //
_mm512_add_ph(cb_j1, cb_j2), //
_mm512_add_ph(cb_j3, cb_j4));
*result = _mm512_reduce_add_ph(sum_vec);
}
SIMSIMD_PUBLIC void simsimd_bilinear_f16_sapphire(simsimd_f16_t const *a, simsimd_f16_t const *b,
simsimd_f16_t const *c, simsimd_size_t n,
simsimd_distance_t *result) {
// On modern x86 CPUs we have enough register space to load fairly large matrices with up to 32 cells
// per row and 32 rows at a time, still keeping enough register space for temporaries.
if (n <= 32) {
simsimd_bilinear_f16_sapphire_under32unrolled(a, b, c, n, result);
return;
}
simsimd_size_t const tail_length = n % 32;
simsimd_size_t const tail_start = n - tail_length;
__mmask32 const tail_mask = (__mmask32)_bzhi_u32(0xFFFFFFFF, tail_length);
__m512h sum_vec = _mm512_setzero_ph();
for (simsimd_size_t i = 0; i != n; ++i) {
__m512h a_vec = _mm512_castsi512_ph(_mm512_set1_epi16(*(short const *)(a + i)));
__m512h cb_j_vec = _mm512_setzero_ph();
__m512i b_vec, c_vec;
simsimd_size_t j = 0;
simsimd_bilinear_f16_sapphire_cycle:
if (j + 32 <= n) {
b_vec = _mm512_loadu_epi16(b + j);
c_vec = _mm512_loadu_epi16(c + i * n + j);
}
else {
b_vec = _mm512_maskz_loadu_epi16(tail_mask, b + tail_start);
c_vec = _mm512_maskz_loadu_epi16(tail_mask, c + i * n + tail_start);
}
cb_j_vec = _mm512_fmadd_ph(_mm512_castsi512_ph(b_vec), _mm512_castsi512_ph(c_vec), cb_j_vec);
j += 32;
if (j < n) goto simsimd_bilinear_f16_sapphire_cycle;
sum_vec = _mm512_fmadd_ph(a_vec, cb_j_vec, sum_vec);
}
*result = _mm512_reduce_add_ph(sum_vec);
}
SIMSIMD_PUBLIC void simsimd_mahalanobis_f16_sapphire(simsimd_f16_t const *a, simsimd_f16_t const *b,
simsimd_f16_t const *c, simsimd_size_t n,
simsimd_distance_t *result) {
simsimd_size_t const tail_length = n % 32;
simsimd_size_t const tail_start = n - tail_length;
__mmask32 const tail_mask = (__mmask32)_bzhi_u32(0xFFFFFFFF, tail_length);
__m512h sum_vec = _mm512_setzero_ph();
for (simsimd_size_t i = 0; i != n; ++i) {
__m512h a_i_vec = _mm512_castsi512_ph(_mm512_set1_epi16(*(short const *)(a + i)));
__m512h b_i_vec = _mm512_castsi512_ph(_mm512_set1_epi16(*(short const *)(b + i)));
__m512h diff_i_vec = _mm512_sub_ph(a_i_vec, b_i_vec);
__m512h cdiff_j_vec = _mm512_setzero_ph();
__m512h diff_j_vec;
__m512i a_j_vec, b_j_vec, c_vec;
simsimd_size_t j = 0;
// The nested loop is cleaner to implement with a `goto` in this case:
simsimd_mahalanobis_f16_sapphire_cycle:
if (j + 32 <= n) {
a_j_vec = _mm512_loadu_epi16(a + j);
b_j_vec = _mm512_loadu_epi16(b + j);
c_vec = _mm512_loadu_epi16(c + i * n + j);
}
else {
a_j_vec = _mm512_maskz_loadu_epi16(tail_mask, a + tail_start);
b_j_vec = _mm512_maskz_loadu_epi16(tail_mask, b + tail_start);
c_vec = _mm512_maskz_loadu_epi16(tail_mask, c + i * n + tail_start);
}
diff_j_vec = _mm512_sub_ph(_mm512_castsi512_ph(a_j_vec), _mm512_castsi512_ph(b_j_vec));
cdiff_j_vec = _mm512_fmadd_ph(diff_j_vec, _mm512_castsi512_ph(c_vec), cdiff_j_vec);
j += 32;
if (j < n) goto simsimd_mahalanobis_f16_sapphire_cycle;
sum_vec = _mm512_fmadd_ph(diff_i_vec, cdiff_j_vec, sum_vec);
}
*result = _simsimd_sqrt_f32_haswell(_mm512_reduce_add_ph(sum_vec));
}
SIMSIMD_PUBLIC void simsimd_bilinear_f16c_sapphire(simsimd_f16c_t const *a, simsimd_f16c_t const *b,
simsimd_f16c_t const *c, simsimd_size_t n,
simsimd_distance_t *results) {
    // We take into account that FMS is the same as FMA with a negative multiplier.
// To multiply a floating-point value by -1, we can use the `XOR` instruction to flip the sign bit.
// This way we can avoid the shuffling and the need for separate real and imaginary parts.
// For the imaginary part of the product, we would need to swap the real and imaginary parts of
// one of the vectors.
__m512i const sign_flip_vec = _mm512_set1_epi32(0x80000000);
__m512i const swap_adjacent_vec = _mm512_set_epi8( //
61, 60, 63, 62, 57, 56, 59, 58, 53, 52, 55, 54, 49, 48, 51, 50, // 4th 128-bit lane
45, 44, 47, 46, 41, 40, 43, 42, 37, 36, 39, 38, 33, 32, 35, 34, // 3rd 128-bit lane
29, 28, 31, 30, 25, 24, 27, 26, 21, 20, 23, 22, 17, 16, 19, 18, // 2nd 128-bit lane
13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2 // 1st 128-bit lane
);
// Default case for arbitrary size `n`
simsimd_size_t const tail_length = n % 16;
simsimd_size_t const tail_start = n - tail_length;
__mmask32 const tail_mask = (__mmask32)_bzhi_u32(0xFFFFFFFF, tail_length * 2);
simsimd_f32_t sum_real = 0;
simsimd_f32_t sum_imag = 0;
for (simsimd_size_t i = 0; i != n; ++i) {
simsimd_f32_t const a_i_real = a[i].real;
simsimd_f32_t const a_i_imag = a[i].imag;
__m512h cb_j_real_vec = _mm512_setzero_ph();
__m512h cb_j_imag_vec = _mm512_setzero_ph();
__m512i b_vec, c_vec;
simsimd_size_t j = 0;
    simsimd_bilinear_f16c_sapphire_cycle:
if (j + 16 <= n) {
b_vec = _mm512_loadu_epi16((simsimd_i16_t const *)(b + j));
c_vec = _mm512_loadu_epi16((simsimd_i16_t const *)(c + i * n + j));
}
else {
b_vec = _mm512_maskz_loadu_epi16(tail_mask, (simsimd_i16_t const *)(b + tail_start));
c_vec = _mm512_maskz_loadu_epi16(tail_mask, (simsimd_i16_t const *)(c + i * n + tail_start));
}
cb_j_real_vec = _mm512_fmadd_ph( //
_mm512_castsi512_ph(_mm512_xor_si512(c_vec, sign_flip_vec)), //
_mm512_castsi512_ph(b_vec), cb_j_real_vec);
cb_j_imag_vec = _mm512_fmadd_ph( //
_mm512_castsi512_ph(_mm512_shuffle_epi8(c_vec, swap_adjacent_vec)), //
_mm512_castsi512_ph(b_vec), cb_j_imag_vec);
j += 16;
        if (j < n) goto simsimd_bilinear_f16c_sapphire_cycle;
// Horizontal sums are the expensive part of the computation:
simsimd_f32_t const cb_j_real = _mm512_reduce_add_ph(cb_j_real_vec);
simsimd_f32_t const cb_j_imag = _mm512_reduce_add_ph(cb_j_imag_vec);
sum_real += a_i_real * cb_j_real - a_i_imag * cb_j_imag;
sum_imag += a_i_real * cb_j_imag + a_i_imag * cb_j_real;
}
    // Export the accumulated sums:
results[0] = sum_real;
results[1] = sum_imag;
}
#pragma clang attribute pop
#pragma GCC pop_options
#endif // SIMSIMD_TARGET_SAPPHIRE
#endif // _SIMSIMD_TARGET_X86
#ifdef __cplusplus
}
#endif
#endif
simsimd-6.5.12/include/simsimd/dot.h 0000644 0000000 0000000 00000305301 10461020230 0015415 0 ustar 0000000 0000000 /**
* @file dot.h
* @brief SIMD-accelerated Dot Products for Real and Complex numbers.
* @author Ash Vardanian
* @date February 24, 2024
*
* Contains:
* - Dot Product for Real and Complex vectors
* - Conjugate Dot Product for Complex vectors
*
* For datatypes:
* - 64-bit IEEE floating point numbers
* - 32-bit IEEE floating point numbers
* - 16-bit IEEE floating point numbers
* - 16-bit brain floating point numbers
* - 8-bit unsigned integers
* - 8-bit signed integers
*
* For hardware architectures:
* - Arm: NEON, SVE
* - x86: Haswell, Ice Lake, Skylake, Genoa, Sapphire
*
* x86 intrinsics: https://www.intel.com/content/www/us/en/docs/intrinsics-guide/
* Arm intrinsics: https://developer.arm.com/architectures/instruction-sets/intrinsics/
*/
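/*  A minimal usage sketch: the serial backends declared below are always available, while the
 *  architecture-specific ones require the corresponding SIMSIMD_TARGET_* macro to be enabled.
 *
 *      simsimd_f32_t a[4] = {1, 2, 3, 4}, b[4] = {1, 1, 1, 1};
 *      simsimd_distance_t product;
 *      simsimd_dot_f32_serial(a, b, 4, &product); // product == 10
 */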
#ifndef SIMSIMD_DOT_H
#define SIMSIMD_DOT_H
#include "types.h"
#ifdef __cplusplus
extern "C" {
#endif
// clang-format off
/* Serial backends for all numeric types.
* By default they use 32-bit arithmetic, unless the arguments themselves contain 64-bit floats.
* For double-precision computation check out the "*_accurate" variants of those "*_serial" functions.
*/
SIMSIMD_PUBLIC void simsimd_dot_f64_serial(simsimd_f64_t const* a, simsimd_f64_t const* b, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_dot_f64c_serial(simsimd_f64c_t const* a, simsimd_f64c_t const* b, simsimd_size_t n, simsimd_distance_t* results);
SIMSIMD_PUBLIC void simsimd_vdot_f64c_serial(simsimd_f64c_t const* a, simsimd_f64c_t const* b, simsimd_size_t n, simsimd_distance_t* results);
SIMSIMD_PUBLIC void simsimd_dot_f32_serial(simsimd_f32_t const* a, simsimd_f32_t const* b, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_dot_f32c_serial(simsimd_f32c_t const* a, simsimd_f32c_t const* b, simsimd_size_t n, simsimd_distance_t* results);
SIMSIMD_PUBLIC void simsimd_vdot_f32c_serial(simsimd_f32c_t const* a, simsimd_f32c_t const* b, simsimd_size_t n, simsimd_distance_t* results);
SIMSIMD_PUBLIC void simsimd_dot_f16_serial(simsimd_f16_t const* a, simsimd_f16_t const* b, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_dot_f16c_serial(simsimd_f16c_t const* a, simsimd_f16c_t const* b, simsimd_size_t n, simsimd_distance_t* results);
SIMSIMD_PUBLIC void simsimd_vdot_f16c_serial(simsimd_f16c_t const* a, simsimd_f16c_t const* b, simsimd_size_t n, simsimd_distance_t* results);
SIMSIMD_PUBLIC void simsimd_dot_bf16_serial(simsimd_bf16_t const* a, simsimd_bf16_t const* b, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_dot_bf16c_serial(simsimd_bf16c_t const* a, simsimd_bf16c_t const* b, simsimd_size_t n, simsimd_distance_t* results);
SIMSIMD_PUBLIC void simsimd_vdot_bf16c_serial(simsimd_bf16c_t const* a, simsimd_bf16c_t const* b, simsimd_size_t n, simsimd_distance_t* results);
SIMSIMD_PUBLIC void simsimd_dot_i8_serial(simsimd_i8_t const* a, simsimd_i8_t const* b, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_dot_u8_serial(simsimd_u8_t const* a, simsimd_u8_t const* b, simsimd_size_t n, simsimd_distance_t* result);
/* Double-precision serial backends for all numeric types.
* For single-precision computation check out the "*_serial" counterparts of those "*_accurate" functions.
*/
SIMSIMD_PUBLIC void simsimd_dot_f32_accurate(simsimd_f32_t const* a, simsimd_f32_t const* b, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_dot_f32c_accurate(simsimd_f32c_t const* a, simsimd_f32c_t const* b, simsimd_size_t n, simsimd_distance_t* results);
SIMSIMD_PUBLIC void simsimd_vdot_f32c_accurate(simsimd_f32c_t const* a, simsimd_f32c_t const* b, simsimd_size_t n, simsimd_distance_t* results);
SIMSIMD_PUBLIC void simsimd_dot_f16_accurate(simsimd_f16_t const* a, simsimd_f16_t const* b, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_dot_f16c_accurate(simsimd_f16c_t const* a, simsimd_f16c_t const* b, simsimd_size_t n, simsimd_distance_t* results);
SIMSIMD_PUBLIC void simsimd_vdot_f16c_accurate(simsimd_f16c_t const* a, simsimd_f16c_t const* b, simsimd_size_t n, simsimd_distance_t* results);
SIMSIMD_PUBLIC void simsimd_dot_bf16_accurate(simsimd_bf16_t const* a, simsimd_bf16_t const* b, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_dot_bf16c_accurate(simsimd_bf16c_t const* a, simsimd_bf16c_t const* b, simsimd_size_t n, simsimd_distance_t* results);
SIMSIMD_PUBLIC void simsimd_vdot_bf16c_accurate(simsimd_bf16c_t const* a, simsimd_bf16c_t const* b, simsimd_size_t n, simsimd_distance_t* results);
/* SIMD-powered backends for Arm NEON, mostly using 32-bit arithmetic over 128-bit words.
* By far the most portable backend, covering most Arm v8 devices, over a billion phones, and almost all
 * Arm server CPUs produced before 2023.
*/
SIMSIMD_PUBLIC void simsimd_dot_f32_neon(simsimd_f32_t const* a, simsimd_f32_t const* b, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_dot_f32c_neon(simsimd_f32c_t const* a, simsimd_f32c_t const* b, simsimd_size_t n, simsimd_distance_t* results);
SIMSIMD_PUBLIC void simsimd_vdot_f32c_neon(simsimd_f32c_t const* a, simsimd_f32c_t const* b, simsimd_size_t n, simsimd_distance_t* results);
SIMSIMD_PUBLIC void simsimd_dot_f16_neon(simsimd_f16_t const* a, simsimd_f16_t const* b, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_dot_f16c_neon(simsimd_f16c_t const* a, simsimd_f16c_t const* b, simsimd_size_t n, simsimd_distance_t* results);
SIMSIMD_PUBLIC void simsimd_vdot_f16c_neon(simsimd_f16c_t const* a, simsimd_f16c_t const* b, simsimd_size_t n, simsimd_distance_t* results);
SIMSIMD_PUBLIC void simsimd_dot_i8_neon(simsimd_i8_t const* a, simsimd_i8_t const* b, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_dot_u8_neon(simsimd_u8_t const* a, simsimd_u8_t const* b, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_dot_bf16_neon(simsimd_bf16_t const* a, simsimd_bf16_t const* b, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_dot_bf16c_neon(simsimd_bf16c_t const* a, simsimd_bf16c_t const* b, simsimd_size_t n, simsimd_distance_t* results);
SIMSIMD_PUBLIC void simsimd_vdot_bf16c_neon(simsimd_bf16c_t const* a, simsimd_bf16c_t const* b, simsimd_size_t n, simsimd_distance_t* results);
/* SIMD-powered backends for Arm SVE, mostly using 32-bit arithmetic over variable-length platform-defined word sizes.
* Designed for Arm Graviton 3, Microsoft Cobalt, as well as Nvidia Grace and newer Ampere Altra CPUs.
*/
SIMSIMD_PUBLIC void simsimd_dot_f32_sve(simsimd_f32_t const* a, simsimd_f32_t const* b, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_dot_f32c_sve(simsimd_f32c_t const* a, simsimd_f32c_t const* b, simsimd_size_t n, simsimd_distance_t* results);
SIMSIMD_PUBLIC void simsimd_vdot_f32c_sve(simsimd_f32c_t const* a, simsimd_f32c_t const* b, simsimd_size_t n, simsimd_distance_t* results);
SIMSIMD_PUBLIC void simsimd_dot_f16_sve(simsimd_f16_t const* a, simsimd_f16_t const* b, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_dot_f16c_sve(simsimd_f16c_t const* a, simsimd_f16c_t const* b, simsimd_size_t n, simsimd_distance_t* results);
SIMSIMD_PUBLIC void simsimd_vdot_f16c_sve(simsimd_f16c_t const* a, simsimd_f16c_t const* b, simsimd_size_t n, simsimd_distance_t* results);
SIMSIMD_PUBLIC void simsimd_dot_f64_sve(simsimd_f64_t const* a, simsimd_f64_t const* b, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_dot_f64c_sve(simsimd_f64c_t const* a, simsimd_f64c_t const* b, simsimd_size_t n, simsimd_distance_t* results);
SIMSIMD_PUBLIC void simsimd_vdot_f64c_sve(simsimd_f64c_t const* a, simsimd_f64c_t const* b, simsimd_size_t n, simsimd_distance_t* results);
/* SIMD-powered backends for AVX2 CPUs of Haswell generation and newer, using 32-bit arithmetic over 256-bit words.
 * The Haswell microarchitecture was first demonstrated in 2011, and at least one Haswell-based processor, the Pentium G3420, was still being sold in 2022.
* Practically all modern x86 CPUs support AVX2, FMA, and F16C, making it a perfect baseline for SIMD algorithms.
 * On the other hand, there is no need to implement AVX2 versions of `f32` and `f64` functions, as those are
* properly vectorized by recent compilers.
*/
SIMSIMD_PUBLIC void simsimd_dot_f32_haswell(simsimd_f32_t const* a, simsimd_f32_t const* b, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_dot_f32c_haswell(simsimd_f32c_t const* a, simsimd_f32c_t const* b, simsimd_size_t n, simsimd_distance_t* results);
SIMSIMD_PUBLIC void simsimd_vdot_f32c_haswell(simsimd_f32c_t const* a, simsimd_f32c_t const* b, simsimd_size_t n, simsimd_distance_t* results);
SIMSIMD_PUBLIC void simsimd_dot_f16_haswell(simsimd_f16_t const* a, simsimd_f16_t const* b, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_dot_f16c_haswell(simsimd_f16c_t const* a, simsimd_f16c_t const* b, simsimd_size_t n, simsimd_distance_t* results);
SIMSIMD_PUBLIC void simsimd_vdot_f16c_haswell(simsimd_f16c_t const* a, simsimd_f16c_t const* b, simsimd_size_t n, simsimd_distance_t* results);
SIMSIMD_PUBLIC void simsimd_dot_bf16_haswell(simsimd_bf16_t const* a, simsimd_bf16_t const* b, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_dot_i8_haswell(simsimd_i8_t const* a, simsimd_i8_t const* b, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_dot_u8_haswell(simsimd_u8_t const* a, simsimd_u8_t const* b, simsimd_size_t n, simsimd_distance_t* result);
/* SIMD-powered backends for various generations of AVX512 CPUs.
* Skylake is handy, as it supports masked loads and other operations, avoiding the need for the tail loop.
* Ice Lake added VNNI, VPOPCNTDQ, IFMA, VBMI, VAES, GFNI, VBMI2, BITALG, VPCLMULQDQ, and other extensions for integral operations.
* Genoa added only BF16.
* Sapphire Rapids added tiled matrix operations, but we are most interested in the new mixed-precision FMA instructions.
*
* Sadly, we can't effectively interleave different kinds of arithmetic instructions to utilize more ports:
*
* > Like Intel server architectures since Skylake-X, SPR cores feature two 512-bit FMA units, and organize them in a similar fashion.
* > One 512-bit FMA unit is created by fusing two 256-bit ones on port 0 and port 1. The other is added to port 5, as a server-specific
* > core extension. The FMA units on port 0 and 1 are configured into 2×256-bit or 1×512-bit mode depending on whether 512-bit FMA
* > instructions are present in the scheduler. That means a mix of 256-bit and 512-bit FMA instructions will not achieve higher IPC
* > than executing 512-bit instructions alone.
*
* Source: https://chipsandcheese.com/p/a-peek-at-sapphire-rapids
*/
SIMSIMD_PUBLIC void simsimd_dot_f64_skylake(simsimd_f64_t const* a, simsimd_f64_t const* b, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_dot_f64c_skylake(simsimd_f64c_t const* a, simsimd_f64c_t const* b, simsimd_size_t n, simsimd_distance_t* results);
SIMSIMD_PUBLIC void simsimd_vdot_f64c_skylake(simsimd_f64c_t const* a, simsimd_f64c_t const* b, simsimd_size_t n, simsimd_distance_t* results);
SIMSIMD_PUBLIC void simsimd_dot_f32_skylake(simsimd_f32_t const* a, simsimd_f32_t const* b, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_dot_f32c_skylake(simsimd_f32c_t const* a, simsimd_f32c_t const* b, simsimd_size_t n, simsimd_distance_t* results);
SIMSIMD_PUBLIC void simsimd_vdot_f32c_skylake(simsimd_f32c_t const* a, simsimd_f32c_t const* b, simsimd_size_t n, simsimd_distance_t* results);
SIMSIMD_PUBLIC void simsimd_dot_i8_ice(simsimd_i8_t const* a, simsimd_i8_t const* b, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_dot_u8_ice(simsimd_u8_t const* a, simsimd_u8_t const* b, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_dot_bf16_genoa(simsimd_bf16_t const* a, simsimd_bf16_t const* b, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_dot_bf16c_genoa(simsimd_bf16c_t const* a, simsimd_bf16c_t const* b, simsimd_size_t n, simsimd_distance_t* results);
SIMSIMD_PUBLIC void simsimd_vdot_bf16c_genoa(simsimd_bf16c_t const* a, simsimd_bf16c_t const* b, simsimd_size_t n, simsimd_distance_t* results);
SIMSIMD_PUBLIC void simsimd_dot_f16_sapphire(simsimd_f16_t const* a, simsimd_f16_t const* b, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_dot_f16c_sapphire(simsimd_f16c_t const* a, simsimd_f16c_t const* b, simsimd_size_t n, simsimd_distance_t* results);
SIMSIMD_PUBLIC void simsimd_vdot_f16c_sapphire(simsimd_f16c_t const* a, simsimd_f16c_t const* b, simsimd_size_t n, simsimd_distance_t* results);
SIMSIMD_PUBLIC void simsimd_dot_i8_sierra(simsimd_i8_t const* a, simsimd_i8_t const* b, simsimd_size_t n, simsimd_distance_t* result);
// clang-format on
#define SIMSIMD_MAKE_DOT(name, input_type, accumulator_type, load_and_convert) \
SIMSIMD_PUBLIC void simsimd_dot_##input_type##_##name(simsimd_##input_type##_t const *a, \
simsimd_##input_type##_t const *b, simsimd_size_t n, \
simsimd_distance_t *result) { \
simsimd_##accumulator_type##_t ab = 0; \
for (simsimd_size_t i = 0; i != n; ++i) { \
simsimd_##accumulator_type##_t ai = load_and_convert(a + i); \
simsimd_##accumulator_type##_t bi = load_and_convert(b + i); \
ab += ai * bi; \
} \
*result = ab; \
}
#define SIMSIMD_MAKE_COMPLEX_DOT(name, input_type, accumulator_type, load_and_convert) \
SIMSIMD_PUBLIC void simsimd_dot_##input_type##_##name(simsimd_##input_type##_t const *a_pairs, \
simsimd_##input_type##_t const *b_pairs, \
simsimd_size_t count_pairs, simsimd_distance_t *results) { \
simsimd_##accumulator_type##_t ab_real = 0, ab_imag = 0; \
for (simsimd_size_t i = 0; i != count_pairs; ++i) { \
simsimd_##accumulator_type##_t ar = load_and_convert(&(a_pairs + i)->real); \
simsimd_##accumulator_type##_t br = load_and_convert(&(b_pairs + i)->real); \
simsimd_##accumulator_type##_t ai = load_and_convert(&(a_pairs + i)->imag); \
simsimd_##accumulator_type##_t bi = load_and_convert(&(b_pairs + i)->imag); \
ab_real += ar * br - ai * bi; \
ab_imag += ar * bi + ai * br; \
} \
results[0] = ab_real; \
results[1] = ab_imag; \
}
#define SIMSIMD_MAKE_COMPLEX_VDOT(name, input_type, accumulator_type, load_and_convert) \
SIMSIMD_PUBLIC void simsimd_vdot_##input_type##_##name(simsimd_##input_type##_t const *a_pairs, \
simsimd_##input_type##_t const *b_pairs, \
simsimd_size_t count_pairs, simsimd_distance_t *results) { \
simsimd_##accumulator_type##_t ab_real = 0, ab_imag = 0; \
for (simsimd_size_t i = 0; i != count_pairs; ++i) { \
simsimd_##accumulator_type##_t ar = load_and_convert(&(a_pairs + i)->real); \
simsimd_##accumulator_type##_t br = load_and_convert(&(b_pairs + i)->real); \
simsimd_##accumulator_type##_t ai = load_and_convert(&(a_pairs + i)->imag); \
simsimd_##accumulator_type##_t bi = load_and_convert(&(b_pairs + i)->imag); \
ab_real += ar * br + ai * bi; \
ab_imag += ar * bi - ai * br; \
} \
results[0] = ab_real; \
results[1] = ab_imag; \
}
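/*  The two complex kernels above differ only in whether the first argument is conjugated:
 *
 *      dot:  sum_i a_i * b_i        -> real += ar*br - ai*bi, imag += ar*bi + ai*br
 *      vdot: sum_i conj(a_i) * b_i  -> real += ar*br + ai*bi, imag += ar*bi - ai*br
 */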
SIMSIMD_MAKE_DOT(serial, f64, f64, SIMSIMD_DEREFERENCE) // simsimd_dot_f64_serial
SIMSIMD_MAKE_COMPLEX_DOT(serial, f64c, f64, SIMSIMD_DEREFERENCE) // simsimd_dot_f64c_serial
SIMSIMD_MAKE_COMPLEX_VDOT(serial, f64c, f64, SIMSIMD_DEREFERENCE) // simsimd_vdot_f64c_serial
SIMSIMD_MAKE_DOT(serial, f32, f32, SIMSIMD_DEREFERENCE) // simsimd_dot_f32_serial
SIMSIMD_MAKE_COMPLEX_DOT(serial, f32c, f32, SIMSIMD_DEREFERENCE) // simsimd_dot_f32c_serial
SIMSIMD_MAKE_COMPLEX_VDOT(serial, f32c, f32, SIMSIMD_DEREFERENCE) // simsimd_vdot_f32c_serial
SIMSIMD_MAKE_DOT(serial, f16, f32, SIMSIMD_F16_TO_F32) // simsimd_dot_f16_serial
SIMSIMD_MAKE_COMPLEX_DOT(serial, f16c, f32, SIMSIMD_F16_TO_F32) // simsimd_dot_f16c_serial
SIMSIMD_MAKE_COMPLEX_VDOT(serial, f16c, f32, SIMSIMD_F16_TO_F32) // simsimd_vdot_f16c_serial
SIMSIMD_MAKE_DOT(serial, bf16, f32, SIMSIMD_BF16_TO_F32) // simsimd_dot_bf16_serial
SIMSIMD_MAKE_COMPLEX_DOT(serial, bf16c, f32, SIMSIMD_BF16_TO_F32) // simsimd_dot_bf16c_serial
SIMSIMD_MAKE_COMPLEX_VDOT(serial, bf16c, f32, SIMSIMD_BF16_TO_F32) // simsimd_vdot_bf16c_serial
SIMSIMD_MAKE_DOT(serial, i8, i64, SIMSIMD_DEREFERENCE) // simsimd_dot_i8_serial
SIMSIMD_MAKE_DOT(serial, u8, i64, SIMSIMD_DEREFERENCE) // simsimd_dot_u8_serial
SIMSIMD_MAKE_DOT(accurate, f32, f64, SIMSIMD_DEREFERENCE) // simsimd_dot_f32_accurate
SIMSIMD_MAKE_COMPLEX_DOT(accurate, f32c, f64, SIMSIMD_DEREFERENCE) // simsimd_dot_f32c_accurate
SIMSIMD_MAKE_COMPLEX_VDOT(accurate, f32c, f64, SIMSIMD_DEREFERENCE) // simsimd_vdot_f32c_accurate
SIMSIMD_MAKE_DOT(accurate, f16, f64, SIMSIMD_F16_TO_F32) // simsimd_dot_f16_accurate
SIMSIMD_MAKE_COMPLEX_DOT(accurate, f16c, f64, SIMSIMD_F16_TO_F32) // simsimd_dot_f16c_accurate
SIMSIMD_MAKE_COMPLEX_VDOT(accurate, f16c, f64, SIMSIMD_F16_TO_F32) // simsimd_vdot_f16c_accurate
SIMSIMD_MAKE_DOT(accurate, bf16, f64, SIMSIMD_BF16_TO_F32) // simsimd_dot_bf16_accurate
SIMSIMD_MAKE_COMPLEX_DOT(accurate, bf16c, f64, SIMSIMD_BF16_TO_F32) // simsimd_dot_bf16c_accurate
SIMSIMD_MAKE_COMPLEX_VDOT(accurate, bf16c, f64, SIMSIMD_BF16_TO_F32) // simsimd_vdot_bf16c_accurate
#if _SIMSIMD_TARGET_ARM
#if SIMSIMD_TARGET_NEON
#pragma GCC push_options
#pragma GCC target("arch=armv8-a+simd")
#pragma clang attribute push(__attribute__((target("arch=armv8-a+simd"))), apply_to = function)
SIMSIMD_INTERNAL float32x4_t _simsimd_partial_load_f32x4_neon(simsimd_f32_t const *x, simsimd_size_t n) {
union {
float32x4_t vec;
simsimd_f32_t scalars[4];
} result;
simsimd_size_t i = 0;
for (; i < n; ++i) result.scalars[i] = x[i];
for (; i < 4; ++i) result.scalars[i] = 0;
return result.vec;
}
SIMSIMD_PUBLIC void simsimd_dot_f32_neon(simsimd_f32_t const *a_scalars, simsimd_f32_t const *b_scalars,
simsimd_size_t count_scalars, simsimd_distance_t *result) {
float32x4_t ab_vec = vdupq_n_f32(0);
simsimd_size_t idx_scalars = 0;
for (; idx_scalars + 4 <= count_scalars; idx_scalars += 4) {
float32x4_t a_vec = vld1q_f32(a_scalars + idx_scalars);
float32x4_t b_vec = vld1q_f32(b_scalars + idx_scalars);
ab_vec = vfmaq_f32(ab_vec, a_vec, b_vec);
}
simsimd_f32_t ab = vaddvq_f32(ab_vec);
for (; idx_scalars < count_scalars; ++idx_scalars) ab += a_scalars[idx_scalars] * b_scalars[idx_scalars];
*result = ab;
}
SIMSIMD_PUBLIC void simsimd_dot_f32c_neon(simsimd_f32c_t const *a_pairs, simsimd_f32c_t const *b_pairs,
simsimd_size_t count_pairs, simsimd_distance_t *results) {
float32x4_t ab_real_vec = vdupq_n_f32(0);
float32x4_t ab_imag_vec = vdupq_n_f32(0);
simsimd_size_t idx_pairs = 0;
for (; idx_pairs + 4 <= count_pairs; idx_pairs += 4) {
// Unpack the input arrays into real and imaginary parts:
float32x4x2_t a_vec = vld2q_f32((simsimd_f32_t const *)(a_pairs + idx_pairs));
float32x4x2_t b_vec = vld2q_f32((simsimd_f32_t const *)(b_pairs + idx_pairs));
float32x4_t a_real_vec = a_vec.val[0];
float32x4_t a_imag_vec = a_vec.val[1];
float32x4_t b_real_vec = b_vec.val[0];
float32x4_t b_imag_vec = b_vec.val[1];
// Compute the dot product:
ab_real_vec = vfmaq_f32(ab_real_vec, a_real_vec, b_real_vec);
ab_real_vec = vfmsq_f32(ab_real_vec, a_imag_vec, b_imag_vec);
ab_imag_vec = vfmaq_f32(ab_imag_vec, a_real_vec, b_imag_vec);
ab_imag_vec = vfmaq_f32(ab_imag_vec, a_imag_vec, b_real_vec);
}
// Reduce horizontal sums:
simsimd_f32_t ab_real = vaddvq_f32(ab_real_vec);
simsimd_f32_t ab_imag = vaddvq_f32(ab_imag_vec);
// Handle the tail:
for (; idx_pairs != count_pairs; ++idx_pairs) {
simsimd_f32c_t a_pair = a_pairs[idx_pairs], b_pair = b_pairs[idx_pairs];
simsimd_f32_t ar = a_pair.real, ai = a_pair.imag, br = b_pair.real, bi = b_pair.imag;
ab_real += ar * br - ai * bi;
ab_imag += ar * bi + ai * br;
}
results[0] = ab_real;
results[1] = ab_imag;
}
SIMSIMD_PUBLIC void simsimd_vdot_f32c_neon(simsimd_f32c_t const *a_pairs, simsimd_f32c_t const *b_pairs,
simsimd_size_t count_pairs, simsimd_distance_t *results) {
float32x4_t ab_real_vec = vdupq_n_f32(0);
float32x4_t ab_imag_vec = vdupq_n_f32(0);
simsimd_size_t idx_pairs = 0;
for (; idx_pairs + 4 <= count_pairs; idx_pairs += 4) {
// Unpack the input arrays into real and imaginary parts:
float32x4x2_t a_vec = vld2q_f32((simsimd_f32_t const *)(a_pairs + idx_pairs));
float32x4x2_t b_vec = vld2q_f32((simsimd_f32_t const *)(b_pairs + idx_pairs));
float32x4_t a_real_vec = a_vec.val[0];
float32x4_t a_imag_vec = a_vec.val[1];
float32x4_t b_real_vec = b_vec.val[0];
float32x4_t b_imag_vec = b_vec.val[1];
// Compute the dot product:
ab_real_vec = vfmaq_f32(ab_real_vec, a_real_vec, b_real_vec);
ab_real_vec = vfmaq_f32(ab_real_vec, a_imag_vec, b_imag_vec);
ab_imag_vec = vfmaq_f32(ab_imag_vec, a_real_vec, b_imag_vec);
ab_imag_vec = vfmsq_f32(ab_imag_vec, a_imag_vec, b_real_vec);
}
// Reduce horizontal sums:
simsimd_f32_t ab_real = vaddvq_f32(ab_real_vec);
simsimd_f32_t ab_imag = vaddvq_f32(ab_imag_vec);
// Handle the tail:
for (; idx_pairs != count_pairs; ++idx_pairs) {
simsimd_f32c_t a_pair = a_pairs[idx_pairs], b_pair = b_pairs[idx_pairs];
simsimd_f32_t ar = a_pair.real, ai = a_pair.imag, br = b_pair.real, bi = b_pair.imag;
ab_real += ar * br + ai * bi;
ab_imag += ar * bi - ai * br;
}
results[0] = ab_real;
results[1] = ab_imag;
}
#pragma clang attribute pop
#pragma GCC pop_options
#endif // SIMSIMD_TARGET_NEON
#if SIMSIMD_TARGET_NEON_I8
#pragma GCC push_options
#pragma GCC target("arch=armv8.2-a+dotprod")
#pragma clang attribute push(__attribute__((target("arch=armv8.2-a+dotprod"))), apply_to = function)
SIMSIMD_PUBLIC void simsimd_dot_i8_neon(simsimd_i8_t const *a_scalars, simsimd_i8_t const *b_scalars,
simsimd_size_t count_scalars, simsimd_distance_t *result) {
int32x4_t ab_vec = vdupq_n_s32(0);
simsimd_size_t idx_scalars = 0;
    // If the 128-bit `vdotq_s32` intrinsic is unavailable, we can fall back to widening multiplies, 8 scalars at a time:
// for (simsimd_size_t idx_scalars = 0; idx_scalars != n; idx_scalars += 8) {
// int16x8_t a_vec = vmovl_s8(vld1_s8(a_scalars + idx_scalars));
// int16x8_t b_vec = vmovl_s8(vld1_s8(b_scalars + idx_scalars));
// int16x8_t ab_part_vec = vmulq_s16(a_vec, b_vec);
// ab_vec = vaddq_s32(ab_vec, vaddq_s32(vmovl_s16(vget_high_s16(ab_part_vec)), //
// vmovl_s16(vget_low_s16(ab_part_vec))));
// }
for (; idx_scalars + 16 <= count_scalars; idx_scalars += 16) {
int8x16_t a_vec = vld1q_s8(a_scalars + idx_scalars);
int8x16_t b_vec = vld1q_s8(b_scalars + idx_scalars);
ab_vec = vdotq_s32(ab_vec, a_vec, b_vec);
}
// Take care of the tail:
simsimd_i32_t ab = vaddvq_s32(ab_vec);
for (; idx_scalars < count_scalars; ++idx_scalars) {
simsimd_i32_t ai = a_scalars[idx_scalars], bi = b_scalars[idx_scalars];
ab += ai * bi;
}
*result = ab;
}
SIMSIMD_PUBLIC void simsimd_dot_u8_neon(simsimd_u8_t const *a_scalars, simsimd_u8_t const *b_scalars,
simsimd_size_t count_scalars, simsimd_distance_t *result) {
uint32x4_t ab_vec = vdupq_n_u32(0);
simsimd_size_t idx_scalars = 0;
for (; idx_scalars + 16 <= count_scalars; idx_scalars += 16) {
uint8x16_t a_vec = vld1q_u8(a_scalars + idx_scalars);
uint8x16_t b_vec = vld1q_u8(b_scalars + idx_scalars);
ab_vec = vdotq_u32(ab_vec, a_vec, b_vec);
}
// Take care of the tail:
simsimd_u32_t ab = vaddvq_u32(ab_vec);
for (; idx_scalars < count_scalars; ++idx_scalars) {
simsimd_u32_t ai = a_scalars[idx_scalars], bi = b_scalars[idx_scalars];
ab += ai * bi;
}
*result = ab;
}
#pragma clang attribute pop
#pragma GCC pop_options
#endif // SIMSIMD_TARGET_NEON_I8
#if SIMSIMD_TARGET_NEON_F16
#pragma GCC push_options
#pragma GCC target("arch=armv8.2-a+simd+fp16")
#pragma clang attribute push(__attribute__((target("arch=armv8.2-a+simd+fp16"))), apply_to = function)
SIMSIMD_INTERNAL float16x4_t _simsimd_partial_load_f16x4_neon(simsimd_f16_t const *x, simsimd_size_t n) {
// In case the software emulation for `f16` scalars is enabled, the `simsimd_f16_to_f32`
// function will run. It is extremely slow, so even for the tail, let's combine serial
// loads and stores with vectorized math.
union {
float16x4_t vec;
simsimd_f16_t scalars[4];
} result;
simsimd_size_t i = 0;
for (; i < n; ++i) result.scalars[i] = x[i];
for (; i < 4; ++i) result.scalars[i] = 0;
return result.vec;
}
SIMSIMD_PUBLIC void simsimd_dot_f16_neon(simsimd_f16_t const *a_scalars, simsimd_f16_t const *b_scalars,
simsimd_size_t count_scalars, simsimd_distance_t *result) {
float32x4_t a_vec, b_vec;
float32x4_t ab_vec = vdupq_n_f32(0);
simsimd_dot_f16_neon_cycle:
if (count_scalars < 4) {
a_vec = vcvt_f32_f16(_simsimd_partial_load_f16x4_neon(a_scalars, count_scalars));
b_vec = vcvt_f32_f16(_simsimd_partial_load_f16x4_neon(b_scalars, count_scalars));
count_scalars = 0;
}
else {
a_vec = vcvt_f32_f16(vld1_f16((simsimd_f16_for_arm_simd_t const *)a_scalars));
b_vec = vcvt_f32_f16(vld1_f16((simsimd_f16_for_arm_simd_t const *)b_scalars));
a_scalars += 4, b_scalars += 4, count_scalars -= 4;
}
ab_vec = vfmaq_f32(ab_vec, a_vec, b_vec);
if (count_scalars) goto simsimd_dot_f16_neon_cycle;
*result = vaddvq_f32(ab_vec);
}
SIMSIMD_PUBLIC void simsimd_dot_f16c_neon(simsimd_f16c_t const *a_pairs, simsimd_f16c_t const *b_pairs,
simsimd_size_t count_pairs, simsimd_distance_t *results) {
    // A nicer approach would be to use `f16` arithmetic for the dot product, but that requires
    // the FMLA extensions available on Arm v8.3 and later, and would let us process 16 entries
    // at once. That's how the original implementation worked, but compiling it was a nightmare :)
float32x4_t ab_real_vec = vdupq_n_f32(0);
float32x4_t ab_imag_vec = vdupq_n_f32(0);
while (count_pairs >= 4) {
// Unpack the input arrays into real and imaginary parts.
// MSVC sadly doesn't recognize the `vld2_f16`, so we load the data as signed
// integers of the same size and reinterpret with `vreinterpret_f16_s16` afterwards.
        int16x4x2_t a_vec = vld2_s16((short const *)a_pairs);
        int16x4x2_t b_vec = vld2_s16((short const *)b_pairs);
float32x4_t a_real_vec = vcvt_f32_f16(vreinterpret_f16_s16(a_vec.val[0]));
float32x4_t a_imag_vec = vcvt_f32_f16(vreinterpret_f16_s16(a_vec.val[1]));
float32x4_t b_real_vec = vcvt_f32_f16(vreinterpret_f16_s16(b_vec.val[0]));
float32x4_t b_imag_vec = vcvt_f32_f16(vreinterpret_f16_s16(b_vec.val[1]));
// Compute the dot product:
ab_real_vec = vfmaq_f32(ab_real_vec, a_real_vec, b_real_vec);
ab_real_vec = vfmsq_f32(ab_real_vec, a_imag_vec, b_imag_vec);
ab_imag_vec = vfmaq_f32(ab_imag_vec, a_real_vec, b_imag_vec);
ab_imag_vec = vfmaq_f32(ab_imag_vec, a_imag_vec, b_real_vec);
count_pairs -= 4, a_pairs += 4, b_pairs += 4;
}
// Reduce horizontal sums and aggregate with the tail:
simsimd_dot_f16c_serial(a_pairs, b_pairs, count_pairs, results);
results[0] += vaddvq_f32(ab_real_vec);
results[1] += vaddvq_f32(ab_imag_vec);
}
SIMSIMD_PUBLIC void simsimd_vdot_f16c_neon(simsimd_f16c_t const *a_pairs, simsimd_f16c_t const *b_pairs,
simsimd_size_t count_pairs, simsimd_distance_t *results) {
    // A nicer approach would be to use `f16` arithmetic for the dot product, but that requires
    // the FMLA extensions available on Arm v8.3 and later, and would let us process 16 entries
    // at once. That's how the original implementation worked, but compiling it was a nightmare :)
float32x4_t ab_real_vec = vdupq_n_f32(0);
float32x4_t ab_imag_vec = vdupq_n_f32(0);
while (count_pairs >= 4) {
// Unpack the input arrays into real and imaginary parts.
// MSVC sadly doesn't recognize the `vld2_f16`, so we load the data as signed
// integers of the same size and reinterpret with `vreinterpret_f16_s16` afterwards.
        int16x4x2_t a_vec = vld2_s16((short const *)a_pairs);
        int16x4x2_t b_vec = vld2_s16((short const *)b_pairs);
float32x4_t a_real_vec = vcvt_f32_f16(vreinterpret_f16_s16(a_vec.val[0]));
float32x4_t a_imag_vec = vcvt_f32_f16(vreinterpret_f16_s16(a_vec.val[1]));
float32x4_t b_real_vec = vcvt_f32_f16(vreinterpret_f16_s16(b_vec.val[0]));
float32x4_t b_imag_vec = vcvt_f32_f16(vreinterpret_f16_s16(b_vec.val[1]));
// Compute the dot product:
ab_real_vec = vfmaq_f32(ab_real_vec, a_real_vec, b_real_vec);
ab_real_vec = vfmaq_f32(ab_real_vec, a_imag_vec, b_imag_vec);
ab_imag_vec = vfmaq_f32(ab_imag_vec, a_real_vec, b_imag_vec);
ab_imag_vec = vfmsq_f32(ab_imag_vec, a_imag_vec, b_real_vec);
count_pairs -= 4, a_pairs += 4, b_pairs += 4;
}
// Reduce horizontal sums and aggregate with the tail:
simsimd_vdot_f16c_serial(a_pairs, b_pairs, count_pairs, results);
results[0] += vaddvq_f32(ab_real_vec);
results[1] += vaddvq_f32(ab_imag_vec);
}
#pragma clang attribute pop
#pragma GCC pop_options
#endif // SIMSIMD_TARGET_NEON_F16
#if SIMSIMD_TARGET_NEON_BF16
#pragma GCC push_options
#pragma GCC target("arch=armv8.6-a+simd+bf16")
#pragma clang attribute push(__attribute__((target("arch=armv8.6-a+simd+bf16"))), apply_to = function)
SIMSIMD_INTERNAL bfloat16x8_t _simsimd_partial_load_bf16x8_neon(simsimd_bf16_t const *x, simsimd_size_t n) {
union {
bfloat16x8_t vec;
simsimd_bf16_t scalars[8];
} result;
simsimd_size_t i = 0;
for (; i < n; ++i) result.scalars[i] = x[i];
for (; i < 8; ++i) result.scalars[i] = 0;
return result.vec;
}
SIMSIMD_PUBLIC void simsimd_dot_bf16_neon(simsimd_bf16_t const *a_scalars, simsimd_bf16_t const *b_scalars,
simsimd_size_t count_scalars, simsimd_distance_t *result) {
bfloat16x8_t a_vec, b_vec;
float32x4_t ab_vec = vdupq_n_f32(0);
simsimd_dot_bf16_neon_cycle:
if (count_scalars < 8) {
a_vec = _simsimd_partial_load_bf16x8_neon(a_scalars, count_scalars);
b_vec = _simsimd_partial_load_bf16x8_neon(b_scalars, count_scalars);
count_scalars = 0;
}
else {
a_vec = vld1q_bf16((simsimd_bf16_for_arm_simd_t const *)a_scalars);
b_vec = vld1q_bf16((simsimd_bf16_for_arm_simd_t const *)b_scalars);
a_scalars += 8, b_scalars += 8, count_scalars -= 8;
}
ab_vec = vbfdotq_f32(ab_vec, a_vec, b_vec);
if (count_scalars) goto simsimd_dot_bf16_neon_cycle;
*result = vaddvq_f32(ab_vec);
}
SIMSIMD_PUBLIC void simsimd_dot_bf16c_neon(simsimd_bf16c_t const *a_pairs, simsimd_bf16c_t const *b_pairs,
simsimd_size_t count_pairs, simsimd_distance_t *results) {
    // A nicer approach would be to use `bf16` arithmetic for the dot product, but that requires
    // the FMLA extensions available on Arm v8.3 and later, and would let us process 16 entries
    // at once. That's how the original implementation worked, but compiling it was a nightmare :)
float32x4_t ab_real_vec = vdupq_n_f32(0);
float32x4_t ab_imag_vec = vdupq_n_f32(0);
while (count_pairs >= 4) {
// Unpack the input arrays into real and imaginary parts.
// MSVC sadly doesn't recognize the `vld2_bf16`, so we load the data as signed
// integers of the same size and reinterpret with `vreinterpret_bf16_s16` afterwards.
int16x4x2_t a_vec = vld2_s16((short const *)a_pairs);
int16x4x2_t b_vec = vld2_s16((short const *)b_pairs);
float32x4_t a_real_vec = vcvt_f32_bf16(vreinterpret_bf16_s16(a_vec.val[0]));
float32x4_t a_imag_vec = vcvt_f32_bf16(vreinterpret_bf16_s16(a_vec.val[1]));
float32x4_t b_real_vec = vcvt_f32_bf16(vreinterpret_bf16_s16(b_vec.val[0]));
float32x4_t b_imag_vec = vcvt_f32_bf16(vreinterpret_bf16_s16(b_vec.val[1]));
// Compute the dot product:
ab_real_vec = vfmaq_f32(ab_real_vec, a_real_vec, b_real_vec);
ab_real_vec = vfmsq_f32(ab_real_vec, a_imag_vec, b_imag_vec);
ab_imag_vec = vfmaq_f32(ab_imag_vec, a_real_vec, b_imag_vec);
ab_imag_vec = vfmaq_f32(ab_imag_vec, a_imag_vec, b_real_vec);
count_pairs -= 4, a_pairs += 4, b_pairs += 4;
}
// Reduce horizontal sums and aggregate with the tail:
simsimd_dot_bf16c_serial(a_pairs, b_pairs, count_pairs, results);
results[0] += vaddvq_f32(ab_real_vec);
results[1] += vaddvq_f32(ab_imag_vec);
}
SIMSIMD_PUBLIC void simsimd_vdot_bf16c_neon(simsimd_bf16c_t const *a_pairs, simsimd_bf16c_t const *b_pairs,
simsimd_size_t count_pairs, simsimd_distance_t *results) {
    // A nicer approach would be to use `bf16` arithmetic for the dot product, but that requires
    // the FMLA extensions available on Arm v8.3 and later, and would let us process 16 entries
    // at once. That's how the original implementation worked, but compiling it was a nightmare :)
float32x4_t ab_real_vec = vdupq_n_f32(0);
float32x4_t ab_imag_vec = vdupq_n_f32(0);
while (count_pairs >= 4) {
// Unpack the input arrays into real and imaginary parts.
// MSVC sadly doesn't recognize the `vld2_bf16`, so we load the data as signed
// integers of the same size and reinterpret with `vreinterpret_bf16_s16` afterwards.
int16x4x2_t a_vec = vld2_s16((short const *)a_pairs);
int16x4x2_t b_vec = vld2_s16((short const *)b_pairs);
float32x4_t a_real_vec = vcvt_f32_bf16(vreinterpret_bf16_s16(a_vec.val[0]));
float32x4_t a_imag_vec = vcvt_f32_bf16(vreinterpret_bf16_s16(a_vec.val[1]));
float32x4_t b_real_vec = vcvt_f32_bf16(vreinterpret_bf16_s16(b_vec.val[0]));
float32x4_t b_imag_vec = vcvt_f32_bf16(vreinterpret_bf16_s16(b_vec.val[1]));
// Compute the dot product:
ab_real_vec = vfmaq_f32(ab_real_vec, a_real_vec, b_real_vec);
ab_real_vec = vfmaq_f32(ab_real_vec, a_imag_vec, b_imag_vec);
ab_imag_vec = vfmaq_f32(ab_imag_vec, a_real_vec, b_imag_vec);
ab_imag_vec = vfmsq_f32(ab_imag_vec, a_imag_vec, b_real_vec);
count_pairs -= 4, a_pairs += 4, b_pairs += 4;
}
// Reduce horizontal sums and aggregate with the tail:
simsimd_vdot_bf16c_serial(a_pairs, b_pairs, count_pairs, results);
results[0] += vaddvq_f32(ab_real_vec);
results[1] += vaddvq_f32(ab_imag_vec);
}
#pragma clang attribute pop
#pragma GCC pop_options
#endif // SIMSIMD_TARGET_NEON_BF16
#if SIMSIMD_TARGET_SVE
#pragma GCC push_options
#pragma GCC target("arch=armv8.2-a+sve")
#pragma clang attribute push(__attribute__((target("arch=armv8.2-a+sve"))), apply_to = function)
SIMSIMD_PUBLIC void simsimd_dot_f32_sve(simsimd_f32_t const *a_scalars, simsimd_f32_t const *b_scalars,
simsimd_size_t count_scalars, simsimd_distance_t *result) {
simsimd_size_t idx_scalars = 0;
svfloat32_t ab_vec = svdup_f32(0.f);
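    // `svwhilelt_b32(i, n)` produces a predicate that is active only for the lanes still in range,
    // so the same predicated loop body handles full vectors and the tail alike, without a separate
    // scalar remainder loop.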
do {
svbool_t pg_vec = svwhilelt_b32((unsigned int)idx_scalars, (unsigned int)count_scalars);
svfloat32_t a_vec = svld1_f32(pg_vec, a_scalars + idx_scalars);
svfloat32_t b_vec = svld1_f32(pg_vec, b_scalars + idx_scalars);
ab_vec = svmla_f32_x(pg_vec, ab_vec, a_vec, b_vec);
idx_scalars += svcntw();
} while (idx_scalars < count_scalars);
*result = svaddv_f32(svptrue_b32(), ab_vec);
}
SIMSIMD_PUBLIC void simsimd_dot_f32c_sve(simsimd_f32c_t const *a_pairs, simsimd_f32c_t const *b_pairs,
simsimd_size_t count_pairs, simsimd_distance_t *results) {
simsimd_size_t idx_pairs = 0;
svfloat32_t ab_real_vec = svdup_f32(0.f);
svfloat32_t ab_imag_vec = svdup_f32(0.f);
do {
svbool_t pg_vec = svwhilelt_b32((unsigned int)idx_pairs, (unsigned int)count_pairs);
svfloat32x2_t a_vec = svld2_f32(pg_vec, (simsimd_f32_t const *)(a_pairs + idx_pairs));
svfloat32x2_t b_vec = svld2_f32(pg_vec, (simsimd_f32_t const *)(b_pairs + idx_pairs));
svfloat32_t a_real_vec = svget2_f32(a_vec, 0);
svfloat32_t a_imag_vec = svget2_f32(a_vec, 1);
svfloat32_t b_real_vec = svget2_f32(b_vec, 0);
svfloat32_t b_imag_vec = svget2_f32(b_vec, 1);
ab_real_vec = svmla_f32_x(pg_vec, ab_real_vec, a_real_vec, b_real_vec);
ab_real_vec = svmls_f32_x(pg_vec, ab_real_vec, a_imag_vec, b_imag_vec);
ab_imag_vec = svmla_f32_x(pg_vec, ab_imag_vec, a_real_vec, b_imag_vec);
ab_imag_vec = svmla_f32_x(pg_vec, ab_imag_vec, a_imag_vec, b_real_vec);
idx_pairs += svcntw();
} while (idx_pairs < count_pairs);
results[0] = svaddv_f32(svptrue_b32(), ab_real_vec);
results[1] = svaddv_f32(svptrue_b32(), ab_imag_vec);
}
SIMSIMD_PUBLIC void simsimd_vdot_f32c_sve(simsimd_f32c_t const *a_pairs, simsimd_f32c_t const *b_pairs,
simsimd_size_t count_pairs, simsimd_distance_t *results) {
simsimd_size_t idx_pairs = 0;
svfloat32_t ab_real_vec = svdup_f32(0.f);
svfloat32_t ab_imag_vec = svdup_f32(0.f);
do {
svbool_t pg_vec = svwhilelt_b32((unsigned int)idx_pairs, (unsigned int)count_pairs);
svfloat32x2_t a_vec = svld2_f32(pg_vec, (simsimd_f32_t const *)(a_pairs + idx_pairs));
svfloat32x2_t b_vec = svld2_f32(pg_vec, (simsimd_f32_t const *)(b_pairs + idx_pairs));
svfloat32_t a_real_vec = svget2_f32(a_vec, 0);
svfloat32_t a_imag_vec = svget2_f32(a_vec, 1);
svfloat32_t b_real_vec = svget2_f32(b_vec, 0);
svfloat32_t b_imag_vec = svget2_f32(b_vec, 1);
ab_real_vec = svmla_f32_x(pg_vec, ab_real_vec, a_real_vec, b_real_vec);
ab_real_vec = svmla_f32_x(pg_vec, ab_real_vec, a_imag_vec, b_imag_vec);
ab_imag_vec = svmla_f32_x(pg_vec, ab_imag_vec, a_real_vec, b_imag_vec);
ab_imag_vec = svmls_f32_x(pg_vec, ab_imag_vec, a_imag_vec, b_real_vec);
idx_pairs += svcntw();
} while (idx_pairs < count_pairs);
results[0] = svaddv_f32(svptrue_b32(), ab_real_vec);
results[1] = svaddv_f32(svptrue_b32(), ab_imag_vec);
}
SIMSIMD_PUBLIC void simsimd_dot_f64_sve(simsimd_f64_t const *a_scalars, simsimd_f64_t const *b_scalars,
simsimd_size_t count_scalars, simsimd_distance_t *result) {
simsimd_size_t idx_scalars = 0;
svfloat64_t ab_vec = svdup_f64(0.);
do {
svbool_t pg_vec = svwhilelt_b64((unsigned int)idx_scalars, (unsigned int)count_scalars);
svfloat64_t a_vec = svld1_f64(pg_vec, a_scalars + idx_scalars);
svfloat64_t b_vec = svld1_f64(pg_vec, b_scalars + idx_scalars);
ab_vec = svmla_f64_x(pg_vec, ab_vec, a_vec, b_vec);
idx_scalars += svcntd();
} while (idx_scalars < count_scalars);
    *result = svaddv_f64(svptrue_b64(), ab_vec);
}
SIMSIMD_PUBLIC void simsimd_dot_f64c_sve(simsimd_f64c_t const *a_pairs, simsimd_f64c_t const *b_pairs,
simsimd_size_t count_pairs, simsimd_distance_t *results) {
simsimd_size_t idx_pairs = 0;
svfloat64_t ab_real_vec = svdup_f64(0.);
svfloat64_t ab_imag_vec = svdup_f64(0.);
do {
svbool_t pg_vec = svwhilelt_b64((unsigned int)idx_pairs, (unsigned int)count_pairs);
svfloat64x2_t a_vec = svld2_f64(pg_vec, (simsimd_f64_t const *)(a_pairs + idx_pairs));
svfloat64x2_t b_vec = svld2_f64(pg_vec, (simsimd_f64_t const *)(b_pairs + idx_pairs));
svfloat64_t a_real_vec = svget2_f64(a_vec, 0);
svfloat64_t a_imag_vec = svget2_f64(a_vec, 1);
svfloat64_t b_real_vec = svget2_f64(b_vec, 0);
svfloat64_t b_imag_vec = svget2_f64(b_vec, 1);
ab_real_vec = svmla_f64_x(pg_vec, ab_real_vec, a_real_vec, b_real_vec);
ab_real_vec = svmls_f64_x(pg_vec, ab_real_vec, a_imag_vec, b_imag_vec);
ab_imag_vec = svmla_f64_x(pg_vec, ab_imag_vec, a_real_vec, b_imag_vec);
ab_imag_vec = svmla_f64_x(pg_vec, ab_imag_vec, a_imag_vec, b_real_vec);
idx_pairs += svcntd();
} while (idx_pairs < count_pairs);
results[0] = svaddv_f64(svptrue_b64(), ab_real_vec);
results[1] = svaddv_f64(svptrue_b64(), ab_imag_vec);
}
SIMSIMD_PUBLIC void simsimd_vdot_f64c_sve(simsimd_f64c_t const *a_pairs, simsimd_f64c_t const *b_pairs,
simsimd_size_t count_pairs, simsimd_distance_t *results) {
simsimd_size_t idx_pairs = 0;
svfloat64_t ab_real_vec = svdup_f64(0.);
svfloat64_t ab_imag_vec = svdup_f64(0.);
do {
svbool_t pg_vec = svwhilelt_b64((unsigned int)idx_pairs, (unsigned int)count_pairs);
svfloat64x2_t a_vec = svld2_f64(pg_vec, (simsimd_f64_t const *)(a_pairs + idx_pairs));
svfloat64x2_t b_vec = svld2_f64(pg_vec, (simsimd_f64_t const *)(b_pairs + idx_pairs));
svfloat64_t a_real_vec = svget2_f64(a_vec, 0);
svfloat64_t a_imag_vec = svget2_f64(a_vec, 1);
svfloat64_t b_real_vec = svget2_f64(b_vec, 0);
svfloat64_t b_imag_vec = svget2_f64(b_vec, 1);
ab_real_vec = svmla_f64_x(pg_vec, ab_real_vec, a_real_vec, b_real_vec);
ab_real_vec = svmla_f64_x(pg_vec, ab_real_vec, a_imag_vec, b_imag_vec);
ab_imag_vec = svmla_f64_x(pg_vec, ab_imag_vec, a_real_vec, b_imag_vec);
ab_imag_vec = svmls_f64_x(pg_vec, ab_imag_vec, a_imag_vec, b_real_vec);
idx_pairs += svcntd();
} while (idx_pairs < count_pairs);
results[0] = svaddv_f64(svptrue_b64(), ab_real_vec);
results[1] = svaddv_f64(svptrue_b64(), ab_imag_vec);
}
#pragma clang attribute pop
#pragma GCC pop_options
#pragma GCC push_options
#pragma GCC target("arch=armv8.2-a+sve+fp16")
#pragma clang attribute push(__attribute__((target("arch=armv8.2-a+sve+fp16"))), apply_to = function)
SIMSIMD_PUBLIC void simsimd_dot_f16_sve(simsimd_f16_t const *a_scalars, simsimd_f16_t const *b_scalars,
simsimd_size_t count_scalars, simsimd_distance_t *result) {
simsimd_size_t idx_scalars = 0;
svfloat16_t ab_vec = svdup_f16(0);
do {
svbool_t pg_vec = svwhilelt_b16((unsigned int)idx_scalars, (unsigned int)count_scalars);
svfloat16_t a_vec = svld1_f16(pg_vec, (simsimd_f16_for_arm_simd_t const *)(a_scalars + idx_scalars));
svfloat16_t b_vec = svld1_f16(pg_vec, (simsimd_f16_for_arm_simd_t const *)(b_scalars + idx_scalars));
ab_vec = svmla_f16_x(pg_vec, ab_vec, a_vec, b_vec);
idx_scalars += svcnth();
} while (idx_scalars < count_scalars);
simsimd_f16_for_arm_simd_t ab = svaddv_f16(svptrue_b16(), ab_vec);
*result = ab;
}
SIMSIMD_PUBLIC void simsimd_dot_f16c_sve(simsimd_f16c_t const *a_pairs, simsimd_f16c_t const *b_pairs,
simsimd_size_t count_pairs, simsimd_distance_t *results) {
simsimd_size_t idx_pairs = 0;
svfloat16_t ab_real_vec = svdup_f16(0);
svfloat16_t ab_imag_vec = svdup_f16(0);
do {
svbool_t pg_vec = svwhilelt_b16((unsigned int)idx_pairs, (unsigned int)count_pairs);
svfloat16x2_t a_vec = svld2_f16(pg_vec, (simsimd_f16_for_arm_simd_t const *)(a_pairs + idx_pairs));
svfloat16x2_t b_vec = svld2_f16(pg_vec, (simsimd_f16_for_arm_simd_t const *)(b_pairs + idx_pairs));
svfloat16_t a_real_vec = svget2_f16(a_vec, 0);
svfloat16_t a_imag_vec = svget2_f16(a_vec, 1);
svfloat16_t b_real_vec = svget2_f16(b_vec, 0);
svfloat16_t b_imag_vec = svget2_f16(b_vec, 1);
ab_real_vec = svmla_f16_x(pg_vec, ab_real_vec, a_real_vec, b_real_vec);
ab_real_vec = svmls_f16_x(pg_vec, ab_real_vec, a_imag_vec, b_imag_vec);
ab_imag_vec = svmla_f16_x(pg_vec, ab_imag_vec, a_real_vec, b_imag_vec);
ab_imag_vec = svmla_f16_x(pg_vec, ab_imag_vec, a_imag_vec, b_real_vec);
idx_pairs += svcnth();
} while (idx_pairs < count_pairs);
results[0] = svaddv_f16(svptrue_b16(), ab_real_vec);
results[1] = svaddv_f16(svptrue_b16(), ab_imag_vec);
}
SIMSIMD_PUBLIC void simsimd_vdot_f16c_sve(simsimd_f16c_t const *a_pairs, simsimd_f16c_t const *b_pairs,
simsimd_size_t count_pairs, simsimd_distance_t *results) {
simsimd_size_t idx_pairs = 0;
svfloat16_t ab_real_vec = svdup_f16(0);
svfloat16_t ab_imag_vec = svdup_f16(0);
do {
svbool_t pg_vec = svwhilelt_b16((unsigned int)idx_pairs, (unsigned int)count_pairs);
svfloat16x2_t a_vec = svld2_f16(pg_vec, (simsimd_f16_for_arm_simd_t const *)(a_pairs + idx_pairs));
svfloat16x2_t b_vec = svld2_f16(pg_vec, (simsimd_f16_for_arm_simd_t const *)(b_pairs + idx_pairs));
svfloat16_t a_real_vec = svget2_f16(a_vec, 0);
svfloat16_t a_imag_vec = svget2_f16(a_vec, 1);
svfloat16_t b_real_vec = svget2_f16(b_vec, 0);
svfloat16_t b_imag_vec = svget2_f16(b_vec, 1);
ab_real_vec = svmla_f16_x(pg_vec, ab_real_vec, a_real_vec, b_real_vec);
ab_real_vec = svmla_f16_x(pg_vec, ab_real_vec, a_imag_vec, b_imag_vec);
ab_imag_vec = svmla_f16_x(pg_vec, ab_imag_vec, a_real_vec, b_imag_vec);
ab_imag_vec = svmls_f16_x(pg_vec, ab_imag_vec, a_imag_vec, b_real_vec);
idx_pairs += svcnth();
} while (idx_pairs < count_pairs);
results[0] = svaddv_f16(svptrue_b16(), ab_real_vec);
results[1] = svaddv_f16(svptrue_b16(), ab_imag_vec);
}
#pragma clang attribute pop
#pragma GCC pop_options
#endif // SIMSIMD_TARGET_SVE
#endif // _SIMSIMD_TARGET_ARM
#if _SIMSIMD_TARGET_X86
#if SIMSIMD_TARGET_HASWELL
#pragma GCC push_options
#pragma GCC target("avx2", "f16c", "fma")
#pragma clang attribute push(__attribute__((target("avx2,f16c,fma"))), apply_to = function)
SIMSIMD_INTERNAL simsimd_f64_t _simsimd_reduce_f64x4_haswell(__m256d vec) {
// Reduce the double-precision vector to a scalar
// Horizontal add the first and second double-precision values, and third and fourth
__m128d vec_low = _mm256_castpd256_pd128(vec);
__m128d vec_high = _mm256_extractf128_pd(vec, 1);
__m128d vec128 = _mm_add_pd(vec_low, vec_high);
// Horizontal add again to accumulate all four values into one
vec128 = _mm_hadd_pd(vec128, vec128);
// Convert the final sum to a scalar double-precision value and return
return _mm_cvtsd_f64(vec128);
}
SIMSIMD_INTERNAL simsimd_f64_t _simsimd_reduce_f32x8_haswell(__m256 vec) {
// Convert the lower and higher 128-bit lanes of the input vector to double precision
__m128 low_f32 = _mm256_castps256_ps128(vec);
__m128 high_f32 = _mm256_extractf128_ps(vec, 1);
// Convert single-precision (float) vectors to double-precision (double) vectors
__m256d low_f64 = _mm256_cvtps_pd(low_f32);
__m256d high_f64 = _mm256_cvtps_pd(high_f32);
// Perform the addition in double-precision
__m256d sum = _mm256_add_pd(low_f64, high_f64);
return _simsimd_reduce_f64x4_haswell(sum);
}
SIMSIMD_INTERNAL simsimd_i32_t _simsimd_reduce_i32x8_haswell(__m256i vec) {
__m128i low = _mm256_castsi256_si128(vec);
__m128i high = _mm256_extracti128_si256(vec, 1);
__m128i sum = _mm_add_epi32(low, high);
sum = _mm_hadd_epi32(sum, sum);
sum = _mm_hadd_epi32(sum, sum);
return _mm_cvtsi128_si32(sum);
}
SIMSIMD_PUBLIC void simsimd_dot_f32_haswell(simsimd_f32_t const *a_scalars, simsimd_f32_t const *b_scalars,
simsimd_size_t count_scalars, simsimd_distance_t *results) {
__m256 ab_vec = _mm256_setzero_ps();
simsimd_size_t idx_scalars = 0;
for (; idx_scalars + 8 <= count_scalars; idx_scalars += 8) {
__m256 a_vec = _mm256_loadu_ps(a_scalars + idx_scalars);
__m256 b_vec = _mm256_loadu_ps(b_scalars + idx_scalars);
ab_vec = _mm256_fmadd_ps(a_vec, b_vec, ab_vec);
}
simsimd_f64_t ab = _simsimd_reduce_f32x8_haswell(ab_vec);
for (; idx_scalars < count_scalars; ++idx_scalars) ab += a_scalars[idx_scalars] * b_scalars[idx_scalars];
*results = ab;
}
SIMSIMD_PUBLIC void simsimd_dot_f32c_haswell(simsimd_f32c_t const *a_pairs, simsimd_f32c_t const *b_pairs,
simsimd_size_t count_pairs, simsimd_distance_t *results) {
// The naive approach would be to use FMA and FMS instructions on different parts of the vectors.
// Prior to that we would need to shuffle the input vectors to separate real and imaginary parts.
// Both operations are quite expensive, and the resulting kernel would run at 2.5 GB/s.
// __m128 ab_real_vec = _mm_setzero_ps();
// __m128 ab_imag_vec = _mm_setzero_ps();
// __m256i permute_vec = _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0);
// simsimd_size_t idx_pairs = 0;
// for (; idx_pairs + 4 <= count_pairs; idx_pairs += 4) {
// __m256 a_vec = _mm256_loadu_ps((simsimd_f32_t const *)(a_pairs + idx_pairs));
// __m256 b_vec = _mm256_loadu_ps((simsimd_f32_t const *)(b_pairs + idx_pairs));
// __m256 a_shuffled = _mm256_permutevar8x32_ps(a_vec, permute_vec);
// __m256 b_shuffled = _mm256_permutevar8x32_ps(b_vec, permute_vec);
// __m128 a_real_vec = _mm256_extractf128_ps(a_shuffled, 0);
// __m128 a_imag_vec = _mm256_extractf128_ps(a_shuffled, 1);
// __m128 b_real_vec = _mm256_extractf128_ps(b_shuffled, 0);
// __m128 b_imag_vec = _mm256_extractf128_ps(b_shuffled, 1);
// ab_real_vec = _mm_fmadd_ps(a_real_vec, b_real_vec, ab_real_vec);
// ab_real_vec = _mm_fnmadd_ps(a_imag_vec, b_imag_vec, ab_real_vec);
// ab_imag_vec = _mm_fmadd_ps(a_real_vec, b_imag_vec, ab_imag_vec);
// ab_imag_vec = _mm_fmadd_ps(a_imag_vec, b_real_vec, ab_imag_vec);
// }
//
// Instead, we take into account that FMS is the same as FMA with a negative multiplier.
// To multiply a floating-point value by -1, we can use the `XOR` instruction to flip the sign bit.
// This way we can avoid the shuffling and the need for separate real and imaginary parts.
// For the imaginary part of the product, we would need to swap the real and imaginary parts of
// one of the vectors. Moreover, `XOR` can be placed after the primary loop.
// Both operations are quite cheap, and the throughput doubles from 2.5 GB/s to 5 GB/s.
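// A worked example (illustrative only): with interleaved lanes a = {ar, ai, ...} and b = {br, bi, ...},
// `ab_real_vec` accumulates {ar * br, ai * bi, ...} and `ab_imag_vec` accumulates {ar * bi, ai * br, ...}.
// Since (ar + i * ai) * (br + i * bi) = (ar * br - ai * bi) + i * (ar * bi + ai * br), flipping the sign
// of every odd lane of `ab_real_vec` once after the loop and summing both registers horizontally yields
// the real and imaginary parts of the complex dot product.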
__m256 ab_real_vec = _mm256_setzero_ps();
__m256 ab_imag_vec = _mm256_setzero_ps();
__m256i sign_flip_vec = _mm256_set1_epi64x(0x8000000000000000);
__m256i swap_adjacent_vec = _mm256_set_epi8( //
11, 10, 9, 8, // Points to the third f32 in 128-bit lane
15, 14, 13, 12, // Points to the fourth f32 in 128-bit lane
3, 2, 1, 0, // Points to the first f32 in 128-bit lane
7, 6, 5, 4, // Points to the second f32 in 128-bit lane
11, 10, 9, 8, // Points to the third f32 in 128-bit lane
15, 14, 13, 12, // Points to the fourth f32 in 128-bit lane
3, 2, 1, 0, // Points to the first f32 in 128-bit lane
7, 6, 5, 4 // Points to the second f32 in 128-bit lane
);
simsimd_size_t idx_pairs = 0;
for (; idx_pairs + 4 <= count_pairs; idx_pairs += 4) {
__m256 a_vec = _mm256_loadu_ps((simsimd_f32_t const *)(a_pairs + idx_pairs));
__m256 b_vec = _mm256_loadu_ps((simsimd_f32_t const *)(b_pairs + idx_pairs));
__m256 b_swapped_vec = _mm256_castsi256_ps(_mm256_shuffle_epi8(_mm256_castps_si256(b_vec), swap_adjacent_vec));
ab_real_vec = _mm256_fmadd_ps(a_vec, b_vec, ab_real_vec);
ab_imag_vec = _mm256_fmadd_ps(a_vec, b_swapped_vec, ab_imag_vec);
}
// Flip the sign bit in every second scalar before accumulation:
ab_real_vec = _mm256_castsi256_ps(_mm256_xor_si256(_mm256_castps_si256(ab_real_vec), sign_flip_vec));
// Reduce horizontal sums:
simsimd_distance_t ab_real = _simsimd_reduce_f32x8_haswell(ab_real_vec);
simsimd_distance_t ab_imag = _simsimd_reduce_f32x8_haswell(ab_imag_vec);
// Handle the tail:
for (; idx_pairs != count_pairs; ++idx_pairs) {
simsimd_f32c_t a_pair = a_pairs[idx_pairs], b_pair = b_pairs[idx_pairs];
simsimd_f32_t ar = a_pair.real, ai = a_pair.imag, br = b_pair.real, bi = b_pair.imag;
ab_real += ar * br - ai * bi;
ab_imag += ar * bi + ai * br;
}
results[0] = ab_real;
results[1] = ab_imag;
}
SIMSIMD_PUBLIC void simsimd_vdot_f32c_haswell(simsimd_f32c_t const *a_pairs, simsimd_f32c_t const *b_pairs,
simsimd_size_t count_pairs, simsimd_distance_t *results) {
__m256 ab_real_vec = _mm256_setzero_ps();
__m256 ab_imag_vec = _mm256_setzero_ps();
__m256i sign_flip_vec = _mm256_set1_epi64x(0x8000000000000000);
__m256i swap_adjacent_vec = _mm256_set_epi8( //
11, 10, 9, 8, // Points to the third f32 in 128-bit lane
15, 14, 13, 12, // Points to the fourth f32 in 128-bit lane
3, 2, 1, 0, // Points to the first f32 in 128-bit lane
7, 6, 5, 4, // Points to the second f32 in 128-bit lane
11, 10, 9, 8, // Points to the third f32 in 128-bit lane
15, 14, 13, 12, // Points to the fourth f32 in 128-bit lane
3, 2, 1, 0, // Points to the first f32 in 128-bit lane
7, 6, 5, 4 // Points to the second f32 in 128-bit lane
);
simsimd_size_t idx_pairs = 0;
for (; idx_pairs + 4 <= count_pairs; idx_pairs += 4) {
__m256 a_vec = _mm256_loadu_ps((simsimd_f32_t const *)(a_pairs + idx_pairs));
__m256 b_vec = _mm256_loadu_ps((simsimd_f32_t const *)(b_pairs + idx_pairs));
ab_real_vec = _mm256_fmadd_ps(a_vec, b_vec, ab_real_vec);
b_vec = _mm256_castsi256_ps(_mm256_shuffle_epi8(_mm256_castps_si256(b_vec), swap_adjacent_vec));
ab_imag_vec = _mm256_fmadd_ps(a_vec, b_vec, ab_imag_vec);
}
// Flip the sign bit in every second scalar before accumulation:
ab_imag_vec = _mm256_castsi256_ps(_mm256_xor_si256(_mm256_castps_si256(ab_imag_vec), sign_flip_vec));
// Reduce horizontal sums:
simsimd_distance_t ab_real = _simsimd_reduce_f32x8_haswell(ab_real_vec);
simsimd_distance_t ab_imag = _simsimd_reduce_f32x8_haswell(ab_imag_vec);
// Handle the tail:
for (; idx_pairs != count_pairs; ++idx_pairs) {
simsimd_f32c_t a_pair = a_pairs[idx_pairs], b_pair = b_pairs[idx_pairs];
simsimd_f32_t ar = a_pair.real, ai = a_pair.imag, br = b_pair.real, bi = b_pair.imag;
ab_real += ar * br + ai * bi;
ab_imag += ar * bi - ai * br;
}
results[0] = ab_real;
results[1] = ab_imag;
}
SIMSIMD_INTERNAL __m256 _simsimd_partial_load_f16x8_haswell(simsimd_f16_t const *a, simsimd_size_t n) {
// In case the software emulation for `f16` scalars is enabled, the `simsimd_f16_to_f32`
// function will run. It is extremely slow, so even for the tail, let's combine serial
// loads and stores with vectorized math.
union {
__m128i vec;
simsimd_f16_t scalars[8];
} result;
simsimd_size_t i = 0;
for (; i < n; ++i) result.scalars[i] = a[i];
for (; i < 8; ++i) result.scalars[i] = 0;
return _mm256_cvtph_ps(result.vec);
}
SIMSIMD_PUBLIC void simsimd_dot_f16_haswell(simsimd_f16_t const *a_scalars, simsimd_f16_t const *b_scalars,
simsimd_size_t count_scalars, simsimd_distance_t *result) {
__m256 a_vec, b_vec;
__m256 ab_vec = _mm256_setzero_ps();
simsimd_dot_f16_haswell_cycle:
if (count_scalars < 8) {
a_vec = _simsimd_partial_load_f16x8_haswell(a_scalars, count_scalars);
b_vec = _simsimd_partial_load_f16x8_haswell(b_scalars, count_scalars);
count_scalars = 0;
}
else {
a_vec = _mm256_cvtph_ps(_mm_lddqu_si128((__m128i const *)a_scalars));
b_vec = _mm256_cvtph_ps(_mm_lddqu_si128((__m128i const *)b_scalars));
count_scalars -= 8, a_scalars += 8, b_scalars += 8;
}
// We can silence the NaNs using blends:
//
// __m256 a_is_nan = _mm256_cmp_ps(a_vec, a_vec, _CMP_UNORD_Q);
// __m256 b_is_nan = _mm256_cmp_ps(b_vec, b_vec, _CMP_UNORD_Q);
// ab_vec = _mm256_blendv_ps(_mm256_fmadd_ps(a_vec, b_vec, ab_vec), ab_vec, _mm256_or_ps(a_is_nan, b_is_nan));
//
ab_vec = _mm256_fmadd_ps(a_vec, b_vec, ab_vec);
if (count_scalars) goto simsimd_dot_f16_haswell_cycle;
*result = _simsimd_reduce_f32x8_haswell(ab_vec);
}
SIMSIMD_PUBLIC void simsimd_dot_f16c_haswell(simsimd_f16c_t const *a_pairs, simsimd_f16c_t const *b_pairs,
simsimd_size_t count_pairs, simsimd_distance_t *results) {
// Ideally the implementation would load 256 bits worth of vector data at a time,
// shuffle those within a register, split in halves, and only then upcast.
// That way, we are stepping through 32x 16-bit vector components at a time, or 16 dimensions.
// Sadly, shuffling 16-bit entries in a YMM register is hard to implement efficiently.
//
// A simpler approach is to load 128 bits at a time, upcast, and then shuffle.
// This mostly replicates the `simsimd_dot_f32c_haswell`.
__m256 ab_real_vec = _mm256_setzero_ps();
__m256 ab_imag_vec = _mm256_setzero_ps();
__m256i sign_flip_vec = _mm256_set1_epi64x(0x8000000000000000);
__m256i swap_adjacent_vec = _mm256_set_epi8( //
11, 10, 9, 8, // Points to the third f32 in 128-bit lane
15, 14, 13, 12, // Points to the fourth f32 in 128-bit lane
3, 2, 1, 0, // Points to the first f32 in 128-bit lane
7, 6, 5, 4, // Points to the second f32 in 128-bit lane
11, 10, 9, 8, // Points to the third f32 in 128-bit lane
15, 14, 13, 12, // Points to the fourth f32 in 128-bit lane
3, 2, 1, 0, // Points to the first f32 in 128-bit lane
7, 6, 5, 4 // Points to the second f32 in 128-bit lane
);
while (count_pairs >= 4) {
__m256 a_vec = _mm256_cvtph_ps(_mm_lddqu_si128((__m128i const *)a_pairs));
__m256 b_vec = _mm256_cvtph_ps(_mm_lddqu_si128((__m128i const *)b_pairs));
__m256 b_swapped_vec = _mm256_castsi256_ps(_mm256_shuffle_epi8(_mm256_castps_si256(b_vec), swap_adjacent_vec));
ab_real_vec = _mm256_fmadd_ps(a_vec, b_vec, ab_real_vec);
ab_imag_vec = _mm256_fmadd_ps(a_vec, b_swapped_vec, ab_imag_vec);
count_pairs -= 4, a_pairs += 4, b_pairs += 4;
}
// Flip the sign bit in every second scalar before accumulation:
ab_real_vec = _mm256_castsi256_ps(_mm256_xor_si256(_mm256_castps_si256(ab_real_vec), sign_flip_vec));
// Reduce horizontal sums and aggregate with the tail:
simsimd_dot_f16c_serial(a_pairs, b_pairs, count_pairs, results);
results[0] += _simsimd_reduce_f32x8_haswell(ab_real_vec);
results[1] += _simsimd_reduce_f32x8_haswell(ab_imag_vec);
}
SIMSIMD_PUBLIC void simsimd_vdot_f16c_haswell(simsimd_f16c_t const *a_pairs, simsimd_f16c_t const *b_pairs,
simsimd_size_t count_pairs, simsimd_distance_t *results) {
// Ideally the implementation would load 256 bits worth of vector data at a time,
// shuffle those within a register, split in halves, and only then upcast.
// That way, we are stepping through 32x 16-bit vector components at a time, or 16 dimensions.
// Sadly, shuffling 16-bit entries in a YMM register is hard to implement efficiently.
//
// A simpler approach is to load 128 bits at a time, upcast, and then shuffle.
// This mostly replicates the `simsimd_vdot_f32c_haswell`.
__m256 ab_real_vec = _mm256_setzero_ps();
__m256 ab_imag_vec = _mm256_setzero_ps();
__m256i sign_flip_vec = _mm256_set1_epi64x(0x8000000000000000);
__m256i swap_adjacent_vec = _mm256_set_epi8( //
11, 10, 9, 8, // Points to the third f32 in 128-bit lane
15, 14, 13, 12, // Points to the fourth f32 in 128-bit lane
3, 2, 1, 0, // Points to the first f32 in 128-bit lane
7, 6, 5, 4, // Points to the second f32 in 128-bit lane
11, 10, 9, 8, // Points to the third f32 in 128-bit lane
15, 14, 13, 12, // Points to the fourth f32 in 128-bit lane
3, 2, 1, 0, // Points to the first f32 in 128-bit lane
7, 6, 5, 4 // Points to the second f32 in 128-bit lane
);
while (count_pairs >= 4) {
__m256 a_vec = _mm256_cvtph_ps(_mm_lddqu_si128((__m128i const *)a_pairs));
__m256 b_vec = _mm256_cvtph_ps(_mm_lddqu_si128((__m128i const *)b_pairs));
ab_real_vec = _mm256_fmadd_ps(a_vec, b_vec, ab_real_vec);
b_vec = _mm256_castsi256_ps(_mm256_shuffle_epi8(_mm256_castps_si256(b_vec), swap_adjacent_vec));
ab_imag_vec = _mm256_fmadd_ps(a_vec, b_vec, ab_imag_vec);
count_pairs -= 4, a_pairs += 4, b_pairs += 4;
}
// Flip the sign bit in every second scalar before accumulation:
ab_imag_vec = _mm256_castsi256_ps(_mm256_xor_si256(_mm256_castps_si256(ab_imag_vec), sign_flip_vec));
// Reduce horizontal sums and aggregate with the tail:
simsimd_vdot_f16c_serial(a_pairs, b_pairs, count_pairs, results);
results[0] += _simsimd_reduce_f32x8_haswell(ab_real_vec);
results[1] += _simsimd_reduce_f32x8_haswell(ab_imag_vec);
}
SIMSIMD_PUBLIC void simsimd_dot_i8_haswell(simsimd_i8_t const *a_scalars, simsimd_i8_t const *b_scalars,
simsimd_size_t count_scalars, simsimd_distance_t *result) {
__m256i ab_i32_low_vec = _mm256_setzero_si256();
__m256i ab_i32_high_vec = _mm256_setzero_si256();
// AVX2 has no instructions for 8-bit signed integer dot-products,
// but it has a weird instruction for mixed signed-unsigned 8-bit dot-product.
// So we need to normalize the first vector to its absolute value,
// and shift the product sign into the second vector.
//
// __m256i a_i8_abs_vec = _mm256_abs_epi8(a_i8_vec);
// __m256i b_i8_flipped_vec = _mm256_sign_epi8(b_i8_vec, a_i8_vec);
// __m256i ab_i16_vec = _mm256_maddubs_epi16(a_i8_abs_vec, b_i8_flipped_vec);
//
// The problem with this approach, however, is the `-128` value in the second vector.
// Flipping its sign will do nothing, and the result will be incorrect.
// This can easily lead to noticeable numerical errors in the final result.
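// A worked example of that failure (illustrative only): for a = -5 and b = -128 the exact product is +640,
// but `_mm256_sign_epi8(-128, -5)` cannot represent +128 and wraps back to -128, so the `maddubs` path
// would compute 5 * (-128) = -640 instead. The plain 16-bit widening below sidesteps the issue.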
simsimd_size_t idx_scalars = 0;
for (; idx_scalars + 32 <= count_scalars; idx_scalars += 32) {
__m256i a_i8_vec = _mm256_lddqu_si256((__m256i const *)(a_scalars + idx_scalars));
__m256i b_i8_vec = _mm256_lddqu_si256((__m256i const *)(b_scalars + idx_scalars));
// Upcast `int8` to `int16`
__m256i a_i16_low_vec = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(a_i8_vec, 0));
__m256i a_i16_high_vec = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(a_i8_vec, 1));
__m256i b_i16_low_vec = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(b_i8_vec, 0));
__m256i b_i16_high_vec = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(b_i8_vec, 1));
// Multiply and accumulate at `int16` level, accumulate at `int32` level
ab_i32_low_vec = _mm256_add_epi32(ab_i32_low_vec, _mm256_madd_epi16(a_i16_low_vec, b_i16_low_vec));
ab_i32_high_vec = _mm256_add_epi32(ab_i32_high_vec, _mm256_madd_epi16(a_i16_high_vec, b_i16_high_vec));
}
// Horizontal sum across the 256-bit register
int ab = _simsimd_reduce_i32x8_haswell(_mm256_add_epi32(ab_i32_low_vec, ab_i32_high_vec));
// Take care of the tail:
for (; idx_scalars < count_scalars; ++idx_scalars) ab += (int)(a_scalars[idx_scalars]) * b_scalars[idx_scalars];
*result = ab;
}
SIMSIMD_PUBLIC void simsimd_dot_u8_haswell(simsimd_u8_t const *a_scalars, simsimd_u8_t const *b_scalars,
simsimd_size_t count_scalars, simsimd_distance_t *result) {
__m256i ab_i32_low_vec = _mm256_setzero_si256();
__m256i ab_i32_high_vec = _mm256_setzero_si256();
__m256i const zeros_vec = _mm256_setzero_si256();
// AVX2 has no instructions for unsigned 8-bit integer dot-products,
// but it has a weird instruction for mixed signed-unsigned 8-bit dot-product.
simsimd_size_t idx_scalars = 0;
for (; idx_scalars + 32 <= count_scalars; idx_scalars += 32) {
__m256i a_u8_vec = _mm256_lddqu_si256((__m256i const *)(a_scalars + idx_scalars));
__m256i b_u8_vec = _mm256_lddqu_si256((__m256i const *)(b_scalars + idx_scalars));
// Upcast `uint8` to `int16`. Unlike the signed version, we can use the unpacking
// instructions instead of extracts, as they are much faster and more efficient.
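// Note that `unpacklo`/`unpackhi` interleave within each 128-bit lane, so the widened values come out
// in a different order than `_mm256_cvtepu8_epi16` would produce. That is harmless here, because the
// final horizontal sum is order-independent.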
__m256i a_i16_low_vec = _mm256_unpacklo_epi8(a_u8_vec, zeros_vec);
__m256i a_i16_high_vec = _mm256_unpackhi_epi8(a_u8_vec, zeros_vec);
__m256i b_i16_low_vec = _mm256_unpacklo_epi8(b_u8_vec, zeros_vec);
__m256i b_i16_high_vec = _mm256_unpackhi_epi8(b_u8_vec, zeros_vec);
// Multiply and accumulate at `int16` level, accumulate at `int32` level
ab_i32_low_vec = _mm256_add_epi32(ab_i32_low_vec, _mm256_madd_epi16(a_i16_low_vec, b_i16_low_vec));
ab_i32_high_vec = _mm256_add_epi32(ab_i32_high_vec, _mm256_madd_epi16(a_i16_high_vec, b_i16_high_vec));
}
// Horizontal sum across the 256-bit register
int ab = _simsimd_reduce_i32x8_haswell(_mm256_add_epi32(ab_i32_low_vec, ab_i32_high_vec));
// Take care of the tail:
for (; idx_scalars < count_scalars; ++idx_scalars) ab += (int)(a_scalars[idx_scalars]) * b_scalars[idx_scalars];
*result = ab;
}
SIMSIMD_INTERNAL __m256 _simsimd_bf16x8_to_f32x8_haswell(__m128i x) {
// Upcasting from `bf16` to `f32` is done by shifting the `bf16` values by 16 bits to the left, like:
return _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_cvtepu16_epi32(x), 16));
}
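// A worked example (illustrative only): the `bf16` pattern 0x3F80 is the upper half of 1.0f;
// zero-extending it to 32 bits and shifting left by 16 yields 0x3F800000, which is exactly the
// IEEE 754 `f32` bit pattern of 1.0f, with the 16 dropped mantissa bits filled with zeros.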
SIMSIMD_INTERNAL __m128i _simsimd_f32x8_to_bf16x8_haswell(__m256 x) {
// Pack the 32-bit integers into 16-bit integers.
// This is less trivial than unpacking: https://stackoverflow.com/a/77781241/2766161
// The best approach is to shuffle within lanes first: https://stackoverflow.com/a/49723746/2766161
// Our shuffling mask will drop the low 2-bytes from every 4-byte word.
__m256i trunc_elements = _mm256_shuffle_epi8( //
_mm256_castps_si256(x), //
_mm256_set_epi8( //
-1, -1, -1, -1, -1, -1, -1, -1, 15, 14, 11, 10, 7, 6, 3, 2, //
-1, -1, -1, -1, -1, -1, -1, -1, 15, 14, 11, 10, 7, 6, 3, 2 //
));
__m256i ordered = _mm256_permute4x64_epi64(trunc_elements, 0x58);
__m128i result = _mm256_castsi256_si128(ordered);
return result;
}
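// Note that this helper simply truncates the low 16 mantissa bits, i.e. rounds toward zero for finite
// values, while the Skylake counterpart `_simsimd_f32x16_to_bf16x16_skylake` defined further below
// adds 0x8000 first to round to the nearest `bf16`.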
SIMSIMD_INTERNAL __m128i _simsimd_partial_load_bf16x8_haswell(simsimd_bf16_t const *a, simsimd_size_t n) {
// In case the software emulation for `bf16` scalars is enabled, the `simsimd_bf16_to_f32`
// function will run. It is extremely slow, so even for the tail, let's combine serial
// loads and stores with vectorized math.
union {
__m128i vec;
simsimd_bf16_t scalars[8];
} result;
simsimd_size_t i = 0;
for (; i < n; ++i) result.scalars[i] = a[i];
for (; i < 8; ++i) result.scalars[i] = 0;
return result.vec;
}
SIMSIMD_PUBLIC void simsimd_dot_bf16_haswell(simsimd_bf16_t const *a_scalars, simsimd_bf16_t const *b_scalars,
simsimd_size_t count_scalars, simsimd_distance_t *result) {
__m128i a_vec, b_vec;
__m256 ab_vec = _mm256_setzero_ps();
simsimd_dot_bf16_haswell_cycle:
if (count_scalars < 8) {
a_vec = _simsimd_partial_load_bf16x8_haswell(a_scalars, count_scalars);
b_vec = _simsimd_partial_load_bf16x8_haswell(b_scalars, count_scalars);
count_scalars = 0;
}
else {
a_vec = _mm_lddqu_si128((__m128i const *)a_scalars);
b_vec = _mm_lddqu_si128((__m128i const *)b_scalars);
a_scalars += 8, b_scalars += 8, count_scalars -= 8;
}
ab_vec = _mm256_fmadd_ps(_simsimd_bf16x8_to_f32x8_haswell(a_vec), _simsimd_bf16x8_to_f32x8_haswell(b_vec), ab_vec);
if (count_scalars) goto simsimd_dot_bf16_haswell_cycle;
*result = _simsimd_reduce_f32x8_haswell(ab_vec);
}
#pragma clang attribute pop
#pragma GCC pop_options
#endif // SIMSIMD_TARGET_HASWELL
#if SIMSIMD_TARGET_SKYLAKE
#pragma GCC push_options
#pragma GCC target("avx2", "avx512f", "avx512vl", "avx512bw", "bmi2")
#pragma clang attribute push(__attribute__((target("avx2,avx512f,avx512vl,avx512bw,bmi2"))), apply_to = function)
SIMSIMD_INTERNAL simsimd_f64_t _simsimd_reduce_f32x16_skylake(__m512 a) {
__m512 x = _mm512_add_ps(a, _mm512_shuffle_f32x4(a, a, _MM_SHUFFLE(0, 0, 3, 2)));
__m128 r = _mm512_castps512_ps128(_mm512_add_ps(x, _mm512_shuffle_f32x4(x, x, _MM_SHUFFLE(0, 0, 0, 1))));
r = _mm_hadd_ps(r, r);
return _mm_cvtss_f32(_mm_hadd_ps(r, r));
}
SIMSIMD_INTERNAL __m512 _simsimd_bf16x16_to_f32x16_skylake(__m256i a) {
// Upcasting from `bf16` to `f32` is done by shifting the `bf16` values by 16 bits to the left, like:
return _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_cvtepu16_epi32(a), 16));
}
SIMSIMD_INTERNAL __m256i _simsimd_f32x16_to_bf16x16_skylake(__m512 a) {
// Add 2^15 and right shift 16 to do round-nearest
__m512i x = _mm512_srli_epi32(_mm512_add_epi32(_mm512_castps_si512(a), _mm512_set1_epi32(1 << 15)), 16);
return _mm512_cvtepi32_epi16(x);
}
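// A worked example (illustrative only): for 0x3F80C000 (~1.00586f) the low 16 bits are 0xC000, so adding
// 0x8000 carries into the upper half: 0x3F814000 >> 16 = 0x3F81, the nearest `bf16` value (1.0078125),
// where plain truncation would have produced 0x3F80 (1.0). Exact ties round upward rather than to even.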
SIMSIMD_PUBLIC void simsimd_dot_f32_skylake(simsimd_f32_t const *a_scalars, simsimd_f32_t const *b_scalars,
simsimd_size_t count_scalars, simsimd_distance_t *result) {
__m512 a_vec, b_vec;
__m512 ab_vec = _mm512_setzero();
simsimd_dot_f32_skylake_cycle:
if (count_scalars < 16) {
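// `_bzhi_u32(0xFFFFFFFF, k)` zeroes all bits at position `k` and above, producing a mask with the
// lowest `k` bits set - here one bit per remaining scalar, used for the masked tail load below.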
__mmask16 mask = (__mmask16)_bzhi_u32(0xFFFFFFFF, count_scalars);
a_vec = _mm512_maskz_loadu_ps(mask, a_scalars);
b_vec = _mm512_maskz_loadu_ps(mask, b_scalars);
count_scalars = 0;
}
else {
a_vec = _mm512_loadu_ps(a_scalars);
b_vec = _mm512_loadu_ps(b_scalars);
a_scalars += 16, b_scalars += 16, count_scalars -= 16;
}
ab_vec = _mm512_fmadd_ps(a_vec, b_vec, ab_vec);
if (count_scalars) goto simsimd_dot_f32_skylake_cycle;
*result = _simsimd_reduce_f32x16_skylake(ab_vec);
}
SIMSIMD_PUBLIC void simsimd_dot_f64_skylake(simsimd_f64_t const *a_scalars, simsimd_f64_t const *b_scalars,
simsimd_size_t count_scalars, simsimd_distance_t *result) {
__m512d a_vec, b_vec;
__m512d ab_vec = _mm512_setzero_pd();
simsimd_dot_f64_skylake_cycle:
if (count_scalars < 8) {
__mmask8 mask = (__mmask8)_bzhi_u32(0xFFFFFFFF, count_scalars);
a_vec = _mm512_maskz_loadu_pd(mask, a_scalars);
b_vec = _mm512_maskz_loadu_pd(mask, b_scalars);
count_scalars = 0;
}
else {
a_vec = _mm512_loadu_pd(a_scalars);
b_vec = _mm512_loadu_pd(b_scalars);
a_scalars += 8, b_scalars += 8, count_scalars -= 8;
}
ab_vec = _mm512_fmadd_pd(a_vec, b_vec, ab_vec);
if (count_scalars) goto simsimd_dot_f64_skylake_cycle;
*result = _mm512_reduce_add_pd(ab_vec);
}
SIMSIMD_PUBLIC void simsimd_dot_f32c_skylake(simsimd_f32c_t const *a_pairs, simsimd_f32c_t const *b_pairs,
simsimd_size_t count_pairs, simsimd_distance_t *results) {
__m512 a_vec, b_vec;
__m512 ab_real_vec = _mm512_setzero();
__m512 ab_imag_vec = _mm512_setzero();
// We take into account that FMS is the same as FMA with a negative multiplier.
// To multiply a floating-point value by -1, we can use the `XOR` instruction to flip the sign bit.
// This way we can avoid the shuffling and the need for separate real and imaginary parts.
// For the imaginary part of the product, we would need to swap the real and imaginary parts of
// one of the vectors.
__m512i const sign_flip_vec = _mm512_set1_epi64(0x8000000000000000);
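// Viewed as 32-bit lanes, `sign_flip_vec` is {0, 0x80000000, 0, 0x80000000, ...}, so the XOR after the
// loop negates every odd `f32` lane. The `_mm512_permute_ps(b_vec, 0xB1)` below selects elements
// {1, 0, 3, 2} within each 128-bit lane, swapping the real and imaginary halves of every pair.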
simsimd_dot_f32c_skylake_cycle:
if (count_pairs < 8) {
__mmask16 mask = (__mmask16)_bzhi_u32(0xFFFFFFFF, count_pairs * 2);
a_vec = _mm512_maskz_loadu_ps(mask, a_pairs);
b_vec = _mm512_maskz_loadu_ps(mask, b_pairs);
count_pairs = 0;
}
else {
a_vec = _mm512_loadu_ps(a_pairs);
b_vec = _mm512_loadu_ps(b_pairs);
a_pairs += 8, b_pairs += 8, count_pairs -= 8;
}
ab_real_vec = _mm512_fmadd_ps(b_vec, a_vec, ab_real_vec);
b_vec = _mm512_permute_ps(b_vec, 0xB1); //? Swap adjacent entries within each pair
ab_imag_vec = _mm512_fmadd_ps(b_vec, a_vec, ab_imag_vec);
if (count_pairs) goto simsimd_dot_f32c_skylake_cycle;
// Flip the sign bit in every second scalar before accumulation:
ab_real_vec = _mm512_castsi512_ps(_mm512_xor_si512(_mm512_castps_si512(ab_real_vec), sign_flip_vec));
// Reduce horizontal sums:
results[0] = _simsimd_reduce_f32x16_skylake(ab_real_vec);
results[1] = _simsimd_reduce_f32x16_skylake(ab_imag_vec);
}
SIMSIMD_PUBLIC void simsimd_vdot_f32c_skylake(simsimd_f32c_t const *a_pairs, simsimd_f32c_t const *b_pairs,
simsimd_size_t count_pairs, simsimd_distance_t *results) {
__m512 a_vec, b_vec;
__m512 ab_real_vec = _mm512_setzero();
__m512 ab_imag_vec = _mm512_setzero();
// We take into account that FMS is the same as FMA with a negative multiplier.
// To multiply a floating-point value by -1, we can use the `XOR` instruction to flip the sign bit.
// This way we can avoid the shuffling and the need for separate real and imaginary parts.
// For the imaginary part of the product, we would need to swap the real and imaginary parts of
// one of the vectors.
__m512i const sign_flip_vec = _mm512_set1_epi64(0x8000000000000000);
__m512i const swap_adjacent_vec = _mm512_set_epi8( //
59, 58, 57, 56, 63, 62, 61, 60, 51, 50, 49, 48, 55, 54, 53, 52, // 4th 128-bit lane
43, 42, 41, 40, 47, 46, 45, 44, 35, 34, 33, 32, 39, 38, 37, 36, // 3rd 128-bit lane
27, 26, 25, 24, 31, 30, 29, 28, 19, 18, 17, 16, 23, 22, 21, 20, // 2nd 128-bit lane
11, 10, 9, 8, 15, 14, 13, 12, 3, 2, 1, 0, 7, 6, 5, 4 // 1st 128-bit lane
);
simsimd_vdot_f32c_skylake_cycle:
if (count_pairs < 8) {
__mmask16 mask = (__mmask16)_bzhi_u32(0xFFFFFFFF, count_pairs * 2);
a_vec = _mm512_maskz_loadu_ps(mask, (simsimd_f32_t const *)a_pairs);
b_vec = _mm512_maskz_loadu_ps(mask, (simsimd_f32_t const *)b_pairs);
count_pairs = 0;
}
else {
a_vec = _mm512_loadu_ps((simsimd_f32_t const *)a_pairs);
b_vec = _mm512_loadu_ps((simsimd_f32_t const *)b_pairs);
a_pairs += 8, b_pairs += 8, count_pairs -= 8;
}
ab_real_vec = _mm512_fmadd_ps(a_vec, b_vec, ab_real_vec);
b_vec = _mm512_permute_ps(b_vec, 0xB1); //? Swap adjacent entries within each pair
ab_imag_vec = _mm512_fmadd_ps(a_vec, b_vec, ab_imag_vec);
if (count_pairs) goto simsimd_vdot_f32c_skylake_cycle;
// Flip the sign bit in every second scalar before accumulation:
ab_imag_vec = _mm512_castsi512_ps(_mm512_xor_si512(_mm512_castps_si512(ab_imag_vec), sign_flip_vec));
// Reduce horizontal sums:
results[0] = _simsimd_reduce_f32x16_skylake(ab_real_vec);
results[1] = _simsimd_reduce_f32x16_skylake(ab_imag_vec);
}
SIMSIMD_PUBLIC void simsimd_dot_f64c_skylake(simsimd_f64c_t const *a_pairs, simsimd_f64c_t const *b_pairs,
simsimd_size_t count_pairs, simsimd_distance_t *results) {
__m512d a_vec, b_vec;
__m512d ab_real_vec = _mm512_setzero_pd();
__m512d ab_imag_vec = _mm512_setzero_pd();
// We take into account that FMS is the same as FMA with a negative multiplier.
// To multiply a floating-point value by -1, we can use the `XOR` instruction to flip the sign bit.
// This way we can avoid the shuffling and the need for separate real and imaginary parts.
// For the imaginary part of the product, we would need to swap the real and imaginary parts of
// one of the vectors.
__m512i const sign_flip_vec = _mm512_set_epi64( //
0x8000000000000000, 0x0000000000000000, 0x8000000000000000, 0x0000000000000000, //
0x8000000000000000, 0x0000000000000000, 0x8000000000000000, 0x0000000000000000 //
);
simsimd_dot_f64c_skylake_cycle:
if (count_pairs < 4) {
__mmask8 mask = (__mmask8)_bzhi_u32(0xFFFFFFFF, count_pairs * 2);
a_vec = _mm512_maskz_loadu_pd(mask, a_pairs);
b_vec = _mm512_maskz_loadu_pd(mask, b_pairs);
count_pairs = 0;
}
else {
a_vec = _mm512_loadu_pd(a_pairs);
b_vec = _mm512_loadu_pd(b_pairs);
a_pairs += 4, b_pairs += 4, count_pairs -= 4;
}
ab_real_vec = _mm512_fmadd_pd(b_vec, a_vec, ab_real_vec);
b_vec = _mm512_permute_pd(b_vec, 0x55); //? Same as 0b01010101.
ab_imag_vec = _mm512_fmadd_pd(b_vec, a_vec, ab_imag_vec);
if (count_pairs) goto simsimd_dot_f64c_skylake_cycle;
// Flip the sign bit in every second scalar before accumulation:
ab_real_vec = _mm512_castsi512_pd(_mm512_xor_si512(_mm512_castpd_si512(ab_real_vec), sign_flip_vec));
// Reduce horizontal sums:
results[0] = _mm512_reduce_add_pd(ab_real_vec);
results[1] = _mm512_reduce_add_pd(ab_imag_vec);
}
SIMSIMD_PUBLIC void simsimd_vdot_f64c_skylake(simsimd_f64c_t const *a_pairs, simsimd_f64c_t const *b_pairs,
simsimd_size_t count_pairs, simsimd_distance_t *results) {
__m512d a_vec, b_vec;
__m512d ab_real_vec = _mm512_setzero_pd();
__m512d ab_imag_vec = _mm512_setzero_pd();
// We take into account that FMS is the same as FMA with a negative multiplier.
// To multiply a floating-point value by -1, we can use the `XOR` instruction to flip the sign bit.
// This way we can avoid the shuffling and the need for separate real and imaginary parts.
// For the imaginary part of the product, we would need to swap the real and imaginary parts of
// one of the vectors.
__m512i const sign_flip_vec = _mm512_set_epi64( //
0x8000000000000000, 0x0000000000000000, 0x8000000000000000, 0x0000000000000000, //
0x8000000000000000, 0x0000000000000000, 0x8000000000000000, 0x0000000000000000 //
);
simsimd_vdot_f64c_skylake_cycle:
if (count_pairs < 4) {
__mmask8 mask = (__mmask8)_bzhi_u32(0xFFFFFFFF, count_pairs * 2);
a_vec = _mm512_maskz_loadu_pd(mask, (simsimd_f64_t const *)a_pairs);
b_vec = _mm512_maskz_loadu_pd(mask, (simsimd_f64_t const *)b_pairs);
count_pairs = 0;
}
else {
a_vec = _mm512_loadu_pd((simsimd_f64_t const *)a_pairs);
b_vec = _mm512_loadu_pd((simsimd_f64_t const *)b_pairs);
a_pairs += 4, b_pairs += 4, count_pairs -= 4;
}
ab_real_vec = _mm512_fmadd_pd(a_vec, b_vec, ab_real_vec);
b_vec = _mm512_permute_pd(b_vec, 0x55); //? Same as 0b01010101.
ab_imag_vec = _mm512_fmadd_pd(a_vec, b_vec, ab_imag_vec);
if (count_pairs) goto simsimd_vdot_f64c_skylake_cycle;
// Flip the sign bit in every second scalar before accumulation:
ab_imag_vec = _mm512_castsi512_pd(_mm512_xor_si512(_mm512_castpd_si512(ab_imag_vec), sign_flip_vec));
// Reduce horizontal sums:
results[0] = _mm512_reduce_add_pd(ab_real_vec);
results[1] = _mm512_reduce_add_pd(ab_imag_vec);
}
#pragma clang attribute pop
#pragma GCC pop_options
#endif // SIMSIMD_TARGET_SKYLAKE
#if SIMSIMD_TARGET_GENOA
#pragma GCC push_options
#pragma GCC target("avx2", "avx512f", "avx512vl", "bmi2", "avx512bw", "avx512bf16")
#pragma clang attribute push(__attribute__((target("avx2,avx512f,avx512vl,bmi2,avx512bw,avx512bf16"))), \
apply_to = function)
SIMSIMD_PUBLIC void simsimd_dot_bf16_genoa(simsimd_bf16_t const *a_scalars, simsimd_bf16_t const *b_scalars,
simsimd_size_t count_scalars, simsimd_distance_t *result) {
__m512i a_i16_vec, b_i16_vec;
__m512 ab_vec = _mm512_setzero_ps();
simsimd_dot_bf16_genoa_cycle:
if (count_scalars < 32) {
__mmask32 mask = (__mmask32)_bzhi_u32(0xFFFFFFFF, count_scalars);
a_i16_vec = _mm512_maskz_loadu_epi16(mask, a_scalars);
b_i16_vec = _mm512_maskz_loadu_epi16(mask, b_scalars);
count_scalars = 0;
}
else {
a_i16_vec = _mm512_loadu_epi16(a_scalars);
b_i16_vec = _mm512_loadu_epi16(b_scalars);
a_scalars += 32, b_scalars += 32, count_scalars -= 32;
}
ab_vec = _mm512_dpbf16_ps(ab_vec, (__m512bh)(a_i16_vec), (__m512bh)(b_i16_vec));
if (count_scalars) goto simsimd_dot_bf16_genoa_cycle;
*result = _simsimd_reduce_f32x16_skylake(ab_vec);
}
SIMSIMD_PUBLIC void simsimd_dot_bf16c_genoa(simsimd_bf16c_t const *a_pairs, simsimd_bf16c_t const *b_pairs,
simsimd_size_t count_pairs, simsimd_distance_t *results) {
__m512i a_vec, b_vec;
__m512 ab_real_vec = _mm512_setzero_ps();
__m512 ab_imag_vec = _mm512_setzero_ps();
// We take into account that FMS is the same as FMA with a negative multiplier.
// To multiply a floating-point value by -1, we can use the `XOR` instruction to flip the sign bit.
// This way we can avoid the shuffling and the need for separate real and imaginary parts.
// For the imaginary part of the product, we would need to swap the real and imaginary parts of
// one of the vectors.
__m512i const sign_flip_vec = _mm512_set1_epi32(0x80000000);
__m512i const swap_adjacent_vec = _mm512_set_epi8( //
61, 60, 63, 62, 57, 56, 59, 58, 53, 52, 55, 54, 49, 48, 51, 50, // 4th 128-bit lane
45, 44, 47, 46, 41, 40, 43, 42, 37, 36, 39, 38, 33, 32, 35, 34, // 3rd 128-bit lane
29, 28, 31, 30, 25, 24, 27, 26, 21, 20, 23, 22, 17, 16, 19, 18, // 2nd 128-bit lane
13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2 // 1st 128-bit lane
);
simsimd_dot_bf16c_genoa_cycle:
if (count_pairs < 16) {
__mmask32 mask = (__mmask32)_bzhi_u32(0xFFFFFFFF, count_pairs * 2);
a_vec = _mm512_maskz_loadu_epi16(mask, (simsimd_i16_t const *)a_pairs);
b_vec = _mm512_maskz_loadu_epi16(mask, (simsimd_i16_t const *)b_pairs);
count_pairs = 0;
}
else {
a_vec = _mm512_loadu_epi16((simsimd_i16_t const *)a_pairs);
b_vec = _mm512_loadu_epi16((simsimd_i16_t const *)b_pairs);
a_pairs += 16, b_pairs += 16, count_pairs -= 16;
}
ab_real_vec = _mm512_dpbf16_ps(ab_real_vec, (__m512bh)(_mm512_xor_si512(b_vec, sign_flip_vec)), (__m512bh)(a_vec));
ab_imag_vec =
_mm512_dpbf16_ps(ab_imag_vec, (__m512bh)(_mm512_shuffle_epi8(b_vec, swap_adjacent_vec)), (__m512bh)(a_vec));
if (count_pairs) goto simsimd_dot_bf16c_genoa_cycle;
// Reduce horizontal sums:
results[0] = _simsimd_reduce_f32x16_skylake(ab_real_vec);
results[1] = _simsimd_reduce_f32x16_skylake(ab_imag_vec);
}
SIMSIMD_PUBLIC void simsimd_vdot_bf16c_genoa(simsimd_bf16c_t const *a_pairs, simsimd_bf16c_t const *b_pairs,
simsimd_size_t count_pairs, simsimd_distance_t *results) {
__m512i a_vec, b_vec;
__m512 ab_real_vec = _mm512_setzero_ps();
__m512 ab_imag_vec = _mm512_setzero_ps();
// We take into account that FMS is the same as FMA with a negative multiplier.
// To multiply a floating-point value by -1, we can use the `XOR` instruction to flip the sign bit.
// This way we can avoid the shuffling and the need for separate real and imaginary parts.
// For the imaginary part of the product, we would need to swap the real and imaginary parts of
// one of the vectors.
__m512i const sign_flip_vec = _mm512_set1_epi32(0x80000000);
__m512i const swap_adjacent_vec = _mm512_set_epi8( //
61, 60, 63, 62, 57, 56, 59, 58, 53, 52, 55, 54, 49, 48, 51, 50, // 4th 128-bit lane
45, 44, 47, 46, 41, 40, 43, 42, 37, 36, 39, 38, 33, 32, 35, 34, // 3rd 128-bit lane
29, 28, 31, 30, 25, 24, 27, 26, 21, 20, 23, 22, 17, 16, 19, 18, // 2nd 128-bit lane
13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2 // 1st 128-bit lane
);
simsimd_vdot_bf16c_genoa_cycle:
if (count_pairs < 16) {
__mmask32 mask = (__mmask32)_bzhi_u32(0xFFFFFFFF, count_pairs * 2);
a_vec = _mm512_maskz_loadu_epi16(mask, (simsimd_i16_t const *)a_pairs);
b_vec = _mm512_maskz_loadu_epi16(mask, (simsimd_i16_t const *)b_pairs);
count_pairs = 0;
}
else {
a_vec = _mm512_loadu_epi16((simsimd_i16_t const *)a_pairs);
b_vec = _mm512_loadu_epi16((simsimd_i16_t const *)b_pairs);
a_pairs += 16, b_pairs += 16, count_pairs -= 16;
}
ab_real_vec = _mm512_dpbf16_ps(ab_real_vec, (__m512bh)(a_vec), (__m512bh)(b_vec));
a_vec = _mm512_xor_si512(a_vec, sign_flip_vec);
b_vec = _mm512_shuffle_epi8(b_vec, swap_adjacent_vec);
ab_imag_vec = _mm512_dpbf16_ps(ab_imag_vec, (__m512bh)(a_vec), (__m512bh)(b_vec));
if (count_pairs) goto simsimd_vdot_bf16c_genoa_cycle;
// Reduce horizontal sums:
results[0] = _simsimd_reduce_f32x16_skylake(ab_real_vec);
results[1] = _simsimd_reduce_f32x16_skylake(ab_imag_vec);
}
#pragma clang attribute pop
#pragma GCC pop_options
#endif // SIMSIMD_TARGET_GENOA
#if SIMSIMD_TARGET_SAPPHIRE
#pragma GCC push_options
#pragma GCC target("avx2", "avx512f", "avx512vl", "bmi2", "avx512bw", "avx512fp16")
#pragma clang attribute push(__attribute__((target("avx2,avx512f,avx512vl,bmi2,avx512bw,avx512fp16"))), \
apply_to = function)
SIMSIMD_PUBLIC void simsimd_dot_f16_sapphire(simsimd_f16_t const *a_scalars, simsimd_f16_t const *b_scalars,
simsimd_size_t count_scalars, simsimd_distance_t *result) {
__m512i a_i16_vec, b_i16_vec;
__m512h ab_vec = _mm512_setzero_ph();
simsimd_dot_f16_sapphire_cycle:
if (count_scalars < 32) {
__mmask32 mask = (__mmask32)_bzhi_u32(0xFFFFFFFF, count_scalars);
a_i16_vec = _mm512_maskz_loadu_epi16(mask, a_scalars);
b_i16_vec = _mm512_maskz_loadu_epi16(mask, b_scalars);
count_scalars = 0;
}
else {
a_i16_vec = _mm512_loadu_epi16(a_scalars);
b_i16_vec = _mm512_loadu_epi16(b_scalars);
a_scalars += 32, b_scalars += 32, count_scalars -= 32;
}
ab_vec = _mm512_fmadd_ph(_mm512_castsi512_ph(a_i16_vec), _mm512_castsi512_ph(b_i16_vec), ab_vec);
if (count_scalars) goto simsimd_dot_f16_sapphire_cycle;
*result = _mm512_reduce_add_ph(ab_vec);
}
SIMSIMD_PUBLIC void simsimd_dot_f16c_sapphire(simsimd_f16c_t const *a_pairs, simsimd_f16c_t const *b_pairs,
simsimd_size_t count_pairs, simsimd_distance_t *results) {
__m512i a_vec, b_vec;
__m512h ab_real_vec = _mm512_setzero_ph();
__m512h ab_imag_vec = _mm512_setzero_ph();
// We take into account that FMS is the same as FMA with a negative multiplier.
// To multiply a floating-point value by -1, we can use the `XOR` instruction to flip the sign bit.
// This way we can avoid the shuffling and the need for separate real and imaginary parts.
// For the imaginary part of the product, we would need to swap the real and imaginary parts of
// one of the vectors.
__m512i const sign_flip_vec = _mm512_set1_epi32(0x80000000);
__m512i const swap_adjacent_vec = _mm512_set_epi8( //
61, 60, 63, 62, 57, 56, 59, 58, 53, 52, 55, 54, 49, 48, 51, 50, // 4th 128-bit lane
45, 44, 47, 46, 41, 40, 43, 42, 37, 36, 39, 38, 33, 32, 35, 34, // 3rd 128-bit lane
29, 28, 31, 30, 25, 24, 27, 26, 21, 20, 23, 22, 17, 16, 19, 18, // 2nd 128-bit lane
13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2 // 1st 128-bit lane
);
simsimd_dot_f16c_sapphire_cycle:
if (count_pairs < 16) {
__mmask32 mask = (__mmask32)_bzhi_u32(0xFFFFFFFF, count_pairs * 2);
a_vec = _mm512_maskz_loadu_epi16(mask, a_pairs);
b_vec = _mm512_maskz_loadu_epi16(mask, b_pairs);
count_pairs = 0;
}
else {
a_vec = _mm512_loadu_epi16(a_pairs);
b_vec = _mm512_loadu_epi16(b_pairs);
a_pairs += 16, b_pairs += 16, count_pairs -= 16;
}
// TODO: Consider using `_mm512_fmaddsub` and `_mm512_fcmadd_pch`
ab_real_vec = _mm512_fmadd_ph(_mm512_castsi512_ph(_mm512_xor_si512(b_vec, sign_flip_vec)),
_mm512_castsi512_ph(a_vec), ab_real_vec);
ab_imag_vec = _mm512_fmadd_ph(_mm512_castsi512_ph(_mm512_shuffle_epi8(b_vec, swap_adjacent_vec)),
_mm512_castsi512_ph(a_vec), ab_imag_vec);
if (count_pairs) goto simsimd_dot_f16c_sapphire_cycle;
// Reduce horizontal sums:
// TODO: Optimize this with tree-like reductions
results[0] = _mm512_reduce_add_ph(ab_real_vec);
results[1] = _mm512_reduce_add_ph(ab_imag_vec);
}
SIMSIMD_PUBLIC void simsimd_vdot_f16c_sapphire(simsimd_f16c_t const *a_pairs, simsimd_f16c_t const *b_pairs,
simsimd_size_t count_pairs, simsimd_distance_t *results) {
__m512i a_vec, b_vec;
__m512h ab_real_vec = _mm512_setzero_ph();
__m512h ab_imag_vec = _mm512_setzero_ph();
// We take into account that FMS is the same as FMA with a negative multiplier.
// To multiply a floating-point value by -1, we can use the `XOR` instruction to flip the sign bit.
// This way we can avoid the shuffling and the need for separate real and imaginary parts.
// For the imaginary part of the product, we would need to swap the real and imaginary parts of
// one of the vectors.
__m512i const sign_flip_vec = _mm512_set1_epi32(0x80000000);
__m512i const swap_adjacent_vec = _mm512_set_epi8( //
61, 60, 63, 62, 57, 56, 59, 58, 53, 52, 55, 54, 49, 48, 51, 50, // 4th 128-bit lane
45, 44, 47, 46, 41, 40, 43, 42, 37, 36, 39, 38, 33, 32, 35, 34, // 3rd 128-bit lane
29, 28, 31, 30, 25, 24, 27, 26, 21, 20, 23, 22, 17, 16, 19, 18, // 2nd 128-bit lane
13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2 // 1st 128-bit lane
);
simsimd_vdot_f16c_sapphire_cycle:
if (count_pairs < 16) {
__mmask32 mask = (__mmask32)_bzhi_u32(0xFFFFFFFF, count_pairs * 2);
a_vec = _mm512_maskz_loadu_epi16(mask, a_pairs);
b_vec = _mm512_maskz_loadu_epi16(mask, b_pairs);
count_pairs = 0;
}
else {
a_vec = _mm512_loadu_epi16(a_pairs);
b_vec = _mm512_loadu_epi16(b_pairs);
a_pairs += 16, b_pairs += 16, count_pairs -= 16;
}
// TODO: Consider using `_mm512_fmaddsub` and `_mm512_fcmadd_pch`
ab_real_vec = _mm512_fmadd_ph(_mm512_castsi512_ph(a_vec), _mm512_castsi512_ph(b_vec), ab_real_vec);
a_vec = _mm512_xor_si512(a_vec, sign_flip_vec);
b_vec = _mm512_shuffle_epi8(b_vec, swap_adjacent_vec);
ab_imag_vec = _mm512_fmadd_ph(_mm512_castsi512_ph(a_vec), _mm512_castsi512_ph(b_vec), ab_imag_vec);
if (count_pairs) goto simsimd_vdot_f16c_sapphire_cycle;
// Reduce horizontal sums:
results[0] = _mm512_reduce_add_ph(ab_real_vec);
results[1] = _mm512_reduce_add_ph(ab_imag_vec);
}
#pragma clang attribute pop
#pragma GCC pop_options
#endif // SIMSIMD_TARGET_SAPPHIRE
#if SIMSIMD_TARGET_ICE
#pragma GCC push_options
#pragma GCC target("avx2", "avx512f", "avx512vl", "bmi2", "avx512bw", "avx512vnni")
#pragma clang attribute push(__attribute__((target("avx2,avx512f,avx512vl,bmi2,avx512bw,avx512vnni"))), \
apply_to = function)
SIMSIMD_PUBLIC void simsimd_dot_i8_ice(simsimd_i8_t const *a_scalars, simsimd_i8_t const *b_scalars,
simsimd_size_t count_scalars, simsimd_distance_t *result) {
__m512i a_i16_vec, b_i16_vec;
__m512i ab_i32_vec = _mm512_setzero_si512();
simsimd_dot_i8_ice_cycle:
if (count_scalars < 32) {
__mmask32 mask = (__mmask32)_bzhi_u32(0xFFFFFFFF, count_scalars);
a_i16_vec = _mm512_cvtepi8_epi16(_mm256_maskz_loadu_epi8(mask, a_scalars));
b_i16_vec = _mm512_cvtepi8_epi16(_mm256_maskz_loadu_epi8(mask, b_scalars));
count_scalars = 0;
}
else {
a_i16_vec = _mm512_cvtepi8_epi16(_mm256_lddqu_si256((__m256i const *)a_scalars));
b_i16_vec = _mm512_cvtepi8_epi16(_mm256_lddqu_si256((__m256i const *)b_scalars));
a_scalars += 32, b_scalars += 32, count_scalars -= 32;
}
// Unfortunately we can't use the `_mm512_dpbusd_epi32` intrinsic here either,
// as it's asymmetric with respect to the sign of the input arguments:
// Signed(ZeroExtend16(a_scalars.byte[4*j]) * SignExtend16(b_scalars.byte[4*j]))
// So we have to use the `_mm512_dpwssd_epi32` intrinsic instead, upcasting
// to 16-bit beforehand.
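// For example, an input byte of 0xFF means -1, but `dpbusd` would zero-extend it to 255,
// skewing that lane's product by 256 * b.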
ab_i32_vec = _mm512_dpwssd_epi32(ab_i32_vec, a_i16_vec, b_i16_vec);
if (count_scalars) goto simsimd_dot_i8_ice_cycle;
*result = _mm512_reduce_add_epi32(ab_i32_vec);
}
SIMSIMD_PUBLIC void simsimd_dot_u8_ice(simsimd_u8_t const *a_scalars, simsimd_u8_t const *b_scalars,
simsimd_size_t count_scalars, simsimd_distance_t *result) {
__m512i a_u8_vec, b_u8_vec;
__m512i a_i16_low_vec, a_i16_high_vec, b_i16_low_vec, b_i16_high_vec;
__m512i ab_i32_low_vec = _mm512_setzero_si512();
__m512i ab_i32_high_vec = _mm512_setzero_si512();
__m512i const zeros_vec = _mm512_setzero_si512();
simsimd_dot_u8_ice_cycle:
if (count_scalars < 64) {
__mmask64 mask = (__mmask64)_bzhi_u64(0xFFFFFFFFFFFFFFFF, count_scalars);
a_u8_vec = _mm512_maskz_loadu_epi8(mask, a_scalars);
b_u8_vec = _mm512_maskz_loadu_epi8(mask, b_scalars);
count_scalars = 0;
}
else {
a_u8_vec = _mm512_loadu_si512(a_scalars);
b_u8_vec = _mm512_loadu_si512(b_scalars);
a_scalars += 64, b_scalars += 64, count_scalars -= 64;
}
// Upcast `uint8` to `int16`. Unlike the signed version, we can use the unpacking
// instructions instead of extracts, as they are much faster and more efficient.
a_i16_low_vec = _mm512_unpacklo_epi8(a_u8_vec, zeros_vec);
a_i16_high_vec = _mm512_unpackhi_epi8(a_u8_vec, zeros_vec);
b_i16_low_vec = _mm512_unpacklo_epi8(b_u8_vec, zeros_vec);
b_i16_high_vec = _mm512_unpackhi_epi8(b_u8_vec, zeros_vec);
// Unfortunately we can't use the `_mm512_dpbusd_epi32` intrinsic here either,
// as it's asymmetric with respect to the sign of the input arguments:
// Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j]))
// So we have to use the `_mm512_dpwssd_epi32` intrinsic instead, upcasting
// to 16-bit beforehand.
ab_i32_low_vec = _mm512_dpwssd_epi32(ab_i32_low_vec, a_i16_low_vec, b_i16_low_vec);
ab_i32_high_vec = _mm512_dpwssd_epi32(ab_i32_high_vec, a_i16_high_vec, b_i16_high_vec);
if (count_scalars) goto simsimd_dot_u8_ice_cycle;
*result = _mm512_reduce_add_epi32(_mm512_add_epi32(ab_i32_low_vec, ab_i32_high_vec));
}
#pragma clang attribute pop
#pragma GCC pop_options
#endif // SIMSIMD_TARGET_ICE
#if SIMSIMD_TARGET_SIERRA
#pragma GCC push_options
#pragma GCC target("avx2", "bmi2", "avx2vnni")
#pragma clang attribute push(__attribute__((target("avx2,bmi2,avx2vnni"))), apply_to = function)
SIMSIMD_PUBLIC void simsimd_dot_i8_sierra(simsimd_i8_t const *a_scalars, simsimd_i8_t const *b_scalars,
simsimd_size_t count_scalars, simsimd_distance_t *result) {
__m256i ab_i32_vec = _mm256_setzero_si256();
simsimd_size_t idx_scalars = 0;
for (; idx_scalars + 32 <= count_scalars; idx_scalars += 32) {
__m256i a_i8_vec = _mm256_lddqu_si256((__m256i const *)(a_scalars + idx_scalars));
__m256i b_i8_vec = _mm256_lddqu_si256((__m256i const *)(b_scalars + idx_scalars));
ab_i32_vec = _mm256_dpbssds_epi32(ab_i32_vec, a_i8_vec, b_i8_vec);
}
// Further reduce to a single sum for each vector
int ab = _simsimd_reduce_i32x8_haswell(ab_i32_vec);
// Take care of the tail:
for (; idx_scalars < count_scalars; ++idx_scalars) ab += (int)(a_scalars[idx_scalars]) * b_scalars[idx_scalars];
*result = ab;
}
#pragma clang attribute pop
#pragma GCC pop_options
#endif // SIMSIMD_TARGET_SIERRA
#endif // _SIMSIMD_TARGET_X86
#ifdef __cplusplus
}
#endif
#endif
simsimd-6.5.12/include/simsimd/elementwise.h 0000644 0000000 0000000 00000335423 10461020230 0017160 0 ustar 0000000 0000000 /**
* @file elementwise.h
* @brief SIMD-accelerated mixed-precision element-wise operations.
* @author Ash Vardanian
* @date October 16, 2024
*
* Contains following element-wise operations:
* - Sum (Add): R[i] = A[i] + B[i]
* - Scale (Multiply): R[i] = Alpha * A[i]
* - WSum or Weighted-Sum: R[i] = Alpha * A[i] + Beta * B[i]
* - FMA or Fused-Multiply-Add: R[i] = Alpha * A[i] * B[i] + Beta * C[i]
*
 * This tiny set of operations is enough to implement a wide range of algorithms.
* To scale a vector by a scalar, just call WSum with $Beta$ = 0.
* To sum two vectors, just call WSum with $Alpha$ = $Beta$ = 1.
* To average two vectors, just call WSum with $Alpha$ = $Beta$ = 0.5.
* To multiply vectors element-wise, just call FMA with $Beta$ = 0.
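 *
 * An illustrative usage sketch (assuming valid `simsimd_f32_t` buffers `a`, `b`, `c`, and `out` of
 * length `n`; every pointer argument must still point to valid memory even when its weight is zero):
 *
 *      simsimd_wsum_f32_serial(a, b, n, 0.5, 0.5, out);   // out[i] = 0.5 * a[i] + 0.5 * b[i]
 *      simsimd_fma_f32_serial(a, b, c, n, 1.0, 0.0, out); // out[i] = a[i] * b[i]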
*
* For datatypes:
* - 64-bit IEEE floating point numbers
* - 32-bit IEEE floating point numbers
* - 16-bit IEEE floating point numbers
* - 16-bit brain floating point numbers
* - 8-bit unsigned integers
* - 8-bit signed integers
*
* For hardware architectures:
* - Arm: NEON
* - x86: Haswell, Skylake, Sapphire
*
* We use `f16` for `i8` and `u8` arithmetic. This is because Arm received `f16` support earlier than `bf16`.
* For example, Apple M1 has `f16` support and `bf16` was only added in M2. On the other hand, on paper,
* AMD Genoa has `bf16` support, and `f16` is only available on Intel Sapphire Rapids and newer.
* Sadly, the SIMD support for `bf16` is limited to mixed-precision dot-products, which makes it useless here.
*
* x86 intrinsics: https://www.intel.com/content/www/us/en/docs/intrinsics-guide/
* Arm intrinsics: https://developer.arm.com/architectures/instruction-sets/intrinsics/
*/
#ifndef SIMSIMD_ELEMENTWISE_H
#define SIMSIMD_ELEMENTWISE_H
#include "types.h"
#ifdef __cplusplus
extern "C" {
#endif
/* Serial backends for all numeric types.
* By default they use 32-bit arithmetic, unless the arguments themselves contain 64-bit floats.
* For double-precision computation check out the "*_accurate" variants of those "*_serial" functions.
*/
SIMSIMD_PUBLIC void simsimd_wsum_f64_serial( //
simsimd_f64_t const *a, simsimd_f64_t const *b, simsimd_size_t n, //
simsimd_distance_t alpha, simsimd_distance_t beta, simsimd_f64_t *result);
SIMSIMD_PUBLIC void simsimd_wsum_f32_serial( //
simsimd_f32_t const *a, simsimd_f32_t const *b, simsimd_size_t n, //
simsimd_distance_t alpha, simsimd_distance_t beta, simsimd_f32_t *result);
SIMSIMD_PUBLIC void simsimd_wsum_f16_serial( //
simsimd_f16_t const *a, simsimd_f16_t const *b, simsimd_size_t n, //
simsimd_distance_t alpha, simsimd_distance_t beta, simsimd_f16_t *result);
SIMSIMD_PUBLIC void simsimd_wsum_bf16_serial( //
simsimd_bf16_t const *a, simsimd_bf16_t const *b, simsimd_size_t n, //
simsimd_distance_t alpha, simsimd_distance_t beta, simsimd_bf16_t *result);
SIMSIMD_PUBLIC void simsimd_wsum_i8_serial( //
simsimd_i8_t const *a, simsimd_i8_t const *b, simsimd_size_t n, //
simsimd_distance_t alpha, simsimd_distance_t beta, simsimd_i8_t *result);
SIMSIMD_PUBLIC void simsimd_wsum_u8_serial( //
simsimd_u8_t const *a, simsimd_u8_t const *b, simsimd_size_t n, //
simsimd_distance_t alpha, simsimd_distance_t beta, simsimd_u8_t *result);
SIMSIMD_PUBLIC void simsimd_fma_f64_serial( //
simsimd_f64_t const *a, simsimd_f64_t const *b, simsimd_f64_t const *c, simsimd_size_t n, //
simsimd_distance_t alpha, simsimd_distance_t beta, simsimd_f64_t *result);
SIMSIMD_PUBLIC void simsimd_fma_f32_serial( //
simsimd_f32_t const *a, simsimd_f32_t const *b, simsimd_f32_t const *c, simsimd_size_t n, //
simsimd_distance_t alpha, simsimd_distance_t beta, simsimd_f32_t *result);
SIMSIMD_PUBLIC void simsimd_fma_f16_serial( //
simsimd_f16_t const *a, simsimd_f16_t const *b, simsimd_f16_t const *c, simsimd_size_t n, //
simsimd_distance_t alpha, simsimd_distance_t beta, simsimd_f16_t *result);
SIMSIMD_PUBLIC void simsimd_fma_bf16_serial( //
simsimd_bf16_t const *a, simsimd_bf16_t const *b, simsimd_bf16_t const *c, simsimd_size_t n, //
simsimd_distance_t alpha, simsimd_distance_t beta, simsimd_bf16_t *result);
SIMSIMD_PUBLIC void simsimd_fma_i8_serial( //
simsimd_i8_t const *a, simsimd_i8_t const *b, simsimd_i8_t const *c, simsimd_size_t n, //
simsimd_distance_t alpha, simsimd_distance_t beta, simsimd_i8_t *result);
SIMSIMD_PUBLIC void simsimd_fma_u8_serial( //
simsimd_u8_t const *a, simsimd_u8_t const *b, simsimd_u8_t const *c, simsimd_size_t n, //
simsimd_distance_t alpha, simsimd_distance_t beta, simsimd_u8_t *result);
#define SIMSIMD_MAKE_WSUM(name, input_type, accumulator_type, load_and_convert, convert_and_store) \
SIMSIMD_PUBLIC void simsimd_wsum_##input_type##_##name( \
simsimd_##input_type##_t const *a, simsimd_##input_type##_t const *b, simsimd_size_t n, \
simsimd_distance_t alpha, simsimd_distance_t beta, simsimd_##input_type##_t *result) { \
for (simsimd_size_t i = 0; i != n; ++i) { \
simsimd_##accumulator_type##_t ai = load_and_convert(a + i); \
simsimd_##accumulator_type##_t bi = load_and_convert(b + i); \
simsimd_##accumulator_type##_t ai_scaled = (simsimd_##accumulator_type##_t)(ai * alpha); \
simsimd_##accumulator_type##_t bi_scaled = (simsimd_##accumulator_type##_t)(bi * beta); \
simsimd_##accumulator_type##_t sum = ai_scaled + bi_scaled; \
convert_and_store(sum, result + i); \
} \
}
#define SIMSIMD_MAKE_FMA(name, input_type, accumulator_type, load_and_convert, convert_and_store) \
SIMSIMD_PUBLIC void simsimd_fma_##input_type##_##name( \
simsimd_##input_type##_t const *a, simsimd_##input_type##_t const *b, simsimd_##input_type##_t const *c, \
simsimd_size_t n, simsimd_distance_t alpha, simsimd_distance_t beta, simsimd_##input_type##_t *result) { \
for (simsimd_size_t i = 0; i != n; ++i) { \
simsimd_##accumulator_type##_t ai = load_and_convert(a + i); \
simsimd_##accumulator_type##_t bi = load_and_convert(b + i); \
simsimd_##accumulator_type##_t ci = load_and_convert(c + i); \
simsimd_##accumulator_type##_t abi_scaled = (simsimd_##accumulator_type##_t)(ai * bi * alpha); \
simsimd_##accumulator_type##_t ci_scaled = (simsimd_##accumulator_type##_t)(ci * beta); \
simsimd_##accumulator_type##_t sum = abi_scaled + ci_scaled; \
convert_and_store(sum, result + i); \
} \
}
SIMSIMD_MAKE_WSUM(serial, f64, f64, SIMSIMD_DEREFERENCE, SIMSIMD_EXPORT) // simsimd_wsum_f64_serial
SIMSIMD_MAKE_WSUM(serial, f32, f32, SIMSIMD_DEREFERENCE, SIMSIMD_EXPORT) // simsimd_wsum_f32_serial
SIMSIMD_MAKE_WSUM(serial, f16, f32, SIMSIMD_F16_TO_F32, SIMSIMD_F32_TO_F16) // simsimd_wsum_f16_serial
SIMSIMD_MAKE_WSUM(serial, bf16, f32, SIMSIMD_BF16_TO_F32, SIMSIMD_F32_TO_BF16) // simsimd_wsum_bf16_serial
SIMSIMD_MAKE_WSUM(serial, i8, f32, SIMSIMD_DEREFERENCE, SIMSIMD_F32_TO_I8) // simsimd_wsum_i8_serial
SIMSIMD_MAKE_WSUM(serial, u8, f32, SIMSIMD_DEREFERENCE, SIMSIMD_F32_TO_U8) // simsimd_wsum_u8_serial
SIMSIMD_MAKE_WSUM(accurate, f32, f64, SIMSIMD_DEREFERENCE, SIMSIMD_EXPORT) // simsimd_wsum_f32_accurate
SIMSIMD_MAKE_WSUM(accurate, f16, f64, SIMSIMD_F16_TO_F32, SIMSIMD_F32_TO_F16) // simsimd_wsum_f16_accurate
SIMSIMD_MAKE_WSUM(accurate, bf16, f64, SIMSIMD_BF16_TO_F32, SIMSIMD_F32_TO_BF16) // simsimd_wsum_bf16_accurate
SIMSIMD_MAKE_WSUM(accurate, i8, f64, SIMSIMD_DEREFERENCE, SIMSIMD_F64_TO_I8) // simsimd_wsum_i8_accurate
SIMSIMD_MAKE_WSUM(accurate, u8, f64, SIMSIMD_DEREFERENCE, SIMSIMD_F64_TO_U8) // simsimd_wsum_u8_accurate
SIMSIMD_MAKE_FMA(serial, f64, f64, SIMSIMD_DEREFERENCE, SIMSIMD_EXPORT) // simsimd_fma_f64_serial
SIMSIMD_MAKE_FMA(serial, f32, f32, SIMSIMD_DEREFERENCE, SIMSIMD_EXPORT) // simsimd_fma_f32_serial
SIMSIMD_MAKE_FMA(serial, f16, f32, SIMSIMD_F16_TO_F32, SIMSIMD_F32_TO_F16) // simsimd_fma_f16_serial
SIMSIMD_MAKE_FMA(serial, bf16, f32, SIMSIMD_BF16_TO_F32, SIMSIMD_F32_TO_BF16) // simsimd_fma_bf16_serial
SIMSIMD_MAKE_FMA(serial, i8, f32, SIMSIMD_DEREFERENCE, SIMSIMD_F32_TO_I8) // simsimd_fma_i8_serial
SIMSIMD_MAKE_FMA(serial, u8, f32, SIMSIMD_DEREFERENCE, SIMSIMD_F32_TO_U8) // simsimd_fma_u8_serial
SIMSIMD_MAKE_FMA(accurate, f32, f64, SIMSIMD_DEREFERENCE, SIMSIMD_EXPORT) // simsimd_fma_f32_accurate
SIMSIMD_MAKE_FMA(accurate, f16, f64, SIMSIMD_F16_TO_F32, SIMSIMD_F32_TO_F16) // simsimd_fma_f16_accurate
SIMSIMD_MAKE_FMA(accurate, bf16, f64, SIMSIMD_BF16_TO_F32, SIMSIMD_F32_TO_BF16) // simsimd_fma_bf16_accurate
SIMSIMD_MAKE_FMA(accurate, i8, f64, SIMSIMD_DEREFERENCE, SIMSIMD_F64_TO_I8) // simsimd_fma_i8_accurate
SIMSIMD_MAKE_FMA(accurate, u8, f64, SIMSIMD_DEREFERENCE, SIMSIMD_F64_TO_U8) // simsimd_fma_u8_accurate
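/* For illustration, here is roughly what one of the instantiations above expands to.
 * `SIMSIMD_MAKE_WSUM(serial, f16, f32, SIMSIMD_F16_TO_F32, SIMSIMD_F32_TO_F16)` generates a
 * function equivalent to the sketch below (kept inside a comment so it is not compiled twice):
 *
 *   SIMSIMD_PUBLIC void simsimd_wsum_f16_serial(
 *       simsimd_f16_t const *a, simsimd_f16_t const *b, simsimd_size_t n,
 *       simsimd_distance_t alpha, simsimd_distance_t beta, simsimd_f16_t *result) {
 *       for (simsimd_size_t i = 0; i != n; ++i) {
 *           simsimd_f32_t ai = SIMSIMD_F16_TO_F32(a + i); // widen each half-precision input to f32
 *           simsimd_f32_t bi = SIMSIMD_F16_TO_F32(b + i);
 *           simsimd_f32_t ai_scaled = (simsimd_f32_t)(ai * alpha);
 *           simsimd_f32_t bi_scaled = (simsimd_f32_t)(bi * beta);
 *           simsimd_f32_t sum = ai_scaled + bi_scaled;
 *           SIMSIMD_F32_TO_F16(sum, result + i); // narrow the f32 result back to f16
 *       }
 *   }
 */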
/* SIMD-powered backends for Arm NEON, mostly using 32-bit arithmetic over 128-bit words.
* By far the most portable backend, covering most Arm v8 devices, over a billion phones, and almost all
* Arm server CPUs produced before 2023.
*/
SIMSIMD_PUBLIC void simsimd_wsum_f32_neon( //
simsimd_f32_t const *a, simsimd_f32_t const *b, simsimd_size_t n, //
simsimd_distance_t alpha, simsimd_distance_t beta, simsimd_f32_t *result);
SIMSIMD_PUBLIC void simsimd_wsum_f16_neon( //
simsimd_f16_t const *a, simsimd_f16_t const *b, simsimd_size_t n, //
simsimd_distance_t alpha, simsimd_distance_t beta, simsimd_f16_t *result);
SIMSIMD_PUBLIC void simsimd_wsum_bf16_neon( //
simsimd_bf16_t const *a, simsimd_bf16_t const *b, simsimd_size_t n, //
simsimd_distance_t alpha, simsimd_distance_t beta, simsimd_bf16_t *result);
SIMSIMD_PUBLIC void simsimd_wsum_u8_neon( //
simsimd_u8_t const *a, simsimd_u8_t const *b, simsimd_size_t n, //
simsimd_distance_t alpha, simsimd_distance_t beta, simsimd_u8_t *result);
SIMSIMD_PUBLIC void simsimd_wsum_i8_neon( //
simsimd_i8_t const *a, simsimd_i8_t const *b, simsimd_size_t n, //
simsimd_distance_t alpha, simsimd_distance_t beta, simsimd_i8_t *result);
SIMSIMD_PUBLIC void simsimd_fma_f32_neon( //
simsimd_f32_t const *a, simsimd_f32_t const *b, simsimd_f32_t const *c, //
simsimd_size_t n, simsimd_distance_t alpha, simsimd_distance_t beta, simsimd_f32_t *result);
SIMSIMD_PUBLIC void simsimd_fma_f16_neon( //
simsimd_f16_t const *a, simsimd_f16_t const *b, simsimd_f16_t const *c, //
simsimd_size_t n, simsimd_distance_t alpha, simsimd_distance_t beta, simsimd_f16_t *result);
SIMSIMD_PUBLIC void simsimd_fma_bf16_neon( //
simsimd_bf16_t const *a, simsimd_bf16_t const *b, simsimd_bf16_t const *c, //
simsimd_size_t n, simsimd_distance_t alpha, simsimd_distance_t beta, simsimd_bf16_t *result);
SIMSIMD_PUBLIC void simsimd_fma_u8_neon( //
simsimd_u8_t const *a, simsimd_u8_t const *b, simsimd_u8_t const *c, //
simsimd_size_t n, simsimd_distance_t alpha, simsimd_distance_t beta, simsimd_u8_t *result);
SIMSIMD_PUBLIC void simsimd_fma_i8_neon( //
simsimd_i8_t const *a, simsimd_i8_t const *b, simsimd_i8_t const *c, //
simsimd_size_t n, simsimd_distance_t alpha, simsimd_distance_t beta, simsimd_i8_t *result);
/* SIMD-powered backends for AVX2 CPUs of Haswell generation and newer, using 32-bit arithmetic over 256-bit words.
* First demonstrated in 2011, at least one Haswell-based processor was still being sold in 2022 — the Pentium G3420.
* Practically all modern x86 CPUs support AVX2, FMA, and F16C, making it a perfect baseline for SIMD algorithms.
* On the other hand, recent compilers auto-vectorize plain `f32` and `f64` loops fairly well,
* so the dedicated AVX2 kernels for those types bring smaller gains than the reduced-precision ones.
*/
SIMSIMD_PUBLIC void simsimd_wsum_f64_haswell( //
simsimd_f64_t const *a, simsimd_f64_t const *b, simsimd_size_t n, //
simsimd_distance_t alpha, simsimd_distance_t beta, simsimd_f64_t *result);
SIMSIMD_PUBLIC void simsimd_wsum_f32_haswell( //
simsimd_f32_t const *a, simsimd_f32_t const *b, simsimd_size_t n, //
simsimd_distance_t alpha, simsimd_distance_t beta, simsimd_f32_t *result);
SIMSIMD_PUBLIC void simsimd_wsum_f16_haswell( //
simsimd_f16_t const *a, simsimd_f16_t const *b, simsimd_size_t n, //
simsimd_distance_t alpha, simsimd_distance_t beta, simsimd_f16_t *result);
SIMSIMD_PUBLIC void simsimd_wsum_bf16_haswell( //
simsimd_bf16_t const *a, simsimd_bf16_t const *b, simsimd_size_t n, //
simsimd_distance_t alpha, simsimd_distance_t beta, simsimd_bf16_t *result);
SIMSIMD_PUBLIC void simsimd_wsum_i8_haswell( //
simsimd_i8_t const *a, simsimd_i8_t const *b, simsimd_size_t n, //
simsimd_distance_t alpha, simsimd_distance_t beta, simsimd_i8_t *result);
SIMSIMD_PUBLIC void simsimd_wsum_u8_haswell( //
simsimd_u8_t const *a, simsimd_u8_t const *b, simsimd_size_t n, //
simsimd_distance_t alpha, simsimd_distance_t beta, simsimd_u8_t *result);
SIMSIMD_PUBLIC void simsimd_fma_f64_haswell( //
simsimd_f64_t const *a, simsimd_f64_t const *b, simsimd_f64_t const *c, simsimd_size_t n, //
simsimd_distance_t alpha, simsimd_distance_t beta, simsimd_f64_t *result);
SIMSIMD_PUBLIC void simsimd_fma_f32_haswell( //
simsimd_f32_t const *a, simsimd_f32_t const *b, simsimd_f32_t const *c, simsimd_size_t n, //
simsimd_distance_t alpha, simsimd_distance_t beta, simsimd_f32_t *result);
SIMSIMD_PUBLIC void simsimd_fma_f16_haswell( //
simsimd_f16_t const *a, simsimd_f16_t const *b, simsimd_f16_t const *c, simsimd_size_t n, //
simsimd_distance_t alpha, simsimd_distance_t beta, simsimd_f16_t *result);
SIMSIMD_PUBLIC void simsimd_fma_bf16_haswell( //
simsimd_bf16_t const *a, simsimd_bf16_t const *b, simsimd_bf16_t const *c, simsimd_size_t n, //
simsimd_distance_t alpha, simsimd_distance_t beta, simsimd_bf16_t *result);
SIMSIMD_PUBLIC void simsimd_fma_i8_haswell( //
simsimd_i8_t const *a, simsimd_i8_t const *b, simsimd_i8_t const *c, simsimd_size_t n, //
simsimd_distance_t alpha, simsimd_distance_t beta, simsimd_i8_t *result);
SIMSIMD_PUBLIC void simsimd_fma_u8_haswell( //
simsimd_u8_t const *a, simsimd_u8_t const *b, simsimd_u8_t const *c, simsimd_size_t n, //
simsimd_distance_t alpha, simsimd_distance_t beta, simsimd_u8_t *result);
/* SIMD-powered backends for various generations of AVX512 CPUs.
* Unlike the distance metrics, the SIMD implementation of FMA and WSum benefits from aligned stores.
* Assuming the size of a ZMM register matches the width of a cache line, one can skip the unaligned
* head and tail of the output buffer and use only aligned stores in the main loop. Many of the
* kernels below keep the logic simpler and rely on unaligned accesses with a masked (predicated) tail.
*/
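/* A note on the tail handling used in the kernels below: instead of a scalar epilogue, the
 * AVX-512 code builds a predication mask for the last partial step. `_bzhi_u32(0xFFFFFFFF, n)`
 * clears every bit at position `n` and above, so with 5 elements left the mask becomes 0x1F
 * and only five lanes take part in the masked load and store. A minimal sketch of that pattern
 * (illustrative only, matching the inline code further below rather than adding a new API):
 *
 *   __mmask16 tail_mask = (__mmask16)_bzhi_u32(0xFFFFFFFF, (unsigned int)n); // n < 16 left
 *   a_vec = _mm512_maskz_loadu_ps(tail_mask, a);        // inactive lanes are zeroed
 *   _mm512_mask_storeu_ps(result, tail_mask, sum_vec);  // inactive lanes are left untouched
 */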
SIMSIMD_PUBLIC void simsimd_wsum_f64_skylake( //
simsimd_f64_t const *a, simsimd_f64_t const *b, simsimd_size_t n, //
simsimd_distance_t alpha, simsimd_distance_t beta, simsimd_f64_t *result);
SIMSIMD_PUBLIC void simsimd_wsum_f32_skylake( //
simsimd_f32_t const *a, simsimd_f32_t const *b, simsimd_size_t n, //
simsimd_distance_t alpha, simsimd_distance_t beta, simsimd_f32_t *result);
SIMSIMD_PUBLIC void simsimd_wsum_bf16_skylake( //
simsimd_bf16_t const *a, simsimd_bf16_t const *b, simsimd_size_t n, //
simsimd_distance_t alpha, simsimd_distance_t beta, simsimd_bf16_t *result);
SIMSIMD_PUBLIC void simsimd_fma_f64_skylake( //
simsimd_f64_t const *a, simsimd_f64_t const *b, simsimd_f64_t const *c, simsimd_size_t n, //
simsimd_distance_t alpha, simsimd_distance_t beta, simsimd_f64_t *result);
SIMSIMD_PUBLIC void simsimd_fma_f32_skylake( //
simsimd_f32_t const *a, simsimd_f32_t const *b, simsimd_f32_t const *c, simsimd_size_t n, //
simsimd_distance_t alpha, simsimd_distance_t beta, simsimd_f32_t *result);
SIMSIMD_PUBLIC void simsimd_fma_bf16_skylake( //
simsimd_bf16_t const *a, simsimd_bf16_t const *b, simsimd_bf16_t const *c, simsimd_size_t n, //
simsimd_distance_t alpha, simsimd_distance_t beta, simsimd_bf16_t *result);
SIMSIMD_PUBLIC void simsimd_wsum_f16_sapphire( //
simsimd_f16_t const *a, simsimd_f16_t const *b, simsimd_size_t n, //
simsimd_distance_t alpha, simsimd_distance_t beta, simsimd_f16_t *result);
SIMSIMD_PUBLIC void simsimd_wsum_i8_sapphire( //
simsimd_i8_t const *a, simsimd_i8_t const *b, simsimd_size_t n, //
simsimd_distance_t alpha, simsimd_distance_t beta, simsimd_i8_t *result);
SIMSIMD_PUBLIC void simsimd_wsum_u8_sapphire( //
simsimd_u8_t const *a, simsimd_u8_t const *b, simsimd_size_t n, //
simsimd_distance_t alpha, simsimd_distance_t beta, simsimd_u8_t *result);
SIMSIMD_PUBLIC void simsimd_fma_f16_sapphire( //
simsimd_f16_t const *a, simsimd_f16_t const *b, simsimd_f16_t const *c, simsimd_size_t n, //
simsimd_distance_t alpha, simsimd_distance_t beta, simsimd_f16_t *result);
SIMSIMD_PUBLIC void simsimd_fma_i8_sapphire( //
simsimd_i8_t const *a, simsimd_i8_t const *b, simsimd_i8_t const *c, simsimd_size_t n, //
simsimd_distance_t alpha, simsimd_distance_t beta, simsimd_i8_t *result);
SIMSIMD_PUBLIC void simsimd_fma_u8_sapphire( //
simsimd_u8_t const *a, simsimd_u8_t const *b, simsimd_u8_t const *c, simsimd_size_t n, //
simsimd_distance_t alpha, simsimd_distance_t beta, simsimd_u8_t *result);
#if _SIMSIMD_TARGET_X86
#if SIMSIMD_TARGET_HASWELL
#pragma GCC push_options
#pragma GCC target("avx2", "f16c", "fma")
#pragma clang attribute push(__attribute__((target("avx2,f16c,fma"))), apply_to = function)
SIMSIMD_PUBLIC void simsimd_sum_f32_haswell(simsimd_f32_t const *a, simsimd_f32_t const *b, simsimd_size_t n,
simsimd_f32_t *result) {
// The main loop:
simsimd_size_t i = 0;
for (; i + 8 <= n; i += 8) {
__m256 a_vec = _mm256_loadu_ps(a + i);
__m256 b_vec = _mm256_loadu_ps(b + i);
__m256 sum_vec = _mm256_add_ps(a_vec, b_vec);
_mm256_storeu_ps(result + i, sum_vec);
}
// The tail:
for (; i < n; ++i) result[i] = a[i] + b[i];
}
SIMSIMD_PUBLIC void simsimd_scale_f32_haswell(simsimd_f32_t const *a, simsimd_size_t n, simsimd_distance_t alpha,
simsimd_f32_t *result) {
simsimd_f32_t alpha_f32 = (simsimd_f32_t)alpha;
__m256 alpha_vec = _mm256_set1_ps(alpha_f32);
// The main loop:
simsimd_size_t i = 0;
for (; i + 8 <= n; i += 8) {
__m256 a_vec = _mm256_loadu_ps(a + i);
__m256 sum_vec = _mm256_mul_ps(a_vec, alpha_vec);
_mm256_storeu_ps(result + i, sum_vec);
}
// The tail:
for (; i < n; ++i) result[i] = alpha_f32 * a[i];
}
SIMSIMD_PUBLIC void simsimd_wsum_f32_haswell( //
simsimd_f32_t const *a, simsimd_f32_t const *b, simsimd_size_t n, //
simsimd_distance_t alpha, simsimd_distance_t beta, simsimd_f32_t *result) {
simsimd_f32_t alpha_f32 = (simsimd_f32_t)alpha;
simsimd_f32_t beta_f32 = (simsimd_f32_t)beta;
// There are several special cases we may want to implement:
// 1. Simple addition, when both weights are equal to 1.0.
if (alpha == 1 && beta == 1) {
// In this case we can avoid expensive multiplications.
simsimd_sum_f32_haswell(a, b, n, result);
return;
}
// 2. Just scaling, when one of the weights is equal to zero.
else if (alpha == 0 || beta == 0) {
// In this case we can avoid half of the load instructions.
if (beta == 0) { simsimd_scale_f32_haswell(a, n, alpha, result); }
else { simsimd_scale_f32_haswell(b, n, beta, result); }
return;
}
// The general case.
__m256 alpha_vec = _mm256_set1_ps(alpha_f32);
__m256 beta_vec = _mm256_set1_ps(beta_f32);
// The main loop:
simsimd_size_t i = 0;
for (; i + 8 <= n; i += 8) {
__m256 a_vec = _mm256_loadu_ps(a + i);
__m256 b_vec = _mm256_loadu_ps(b + i);
__m256 a_scaled_vec = _mm256_mul_ps(a_vec, alpha_vec);
__m256 b_scaled_vec = _mm256_mul_ps(b_vec, beta_vec);
__m256 sum_vec = _mm256_add_ps(a_scaled_vec, b_scaled_vec);
_mm256_storeu_ps(result + i, sum_vec);
}
// The tail:
for (; i < n; ++i) result[i] = alpha_f32 * a[i] + beta_f32 * b[i];
}
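/* A minimal usage sketch for the kernel above. The include path and buffer size here are
 * illustrative assumptions, not part of the API contract:
 *
 *   #include <simsimd/simsimd.h>
 *
 *   simsimd_f32_t a[256], b[256], mean[256];
 *   // ... fill `a` and `b` ...
 *   // Element-wise average: mean[i] = 0.5f * a[i] + 0.5f * b[i]
 *   simsimd_wsum_f32_haswell(a, b, 256, 0.5, 0.5, mean);
 */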
SIMSIMD_PUBLIC void simsimd_sum_f64_haswell(simsimd_f64_t const *a, simsimd_f64_t const *b, simsimd_size_t n,
simsimd_f64_t *result) {
// The main loop:
simsimd_size_t i = 0;
for (; i + 4 <= n; i += 4) {
__m256d a_vec = _mm256_loadu_pd(a + i);
__m256d b_vec = _mm256_loadu_pd(b + i);
__m256d sum_vec = _mm256_add_pd(a_vec, b_vec);
_mm256_storeu_pd(result + i, sum_vec);
}
// The tail:
for (; i < n; ++i) result[i] = a[i] + b[i];
}
SIMSIMD_PUBLIC void simsimd_scale_f64_haswell(simsimd_f64_t const *a, simsimd_size_t n, simsimd_distance_t alpha,
simsimd_f64_t *result) {
__m256d alpha_vec = _mm256_set1_pd(alpha);
// The main loop:
simsimd_size_t i = 0;
for (; i + 4 <= n; i += 4) {
__m256d a_vec = _mm256_loadu_pd(a + i);
__m256d sum_vec = _mm256_mul_pd(a_vec, alpha_vec);
_mm256_storeu_pd(result + i, sum_vec);
}
// The tail:
for (; i < n; ++i) result[i] = alpha * a[i];
}
SIMSIMD_PUBLIC void simsimd_wsum_f64_haswell( //
simsimd_f64_t const *a, simsimd_f64_t const *b, simsimd_size_t n, //
simsimd_distance_t alpha, simsimd_distance_t beta, simsimd_f64_t *result) {
// There are several special cases we may want to implement:
// 1. Simple addition, when both weights are equal to 1.0.
if (alpha == 1 && beta == 1) {
// In this case we can avoid expensive multiplications.
simsimd_sum_f64_haswell(a, b, n, result);
return;
}
// 2. Just scaling, when one of the weights is equal to zero.
else if (alpha == 0 || beta == 0) {
// In this case we can avoid half of the load instructions.
if (beta == 0) { simsimd_scale_f64_haswell(a, n, alpha, result); }
else { simsimd_scale_f64_haswell(b, n, beta, result); }
return;
}
// The general case.
__m256d alpha_vec = _mm256_set1_pd(alpha);
__m256d beta_vec = _mm256_set1_pd(beta);
// The main loop:
simsimd_size_t i = 0;
for (; i + 4 <= n; i += 4) {
__m256d a_vec = _mm256_loadu_pd(a + i);
__m256d b_vec = _mm256_loadu_pd(b + i);
__m256d a_scaled_vec = _mm256_mul_pd(a_vec, alpha_vec);
__m256d b_scaled_vec = _mm256_mul_pd(b_vec, beta_vec);
__m256d sum_vec = _mm256_add_pd(a_scaled_vec, b_scaled_vec);
_mm256_storeu_pd(result + i, sum_vec);
}
// The tail:
for (; i < n; ++i) result[i] = alpha * a[i] + beta * b[i];
}
SIMSIMD_PUBLIC void simsimd_sum_f16_haswell(simsimd_f16_t const *a, simsimd_f16_t const *b, simsimd_size_t n,
simsimd_f16_t *result) {
// The main loop:
simsimd_size_t i = 0;
for (; i + 8 <= n; i += 8) {
__m128i a_f16 = _mm_lddqu_si128((__m128i const *)(a + i));
__m128i b_f16 = _mm_lddqu_si128((__m128i const *)(b + i));
__m256 a_vec = _mm256_cvtph_ps(a_f16);
__m256 b_vec = _mm256_cvtph_ps(b_f16);
__m256 sum_vec = _mm256_add_ps(a_vec, b_vec);
__m128i sum_f16 = _mm256_cvtps_ph(sum_vec, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
_mm_storeu_si128((__m128i *)(result + i), sum_f16);
}
// The tail:
for (; i < n; ++i) {
simsimd_f32_t ai = SIMSIMD_F16_TO_F32(a + i);
simsimd_f32_t bi = SIMSIMD_F16_TO_F32(b + i);
simsimd_f32_t sum = ai + bi;
SIMSIMD_F32_TO_F16(sum, result + i);
}
}
SIMSIMD_PUBLIC void simsimd_scale_f16_haswell(simsimd_f16_t const *a, simsimd_size_t n, simsimd_distance_t alpha,
simsimd_f16_t *result) {
simsimd_f32_t alpha_f32 = (simsimd_f32_t)alpha;
__m256 alpha_vec = _mm256_set1_ps(alpha_f32);
// The main loop:
simsimd_size_t i = 0;
for (; i + 8 <= n; i += 8) {
__m128i a_f16 = _mm_lddqu_si128((__m128i const *)(a + i));
__m256 a_vec = _mm256_cvtph_ps(a_f16);
__m256 sum_vec = _mm256_mul_ps(a_vec, alpha_vec);
__m128i sum_f16 = _mm256_cvtps_ph(sum_vec, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
_mm_storeu_si128((__m128i *)(result + i), sum_f16);
}
// The tail:
for (; i < n; ++i) {
simsimd_f32_t ai = SIMSIMD_F16_TO_F32(a + i);
simsimd_f32_t sum = alpha_f32 * ai;
SIMSIMD_F32_TO_F16(sum, result + i);
}
}
SIMSIMD_PUBLIC void simsimd_wsum_f16_haswell( //
simsimd_f16_t const *a, simsimd_f16_t const *b, simsimd_size_t n, //
simsimd_distance_t alpha, simsimd_distance_t beta, simsimd_f16_t *result) {
// There are several special cases we may want to implement:
// 1. Simple addition, when both weights are equal to 1.0.
if (alpha == 1 && beta == 1) {
// In this case we can avoid expensive multiplications.
simsimd_sum_f16_haswell(a, b, n, result);
return;
}
// 2. Just scaling, when one of the weights is equal to zero.
else if (alpha == 0 || beta == 0) {
// In this case we can avoid half of the load instructions.
if (beta == 0) { simsimd_scale_f16_haswell(a, n, alpha, result); }
else { simsimd_scale_f16_haswell(b, n, beta, result); }
return;
}
// The general case.
simsimd_f32_t alpha_f32 = (simsimd_f32_t)alpha;
simsimd_f32_t beta_f32 = (simsimd_f32_t)beta;
__m256 alpha_vec = _mm256_set1_ps(alpha_f32);
__m256 beta_vec = _mm256_set1_ps(beta_f32);
// The main loop:
simsimd_size_t i = 0;
for (; i + 8 <= n; i += 8) {
__m128i a_f16 = _mm_lddqu_si128((__m128i const *)(a + i));
__m128i b_f16 = _mm_lddqu_si128((__m128i const *)(b + i));
__m256 a_vec = _mm256_cvtph_ps(a_f16);
__m256 b_vec = _mm256_cvtph_ps(b_f16);
__m256 a_scaled_vec = _mm256_mul_ps(a_vec, alpha_vec);
__m256 b_scaled_vec = _mm256_mul_ps(b_vec, beta_vec);
__m256 sum_vec = _mm256_add_ps(a_scaled_vec, b_scaled_vec);
__m128i sum_f16 = _mm256_cvtps_ph(sum_vec, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
_mm_storeu_si128((__m128i *)(result + i), sum_f16);
}
// The tail:
for (; i < n; ++i) {
simsimd_f32_t ai = SIMSIMD_F16_TO_F32(a + i);
simsimd_f32_t bi = SIMSIMD_F16_TO_F32(b + i);
simsimd_f32_t sum = alpha_f32 * ai + beta_f32 * bi;
SIMSIMD_F32_TO_F16(sum, result + i);
}
}
SIMSIMD_PUBLIC void simsimd_sum_bf16_haswell(simsimd_bf16_t const *a, simsimd_bf16_t const *b, simsimd_size_t n,
simsimd_bf16_t *result) {
// The main loop:
simsimd_size_t i = 0;
for (; i + 8 <= n; i += 8) {
__m128i a_bf16 = _mm_lddqu_si128((__m128i const *)(a + i));
__m128i b_bf16 = _mm_lddqu_si128((__m128i const *)(b + i));
__m256 a_vec = _simsimd_bf16x8_to_f32x8_haswell(a_bf16);
__m256 b_vec = _simsimd_bf16x8_to_f32x8_haswell(b_bf16);
__m256 sum_vec = _mm256_add_ps(a_vec, b_vec);
__m128i sum_bf16 = _simsimd_f32x8_to_bf16x8_haswell(sum_vec);
_mm_storeu_si128((__m128i *)(result + i), sum_bf16);
}
// The tail:
for (; i < n; ++i) {
simsimd_f32_t ai = SIMSIMD_BF16_TO_F32(a + i);
simsimd_f32_t bi = SIMSIMD_BF16_TO_F32(b + i);
simsimd_f32_t sum = ai + bi;
SIMSIMD_F32_TO_BF16(sum, result + i);
}
}
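/* The `_simsimd_bf16x8_to_f32x8_haswell` and `_simsimd_f32x8_to_bf16x8_haswell` helpers are
 * defined elsewhere in this library. The underlying idea: `bf16` keeps only the top 16 bits of
 * an IEEE-754 `f32`, so widening amounts to a 16-bit left shift. A scalar sketch of the same
 * upcast (illustrative; the library's own `SIMSIMD_BF16_TO_F32` may differ in details):
 *
 *   static float bf16_bits_to_f32(unsigned short bits) {
 *       union { unsigned int u32; float f32; } pun;
 *       pun.u32 = (unsigned int)bits << 16; // place the bf16 bits into the upper half of an f32
 *       return pun.f32;
 *   }
 */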
SIMSIMD_PUBLIC void simsimd_scale_bf16_haswell(simsimd_bf16_t const *a, simsimd_size_t n, simsimd_distance_t alpha,
simsimd_bf16_t *result) {
simsimd_f32_t alpha_f32 = (simsimd_f32_t)alpha;
__m256 alpha_vec = _mm256_set1_ps(alpha_f32);
// The main loop:
simsimd_size_t i = 0;
for (; i + 8 <= n; i += 8) {
__m128i a_bf16 = _mm_lddqu_si128((__m128i const *)(a + i));
__m256 a_vec = _simsimd_bf16x8_to_f32x8_haswell(a_bf16);
__m256 sum_vec = _mm256_mul_ps(a_vec, alpha_vec);
__m128i sum_bf16 = _simsimd_f32x8_to_bf16x8_haswell(sum_vec);
_mm_storeu_si128((__m128i *)(result + i), sum_bf16);
}
// The tail:
for (; i < n; ++i) {
simsimd_f32_t ai = SIMSIMD_BF16_TO_F32(a + i);
simsimd_f32_t sum = alpha_f32 * ai;
SIMSIMD_F32_TO_BF16(sum, result + i);
}
}
SIMSIMD_PUBLIC void simsimd_wsum_bf16_haswell( //
simsimd_bf16_t const *a, simsimd_bf16_t const *b, simsimd_size_t n, //
simsimd_distance_t alpha, simsimd_distance_t beta, simsimd_bf16_t *result) {
// There are several special cases we may want to implement:
// 1. Simple addition, when both weights are equal to 1.0.
if (alpha == 1 && beta == 1) {
// In this case we can avoid expensive multiplications.
simsimd_sum_bf16_haswell(a, b, n, result);
return;
}
// 2. Just scaling, when one of the weights is equal to zero.
else if (alpha == 0 || beta == 0) {
// In this case we can avoid half of the load instructions.
if (beta == 0) { simsimd_scale_bf16_haswell(a, n, alpha, result); }
else { simsimd_scale_bf16_haswell(b, n, beta, result); }
return;
}
// The general case.
simsimd_f32_t alpha_f32 = (simsimd_f32_t)alpha;
simsimd_f32_t beta_f32 = (simsimd_f32_t)beta;
__m256 alpha_vec = _mm256_set1_ps(alpha_f32);
__m256 beta_vec = _mm256_set1_ps(beta_f32);
// The main loop:
simsimd_size_t i = 0;
for (; i + 8 <= n; i += 8) {
__m128i a_bf16 = _mm_lddqu_si128((__m128i const *)(a + i));
__m128i b_bf16 = _mm_lddqu_si128((__m128i const *)(b + i));
__m256 a_vec = _simsimd_bf16x8_to_f32x8_haswell(a_bf16);
__m256 b_vec = _simsimd_bf16x8_to_f32x8_haswell(b_bf16);
__m256 a_scaled_vec = _mm256_mul_ps(a_vec, alpha_vec);
__m256 b_scaled_vec = _mm256_mul_ps(b_vec, beta_vec);
__m256 sum_vec = _mm256_add_ps(a_scaled_vec, b_scaled_vec);
__m128i sum_bf16 = _simsimd_f32x8_to_bf16x8_haswell(sum_vec);
_mm_storeu_si128((__m128i *)(result + i), sum_bf16);
}
// The tail:
for (; i < n; ++i) {
simsimd_f32_t ai = SIMSIMD_BF16_TO_F32(a + i);
simsimd_f32_t bi = SIMSIMD_BF16_TO_F32(b + i);
simsimd_f32_t sum = alpha_f32 * ai + beta_f32 * bi;
SIMSIMD_F32_TO_BF16(sum, result + i);
}
}
SIMSIMD_PUBLIC void simsimd_fma_f32_haswell( //
simsimd_f32_t const *a, simsimd_f32_t const *b, simsimd_f32_t const *c, //
simsimd_size_t n, simsimd_distance_t alpha, simsimd_distance_t beta, simsimd_f32_t *result) {
simsimd_f32_t alpha_f32 = (simsimd_f32_t)alpha;
simsimd_f32_t beta_f32 = (simsimd_f32_t)beta;
__m256 alpha_vec = _mm256_set1_ps(alpha_f32);
__m256 beta_vec = _mm256_set1_ps(beta_f32);
// The main loop:
simsimd_size_t i = 0;
for (; i + 8 <= n; i += 8) {
__m256 a_vec = _mm256_loadu_ps(a + i);
__m256 b_vec = _mm256_loadu_ps(b + i);
__m256 c_vec = _mm256_loadu_ps(c + i);
__m256 ab_vec = _mm256_mul_ps(a_vec, b_vec);
__m256 ab_scaled_vec = _mm256_mul_ps(ab_vec, alpha_vec);
__m256 c_scaled_vec = _mm256_mul_ps(c_vec, beta_vec);
__m256 sum_vec = _mm256_add_ps(ab_scaled_vec, c_scaled_vec);
_mm256_storeu_ps(result + i, sum_vec);
}
// The tail:
for (; i < n; ++i) result[i] = alpha_f32 * a[i] * b[i] + beta_f32 * c[i];
}
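/* Although the `fma` target feature is enabled for this block, the kernel above keeps the scale
 * and the final addition as separate instructions. A fused alternative (a sketch, not necessarily
 * the library's preferred form) would compute `beta * c + alpha * a * b` with a single rounding,
 * mirroring what the Skylake kernels further below do with `_mm512_fmadd_ps`:
 *
 *   __m256 sum_vec = _mm256_fmadd_ps(c_vec, beta_vec, ab_scaled_vec);
 *
 * Because FMA rounds once, the result may differ from the mul-then-add version in the last bit.
 */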
SIMSIMD_PUBLIC void simsimd_fma_f64_haswell( //
simsimd_f64_t const *a, simsimd_f64_t const *b, simsimd_f64_t const *c, //
simsimd_size_t n, simsimd_distance_t alpha, simsimd_distance_t beta, simsimd_f64_t *result) {
__m256d alpha_vec = _mm256_set1_pd(alpha);
__m256d beta_vec = _mm256_set1_pd(beta);
// The main loop:
simsimd_size_t i = 0;
for (; i + 4 <= n; i += 4) {
__m256d a_vec = _mm256_loadu_pd(a + i);
__m256d b_vec = _mm256_loadu_pd(b + i);
__m256d c_vec = _mm256_loadu_pd(c + i);
__m256d ab_vec = _mm256_mul_pd(a_vec, b_vec);
__m256d ab_scaled_vec = _mm256_mul_pd(ab_vec, alpha_vec);
__m256d c_scaled_vec = _mm256_mul_pd(c_vec, beta_vec);
__m256d sum_vec = _mm256_add_pd(ab_scaled_vec, c_scaled_vec);
_mm256_storeu_pd(result + i, sum_vec);
}
// The tail:
for (; i < n; ++i) result[i] = alpha * a[i] * b[i] + beta * c[i];
}
SIMSIMD_PUBLIC void simsimd_fma_f16_haswell( //
simsimd_f16_t const *a, simsimd_f16_t const *b, simsimd_f16_t const *c, //
simsimd_size_t n, simsimd_distance_t alpha, simsimd_distance_t beta, simsimd_f16_t *result) {
simsimd_f32_t alpha_f32 = (simsimd_f32_t)alpha;
simsimd_f32_t beta_f32 = (simsimd_f32_t)beta;
__m256 alpha_vec = _mm256_set1_ps(alpha_f32);
__m256 beta_vec = _mm256_set1_ps(beta_f32);
// The main loop:
simsimd_size_t i = 0;
for (; i + 8 <= n; i += 8) {
__m128i a_f16 = _mm_lddqu_si128((__m128i const *)(a + i));
__m128i b_f16 = _mm_lddqu_si128((__m128i const *)(b + i));
__m128i c_f16 = _mm_lddqu_si128((__m128i const *)(c + i));
__m256 a_vec = _mm256_cvtph_ps(a_f16);
__m256 b_vec = _mm256_cvtph_ps(b_f16);
__m256 c_vec = _mm256_cvtph_ps(c_f16);
__m256 ab_vec = _mm256_mul_ps(a_vec, b_vec);
__m256 ab_scaled_vec = _mm256_mul_ps(ab_vec, alpha_vec);
__m256 c_scaled_vec = _mm256_mul_ps(c_vec, beta_vec);
__m256 sum_vec = _mm256_add_ps(ab_scaled_vec, c_scaled_vec);
__m128i sum_f16 = _mm256_cvtps_ph(sum_vec, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
_mm_storeu_si128((__m128i *)(result + i), sum_f16);
}
// The tail:
for (; i < n; ++i) {
simsimd_f32_t ai = SIMSIMD_F16_TO_F32(a + i);
simsimd_f32_t bi = SIMSIMD_F16_TO_F32(b + i);
simsimd_f32_t ci = SIMSIMD_F16_TO_F32(c + i);
simsimd_f32_t sum = alpha_f32 * ai * bi + beta_f32 * ci;
SIMSIMD_F32_TO_F16(sum, result + i);
}
}
SIMSIMD_PUBLIC void simsimd_fma_bf16_haswell( //
simsimd_bf16_t const *a, simsimd_bf16_t const *b, simsimd_bf16_t const *c, //
simsimd_size_t n, simsimd_distance_t alpha, simsimd_distance_t beta, simsimd_bf16_t *result) {
simsimd_f32_t alpha_f32 = (simsimd_f32_t)alpha;
simsimd_f32_t beta_f32 = (simsimd_f32_t)beta;
__m256 alpha_vec = _mm256_set1_ps(alpha_f32);
__m256 beta_vec = _mm256_set1_ps(beta_f32);
// The main loop:
simsimd_size_t i = 0;
for (; i + 8 <= n; i += 8) {
__m128i a_bf16 = _mm_lddqu_si128((__m128i const *)(a + i));
__m128i b_bf16 = _mm_lddqu_si128((__m128i const *)(b + i));
__m128i c_bf16 = _mm_lddqu_si128((__m128i const *)(c + i));
__m256 a_vec = _simsimd_bf16x8_to_f32x8_haswell(a_bf16);
__m256 b_vec = _simsimd_bf16x8_to_f32x8_haswell(b_bf16);
__m256 c_vec = _simsimd_bf16x8_to_f32x8_haswell(c_bf16);
__m256 ab_vec = _mm256_mul_ps(a_vec, b_vec);
__m256 ab_scaled_vec = _mm256_mul_ps(ab_vec, alpha_vec);
__m256 c_scaled_vec = _mm256_mul_ps(c_vec, beta_vec);
__m256 sum_vec = _mm256_add_ps(ab_scaled_vec, c_scaled_vec);
__m128i sum_bf16 = _simsimd_f32x8_to_bf16x8_haswell(sum_vec);
_mm_storeu_si128((__m128i *)(result + i), sum_bf16);
}
// The tail:
for (; i < n; ++i) {
simsimd_f32_t ai = SIMSIMD_BF16_TO_F32(a + i);
simsimd_f32_t bi = SIMSIMD_BF16_TO_F32(b + i);
simsimd_f32_t ci = SIMSIMD_BF16_TO_F32(c + i);
simsimd_f32_t sum = alpha_f32 * ai * bi + beta_f32 * ci;
SIMSIMD_F32_TO_BF16(sum, result + i);
}
}
SIMSIMD_PUBLIC void simsimd_sum_i8_haswell(simsimd_i8_t const *a, simsimd_i8_t const *b, simsimd_size_t n,
simsimd_i8_t *result) {
// The main loop:
simsimd_size_t i = 0;
for (; i + 32 <= n; i += 32) {
__m256i a_vec = _mm256_lddqu_si256((__m256i *)(a + i));
__m256i b_vec = _mm256_lddqu_si256((__m256i *)(b + i));
__m256i sum_vec = _mm256_adds_epi8(a_vec, b_vec);
_mm256_storeu_si256((__m256i *)(result + i), sum_vec);
}
// The tail:
for (; i < n; ++i) {
simsimd_f32_t ai = a[i], bi = b[i];
simsimd_f32_t sum = ai + bi;
SIMSIMD_F32_TO_I8(sum, result + i);
}
}
SIMSIMD_PUBLIC void simsimd_scale_i8_haswell(simsimd_i8_t const *a, simsimd_size_t n, simsimd_distance_t alpha,
simsimd_i8_t *result) {
simsimd_f32_t alpha_f32 = (simsimd_f32_t)alpha;
__m256 alpha_vec = _mm256_set1_ps(alpha_f32);
int sum_i32s[8], a_i32s[8];
// The main loop:
simsimd_size_t i = 0;
for (; i + 8 <= n; i += 8) {
//? Handling loads and stores with SIMD is tricky. Not because of upcasting, but the
//? downcasting at the end of the loop. In AVX2 it's a drag! Keep it for another day.
a_i32s[0] = a[i + 0], a_i32s[1] = a[i + 1], a_i32s[2] = a[i + 2], a_i32s[3] = a[i + 3], //
a_i32s[4] = a[i + 4], a_i32s[5] = a[i + 5], a_i32s[6] = a[i + 6], a_i32s[7] = a[i + 7];
//! This can be done at least 50% faster if we convert 8-bit integers to floats instead
//! of relying on the slow `_mm256_cvtepi32_ps` instruction.
__m256 a_vec = _mm256_cvtepi32_ps(_mm256_lddqu_si256((__m256i *)a_i32s));
// The normal part.
__m256 sum_vec = _mm256_mul_ps(a_vec, alpha_vec);
// Instead of serial calls to expensive `SIMSIMD_F32_TO_I8`, convert and clip with SIMD.
__m256i sum_i32_vec = _mm256_cvtps_epi32(sum_vec);
sum_i32_vec = _mm256_max_epi32(sum_i32_vec, _mm256_set1_epi32(-128));
sum_i32_vec = _mm256_min_epi32(sum_i32_vec, _mm256_set1_epi32(127));
// Export into a serial buffer.
_mm256_storeu_si256((__m256i *)sum_i32s, sum_i32_vec);
result[i + 0] = (simsimd_i8_t)sum_i32s[0];
result[i + 1] = (simsimd_i8_t)sum_i32s[1];
result[i + 2] = (simsimd_i8_t)sum_i32s[2];
result[i + 3] = (simsimd_i8_t)sum_i32s[3];
result[i + 4] = (simsimd_i8_t)sum_i32s[4];
result[i + 5] = (simsimd_i8_t)sum_i32s[5];
result[i + 6] = (simsimd_i8_t)sum_i32s[6];
result[i + 7] = (simsimd_i8_t)sum_i32s[7];
}
// The tail:
for (; i < n; ++i) {
simsimd_f32_t ai = a[i];
simsimd_f32_t sum = alpha_f32 * ai;
SIMSIMD_F32_TO_I8(sum, result + i);
}
}
SIMSIMD_PUBLIC void simsimd_wsum_i8_haswell( //
simsimd_i8_t const *a, simsimd_i8_t const *b, simsimd_size_t n, //
simsimd_distance_t alpha, simsimd_distance_t beta, simsimd_i8_t *result) {
// There are several special cases we may want to implement:
// 1. Simple addition, when both weights are equal to 1.0.
if (alpha == 1 && beta == 1) {
// In this case we can avoid expensive multiplications.
simsimd_sum_i8_haswell(a, b, n, result);
return;
}
// 2. Just scaling, when one of the weights is equal to zero.
else if (alpha == 0 || beta == 0) {
// In this case we can avoid half of the load instructions.
if (beta == 0) { simsimd_scale_i8_haswell(a, n, alpha, result); }
else { simsimd_scale_i8_haswell(b, n, beta, result); }
return;
}
// The general case.
simsimd_f32_t alpha_f32 = (simsimd_f32_t)alpha;
simsimd_f32_t beta_f32 = (simsimd_f32_t)beta;
__m256 alpha_vec = _mm256_set1_ps(alpha_f32);
__m256 beta_vec = _mm256_set1_ps(beta_f32);
int sum_i32s[8], a_i32s[8], b_i32s[8];
// The main loop:
simsimd_size_t i = 0;
for (; i + 8 <= n; i += 8) {
//? Handling loads and stores with SIMD is tricky. Not because of upcasting, but the
//? downcasting at the end of the loop. In AVX2 it's a drag! Keep it for another day.
a_i32s[0] = a[i + 0], a_i32s[1] = a[i + 1], a_i32s[2] = a[i + 2], a_i32s[3] = a[i + 3], //
a_i32s[4] = a[i + 4], a_i32s[5] = a[i + 5], a_i32s[6] = a[i + 6], a_i32s[7] = a[i + 7];
b_i32s[0] = b[i + 0], b_i32s[1] = b[i + 1], b_i32s[2] = b[i + 2], b_i32s[3] = b[i + 3], //
b_i32s[4] = b[i + 4], b_i32s[5] = b[i + 5], b_i32s[6] = b[i + 6], b_i32s[7] = b[i + 7];
//! This can be done at least 50% faster if we convert 8-bit integers to floats instead
//! of relying on the slow `_mm256_cvtepi32_ps` instruction.
__m256 a_vec = _mm256_cvtepi32_ps(_mm256_lddqu_si256((__m256i *)a_i32s));
__m256 b_vec = _mm256_cvtepi32_ps(_mm256_lddqu_si256((__m256i *)b_i32s));
// The normal part.
__m256 a_scaled_vec = _mm256_mul_ps(a_vec, alpha_vec);
__m256 b_scaled_vec = _mm256_mul_ps(b_vec, beta_vec);
__m256 sum_vec = _mm256_add_ps(a_scaled_vec, b_scaled_vec);
// Instead of serial calls to expensive `SIMSIMD_F32_TO_I8`, convert and clip with SIMD.
__m256i sum_i32_vec = _mm256_cvtps_epi32(sum_vec);
sum_i32_vec = _mm256_max_epi32(sum_i32_vec, _mm256_set1_epi32(-128));
sum_i32_vec = _mm256_min_epi32(sum_i32_vec, _mm256_set1_epi32(127));
// Export into a serial buffer.
_mm256_storeu_si256((__m256i *)sum_i32s, sum_i32_vec);
result[i + 0] = (simsimd_i8_t)sum_i32s[0];
result[i + 1] = (simsimd_i8_t)sum_i32s[1];
result[i + 2] = (simsimd_i8_t)sum_i32s[2];
result[i + 3] = (simsimd_i8_t)sum_i32s[3];
result[i + 4] = (simsimd_i8_t)sum_i32s[4];
result[i + 5] = (simsimd_i8_t)sum_i32s[5];
result[i + 6] = (simsimd_i8_t)sum_i32s[6];
result[i + 7] = (simsimd_i8_t)sum_i32s[7];
}
// The tail:
for (; i < n; ++i) {
simsimd_f32_t ai = a[i], bi = b[i];
simsimd_f32_t sum = alpha_f32 * ai + beta_f32 * bi;
SIMSIMD_F32_TO_I8(sum, result + i);
}
}
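/* The `_mm256_max_epi32` / `_mm256_min_epi32` pair above is the vectorized counterpart of the
 * scalar clamp that `SIMSIMD_F32_TO_I8` is expected to perform in the tail. A scalar sketch of
 * that saturation (illustrative; the library macro may round rather than truncate):
 *
 *   static simsimd_i8_t f32_to_i8_saturated(simsimd_f32_t x) {
 *       if (x < -128.f) x = -128.f;
 *       if (x > 127.f) x = 127.f;
 *       return (simsimd_i8_t)x;
 *   }
 */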
SIMSIMD_PUBLIC void simsimd_sum_u8_haswell(simsimd_u8_t const *a, simsimd_u8_t const *b, simsimd_size_t n,
simsimd_u8_t *result) {
// The main loop:
simsimd_size_t i = 0;
for (; i + 32 <= n; i += 32) {
__m256i a_vec = _mm256_lddqu_si256((__m256i *)(a + i));
__m256i b_vec = _mm256_lddqu_si256((__m256i *)(b + i));
__m256i sum_vec = _mm256_adds_epu8(a_vec, b_vec);
_mm256_storeu_si256((__m256i *)(result + i), sum_vec);
}
// The tail:
for (; i < n; ++i) {
simsimd_f32_t ai = a[i], bi = b[i];
simsimd_f32_t sum = ai + bi;
SIMSIMD_F32_TO_U8(sum, result + i);
}
}
SIMSIMD_PUBLIC void simsimd_scale_u8_haswell(simsimd_u8_t const *a, simsimd_size_t n, simsimd_distance_t alpha,
simsimd_u8_t *result) {
simsimd_f32_t alpha_f32 = (simsimd_f32_t)alpha;
__m256 alpha_vec = _mm256_set1_ps(alpha_f32);
int sum_i32s[8], a_i32s[8];
// The main loop:
simsimd_size_t i = 0;
for (; i + 8 <= n; i += 8) {
//? Handling loads and stores with SIMD is tricky. Not because of upcasting, but the
//? downcasting at the end of the loop. In AVX2 it's a drag! Keep it for another day.
a_i32s[0] = a[i + 0], a_i32s[1] = a[i + 1], a_i32s[2] = a[i + 2], a_i32s[3] = a[i + 3], //
a_i32s[4] = a[i + 4], a_i32s[5] = a[i + 5], a_i32s[6] = a[i + 6], a_i32s[7] = a[i + 7];
//! This can be done at least 50% faster if we convert 8-bit integers to floats instead
//! of relying on the slow `_mm256_cvtepi32_ps` instruction.
__m256 a_vec = _mm256_cvtepi32_ps(_mm256_lddqu_si256((__m256i *)a_i32s));
// The normal part.
__m256 sum_vec = _mm256_mul_ps(a_vec, alpha_vec);
// Instead of serial calls to expensive `SIMSIMD_F32_TO_U8`, convert and clip with SIMD.
__m256i sum_i32_vec = _mm256_cvtps_epi32(sum_vec);
sum_i32_vec = _mm256_max_epi32(sum_i32_vec, _mm256_set1_epi32(0));
sum_i32_vec = _mm256_min_epi32(sum_i32_vec, _mm256_set1_epi32(255));
// Export into a serial buffer.
_mm256_storeu_si256((__m256i *)sum_i32s, sum_i32_vec);
result[i + 0] = (simsimd_u8_t)sum_i32s[0];
result[i + 1] = (simsimd_u8_t)sum_i32s[1];
result[i + 2] = (simsimd_u8_t)sum_i32s[2];
result[i + 3] = (simsimd_u8_t)sum_i32s[3];
result[i + 4] = (simsimd_u8_t)sum_i32s[4];
result[i + 5] = (simsimd_u8_t)sum_i32s[5];
result[i + 6] = (simsimd_u8_t)sum_i32s[6];
result[i + 7] = (simsimd_u8_t)sum_i32s[7];
}
// The tail:
for (; i < n; ++i) {
simsimd_f32_t ai = a[i];
simsimd_f32_t sum = alpha_f32 * ai;
SIMSIMD_F32_TO_U8(sum, result + i);
}
}
SIMSIMD_PUBLIC void simsimd_wsum_u8_haswell( //
simsimd_u8_t const *a, simsimd_u8_t const *b, simsimd_size_t n, //
simsimd_distance_t alpha, simsimd_distance_t beta, simsimd_u8_t *result) {
// There are several special cases we may want to implement:
// 1. Simple addition, when both weights are equal to 1.0.
if (alpha == 1 && beta == 1) {
// In this case we can avoid expensive multiplications.
simsimd_sum_u8_haswell(a, b, n, result);
return;
}
// 2. Just scaling, when one of the weights is equal to zero.
else if (alpha == 0 || beta == 0) {
// In this case we can avoid half of the load instructions.
if (beta == 0) { simsimd_scale_u8_haswell(a, n, alpha, result); }
else { simsimd_scale_u8_haswell(b, n, beta, result); }
return;
}
// The general case.
simsimd_f32_t alpha_f32 = (simsimd_f32_t)alpha;
simsimd_f32_t beta_f32 = (simsimd_f32_t)beta;
__m256 alpha_vec = _mm256_set1_ps(alpha_f32);
__m256 beta_vec = _mm256_set1_ps(beta_f32);
int sum_i32s[8], a_i32s[8], b_i32s[8];
// The main loop:
simsimd_size_t i = 0;
for (; i + 8 <= n; i += 8) {
//? Handling loads and stores with SIMD is tricky. Not because of upcasting, but the
//? downcasting at the end of the loop. In AVX2 it's a drag! Keep it for another day.
a_i32s[0] = a[i + 0], a_i32s[1] = a[i + 1], a_i32s[2] = a[i + 2], a_i32s[3] = a[i + 3], //
a_i32s[4] = a[i + 4], a_i32s[5] = a[i + 5], a_i32s[6] = a[i + 6], a_i32s[7] = a[i + 7];
b_i32s[0] = b[i + 0], b_i32s[1] = b[i + 1], b_i32s[2] = b[i + 2], b_i32s[3] = b[i + 3], //
b_i32s[4] = b[i + 4], b_i32s[5] = b[i + 5], b_i32s[6] = b[i + 6], b_i32s[7] = b[i + 7];
//! This can be done at least 50% faster if we convert 8-bit integers to floats instead
//! of relying on the slow `_mm256_cvtepi32_ps` instruction.
__m256 a_vec = _mm256_cvtepi32_ps(_mm256_lddqu_si256((__m256i *)a_i32s));
__m256 b_vec = _mm256_cvtepi32_ps(_mm256_lddqu_si256((__m256i *)b_i32s));
// The normal part.
__m256 a_scaled_vec = _mm256_mul_ps(a_vec, alpha_vec);
__m256 b_scaled_vec = _mm256_mul_ps(b_vec, beta_vec);
__m256 sum_vec = _mm256_add_ps(a_scaled_vec, b_scaled_vec);
// Instead of serial calls to expensive `SIMSIMD_F32_TO_U8`, convert and clip with SIMD.
__m256i sum_i32_vec = _mm256_cvtps_epi32(sum_vec);
sum_i32_vec = _mm256_max_epi32(sum_i32_vec, _mm256_set1_epi32(0));
sum_i32_vec = _mm256_min_epi32(sum_i32_vec, _mm256_set1_epi32(255));
// Export into a serial buffer.
_mm256_storeu_si256((__m256i *)sum_i32s, sum_i32_vec);
result[i + 0] = (simsimd_u8_t)sum_i32s[0];
result[i + 1] = (simsimd_u8_t)sum_i32s[1];
result[i + 2] = (simsimd_u8_t)sum_i32s[2];
result[i + 3] = (simsimd_u8_t)sum_i32s[3];
result[i + 4] = (simsimd_u8_t)sum_i32s[4];
result[i + 5] = (simsimd_u8_t)sum_i32s[5];
result[i + 6] = (simsimd_u8_t)sum_i32s[6];
result[i + 7] = (simsimd_u8_t)sum_i32s[7];
}
// The tail:
for (; i < n; ++i) {
simsimd_f32_t ai = a[i], bi = b[i];
simsimd_f32_t sum = alpha_f32 * ai + beta_f32 * bi;
SIMSIMD_F32_TO_U8(sum, result + i);
}
}
SIMSIMD_PUBLIC void simsimd_fma_i8_haswell( //
simsimd_i8_t const *a, simsimd_i8_t const *b, simsimd_i8_t const *c, simsimd_size_t n, //
simsimd_distance_t alpha, simsimd_distance_t beta, simsimd_i8_t *result) {
simsimd_f32_t alpha_f32 = (simsimd_f32_t)alpha;
simsimd_f32_t beta_f32 = (simsimd_f32_t)beta;
__m256 alpha_vec = _mm256_set1_ps(alpha_f32);
__m256 beta_vec = _mm256_set1_ps(beta_f32);
int sum_i32s[8], a_i32s[8], b_i32s[8], c_i32s[8];
// The main loop:
simsimd_size_t i = 0;
for (; i + 8 <= n; i += 8) {
//? Handling loads and stores with SIMD is tricky. Not because of upcasting, but the
//? downcasting at the end of the loop. In AVX2 it's a drag! Keep it for another day.
a_i32s[0] = a[i + 0], a_i32s[1] = a[i + 1], a_i32s[2] = a[i + 2], a_i32s[3] = a[i + 3], //
a_i32s[4] = a[i + 4], a_i32s[5] = a[i + 5], a_i32s[6] = a[i + 6], a_i32s[7] = a[i + 7];
b_i32s[0] = b[i + 0], b_i32s[1] = b[i + 1], b_i32s[2] = b[i + 2], b_i32s[3] = b[i + 3], //
b_i32s[4] = b[i + 4], b_i32s[5] = b[i + 5], b_i32s[6] = b[i + 6], b_i32s[7] = b[i + 7];
c_i32s[0] = c[i + 0], c_i32s[1] = c[i + 1], c_i32s[2] = c[i + 2], c_i32s[3] = c[i + 3], //
c_i32s[4] = c[i + 4], c_i32s[5] = c[i + 5], c_i32s[6] = c[i + 6], c_i32s[7] = c[i + 7];
//! This can be done at least 50% faster if we convert 8-bit integers to floats instead
//! of relying on the slow `_mm256_cvtepi32_ps` instruction.
__m256 a_vec = _mm256_cvtepi32_ps(_mm256_lddqu_si256((__m256i *)a_i32s));
__m256 b_vec = _mm256_cvtepi32_ps(_mm256_lddqu_si256((__m256i *)b_i32s));
__m256 c_vec = _mm256_cvtepi32_ps(_mm256_lddqu_si256((__m256i *)c_i32s));
// The normal part.
__m256 ab_vec = _mm256_mul_ps(a_vec, b_vec);
__m256 ab_scaled_vec = _mm256_mul_ps(ab_vec, alpha_vec);
__m256 c_scaled_vec = _mm256_mul_ps(c_vec, beta_vec);
__m256 sum_vec = _mm256_add_ps(ab_scaled_vec, c_scaled_vec);
// Instead of serial calls to expensive `SIMSIMD_F32_TO_I8`, convert and clip with SIMD.
__m256i sum_i32_vec = _mm256_cvtps_epi32(sum_vec);
sum_i32_vec = _mm256_max_epi32(sum_i32_vec, _mm256_set1_epi32(-128));
sum_i32_vec = _mm256_min_epi32(sum_i32_vec, _mm256_set1_epi32(127));
// Export into a serial buffer.
_mm256_storeu_si256((__m256i *)sum_i32s, sum_i32_vec);
result[i + 0] = (simsimd_i8_t)sum_i32s[0];
result[i + 1] = (simsimd_i8_t)sum_i32s[1];
result[i + 2] = (simsimd_i8_t)sum_i32s[2];
result[i + 3] = (simsimd_i8_t)sum_i32s[3];
result[i + 4] = (simsimd_i8_t)sum_i32s[4];
result[i + 5] = (simsimd_i8_t)sum_i32s[5];
result[i + 6] = (simsimd_i8_t)sum_i32s[6];
result[i + 7] = (simsimd_i8_t)sum_i32s[7];
}
// The tail:
for (; i < n; ++i) {
simsimd_f32_t ai = a[i], bi = b[i], ci = c[i];
simsimd_f32_t sum = alpha_f32 * ai * bi + beta_f32 * ci;
SIMSIMD_F32_TO_I8(sum, result + i);
}
}
SIMSIMD_PUBLIC void simsimd_fma_u8_haswell( //
simsimd_u8_t const *a, simsimd_u8_t const *b, simsimd_u8_t const *c, simsimd_size_t n, //
simsimd_distance_t alpha, simsimd_distance_t beta, simsimd_u8_t *result) {
simsimd_f32_t alpha_f32 = (simsimd_f32_t)alpha;
simsimd_f32_t beta_f32 = (simsimd_f32_t)beta;
__m256 alpha_vec = _mm256_set1_ps(alpha_f32);
__m256 beta_vec = _mm256_set1_ps(beta_f32);
int sum_i32s[8], a_i32s[8], b_i32s[8], c_i32s[8];
// The main loop:
simsimd_size_t i = 0;
for (; i + 8 <= n; i += 8) {
//? Handling loads and stores with SIMD is tricky. Not because of upcasting, but the
//? downcasting at the end of the loop. In AVX2 it's a drag! Keep it for another day.
a_i32s[0] = a[i + 0], a_i32s[1] = a[i + 1], a_i32s[2] = a[i + 2], a_i32s[3] = a[i + 3], //
a_i32s[4] = a[i + 4], a_i32s[5] = a[i + 5], a_i32s[6] = a[i + 6], a_i32s[7] = a[i + 7];
b_i32s[0] = b[i + 0], b_i32s[1] = b[i + 1], b_i32s[2] = b[i + 2], b_i32s[3] = b[i + 3], //
b_i32s[4] = b[i + 4], b_i32s[5] = b[i + 5], b_i32s[6] = b[i + 6], b_i32s[7] = b[i + 7];
c_i32s[0] = c[i + 0], c_i32s[1] = c[i + 1], c_i32s[2] = c[i + 2], c_i32s[3] = c[i + 3], //
c_i32s[4] = c[i + 4], c_i32s[5] = c[i + 5], c_i32s[6] = c[i + 6], c_i32s[7] = c[i + 7];
//! This can be done at least 50% faster if we convert 8-bit integers to floats instead
//! of relying on the slow `_mm256_cvtepi32_ps` instruction.
__m256 a_vec = _mm256_cvtepi32_ps(_mm256_lddqu_si256((__m256i *)a_i32s));
__m256 b_vec = _mm256_cvtepi32_ps(_mm256_lddqu_si256((__m256i *)b_i32s));
__m256 c_vec = _mm256_cvtepi32_ps(_mm256_lddqu_si256((__m256i *)c_i32s));
// The normal part.
__m256 ab_vec = _mm256_mul_ps(a_vec, b_vec);
__m256 ab_scaled_vec = _mm256_mul_ps(ab_vec, alpha_vec);
__m256 c_scaled_vec = _mm256_mul_ps(c_vec, beta_vec);
__m256 sum_vec = _mm256_add_ps(ab_scaled_vec, c_scaled_vec);
// Instead of serial calls to expensive `SIMSIMD_F32_TO_U8`, convert and clip with SIMD.
__m256i sum_i32_vec = _mm256_cvtps_epi32(sum_vec);
sum_i32_vec = _mm256_max_epi32(sum_i32_vec, _mm256_set1_epi32(0));
sum_i32_vec = _mm256_min_epi32(sum_i32_vec, _mm256_set1_epi32(255));
// Export into a serial buffer.
_mm256_storeu_si256((__m256i *)sum_i32s, sum_i32_vec);
result[i + 0] = (simsimd_u8_t)sum_i32s[0];
result[i + 1] = (simsimd_u8_t)sum_i32s[1];
result[i + 2] = (simsimd_u8_t)sum_i32s[2];
result[i + 3] = (simsimd_u8_t)sum_i32s[3];
result[i + 4] = (simsimd_u8_t)sum_i32s[4];
result[i + 5] = (simsimd_u8_t)sum_i32s[5];
result[i + 6] = (simsimd_u8_t)sum_i32s[6];
result[i + 7] = (simsimd_u8_t)sum_i32s[7];
}
// The tail:
for (; i < n; ++i) {
simsimd_f32_t ai = a[i], bi = b[i], ci = c[i];
simsimd_f32_t sum = alpha_f32 * ai * bi + beta_f32 * ci;
SIMSIMD_F32_TO_U8(sum, result + i);
}
}
#pragma clang attribute pop
#pragma GCC pop_options
#endif // SIMSIMD_TARGET_HASWELL
#if SIMSIMD_TARGET_SKYLAKE
#pragma GCC push_options
#pragma GCC target("avx2", "avx512f", "avx512vl", "avx512bw", "bmi2")
#pragma clang attribute push(__attribute__((target("avx2,avx512f,avx512vl,avx512bw,bmi2"))), apply_to = function)
SIMSIMD_PUBLIC void simsimd_sum_f64_skylake(simsimd_f64_t const *a, simsimd_f64_t const *b, simsimd_size_t n,
simsimd_f64_t *result) {
__m512d a_vec, b_vec, sum_vec;
__mmask8 mask = 0xFF;
simsimd_sum_f64_skylake_cycle:
if (n < 8) {
mask = (__mmask8)_bzhi_u32(0xFFFFFFFF, n);
a_vec = _mm512_maskz_loadu_pd(mask, a);
b_vec = _mm512_maskz_loadu_pd(mask, b);
n = 0;
}
else {
a_vec = _mm512_loadu_pd(a);
b_vec = _mm512_loadu_pd(b);
a += 8, b += 8, n -= 8;
}
sum_vec = _mm512_add_pd(a_vec, b_vec);
_mm512_mask_storeu_pd(result, mask, sum_vec);
result += 8;
if (n) goto simsimd_sum_f64_skylake_cycle;
}
SIMSIMD_PUBLIC void simsimd_scale_f64_skylake(simsimd_f64_t const *a, simsimd_size_t n, simsimd_distance_t alpha,
simsimd_f64_t *result) {
__m512d alpha_vec = _mm512_set1_pd(alpha);
__m512d a_vec, sum_vec;
__mmask8 mask = 0xFF;
simsimd_scale_f64_skylake_cycle:
if (n < 8) {
mask = (__mmask8)_bzhi_u32(0xFFFFFFFF, n);
a_vec = _mm512_maskz_loadu_pd(mask, a);
n = 0;
}
else {
a_vec = _mm512_loadu_pd(a);
a += 8, n -= 8;
}
sum_vec = _mm512_mul_pd(a_vec, alpha_vec);
_mm512_mask_storeu_pd(result, mask, sum_vec);
result += 8;
if (n) goto simsimd_scale_f64_skylake_cycle;
}
SIMSIMD_PUBLIC void simsimd_wsum_f64_skylake( //
simsimd_f64_t const *a, simsimd_f64_t const *b, simsimd_size_t n, //
simsimd_distance_t alpha, simsimd_distance_t beta, simsimd_f64_t *result) {
// There are several special cases we may want to implement:
// 1. Simple addition, when both weights are equal to 1.0.
if (alpha == 1 && beta == 1) {
// In this case we can avoid expensive multiplications.
simsimd_sum_f64_skylake(a, b, n, result);
return;
}
// 2. Just scaling, when one of the weights is equal to zero.
else if (alpha == 0 || beta == 0) {
// In this case we can avoid half of the load instructions.
if (beta == 0) { simsimd_scale_f64_skylake(a, n, alpha, result); }
else { simsimd_scale_f64_skylake(b, n, beta, result); }
return;
}
// The general case.
__m512d alpha_vec = _mm512_set1_pd(alpha);
__m512d beta_vec = _mm512_set1_pd(beta);
__m512d a_vec, b_vec, a_scaled_vec, sum_vec;
__mmask8 mask = 0xFF;
simsimd_wsum_f64_skylake_cycle:
if (n < 8) {
mask = (__mmask8)_bzhi_u32(0xFFFFFFFF, n);
a_vec = _mm512_maskz_loadu_pd(mask, a);
b_vec = _mm512_maskz_loadu_pd(mask, b);
n = 0;
}
else {
a_vec = _mm512_loadu_pd(a);
b_vec = _mm512_loadu_pd(b);
a += 8, b += 8, n -= 8;
}
a_scaled_vec = _mm512_mul_pd(a_vec, alpha_vec);
sum_vec = _mm512_fmadd_pd(b_vec, beta_vec, a_scaled_vec);
_mm512_mask_storeu_pd(result, mask, sum_vec);
result += 8;
if (n) goto simsimd_wsum_f64_skylake_cycle;
}
SIMSIMD_PUBLIC void simsimd_sum_f32_skylake(simsimd_f32_t const *a, simsimd_f32_t const *b, simsimd_size_t n,
simsimd_f32_t *result) {
__m512 a_vec, b_vec, sum_vec;
__mmask16 mask = 0xFFFF;
simsimd_sum_f32_skylake_cycle:
if (n < 16) {
mask = (__mmask16)_bzhi_u32(0xFFFFFFFF, n);
a_vec = _mm512_maskz_loadu_ps(mask, a);
b_vec = _mm512_maskz_loadu_ps(mask, b);
n = 0;
}
else {
a_vec = _mm512_loadu_ps(a);
b_vec = _mm512_loadu_ps(b);
a += 16, b += 16, n -= 16;
}
sum_vec = _mm512_add_ps(a_vec, b_vec);
_mm512_mask_storeu_ps(result, mask, sum_vec);
result += 16;
if (n) goto simsimd_sum_f32_skylake_cycle;
}
SIMSIMD_PUBLIC void simsimd_scale_f32_skylake(simsimd_f32_t const *a, simsimd_size_t n, simsimd_distance_t alpha,
simsimd_f32_t *result) {
__m512 alpha_vec = _mm512_set1_ps(alpha);
__m512 a_vec, sum_vec;
__mmask16 mask = 0xFFFF;
simsimd_scale_f32_skylake_cycle:
if (n < 16) {
mask = (__mmask16)_bzhi_u32(0xFFFFFFFF, n);
a_vec = _mm512_maskz_loadu_ps(mask, a);
n = 0;
}
else {
a_vec = _mm512_loadu_ps(a);
a += 16, n -= 16;
}
sum_vec = _mm512_mul_ps(a_vec, alpha_vec);
_mm512_mask_storeu_ps(result, mask, sum_vec);
result += 16;
if (n) goto simsimd_scale_f32_skylake_cycle;
}
SIMSIMD_PUBLIC void simsimd_wsum_f32_skylake( //
simsimd_f32_t const *a, simsimd_f32_t const *b, simsimd_size_t n, //
simsimd_distance_t alpha, simsimd_distance_t beta, simsimd_f32_t *result) {
// There are several special cases we may want to implement:
// 1. Simple addition, when both weights are equal to 1.0.
if (alpha == 1 && beta == 1) {
// In this case we can avoid expensive multiplications.
simsimd_sum_f32_skylake(a, b, n, result);
return;
}
// 2. Just scaling, when one of the weights is equal to zero.
else if (alpha == 0 || beta == 0) {
// In this case we can avoid half of the load instructions.
if (beta == 0) { simsimd_scale_f32_skylake(a, n, alpha, result); }
else { simsimd_scale_f32_skylake(b, n, beta, result); }
return;
}
// The general case.
__m512 alpha_vec = _mm512_set1_ps(alpha);
__m512 beta_vec = _mm512_set1_ps(beta);
__m512 a_vec, b_vec, a_scaled_vec, sum_vec;
__mmask16 mask = 0xFFFF;
simsimd_wsum_f32_skylake_cycle:
if (n < 16) {
mask = (__mmask16)_bzhi_u32(0xFFFFFFFF, n);
a_vec = _mm512_maskz_loadu_ps(mask, a);
b_vec = _mm512_maskz_loadu_ps(mask, b);
n = 0;
}
else {
a_vec = _mm512_loadu_ps(a);
b_vec = _mm512_loadu_ps(b);
a += 16, b += 16, n -= 16;
}
a_scaled_vec = _mm512_mul_ps(a_vec, alpha_vec);
sum_vec = _mm512_fmadd_ps(b_vec, beta_vec, a_scaled_vec);
_mm512_mask_storeu_ps(result, mask, sum_vec);
result += 16;
if (n) goto simsimd_wsum_f32_skylake_cycle;
}
SIMSIMD_PUBLIC void simsimd_sum_bf16_skylake(simsimd_bf16_t const *a, simsimd_bf16_t const *b, simsimd_size_t n,
simsimd_bf16_t *result) {
__m256i a_bf16_vec, b_bf16_vec, sum_bf16_vec;
__m512 a_vec, b_vec, sum_vec;
__mmask16 mask = 0xFFFF;
simsimd_sum_bf16_skylake_cycle:
if (n < 16) {
mask = (__mmask16)_bzhi_u32(0xFFFFFFFF, n);
a_bf16_vec = _mm256_maskz_loadu_epi16(mask, a);
b_bf16_vec = _mm256_maskz_loadu_epi16(mask, b);
n = 0;
}
else {
a_bf16_vec = _mm256_loadu_epi16(a);
b_bf16_vec = _mm256_loadu_epi16(b);
a += 16, b += 16, n -= 16;
}
a_vec = _simsimd_bf16x16_to_f32x16_skylake(a_bf16_vec);
b_vec = _simsimd_bf16x16_to_f32x16_skylake(b_bf16_vec);
sum_vec = _mm512_add_ps(a_vec, b_vec);
sum_bf16_vec = _simsimd_f32x16_to_bf16x16_skylake(sum_vec);
_mm256_mask_storeu_epi16(result, mask, sum_bf16_vec);
result += 16;
if (n) goto simsimd_sum_bf16_skylake_cycle;
}
SIMSIMD_PUBLIC void simsimd_scale_bf16_skylake(simsimd_bf16_t const *a, simsimd_size_t n, simsimd_distance_t alpha,
simsimd_bf16_t *result) {
__m512 alpha_vec = _mm512_set1_ps(alpha);
__m256i a_bf16_vec, sum_bf16_vec;
__m512 a_vec, sum_vec;
__mmask16 mask = 0xFFFF;
simsimd_scale_bf16_skylake_cycle:
if (n < 16) {
mask = (__mmask16)_bzhi_u32(0xFFFFFFFF, n);
a_bf16_vec = _mm256_maskz_loadu_epi16(mask, a);
n = 0;
}
else {
a_bf16_vec = _mm256_loadu_epi16(a);
a += 16, n -= 16;
}
a_vec = _simsimd_bf16x16_to_f32x16_skylake(a_bf16_vec);
sum_vec = _mm512_mul_ps(a_vec, alpha_vec);
sum_bf16_vec = _simsimd_f32x16_to_bf16x16_skylake(sum_vec);
_mm256_mask_storeu_epi16(result, mask, sum_bf16_vec);
result += 16;
if (n) goto simsimd_scale_bf16_skylake_cycle;
}
SIMSIMD_PUBLIC void simsimd_wsum_bf16_skylake( //
simsimd_bf16_t const *a, simsimd_bf16_t const *b, simsimd_size_t n, //
simsimd_distance_t alpha, simsimd_distance_t beta, simsimd_bf16_t *result) {
// There are several special cases we may want to implement:
// 1. Simple addition, when both weights are equal to 1.0.
if (alpha == 1 && beta == 1) {
// In this case we can avoid expensive multiplications.
simsimd_sum_bf16_skylake(a, b, n, result);
return;
}
// 2. Just scaling, when one of the weights is equal to zero.
else if (alpha == 0 || beta == 0) {
// In this case we can avoid half of the load instructions.
if (beta == 0) { simsimd_scale_bf16_skylake(a, n, alpha, result); }
else { simsimd_scale_bf16_skylake(b, n, beta, result); }
return;
}
// The general case.
__m512 alpha_vec = _mm512_set1_ps(alpha);
__m512 beta_vec = _mm512_set1_ps(beta);
__m256i a_bf16_vec, b_bf16_vec, sum_bf16_vec;
__m512 a_vec, b_vec, a_scaled_vec, sum_vec;
__mmask16 mask = 0xFFFF;
simsimd_wsum_bf16_skylake_cycle:
if (n < 16) {
mask = (__mmask16)_bzhi_u32(0xFFFFFFFF, n);
a_bf16_vec = _mm256_maskz_loadu_epi16(mask, a);
b_bf16_vec = _mm256_maskz_loadu_epi16(mask, b);
n = 0;
}
else {
a_bf16_vec = _mm256_loadu_epi16(a);
b_bf16_vec = _mm256_loadu_epi16(b);
a += 16, b += 16, n -= 16;
}
a_vec = _simsimd_bf16x16_to_f32x16_skylake(a_bf16_vec);
b_vec = _simsimd_bf16x16_to_f32x16_skylake(b_bf16_vec);
a_scaled_vec = _mm512_mul_ps(a_vec, alpha_vec);
sum_vec = _mm512_fmadd_ps(b_vec, beta_vec, a_scaled_vec);
sum_bf16_vec = _simsimd_f32x16_to_bf16x16_skylake(sum_vec);
_mm256_mask_storeu_epi16(result, mask, sum_bf16_vec);
result += 16;
if (n) goto simsimd_wsum_bf16_skylake_cycle;
}
SIMSIMD_PUBLIC void simsimd_fma_f64_skylake( //
simsimd_f64_t const *a, simsimd_f64_t const *b, simsimd_f64_t const *c, simsimd_size_t n, //
simsimd_distance_t alpha, simsimd_distance_t beta, simsimd_f64_t *result) {
__m512d alpha_vec = _mm512_set1_pd(alpha);
__m512d beta_vec = _mm512_set1_pd(beta);
__m512d a_vec, b_vec, c_vec, ab_vec, ab_scaled_vec, sum_vec;
__mmask8 mask = 0xFF;
simsimd_fma_f64_skylake_cycle:
if (n < 8) {
mask = (__mmask8)_bzhi_u32(0xFFFFFFFF, n);
a_vec = _mm512_maskz_loadu_pd(mask, a);
b_vec = _mm512_maskz_loadu_pd(mask, b);
c_vec = _mm512_maskz_loadu_pd(mask, c);
n = 0;
}
else {
a_vec = _mm512_loadu_pd(a);
b_vec = _mm512_loadu_pd(b);
c_vec = _mm512_loadu_pd(c);
a += 8, b += 8, c += 8, n -= 8;
}
ab_vec = _mm512_mul_pd(a_vec, b_vec);
ab_scaled_vec = _mm512_mul_pd(ab_vec, alpha_vec);
sum_vec = _mm512_fmadd_pd(c_vec, beta_vec, ab_scaled_vec);
_mm512_mask_storeu_pd(result, mask, sum_vec);
result += 8;
if (n) goto simsimd_fma_f64_skylake_cycle;
}
SIMSIMD_PUBLIC void simsimd_fma_f32_skylake( //
simsimd_f32_t const *a, simsimd_f32_t const *b, simsimd_f32_t const *c, simsimd_size_t n, //
simsimd_distance_t alpha, simsimd_distance_t beta, simsimd_f32_t *result) {
__m512 alpha_vec = _mm512_set1_ps(alpha);
__m512 beta_vec = _mm512_set1_ps(beta);
__m512 a_vec, b_vec, c_vec, ab_vec, ab_scaled_vec, sum_vec;
__mmask16 mask = 0xFFFF;
simsimd_fma_f32_skylake_cycle:
if (n < 16) {
mask = (__mmask16)_bzhi_u32(0xFFFFFFFF, n);
a_vec = _mm512_maskz_loadu_ps(mask, a);
b_vec = _mm512_maskz_loadu_ps(mask, b);
c_vec = _mm512_maskz_loadu_ps(mask, c);
n = 0;
}
else {
a_vec = _mm512_loadu_ps(a);
b_vec = _mm512_loadu_ps(b);
c_vec = _mm512_loadu_ps(c);
a += 16, b += 16, c += 16, n -= 16;
}
ab_vec = _mm512_mul_ps(a_vec, b_vec);
ab_scaled_vec = _mm512_mul_ps(ab_vec, alpha_vec);
sum_vec = _mm512_fmadd_ps(c_vec, beta_vec, ab_scaled_vec);
_mm512_mask_storeu_ps(result, mask, sum_vec);
result += 16;
if (n) goto simsimd_fma_f32_skylake_cycle;
}
SIMSIMD_PUBLIC void simsimd_fma_bf16_skylake( //
simsimd_bf16_t const *a, simsimd_bf16_t const *b, simsimd_bf16_t const *c, simsimd_size_t n, //
simsimd_distance_t alpha, simsimd_distance_t beta, simsimd_bf16_t *result) {
__m512 alpha_vec = _mm512_set1_ps(alpha);
__m512 beta_vec = _mm512_set1_ps(beta);
__m256i a_bf16_vec, b_bf16_vec, c_bf16_vec, sum_bf16_vec;
__m512 a_vec, b_vec, c_vec, ab_vec, ab_scaled_vec, sum_vec;
__mmask16 mask = 0xFFFF;
simsimd_fma_bf16_skylake_cycle:
if (n < 16) {
mask = (__mmask16)_bzhi_u32(0xFFFFFFFF, n);
a_bf16_vec = _mm256_maskz_loadu_epi16(mask, a);
b_bf16_vec = _mm256_maskz_loadu_epi16(mask, b);
c_bf16_vec = _mm256_maskz_loadu_epi16(mask, c);
n = 0;
}
else {
a_bf16_vec = _mm256_loadu_epi16(a);
b_bf16_vec = _mm256_loadu_epi16(b);
c_bf16_vec = _mm256_loadu_epi16(c);
a += 16, b += 16, c += 16, n -= 16;
}
a_vec = _simsimd_bf16x16_to_f32x16_skylake(a_bf16_vec);
b_vec = _simsimd_bf16x16_to_f32x16_skylake(b_bf16_vec);
c_vec = _simsimd_bf16x16_to_f32x16_skylake(c_bf16_vec);
ab_vec = _mm512_mul_ps(a_vec, b_vec);
ab_scaled_vec = _mm512_mul_ps(ab_vec, alpha_vec);
sum_vec = _mm512_fmadd_ps(c_vec, beta_vec, ab_scaled_vec);
sum_bf16_vec = _simsimd_f32x16_to_bf16x16_skylake(sum_vec);
_mm256_mask_storeu_epi16(result, mask, sum_bf16_vec);
result += 16;
if (n) goto simsimd_fma_bf16_skylake_cycle;
}
#pragma clang attribute pop
#pragma GCC pop_options
#endif // SIMSIMD_TARGET_SKYLAKE
#if SIMSIMD_TARGET_SAPPHIRE
#pragma GCC push_options
#pragma GCC target("avx2", "avx512f", "avx512vl", "bmi2", "avx512bw", "avx512fp16", "f16c")
#pragma clang attribute push(__attribute__((target("avx2,avx512f,avx512vl,bmi2,avx512bw,avx512fp16,f16c"))), \
apply_to = function)
/**
* Using `_mm512_set1_ph((_Float16)1.f)` results in compilation warnings if we are pedantic.
* https://www.intel.com/content/www/us/en/docs/cpp-compiler/developer-guide-reference/2021-8/details-about-intrinsics-for-half-floats.html
*/
SIMSIMD_INTERNAL __m512h _mm512_set1_ph_from_ps(float a) {
unsigned short h = _cvtss_sh(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
return (__m512h)_mm512_set1_epi16(h);
}
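/* Usage sketch: the helper above lets the kernels below broadcast a half-precision scalar
 * without spelling out a `(_Float16)` literal, which is exactly what triggers the pedantic
 * warnings mentioned in the note above, e.g.:
 *
 *   __m512h ones_vec = _mm512_set1_ph_from_ps(1.0f);
 */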
SIMSIMD_PUBLIC void simsimd_sum_f16_sapphire(simsimd_f16_t const *a, simsimd_f16_t const *b, simsimd_size_t n,
simsimd_f16_t *result) {
__mmask32 mask = 0xFFFFFFFF;
__m512h a_f16_vec, b_f16_vec;
__m512h sum_f16_vec;
simsimd_sum_f16_sapphire_cycle:
if (n < 32) {
mask = (__mmask32)_bzhi_u32(0xFFFFFFFF, n);
a_f16_vec = _mm512_castsi512_ph(_mm512_maskz_loadu_epi16(mask, a));
b_f16_vec = _mm512_castsi512_ph(_mm512_maskz_loadu_epi16(mask, b));
n = 0;
}
else {
a_f16_vec = _mm512_loadu_ph(a);
b_f16_vec = _mm512_loadu_ph(b);
a += 32, b += 32, n -= 32;
}
sum_f16_vec = _mm512_add_ph(a_f16_vec, b_f16_vec);
_mm512_mask_storeu_epi16(result, mask, _mm512_castph_si512(sum_f16_vec));
result += 32;
if (n) goto simsimd_sum_f16_sapphire_cycle;
}
SIMSIMD_PUBLIC void simsimd_scale_f16_sapphire(simsimd_f16_t const *a, simsimd_size_t n, simsimd_distance_t alpha,
simsimd_f16_t *result) {
__mmask32 mask = 0xFFFFFFFF;
__m512h alpha_vec = _mm512_set1_ph_from_ps(alpha);
__m512h a_f16_vec;
__m512h sum_f16_vec;
simsimd_scale_f16_sapphire_cycle:
if (n < 32) {
mask = (__mmask32)_bzhi_u32(0xFFFFFFFF, n);
a_f16_vec = _mm512_castsi512_ph(_mm512_maskz_loadu_epi16(mask, a));
n = 0;
}
else {
a_f16_vec = _mm512_loadu_ph(a);
a += 32, n -= 32;
}
sum_f16_vec = _mm512_mul_ph(a_f16_vec, alpha_vec);
_mm512_mask_storeu_epi16(result, mask, _mm512_castph_si512(sum_f16_vec));
result += 32;
if (n) goto simsimd_scale_f16_sapphire_cycle;
}
SIMSIMD_PUBLIC void simsimd_wsum_f16_sapphire( //
simsimd_f16_t const *a, simsimd_f16_t const *b, simsimd_size_t n, //
simsimd_distance_t alpha, simsimd_distance_t beta, simsimd_f16_t *result) {
// There are several special cases we may want to implement:
// 1. Simple addition, when both weights are equal to 1.0.
if (alpha == 1 && beta == 1) {
// In this case we can avoid expensive multiplications.
simsimd_sum_f16_sapphire(a, b, n, result);
return;
}
// 2. Just scaling, when one of the weights is equal to zero.
else if (alpha == 0 || beta == 0) {
// In this case we can avoid half of the load instructions.
if (beta == 0) { simsimd_scale_f16_sapphire(a, n, alpha, result); }
else { simsimd_scale_f16_sapphire(b, n, beta, result); }
return;
}
// The general case.
__mmask32 mask = 0xFFFFFFFF;
__m512h alpha_vec = _mm512_set1_ph_from_ps(alpha);
__m512h beta_vec = _mm512_set1_ph_from_ps(beta);
__m512h a_f16_vec, b_f16_vec;
__m512h a_scaled_f16_vec, sum_f16_vec;
simsimd_wsum_f16_sapphire_cycle:
if (n < 32) {
mask = (__mmask32)_bzhi_u32(0xFFFFFFFF, n);
a_f16_vec = _mm512_castsi512_ph(_mm512_maskz_loadu_epi16(mask, a));
b_f16_vec = _mm512_castsi512_ph(_mm512_maskz_loadu_epi16(mask, b));
n = 0;
}
else {
a_f16_vec = _mm512_loadu_ph(a);
b_f16_vec = _mm512_loadu_ph(b);
a += 32, b += 32, n -= 32;
}
a_scaled_f16_vec = _mm512_mul_ph(a_f16_vec, alpha_vec);
sum_f16_vec = _mm512_fmadd_ph(b_f16_vec, beta_vec, a_scaled_f16_vec);
_mm512_mask_storeu_epi16(result, mask, _mm512_castph_si512(sum_f16_vec));
result += 32;
if (n) goto simsimd_wsum_f16_sapphire_cycle;
}
SIMSIMD_PUBLIC void simsimd_fma_f16_sapphire( //
simsimd_f16_t const *a, simsimd_f16_t const *b, simsimd_f16_t const *c, simsimd_size_t n, //
simsimd_distance_t alpha, simsimd_distance_t beta, simsimd_f16_t *result) {
__mmask32 mask = 0xFFFFFFFF;
__m512h alpha_vec = _mm512_set1_ph_from_ps(alpha);
__m512h beta_vec = _mm512_set1_ph_from_ps(beta);
__m512h a_f16_vec, b_f16_vec, c_f16_vec;
__m512h ab_f16_vec, ab_scaled_f16_vec, sum_f16_vec;
simsimd_fma_f16_sapphire_cycle:
if (n < 32) {
mask = (__mmask32)_bzhi_u32(0xFFFFFFFF, n);
a_f16_vec = _mm512_castsi512_ph(_mm512_maskz_loadu_epi16(mask, a));
b_f16_vec = _mm512_castsi512_ph(_mm512_maskz_loadu_epi16(mask, b));
c_f16_vec = _mm512_castsi512_ph(_mm512_maskz_loadu_epi16(mask, c));
n = 0;
}
else {
a_f16_vec = _mm512_loadu_ph(a);
b_f16_vec = _mm512_loadu_ph(b);
c_f16_vec = _mm512_loadu_ph(c);
a += 32, b += 32, c += 32, n -= 32;
}
ab_f16_vec = _mm512_mul_ph(a_f16_vec, b_f16_vec);
ab_scaled_f16_vec = _mm512_mul_ph(ab_f16_vec, alpha_vec);
sum_f16_vec = _mm512_fmadd_ph(c_f16_vec, beta_vec, ab_scaled_f16_vec);
_mm512_mask_storeu_epi16(result, mask, _mm512_castph_si512(sum_f16_vec));
result += 32;
if (n) goto simsimd_fma_f16_sapphire_cycle;
}
SIMSIMD_PUBLIC void simsimd_sum_u8_sapphire(simsimd_u8_t const *a, simsimd_u8_t const *b, simsimd_size_t n,
simsimd_u8_t *result) {
__mmask64 mask = 0xFFFFFFFFFFFFFFFFull;
__m512i a_u8_vec, b_u8_vec, sum_u8_vec;
simsimd_sum_u8_sapphire_cycle:
if (n < 64) {
mask = (__mmask64)_bzhi_u64(0xFFFFFFFFFFFFFFFFull, n);
a_u8_vec = _mm512_maskz_loadu_epi8(mask, a);
b_u8_vec = _mm512_maskz_loadu_epi8(mask, b);
n = 0;
}
else {
a_u8_vec = _mm512_loadu_epi8(a);
b_u8_vec = _mm512_loadu_epi8(b);
a += 64, b += 64, n -= 64;
}
sum_u8_vec = _mm512_adds_epu8(a_u8_vec, b_u8_vec);
_mm512_mask_storeu_epi8(result, mask, sum_u8_vec);
result += 64;
if (n) goto simsimd_sum_u8_sapphire_cycle;
}
SIMSIMD_PUBLIC void simsimd_scale_u8_sapphire(simsimd_u8_t const *a, simsimd_size_t n, simsimd_distance_t alpha,
simsimd_u8_t *result) {
__mmask64 mask = 0xFFFFFFFFFFFFFFFFull;
__m512h alpha_vec = _mm512_set1_ph_from_ps(alpha);
__m512i a_u8_vec, sum_u8_vec;
__m512h a_f16_low_vec, a_f16_high_vec;
__m512h sum_f16_low_vec, sum_f16_high_vec;
__m512i sum_i16_low_vec, sum_i16_high_vec;
simsimd_scale_u8_sapphire_cycle:
if (n < 64) {
mask = (__mmask64)_bzhi_u64(0xFFFFFFFFFFFFFFFFull, n);
a_u8_vec = _mm512_maskz_loadu_epi8(mask, a);
n = 0;
}
else {
a_u8_vec = _mm512_loadu_epi8(a);
a += 64, n -= 64;
}
// Upcast:
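// Note: `_mm512_unpacklo_epi8` / `_mm512_unpackhi_epi8` interleave within each 128-bit lane; the final
// `_mm512_packus_epi16` packs within the same lanes, so the original byte order is preserved end-to-end.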
a_f16_low_vec = _mm512_cvtepi16_ph(_mm512_unpacklo_epi8(a_u8_vec, _mm512_setzero_si512()));
a_f16_high_vec = _mm512_cvtepi16_ph(_mm512_unpackhi_epi8(a_u8_vec, _mm512_setzero_si512()));
// Scale:
sum_f16_low_vec = _mm512_mul_ph(a_f16_low_vec, alpha_vec);
sum_f16_high_vec = _mm512_mul_ph(a_f16_high_vec, alpha_vec);
// Downcast:
sum_i16_low_vec = _mm512_cvtph_epi16(sum_f16_low_vec);
sum_i16_high_vec = _mm512_cvtph_epi16(sum_f16_high_vec);
sum_u8_vec = _mm512_packus_epi16(sum_i16_low_vec, sum_i16_high_vec);
_mm512_mask_storeu_epi8(result, mask, sum_u8_vec);
result += 64;
if (n) goto simsimd_scale_u8_sapphire_cycle;
}
SIMSIMD_PUBLIC void simsimd_wsum_u8_sapphire( //
simsimd_u8_t const *a, simsimd_u8_t const *b, simsimd_size_t n, //
simsimd_distance_t alpha, simsimd_distance_t beta, simsimd_u8_t *result) {
// There are several special cases we may want to implement:
// 1. Simple addition, when both weights are equal to 1.0.
if (alpha == 1 && beta == 1) {
// In this case we can avoid expensive multiplications.
simsimd_sum_u8_sapphire(a, b, n, result);
return;
}
// 2. Just scaling, when one of the weights is equal to zero.
else if (alpha == 0 || beta == 0) {
// In this case we can avoid half of the load instructions.
if (beta == 0) { simsimd_scale_u8_sapphire(a, n, alpha, result); }
else { simsimd_scale_u8_sapphire(b, n, beta, result); }
return;
}
// The general case.
__mmask64 mask = 0xFFFFFFFFFFFFFFFFull;
__m512h alpha_vec = _mm512_set1_ph_from_ps(alpha);
__m512h beta_vec = _mm512_set1_ph_from_ps(beta);
__m512i a_u8_vec, b_u8_vec, sum_u8_vec;
__m512h a_f16_low_vec, a_f16_high_vec, b_f16_low_vec, b_f16_high_vec;
__m512h a_scaled_f16_low_vec, a_scaled_f16_high_vec, sum_f16_low_vec, sum_f16_high_vec;
__m512i sum_i16_low_vec, sum_i16_high_vec;
simsimd_wsum_u8_sapphire_cycle:
if (n < 64) {
mask = (__mmask64)_bzhi_u64(0xFFFFFFFFFFFFFFFFull, n);
a_u8_vec = _mm512_maskz_loadu_epi8(mask, a);
b_u8_vec = _mm512_maskz_loadu_epi8(mask, b);
n = 0;
}
else {
a_u8_vec = _mm512_loadu_epi8(a);
b_u8_vec = _mm512_loadu_epi8(b);
a += 64, b += 64, n -= 64;
}
// Upcast:
a_f16_low_vec = _mm512_cvtepi16_ph(_mm512_unpacklo_epi8(a_u8_vec, _mm512_setzero_si512()));
a_f16_high_vec = _mm512_cvtepi16_ph(_mm512_unpackhi_epi8(a_u8_vec, _mm512_setzero_si512()));
b_f16_low_vec = _mm512_cvtepi16_ph(_mm512_unpacklo_epi8(b_u8_vec, _mm512_setzero_si512()));
b_f16_high_vec = _mm512_cvtepi16_ph(_mm512_unpackhi_epi8(b_u8_vec, _mm512_setzero_si512()));
// Scale:
a_scaled_f16_low_vec = _mm512_mul_ph(a_f16_low_vec, alpha_vec);
a_scaled_f16_high_vec = _mm512_mul_ph(a_f16_high_vec, alpha_vec);
// Add:
sum_f16_low_vec = _mm512_fmadd_ph(b_f16_low_vec, beta_vec, a_scaled_f16_low_vec);
sum_f16_high_vec = _mm512_fmadd_ph(b_f16_high_vec, beta_vec, a_scaled_f16_high_vec);
// Downcast:
sum_i16_low_vec = _mm512_cvtph_epi16(sum_f16_low_vec);
sum_i16_high_vec = _mm512_cvtph_epi16(sum_f16_high_vec);
sum_u8_vec = _mm512_packus_epi16(sum_i16_low_vec, sum_i16_high_vec);
_mm512_mask_storeu_epi8(result, mask, sum_u8_vec);
result += 64;
if (n) goto simsimd_wsum_u8_sapphire_cycle;
}
SIMSIMD_PUBLIC void simsimd_sum_i8_sapphire(simsimd_i8_t const *a, simsimd_i8_t const *b, simsimd_size_t n,
simsimd_i8_t *result) {
__mmask64 mask = 0xFFFFFFFFFFFFFFFFull;
__m512i a_i8_vec, b_i8_vec, sum_i8_vec;
simsimd_sum_i8_sapphire_cycle:
if (n < 64) {
mask = (__mmask64)_bzhi_u64(0xFFFFFFFFFFFFFFFFull, n);
a_i8_vec = _mm512_maskz_loadu_epi8(mask, a);
b_i8_vec = _mm512_maskz_loadu_epi8(mask, b);
n = 0;
}
else {
a_i8_vec = _mm512_loadu_epi8(a);
b_i8_vec = _mm512_loadu_epi8(b);
a += 64, b += 64, n -= 64;
}
sum_i8_vec = _mm512_adds_epi8(a_i8_vec, b_i8_vec);
_mm512_mask_storeu_epi8(result, mask, sum_i8_vec);
result += 64;
if (n) goto simsimd_sum_i8_sapphire_cycle;
}
SIMSIMD_PUBLIC void simsimd_scale_i8_sapphire(simsimd_i8_t const *a, simsimd_size_t n, simsimd_distance_t alpha,
simsimd_i8_t *result) {
__mmask64 mask = 0xFFFFFFFFFFFFFFFFull;
__m512h alpha_vec = _mm512_set1_ph_from_ps(alpha);
__m512i a_i8_vec, sum_i8_vec;
__m512h a_f16_low_vec, a_f16_high_vec;
__m512h sum_f16_low_vec, sum_f16_high_vec;
__m512i sum_i16_low_vec, sum_i16_high_vec;
simsimd_scale_i8_sapphire_cycle:
if (n < 64) {
mask = (__mmask64)_bzhi_u64(0xFFFFFFFFFFFFFFFFull, n);
a_i8_vec = _mm512_maskz_loadu_epi8(mask, a);
n = 0;
}
else {
a_i8_vec = _mm512_loadu_epi8(a);
a += 64, n -= 64;
}
// Upcast:
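// Sign-extend each 256-bit half into 16-bit lanes, preserving element order; the matching saturating
// `_mm512_cvtsepi16_epi8` narrowing below reassembles the halves in the same order.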
a_f16_low_vec = _mm512_cvtepi16_ph(_mm512_cvtepi8_epi16(_mm512_castsi512_si256(a_i8_vec)));
a_f16_high_vec = _mm512_cvtepi16_ph(_mm512_cvtepi8_epi16(_mm512_extracti64x4_epi64(a_i8_vec, 1)));
// Scale:
sum_f16_low_vec = _mm512_mul_ph(a_f16_low_vec, alpha_vec);
sum_f16_high_vec = _mm512_mul_ph(a_f16_high_vec, alpha_vec);
// Downcast:
sum_i16_low_vec = _mm512_cvtph_epi16(sum_f16_low_vec);
sum_i16_high_vec = _mm512_cvtph_epi16(sum_f16_high_vec);
sum_i8_vec = _mm512_inserti64x4(_mm512_castsi256_si512(_mm512_cvtsepi16_epi8(sum_i16_low_vec)),
_mm512_cvtsepi16_epi8(sum_i16_high_vec), 1);
_mm512_mask_storeu_epi8(result, mask, sum_i8_vec);
result += 64;
if (n) goto simsimd_scale_i8_sapphire_cycle;
}
SIMSIMD_PUBLIC void simsimd_wsum_i8_sapphire( //
simsimd_i8_t const *a, simsimd_i8_t const *b, simsimd_size_t n, //
simsimd_distance_t alpha, simsimd_distance_t beta, simsimd_i8_t *result) {
// There are several special cases we may want to implement:
// 1. Simple addition, when both weights are equal to 1.0.
if (alpha == 1 && beta == 1) {
// In this case we can avoid expensive multiplications.
simsimd_sum_i8_sapphire(a, b, n, result);
return;
}
// 2. Just scaling, when one of the weights is equal to zero.
else if (alpha == 0 || beta == 0) {
// In this case we can avoid half of the load instructions.
if (beta == 0) { simsimd_scale_i8_sapphire(a, n, alpha, result); }
else { simsimd_scale_i8_sapphire(b, n, beta, result); }
return;
}
// The general case.
__mmask64 mask = 0xFFFFFFFFFFFFFFFFull;
__m512h alpha_vec = _mm512_set1_ph_from_ps(alpha);
__m512h beta_vec = _mm512_set1_ph_from_ps(beta);
__m512i a_i8_vec, b_i8_vec, sum_i8_vec;
__m512h a_f16_low_vec, a_f16_high_vec, b_f16_low_vec, b_f16_high_vec;
__m512h a_scaled_f16_low_vec, a_scaled_f16_high_vec, sum_f16_low_vec, sum_f16_high_vec;
__m512i sum_i16_low_vec, sum_i16_high_vec;
simsimd_wsum_i8_sapphire_cycle:
if (n < 64) {
mask = (__mmask64)_bzhi_u64(0xFFFFFFFFFFFFFFFFull, n);
a_i8_vec = _mm512_maskz_loadu_epi8(mask, a);
b_i8_vec = _mm512_maskz_loadu_epi8(mask, b);
n = 0;
}
else {
a_i8_vec = _mm512_loadu_epi8(a);
b_i8_vec = _mm512_loadu_epi8(b);
a += 64, b += 64, n -= 64;
}
// Upcast:
a_f16_low_vec = _mm512_cvtepi16_ph(_mm512_cvtepi8_epi16(_mm512_castsi512_si256(a_i8_vec)));
a_f16_high_vec = _mm512_cvtepi16_ph(_mm512_cvtepi8_epi16(_mm512_extracti64x4_epi64(a_i8_vec, 1)));
b_f16_low_vec = _mm512_cvtepi16_ph(_mm512_cvtepi8_epi16(_mm512_castsi512_si256(b_i8_vec)));
b_f16_high_vec = _mm512_cvtepi16_ph(_mm512_cvtepi8_epi16(_mm512_extracti64x4_epi64(b_i8_vec, 1)));
// Scale:
a_scaled_f16_low_vec = _mm512_mul_ph(a_f16_low_vec, alpha_vec);
a_scaled_f16_high_vec = _mm512_mul_ph(a_f16_high_vec, alpha_vec);
// Add:
sum_f16_low_vec = _mm512_fmadd_ph(b_f16_low_vec, beta_vec, a_scaled_f16_low_vec);
sum_f16_high_vec = _mm512_fmadd_ph(b_f16_high_vec, beta_vec, a_scaled_f16_high_vec);
// Downcast:
sum_i16_low_vec = _mm512_cvtph_epi16(sum_f16_low_vec);
sum_i16_high_vec = _mm512_cvtph_epi16(sum_f16_high_vec);
sum_i8_vec = _mm512_inserti64x4(_mm512_castsi256_si512(_mm512_cvtsepi16_epi8(sum_i16_low_vec)),
_mm512_cvtsepi16_epi8(sum_i16_high_vec), 1);
_mm512_mask_storeu_epi8(result, mask, sum_i8_vec);
result += 64;
if (n) goto simsimd_wsum_i8_sapphire_cycle;
}
SIMSIMD_PUBLIC void simsimd_fma_i8_sapphire( //
simsimd_i8_t const *a, simsimd_i8_t const *b, simsimd_i8_t const *c, simsimd_size_t n, //
simsimd_distance_t alpha, simsimd_distance_t beta, simsimd_i8_t *result) {
__mmask64 mask = 0xFFFFFFFFFFFFFFFFull;
__m512h alpha_vec = _mm512_set1_ph_from_ps(alpha);
__m512h beta_vec = _mm512_set1_ph_from_ps(beta);
__m512i a_i8_vec, b_i8_vec, c_i8_vec, sum_i8_vec;
__m512h a_f16_low_vec, a_f16_high_vec, b_f16_low_vec, b_f16_high_vec;
__m512h c_f16_low_vec, c_f16_high_vec, ab_f16_low_vec, ab_f16_high_vec;
__m512h ab_scaled_f16_low_vec, ab_scaled_f16_high_vec, sum_f16_low_vec, sum_f16_high_vec;
__m512i sum_i16_low_vec, sum_i16_high_vec;
simsimd_fma_i8_sapphire_cycle:
if (n < 64) {
mask = (__mmask64)_bzhi_u64(0xFFFFFFFFFFFFFFFFull, n);
a_i8_vec = _mm512_maskz_loadu_epi8(mask, a);
b_i8_vec = _mm512_maskz_loadu_epi8(mask, b);
c_i8_vec = _mm512_maskz_loadu_epi8(mask, c);
n = 0;
}
else {
a_i8_vec = _mm512_loadu_epi8(a);
b_i8_vec = _mm512_loadu_epi8(b);
c_i8_vec = _mm512_loadu_epi8(c);
a += 64, b += 64, c += 64, n -= 64;
}
// Upcast:
a_f16_low_vec = _mm512_cvtepi16_ph(_mm512_cvtepi8_epi16(_mm512_castsi512_si256(a_i8_vec)));
a_f16_high_vec = _mm512_cvtepi16_ph(_mm512_cvtepi8_epi16(_mm512_extracti64x4_epi64(a_i8_vec, 1)));
b_f16_low_vec = _mm512_cvtepi16_ph(_mm512_cvtepi8_epi16(_mm512_castsi512_si256(b_i8_vec)));
b_f16_high_vec = _mm512_cvtepi16_ph(_mm512_cvtepi8_epi16(_mm512_extracti64x4_epi64(b_i8_vec, 1)));
c_f16_low_vec = _mm512_cvtepi16_ph(_mm512_cvtepi8_epi16(_mm512_castsi512_si256(c_i8_vec)));
c_f16_high_vec = _mm512_cvtepi16_ph(_mm512_cvtepi8_epi16(_mm512_extracti64x4_epi64(c_i8_vec, 1)));
// Multiply:
ab_f16_low_vec = _mm512_mul_ph(a_f16_low_vec, b_f16_low_vec);
ab_f16_high_vec = _mm512_mul_ph(a_f16_high_vec, b_f16_high_vec);
// Scale:
ab_scaled_f16_low_vec = _mm512_mul_ph(ab_f16_low_vec, alpha_vec);
ab_scaled_f16_high_vec = _mm512_mul_ph(ab_f16_high_vec, alpha_vec);
// Add:
sum_f16_low_vec = _mm512_fmadd_ph(c_f16_low_vec, beta_vec, ab_scaled_f16_low_vec);
sum_f16_high_vec = _mm512_fmadd_ph(c_f16_high_vec, beta_vec, ab_scaled_f16_high_vec);
// Downcast:
sum_i16_low_vec = _mm512_cvtph_epi16(sum_f16_low_vec);
sum_i16_high_vec = _mm512_cvtph_epi16(sum_f16_high_vec);
sum_i8_vec = _mm512_inserti64x4(_mm512_castsi256_si512(_mm512_cvtsepi16_epi8(sum_i16_low_vec)),
_mm512_cvtsepi16_epi8(sum_i16_high_vec), 1);
_mm512_mask_storeu_epi8(result, mask, sum_i8_vec);
result += 64;
if (n) goto simsimd_fma_i8_sapphire_cycle;
}
SIMSIMD_PUBLIC void simsimd_fma_u8_sapphire( //
simsimd_u8_t const *a, simsimd_u8_t const *b, simsimd_u8_t const *c, simsimd_size_t n, //
simsimd_distance_t alpha, simsimd_distance_t beta, simsimd_u8_t *result) {
__mmask64 mask = 0xFFFFFFFFFFFFFFFFull;
__m512h alpha_vec = _mm512_set1_ph_from_ps(alpha);
__m512h beta_vec = _mm512_set1_ph_from_ps(beta);
__m512i a_u8_vec, b_u8_vec, c_u8_vec, sum_u8_vec;
__m512h a_f16_low_vec, a_f16_high_vec, b_f16_low_vec, b_f16_high_vec;
__m512h c_f16_low_vec, c_f16_high_vec, ab_f16_low_vec, ab_f16_high_vec;
__m512h ab_scaled_f16_low_vec, ab_scaled_f16_high_vec, sum_f16_low_vec, sum_f16_high_vec;
__m512i sum_i16_low_vec, sum_i16_high_vec;
simsimd_fma_u8_sapphire_cycle:
if (n < 64) {
mask = (__mmask64)_bzhi_u64(0xFFFFFFFFFFFFFFFFull, n);
a_u8_vec = _mm512_maskz_loadu_epi8(mask, a);
b_u8_vec = _mm512_maskz_loadu_epi8(mask, b);
c_u8_vec = _mm512_maskz_loadu_epi8(mask, c);
n = 0;
}
else {
a_u8_vec = _mm512_loadu_epi8(a);
b_u8_vec = _mm512_loadu_epi8(b);
c_u8_vec = _mm512_loadu_epi8(c);
a += 64, b += 64, c += 64, n -= 64;
}
// Upcast:
a_f16_low_vec = _mm512_cvtepi16_ph(_mm512_unpacklo_epi8(a_u8_vec, _mm512_setzero_si512()));
a_f16_high_vec = _mm512_cvtepi16_ph(_mm512_unpackhi_epi8(a_u8_vec, _mm512_setzero_si512()));
b_f16_low_vec = _mm512_cvtepi16_ph(_mm512_unpacklo_epi8(b_u8_vec, _mm512_setzero_si512()));
b_f16_high_vec = _mm512_cvtepi16_ph(_mm512_unpackhi_epi8(b_u8_vec, _mm512_setzero_si512()));
c_f16_low_vec = _mm512_cvtepi16_ph(_mm512_unpacklo_epi8(c_u8_vec, _mm512_setzero_si512()));
c_f16_high_vec = _mm512_cvtepi16_ph(_mm512_unpackhi_epi8(c_u8_vec, _mm512_setzero_si512()));
// Multiply:
ab_f16_low_vec = _mm512_mul_ph(a_f16_low_vec, b_f16_low_vec);
ab_f16_high_vec = _mm512_mul_ph(a_f16_high_vec, b_f16_high_vec);
// Scale:
ab_scaled_f16_low_vec = _mm512_mul_ph(ab_f16_low_vec, alpha_vec);
ab_scaled_f16_high_vec = _mm512_mul_ph(ab_f16_high_vec, alpha_vec);
// Add:
sum_f16_low_vec = _mm512_fmadd_ph(c_f16_low_vec, beta_vec, ab_scaled_f16_low_vec);
sum_f16_high_vec = _mm512_fmadd_ph(c_f16_high_vec, beta_vec, ab_scaled_f16_high_vec);
// Downcast:
sum_i16_low_vec = _mm512_cvtph_epi16(sum_f16_low_vec);
sum_i16_high_vec = _mm512_cvtph_epi16(sum_f16_high_vec);
sum_u8_vec = _mm512_packus_epi16(sum_i16_low_vec, sum_i16_high_vec);
_mm512_mask_storeu_epi8(result, mask, sum_u8_vec);
result += 64;
if (n) goto simsimd_fma_u8_sapphire_cycle;
}
#pragma clang attribute pop
#pragma GCC pop_options
#endif // SIMSIMD_TARGET_SAPPHIRE
#endif // _SIMSIMD_TARGET_X86
#if _SIMSIMD_TARGET_ARM
#if SIMSIMD_TARGET_NEON
#pragma GCC push_options
#pragma GCC target("arch=armv8.2-a+simd")
#pragma clang attribute push(__attribute__((target("arch=armv8.2-a+simd"))), apply_to = function)
SIMSIMD_PUBLIC void simsimd_sum_f32_neon(simsimd_f32_t const *a, simsimd_f32_t const *b, simsimd_size_t n,
simsimd_f32_t *result) {
// The main loop:
simsimd_size_t i = 0;
for (; i + 4 <= n; i += 4) {
float32x4_t a_vec = vld1q_f32(a + i);
float32x4_t b_vec = vld1q_f32(b + i);
float32x4_t sum_vec = vaddq_f32(a_vec, b_vec);
vst1q_f32(result + i, sum_vec);
}
// The tail:
for (; i < n; ++i) result[i] = a[i] + b[i];
}
SIMSIMD_PUBLIC void simsimd_scale_f32_neon(simsimd_f32_t const *a, simsimd_size_t n, simsimd_distance_t alpha,
simsimd_f32_t *result) {
simsimd_f32_t alpha_f32 = (simsimd_f32_t)alpha;
// The main loop:
simsimd_size_t i = 0;
for (; i + 4 <= n; i += 4) {
float32x4_t a_vec = vld1q_f32(a + i);
float32x4_t sum_vec = vmulq_n_f32(a_vec, alpha_f32);
vst1q_f32(result + i, sum_vec);
}
// The tail:
for (; i < n; ++i) result[i] = alpha_f32 * a[i];
}
SIMSIMD_PUBLIC void simsimd_wsum_f32_neon( //
simsimd_f32_t const *a, simsimd_f32_t const *b, simsimd_size_t n, //
simsimd_distance_t alpha, simsimd_distance_t beta, simsimd_f32_t *result) {
// There are several special cases we may want to implement:
// 1. Simple addition, when both weights are equal to 1.0.
if (alpha == 1 && beta == 1) {
// In this case we can avoid expensive multiplications.
simsimd_sum_f32_neon(a, b, n, result);
return;
}
// 2. Just scaling, when one of the weights is equal to zero.
else if (alpha == 0 || beta == 0) {
// In this case we can avoid half of the load instructions.
if (beta == 0) { simsimd_scale_f32_neon(a, n, alpha, result); }
else { simsimd_scale_f32_neon(b, n, beta, result); }
return;
}
// The general case.
simsimd_f32_t alpha_f32 = (simsimd_f32_t)alpha;
simsimd_f32_t beta_f32 = (simsimd_f32_t)beta;
// The main loop:
simsimd_size_t i = 0;
for (; i + 4 <= n; i += 4) {
float32x4_t a_vec = vld1q_f32(a + i);
float32x4_t b_vec = vld1q_f32(b + i);
float32x4_t a_scaled_vec = vmulq_n_f32(a_vec, alpha_f32);
float32x4_t b_scaled_vec = vmulq_n_f32(b_vec, beta_f32);
float32x4_t sum_vec = vaddq_f32(a_scaled_vec, b_scaled_vec);
vst1q_f32(result + i, sum_vec);
}
// The tail:
for (; i < n; ++i) result[i] = alpha_f32 * a[i] + beta_f32 * b[i];
}
SIMSIMD_PUBLIC void simsimd_fma_f32_neon( //
simsimd_f32_t const *a, simsimd_f32_t const *b, simsimd_f32_t const *c, //
simsimd_size_t n, simsimd_distance_t alpha, simsimd_distance_t beta, simsimd_f32_t *result) {
simsimd_f32_t alpha_f32 = (simsimd_f32_t)alpha;
simsimd_f32_t beta_f32 = (simsimd_f32_t)beta;
// The main loop:
simsimd_size_t i = 0;
for (; i + 4 <= n; i += 4) {
float32x4_t a_vec = vld1q_f32(a + i);
float32x4_t b_vec = vld1q_f32(b + i);
float32x4_t c_vec = vld1q_f32(c + i);
float32x4_t ab_vec = vmulq_f32(a_vec, b_vec);
float32x4_t ab_scaled_vec = vmulq_n_f32(ab_vec, alpha_f32);
float32x4_t sum_vec = vfmaq_n_f32(ab_scaled_vec, c_vec, beta_f32);
vst1q_f32(result + i, sum_vec);
}
// The tail:
for (; i < n; ++i) result[i] = alpha_f32 * a[i] * b[i] + beta_f32 * c[i];
}
#pragma clang attribute pop
#pragma GCC pop_options
#endif // SIMSIMD_TARGET_NEON
#if SIMSIMD_TARGET_NEON_BF16
#pragma GCC push_options
#pragma GCC target("arch=armv8.6-a+simd+bf16")
#pragma clang attribute push(__attribute__((target("arch=armv8.6-a+simd+bf16"))), apply_to = function)
SIMSIMD_PUBLIC void simsimd_sum_bf16_neon(simsimd_bf16_t const *a, simsimd_bf16_t const *b, simsimd_size_t n,
simsimd_bf16_t *result) {
// The main loop:
simsimd_size_t i = 0;
for (; i + 4 <= n; i += 4) {
float32x4_t a_vec = vcvt_f32_bf16(vld1_bf16((bfloat16_t const *)a + i));
float32x4_t b_vec = vcvt_f32_bf16(vld1_bf16((bfloat16_t const *)b + i));
float32x4_t sum_vec = vaddq_f32(a_vec, b_vec);
vst1_bf16((bfloat16_t *)result + i, vcvt_bf16_f32(sum_vec));
}
// The tail:
for (; i < n; ++i) simsimd_f32_to_bf16(simsimd_bf16_to_f32(a + i) + simsimd_bf16_to_f32(b + i), result + i);
}
SIMSIMD_PUBLIC void simsimd_scale_bf16_neon(simsimd_bf16_t const *a, simsimd_size_t n, simsimd_distance_t alpha,
simsimd_bf16_t *result) {
simsimd_f32_t alpha_f32 = (simsimd_f32_t)alpha;
// The main loop:
simsimd_size_t i = 0;
for (; i + 4 <= n; i += 4) {
float32x4_t a_vec = vcvt_f32_bf16(vld1_bf16((bfloat16_t const *)a + i));
float32x4_t sum_vec = vmulq_n_f32(a_vec, alpha_f32);
vst1_bf16((bfloat16_t *)result + i, vcvt_bf16_f32(sum_vec));
}
// The tail:
for (; i < n; ++i) simsimd_f32_to_bf16(alpha_f32 * simsimd_bf16_to_f32(a + i), result + i);
}
SIMSIMD_PUBLIC void simsimd_wsum_bf16_neon( //
simsimd_bf16_t const *a, simsimd_bf16_t const *b, simsimd_size_t n, //
simsimd_distance_t alpha, simsimd_distance_t beta, simsimd_bf16_t *result) {
// There are several special cases we may want to implement:
// 1. Simple addition, when both weights are equal to 1.0.
if (alpha == 1 && beta == 1) {
// In this case we can avoid expensive multiplications.
simsimd_sum_bf16_neon(a, b, n, result);
return;
}
// 2. Just scaling, when one of the weights is equal to zero.
else if (alpha == 0 || beta == 0) {
// In this case we can avoid half of the load instructions.
if (beta == 0) { simsimd_scale_bf16_neon(a, n, alpha, result); }
else { simsimd_scale_bf16_neon(b, n, beta, result); }
return;
}
// The general case.
simsimd_f32_t alpha_f32 = (simsimd_f32_t)alpha;
simsimd_f32_t beta_f32 = (simsimd_f32_t)beta;
// The main loop:
simsimd_size_t i = 0;
for (; i + 4 <= n; i += 4) {
float32x4_t a_vec = vcvt_f32_bf16(vld1_bf16((bfloat16_t const *)a + i));
float32x4_t b_vec = vcvt_f32_bf16(vld1_bf16((bfloat16_t const *)b + i));
float32x4_t a_scaled_vec = vmulq_n_f32(a_vec, alpha_f32);
float32x4_t b_scaled_vec = vmulq_n_f32(b_vec, beta_f32);
float32x4_t sum_vec = vaddq_f32(a_scaled_vec, b_scaled_vec);
vst1_bf16((bfloat16_t *)result + i, vcvt_bf16_f32(sum_vec));
}
// The tail:
for (; i < n; ++i)
simsimd_f32_to_bf16(alpha_f32 * simsimd_bf16_to_f32(a + i) + beta_f32 * simsimd_bf16_to_f32(b + i), result + i);
}
SIMSIMD_PUBLIC void simsimd_fma_bf16_neon( //
simsimd_bf16_t const *a, simsimd_bf16_t const *b, simsimd_bf16_t const *c, //
simsimd_size_t n, simsimd_distance_t alpha, simsimd_distance_t beta, simsimd_bf16_t *result) {
simsimd_f32_t alpha_f32 = (simsimd_f32_t)alpha;
simsimd_f32_t beta_f32 = (simsimd_f32_t)beta;
// The main loop:
simsimd_size_t i = 0;
for (; i + 4 <= n; i += 4) {
float32x4_t a_vec = vcvt_f32_bf16(vld1_bf16((bfloat16_t const *)a + i));
float32x4_t b_vec = vcvt_f32_bf16(vld1_bf16((bfloat16_t const *)b + i));
float32x4_t c_vec = vcvt_f32_bf16(vld1_bf16((bfloat16_t const *)c + i));
float32x4_t ab_vec = vmulq_f32(a_vec, b_vec);
float32x4_t ab_scaled_vec = vmulq_n_f32(ab_vec, alpha_f32);
float32x4_t sum_vec = vfmaq_n_f32(ab_scaled_vec, c_vec, beta_f32);
vst1_bf16((bfloat16_t *)result + i, vcvt_bf16_f32(sum_vec));
}
// The tail:
for (; i < n; ++i)
simsimd_f32_to_bf16(
alpha_f32 * simsimd_bf16_to_f32(a + i) * simsimd_bf16_to_f32(b + i) + beta_f32 * simsimd_bf16_to_f32(c + i),
result + i);
}
#pragma clang attribute pop
#pragma GCC pop_options
#endif // SIMSIMD_TARGET_NEON_BF16
#if SIMSIMD_TARGET_NEON_F16
#pragma GCC push_options
#pragma GCC target("arch=armv8.2-a+simd+fp16")
#pragma clang attribute push(__attribute__((target("arch=armv8.2-a+simd+fp16"))), apply_to = function)
SIMSIMD_PUBLIC void simsimd_sum_f16_neon(simsimd_f16_t const *a, simsimd_f16_t const *b, simsimd_size_t n,
simsimd_f16_t *result) {
// The main loop:
simsimd_size_t i = 0;
for (; i + 8 <= n; i += 8) {
float16x8_t a_vec = vld1q_f16((float16_t const *)a + i);
float16x8_t b_vec = vld1q_f16((float16_t const *)b + i);
float16x8_t sum_vec = vaddq_f16(a_vec, b_vec);
vst1q_f16((float16_t *)result + i, sum_vec);
}
// The tail:
for (; i < n; ++i) ((float16_t *)result)[i] = ((float16_t const *)a)[i] + ((float16_t const *)b)[i];
}
SIMSIMD_PUBLIC void simsimd_scale_f16_neon(simsimd_f16_t const *a, simsimd_size_t n, simsimd_distance_t alpha,
simsimd_f16_t *result) {
float16_t alpha_f16 = (float16_t)alpha;
// The main loop:
simsimd_size_t i = 0;
for (; i + 8 <= n; i += 8) {
float16x8_t a_vec = vld1q_f16((float16_t const *)a + i);
float16x8_t sum_vec = vmulq_n_f16(a_vec, alpha_f16);
vst1q_f16((float16_t *)result + i, sum_vec);
}
// The tail:
for (; i < n; ++i) ((float16_t *)result)[i] = alpha_f16 * ((float16_t const *)a)[i];
}
SIMSIMD_PUBLIC void simsimd_wsum_f16_neon( //
simsimd_f16_t const *a, simsimd_f16_t const *b, simsimd_size_t n, //
simsimd_distance_t alpha, simsimd_distance_t beta, simsimd_f16_t *result) {
// There are several special cases we may want to implement:
// 1. Simple addition, when both weights are equal to 1.0.
if (alpha == 1 && beta == 1) {
// In this case we can avoid expensive multiplications.
simsimd_sum_f16_neon(a, b, n, result);
return;
}
// 2. Just scaling, when one of the weights is equal to zero.
else if (alpha == 0 || beta == 0) {
// In this case we can avoid half of the load instructions.
if (beta == 0) { simsimd_scale_f16_neon(a, n, alpha, result); }
else { simsimd_scale_f16_neon(b, n, beta, result); }
return;
}
// The general case.
float16_t alpha_f16 = (float16_t)alpha;
float16_t beta_f16 = (float16_t)beta;
// The main loop:
simsimd_size_t i = 0;
for (; i + 8 <= n; i += 8) {
float16x8_t a_vec = vld1q_f16((float16_t const *)a + i);
float16x8_t b_vec = vld1q_f16((float16_t const *)b + i);
float16x8_t a_scaled_vec = vmulq_n_f16(a_vec, alpha_f16);
float16x8_t b_scaled_vec = vmulq_n_f16(b_vec, beta_f16);
float16x8_t sum_vec = vaddq_f16(a_scaled_vec, b_scaled_vec);
vst1q_f16((float16_t *)result + i, sum_vec);
}
// The tail:
for (; i < n; ++i)
((float16_t *)result)[i] = alpha_f16 * ((float16_t const *)a)[i] + beta_f16 * ((float16_t const *)b)[i];
}
SIMSIMD_PUBLIC void simsimd_fma_f16_neon( //
simsimd_f16_t const *a, simsimd_f16_t const *b, simsimd_f16_t const *c, //
simsimd_size_t n, simsimd_distance_t alpha, simsimd_distance_t beta, simsimd_f16_t *result) {
float16_t alpha_f16 = (float16_t)alpha;
float16_t beta_f16 = (float16_t)beta;
// The main loop:
simsimd_size_t i = 0;
for (; i + 8 <= n; i += 8) {
float16x8_t a_vec = vld1q_f16((float16_t const *)a + i);
float16x8_t b_vec = vld1q_f16((float16_t const *)b + i);
float16x8_t c_vec = vld1q_f16((float16_t const *)c + i);
float16x8_t ab_vec = vmulq_f16(a_vec, b_vec);
float16x8_t ab_scaled_vec = vmulq_n_f16(ab_vec, alpha_f16);
float16x8_t sum_vec = vfmaq_n_f16(ab_scaled_vec, c_vec, beta_f16);
vst1q_f16((float16_t *)result + i, sum_vec);
}
// The tail:
for (; i < n; ++i)
((float16_t *)result)[i] =
alpha_f16 * ((float16_t const *)a)[i] * ((float16_t const *)b)[i] + beta_f16 * ((float16_t const *)c)[i];
}
SIMSIMD_PUBLIC void simsimd_sum_u8_neon(simsimd_u8_t const *a, simsimd_u8_t const *b, simsimd_size_t n,
simsimd_u8_t *result) {
// The main loop:
simsimd_size_t i = 0;
for (; i + 16 <= n; i += 16) {
uint8x16_t a_vec = vld1q_u8(a + i);
uint8x16_t b_vec = vld1q_u8(b + i);
uint8x16_t sum_vec = vqaddq_u8(a_vec, b_vec);
vst1q_u8(result + i, sum_vec);
}
// The tail:
for (; i < n; ++i) { SIMSIMD_F32_TO_U8(a[i] + b[i], result + i); }
}
SIMSIMD_PUBLIC void simsimd_scale_u8_neon(simsimd_u8_t const *a, simsimd_size_t n, simsimd_distance_t alpha,
simsimd_u8_t *result) {
float16_t alpha_f16 = (float16_t)alpha;
// The main loop:
simsimd_size_t i = 0;
for (; i + 8 <= n; i += 8) {
uint8x8_t a_u8_vec = vld1_u8(a + i);
float16x8_t a_vec = vcvtq_f16_u16(vmovl_u8(a_u8_vec));
float16x8_t sum_vec = vmulq_n_f16(a_vec, alpha_f16);
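// `vcvtaq_u16_f16` rounds to nearest (ties away from zero) and `vqmovn_u16` narrows with unsigned
// saturation, clamping the results to the [0, 255] range.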
uint8x8_t sum_u8_vec = vqmovn_u16(vcvtaq_u16_f16(sum_vec));
vst1_u8(result + i, sum_u8_vec);
}
// The tail:
for (; i < n; ++i) { SIMSIMD_F32_TO_U8(alpha_f16 * a[i], result + i); }
}
SIMSIMD_PUBLIC void simsimd_wsum_u8_neon( //
simsimd_u8_t const *a, simsimd_u8_t const *b, simsimd_size_t n, //
simsimd_distance_t alpha, simsimd_distance_t beta, simsimd_u8_t *result) {
// There are several special cases we may want to implement:
// 1. Simple addition, when both weights are equal to 1.0.
if (alpha == 1 && beta == 1) {
// In this case we can avoid expensive multiplications.
simsimd_sum_u8_neon(a, b, n, result);
return;
}
// 2. Just scaling, when one of the weights is equal to zero.
else if (alpha == 0 || beta == 0) {
// In this case we can avoid half of the load instructions.
if (beta == 0) { simsimd_scale_u8_neon(a, n, alpha, result); }
else { simsimd_scale_u8_neon(b, n, beta, result); }
return;
}
// The general case.
float16_t alpha_f16 = (float16_t)alpha;
float16_t beta_f16 = (float16_t)beta;
// The main loop:
simsimd_size_t i = 0;
for (; i + 8 <= n; i += 8) {
uint8x8_t a_u8_vec = vld1_u8(a + i);
uint8x8_t b_u8_vec = vld1_u8(b + i);
float16x8_t a_vec = vcvtq_f16_u16(vmovl_u8(a_u8_vec));
float16x8_t b_vec = vcvtq_f16_u16(vmovl_u8(b_u8_vec));
float16x8_t a_scaled_vec = vmulq_n_f16(a_vec, alpha_f16);
float16x8_t b_scaled_vec = vmulq_n_f16(b_vec, beta_f16);
float16x8_t sum_vec = vaddq_f16(a_scaled_vec, b_scaled_vec);
uint8x8_t sum_u8_vec = vqmovn_u16(vcvtaq_u16_f16(sum_vec));
vst1_u8(result + i, sum_u8_vec);
}
// The tail:
for (; i < n; ++i) { SIMSIMD_F32_TO_U8(alpha_f16 * a[i] + beta_f16 * b[i], result + i); }
}
SIMSIMD_PUBLIC void simsimd_fma_u8_neon( //
simsimd_u8_t const *a, simsimd_u8_t const *b, simsimd_u8_t const *c, //
simsimd_size_t n, simsimd_distance_t alpha, simsimd_distance_t beta, simsimd_u8_t *result) {
float16_t alpha_f16 = (float16_t)alpha;
float16_t beta_f16 = (float16_t)beta;
// The main loop:
simsimd_size_t i = 0;
for (; i + 8 <= n; i += 8) {
uint8x8_t a_u8_vec = vld1_u8(a + i);
uint8x8_t b_u8_vec = vld1_u8(b + i);
uint8x8_t c_u8_vec = vld1_u8(c + i);
float16x8_t a_vec = vcvtq_f16_u16(vmovl_u8(a_u8_vec));
float16x8_t b_vec = vcvtq_f16_u16(vmovl_u8(b_u8_vec));
float16x8_t c_vec = vcvtq_f16_u16(vmovl_u8(c_u8_vec));
float16x8_t ab_vec = vmulq_f16(a_vec, b_vec);
float16x8_t ab_scaled_vec = vmulq_n_f16(ab_vec, alpha_f16);
float16x8_t sum_vec = vfmaq_n_f16(ab_scaled_vec, c_vec, beta_f16);
uint8x8_t sum_u8_vec = vqmovn_u16(vcvtaq_u16_f16(sum_vec));
vst1_u8(result + i, sum_u8_vec);
}
// The tail:
for (; i < n; ++i) { SIMSIMD_F32_TO_U8(alpha_f16 * a[i] * b[i] + beta_f16 * c[i], result + i); }
}
SIMSIMD_PUBLIC void simsimd_sum_i8_neon(simsimd_i8_t const *a, simsimd_i8_t const *b, simsimd_size_t n,
simsimd_i8_t *result) {
// The main loop:
simsimd_size_t i = 0;
for (; i + 16 <= n; i += 16) {
int8x16_t a_vec = vld1q_s8(a + i);
int8x16_t b_vec = vld1q_s8(b + i);
int8x16_t sum_vec = vqaddq_s8(a_vec, b_vec);
vst1q_s8(result + i, sum_vec);
}
// The tail:
for (; i < n; ++i) { SIMSIMD_F32_TO_I8(a[i] + b[i], result + i); }
}
SIMSIMD_PUBLIC void simsimd_scale_i8_neon(simsimd_i8_t const *a, simsimd_size_t n, simsimd_distance_t alpha,
simsimd_i8_t *result) {
float16_t alpha_f16 = (float16_t)alpha;
// The main loop:
simsimd_size_t i = 0;
for (; i + 8 <= n; i += 8) {
int8x8_t a_i8_vec = vld1_s8(a + i);
float16x8_t a_vec = vcvtq_f16_s16(vmovl_s8(a_i8_vec));
float16x8_t sum_vec = vmulq_n_f16(a_vec, alpha_f16);
int8x8_t sum_i8_vec = vqmovn_s16(vcvtaq_s16_f16(sum_vec));
vst1_s8(result + i, sum_i8_vec);
}
// The tail:
for (; i < n; ++i) { SIMSIMD_F32_TO_I8(alpha_f16 * a[i], result + i); }
}
SIMSIMD_PUBLIC void simsimd_wsum_i8_neon( //
simsimd_i8_t const *a, simsimd_i8_t const *b, simsimd_size_t n, //
simsimd_distance_t alpha, simsimd_distance_t beta, simsimd_i8_t *result) {
// There are several special cases we may want to implement:
// 1. Simple addition, when both weights are equal to 1.0.
if (alpha == 1 && beta == 1) {
// In this case we can avoid expensive multiplications.
simsimd_sum_i8_neon(a, b, n, result);
return;
}
// 2. Just scaling, when one of the weights is equal to zero.
else if (alpha == 0 || beta == 0) {
// In this case we can avoid half of the load instructions.
if (beta == 0) { simsimd_scale_i8_neon(a, n, alpha, result); }
else { simsimd_scale_i8_neon(b, n, beta, result); }
return;
}
// The general case.
float16_t alpha_f16 = (float16_t)alpha;
float16_t beta_f16 = (float16_t)beta;
// The main loop:
simsimd_size_t i = 0;
for (; i + 8 <= n; i += 8) {
int8x8_t a_i8_vec = vld1_s8(a + i);
int8x8_t b_i8_vec = vld1_s8(b + i);
float16x8_t a_vec = vcvtq_f16_s16(vmovl_s8(a_i8_vec));
float16x8_t b_vec = vcvtq_f16_s16(vmovl_s8(b_i8_vec));
float16x8_t a_scaled_vec = vmulq_n_f16(a_vec, alpha_f16);
float16x8_t b_scaled_vec = vmulq_n_f16(b_vec, beta_f16);
float16x8_t sum_vec = vaddq_f16(a_scaled_vec, b_scaled_vec);
int8x8_t sum_i8_vec = vqmovn_s16(vcvtaq_s16_f16(sum_vec));
vst1_s8(result + i, sum_i8_vec);
}
// The tail:
for (; i < n; ++i) { SIMSIMD_F32_TO_I8(alpha_f16 * a[i] + beta_f16 * b[i], result + i); }
}
SIMSIMD_PUBLIC void simsimd_fma_i8_neon( //
simsimd_i8_t const *a, simsimd_i8_t const *b, simsimd_i8_t const *c, //
simsimd_size_t n, simsimd_distance_t alpha, simsimd_distance_t beta, simsimd_i8_t *result) {
float16_t alpha_f16 = (float16_t)alpha;
float16_t beta_f16 = (float16_t)beta;
// The main loop:
simsimd_size_t i = 0;
for (; i + 8 <= n; i += 8) {
int8x8_t a_i8_vec = vld1_s8(a + i);
int8x8_t b_i8_vec = vld1_s8(b + i);
int8x8_t c_i8_vec = vld1_s8(c + i);
float16x8_t a_vec = vcvtq_f16_s16(vmovl_s8(a_i8_vec));
float16x8_t b_vec = vcvtq_f16_s16(vmovl_s8(b_i8_vec));
float16x8_t c_vec = vcvtq_f16_s16(vmovl_s8(c_i8_vec));
float16x8_t ab_vec = vmulq_f16(a_vec, b_vec);
float16x8_t ab_scaled_vec = vmulq_n_f16(ab_vec, alpha_f16);
float16x8_t sum_vec = vfmaq_n_f16(ab_scaled_vec, c_vec, beta_f16);
int8x8_t sum_i8_vec = vqmovn_s16(vcvtaq_s16_f16(sum_vec));
vst1_s8(result + i, sum_i8_vec);
}
// The tail:
for (; i < n; ++i) { SIMSIMD_F32_TO_I8(alpha_f16 * a[i] * b[i] + beta_f16 * c[i], result + i); }
}
#pragma clang attribute pop
#pragma GCC pop_options
#endif // SIMSIMD_TARGET_NEON_F16
#endif // _SIMSIMD_TARGET_ARM
#ifdef __cplusplus
}
#endif
#endif
simsimd-6.5.12/include/simsimd/geospatial.h 0000644 0000000 0000000 00000002504 10461020230 0016756 0 ustar 0000000 0000000 /**
* @file geospatial.h
* @brief SIMD-accelerated Geospatial distance functions.
* @author Ash Vardanian
* @date July 1, 2023
*
* Contains:
* - Haversine (Great Circle) distance
* - Vincenty's distance function for Oblate Spheroid Geodesics
*
* For datatypes:
* - 32-bit IEEE-754 floating point
* - 64-bit IEEE-754 floating point
*
* For hardware architectures:
* - Arm: NEON
* - x86: Haswell
*
* In most cases, for distance computations, we don't need the exact Haversine formula.
* The very last part of the computation applies the `asin(sqrt(x))` non-linear transformation.
* Both `asin` and `sqrt` are monotonically increasing functions, so their composition is also
* monotonically increasing. This means that, for relative similarity/closeness comparisons, we
* can avoid that expensive last step, as sketched further below in this header.
*
* x86 intrinsics: https://www.intel.com/content/www/us/en/docs/intrinsics-guide/
* Arm intrinsics: https://developer.arm.com/architectures/instruction-sets/intrinsics/
* Oblate Spheroid Geodesic: https://mathworld.wolfram.com/OblateSpheroidGeodesic.html
* Staging experiments: https://github.com/ashvardanian/HaversineSimSIMD
*/
#ifndef SIMSIMD_GEOSPATIAL_H
#define SIMSIMD_GEOSPATIAL_H
#include "types.h"
#ifdef __cplusplus
extern "C" {
#endif
#ifdef __cplusplus
}
#endif
#endif
simsimd-6.5.12/include/simsimd/mesh.h 0000644 0000000 0000000 00000010006 10461020230 0015556 0 ustar 0000000 0000000 /**
* @file mesh.h
* @brief SIMD-accelerated similarity measures for meshes and rigid 3D bodies.
* @author Ash Vardanian
* @date June 19, 2024
*
* Contains:
* - Root Mean Square Deviation (RMSD) for rigid body superposition
* - Kabsch algorithm for optimal rigid body superposition
*
* For datatypes:
* - 64-bit IEEE-754 floating point
* - 32-bit IEEE-754 floating point
* - 16-bit IEEE-754 floating point
* - 16-bit brain-floating point
*
* For hardware architectures:
* - Arm: NEON
* - x86: Genoa, Sapphire
*
* x86 intrinsics: https://www.intel.com/content/www/us/en/docs/intrinsics-guide/
* Arm intrinsics: https://developer.arm.com/architectures/instruction-sets/intrinsics/
*/
#ifndef SIMSIMD_MESH_H
#define SIMSIMD_MESH_H
#include "types.h"
#ifdef __cplusplus
extern "C" {
#endif
// clang-format off
/* Serial backends for all numeric types.
* By default they use 32-bit arithmetic, unless the arguments themselves contain 64-bit floats.
* For double-precision computation check out the "*_accurate" variants of those "*_serial" functions.
*/
SIMSIMD_PUBLIC void simsimd_rmsd_f64_serial(simsimd_f64_t const* a, simsimd_f64_t const* b, simsimd_size_t n, simsimd_f64_t* a_centroid, simsimd_f64_t* b_centroid, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_kabsch_f64_serial(simsimd_f64_t const* a, simsimd_f64_t const* b, simsimd_size_t n, simsimd_f64_t* a_centroid, simsimd_f64_t* b_centroid, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_rmsd_f32_serial(simsimd_f32_t const* a, simsimd_f32_t const* b, simsimd_size_t n, simsimd_f32_t* a_centroid, simsimd_f32_t* b_centroid, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_kabsch_f32_serial(simsimd_f32_t const* a, simsimd_f32_t const* b, simsimd_size_t n, simsimd_f32_t* a_centroid, simsimd_f32_t* b_centroid, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_rmsd_f16_serial(simsimd_f16_t const* a, simsimd_f16_t const* b, simsimd_size_t n, simsimd_f16_t* a_centroid, simsimd_f16_t* b_centroid, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_kabsch_f16_serial(simsimd_f16_t const* a, simsimd_f16_t const* b, simsimd_size_t n, simsimd_f16_t* a_centroid, simsimd_f16_t* b_centroid, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_rmsd_bf16_serial(simsimd_bf16_t const* a, simsimd_bf16_t const* b, simsimd_size_t n, simsimd_bf16_t* a_centroid, simsimd_bf16_t* b_centroid, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_kabsch_bf16_serial(simsimd_bf16_t const* a, simsimd_bf16_t const* b, simsimd_size_t n, simsimd_bf16_t* a_centroid, simsimd_bf16_t* b_centroid, simsimd_distance_t* result);
/* Double-precision serial backends for all numeric types.
* For single-precision computation check out the "*_serial" counterparts of those "*_accurate" functions.
*/
SIMSIMD_PUBLIC void simsimd_rmsd_f32_accurate(simsimd_f32_t const* a, simsimd_f32_t const* b, simsimd_size_t n, simsimd_f32_t* a_centroid, simsimd_f32_t* b_centroid, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_kabsch_f32_accurate(simsimd_f32_t const* a, simsimd_f32_t const* b, simsimd_size_t n, simsimd_f32_t* a_centroid, simsimd_f32_t* b_centroid, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_rmsd_f16_accurate(simsimd_f16_t const* a, simsimd_f16_t const* b, simsimd_size_t n, simsimd_f16_t* a_centroid, simsimd_f16_t* b_centroid, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_kabsch_f16_accurate(simsimd_f16_t const* a, simsimd_f16_t const* b, simsimd_size_t n, simsimd_f16_t* a_centroid, simsimd_f16_t* b_centroid, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_rmsd_bf16_accurate(simsimd_bf16_t const* a, simsimd_bf16_t const* b, simsimd_size_t n, simsimd_bf16_t* a_centroid, simsimd_bf16_t* b_centroid, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_kabsch_bf16_accurate(simsimd_bf16_t const* a, simsimd_bf16_t const* b, simsimd_size_t n, simsimd_bf16_t* a_centroid, simsimd_bf16_t* b_centroid, simsimd_distance_t* result);
// clang-format on
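/* For reference: given two already-centered sets of n three-dimensional points {a_i} and {b_i},
* RMSD(a, b) = sqrt((1 / n) * sum_i |a_i - b_i|^2). The Kabsch algorithm additionally finds the
* rotation minimizing that value, typically via the SVD of the 3x3 cross-covariance matrix. */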
#ifdef __cplusplus
}
#endif
#endif
simsimd-6.5.12/include/simsimd/probability.h 0000644 0000000 0000000 00000073323 10461020230 0017155 0 ustar 0000000 0000000 /**
* @file probability.h
* @brief SIMD-accelerated Similarity Measures for Probability Distributions.
* @author Ash Vardanian
* @date October 20, 2023
*
* Contains:
* - Kullback-Leibler divergence (TODO: Rename handle to `kld`)
* - Jensen–Shannon divergence (TODO: Rename handle to `jsd`)
*
* For datatypes:
* - 32-bit floating point numbers
* - 16-bit floating point numbers
* - 16-bit brain-floating point numbers
*
* For hardware architectures:
* - Arm: NEON
* - x86: Haswell, Skylake, Sapphire
*
* x86 intrinsics: https://www.intel.com/content/www/us/en/docs/intrinsics-guide/
* Arm intrinsics: https://developer.arm.com/architectures/instruction-sets/intrinsics/
*/
#ifndef SIMSIMD_PROBABILITY_H
#define SIMSIMD_PROBABILITY_H
#include "types.h"
#ifdef __cplusplus
extern "C" {
#endif
// clang-format off
/* Serial backends for all numeric types.
* By default they use 32-bit arithmetic, unless the arguments themselves contain 64-bit floats.
* For double-precision computation check out the "*_accurate" variants of those "*_serial" functions.
*/
SIMSIMD_PUBLIC void simsimd_kl_f64_serial(simsimd_f64_t const* a, simsimd_f64_t const* b, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_js_f64_serial(simsimd_f64_t const* a, simsimd_f64_t const* b, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_kl_f32_serial(simsimd_f32_t const* a, simsimd_f32_t const* b, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_js_f32_serial(simsimd_f32_t const* a, simsimd_f32_t const* b, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_kl_f16_serial(simsimd_f16_t const* a, simsimd_f16_t const* b, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_js_f16_serial(simsimd_f16_t const* a, simsimd_f16_t const* b, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_kl_bf16_serial(simsimd_bf16_t const* a, simsimd_bf16_t const* b, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_js_bf16_serial(simsimd_bf16_t const* a, simsimd_bf16_t const* b, simsimd_size_t n, simsimd_distance_t* result);
/* Double-precision serial backends for all numeric types.
* For single-precision computation check out the "*_serial" counterparts of those "*_accurate" functions.
*/
SIMSIMD_PUBLIC void simsimd_kl_f32_accurate(simsimd_f32_t const* a, simsimd_f32_t const* b, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_js_f32_accurate(simsimd_f32_t const* a, simsimd_f32_t const* b, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_kl_f16_accurate(simsimd_f16_t const* a, simsimd_f16_t const* b, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_js_f16_accurate(simsimd_f16_t const* a, simsimd_f16_t const* b, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_kl_bf16_accurate(simsimd_bf16_t const* a, simsimd_bf16_t const* b, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_js_bf16_accurate(simsimd_bf16_t const* a, simsimd_bf16_t const* b, simsimd_size_t n, simsimd_distance_t* result);
/* SIMD-powered backends for Arm NEON, mostly using 32-bit arithmetic over 128-bit words.
* By far the most portable backend, covering most Armv8 devices: over a billion phones and almost all
* Arm-based server CPUs produced before 2023.
*/
SIMSIMD_PUBLIC void simsimd_kl_f32_neon(simsimd_f32_t const* a, simsimd_f32_t const* b, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_js_f32_neon(simsimd_f32_t const* a, simsimd_f32_t const* b, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_kl_f16_neon(simsimd_f16_t const* a, simsimd_f16_t const* b, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_js_f16_neon(simsimd_f16_t const* a, simsimd_f16_t const* b, simsimd_size_t n, simsimd_distance_t* result);
/* SIMD-powered backends for AVX2 CPUs of Haswell generation and newer, using 32-bit arithmetic over 256-bit words.
* AVX2 was first demonstrated in 2011, and at least one Haswell-based processor, the Pentium G3420, was still being sold in 2022.
* Practically all modern x86 CPUs support AVX2, FMA, and F16C, making it a perfect baseline for SIMD algorithms.
* On the other hand, there is no need to implement AVX2 versions of `f32` and `f64` functions, as those are
* properly vectorized by recent compilers.
*/
SIMSIMD_PUBLIC void simsimd_kl_f16_haswell(simsimd_f16_t const* a, simsimd_f16_t const* b, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_js_f16_haswell(simsimd_f16_t const* a, simsimd_f16_t const* b, simsimd_size_t n, simsimd_distance_t* result);
/* SIMD-powered backends for various generations of AVX512 CPUs.
* Skylake is handy, as it supports masked loads and other operations, avoiding the need for the tail loop.
* Ice Lake added VNNI, VPOPCNTDQ, IFMA, VBMI, VAES, GFNI, VBMI2, BITALG, VPCLMULQDQ, and other extensions for integral operations.
* Sapphire Rapids added tiled matrix operations, but we are most interested in the new mixed-precision FMA instructions.
*/
SIMSIMD_PUBLIC void simsimd_kl_f32_skylake(simsimd_f32_t const* a, simsimd_f32_t const* b, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_js_f32_skylake(simsimd_f32_t const* a, simsimd_f32_t const* b, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_kl_f16_sapphire(simsimd_f16_t const* a, simsimd_f16_t const* b, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_js_f16_sapphire(simsimd_f16_t const* a, simsimd_f16_t const* b, simsimd_size_t n, simsimd_distance_t* result);
// clang-format on
#define SIMSIMD_MAKE_KL(name, input_type, accumulator_type, load_and_convert, epsilon) \
SIMSIMD_PUBLIC void simsimd_kl_##input_type##_##name(simsimd_##input_type##_t const *a, \
simsimd_##input_type##_t const *b, simsimd_size_t n, \
simsimd_distance_t *result) { \
simsimd_##accumulator_type##_t d = 0; \
for (simsimd_size_t i = 0; i != n; ++i) { \
simsimd_##accumulator_type##_t ai = load_and_convert(a + i); \
simsimd_##accumulator_type##_t bi = load_and_convert(b + i); \
d += ai * SIMSIMD_LOG((ai + epsilon) / (bi + epsilon)); \
} \
*result = (simsimd_distance_t)d; \
}
#define SIMSIMD_MAKE_JS(name, input_type, accumulator_type, load_and_convert, epsilon) \
SIMSIMD_PUBLIC void simsimd_js_##input_type##_##name(simsimd_##input_type##_t const *a, \
simsimd_##input_type##_t const *b, simsimd_size_t n, \
simsimd_distance_t *result) { \
simsimd_##accumulator_type##_t d = 0; \
for (simsimd_size_t i = 0; i != n; ++i) { \
simsimd_##accumulator_type##_t ai = load_and_convert(a + i); \
simsimd_##accumulator_type##_t bi = load_and_convert(b + i); \
simsimd_##accumulator_type##_t mi = (ai + bi) / 2; \
d += ai * SIMSIMD_LOG((ai + epsilon) / (mi + epsilon)); \
d += bi * SIMSIMD_LOG((bi + epsilon) / (mi + epsilon)); \
} \
simsimd_distance_t d_half = ((simsimd_distance_t)d / 2); \
*result = d_half > 0 ? SIMSIMD_SQRT(d_half) : 0; \
}
SIMSIMD_MAKE_KL(serial, f64, f64, SIMSIMD_DEREFERENCE, SIMSIMD_F32_DIVISION_EPSILON) // simsimd_kl_f64_serial
SIMSIMD_MAKE_JS(serial, f64, f64, SIMSIMD_DEREFERENCE, SIMSIMD_F32_DIVISION_EPSILON) // simsimd_js_f64_serial
SIMSIMD_MAKE_KL(serial, f32, f32, SIMSIMD_DEREFERENCE, SIMSIMD_F32_DIVISION_EPSILON) // simsimd_kl_f32_serial
SIMSIMD_MAKE_JS(serial, f32, f32, SIMSIMD_DEREFERENCE, SIMSIMD_F32_DIVISION_EPSILON) // simsimd_js_f32_serial
SIMSIMD_MAKE_KL(serial, f16, f32, SIMSIMD_F16_TO_F32, SIMSIMD_F32_DIVISION_EPSILON) // simsimd_kl_f16_serial
SIMSIMD_MAKE_JS(serial, f16, f32, SIMSIMD_F16_TO_F32, SIMSIMD_F32_DIVISION_EPSILON) // simsimd_js_f16_serial
SIMSIMD_MAKE_KL(serial, bf16, f32, SIMSIMD_BF16_TO_F32, SIMSIMD_F32_DIVISION_EPSILON) // simsimd_kl_bf16_serial
SIMSIMD_MAKE_JS(serial, bf16, f32, SIMSIMD_BF16_TO_F32, SIMSIMD_F32_DIVISION_EPSILON) // simsimd_js_bf16_serial
SIMSIMD_MAKE_KL(accurate, f32, f64, SIMSIMD_DEREFERENCE, SIMSIMD_F32_DIVISION_EPSILON) // simsimd_kl_f32_accurate
SIMSIMD_MAKE_JS(accurate, f32, f64, SIMSIMD_DEREFERENCE, SIMSIMD_F32_DIVISION_EPSILON) // simsimd_js_f32_accurate
SIMSIMD_MAKE_KL(accurate, f16, f64, SIMSIMD_F16_TO_F32, SIMSIMD_F32_DIVISION_EPSILON) // simsimd_kl_f16_accurate
SIMSIMD_MAKE_JS(accurate, f16, f64, SIMSIMD_F16_TO_F32, SIMSIMD_F32_DIVISION_EPSILON) // simsimd_js_f16_accurate
SIMSIMD_MAKE_KL(accurate, bf16, f64, SIMSIMD_BF16_TO_F32, SIMSIMD_F32_DIVISION_EPSILON) // simsimd_kl_bf16_accurate
SIMSIMD_MAKE_JS(accurate, bf16, f64, SIMSIMD_BF16_TO_F32, SIMSIMD_F32_DIVISION_EPSILON) // simsimd_js_bf16_accurate
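/* A tiny usage sketch of the serial backends instantiated above. The function below is an
* illustrative example, not a part of the SimSIMD API; the inputs are assumed to be non-negative
* and (approximately) normalized probability distributions. */
SIMSIMD_INTERNAL void _simsimd_probability_usage_example(void) {
simsimd_f32_t p[4] = {0.10f, 0.20f, 0.30f, 0.40f};
simsimd_f32_t q[4] = {0.25f, 0.25f, 0.25f, 0.25f};
simsimd_distance_t kld, jsd;
simsimd_kl_f32_serial(p, q, 4, &kld); // Kullback-Leibler divergence of `p` from `q`
simsimd_js_f32_serial(p, q, 4, &jsd); // Square root of the Jensen-Shannon divergence
(void)kld, (void)jsd;                 // Silence unused-variable warnings in this sketch
}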
#if _SIMSIMD_TARGET_ARM
#if SIMSIMD_TARGET_NEON
#pragma GCC push_options
#pragma GCC target("arch=armv8.2-a+simd")
#pragma clang attribute push(__attribute__((target("arch=armv8.2-a+simd"))), apply_to = function)
SIMSIMD_INTERNAL float32x4_t _simsimd_log2_f32_neon(float32x4_t x) {
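// Approximates log2(x) by splitting the float into its exponent `e` and mantissa `m` in [1, 2):
// log2(x) = e + log2(m), where log2(m) is approximated by a degree-5 polynomial evaluated with
// Horner's method and scaled by (m - 1), making the result exact at m == 1.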
// Extracting the exponent
int32x4_t i = vreinterpretq_s32_f32(x);
int32x4_t e = vsubq_s32(vshrq_n_s32(vandq_s32(i, vdupq_n_s32(0x7F800000)), 23), vdupq_n_s32(127));
float32x4_t e_float = vcvtq_f32_s32(e);
// Extracting the mantissa
float32x4_t m = vreinterpretq_f32_s32(vorrq_s32(vandq_s32(i, vdupq_n_s32(0x007FFFFF)), vdupq_n_s32(0x3F800000)));
// Constants for polynomial
float32x4_t one = vdupq_n_f32(1.0f);
float32x4_t p = vdupq_n_f32(-3.4436006e-2f);
// Compute polynomial using Horner's method
p = vmlaq_f32(vdupq_n_f32(3.1821337e-1f), m, p);
p = vmlaq_f32(vdupq_n_f32(-1.2315303f), m, p);
p = vmlaq_f32(vdupq_n_f32(2.5988452f), m, p);
p = vmlaq_f32(vdupq_n_f32(-3.3241990f), m, p);
p = vmlaq_f32(vdupq_n_f32(3.1157899f), m, p);
// Final computation
float32x4_t result = vaddq_f32(vmulq_f32(p, vsubq_f32(m, one)), e_float);
return result;
}
SIMSIMD_PUBLIC void simsimd_kl_f32_neon(simsimd_f32_t const *a, simsimd_f32_t const *b, simsimd_size_t n,
simsimd_distance_t *result) {
simsimd_f32_t epsilon = SIMSIMD_F32_DIVISION_EPSILON;
float32x4_t epsilon_vec = vdupq_n_f32(epsilon);
float32x4_t sum_vec = vdupq_n_f32(0);
float32x4_t a_vec, b_vec;
simsimd_kl_f32_neon_cycle:
if (n < 4) {
a_vec = _simsimd_partial_load_f32x4_neon(a, n);
b_vec = _simsimd_partial_load_f32x4_neon(b, n);
n = 0;
}
else {
a_vec = vld1q_f32(a);
b_vec = vld1q_f32(b);
n -= 4, a += 4, b += 4;
}
float32x4_t ratio_vec = vdivq_f32(vaddq_f32(a_vec, epsilon_vec), vaddq_f32(b_vec, epsilon_vec));
float32x4_t log_ratio_vec = _simsimd_log2_f32_neon(ratio_vec);
float32x4_t prod_vec = vmulq_f32(a_vec, log_ratio_vec);
sum_vec = vaddq_f32(sum_vec, prod_vec);
if (n != 0) goto simsimd_kl_f32_neon_cycle;
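// 0.693147181 is ln(2): the vector loop computes base-2 logarithms, while the KL divergence
// definition uses natural logarithms.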
simsimd_f32_t log2_normalizer = 0.693147181f;
simsimd_f32_t sum = vaddvq_f32(sum_vec) * log2_normalizer;
*result = sum;
}
SIMSIMD_PUBLIC void simsimd_js_f32_neon(simsimd_f32_t const *a, simsimd_f32_t const *b, simsimd_size_t n,
simsimd_distance_t *result) {
simsimd_f32_t epsilon = SIMSIMD_F32_DIVISION_EPSILON;
float32x4_t epsilon_vec = vdupq_n_f32(epsilon);
float32x4_t sum_vec = vdupq_n_f32(0);
float32x4_t a_vec, b_vec;
simsimd_js_f32_neon_cycle:
if (n < 4) {
a_vec = _simsimd_partial_load_f32x4_neon(a, n);
b_vec = _simsimd_partial_load_f32x4_neon(b, n);
n = 0;
}
else {
a_vec = vld1q_f32(a);
b_vec = vld1q_f32(b);
n -= 4, a += 4, b += 4;
}
float32x4_t m_vec = vmulq_f32(vaddq_f32(a_vec, b_vec), vdupq_n_f32(0.5));
float32x4_t ratio_a_vec = vdivq_f32(vaddq_f32(a_vec, epsilon_vec), vaddq_f32(m_vec, epsilon_vec));
float32x4_t ratio_b_vec = vdivq_f32(vaddq_f32(b_vec, epsilon_vec), vaddq_f32(m_vec, epsilon_vec));
float32x4_t log_ratio_a_vec = _simsimd_log2_f32_neon(ratio_a_vec);
float32x4_t log_ratio_b_vec = _simsimd_log2_f32_neon(ratio_b_vec);
float32x4_t prod_a_vec = vmulq_f32(a_vec, log_ratio_a_vec);
float32x4_t prod_b_vec = vmulq_f32(b_vec, log_ratio_b_vec);
sum_vec = vaddq_f32(sum_vec, vaddq_f32(prod_a_vec, prod_b_vec));
if (n != 0) goto simsimd_js_f32_neon_cycle;
simsimd_f32_t log2_normalizer = 0.693147181f;
simsimd_f32_t sum = vaddvq_f32(sum_vec) * log2_normalizer / 2;
*result = sum > 0 ? _simsimd_sqrt_f32_neon(sum) : 0;
}
#pragma clang attribute pop
#pragma GCC pop_options
#endif // SIMSIMD_TARGET_NEON
#if SIMSIMD_TARGET_NEON_F16
#pragma GCC push_options
#pragma GCC target("arch=armv8.2-a+simd+fp16")
#pragma clang attribute push(__attribute__((target("arch=armv8.2-a+simd+fp16"))), apply_to = function)
SIMSIMD_PUBLIC void simsimd_kl_f16_neon(simsimd_f16_t const *a, simsimd_f16_t const *b, simsimd_size_t n,
simsimd_distance_t *result) {
float32x4_t sum_vec = vdupq_n_f32(0);
simsimd_f32_t epsilon = SIMSIMD_F32_DIVISION_EPSILON;
float32x4_t epsilon_vec = vdupq_n_f32(epsilon);
float32x4_t a_vec, b_vec;
simsimd_kl_f16_neon_cycle:
if (n < 4) {
a_vec = vcvt_f32_f16(_simsimd_partial_load_f16x4_neon(a, n));
b_vec = vcvt_f32_f16(_simsimd_partial_load_f16x4_neon(b, n));
n = 0;
}
else {
a_vec = vcvt_f32_f16(vld1_f16((simsimd_f16_for_arm_simd_t const *)a));
b_vec = vcvt_f32_f16(vld1_f16((simsimd_f16_for_arm_simd_t const *)b));
n -= 4, a += 4, b += 4;
}
float32x4_t ratio_vec = vdivq_f32(vaddq_f32(a_vec, epsilon_vec), vaddq_f32(b_vec, epsilon_vec));
float32x4_t log_ratio_vec = _simsimd_log2_f32_neon(ratio_vec);
float32x4_t prod_vec = vmulq_f32(a_vec, log_ratio_vec);
sum_vec = vaddq_f32(sum_vec, prod_vec);
if (n) goto simsimd_kl_f16_neon_cycle;
simsimd_f32_t log2_normalizer = 0.693147181f;
simsimd_f32_t sum = vaddvq_f32(sum_vec) * log2_normalizer;
*result = sum;
}
SIMSIMD_PUBLIC void simsimd_js_f16_neon(simsimd_f16_t const *a, simsimd_f16_t const *b, simsimd_size_t n,
simsimd_distance_t *result) {
float32x4_t sum_vec = vdupq_n_f32(0);
simsimd_f32_t epsilon = SIMSIMD_F32_DIVISION_EPSILON;
float32x4_t epsilon_vec = vdupq_n_f32(epsilon);
float32x4_t a_vec, b_vec;
simsimd_js_f16_neon_cycle:
if (n < 4) {
a_vec = vcvt_f32_f16(_simsimd_partial_load_f16x4_neon(a, n));
b_vec = vcvt_f32_f16(_simsimd_partial_load_f16x4_neon(b, n));
n = 0;
}
else {
a_vec = vcvt_f32_f16(vld1_f16((simsimd_f16_for_arm_simd_t const *)a));
b_vec = vcvt_f32_f16(vld1_f16((simsimd_f16_for_arm_simd_t const *)b));
n -= 4, a += 4, b += 4;
}
float32x4_t m_vec = vmulq_f32(vaddq_f32(a_vec, b_vec), vdupq_n_f32(0.5));
float32x4_t ratio_a_vec = vdivq_f32(vaddq_f32(a_vec, epsilon_vec), vaddq_f32(m_vec, epsilon_vec));
float32x4_t ratio_b_vec = vdivq_f32(vaddq_f32(b_vec, epsilon_vec), vaddq_f32(m_vec, epsilon_vec));
float32x4_t log_ratio_a_vec = _simsimd_log2_f32_neon(ratio_a_vec);
float32x4_t log_ratio_b_vec = _simsimd_log2_f32_neon(ratio_b_vec);
float32x4_t prod_a_vec = vmulq_f32(a_vec, log_ratio_a_vec);
float32x4_t prod_b_vec = vmulq_f32(b_vec, log_ratio_b_vec);
sum_vec = vaddq_f32(sum_vec, vaddq_f32(prod_a_vec, prod_b_vec));
if (n) goto simsimd_js_f16_neon_cycle;
simsimd_f32_t log2_normalizer = 0.693147181f;
simsimd_f32_t sum = vaddvq_f32(sum_vec) * log2_normalizer / 2;
*result = sum > 0 ? _simsimd_sqrt_f32_neon(sum) : 0;
}
#pragma clang attribute pop
#pragma GCC pop_options
#endif // SIMSIMD_TARGET_NEON_F16
#endif // _SIMSIMD_TARGET_ARM
#if _SIMSIMD_TARGET_X86
#if SIMSIMD_TARGET_HASWELL
#pragma GCC push_options
#pragma GCC target("avx2", "f16c", "fma")
#pragma clang attribute push(__attribute__((target("avx2,f16c,fma"))), apply_to = function)
SIMSIMD_INTERNAL __m256 _simsimd_log2_f32_haswell(__m256 x) {
// Extracting the exponent
__m256i i = _mm256_castps_si256(x);
__m256i e = _mm256_srli_epi32(_mm256_and_si256(i, _mm256_set1_epi32(0x7F800000)), 23);
e = _mm256_sub_epi32(e, _mm256_set1_epi32(127)); // removing the bias
__m256 e_float = _mm256_cvtepi32_ps(e);
// Extracting the mantissa
__m256 m = _mm256_castsi256_ps(
_mm256_or_si256(_mm256_and_si256(i, _mm256_set1_epi32(0x007FFFFF)), _mm256_set1_epi32(0x3F800000)));
// Constants for polynomial
__m256 one = _mm256_set1_ps(1.0f);
__m256 p = _mm256_set1_ps(-3.4436006e-2f);
// Compute the polynomial using Horner's method
p = _mm256_fmadd_ps(m, p, _mm256_set1_ps(3.1821337e-1f));
p = _mm256_fmadd_ps(m, p, _mm256_set1_ps(-1.2315303f));
p = _mm256_fmadd_ps(m, p, _mm256_set1_ps(2.5988452f));
p = _mm256_fmadd_ps(m, p, _mm256_set1_ps(-3.3241990f));
p = _mm256_fmadd_ps(m, p, _mm256_set1_ps(3.1157899f));
// Final computation
__m256 result = _mm256_add_ps(_mm256_mul_ps(p, _mm256_sub_ps(m, one)), e_float);
return result;
}
SIMSIMD_PUBLIC void simsimd_kl_f16_haswell(simsimd_f16_t const *a, simsimd_f16_t const *b, simsimd_size_t n,
simsimd_distance_t *result) {
__m256 sum_vec = _mm256_setzero_ps();
simsimd_f32_t epsilon = SIMSIMD_F32_DIVISION_EPSILON;
__m256 epsilon_vec = _mm256_set1_ps(epsilon);
__m256 a_vec, b_vec;
simsimd_kl_f16_haswell_cycle:
if (n < 8) {
a_vec = _simsimd_partial_load_f16x8_haswell(a, n);
b_vec = _simsimd_partial_load_f16x8_haswell(b, n);
n = 0;
}
else {
a_vec = _mm256_cvtph_ps(_mm_lddqu_si128((__m128i const *)a));
b_vec = _mm256_cvtph_ps(_mm_lddqu_si128((__m128i const *)b));
n -= 8, a += 8, b += 8;
}
a_vec = _mm256_add_ps(a_vec, epsilon_vec);
b_vec = _mm256_add_ps(b_vec, epsilon_vec);
__m256 ratio_vec = _mm256_div_ps(a_vec, b_vec);
__m256 log_ratio_vec = _simsimd_log2_f32_haswell(ratio_vec);
__m256 prod_vec = _mm256_mul_ps(a_vec, log_ratio_vec);
sum_vec = _mm256_add_ps(sum_vec, prod_vec);
if (n) goto simsimd_kl_f16_haswell_cycle;
simsimd_f32_t log2_normalizer = 0.693147181f;
simsimd_f32_t sum = _simsimd_reduce_f32x8_haswell(sum_vec);
sum *= log2_normalizer;
*result = sum;
}
SIMSIMD_PUBLIC void simsimd_js_f16_haswell(simsimd_f16_t const *a, simsimd_f16_t const *b, simsimd_size_t n,
simsimd_distance_t *result) {
simsimd_f32_t epsilon = SIMSIMD_F32_DIVISION_EPSILON;
__m256 epsilon_vec = _mm256_set1_ps(epsilon);
__m256 sum_vec = _mm256_setzero_ps();
__m256 a_vec, b_vec;
simsimd_js_f16_haswell_cycle:
if (n < 8) {
a_vec = _simsimd_partial_load_f16x8_haswell(a, n);
b_vec = _simsimd_partial_load_f16x8_haswell(b, n);
n = 0;
}
else {
a_vec = _mm256_cvtph_ps(_mm_lddqu_si128((__m128i const *)a));
b_vec = _mm256_cvtph_ps(_mm_lddqu_si128((__m128i const *)b));
n -= 8, a += 8, b += 8;
}
__m256 m_vec = _mm256_mul_ps(_mm256_add_ps(a_vec, b_vec), _mm256_set1_ps(0.5f)); // M = (P + Q) / 2
__m256 ratio_a_vec = _mm256_div_ps(_mm256_add_ps(a_vec, epsilon_vec), _mm256_add_ps(m_vec, epsilon_vec));
__m256 ratio_b_vec = _mm256_div_ps(_mm256_add_ps(b_vec, epsilon_vec), _mm256_add_ps(m_vec, epsilon_vec));
__m256 log_ratio_a_vec = _simsimd_log2_f32_haswell(ratio_a_vec);
__m256 log_ratio_b_vec = _simsimd_log2_f32_haswell(ratio_b_vec);
__m256 prod_a_vec = _mm256_mul_ps(a_vec, log_ratio_a_vec);
__m256 prod_b_vec = _mm256_mul_ps(b_vec, log_ratio_b_vec);
sum_vec = _mm256_add_ps(sum_vec, prod_a_vec);
sum_vec = _mm256_add_ps(sum_vec, prod_b_vec);
if (n) goto simsimd_js_f16_haswell_cycle;
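// The accumulated sum equals twice the Jensen-Shannon divergence expressed in base-2
// logarithms: multiplying by ln(2) / 2 converts it to natural logarithms and halves it,
// and the final square root turns the divergence into the Jensen-Shannon distance metric.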
simsimd_f32_t log2_normalizer = 0.693147181f;
simsimd_f32_t sum = _simsimd_reduce_f32x8_haswell(sum_vec);
sum *= log2_normalizer / 2;
*result = sum > 0 ? _simsimd_sqrt_f32_haswell(sum) : 0;
}
#pragma clang attribute pop
#pragma GCC pop_options
#endif // SIMSIMD_TARGET_HASWELL
#if SIMSIMD_TARGET_SKYLAKE
#pragma GCC push_options
#pragma GCC target("avx2", "avx512f", "avx512vl", "bmi2")
#pragma clang attribute push(__attribute__((target("avx2,avx512f,avx512vl,bmi2"))), apply_to = function)
SIMSIMD_INTERNAL __m512 _simsimd_log2_f32_skylake(__m512 x) {
// Extract the exponent and mantissa
__m512 one = _mm512_set1_ps(1.0f);
__m512 e = _mm512_getexp_ps(x);
__m512 m = _mm512_getmant_ps(x, _MM_MANT_NORM_1_2, _MM_MANT_SIGN_src);
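// AVX-512 `getexp` and `getmant` split x into its exponent e and a mantissa m normalized
// to [1, 2) in hardware, replacing the manual bit manipulation of the AVX2 variant above.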
// Compute the polynomial using Horner's method
__m512 p = _mm512_set1_ps(-3.4436006e-2f);
p = _mm512_fmadd_ps(m, p, _mm512_set1_ps(3.1821337e-1f));
p = _mm512_fmadd_ps(m, p, _mm512_set1_ps(-1.2315303f));
p = _mm512_fmadd_ps(m, p, _mm512_set1_ps(2.5988452f));
p = _mm512_fmadd_ps(m, p, _mm512_set1_ps(-3.3241990f));
p = _mm512_fmadd_ps(m, p, _mm512_set1_ps(3.1157899f));
return _mm512_add_ps(_mm512_mul_ps(p, _mm512_sub_ps(m, one)), e);
}
SIMSIMD_PUBLIC void simsimd_kl_f32_skylake(simsimd_f32_t const *a, simsimd_f32_t const *b, simsimd_size_t n,
simsimd_distance_t *result) {
__m512 sum_vec = _mm512_setzero();
simsimd_f32_t epsilon = SIMSIMD_F32_DIVISION_EPSILON;
__m512 epsilon_vec = _mm512_set1_ps(epsilon);
__m512 a_vec, b_vec;
simsimd_kl_f32_skylake_cycle:
if (n < 16) {
__mmask16 mask = (__mmask16)_bzhi_u32(0xFFFFFFFF, n);
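// `bzhi` keeps only the lowest n bits of the mask, so the masked loads below read exactly
// the n remaining elements and zero-fill the rest of the register.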
a_vec = _mm512_add_ps(_mm512_maskz_loadu_ps(mask, a), epsilon_vec);
b_vec = _mm512_add_ps(_mm512_maskz_loadu_ps(mask, b), epsilon_vec);
n = 0;
}
else {
a_vec = _mm512_add_ps(_mm512_loadu_ps(a), epsilon_vec);
b_vec = _mm512_add_ps(_mm512_loadu_ps(b), epsilon_vec);
a += 16, b += 16, n -= 16;
}
__m512 ratio_vec = _mm512_div_ps(a_vec, b_vec);
__m512 log_ratio_vec = _simsimd_log2_f32_skylake(ratio_vec);
__m512 prod_vec = _mm512_mul_ps(a_vec, log_ratio_vec);
sum_vec = _mm512_add_ps(sum_vec, prod_vec);
if (n) goto simsimd_kl_f32_skylake_cycle;
simsimd_f32_t log2_normalizer = 0.693147181f;
*result = _mm512_reduce_add_ps(sum_vec) * log2_normalizer;
}
SIMSIMD_PUBLIC void simsimd_js_f32_skylake(simsimd_f32_t const *a, simsimd_f32_t const *b, simsimd_size_t n,
simsimd_distance_t *result) {
__m512 sum_a_vec = _mm512_setzero();
__m512 sum_b_vec = _mm512_setzero();
simsimd_f32_t epsilon = SIMSIMD_F32_DIVISION_EPSILON;
__m512 epsilon_vec = _mm512_set1_ps(epsilon);
__m512 a_vec, b_vec;
simsimd_js_f32_skylake_cycle:
if (n < 16) {
__mmask16 mask = (__mmask16)_bzhi_u32(0xFFFFFFFF, n);
a_vec = _mm512_maskz_loadu_ps(mask, a);
b_vec = _mm512_maskz_loadu_ps(mask, b);
n = 0;
}
else {
a_vec = _mm512_loadu_ps(a);
b_vec = _mm512_loadu_ps(b);
a += 16, b += 16, n -= 16;
}
__m512 m_vec = _mm512_mul_ps(_mm512_add_ps(a_vec, b_vec), _mm512_set1_ps(0.5f));
__mmask16 nonzero_mask_a = _mm512_cmp_ps_mask(a_vec, epsilon_vec, _CMP_GE_OQ);
__mmask16 nonzero_mask_b = _mm512_cmp_ps_mask(b_vec, epsilon_vec, _CMP_GE_OQ);
__mmask16 nonzero_mask = nonzero_mask_a & nonzero_mask_b;
__m512 m_with_epsilon = _mm512_add_ps(m_vec, epsilon_vec);
__m512 m_recip_approx = _mm512_rcp14_ps(m_with_epsilon);
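// `rcp14` gives a reciprocal approximation with a maximum relative error of 2^-14, replacing
// two full-precision divisions per iteration with one approximation and two multiplications.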
__m512 ratio_a_vec = _mm512_mul_ps(_mm512_add_ps(a_vec, epsilon_vec), m_recip_approx);
__m512 ratio_b_vec = _mm512_mul_ps(_mm512_add_ps(b_vec, epsilon_vec), m_recip_approx);
__m512 log_ratio_a_vec = _simsimd_log2_f32_skylake(ratio_a_vec);
__m512 log_ratio_b_vec = _simsimd_log2_f32_skylake(ratio_b_vec);
sum_a_vec = _mm512_mask3_fmadd_ps(a_vec, log_ratio_a_vec, sum_a_vec, nonzero_mask);
sum_b_vec = _mm512_mask3_fmadd_ps(b_vec, log_ratio_b_vec, sum_b_vec, nonzero_mask);
if (n) goto simsimd_js_f32_skylake_cycle;
simsimd_f32_t log2_normalizer = 0.693147181f;
simsimd_f32_t sum = _mm512_reduce_add_ps(_mm512_add_ps(sum_a_vec, sum_b_vec));
sum *= log2_normalizer / 2;
*result = sum > 0 ? _simsimd_sqrt_f32_haswell(sum) : 0;
}
#pragma clang attribute pop
#pragma GCC pop_options
#endif // SIMSIMD_TARGET_SKYLAKE
#if SIMSIMD_TARGET_SAPPHIRE
#pragma GCC push_options
#pragma GCC target("avx2", "avx512f", "avx512vl", "bmi2", "avx512fp16")
#pragma clang attribute push(__attribute__((target("avx2,avx512f,avx512vl,bmi2,avx512fp16"))), apply_to = function)
SIMSIMD_INTERNAL __m512h _simsimd_log2_f16_sapphire(__m512h x) {
// Extract the exponent and mantissa
__m512h one = _mm512_set1_ph((simsimd_f16_t)1);
__m512h e = _mm512_getexp_ph(x);
__m512h m = _mm512_getmant_ph(x, _MM_MANT_NORM_1_2, _MM_MANT_SIGN_src);
// Compute the polynomial using Horner's method
__m512h p = _mm512_set1_ph((simsimd_f16_t)-3.4436006e-2f);
p = _mm512_fmadd_ph(m, p, _mm512_set1_ph((simsimd_f16_t)3.1821337e-1f));
p = _mm512_fmadd_ph(m, p, _mm512_set1_ph((simsimd_f16_t)-1.2315303f));
p = _mm512_fmadd_ph(m, p, _mm512_set1_ph((simsimd_f16_t)2.5988452f));
p = _mm512_fmadd_ph(m, p, _mm512_set1_ph((simsimd_f16_t)-3.3241990f));
p = _mm512_fmadd_ph(m, p, _mm512_set1_ph((simsimd_f16_t)3.1157899f));
return _mm512_add_ph(_mm512_mul_ph(p, _mm512_sub_ph(m, one)), e);
}
SIMSIMD_PUBLIC void simsimd_kl_f16_sapphire(simsimd_f16_t const *a, simsimd_f16_t const *b, simsimd_size_t n,
simsimd_distance_t *result) {
__m512h sum_vec = _mm512_setzero_ph();
__m512h epsilon_vec = _mm512_set1_ph((simsimd_f16_t)SIMSIMD_F16_DIVISION_EPSILON);
__m512h a_vec, b_vec;
simsimd_kl_f16_sapphire_cycle:
if (n < 32) {
__mmask32 mask = (__mmask32)_bzhi_u32(0xFFFFFFFF, n);
a_vec = _mm512_maskz_add_ph(mask, _mm512_castsi512_ph(_mm512_maskz_loadu_epi16(mask, a)), epsilon_vec);
b_vec = _mm512_maskz_add_ph(mask, _mm512_castsi512_ph(_mm512_maskz_loadu_epi16(mask, b)), epsilon_vec);
n = 0;
}
else {
a_vec = _mm512_add_ph(_mm512_castsi512_ph(_mm512_loadu_epi16(a)), epsilon_vec);
b_vec = _mm512_add_ph(_mm512_castsi512_ph(_mm512_loadu_epi16(b)), epsilon_vec);
a += 32, b += 32, n -= 32;
}
__m512h ratio_vec = _mm512_div_ph(a_vec, b_vec);
__m512h log_ratio_vec = _simsimd_log2_f16_sapphire(ratio_vec);
__m512h prod_vec = _mm512_mul_ph(a_vec, log_ratio_vec);
sum_vec = _mm512_add_ph(sum_vec, prod_vec);
if (n) goto simsimd_kl_f16_sapphire_cycle;
simsimd_f32_t log2_normalizer = 0.693147181f;
*result = _mm512_reduce_add_ph(sum_vec) * log2_normalizer;
}
SIMSIMD_PUBLIC void simsimd_js_f16_sapphire(simsimd_f16_t const *a, simsimd_f16_t const *b, simsimd_size_t n,
simsimd_distance_t *result) {
__m512h sum_a_vec = _mm512_setzero_ph();
__m512h sum_b_vec = _mm512_setzero_ph();
__m512h epsilon_vec = _mm512_set1_ph((simsimd_f16_t)SIMSIMD_F16_DIVISION_EPSILON);
__m512h a_vec, b_vec;
simsimd_js_f16_sapphire_cycle:
if (n < 32) {
__mmask32 mask = (__mmask32)_bzhi_u32(0xFFFFFFFF, n);
a_vec = _mm512_castsi512_ph(_mm512_maskz_loadu_epi16(mask, a));
b_vec = _mm512_castsi512_ph(_mm512_maskz_loadu_epi16(mask, b));
n = 0;
}
else {
a_vec = _mm512_castsi512_ph(_mm512_loadu_epi16(a));
b_vec = _mm512_castsi512_ph(_mm512_loadu_epi16(b));
a += 32, b += 32, n -= 32;
}
__m512h m_vec = _mm512_mul_ph(_mm512_add_ph(a_vec, b_vec), _mm512_set1_ph((simsimd_f16_t)0.5f));
__mmask32 nonzero_mask_a = _mm512_cmp_ph_mask(a_vec, epsilon_vec, _CMP_GE_OQ);
__mmask32 nonzero_mask_b = _mm512_cmp_ph_mask(b_vec, epsilon_vec, _CMP_GE_OQ);
__mmask32 nonzero_mask = nonzero_mask_a & nonzero_mask_b;
__m512h m_with_epsilon = _mm512_add_ph(m_vec, epsilon_vec);
__m512h m_recip_approx = _mm512_rcp_ph(m_with_epsilon);
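// As in the `f32` kernel above, an approximate reciprocal (`rcp_ph`) replaces two exact
// divisions; its error is comparable to the rounding error of `f16` arithmetic itself.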
__m512h ratio_a_vec = _mm512_mul_ph(_mm512_add_ph(a_vec, epsilon_vec), m_recip_approx);
__m512h ratio_b_vec = _mm512_mul_ph(_mm512_add_ph(b_vec, epsilon_vec), m_recip_approx);
__m512h log_ratio_a_vec = _simsimd_log2_f16_sapphire(ratio_a_vec);
__m512h log_ratio_b_vec = _simsimd_log2_f16_sapphire(ratio_b_vec);
sum_a_vec = _mm512_mask3_fmadd_ph(a_vec, log_ratio_a_vec, sum_a_vec, nonzero_mask);
sum_b_vec = _mm512_mask3_fmadd_ph(b_vec, log_ratio_b_vec, sum_b_vec, nonzero_mask);
if (n) goto simsimd_js_f16_sapphire_cycle;
simsimd_f32_t log2_normalizer = 0.693147181f;
simsimd_f32_t sum = _mm512_reduce_add_ph(_mm512_add_ph(sum_a_vec, sum_b_vec));
sum *= log2_normalizer / 2;
*result = sum > 0 ? _simsimd_sqrt_f32_haswell(sum) : 0;
}
#pragma clang attribute pop
#pragma GCC pop_options
#endif // SIMSIMD_TARGET_SAPPHIRE
#endif // _SIMSIMD_TARGET_X86
#ifdef __cplusplus
}
#endif
#endif
simsimd-6.5.12/include/simsimd/simsimd.h 0000644 0000000 0000000 00000371173 10461020230 0016306 0 ustar 0000000 0000000 /**
* @file simsimd.h
* @brief SIMD-accelerated Similarity Measures and Distance Functions.
* @author Ash Vardanian
* @date March 14, 2023
*
* References:
* x86 intrinsics: https://www.intel.com/content/www/us/en/docs/intrinsics-guide
* Arm intrinsics: https://developer.arm.com/architectures/instruction-sets/intrinsics
* Detecting target CPU features at compile time: https://stackoverflow.com/a/28939692/2766161
*
* @section Choosing x86 Target Generations
*
* It's important to provide fine-grained controls over AVX512 families, as they are very fragmented:
*
* - Intel Skylake servers: F, CD, VL, DQ, BW
* - Intel Cascade Lake workstations: F, CD, VL, DQ, BW, VNNI
* > In other words, it extends Skylake with VNNI support
* - Intel Sunny Cove (Ice Lake) servers:
* F, CD, VL, DQ, BW, VNNI, VPOPCNTDQ, IFMA, VBMI, VAES, GFNI, VBMI2, BITALG, VPCLMULQDQ
* - AMD Zen4 (Genoa):
* F, CD, VL, DQ, BW, VNNI, VPOPCNTDQ, IFMA, VBMI, VAES, GFNI, VBMI2, BITALG, VPCLMULQDQ, BF16
* > In other words, it extends Sunny Cove with BF16 support
* - Intel Golden Cove (Sapphire Rapids): extends Zen4 and Sunny Cove with FP16 support
* - AMD Zen5 (Turin): makes VP2INTERSECT cool again
*
* Intel Palm Cove was an irrelevant intermediate release extending Skylake with IFMA and VBMI.
* Intel Willow Cove was an irrelevant intermediate release extending Sunny Cove with VP2INTERSECT,
* which is not supported by other CPUs to date and is only available in Tiger Lake laptops.
* Intel Cooper Lake was the only intermediary platform that supported BF16, but not FP16.
* It's mostly used in 4-socket and 8-socket high-memory configurations.
*
* For us, it makes sense to differentiate only these AVX512 generations:
* 1. Intel Skylake (pre 2019): supports single-precision dot-products.
* 2. Intel Ice Lake (2019-2021): advanced integer algorithms.
* 3. AMD Genoa (2023+): brain-floating point support.
* 4. Intel Sapphire Rapids (2023+): advanced mixed-precision float processing.
* 5. AMD Turin (2024+): advanced sparse algorithms.
*
* Beyond those, we support AVX2 for the old Haswell generation of CPUs, and AVX2+VNNI for the modern Sierra generation.
*
* To list all available macros for x86, take a recent compiler, like GCC 12 and run:
* gcc-12 -march=sapphirerapids -dM -E - < /dev/null | egrep "SSE|AVX" | sort
* On Arm machines you may want to check for other flags:
* gcc-12 -march=native -dM -E - < /dev/null | egrep "NEON|SVE|FP16|FMA" | sort
*
* @section Choosing Arm Target Generations
*
* Arm CPUs share design IP, but are produced by different vendors, potentially making the platform
* even more fragmented than x86. There are 2 important families of SIMD extensions - NEON and SVE.
*
* - Armv8-A: +fp, +simd
* - Armv8.1-A: armv8-a, +crc, +lse, +rdma
* - Armv8.2-A: armv8.1-a
* - Armv8.3-A: armv8.2-a, +pauth
* - Armv8.4-A: armv8.3-a, +flagm, +fp16fml, +dotprod
* - Armv8.5-A: armv8.4-a, +sb, +ssbs, +predres
* - Armv8.6-A: armv8.5-a, +bf16, +i8mm
* - Armv8.7-A: armv8.6-a, +ls64
* - Armv8.8-A: armv8.7-a, +mops
* - Armv8.9-A: armv8.8-a
* - Armv9-A: armv8.5-a, +sve, +sve2
* - Armv9.1-A: armv9-a, +bf16, +i8mm
* - Armv9.2-A: armv9.1-a, +ls64
* - Armv9.3-A: armv9.2-a, +mops
* - Armv9.4-A: armv9.3-a
*
* SVE has been optional since Armv8.2-A, but it's a requirement for Armv9.0-A.
* A 512-bit SVE variant has already been implemented on the Fugaku supercomputer.
* A more flexible version, 2x256 SVE, was implemented by the AWS Graviton3 ARM processor.
* Here are the most important recent families of CPU cores designed by Arm:
*
* - Neoverse N1: armv8.2-a, extended with Armv8.4 "dotprod" instructions.
* Used in AWS @b Graviton2 and Ampere @b Altra.
* https://developer.arm.com/Processors/Neoverse%20N1
* - Neoverse V1: armv8.4-a, extended with Armv8.6 bfloat/int8 "matmul" instructions.
* Used in AWS @b Graviton3, which also enables `sve`, `svebf16`, and `svei8mm`.
* https://developer.arm.com/Processors/Neoverse%20V1
* - Neoverse V2: armv9.0 with SVE2 and SVE bit-permutes
* Used in AWS @b Graviton4, NVIDIA @b Grace, Google @b Axion.
* https://developer.arm.com/Processors/Neoverse%20V2
* The N2 core is very similar to V2 and is used by Microsoft @b Cobalt.
* https://developer.arm.com/Processors/Neoverse%20N2
*
* On the consumer side, Apple is the biggest player with mobile @b A chips and desktop @b M chips.
* The M1 implements Armv8.5-A, both M2 and M3 implement Armv8.6-A, and M4 is expected to have Armv9.1-A.
*/
#ifndef SIMSIMD_H
#define SIMSIMD_H
#define SIMSIMD_VERSION_MAJOR 6
#define SIMSIMD_VERSION_MINOR 5
#define SIMSIMD_VERSION_PATCH 12
/**
* @brief Removes compile-time dispatching and replaces it with runtime dispatching.
* So the `simsimd_dot_f32` function will invoke the most advanced backend supported by the CPU
* that runs the program, rather than the most advanced backend supported by the CPU
* used to compile the library or the downstream application.
*/
#if !defined(SIMSIMD_DYNAMIC_DISPATCH)
#define SIMSIMD_DYNAMIC_DISPATCH (0) // true or false
#endif
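/*
 * A minimal usage sketch, not part of the library: it assumes the library itself was built with
 * `SIMSIMD_DYNAMIC_DISPATCH=1`, and that the `simsimd_dot_f32` convenience function mentioned
 * above follows the dense-kernel signature (a, b, n, result) declared later in this header.
 * The buffer contents and sizes are illustrative.
 *
 *     #define SIMSIMD_DYNAMIC_DISPATCH 1
 *     #include <simsimd/simsimd.h>
 *
 *     void example(void) {
 *         simsimd_f32_t a[256] = {0}, b[256] = {0};
 *         simsimd_distance_t distance;
 *         simsimd_dot_f32(a, b, 256, &distance); // backend is resolved at runtime
 *     }
 */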
#include "binary.h" // Hamming, Jaccard
#include "curved.h" // Mahalanobis, Bilinear Forms
#include "dot.h" // Inner (dot) product, and its conjugate
#include "elementwise.h" // Weighted Sum, Fused-Multiply-Add
#include "geospatial.h" // Haversine and Vincenty
#include "probability.h" // Kullback-Leibler, Jensen–Shannon
#include "sparse.h" // Intersect
#include "spatial.h" // L2, Cosine
// On Apple Silicon, `mrs` is not allowed in user-space, so we need to use the `sysctl` API.
#if defined(_SIMSIMD_DEFINED_APPLE)
#include <fenv.h>       // `fesetenv` - part of the C99 standard
#include <sys/sysctl.h> // `sysctlbyname`
#endif
// Detect POSIX extensions availability for signal handling.
// POSIX extensions provide `sigaction`, `sigjmp_buf`, and `sigsetjmp` for safe signal handling.
// These are needed on Linux ARM for safely testing `mrs` instruction availability.
#if defined(_SIMSIMD_DEFINED_LINUX) && defined(_POSIX_VERSION)
#include <setjmp.h> // `sigjmp_buf`, `sigsetjmp`, `siglongjmp`
#include <signal.h> // `sigaction`, `SIGILL`
#define _SIMSIMD_HAS_POSIX_EXTENSIONS 1
#else
#define _SIMSIMD_HAS_POSIX_EXTENSIONS 0
#endif
// On Windows ARM, we use IsProcessorFeaturePresent API for capability detection
#if defined(_SIMSIMD_DEFINED_WINDOWS) && _SIMSIMD_TARGET_ARM
#include <windows.h> // `IsProcessorFeaturePresent`
#endif
#ifdef __cplusplus
extern "C" {
#endif
/**
* @brief Enumeration of supported metric kinds.
* Some have aliases for convenience.
*/
typedef enum {
simsimd_metric_unknown_k = 0, ///< Unknown metric kind
// Classics:
simsimd_metric_dot_k = 'i', ///< Inner product
simsimd_metric_inner_k = 'i', ///< Inner product alias
simsimd_metric_vdot_k = 'v', ///< Complex inner product
simsimd_metric_cos_k = 'c', ///< Cosine similarity
simsimd_metric_cosine_k = 'c', ///< Cosine similarity alias
simsimd_metric_angular_k = 'c', ///< Cosine similarity alias
simsimd_metric_l2_k = '2', ///< Euclidean distance alias
simsimd_metric_euclidean_k = '2', ///< Euclidean distance alias
simsimd_metric_l2sq_k = 'e', ///< Squared Euclidean distance
simsimd_metric_sqeuclidean_k = 'e', ///< Squared Euclidean distance alias
// Binary:
simsimd_metric_hamming_k = 'h', ///< Hamming distance
simsimd_metric_manhattan_k = 'h', ///< Manhattan distance is same as Hamming
simsimd_metric_jaccard_k = 'j', ///< Jaccard coefficient
simsimd_metric_tanimoto_k = 'j', ///< Tanimoto coefficient is same as Jaccard
// Sets:
simsimd_metric_intersect_k = 'x', ///< Equivalent to unnormalized Jaccard
simsimd_metric_spdot_counts_k = 'y', ///< Sparse sets with integer weights
simsimd_metric_spdot_weights_k = 'z', ///< Sparse sets with brain floating-point weights
// Curved Spaces:
simsimd_metric_bilinear_k = 'b', ///< Bilinear form
simsimd_metric_mahalanobis_k = 'm', ///< Mahalanobis distance
// Probability:
simsimd_metric_kl_k = 'k', ///< Kullback-Leibler divergence
simsimd_metric_kullback_leibler_k = 'k', ///< Kullback-Leibler divergence alias
simsimd_metric_js_k = 's', ///< Jensen-Shannon divergence
simsimd_metric_jensen_shannon_k = 's', ///< Jensen-Shannon divergence alias
// BLAS-like operations:
simsimd_metric_fma_k = 'f', ///< Fused Multiply-Add
simsimd_metric_wsum_k = 'w', ///< Weighted Sum
} simsimd_metric_kind_t;
/**
* @brief Enumeration of SIMD capabilities of the target architecture.
*/
typedef enum {
simsimd_cap_serial_k = 1, ///< Serial (non-SIMD) capability
simsimd_cap_any_k = 0x7FFFFFFF, ///< Mask representing any capability with `INT_MAX`
simsimd_cap_haswell_k = 1 << 10, ///< x86 AVX2 capability with FMA and F16C extensions
simsimd_cap_skylake_k = 1 << 11, ///< x86 AVX512 baseline capability
simsimd_cap_ice_k = 1 << 12, ///< x86 AVX512 capability with advanced integer algos
simsimd_cap_genoa_k = 1 << 13, ///< x86 AVX512 capability with `bf16` support
simsimd_cap_sapphire_k = 1 << 14, ///< x86 AVX512 capability with `f16` support
simsimd_cap_turin_k = 1 << 15, ///< x86 AVX512 capability with conflict detection
simsimd_cap_sierra_k = 1 << 16, ///< x86 AVX2+VNNI capability with `i8` dot-products
simsimd_cap_neon_k = 1 << 20, ///< ARM NEON baseline capability
simsimd_cap_neon_f16_k = 1 << 21, ///< ARM NEON `f16` capability
simsimd_cap_neon_bf16_k = 1 << 22, ///< ARM NEON `bf16` capability
simsimd_cap_neon_i8_k = 1 << 23, ///< ARM NEON `i8` capability
simsimd_cap_sve_k = 1 << 24, ///< ARM SVE baseline capability
simsimd_cap_sve_f16_k = 1 << 25, ///< ARM SVE `f16` capability
simsimd_cap_sve_bf16_k = 1 << 26, ///< ARM SVE `bf16` capability
simsimd_cap_sve_i8_k = 1 << 27, ///< ARM SVE `i8` capability
simsimd_cap_sve2_k = 1 << 28, ///< ARM SVE2 capability
simsimd_cap_sve2p1_k = 1 << 29, ///< ARM SVE2p1 capability
} simsimd_capability_t;
/**
* @brief Enumeration of supported data types.
*
* Includes complex type descriptors, which in C code are represented by their real counterparts,
* but the separate flags carry the metadata needed to pass type information between
* programming-language interfaces.
*/
typedef enum {
simsimd_datatype_unknown_k = 0, ///< Unknown data type
simsimd_datatype_b8_k = 1 << 1, ///< Single-bit values packed into 8-bit words
simsimd_datatype_b1x8_k = simsimd_datatype_b8_k, ///< Single-bit values packed into 8-bit words
simsimd_datatype_i4x2_k = 1 << 19, ///< 4-bit signed integers packed into 8-bit words
simsimd_datatype_i8_k = 1 << 2, ///< 8-bit signed integer
simsimd_datatype_i16_k = 1 << 3, ///< 16-bit signed integer
simsimd_datatype_i32_k = 1 << 4, ///< 32-bit signed integer
simsimd_datatype_i64_k = 1 << 5, ///< 64-bit signed integer
simsimd_datatype_u8_k = 1 << 6, ///< 8-bit unsigned integer
simsimd_datatype_u16_k = 1 << 7, ///< 16-bit unsigned integer
simsimd_datatype_u32_k = 1 << 8, ///< 32-bit unsigned integer
simsimd_datatype_u64_k = 1 << 9, ///< 64-bit unsigned integer
simsimd_datatype_f64_k = 1 << 10, ///< Double precision floating point
simsimd_datatype_f32_k = 1 << 11, ///< Single precision floating point
simsimd_datatype_f16_k = 1 << 12, ///< Half precision floating point
simsimd_datatype_bf16_k = 1 << 13, ///< Brain floating point
simsimd_datatype_f64c_k = 1 << 20, ///< Complex double precision floating point
simsimd_datatype_f32c_k = 1 << 21, ///< Complex single precision floating point
simsimd_datatype_f16c_k = 1 << 22, ///< Complex half precision floating point
simsimd_datatype_bf16c_k = 1 << 23, ///< Complex brain floating point
} simsimd_datatype_t;
/**
* @brief Type-punned function pointer for dense vector representations and simplest similarity measures.
*
* @param[in] a Pointer to the first data array.
* @param[in] b Pointer to the second data array.
* @param[in] n Number of scalar words in the input arrays.
* When dealing with sub-byte types, the number of scalar words is the number of bytes.
* When dealing with complex types, the number of scalar words is the sum of real and imaginary parts.
* @param[out] d Output value as a double-precision float.
* In complex dot-products @b two scalars are exported for the real and imaginary parts.
*/
typedef void (*simsimd_metric_dense_punned_t)(void const *a, void const *b, simsimd_size_t n, simsimd_distance_t *d);
/**
* @brief Type-punned function pointer for sparse vector representations and similarity measures.
*
* @param[in] a Pointer to the first data array, generally a sorted array of integers.
* @param[in] b Pointer to the second data array, generally a sorted array of integers.
* @param[in] a_length Number of scalar words in the first input array.
* @param[in] b_length Number of scalar words in the second input array.
* @param[out] d Output value as a double-precision float, generally without decimals.
*/
typedef void (*simsimd_metric_sparse_punned_t)( //
void const *a, void const *b, //
simsimd_size_t a_length, simsimd_size_t b_length, //
simsimd_distance_t *d);
/**
* @brief Type-punned function pointer for curved vector spaces and similarity measures.
*
* @param[in] a Pointer to the first data array.
* @param[in] b Pointer to the second data array.
* @param[in] c Pointer to the metric tensor array or some covariance matrix.
* @param[in] n Number of scalar words in the input arrays.
* @param[out] d Output value as a double-precision float.
*/
typedef void (*simsimd_metric_curved_punned_t)( //
void const *a, void const *b, void const *c, //
simsimd_size_t n, simsimd_distance_t *d);
/**
* @brief Type-punned function pointer for FMA operations on dense vector representations.
* Implements the `y = alpha * a * b + beta * c` operation.
*
* @param[in] a Pointer to the first data array.
* @param[in] b Pointer to the second data array.
* @param[in] c Pointer to the third data array.
* @param[in] n Number of scalar words in the input arrays.
* @param[in] alpha Scaling factor for the first two arrays.
* @param[in] beta Scaling factor for the third array.
* @param[out] y Output value in the same precision as the input arrays.
*/
typedef void (*simsimd_kernel_fma_punned_t)( //
void const *a, void const *b, void const *c, //
simsimd_size_t n, simsimd_distance_t alpha, simsimd_distance_t beta, void *y);
/**
* @brief Type-punned function pointer for Weighted Sum operations on dense vector representations.
* Implements the `y = alpha * a + beta * b` operation.
*
* @param[in] a Pointer to the first data array.
* @param[in] b Pointer to the second data array.
* @param[in] n Number of scalar words in the input arrays.
* @param[in] alpha Scaling factor for the first array.
* @param[in] beta Scaling factor for the second array.
* @param[out] y Output value in the same precision as the input arrays.
*/
typedef void (*simsimd_kernel_wsum_punned_t)( //
void const *a, void const *b, //
simsimd_size_t n, simsimd_distance_t alpha, simsimd_distance_t beta, void *y);
/**
* @brief Type-punned function pointer for a SimSIMD public interface.
*
* Can be a `simsimd_metric_dense_punned_t`, `simsimd_metric_sparse_punned_t`, `simsimd_metric_curved_punned_t`,
* `simsimd_kernel_fma_punned_t`, or `simsimd_kernel_wsum_punned_t`.
*/
typedef void (*simsimd_kernel_punned_t)(void *);
#if SIMSIMD_DYNAMIC_DISPATCH
SIMSIMD_DYNAMIC simsimd_capability_t simsimd_capabilities(void);
SIMSIMD_DYNAMIC void simsimd_find_kernel_punned( //
simsimd_metric_kind_t kind, //
simsimd_datatype_t datatype, //
simsimd_capability_t supported, //
simsimd_capability_t allowed, //
simsimd_kernel_punned_t *kernel_output, //
simsimd_capability_t *capability_output);
SIMSIMD_DYNAMIC int simsimd_flush_denormals(void);
#else
SIMSIMD_PUBLIC simsimd_capability_t simsimd_capabilities(void);
SIMSIMD_PUBLIC void simsimd_find_kernel_punned( //
simsimd_metric_kind_t kind, //
simsimd_datatype_t datatype, //
simsimd_capability_t supported, //
simsimd_capability_t allowed, //
simsimd_kernel_punned_t *kernel_output, //
simsimd_capability_t *capability_output);
SIMSIMD_PUBLIC int simsimd_flush_denormals(void);
#endif
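/*
 * A minimal sketch of the dispatching workflow declared above: query the runtime capabilities,
 * resolve a type-punned kernel for a metric/datatype pair, and cast it back to the dense
 * signature before calling it. The chosen metric, datatype, and buffer sizes are illustrative.
 *
 *     void example(void) {
 *         simsimd_capability_t supported = simsimd_capabilities();
 *         simsimd_kernel_punned_t kernel = 0;
 *         simsimd_capability_t used = simsimd_cap_serial_k;
 *         simsimd_find_kernel_punned(simsimd_metric_cos_k, simsimd_datatype_f32_k, supported,
 *                                    simsimd_cap_any_k, &kernel, &used);
 *         simsimd_metric_dense_punned_t cos_f32 = (simsimd_metric_dense_punned_t)kernel;
 *         simsimd_f32_t a[256] = {0}, b[256] = {0};
 *         simsimd_distance_t distance;
 *         if (cos_f32) cos_f32(a, b, 256, &distance);
 *     }
 */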
#if _SIMSIMD_TARGET_X86
/**
* @brief Function to flush denormalized numbers to zero on x86 CPUs.
* @note This should be called on each thread before any SIMD operations to avoid performance penalties.
* @return 1 if the operation was successful, 0 otherwise.
*/
SIMSIMD_PUBLIC int _simsimd_flush_denormals_x86(void) {
#if defined(_MSC_VER)
unsigned int mxcsr = _mm_getcsr();
mxcsr |= 1 << 15; // bit 15 = Flush-To-Zero (FTZ)
mxcsr |= 1 << 6; // bit 6 = Denormals-Are-Zero (DAZ)
_mm_setcsr(mxcsr);
#else // GCC, Clang, ICC
unsigned int mxcsr;
__asm__ __volatile__("stmxcsr %0" : "=m"(mxcsr));
mxcsr |= 1 << 15; // bit 15 = Flush-To-Zero (FTZ)
mxcsr |= 1 << 6; // bit 6 = Denormals-Are-Zero (DAZ)
__asm__ __volatile__("ldmxcsr %0" : : "m"(mxcsr));
#endif
return 1;
}
/**
* @brief Function to determine the SIMD capabilities of the current 64-bit x86 machine at @b runtime.
* @return A bitmask of the SIMD capabilities represented as a `simsimd_capability_t` enum value.
*/
SIMSIMD_PUBLIC simsimd_capability_t _simsimd_capabilities_x86(void) {
/// The states of 4 registers populated for a specific "cpuid" assembly call
union four_registers_t {
int array[4];
struct separate_t {
unsigned eax, ebx, ecx, edx;
} named;
} info1, info7, info7sub1;
#if defined(_MSC_VER)
__cpuidex(info1.array, 1, 0);
__cpuidex(info7.array, 7, 0);
__cpuidex(info7sub1.array, 7, 1);
#else // GCC, Clang, ICC
__asm__ __volatile__( //
"cpuid"
: "=a"(info1.named.eax), "=b"(info1.named.ebx), "=c"(info1.named.ecx), "=d"(info1.named.edx)
: "a"(1), "c"(0));
__asm__ __volatile__( //
"cpuid"
: "=a"(info7.named.eax), "=b"(info7.named.ebx), "=c"(info7.named.ecx), "=d"(info7.named.edx)
: "a"(7), "c"(0));
__asm__ __volatile__( //
"cpuid"
: "=a"(info7sub1.named.eax), "=b"(info7sub1.named.ebx), "=c"(info7sub1.named.ecx), "=d"(info7sub1.named.edx)
: "a"(7), "c"(1));
#endif
// Check for AVX2 (Function ID 7, EBX register)
// https://github.com/llvm/llvm-project/blob/50598f0ff44f3a4e75706f8c53f3380fe7faa896/clang/lib/Headers/cpuid.h#L148
unsigned supports_avx2 = (info7.named.ebx & 0x00000020) != 0;
// Check for F16C (Function ID 1, ECX register)
// https://github.com/llvm/llvm-project/blob/50598f0ff44f3a4e75706f8c53f3380fe7faa896/clang/lib/Headers/cpuid.h#L107
unsigned supports_f16c = (info1.named.ecx & 0x20000000) != 0;
unsigned supports_fma = (info1.named.ecx & 0x00001000) != 0;
// Check for AVX512F (Function ID 7, EBX register)
// https://github.com/llvm/llvm-project/blob/50598f0ff44f3a4e75706f8c53f3380fe7faa896/clang/lib/Headers/cpuid.h#L155
unsigned supports_avx512f = (info7.named.ebx & 0x00010000) != 0;
// Check for AVX512FP16 (Function ID 7, EDX register)
// https://github.com/llvm/llvm-project/blob/50598f0ff44f3a4e75706f8c53f3380fe7faa896/clang/lib/Headers/cpuid.h#L198C9-L198C23
unsigned supports_avx512fp16 = (info7.named.edx & 0x00800000) != 0;
// Check for AVX512VNNI (Function ID 7, ECX register)
unsigned supports_avx512vnni = (info7.named.ecx & 0x00000800) != 0;
// Check for AVX512IFMA (Function ID 7, EBX register)
unsigned supports_avx512ifma = (info7.named.ebx & 0x00200000) != 0;
// Check for AVX512BITALG (Function ID 7, ECX register)
unsigned supports_avx512bitalg = (info7.named.ecx & 0x00001000) != 0;
// Check for AVX512VBMI2 (Function ID 7, ECX register)
unsigned supports_avx512vbmi2 = (info7.named.ecx & 0x00000040) != 0;
// Check for AVX512VPOPCNTDQ (Function ID 7, ECX register)
unsigned supports_avx512vpopcntdq = (info7.named.ecx & 0x00004000) != 0;
// Check for AVX512BF16 (Function ID 7, Sub-leaf 1, EAX register)
// https://github.com/llvm/llvm-project/blob/50598f0ff44f3a4e75706f8c53f3380fe7faa896/clang/lib/Headers/cpuid.h#L205
unsigned supports_avx512bf16 = (info7sub1.named.eax & 0x00000020) != 0;
// Clang doesn't show the VP2INTERSECT flag, but we can get it from QEMU
// https://stackoverflow.com/a/68289220/2766161
unsigned supports_avx512vp2intersect = (info7.named.edx & 0x00000100) != 0;
// Convert specific features into CPU generations
unsigned supports_haswell = supports_avx2 && supports_f16c && supports_fma;
unsigned supports_skylake = supports_avx512f;
unsigned supports_ice = supports_avx512vnni && supports_avx512ifma && supports_avx512bitalg &&
supports_avx512vbmi2 && supports_avx512vpopcntdq;
unsigned supports_genoa = supports_avx512bf16;
unsigned supports_sapphire = supports_avx512fp16;
// We don't want to accidentally enable AVX512VP2INTERSECT on Intel Tiger Lake CPUs
unsigned supports_turin = supports_avx512vp2intersect && supports_avx512bf16;
unsigned supports_sierra = 0;
return (simsimd_capability_t)( //
(simsimd_cap_haswell_k * supports_haswell) | //
(simsimd_cap_skylake_k * supports_skylake) | //
(simsimd_cap_ice_k * supports_ice) | //
(simsimd_cap_genoa_k * supports_genoa) | //
(simsimd_cap_sapphire_k * supports_sapphire) | //
(simsimd_cap_turin_k * supports_turin) | //
(simsimd_cap_sierra_k * supports_sierra) | //
(simsimd_cap_serial_k));
}
#endif // _SIMSIMD_TARGET_X86
#if _SIMSIMD_TARGET_ARM
/* When compiling the next section, one may get: selected processor does not support system register name 'id_aa64zfr0_el1'.
* Suppressing assembler errors is very complicated, so when dealing with older ARM CPUs it's simpler to compile this
* function targeting newer ones.
*/
#pragma GCC push_options
#pragma GCC target("arch=armv8.5-a+sve")
#pragma clang attribute push(__attribute__((target("arch=armv8.5-a+sve"))), apply_to = function)
#if _SIMSIMD_HAS_POSIX_EXTENSIONS
/** @brief SIGILL handler for `mrs` instruction testing on Linux ARM */
static sigjmp_buf _simsimd_mrs_test_jump_buffer;
static void _simsimd_mrs_test_sigill_handler(int sig) {
(void)sig; // Unused parameter
siglongjmp(_simsimd_mrs_test_jump_buffer, 1);
}
#endif
/**
* @brief Function to flush denormalized numbers to zero on Arm CPUs.
* @note This should be called on each thread before any SIMD operations to avoid performance penalties.
* @note On Apple Silicon, `mrs` is not allowed in user-space, so we need to use the `sysctl` API.
* @return 1 if the operation was successful, 0 otherwise.
*/
SIMSIMD_PUBLIC int _simsimd_flush_denormals_arm(void) {
#if defined(_SIMSIMD_DEFINED_APPLE)
// https://stackoverflow.com/a/19904907/2766161
// https://stackoverflow.com/a/78252076/2766161
int is_success = fesetenv(FE_DFL_DISABLE_DENORMS_ENV) == 0;
return is_success;
#elif defined(_SIMSIMD_DEFINED_LINUX)
// For Linux, we can toggle bits in the Floating-point Control Register (FPCR)
// https://developer.arm.com/documentation/ddi0601/2024-12/AArch64-Registers/FPCR--Floating-point-Control-Register
uint64_t fpcr;
__asm__ volatile("mrs %0, fpcr" : "=r"(fpcr));
fpcr |= (1 << 19); // bit 19 = FZ16 (Flush half-precision to zero)
fpcr |= (1 << 24); // bit 24 = FZ (Flush subnormals to zero)
fpcr |= (1 << 25); // bit 25 = DN (Force Default NaN instead of preserving payload)
__asm__ volatile("msr fpcr, %0" : : "r"(fpcr));
return 1;
#else
return 0;
#endif
}
/**
* @brief Function to determine the SIMD capabilities of the current 64-bit Arm machine at @b runtime.
* @return A bitmask of the SIMD capabilities represented as a `simsimd_capability_t` enum value.
*/
SIMSIMD_PUBLIC simsimd_capability_t _simsimd_capabilities_arm(void) {
#if defined(_SIMSIMD_DEFINED_APPLE)
// On Apple Silicon, `mrs` is not allowed in user-space, so we need to use the `sysctl` API.
unsigned supports_neon = 0, supports_fp16 = 0, supports_bf16 = 0, supports_i8mm = 0;
size_t size = sizeof(supports_neon);
if (sysctlbyname("hw.optional.neon", &supports_neon, &size, NULL, 0) != 0) supports_neon = 0;
if (sysctlbyname("hw.optional.arm.FEAT_FP16", &supports_fp16, &size, NULL, 0) != 0) supports_fp16 = 0;
if (sysctlbyname("hw.optional.arm.FEAT_BF16", &supports_bf16, &size, NULL, 0) != 0) supports_bf16 = 0;
if (sysctlbyname("hw.optional.arm.FEAT_I8MM", &supports_i8mm, &size, NULL, 0) != 0) supports_i8mm = 0;
return (simsimd_capability_t)( //
(simsimd_cap_neon_k * (supports_neon)) | //
(simsimd_cap_neon_f16_k * (supports_neon && supports_fp16)) | //
(simsimd_cap_neon_bf16_k * (supports_neon && supports_bf16)) | //
(simsimd_cap_neon_i8_k * (supports_neon && supports_i8mm)) | //
(simsimd_cap_serial_k));
#elif defined(_SIMSIMD_DEFINED_LINUX)
// Depending on the environment, reading system registers may cause SIGILL.
// One option to avoid the crash is to use the Linux `getauxval(AT_HWCAP)` and `getauxval(AT_HWCAP2)` APIs,
// but those aren't as informative as reading the registers directly.
// So before reading the ID registers, we set up a signal handler to catch SIGILL
// and probe one of the registers, reverting back to the old signal handler afterwards.
//
// This issue was originally observed in: https://github.com/ashvardanian/SimSIMD/issues/279
#if _SIMSIMD_HAS_POSIX_EXTENSIONS
struct sigaction action_new, action_old;
action_new.sa_handler = _simsimd_mrs_test_sigill_handler;
sigemptyset(&action_new.sa_mask);
action_new.sa_flags = 0;
int mrs_works = 0;
if (sigaction(SIGILL, &action_new, &action_old) == 0) {
if (sigsetjmp(_simsimd_mrs_test_jump_buffer, 1) == 0) {
unsigned long midr_value;
__asm__ __volatile__("mrs %0, MIDR_EL1" : "=r"(midr_value));
mrs_works = 1;
}
sigaction(SIGILL, &action_old, NULL);
}
// Early exit if `mrs` doesn't work - return conservative NEON-only capabilities
if (!mrs_works) return (simsimd_capability_t)(simsimd_cap_neon_k | simsimd_cap_serial_k);
#else // _SIMSIMD_HAS_POSIX_EXTENSIONS
// Without POSIX signal handlers, fall back to conservative NEON capabilities.
return (simsimd_capability_t)(simsimd_cap_neon_k | simsimd_cap_serial_k);
#endif // _SIMSIMD_HAS_POSIX_EXTENSIONS
// Read CPUID registers directly
unsigned long id_aa64isar0_el1 = 0, id_aa64isar1_el1 = 0, id_aa64pfr0_el1 = 0, id_aa64zfr0_el1 = 0;
// Now let's unpack the status flags from ID_AA64ISAR0_EL1
// https://developer.arm.com/documentation/ddi0601/2024-03/AArch64-Registers/ID-AA64ISAR0-EL1--AArch64-Instruction-Set-Attribute-Register-0?lang=en
__asm__ __volatile__("mrs %0, ID_AA64ISAR0_EL1" : "=r"(id_aa64isar0_el1));
// DP, bits [47:44] of ID_AA64ISAR0_EL1
unsigned supports_integer_dot_products = ((id_aa64isar0_el1 >> 44) & 0xF) >= 1;
// Now let's unpack the status flags from ID_AA64ISAR1_EL1
// https://developer.arm.com/documentation/ddi0601/2024-03/AArch64-Registers/ID-AA64ISAR1-EL1--AArch64-Instruction-Set-Attribute-Register-1?lang=en
__asm__ __volatile__("mrs %0, ID_AA64ISAR1_EL1" : "=r"(id_aa64isar1_el1));
// I8MM, bits [55:52] of ID_AA64ISAR1_EL1
unsigned supports_i8mm = ((id_aa64isar1_el1 >> 52) & 0xF) >= 1;
// BF16, bits [47:44] of ID_AA64ISAR1_EL1
unsigned supports_bf16 = ((id_aa64isar1_el1 >> 44) & 0xF) >= 1;
// Now let's unpack the status flags from ID_AA64PFR0_EL1
// https://developer.arm.com/documentation/ddi0601/2024-03/AArch64-Registers/ID-AA64PFR0-EL1--AArch64-Processor-Feature-Register-0?lang=en
__asm__ __volatile__("mrs %0, ID_AA64PFR0_EL1" : "=r"(id_aa64pfr0_el1));
// SVE, bits [35:32] of ID_AA64PFR0_EL1
unsigned supports_sve = ((id_aa64pfr0_el1 >> 32) & 0xF) >= 1;
// AdvSIMD, bits [23:20] of ID_AA64PFR0_EL1 can be used to check for `fp16` support
// - 0b0000: integers, single, double precision arithmetic
// - 0b0001: includes support for half-precision floating-point arithmetic
// - 0b1111: NEON is not supported?!
// That's a really weird way to encode lack of NEON support, but it's important to
// check in case we are running on R-profile CPUs.
unsigned supports_fp16 = ((id_aa64pfr0_el1 >> 20) & 0xF) == 0x1;
unsigned supports_neon = ((id_aa64pfr0_el1 >> 20) & 0xF) != 0xF;
// Now let's unpack the status flags from ID_AA64ZFR0_EL1
// https://developer.arm.com/documentation/ddi0601/2024-03/AArch64-Registers/ID-AA64ZFR0-EL1--SVE-Feature-ID-Register-0?lang=en
if (supports_sve) __asm__ __volatile__("mrs %0, ID_AA64ZFR0_EL1" : "=r"(id_aa64zfr0_el1));
// I8MM, bits [47:44] of ID_AA64ZFR0_EL1
unsigned supports_sve_i8mm = ((id_aa64zfr0_el1 >> 44) & 0xF) >= 1;
// BF16, bits [23:20] of ID_AA64ZFR0_EL1
unsigned supports_sve_bf16 = ((id_aa64zfr0_el1 >> 20) & 0xF) >= 1;
// SVEver, bits [3:0] can be used to check for capability levels:
// - 0b0000: SVE is implemented
// - 0b0001: SVE2 is implemented
// - 0b0010: SVE2.1 is implemented
// This value must match the existing indicator obtained from ID_AA64PFR0_EL1:
unsigned supports_sve2 = ((id_aa64zfr0_el1) & 0xF) >= 1;
unsigned supports_sve2p1 = ((id_aa64zfr0_el1) & 0xF) >= 2;
return (simsimd_capability_t)( //
(simsimd_cap_neon_k * (supports_neon)) | //
(simsimd_cap_neon_f16_k * (supports_neon && supports_fp16)) | //
(simsimd_cap_neon_bf16_k * (supports_neon && supports_bf16)) | //
(simsimd_cap_neon_i8_k * (supports_neon && supports_i8mm && supports_integer_dot_products)) | //
(simsimd_cap_sve_k * (supports_sve)) | //
(simsimd_cap_sve_f16_k * (supports_sve && supports_fp16)) | //
(simsimd_cap_sve_bf16_k * (supports_sve && supports_sve_bf16)) | //
(simsimd_cap_sve_i8_k * (supports_sve && supports_sve_i8mm)) | //
(simsimd_cap_sve2_k * (supports_sve2)) | //
(simsimd_cap_sve2p1_k * (supports_sve2p1)) | //
(simsimd_cap_serial_k));
#elif defined(_SIMSIMD_DEFINED_WINDOWS)
unsigned supports_neon = 0, supports_dp = 0;
// On Windows ARM, use the `IsProcessorFeaturePresent` API for capability detection.
// https://learn.microsoft.com/en-us/windows/win32/api/processthreadsapi/nf-processthreadsapi-isprocessorfeaturepresent
#if defined(PF_ARM_V8_INSTRUCTIONS_AVAILABLE)
supports_neon = IsProcessorFeaturePresent(PF_ARM_V8_INSTRUCTIONS_AVAILABLE);
#endif
#if defined(PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE)
supports_dp = IsProcessorFeaturePresent(PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE);
#endif
// Windows API doesn't provide reliable detection for FP16, BF16.
return (simsimd_capability_t)( //
(simsimd_cap_neon_k * (supports_neon)) | //
(simsimd_cap_neon_i8_k * (supports_neon && supports_dp)) | //
(simsimd_cap_serial_k));
#else // Unknown platform
// Conservative fallback for unknown platforms: NEON is mandatory in ARMv8-A (ARM64)
return (simsimd_capability_t)(simsimd_cap_neon_k | simsimd_cap_serial_k);
#endif
}
#pragma clang attribute pop
#pragma GCC pop_options
#endif
/**
* @brief Function to flush @b denormalized numbers to zero to avoid performance penalties.
* @return 1 if the operation was successful, 0 otherwise.
*
* When facing denormalized values, Fused-Multiply-Add (FMA) operations can be up to 30x slower,
* as measured on Intel Sapphire Rapids: https://github.com/ashvardanian/ParallelReductionsBenchmark
*/
SIMSIMD_PUBLIC int _simsimd_flush_denormals(void) {
#if _SIMSIMD_TARGET_X86
return _simsimd_flush_denormals_x86();
#endif // _SIMSIMD_TARGET_X86
#if _SIMSIMD_TARGET_ARM
return _simsimd_flush_denormals_arm();
#endif // _SIMSIMD_TARGET_ARM
return 0;
}
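/*
 * A minimal sketch, assuming a threaded application: the denormals mode is per-thread CPU state,
 * so the public `simsimd_flush_denormals` should be invoked once at the start of every worker
 * thread that will run SimSIMD kernels. The thread entry point below is illustrative.
 *
 *     void worker_thread_entry(void) {
 *         simsimd_flush_denormals(); // enables FTZ/DAZ on x86 or FZ/FZ16 on Arm for this thread
 *         // ... run distance computations ...
 *     }
 */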
/**
* @brief Function to determine the SIMD capabilities of the current 64-bit machine at @b runtime.
* @return A bitmask of the SIMD capabilities represented as a `simsimd_capability_t` enum value.
*/
SIMSIMD_PUBLIC simsimd_capability_t _simsimd_capabilities_implementation(void) {
#if _SIMSIMD_TARGET_X86
return _simsimd_capabilities_x86();
#endif // _SIMSIMD_TARGET_X86
#if _SIMSIMD_TARGET_ARM
return _simsimd_capabilities_arm();
#endif // _SIMSIMD_TARGET_ARM
return simsimd_cap_serial_k;
}
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wcast-function-type"
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wcast-function-type"
#ifdef __cplusplus //! option "-Wvolatile" is valid for C++/ObjC++ but not for C
#pragma GCC diagnostic ignored "-Wvolatile"
#pragma clang diagnostic ignored "-Wvolatile"
#endif
SIMSIMD_INTERNAL void _simsimd_find_kernel_punned_f64(simsimd_capability_t v, simsimd_metric_kind_t k,
simsimd_kernel_punned_t *m, simsimd_capability_t *c) {
typedef simsimd_kernel_punned_t m_t;
#if SIMSIMD_TARGET_SVE
if (v & simsimd_cap_sve_k) switch (k) {
case simsimd_metric_dot_k: *m = (m_t)&simsimd_dot_f64_sve, *c = simsimd_cap_sve_k; return;
case simsimd_metric_cos_k: *m = (m_t)&simsimd_cos_f64_sve, *c = simsimd_cap_sve_k; return;
case simsimd_metric_l2sq_k: *m = (m_t)&simsimd_l2sq_f64_sve, *c = simsimd_cap_sve_k; return;
case simsimd_metric_l2_k: *m = (m_t)&simsimd_l2_f64_sve, *c = simsimd_cap_sve_k; return;
default: break;
}
#endif
#if SIMSIMD_TARGET_NEON
if (v & simsimd_cap_neon_k) switch (k) {
case simsimd_metric_cos_k: *m = (m_t)&simsimd_cos_f64_neon, *c = simsimd_cap_neon_k; return;
case simsimd_metric_l2sq_k: *m = (m_t)&simsimd_l2sq_f64_neon, *c = simsimd_cap_neon_k; return;
case simsimd_metric_l2_k: *m = (m_t)&simsimd_l2_f64_neon, *c = simsimd_cap_neon_k; return;
default: break;
}
#endif
#if SIMSIMD_TARGET_SKYLAKE
if (v & simsimd_cap_skylake_k) switch (k) {
case simsimd_metric_dot_k: *m = (m_t)&simsimd_dot_f64_skylake, *c = simsimd_cap_skylake_k; return;
case simsimd_metric_cos_k: *m = (m_t)&simsimd_cos_f64_skylake, *c = simsimd_cap_skylake_k; return;
case simsimd_metric_l2sq_k: *m = (m_t)&simsimd_l2sq_f64_skylake, *c = simsimd_cap_skylake_k; return;
case simsimd_metric_l2_k: *m = (m_t)&simsimd_l2_f64_skylake, *c = simsimd_cap_skylake_k; return;
case simsimd_metric_fma_k: *m = (m_t)&simsimd_fma_f64_skylake, *c = simsimd_cap_skylake_k; return;
case simsimd_metric_wsum_k: *m = (m_t)&simsimd_wsum_f64_skylake, *c = simsimd_cap_skylake_k; return;
default: break;
}
#endif
#if SIMSIMD_TARGET_HASWELL
if (v & simsimd_cap_haswell_k) switch (k) {
case simsimd_metric_cos_k: *m = (m_t)&simsimd_cos_f64_haswell, *c = simsimd_cap_haswell_k; return;
case simsimd_metric_l2sq_k: *m = (m_t)&simsimd_l2sq_f64_haswell, *c = simsimd_cap_haswell_k; return;
case simsimd_metric_l2_k: *m = (m_t)&simsimd_l2_f64_haswell, *c = simsimd_cap_haswell_k; return;
case simsimd_metric_fma_k: *m = (m_t)&simsimd_fma_f64_haswell, *c = simsimd_cap_haswell_k; return;
case simsimd_metric_wsum_k: *m = (m_t)&simsimd_wsum_f64_haswell, *c = simsimd_cap_haswell_k; return;
default: break;
}
#endif
if (v & simsimd_cap_serial_k) switch (k) {
case simsimd_metric_dot_k: *m = (m_t)&simsimd_dot_f64_serial, *c = simsimd_cap_serial_k; return;
case simsimd_metric_cos_k: *m = (m_t)&simsimd_cos_f64_serial, *c = simsimd_cap_serial_k; return;
case simsimd_metric_l2sq_k: *m = (m_t)&simsimd_l2sq_f64_serial, *c = simsimd_cap_serial_k; return;
case simsimd_metric_l2_k: *m = (m_t)&simsimd_l2_f64_serial, *c = simsimd_cap_serial_k; return;
case simsimd_metric_js_k: *m = (m_t)&simsimd_js_f64_serial, *c = simsimd_cap_serial_k; return;
case simsimd_metric_kl_k: *m = (m_t)&simsimd_kl_f64_serial, *c = simsimd_cap_serial_k; return;
case simsimd_metric_bilinear_k: *m = (m_t)&simsimd_bilinear_f64_serial, *c = simsimd_cap_serial_k; return;
case simsimd_metric_mahalanobis_k: *m = (m_t)&simsimd_mahalanobis_f64_serial, *c = simsimd_cap_serial_k; return;
case simsimd_metric_fma_k: *m = (m_t)&simsimd_fma_f64_serial, *c = simsimd_cap_serial_k; return;
case simsimd_metric_wsum_k: *m = (m_t)&simsimd_wsum_f64_serial, *c = simsimd_cap_serial_k; return;
default: break;
}
}
SIMSIMD_INTERNAL void _simsimd_find_kernel_punned_f32(simsimd_capability_t v, simsimd_metric_kind_t k,
simsimd_kernel_punned_t *m, simsimd_capability_t *c) {
typedef simsimd_kernel_punned_t m_t;
#if SIMSIMD_TARGET_SVE
if (v & simsimd_cap_sve_k) switch (k) {
case simsimd_metric_dot_k: *m = (m_t)&simsimd_dot_f32_sve, *c = simsimd_cap_sve_k; return;
case simsimd_metric_cos_k: *m = (m_t)&simsimd_cos_f32_sve, *c = simsimd_cap_sve_k; return;
case simsimd_metric_l2sq_k: *m = (m_t)&simsimd_l2sq_f32_sve, *c = simsimd_cap_sve_k; return;
case simsimd_metric_l2_k: *m = (m_t)&simsimd_l2_f32_sve, *c = simsimd_cap_sve_k; return;
default: break;
}
#endif
#if SIMSIMD_TARGET_NEON
if (v & simsimd_cap_neon_k) switch (k) {
case simsimd_metric_dot_k: *m = (m_t)&simsimd_dot_f32_neon, *c = simsimd_cap_neon_k; return;
case simsimd_metric_cos_k: *m = (m_t)&simsimd_cos_f32_neon, *c = simsimd_cap_neon_k; return;
case simsimd_metric_l2sq_k: *m = (m_t)&simsimd_l2sq_f32_neon, *c = simsimd_cap_neon_k; return;
case simsimd_metric_l2_k: *m = (m_t)&simsimd_l2_f32_neon, *c = simsimd_cap_neon_k; return;
case simsimd_metric_js_k: *m = (m_t)&simsimd_js_f32_neon, *c = simsimd_cap_neon_k; return;
case simsimd_metric_kl_k: *m = (m_t)&simsimd_kl_f32_neon, *c = simsimd_cap_neon_k; return;
case simsimd_metric_fma_k: *m = (m_t)&simsimd_fma_f32_neon, *c = simsimd_cap_neon_k; return;
case simsimd_metric_wsum_k: *m = (m_t)&simsimd_wsum_f32_neon, *c = simsimd_cap_neon_k; return;
default: break;
}
#endif
#if SIMSIMD_TARGET_SKYLAKE
if (v & simsimd_cap_skylake_k) switch (k) {
case simsimd_metric_dot_k: *m = (m_t)&simsimd_dot_f32_skylake, *c = simsimd_cap_skylake_k; return;
case simsimd_metric_cos_k: *m = (m_t)&simsimd_cos_f32_skylake, *c = simsimd_cap_skylake_k; return;
case simsimd_metric_l2sq_k: *m = (m_t)&simsimd_l2sq_f32_skylake, *c = simsimd_cap_skylake_k; return;
case simsimd_metric_l2_k: *m = (m_t)&simsimd_l2_f32_skylake, *c = simsimd_cap_skylake_k; return;
case simsimd_metric_js_k: *m = (m_t)&simsimd_js_f32_skylake, *c = simsimd_cap_skylake_k; return;
case simsimd_metric_kl_k: *m = (m_t)&simsimd_kl_f32_skylake, *c = simsimd_cap_skylake_k; return;
case simsimd_metric_bilinear_k: *m = (m_t)&simsimd_bilinear_f32_skylake, *c = simsimd_cap_skylake_k; return;
case simsimd_metric_mahalanobis_k:
*m = (m_t)&simsimd_mahalanobis_f32_skylake, *c = simsimd_cap_skylake_k;
return;
case simsimd_metric_fma_k: *m = (m_t)&simsimd_fma_f32_skylake, *c = simsimd_cap_skylake_k; return;
case simsimd_metric_wsum_k: *m = (m_t)&simsimd_wsum_f32_skylake, *c = simsimd_cap_skylake_k; return;
default: break;
}
#endif
#if SIMSIMD_TARGET_HASWELL
if (v & simsimd_cap_haswell_k) switch (k) {
case simsimd_metric_dot_k: *m = (m_t)&simsimd_dot_f32_haswell, *c = simsimd_cap_haswell_k; return;
case simsimd_metric_cos_k: *m = (m_t)&simsimd_cos_f32_haswell, *c = simsimd_cap_haswell_k; return;
case simsimd_metric_l2sq_k: *m = (m_t)&simsimd_l2sq_f32_haswell, *c = simsimd_cap_haswell_k; return;
case simsimd_metric_l2_k: *m = (m_t)&simsimd_l2_f32_haswell, *c = simsimd_cap_haswell_k; return;
case simsimd_metric_fma_k: *m = (m_t)&simsimd_fma_f32_haswell, *c = simsimd_cap_haswell_k; return;
case simsimd_metric_wsum_k: *m = (m_t)&simsimd_wsum_f32_haswell, *c = simsimd_cap_haswell_k; return;
default: break;
}
#endif
if (v & simsimd_cap_serial_k) switch (k) {
case simsimd_metric_dot_k: *m = (m_t)&simsimd_dot_f32_serial, *c = simsimd_cap_serial_k; return;
case simsimd_metric_cos_k: *m = (m_t)&simsimd_cos_f32_serial, *c = simsimd_cap_serial_k; return;
case simsimd_metric_l2sq_k: *m = (m_t)&simsimd_l2sq_f32_serial, *c = simsimd_cap_serial_k; return;
case simsimd_metric_l2_k: *m = (m_t)&simsimd_l2_f32_serial, *c = simsimd_cap_serial_k; return;
case simsimd_metric_js_k: *m = (m_t)&simsimd_js_f32_serial, *c = simsimd_cap_serial_k; return;
case simsimd_metric_kl_k: *m = (m_t)&simsimd_kl_f32_serial, *c = simsimd_cap_serial_k; return;
case simsimd_metric_bilinear_k: *m = (m_t)&simsimd_bilinear_f32_serial, *c = simsimd_cap_serial_k; return;
case simsimd_metric_mahalanobis_k: *m = (m_t)&simsimd_mahalanobis_f32_serial, *c = simsimd_cap_serial_k; return;
case simsimd_metric_fma_k: *m = (m_t)&simsimd_fma_f32_serial, *c = simsimd_cap_serial_k; return;
case simsimd_metric_wsum_k: *m = (m_t)&simsimd_wsum_f32_serial, *c = simsimd_cap_serial_k; return;
default: break;
}
}
SIMSIMD_INTERNAL void _simsimd_find_kernel_punned_f16(simsimd_capability_t v, simsimd_metric_kind_t k,
simsimd_kernel_punned_t *m, simsimd_capability_t *c) {
typedef simsimd_kernel_punned_t m_t;
#if SIMSIMD_TARGET_SVE_F16
if (v & simsimd_cap_sve_k) switch (k) {
case simsimd_metric_dot_k: *m = (m_t)&simsimd_dot_f16_sve, *c = simsimd_cap_sve_f16_k; return;
case simsimd_metric_cos_k: *m = (m_t)&simsimd_cos_f16_sve, *c = simsimd_cap_sve_f16_k; return;
case simsimd_metric_l2sq_k: *m = (m_t)&simsimd_l2sq_f16_sve, *c = simsimd_cap_sve_f16_k; return;
case simsimd_metric_l2_k: *m = (m_t)&simsimd_l2_f16_sve, *c = simsimd_cap_sve_f16_k; return;
default: break;
}
#endif
#if SIMSIMD_TARGET_NEON_F16
if (v & simsimd_cap_neon_f16_k) switch (k) {
case simsimd_metric_dot_k: *m = (m_t)&simsimd_dot_f16_neon, *c = simsimd_cap_neon_f16_k; return;
case simsimd_metric_cos_k: *m = (m_t)&simsimd_cos_f16_neon, *c = simsimd_cap_neon_f16_k; return;
case simsimd_metric_l2sq_k: *m = (m_t)&simsimd_l2sq_f16_neon, *c = simsimd_cap_neon_f16_k; return;
case simsimd_metric_l2_k: *m = (m_t)&simsimd_l2_f16_neon, *c = simsimd_cap_neon_f16_k; return;
case simsimd_metric_js_k: *m = (m_t)&simsimd_js_f16_neon, *c = simsimd_cap_neon_f16_k; return;
case simsimd_metric_kl_k: *m = (m_t)&simsimd_kl_f16_neon, *c = simsimd_cap_neon_f16_k; return;
case simsimd_metric_bilinear_k: *m = (m_t)&simsimd_bilinear_f16_neon, *c = simsimd_cap_neon_f16_k; return;
case simsimd_metric_mahalanobis_k: *m = (m_t)&simsimd_mahalanobis_f16_neon, *c = simsimd_cap_neon_f16_k; return;
case simsimd_metric_fma_k: *m = (m_t)&simsimd_fma_f16_neon, *c = simsimd_cap_neon_f16_k; return;
case simsimd_metric_wsum_k: *m = (m_t)&simsimd_wsum_f16_neon, *c = simsimd_cap_neon_f16_k; return;
default: break;
}
#endif
#if SIMSIMD_TARGET_SAPPHIRE
if (v & simsimd_cap_sapphire_k) switch (k) {
case simsimd_metric_dot_k: *m = (m_t)&simsimd_dot_f16_sapphire, *c = simsimd_cap_sapphire_k; return;
case simsimd_metric_cos_k: *m = (m_t)&simsimd_cos_f16_sapphire, *c = simsimd_cap_sapphire_k; return;
case simsimd_metric_l2sq_k: *m = (m_t)&simsimd_l2sq_f16_sapphire, *c = simsimd_cap_sapphire_k; return;
case simsimd_metric_l2_k: *m = (m_t)&simsimd_l2_f16_sapphire, *c = simsimd_cap_sapphire_k; return;
case simsimd_metric_js_k: *m = (m_t)&simsimd_js_f16_sapphire, *c = simsimd_cap_sapphire_k; return;
case simsimd_metric_kl_k: *m = (m_t)&simsimd_kl_f16_sapphire, *c = simsimd_cap_sapphire_k; return;
case simsimd_metric_bilinear_k: *m = (m_t)&simsimd_bilinear_f16_sapphire, *c = simsimd_cap_sapphire_k; return;
case simsimd_metric_mahalanobis_k:
*m = (m_t)&simsimd_mahalanobis_f16_sapphire, *c = simsimd_cap_sapphire_k;
return;
case simsimd_metric_fma_k: *m = (m_t)&simsimd_fma_f16_sapphire, *c = simsimd_cap_sapphire_k; return;
case simsimd_metric_wsum_k: *m = (m_t)&simsimd_wsum_f16_sapphire, *c = simsimd_cap_sapphire_k; return;
default: break;
}
#endif
#if SIMSIMD_TARGET_HASWELL
if (v & simsimd_cap_haswell_k) switch (k) {
case simsimd_metric_dot_k: *m = (m_t)&simsimd_dot_f16_haswell, *c = simsimd_cap_haswell_k; return;
case simsimd_metric_cos_k: *m = (m_t)&simsimd_cos_f16_haswell, *c = simsimd_cap_haswell_k; return;
case simsimd_metric_l2sq_k: *m = (m_t)&simsimd_l2sq_f16_haswell, *c = simsimd_cap_haswell_k; return;
case simsimd_metric_l2_k: *m = (m_t)&simsimd_l2_f16_haswell, *c = simsimd_cap_haswell_k; return;
case simsimd_metric_js_k: *m = (m_t)&simsimd_js_f16_haswell, *c = simsimd_cap_haswell_k; return;
case simsimd_metric_kl_k: *m = (m_t)&simsimd_kl_f16_haswell, *c = simsimd_cap_haswell_k; return;
case simsimd_metric_bilinear_k: *m = (m_t)&simsimd_bilinear_f16_haswell, *c = simsimd_cap_haswell_k; return;
case simsimd_metric_mahalanobis_k:
*m = (m_t)&simsimd_mahalanobis_f16_haswell, *c = simsimd_cap_haswell_k;
return;
case simsimd_metric_fma_k: *m = (m_t)&simsimd_fma_f16_haswell, *c = simsimd_cap_haswell_k; return;
case simsimd_metric_wsum_k: *m = (m_t)&simsimd_wsum_f16_haswell, *c = simsimd_cap_haswell_k; return;
default: break;
}
#endif
if (v & simsimd_cap_serial_k) switch (k) {
case simsimd_metric_dot_k: *m = (m_t)&simsimd_dot_f16_serial, *c = simsimd_cap_serial_k; return;
case simsimd_metric_cos_k: *m = (m_t)&simsimd_cos_f16_serial, *c = simsimd_cap_serial_k; return;
case simsimd_metric_l2sq_k: *m = (m_t)&simsimd_l2sq_f16_serial, *c = simsimd_cap_serial_k; return;
case simsimd_metric_l2_k: *m = (m_t)&simsimd_l2_f16_serial, *c = simsimd_cap_serial_k; return;
case simsimd_metric_js_k: *m = (m_t)&simsimd_js_f16_serial, *c = simsimd_cap_serial_k; return;
case simsimd_metric_kl_k: *m = (m_t)&simsimd_kl_f16_serial, *c = simsimd_cap_serial_k; return;
case simsimd_metric_bilinear_k: *m = (m_t)&simsimd_bilinear_f16_serial, *c = simsimd_cap_serial_k; return;
case simsimd_metric_mahalanobis_k: *m = (m_t)&simsimd_mahalanobis_f16_serial, *c = simsimd_cap_serial_k; return;
case simsimd_metric_fma_k: *m = (m_t)&simsimd_fma_f16_serial, *c = simsimd_cap_serial_k; return;
case simsimd_metric_wsum_k: *m = (m_t)&simsimd_wsum_f16_serial, *c = simsimd_cap_serial_k; return;
default: break;
}
}
SIMSIMD_INTERNAL void _simsimd_find_kernel_punned_bf16(simsimd_capability_t v, simsimd_metric_kind_t k,
simsimd_kernel_punned_t *m, simsimd_capability_t *c) {
typedef simsimd_kernel_punned_t m_t;
#if SIMSIMD_TARGET_SVE_BF16
if (v & simsimd_cap_sve_bf16_k) switch (k) {
case simsimd_metric_cos_k: *m = (m_t)&simsimd_cos_bf16_sve, *c = simsimd_cap_sve_bf16_k; return;
case simsimd_metric_l2sq_k: *m = (m_t)&simsimd_l2sq_bf16_sve, *c = simsimd_cap_sve_bf16_k; return;
case simsimd_metric_l2_k: *m = (m_t)&simsimd_l2_bf16_sve, *c = simsimd_cap_sve_bf16_k; return;
default: break;
}
#endif
#if SIMSIMD_TARGET_NEON_BF16
if (v & simsimd_cap_neon_bf16_k) switch (k) {
case simsimd_metric_dot_k: *m = (m_t)&simsimd_dot_bf16_neon, *c = simsimd_cap_neon_bf16_k; return;
case simsimd_metric_cos_k: *m = (m_t)&simsimd_cos_bf16_neon, *c = simsimd_cap_neon_bf16_k; return;
case simsimd_metric_l2sq_k: *m = (m_t)&simsimd_l2sq_bf16_neon, *c = simsimd_cap_neon_bf16_k; return;
case simsimd_metric_l2_k: *m = (m_t)&simsimd_l2_bf16_neon, *c = simsimd_cap_neon_bf16_k; return;
case simsimd_metric_fma_k: *m = (m_t)&simsimd_fma_bf16_neon, *c = simsimd_cap_neon_bf16_k; return;
case simsimd_metric_wsum_k: *m = (m_t)&simsimd_wsum_bf16_neon, *c = simsimd_cap_neon_bf16_k; return;
default: break;
}
#endif
#if SIMSIMD_TARGET_GENOA
if (v & simsimd_cap_genoa_k) switch (k) {
case simsimd_metric_dot_k: *m = (m_t)&simsimd_dot_bf16_genoa, *c = simsimd_cap_genoa_k; return;
case simsimd_metric_cos_k: *m = (m_t)&simsimd_cos_bf16_genoa, *c = simsimd_cap_genoa_k; return;
case simsimd_metric_l2sq_k: *m = (m_t)&simsimd_l2sq_bf16_genoa, *c = simsimd_cap_genoa_k; return;
case simsimd_metric_l2_k: *m = (m_t)&simsimd_l2_bf16_genoa, *c = simsimd_cap_genoa_k; return;
case simsimd_metric_bilinear_k: *m = (m_t)&simsimd_bilinear_bf16_genoa, *c = simsimd_cap_genoa_k; return;
case simsimd_metric_mahalanobis_k: *m = (m_t)&simsimd_mahalanobis_bf16_genoa, *c = simsimd_cap_genoa_k; return;
default: break;
}
#endif
#if SIMSIMD_TARGET_SKYLAKE
if (v & simsimd_cap_skylake_k) switch (k) {
case simsimd_metric_fma_k: *m = (m_t)&simsimd_fma_bf16_skylake, *c = simsimd_cap_skylake_k; return;
case simsimd_metric_wsum_k: *m = (m_t)&simsimd_wsum_bf16_skylake, *c = simsimd_cap_skylake_k; return;
default: break;
}
#endif
#if SIMSIMD_TARGET_HASWELL
if (v & simsimd_cap_haswell_k) switch (k) {
case simsimd_metric_dot_k: *m = (m_t)&simsimd_dot_bf16_haswell, *c = simsimd_cap_haswell_k; return;
case simsimd_metric_cos_k: *m = (m_t)&simsimd_cos_bf16_haswell, *c = simsimd_cap_haswell_k; return;
case simsimd_metric_l2sq_k: *m = (m_t)&simsimd_l2sq_bf16_haswell, *c = simsimd_cap_haswell_k; return;
case simsimd_metric_l2_k: *m = (m_t)&simsimd_l2_bf16_haswell, *c = simsimd_cap_haswell_k; return;
case simsimd_metric_bilinear_k: *m = (m_t)&simsimd_bilinear_bf16_haswell, *c = simsimd_cap_haswell_k; return;
case simsimd_metric_mahalanobis_k:
*m = (m_t)&simsimd_mahalanobis_bf16_haswell, *c = simsimd_cap_haswell_k;
return;
case simsimd_metric_fma_k: *m = (m_t)&simsimd_fma_bf16_haswell, *c = simsimd_cap_haswell_k; return;
case simsimd_metric_wsum_k: *m = (m_t)&simsimd_wsum_bf16_haswell, *c = simsimd_cap_haswell_k; return;
default: break;
}
#endif
if (v & simsimd_cap_serial_k) switch (k) {
case simsimd_metric_dot_k: *m = (m_t)&simsimd_dot_bf16_serial, *c = simsimd_cap_serial_k; return;
case simsimd_metric_cos_k: *m = (m_t)&simsimd_cos_bf16_serial, *c = simsimd_cap_serial_k; return;
case simsimd_metric_l2sq_k: *m = (m_t)&simsimd_l2sq_bf16_serial, *c = simsimd_cap_serial_k; return;
case simsimd_metric_l2_k: *m = (m_t)&simsimd_l2_bf16_serial, *c = simsimd_cap_serial_k; return;
case simsimd_metric_js_k: *m = (m_t)&simsimd_js_bf16_serial, *c = simsimd_cap_serial_k; return;
case simsimd_metric_kl_k: *m = (m_t)&simsimd_kl_bf16_serial, *c = simsimd_cap_serial_k; return;
case simsimd_metric_bilinear_k: *m = (m_t)&simsimd_bilinear_bf16_serial, *c = simsimd_cap_serial_k; return;
case simsimd_metric_mahalanobis_k:
*m = (m_t)&simsimd_mahalanobis_bf16_serial, *c = simsimd_cap_serial_k;
return;
case simsimd_metric_fma_k: *m = (m_t)&simsimd_fma_bf16_serial, *c = simsimd_cap_serial_k; return;
case simsimd_metric_wsum_k: *m = (m_t)&simsimd_wsum_bf16_serial, *c = simsimd_cap_serial_k; return;
default: break;
}
}
SIMSIMD_INTERNAL void _simsimd_find_kernel_punned_i8(simsimd_capability_t v, simsimd_metric_kind_t k,
simsimd_kernel_punned_t *m, simsimd_capability_t *c) {
typedef simsimd_kernel_punned_t m_t;
#if SIMSIMD_TARGET_NEON_I8
if (v & simsimd_cap_neon_i8_k) switch (k) {
case simsimd_metric_dot_k: *m = (m_t)&simsimd_dot_i8_neon, *c = simsimd_cap_neon_i8_k; return;
case simsimd_metric_cos_k: *m = (m_t)&simsimd_cos_i8_neon, *c = simsimd_cap_neon_i8_k; return;
case simsimd_metric_l2sq_k: *m = (m_t)&simsimd_l2sq_i8_neon, *c = simsimd_cap_neon_i8_k; return;
case simsimd_metric_l2_k: *m = (m_t)&simsimd_l2_i8_neon, *c = simsimd_cap_neon_i8_k; return;
default: break;
}
#endif
#if SIMSIMD_TARGET_NEON_F16 //! Scaling of 8-bit integers is performed using 16-bit floats.
if (v & simsimd_cap_neon_f16_k) switch (k) {
case simsimd_metric_fma_k: *m = (m_t)&simsimd_fma_i8_neon, *c = simsimd_cap_neon_f16_k; return;
case simsimd_metric_wsum_k: *m = (m_t)&simsimd_wsum_i8_neon, *c = simsimd_cap_neon_f16_k; return;
default: break;
}
#endif
#if SIMSIMD_TARGET_SAPPHIRE //! Scaling of 8-bit integers is performed using 16-bit floats.
if (v & simsimd_cap_sapphire_k) switch (k) {
case simsimd_metric_fma_k: *m = (m_t)&simsimd_fma_i8_sapphire, *c = simsimd_cap_sapphire_k; return;
case simsimd_metric_wsum_k: *m = (m_t)&simsimd_wsum_i8_sapphire, *c = simsimd_cap_sapphire_k; return;
default: break;
}
#endif
#if SIMSIMD_TARGET_ICE
if (v & simsimd_cap_ice_k) switch (k) {
case simsimd_metric_dot_k: *m = (m_t)&simsimd_dot_i8_ice, *c = simsimd_cap_ice_k; return;
case simsimd_metric_cos_k: *m = (m_t)&simsimd_cos_i8_ice, *c = simsimd_cap_ice_k; return;
case simsimd_metric_l2sq_k: *m = (m_t)&simsimd_l2sq_i8_ice, *c = simsimd_cap_ice_k; return;
case simsimd_metric_l2_k: *m = (m_t)&simsimd_l2_i8_ice, *c = simsimd_cap_ice_k; return;
default: break;
}
#endif
#if SIMSIMD_TARGET_HASWELL
if (v & simsimd_cap_haswell_k) switch (k) {
case simsimd_metric_dot_k: *m = (m_t)&simsimd_dot_i8_haswell, *c = simsimd_cap_haswell_k; return;
case simsimd_metric_cos_k: *m = (m_t)&simsimd_cos_i8_haswell, *c = simsimd_cap_haswell_k; return;
case simsimd_metric_l2sq_k: *m = (m_t)&simsimd_l2sq_i8_haswell, *c = simsimd_cap_haswell_k; return;
case simsimd_metric_l2_k: *m = (m_t)&simsimd_l2_i8_haswell, *c = simsimd_cap_haswell_k; return;
case simsimd_metric_fma_k: *m = (m_t)&simsimd_fma_i8_haswell, *c = simsimd_cap_haswell_k; return;
case simsimd_metric_wsum_k: *m = (m_t)&simsimd_wsum_i8_haswell, *c = simsimd_cap_haswell_k; return;
default: break;
}
#endif
if (v & simsimd_cap_serial_k) switch (k) {
case simsimd_metric_dot_k: *m = (m_t)&simsimd_dot_i8_serial, *c = simsimd_cap_serial_k; return;
case simsimd_metric_cos_k: *m = (m_t)&simsimd_cos_i8_serial, *c = simsimd_cap_serial_k; return;
case simsimd_metric_l2sq_k: *m = (m_t)&simsimd_l2sq_i8_serial, *c = simsimd_cap_serial_k; return;
case simsimd_metric_l2_k: *m = (m_t)&simsimd_l2_i8_serial, *c = simsimd_cap_serial_k; return;
case simsimd_metric_fma_k: *m = (m_t)&simsimd_fma_i8_serial, *c = simsimd_cap_serial_k; return;
case simsimd_metric_wsum_k: *m = (m_t)&simsimd_wsum_i8_serial, *c = simsimd_cap_serial_k; return;
default: break;
}
}
SIMSIMD_INTERNAL void _simsimd_find_kernel_punned_u8(simsimd_capability_t v, simsimd_metric_kind_t k,
simsimd_kernel_punned_t *m, simsimd_capability_t *c) {
typedef simsimd_kernel_punned_t m_t;
#if SIMSIMD_TARGET_NEON_I8
if (v & simsimd_cap_neon_i8_k) switch (k) {
case simsimd_metric_dot_k: *m = (m_t)&simsimd_dot_u8_neon, *c = simsimd_cap_neon_i8_k; return;
case simsimd_metric_cos_k: *m = (m_t)&simsimd_cos_u8_neon, *c = simsimd_cap_neon_i8_k; return;
case simsimd_metric_l2sq_k: *m = (m_t)&simsimd_l2sq_u8_neon, *c = simsimd_cap_neon_i8_k; return;
case simsimd_metric_l2_k: *m = (m_t)&simsimd_l2_u8_neon, *c = simsimd_cap_neon_i8_k; return;
default: break;
}
#endif
#if SIMSIMD_TARGET_NEON_F16 //! Scaling of 8-bit integers is performed using 16-bit floats.
if (v & simsimd_cap_neon_f16_k) switch (k) {
case simsimd_metric_fma_k: *m = (m_t)&simsimd_fma_u8_neon, *c = simsimd_cap_neon_f16_k; return;
case simsimd_metric_wsum_k: *m = (m_t)&simsimd_wsum_u8_neon, *c = simsimd_cap_neon_f16_k; return;
default: break;
}
#endif
#if SIMSIMD_TARGET_SAPPHIRE //! Scaling of 8-bit integers is performed using 16-bit floats.
if (v & simsimd_cap_sapphire_k) switch (k) {
case simsimd_metric_fma_k: *m = (m_t)&simsimd_fma_u8_sapphire, *c = simsimd_cap_sapphire_k; return;
case simsimd_metric_wsum_k: *m = (m_t)&simsimd_wsum_u8_sapphire, *c = simsimd_cap_sapphire_k; return;
default: break;
}
#endif
#if SIMSIMD_TARGET_ICE
if (v & simsimd_cap_ice_k) switch (k) {
case simsimd_metric_dot_k: *m = (m_t)&simsimd_dot_u8_ice, *c = simsimd_cap_ice_k; return;
case simsimd_metric_cos_k: *m = (m_t)&simsimd_cos_u8_ice, *c = simsimd_cap_ice_k; return;
case simsimd_metric_l2sq_k: *m = (m_t)&simsimd_l2sq_u8_ice, *c = simsimd_cap_ice_k; return;
case simsimd_metric_l2_k: *m = (m_t)&simsimd_l2_u8_ice, *c = simsimd_cap_ice_k; return;
default: break;
}
#endif
#if SIMSIMD_TARGET_HASWELL
if (v & simsimd_cap_haswell_k) switch (k) {
case simsimd_metric_dot_k: *m = (m_t)&simsimd_dot_u8_haswell, *c = simsimd_cap_haswell_k; return;
case simsimd_metric_cos_k: *m = (m_t)&simsimd_cos_u8_haswell, *c = simsimd_cap_haswell_k; return;
case simsimd_metric_l2sq_k: *m = (m_t)&simsimd_l2sq_u8_haswell, *c = simsimd_cap_haswell_k; return;
case simsimd_metric_l2_k: *m = (m_t)&simsimd_l2_u8_haswell, *c = simsimd_cap_haswell_k; return;
case simsimd_metric_fma_k: *m = (m_t)&simsimd_fma_u8_haswell, *c = simsimd_cap_haswell_k; return;
case simsimd_metric_wsum_k: *m = (m_t)&simsimd_wsum_u8_haswell, *c = simsimd_cap_haswell_k; return;
default: break;
}
#endif
if (v & simsimd_cap_serial_k) switch (k) {
case simsimd_metric_dot_k: *m = (m_t)&simsimd_dot_u8_serial, *c = simsimd_cap_serial_k; return;
case simsimd_metric_cos_k: *m = (m_t)&simsimd_cos_u8_serial, *c = simsimd_cap_serial_k; return;
case simsimd_metric_l2sq_k: *m = (m_t)&simsimd_l2sq_u8_serial, *c = simsimd_cap_serial_k; return;
case simsimd_metric_l2_k: *m = (m_t)&simsimd_l2_u8_serial, *c = simsimd_cap_serial_k; return;
case simsimd_metric_fma_k: *m = (m_t)&simsimd_fma_u8_serial, *c = simsimd_cap_serial_k; return;
case simsimd_metric_wsum_k: *m = (m_t)&simsimd_wsum_u8_serial, *c = simsimd_cap_serial_k; return;
default: break;
}
}
SIMSIMD_INTERNAL void _simsimd_find_kernel_punned_b8(simsimd_capability_t v, simsimd_metric_kind_t k,
simsimd_kernel_punned_t *m, simsimd_capability_t *c) {
typedef simsimd_kernel_punned_t m_t;
#if SIMSIMD_TARGET_SVE
if (v & simsimd_cap_sve_k) switch (k) {
case simsimd_metric_hamming_k: *m = (m_t)&simsimd_hamming_b8_sve, *c = simsimd_cap_sve_k; return;
case simsimd_metric_jaccard_k: *m = (m_t)&simsimd_jaccard_b8_sve, *c = simsimd_cap_sve_k; return;
default: break;
}
#endif
#if SIMSIMD_TARGET_NEON
if (v & simsimd_cap_neon_k) switch (k) {
case simsimd_metric_hamming_k: *m = (m_t)&simsimd_hamming_b8_neon, *c = simsimd_cap_neon_k; return;
case simsimd_metric_jaccard_k: *m = (m_t)&simsimd_jaccard_b8_neon, *c = simsimd_cap_neon_k; return;
default: break;
}
#endif
#if SIMSIMD_TARGET_ICE
if (v & simsimd_cap_ice_k) switch (k) {
case simsimd_metric_hamming_k: *m = (m_t)&simsimd_hamming_b8_ice, *c = simsimd_cap_ice_k; return;
case simsimd_metric_jaccard_k: *m = (m_t)&simsimd_jaccard_b8_ice, *c = simsimd_cap_ice_k; return;
default: break;
}
#endif
#if SIMSIMD_TARGET_HASWELL
if (v & simsimd_cap_haswell_k) switch (k) {
case simsimd_metric_hamming_k: *m = (m_t)&simsimd_hamming_b8_haswell, *c = simsimd_cap_haswell_k; return;
case simsimd_metric_jaccard_k: *m = (m_t)&simsimd_jaccard_b8_haswell, *c = simsimd_cap_haswell_k; return;
default: break;
}
#endif
if (v & simsimd_cap_serial_k) switch (k) {
case simsimd_metric_hamming_k: *m = (m_t)&simsimd_hamming_b8_serial, *c = simsimd_cap_serial_k; return;
case simsimd_metric_jaccard_k: *m = (m_t)&simsimd_jaccard_b8_serial, *c = simsimd_cap_serial_k; return;
default: break;
}
}
SIMSIMD_INTERNAL void _simsimd_find_kernel_punned_f64c(simsimd_capability_t v, simsimd_metric_kind_t k,
simsimd_kernel_punned_t *m, simsimd_capability_t *c) {
typedef simsimd_kernel_punned_t m_t;
#if SIMSIMD_TARGET_SVE
if (v & simsimd_cap_sve_k) switch (k) {
case simsimd_metric_dot_k: *m = (m_t)&simsimd_dot_f64c_sve, *c = simsimd_cap_sve_k; return;
case simsimd_metric_vdot_k: *m = (m_t)&simsimd_vdot_f64c_sve, *c = simsimd_cap_sve_k; return;
default: break;
}
#endif
#if SIMSIMD_TARGET_SKYLAKE
if (v & simsimd_cap_skylake_k) switch (k) {
case simsimd_metric_dot_k: *m = (m_t)&simsimd_dot_f64c_skylake, *c = simsimd_cap_skylake_k; return;
case simsimd_metric_vdot_k: *m = (m_t)&simsimd_vdot_f64c_skylake, *c = simsimd_cap_skylake_k; return;
case simsimd_metric_bilinear_k: *m = (m_t)&simsimd_bilinear_f64c_skylake, *c = simsimd_cap_skylake_k; return;
default: break;
}
#endif
if (v & simsimd_cap_serial_k) switch (k) {
case simsimd_metric_dot_k: *m = (m_t)&simsimd_dot_f64c_serial, *c = simsimd_cap_serial_k; return;
case simsimd_metric_vdot_k: *m = (m_t)&simsimd_vdot_f64c_serial, *c = simsimd_cap_serial_k; return;
case simsimd_metric_bilinear_k: *m = (m_t)&simsimd_bilinear_f64c_serial, *c = simsimd_cap_serial_k; return;
default: break;
}
}
SIMSIMD_INTERNAL void _simsimd_find_kernel_punned_f32c(simsimd_capability_t v, simsimd_metric_kind_t k,
simsimd_kernel_punned_t *m, simsimd_capability_t *c) {
typedef simsimd_kernel_punned_t m_t;
#if SIMSIMD_TARGET_SVE
if (v & simsimd_cap_sve_k) switch (k) {
case simsimd_metric_dot_k: *m = (m_t)&simsimd_dot_f32c_sve, *c = simsimd_cap_sve_k; return;
case simsimd_metric_vdot_k: *m = (m_t)&simsimd_vdot_f32c_sve, *c = simsimd_cap_sve_k; return;
default: break;
}
#endif
#if SIMSIMD_TARGET_NEON
if (v & simsimd_cap_neon_k) switch (k) {
case simsimd_metric_dot_k: *m = (m_t)&simsimd_dot_f32c_neon, *c = simsimd_cap_neon_k; return;
case simsimd_metric_vdot_k: *m = (m_t)&simsimd_vdot_f32c_neon, *c = simsimd_cap_neon_k; return;
case simsimd_metric_bilinear_k: *m = (m_t)&simsimd_bilinear_f32c_neon, *c = simsimd_cap_neon_k; return;
default: break;
}
#endif
#if SIMSIMD_TARGET_SKYLAKE
if (v & simsimd_cap_skylake_k) switch (k) {
case simsimd_metric_dot_k: *m = (m_t)&simsimd_dot_f32c_skylake, *c = simsimd_cap_skylake_k; return;
case simsimd_metric_vdot_k: *m = (m_t)&simsimd_vdot_f32c_skylake, *c = simsimd_cap_skylake_k; return;
case simsimd_metric_bilinear_k: *m = (m_t)&simsimd_bilinear_f32c_skylake, *c = simsimd_cap_skylake_k; return;
default: break;
}
#endif
#if SIMSIMD_TARGET_HASWELL
if (v & simsimd_cap_haswell_k) switch (k) {
case simsimd_metric_dot_k: *m = (m_t)&simsimd_dot_f32c_haswell, *c = simsimd_cap_haswell_k; return;
case simsimd_metric_vdot_k: *m = (m_t)&simsimd_vdot_f32c_haswell, *c = simsimd_cap_haswell_k; return;
default: break;
}
#endif
if (v & simsimd_cap_serial_k) switch (k) {
case simsimd_metric_dot_k: *m = (m_t)&simsimd_dot_f32c_serial, *c = simsimd_cap_serial_k; return;
case simsimd_metric_vdot_k: *m = (m_t)&simsimd_vdot_f32c_serial, *c = simsimd_cap_serial_k; return;
case simsimd_metric_bilinear_k: *m = (m_t)&simsimd_bilinear_f32c_serial, *c = simsimd_cap_serial_k; return;
default: break;
}
}
SIMSIMD_INTERNAL void _simsimd_find_kernel_punned_f16c(simsimd_capability_t v, simsimd_metric_kind_t k,
simsimd_kernel_punned_t *m, simsimd_capability_t *c) {
typedef simsimd_kernel_punned_t m_t;
#if SIMSIMD_TARGET_SVE_F16
if (v & simsimd_cap_sve_k) switch (k) {
case simsimd_metric_dot_k: *m = (m_t)&simsimd_dot_f16c_sve, *c = simsimd_cap_sve_f16_k; return;
case simsimd_metric_vdot_k: *m = (m_t)&simsimd_vdot_f16c_sve, *c = simsimd_cap_sve_f16_k; return;
default: break;
}
#endif
#if SIMSIMD_TARGET_NEON_F16
if (v & simsimd_cap_neon_f16_k) switch (k) {
case simsimd_metric_dot_k: *m = (m_t)&simsimd_dot_f16c_neon, *c = simsimd_cap_neon_f16_k; return;
case simsimd_metric_vdot_k: *m = (m_t)&simsimd_vdot_f16c_neon, *c = simsimd_cap_neon_f16_k; return;
case simsimd_metric_bilinear_k: *m = (m_t)&simsimd_bilinear_f16c_neon, *c = simsimd_cap_neon_f16_k; return;
default: break;
}
#endif
#if SIMSIMD_TARGET_SAPPHIRE
if (v & simsimd_cap_sapphire_k) switch (k) {
case simsimd_metric_dot_k: *m = (m_t)&simsimd_dot_f16c_sapphire, *c = simsimd_cap_sapphire_k; return;
case simsimd_metric_vdot_k: *m = (m_t)&simsimd_vdot_f16c_sapphire, *c = simsimd_cap_sapphire_k; return;
case simsimd_metric_bilinear_k: *m = (m_t)&simsimd_bilinear_f16c_sapphire, *c = simsimd_cap_sapphire_k; return;
default: break;
}
#endif
#if SIMSIMD_TARGET_HASWELL
if (v & simsimd_cap_haswell_k) switch (k) {
case simsimd_metric_dot_k: *m = (m_t)&simsimd_dot_f16c_haswell, *c = simsimd_cap_haswell_k; return;
case simsimd_metric_vdot_k: *m = (m_t)&simsimd_vdot_f16c_haswell, *c = simsimd_cap_haswell_k; return;
default: break;
}
#endif
if (v & simsimd_cap_serial_k) switch (k) {
case simsimd_metric_dot_k: *m = (m_t)&simsimd_dot_f16c_serial, *c = simsimd_cap_serial_k; return;
case simsimd_metric_vdot_k: *m = (m_t)&simsimd_vdot_f16c_serial, *c = simsimd_cap_serial_k; return;
case simsimd_metric_bilinear_k: *m = (m_t)&simsimd_bilinear_f16c_serial, *c = simsimd_cap_serial_k; return;
default: break;
}
}
SIMSIMD_INTERNAL void _simsimd_find_kernel_punned_bf16c(simsimd_capability_t v, simsimd_metric_kind_t k,
simsimd_kernel_punned_t *m, simsimd_capability_t *c) {
typedef simsimd_kernel_punned_t m_t;
#if SIMSIMD_TARGET_NEON_BF16
if (v & simsimd_cap_neon_bf16_k) switch (k) {
case simsimd_metric_dot_k: *m = (m_t)&simsimd_dot_bf16c_neon, *c = simsimd_cap_neon_bf16_k; return;
case simsimd_metric_vdot_k: *m = (m_t)&simsimd_vdot_bf16c_neon, *c = simsimd_cap_neon_bf16_k; return;
case simsimd_metric_bilinear_k: *m = (m_t)&simsimd_bilinear_bf16c_neon, *c = simsimd_cap_neon_bf16_k; return;
default: break;
}
#endif
#if SIMSIMD_TARGET_GENOA
if (v & simsimd_cap_genoa_k) switch (k) {
case simsimd_metric_dot_k: *m = (m_t)&simsimd_dot_bf16c_genoa, *c = simsimd_cap_genoa_k; return;
case simsimd_metric_vdot_k: *m = (m_t)&simsimd_vdot_bf16c_genoa, *c = simsimd_cap_genoa_k; return;
case simsimd_metric_bilinear_k: *m = (m_t)&simsimd_bilinear_bf16c_genoa, *c = simsimd_cap_genoa_k; return;
default: break;
}
#endif
if (v & simsimd_cap_serial_k) switch (k) {
case simsimd_metric_dot_k: *m = (m_t)&simsimd_dot_bf16c_serial, *c = simsimd_cap_serial_k; return;
case simsimd_metric_vdot_k: *m = (m_t)&simsimd_vdot_bf16c_serial, *c = simsimd_cap_serial_k; return;
case simsimd_metric_bilinear_k: *m = (m_t)&simsimd_bilinear_bf16c_serial, *c = simsimd_cap_serial_k; return;
default: break;
}
}
SIMSIMD_INTERNAL void _simsimd_find_kernel_punned_u16(simsimd_capability_t v, simsimd_metric_kind_t k,
simsimd_kernel_punned_t *m, simsimd_capability_t *c) {
typedef simsimd_kernel_punned_t m_t;
#if SIMSIMD_TARGET_SVE2
if (v & simsimd_cap_sve2_k) switch (k) {
case simsimd_metric_intersect_k: *m = (m_t)&simsimd_intersect_u16_sve2, *c = simsimd_cap_sve2_k; return;
case simsimd_metric_spdot_counts_k: *m = (m_t)&simsimd_spdot_counts_u16_sve2, *c = simsimd_cap_sve2_k; return;
#if SIMSIMD_TARGET_SVE_BF16 //! We also need `bf16` support for weights
case simsimd_metric_spdot_weights_k: *m = (m_t)&simsimd_spdot_weights_u16_sve2, *c = simsimd_cap_sve2_k; return;
#endif
default: break;
}
#endif
#if SIMSIMD_TARGET_NEON
if (v & simsimd_cap_neon_k) switch (k) {
case simsimd_metric_intersect_k: *m = (m_t)&simsimd_intersect_u16_neon, *c = simsimd_cap_neon_k; return;
default: break;
}
#endif
#if SIMSIMD_TARGET_TURIN
if (v & simsimd_cap_turin_k) switch (k) {
case simsimd_metric_intersect_k: *m = (m_t)&simsimd_intersect_u16_turin, *c = simsimd_cap_turin_k; return;
case simsimd_metric_spdot_counts_k: *m = (m_t)&simsimd_spdot_counts_u16_turin, *c = simsimd_cap_turin_k; return;
case simsimd_metric_spdot_weights_k:
*m = (m_t)&simsimd_spdot_weights_u16_turin, *c = simsimd_cap_turin_k;
return;
default: break;
}
#endif
#if SIMSIMD_TARGET_ICE
if (v & simsimd_cap_ice_k) switch (k) {
case simsimd_metric_intersect_k: *m = (m_t)&simsimd_intersect_u16_ice, *c = simsimd_cap_ice_k; return;
default: break;
}
#endif
if (v & simsimd_cap_serial_k) switch (k) {
case simsimd_metric_intersect_k: *m = (m_t)&simsimd_intersect_u16_serial, *c = simsimd_cap_serial_k; return;
default: break;
}
}
SIMSIMD_INTERNAL void _simsimd_find_kernel_punned_u32(simsimd_capability_t v, simsimd_metric_kind_t k,
simsimd_kernel_punned_t *m, simsimd_capability_t *c) {
typedef simsimd_kernel_punned_t m_t;
#if SIMSIMD_TARGET_SVE2
if (v & simsimd_cap_sve2_k) switch (k) {
case simsimd_metric_intersect_k: *m = (m_t)&simsimd_intersect_u32_sve2, *c = simsimd_cap_sve2_k; return;
default: break;
}
#endif
#if SIMSIMD_TARGET_NEON
if (v & simsimd_cap_neon_k) switch (k) {
case simsimd_metric_intersect_k: *m = (m_t)&simsimd_intersect_u32_neon, *c = simsimd_cap_neon_k; return;
default: break;
}
#endif
#if SIMSIMD_TARGET_TURIN
if (v & simsimd_cap_turin_k) switch (k) {
case simsimd_metric_intersect_k: *m = (m_t)&simsimd_intersect_u32_turin, *c = simsimd_cap_turin_k; return;
default: break;
}
#endif
#if SIMSIMD_TARGET_ICE
if (v & simsimd_cap_ice_k) switch (k) {
case simsimd_metric_intersect_k: *m = (m_t)&simsimd_intersect_u32_ice, *c = simsimd_cap_ice_k; return;
default: break;
}
#endif
if (v & simsimd_cap_serial_k) switch (k) {
case simsimd_metric_intersect_k: *m = (m_t)&simsimd_intersect_u32_serial, *c = simsimd_cap_serial_k; return;
default: break;
}
}
/**
* @brief Determines the best-suited metric implementation for the given datatype,
* constrained to the hardware capabilities that are both supported and allowed.
*
* @param kind The kind of metric to be evaluated.
* @param datatype The data type for which the metric needs to be evaluated.
* @param supported The hardware capabilities supported by the CPU.
* @param allowed The hardware capabilities allowed for use.
* @param kernel_output Output variable for the selected similarity function.
* @param capability_output Output variable for the utilized hardware capabilities.
*/
SIMSIMD_INTERNAL void _simsimd_find_kernel_punned_implementation( //
simsimd_metric_kind_t kind, //
simsimd_datatype_t datatype, //
simsimd_capability_t supported, //
simsimd_capability_t allowed, //
simsimd_kernel_punned_t *kernel_output, //
simsimd_capability_t *capability_output) {
// Modern compilers abso-freaking-lutely love optimizing-out my logic!
// Just marking the variables as `volatile` is not enough, so we have
// to add inline assembly to further discourage them!
#if defined(_MSC_VER)
_ReadWriteBarrier();
#else
__asm__ __volatile__("" ::: "memory");
#endif
simsimd_kernel_punned_t *m = kernel_output;
simsimd_capability_t *c = capability_output;
simsimd_capability_t viable = (simsimd_capability_t)(supported & allowed);
switch (datatype) {
case simsimd_datatype_f64_k: _simsimd_find_kernel_punned_f64(viable, kind, m, c); return;
case simsimd_datatype_f32_k: _simsimd_find_kernel_punned_f32(viable, kind, m, c); return;
case simsimd_datatype_f16_k: _simsimd_find_kernel_punned_f16(viable, kind, m, c); return;
case simsimd_datatype_bf16_k: _simsimd_find_kernel_punned_bf16(viable, kind, m, c); return;
case simsimd_datatype_i8_k: _simsimd_find_kernel_punned_i8(viable, kind, m, c); return;
case simsimd_datatype_u8_k: _simsimd_find_kernel_punned_u8(viable, kind, m, c); return;
case simsimd_datatype_b8_k: _simsimd_find_kernel_punned_b8(viable, kind, m, c); return;
case simsimd_datatype_f32c_k: _simsimd_find_kernel_punned_f32c(viable, kind, m, c); return;
case simsimd_datatype_f64c_k: _simsimd_find_kernel_punned_f64c(viable, kind, m, c); return;
case simsimd_datatype_f16c_k: _simsimd_find_kernel_punned_f16c(viable, kind, m, c); return;
case simsimd_datatype_bf16c_k: _simsimd_find_kernel_punned_bf16c(viable, kind, m, c); return;
case simsimd_datatype_u16_k: _simsimd_find_kernel_punned_u16(viable, kind, m, c); return;
case simsimd_datatype_u32_k: _simsimd_find_kernel_punned_u32(viable, kind, m, c); return;
// These data-types are not supported yet
case simsimd_datatype_i4x2_k: break;
case simsimd_datatype_i16_k: break;
case simsimd_datatype_i32_k: break;
case simsimd_datatype_i64_k: break;
case simsimd_datatype_u64_k: break;
case simsimd_datatype_unknown_k: break;
default: break;
}
// Replace with zeros if no suitable implementation was found
*m = (simsimd_kernel_punned_t)0;
*c = (simsimd_capability_t)0;
// Modern compilers abso-freaking-lutely love optimizing-out my logic!
// Just marking the variables as `volatile` is not enough, so we have
// to add inline assembly to further discourage them!
#if defined(_MSC_VER)
_ReadWriteBarrier();
#else
__asm__ __volatile__("" ::: "memory");
#endif
}
#pragma GCC diagnostic pop
#pragma clang diagnostic pop
/**
* @brief Selects the most suitable metric implementation based on the given metric kind, datatype,
* and allowed capabilities. @b Don't call this too often; prefer caching the result of `simsimd_capabilities()`.
*
* @param kind The kind of metric to be evaluated.
* @param datatype The data type for which the metric needs to be evaluated.
* @param allowed The hardware capabilities allowed for use.
* @return A function pointer to the selected metric implementation.
*/
SIMSIMD_PUBLIC simsimd_kernel_punned_t simsimd_metric_punned( //
simsimd_metric_kind_t kind, //
simsimd_datatype_t datatype, //
simsimd_capability_t allowed) {
simsimd_kernel_punned_t result = 0;
simsimd_capability_t c = simsimd_cap_serial_k;
simsimd_capability_t supported = simsimd_capabilities();
simsimd_find_kernel_punned(kind, datatype, supported, allowed, &result, &c);
return result;
}
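/* Usage sketch (illustrative, not part of the library): resolve a type-punned cosine kernel
 * for `f32` vectors once and reuse it. The local function-pointer typedef, the vector contents,
 * and the dimensionality are assumptions made for this example; the returned pointer is NULL
 * when no implementation is compiled in or allowed.
 *
 *     typedef void (*cos_f32_fn_t)(simsimd_f32_t const *, simsimd_f32_t const *, //
 *                                  simsimd_size_t, simsimd_distance_t *);
 *     simsimd_kernel_punned_t punned = simsimd_metric_punned( //
 *         simsimd_metric_cos_k, simsimd_datatype_f32_k, simsimd_capabilities());
 *     if (punned) {
 *         simsimd_f32_t a[4] = {1, 0, 0, 0}, b[4] = {0, 1, 0, 0};
 *         simsimd_distance_t distance;
 *         ((cos_f32_fn_t)punned)(a, b, 4, &distance); // cast back to the concrete signature
 *     }
 */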
#if SIMSIMD_DYNAMIC_DISPATCH
/* Run-time feature-testing functions
* - Check if the CPU supports NEON or SVE extensions on Arm
* - Check if the CPU supports AVX2 and F16C extensions on Haswell x86 CPUs and newer
* - Check if the CPU supports AVX512F and AVX512BW extensions on Skylake x86 CPUs and newer
* - Check if the CPU supports AVX512VNNI, AVX512IFMA, AVX512BITALG, AVX512VBMI2, and AVX512VPOPCNTDQ
* extensions on Ice Lake x86 CPUs and newer
* - Check if the CPU supports AVX512BF16 extensions on Genoa x86 CPUs and newer
* - Check if the CPU supports AVX512FP16 extensions on Sapphire Rapids x86 CPUs and newer
* - Check if the CPU supports AVX512VP2INTERSECT extensions on Turin x86 CPUs and newer
*
* @return 1 if the CPU supports the SIMD instruction set, 0 otherwise.
*/
SIMSIMD_DYNAMIC simsimd_capability_t simsimd_capabilities(void);
SIMSIMD_DYNAMIC int simsimd_flush_denormals(void);
SIMSIMD_DYNAMIC int simsimd_uses_dynamic_dispatch(void);
SIMSIMD_DYNAMIC int simsimd_uses_neon(void);
SIMSIMD_DYNAMIC int simsimd_uses_neon_f16(void);
SIMSIMD_DYNAMIC int simsimd_uses_neon_bf16(void);
SIMSIMD_DYNAMIC int simsimd_uses_neon_i8(void);
SIMSIMD_DYNAMIC int simsimd_uses_sve(void);
SIMSIMD_DYNAMIC int simsimd_uses_sve_f16(void);
SIMSIMD_DYNAMIC int simsimd_uses_sve_bf16(void);
SIMSIMD_DYNAMIC int simsimd_uses_sve_i8(void);
SIMSIMD_DYNAMIC int simsimd_uses_sve2(void);
SIMSIMD_DYNAMIC int simsimd_uses_haswell(void);
SIMSIMD_DYNAMIC int simsimd_uses_skylake(void);
SIMSIMD_DYNAMIC int simsimd_uses_ice(void);
SIMSIMD_DYNAMIC int simsimd_uses_genoa(void);
SIMSIMD_DYNAMIC int simsimd_uses_sapphire(void);
SIMSIMD_DYNAMIC int simsimd_uses_turin(void);
SIMSIMD_DYNAMIC int simsimd_uses_sierra(void);
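/* Usage sketch (illustrative, not part of the library): query the capability mask once and
 * keep it around, as recommended above for `simsimd_metric_punned`; the `simsimd_uses_*`
 * helpers answer narrower yes/no questions about individual instruction-set extensions.
 *
 *     simsimd_capability_t caps = simsimd_capabilities(); // cache this, don't re-query in hot loops
 *     int has_simd = simsimd_uses_neon() || simsimd_uses_sve() || //
 *                    simsimd_uses_haswell() || simsimd_uses_skylake();
 *     (void)caps, (void)has_simd;
 */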
/* Inner products
* - Dot product: the sum of the products of the corresponding elements of two vectors.
* - Complex Dot product: the dot product of two complex vectors, without conjugation.
* - Complex Conjugate Dot product: dot product with a conjugate first argument.
*
* @param a The first vector.
* @param b The second vector.
* @param n The number of elements in the vectors; for complex variants, the number of scalars.
* @param d The output distance value.
*
* @note The dot product can be negative; to use it as a distance, take `1 - a * b`.
* @note The dot product is zero if and only if the two vectors are orthogonal.
* @note Defined only for floating-point and integer data types.
*/
SIMSIMD_DYNAMIC void simsimd_dot_i8(simsimd_i8_t const *a, simsimd_i8_t const *b, simsimd_size_t n,
simsimd_distance_t *d);
SIMSIMD_DYNAMIC void simsimd_dot_u8(simsimd_u8_t const *a, simsimd_u8_t const *b, simsimd_size_t n,
simsimd_distance_t *d);
SIMSIMD_DYNAMIC void simsimd_dot_f16(simsimd_f16_t const *a, simsimd_f16_t const *b, simsimd_size_t n,
simsimd_distance_t *d);
SIMSIMD_DYNAMIC void simsimd_dot_bf16(simsimd_bf16_t const *a, simsimd_bf16_t const *b, simsimd_size_t n,
simsimd_distance_t *d);
SIMSIMD_DYNAMIC void simsimd_dot_f32(simsimd_f32_t const *a, simsimd_f32_t const *b, simsimd_size_t n,
simsimd_distance_t *d);
SIMSIMD_DYNAMIC void simsimd_dot_f64(simsimd_f64_t const *a, simsimd_f64_t const *b, simsimd_size_t n,
simsimd_distance_t *d);
SIMSIMD_DYNAMIC void simsimd_dot_f16c(simsimd_f16c_t const *a, simsimd_f16c_t const *b, simsimd_size_t n,
simsimd_distance_t *d);
SIMSIMD_DYNAMIC void simsimd_dot_bf16c(simsimd_bf16c_t const *a, simsimd_bf16c_t const *b, simsimd_size_t n,
simsimd_distance_t *d);
SIMSIMD_DYNAMIC void simsimd_dot_f32c(simsimd_f32c_t const *a, simsimd_f32c_t const *b, simsimd_size_t n,
simsimd_distance_t *d);
SIMSIMD_DYNAMIC void simsimd_dot_f64c(simsimd_f64c_t const *a, simsimd_f64c_t const *b, simsimd_size_t n,
simsimd_distance_t *d);
SIMSIMD_DYNAMIC void simsimd_vdot_f16c(simsimd_f16c_t const *a, simsimd_f16c_t const *b, simsimd_size_t n,
simsimd_distance_t *d);
SIMSIMD_DYNAMIC void simsimd_vdot_bf16c(simsimd_bf16c_t const *a, simsimd_bf16c_t const *b, simsimd_size_t n,
simsimd_distance_t *d);
SIMSIMD_DYNAMIC void simsimd_vdot_f32c(simsimd_f32c_t const *a, simsimd_f32c_t const *b, simsimd_size_t n,
simsimd_distance_t *d);
SIMSIMD_DYNAMIC void simsimd_vdot_f64c(simsimd_f64c_t const *a, simsimd_f64c_t const *b, simsimd_size_t n,
simsimd_distance_t *d);
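/* Usage sketch (illustrative, not part of the library): a plain `f32` dot product. The values
 * and dimensionality are assumptions for the example; per the note above, subtract the result
 * from one to use it as a distance.
 *
 *     simsimd_f32_t a[3] = {1.f, 2.f, 3.f}, b[3] = {4.f, 5.f, 6.f};
 *     simsimd_distance_t product;
 *     simsimd_dot_f32(a, b, 3, &product); // 1*4 + 2*5 + 3*6 = 32
 */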
/* Spatial distances
* - Cosine distance: the cosine of the angle between two vectors.
* - L2 squared distance: the squared Euclidean distance between two vectors.
*
* @param a The first vector.
* @param b The second vector.
* @param n The number of elements in the vectors.
* @param d The output distance value.
*
* @note The output distance value is non-negative.
* @note The output distance value is zero if and only if the two vectors are identical.
* @note Defined only for floating-point and integer data types.
*/
SIMSIMD_DYNAMIC void simsimd_cos_i8(simsimd_i8_t const *a, simsimd_i8_t const *b, simsimd_size_t n,
simsimd_distance_t *d);
SIMSIMD_DYNAMIC void simsimd_cos_u8(simsimd_u8_t const *a, simsimd_u8_t const *b, simsimd_size_t n,
simsimd_distance_t *d);
SIMSIMD_DYNAMIC void simsimd_cos_f16(simsimd_f16_t const *a, simsimd_f16_t const *b, simsimd_size_t n,
simsimd_distance_t *d);
SIMSIMD_DYNAMIC void simsimd_cos_bf16(simsimd_bf16_t const *a, simsimd_bf16_t const *b, simsimd_size_t n,
simsimd_distance_t *d);
SIMSIMD_DYNAMIC void simsimd_cos_f32(simsimd_f32_t const *a, simsimd_f32_t const *b, simsimd_size_t n,
simsimd_distance_t *d);
SIMSIMD_DYNAMIC void simsimd_cos_f64(simsimd_f64_t const *a, simsimd_f64_t const *b, simsimd_size_t n,
simsimd_distance_t *d);
SIMSIMD_DYNAMIC void simsimd_l2sq_i8(simsimd_i8_t const *a, simsimd_i8_t const *b, simsimd_size_t n,
simsimd_distance_t *d);
SIMSIMD_DYNAMIC void simsimd_l2sq_u8(simsimd_u8_t const *a, simsimd_u8_t const *b, simsimd_size_t n,
simsimd_distance_t *d);
SIMSIMD_DYNAMIC void simsimd_l2sq_f16(simsimd_f16_t const *a, simsimd_f16_t const *b, simsimd_size_t n,
simsimd_distance_t *d);
SIMSIMD_DYNAMIC void simsimd_l2sq_bf16(simsimd_bf16_t const *a, simsimd_bf16_t const *b, simsimd_size_t n,
simsimd_distance_t *d);
SIMSIMD_DYNAMIC void simsimd_l2sq_f32(simsimd_f32_t const *a, simsimd_f32_t const *b, simsimd_size_t n,
simsimd_distance_t *d);
SIMSIMD_DYNAMIC void simsimd_l2sq_f64(simsimd_f64_t const *a, simsimd_f64_t const *b, simsimd_size_t n,
simsimd_distance_t *d);
SIMSIMD_DYNAMIC void simsimd_l2_i8(simsimd_i8_t const *a, simsimd_i8_t const *b, simsimd_size_t n,
simsimd_distance_t *d);
SIMSIMD_DYNAMIC void simsimd_l2_u8(simsimd_u8_t const *a, simsimd_u8_t const *b, simsimd_size_t n,
simsimd_distance_t *d);
SIMSIMD_DYNAMIC void simsimd_l2_f16(simsimd_f16_t const *a, simsimd_f16_t const *b, simsimd_size_t n,
simsimd_distance_t *d);
SIMSIMD_DYNAMIC void simsimd_l2_bf16(simsimd_bf16_t const *a, simsimd_bf16_t const *b, simsimd_size_t n,
simsimd_distance_t *d);
SIMSIMD_DYNAMIC void simsimd_l2_f32(simsimd_f32_t const *a, simsimd_f32_t const *b, simsimd_size_t n,
simsimd_distance_t *d);
SIMSIMD_DYNAMIC void simsimd_l2_f64(simsimd_f64_t const *a, simsimd_f64_t const *b, simsimd_size_t n,
simsimd_distance_t *d);
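/* Usage sketch (illustrative, not part of the library): cosine and squared Euclidean distances
 * over the same pair of `f32` vectors. The values are assumptions for the example; the cosine
 * result follows the `1 - cosine similarity` convention implied by the notes above.
 *
 *     simsimd_f32_t a[2] = {3.f, 0.f}, b[2] = {0.f, 4.f};
 *     simsimd_distance_t angular, squared;
 *     simsimd_cos_f32(a, b, 2, &angular);  // orthogonal vectors -> expected 1
 *     simsimd_l2sq_f32(a, b, 2, &squared); // 3^2 + 4^2 = 25
 */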
/* Binary distances
* - Hamming distance: the number of positions at which the corresponding bits are different.
* - Jaccard distance: one minus the ratio of shared set bits (intersection) to the total set bits (union).
*
* @param a The first binary vector.
* @param b The second binary vector.
* @param n The number of 8-bit words in the vectors.
* @param d The output distance value.
*
* @note The output distance value is non-negative.
* @note The output distance value is zero if and only if the two vectors are identical.
* @note Defined only for binary data.
*/
SIMSIMD_DYNAMIC void simsimd_hamming_b8(simsimd_b8_t const *a, simsimd_b8_t const *b, simsimd_size_t n,
simsimd_distance_t *d);
SIMSIMD_DYNAMIC void simsimd_jaccard_b8(simsimd_b8_t const *a, simsimd_b8_t const *b, simsimd_size_t n,
simsimd_distance_t *d);
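/* Usage sketch (illustrative, not part of the library): Hamming and Jaccard distances over two
 * 16-bit bit-sets packed into two 8-bit words each. The bit patterns are assumptions made for
 * the example.
 *
 *     simsimd_b8_t a[2] = {0xFF, 0x00}, b[2] = {0x0F, 0x00};
 *     simsimd_distance_t hamming, jaccard;
 *     simsimd_hamming_b8(a, b, 2, &hamming); // 4 bits differ
 *     simsimd_jaccard_b8(a, b, 2, &jaccard); // intersection 4, union 8 -> expected 1 - 4/8 = 0.5
 */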
/* Probability distributions
* - Jensen-Shannon divergence: a symmetric measure of the difference between two probability distributions.
* - Kullback-Leibler divergence: a measure of how one probability distribution diverges from a second.
*
* @param a The first discrete probability distribution.
* @param b The second discrete probability distribution.
* @param n The number of elements in the discrete distributions.
* @param d The output divergence value.
*
* @note The distributions are assumed to be normalized.
* @note The output divergence value is non-negative.
* @note The output divergence value is zero if and only if the two distributions are identical.
* @note Defined only for floating-point data types.
*/
SIMSIMD_DYNAMIC void simsimd_kl_f16(simsimd_f16_t const *a, simsimd_f16_t const *b, simsimd_size_t n,
simsimd_distance_t *d);
SIMSIMD_DYNAMIC void simsimd_kl_bf16(simsimd_bf16_t const *a, simsimd_bf16_t const *b, simsimd_size_t n,
simsimd_distance_t *d);
SIMSIMD_DYNAMIC void simsimd_kl_f32(simsimd_f32_t const *a, simsimd_f32_t const *b, simsimd_size_t n,
simsimd_distance_t *d);
SIMSIMD_DYNAMIC void simsimd_kl_f64(simsimd_f64_t const *a, simsimd_f64_t const *b, simsimd_size_t n,
simsimd_distance_t *d);
SIMSIMD_DYNAMIC void simsimd_js_f16(simsimd_f16_t const *a, simsimd_f16_t const *b, simsimd_size_t n,
simsimd_distance_t *d);
SIMSIMD_DYNAMIC void simsimd_js_bf16(simsimd_bf16_t const *a, simsimd_bf16_t const *b, simsimd_size_t n,
simsimd_distance_t *d);
SIMSIMD_DYNAMIC void simsimd_js_f32(simsimd_f32_t const *a, simsimd_f32_t const *b, simsimd_size_t n,
simsimd_distance_t *d);
SIMSIMD_DYNAMIC void simsimd_js_f64(simsimd_f64_t const *a, simsimd_f64_t const *b, simsimd_size_t n,
simsimd_distance_t *d);
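/* Usage sketch (illustrative, not part of the library): Kullback-Leibler and Jensen-Shannon
 * divergences between two already-normalized two-bin distributions. The probabilities are
 * assumptions made for the example.
 *
 *     simsimd_f32_t p[2] = {0.5f, 0.5f}, q[2] = {0.9f, 0.1f};
 *     simsimd_distance_t kl, js;
 *     simsimd_kl_f32(p, q, 2, &kl); // non-negative, zero only when p == q
 *     simsimd_js_f32(p, q, 2, &js); // symmetric in p and q, also non-negative
 */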
#else
/* Compile-time feature-testing functions
* - Check if the CPU supports NEON or SVE extensions on Arm
* - Check if the CPU supports AVX2 and F16C extensions on Haswell x86 CPUs and newer
* - Check if the CPU supports AVX512F and AVX512BW extensions on Skylake x86 CPUs and newer
* - Check if the CPU supports AVX512VNNI, AVX512IFMA, AVX512BITALG, AVX512VBMI2, and AVX512VPOPCNTDQ
* extensions on Ice Lake x86 CPUs and newer
* - Check if the CPU supports AVX512BF16 extensions on Genoa x86 CPUs and newer
* - Check if the CPU supports AVX512FP16 extensions on Sapphire Rapids x86 CPUs and newer
*
* @return 1 if the CPU supports the SIMD instruction set, 0 otherwise.
*/
// clang-format off
SIMSIMD_PUBLIC int simsimd_uses_neon(void) { return _SIMSIMD_TARGET_ARM && SIMSIMD_TARGET_NEON; }
SIMSIMD_PUBLIC int simsimd_uses_neon_f16(void) { return _SIMSIMD_TARGET_ARM && SIMSIMD_TARGET_NEON_F16 ; }
SIMSIMD_PUBLIC int simsimd_uses_neon_bf16(void) { return _SIMSIMD_TARGET_ARM && SIMSIMD_TARGET_NEON_BF16; }
SIMSIMD_PUBLIC int simsimd_uses_neon_i8(void) { return _SIMSIMD_TARGET_ARM && SIMSIMD_TARGET_NEON_I8; }
SIMSIMD_PUBLIC int simsimd_uses_sve(void) { return _SIMSIMD_TARGET_ARM && SIMSIMD_TARGET_SVE; }
SIMSIMD_PUBLIC int simsimd_uses_sve_f16(void) { return _SIMSIMD_TARGET_ARM && SIMSIMD_TARGET_SVE_F16; }
SIMSIMD_PUBLIC int simsimd_uses_sve_bf16(void) { return _SIMSIMD_TARGET_ARM && SIMSIMD_TARGET_SVE_BF16; }
SIMSIMD_PUBLIC int simsimd_uses_sve_i8(void) { return _SIMSIMD_TARGET_ARM && SIMSIMD_TARGET_SVE_I8; }
SIMSIMD_PUBLIC int simsimd_uses_sve2(void) { return _SIMSIMD_TARGET_ARM && SIMSIMD_TARGET_SVE2; }
SIMSIMD_PUBLIC int simsimd_uses_haswell(void) { return _SIMSIMD_TARGET_X86 && SIMSIMD_TARGET_HASWELL; }
SIMSIMD_PUBLIC int simsimd_uses_skylake(void) { return _SIMSIMD_TARGET_X86 && SIMSIMD_TARGET_SKYLAKE; }
SIMSIMD_PUBLIC int simsimd_uses_ice(void) { return _SIMSIMD_TARGET_X86 && SIMSIMD_TARGET_ICE; }
SIMSIMD_PUBLIC int simsimd_uses_genoa(void) { return _SIMSIMD_TARGET_X86 && SIMSIMD_TARGET_GENOA; }
SIMSIMD_PUBLIC int simsimd_uses_sapphire(void) { return _SIMSIMD_TARGET_X86 && SIMSIMD_TARGET_SAPPHIRE; }
SIMSIMD_PUBLIC int simsimd_uses_turin(void) { return _SIMSIMD_TARGET_X86 && SIMSIMD_TARGET_TURIN; }
SIMSIMD_PUBLIC int simsimd_uses_sierra(void) { return _SIMSIMD_TARGET_X86 && SIMSIMD_TARGET_SIERRA; }
SIMSIMD_PUBLIC int simsimd_uses_dynamic_dispatch(void) { return 0; }
SIMSIMD_PUBLIC int simsimd_flush_denormals(void) { return _simsimd_flush_denormals(); }
SIMSIMD_PUBLIC simsimd_capability_t simsimd_capabilities(void) { return _simsimd_capabilities_implementation(); }
SIMSIMD_PUBLIC void simsimd_find_kernel_punned( //
simsimd_metric_kind_t kind, //
simsimd_datatype_t datatype, //
simsimd_capability_t supported, //
simsimd_capability_t allowed, //
simsimd_kernel_punned_t* kernel_output, //
simsimd_capability_t* capability_output) {
_simsimd_find_kernel_punned_implementation(kind, datatype, supported, allowed, kernel_output, capability_output);
}
// clang-format on
/* Inner products
* - Dot product: the sum of the products of the corresponding elements of two vectors.
* - Complex Dot product: the dot product of two complex vectors, without conjugation.
* - Complex Conjugate Dot product: dot product with a conjugate first argument.
*
* @param a The first vector.
* @param b The second vector.
* @param n The number of elements in the vectors; for complex variants, the number of scalars.
* @param d The output distance value.
*
* @note The dot product can be negative; to use it as a distance, take `1 - a * b`.
* @note The dot product is zero if and only if the two vectors are orthogonal.
* @note Defined only for floating-point and integer data types.
*/
SIMSIMD_PUBLIC void simsimd_dot_i8(simsimd_i8_t const *a, simsimd_i8_t const *b, simsimd_size_t n,
simsimd_distance_t *d) {
#if SIMSIMD_TARGET_NEON_I8
simsimd_dot_i8_neon(a, b, n, d);
#elif SIMSIMD_TARGET_ICE
simsimd_dot_i8_ice(a, b, n, d);
#elif SIMSIMD_TARGET_HASWELL
simsimd_dot_i8_haswell(a, b, n, d);
#else
simsimd_dot_i8_serial(a, b, n, d);
#endif
}
SIMSIMD_PUBLIC void simsimd_dot_u8(simsimd_u8_t const *a, simsimd_u8_t const *b, simsimd_size_t n,
simsimd_distance_t *d) {
#if SIMSIMD_TARGET_NEON_I8
simsimd_dot_u8_neon(a, b, n, d);
#elif SIMSIMD_TARGET_ICE
simsimd_dot_u8_ice(a, b, n, d);
#elif SIMSIMD_TARGET_HASWELL
simsimd_dot_u8_haswell(a, b, n, d);
#else
simsimd_dot_u8_serial(a, b, n, d);
#endif
}
SIMSIMD_PUBLIC void simsimd_dot_f16(simsimd_f16_t const *a, simsimd_f16_t const *b, simsimd_size_t n,
simsimd_distance_t *d) {
#if SIMSIMD_TARGET_SVE_F16
simsimd_dot_f16_sve(a, b, n, d);
#elif SIMSIMD_TARGET_NEON_F16
simsimd_dot_f16_neon(a, b, n, d);
#elif SIMSIMD_TARGET_SAPPHIRE
simsimd_dot_f16_sapphire(a, b, n, d);
#elif SIMSIMD_TARGET_HASWELL
simsimd_dot_f16_haswell(a, b, n, d);
#else
simsimd_dot_f16_serial(a, b, n, d);
#endif
}
SIMSIMD_PUBLIC void simsimd_dot_bf16(simsimd_bf16_t const *a, simsimd_bf16_t const *b, simsimd_size_t n,
simsimd_distance_t *d) {
#if SIMSIMD_TARGET_GENOA
simsimd_dot_bf16_genoa(a, b, n, d);
#elif SIMSIMD_TARGET_HASWELL
simsimd_dot_bf16_haswell(a, b, n, d);
#elif SIMSIMD_TARGET_NEON_BF16
simsimd_dot_bf16_neon(a, b, n, d);
#else
simsimd_dot_bf16_serial(a, b, n, d);
#endif
}
SIMSIMD_PUBLIC void simsimd_dot_f32(simsimd_f32_t const *a, simsimd_f32_t const *b, simsimd_size_t n,
simsimd_distance_t *d) {
#if SIMSIMD_TARGET_SVE
simsimd_dot_f32_sve(a, b, n, d);
#elif SIMSIMD_TARGET_NEON
simsimd_dot_f32_neon(a, b, n, d);
#elif SIMSIMD_TARGET_SKYLAKE
simsimd_dot_f32_skylake(a, b, n, d);
#elif SIMSIMD_TARGET_HASWELL
simsimd_dot_f32_haswell(a, b, n, d);
#else
simsimd_dot_f32_serial(a, b, n, d);
#endif
}
SIMSIMD_PUBLIC void simsimd_dot_f64(simsimd_f64_t const *a, simsimd_f64_t const *b, simsimd_size_t n,
simsimd_distance_t *d) {
#if SIMSIMD_TARGET_SVE
simsimd_dot_f64_sve(a, b, n, d);
#elif SIMSIMD_TARGET_SKYLAKE
simsimd_dot_f64_skylake(a, b, n, d);
#else
simsimd_dot_f64_serial(a, b, n, d);
#endif
}
SIMSIMD_PUBLIC void simsimd_dot_f16c(simsimd_f16c_t const *a, simsimd_f16c_t const *b, simsimd_size_t n,
simsimd_distance_t *d) {
#if SIMSIMD_TARGET_SVE_F16
simsimd_dot_f16c_sve(a, b, n, d);
#elif SIMSIMD_TARGET_NEON_F16
simsimd_dot_f16c_neon(a, b, n, d);
#elif SIMSIMD_TARGET_SAPPHIRE
simsimd_dot_f16c_sapphire(a, b, n, d);
#elif SIMSIMD_TARGET_HASWELL
simsimd_dot_f16c_haswell(a, b, n, d);
#else
simsimd_dot_f16c_serial(a, b, n, d);
#endif
}
SIMSIMD_PUBLIC void simsimd_dot_bf16c(simsimd_bf16c_t const *a, simsimd_bf16c_t const *b, simsimd_size_t n,
simsimd_distance_t *d) {
#if SIMSIMD_TARGET_GENOA
simsimd_dot_bf16c_genoa(a, b, n, d);
#elif SIMSIMD_TARGET_NEON_BF16
simsimd_dot_bf16c_neon(a, b, n, d);
#else
simsimd_dot_bf16c_serial(a, b, n, d);
#endif
}
SIMSIMD_PUBLIC void simsimd_dot_f32c(simsimd_f32c_t const *a, simsimd_f32c_t const *b, simsimd_size_t n,
simsimd_distance_t *d) {
#if SIMSIMD_TARGET_SVE
simsimd_dot_f32c_sve(a, b, n, d);
#elif SIMSIMD_TARGET_NEON
simsimd_dot_f32c_neon(a, b, n, d);
#elif SIMSIMD_TARGET_SKYLAKE
simsimd_dot_f32c_skylake(a, b, n, d);
#elif SIMSIMD_TARGET_HASWELL
simsimd_dot_f32c_haswell(a, b, n, d);
#else
simsimd_dot_f32c_serial(a, b, n, d);
#endif
}
SIMSIMD_PUBLIC void simsimd_dot_f64c(simsimd_f64c_t const *a, simsimd_f64c_t const *b, simsimd_size_t n,
simsimd_distance_t *d) {
#if SIMSIMD_TARGET_SVE
simsimd_dot_f64c_sve(a, b, n, d);
#elif SIMSIMD_TARGET_SKYLAKE
simsimd_dot_f64c_skylake(a, b, n, d);
#else
simsimd_dot_f64c_serial(a, b, n, d);
#endif
}
SIMSIMD_PUBLIC void simsimd_vdot_f16c(simsimd_f16c_t const *a, simsimd_f16c_t const *b, simsimd_size_t n,
simsimd_distance_t *d) {
#if SIMSIMD_TARGET_SVE_F16
simsimd_vdot_f16c_sve(a, b, n, d);
#elif SIMSIMD_TARGET_NEON_F16
simsimd_vdot_f16c_neon(a, b, n, d);
#elif SIMSIMD_TARGET_SAPPHIRE
simsimd_vdot_f16c_sapphire(a, b, n, d);
#elif SIMSIMD_TARGET_HASWELL
simsimd_vdot_f16c_haswell(a, b, n, d);
#else
simsimd_vdot_f16c_serial(a, b, n, d);
#endif
}
SIMSIMD_PUBLIC void simsimd_vdot_bf16c(simsimd_bf16c_t const *a, simsimd_bf16c_t const *b, simsimd_size_t n,
simsimd_distance_t *d) {
#if SIMSIMD_TARGET_GENOA
simsimd_vdot_bf16c_genoa(a, b, n, d);
#elif SIMSIMD_TARGET_NEON_BF16
simsimd_vdot_bf16c_neon(a, b, n, d);
#else
simsimd_vdot_bf16c_serial(a, b, n, d);
#endif
}
SIMSIMD_PUBLIC void simsimd_vdot_f32c(simsimd_f32c_t const *a, simsimd_f32c_t const *b, simsimd_size_t n,
simsimd_distance_t *d) {
#if SIMSIMD_TARGET_SVE
simsimd_vdot_f32c_sve(a, b, n, d);
#elif SIMSIMD_TARGET_NEON
simsimd_vdot_f32c_neon(a, b, n, d);
#elif SIMSIMD_TARGET_SKYLAKE
simsimd_vdot_f32c_skylake(a, b, n, d);
#elif SIMSIMD_TARGET_HASWELL
simsimd_vdot_f32c_haswell(a, b, n, d);
#else
simsimd_vdot_f32c_serial(a, b, n, d);
#endif
}
SIMSIMD_PUBLIC void simsimd_vdot_f64c(simsimd_f64c_t const *a, simsimd_f64c_t const *b, simsimd_size_t n,
simsimd_distance_t *d) {
#if SIMSIMD_TARGET_SVE
simsimd_vdot_f64c_sve(a, b, n, d);
#elif SIMSIMD_TARGET_SKYLAKE
simsimd_vdot_f64c_skylake(a, b, n, d);
#else
simsimd_vdot_f64c_serial(a, b, n, d);
#endif
}
/* Spatial distances
* - Cosine distance: the cosine of the angle between two vectors.
* - L2 squared distance: the squared Euclidean distance between two vectors.
*
* @param a The first vector.
* @param b The second vector.
* @param n The number of elements in the vectors.
* @param d The output distance value.
*
* @note The output distance value is non-negative.
* @note The output distance value is zero if and only if the two vectors are identical.
* @note Defined only for floating-point and integer data types.
*/
SIMSIMD_PUBLIC void simsimd_cos_i8(simsimd_i8_t const *a, simsimd_i8_t const *b, simsimd_size_t n,
simsimd_distance_t *d) {
#if SIMSIMD_TARGET_NEON_I8
simsimd_cos_i8_neon(a, b, n, d);
#elif SIMSIMD_TARGET_ICE
simsimd_cos_i8_ice(a, b, n, d);
#elif SIMSIMD_TARGET_HASWELL
simsimd_cos_i8_haswell(a, b, n, d);
#else
simsimd_cos_i8_serial(a, b, n, d);
#endif
}
SIMSIMD_PUBLIC void simsimd_cos_u8(simsimd_u8_t const *a, simsimd_u8_t const *b, simsimd_size_t n,
simsimd_distance_t *d) {
#if SIMSIMD_TARGET_NEON_I8
simsimd_cos_u8_neon(a, b, n, d);
#elif SIMSIMD_TARGET_ICE
simsimd_cos_u8_ice(a, b, n, d);
#elif SIMSIMD_TARGET_HASWELL
simsimd_cos_u8_haswell(a, b, n, d);
#else
simsimd_cos_u8_serial(a, b, n, d);
#endif
}
SIMSIMD_PUBLIC void simsimd_cos_f16(simsimd_f16_t const *a, simsimd_f16_t const *b, simsimd_size_t n,
simsimd_distance_t *d) {
#if SIMSIMD_TARGET_SVE_F16
simsimd_cos_f16_sve(a, b, n, d);
#elif SIMSIMD_TARGET_NEON_F16
simsimd_cos_f16_neon(a, b, n, d);
#elif SIMSIMD_TARGET_SAPPHIRE
simsimd_cos_f16_sapphire(a, b, n, d);
#elif SIMSIMD_TARGET_HASWELL
simsimd_cos_f16_haswell(a, b, n, d);
#else
simsimd_cos_f16_serial(a, b, n, d);
#endif
}
SIMSIMD_PUBLIC void simsimd_cos_bf16(simsimd_bf16_t const *a, simsimd_bf16_t const *b, simsimd_size_t n,
simsimd_distance_t *d) {
#if SIMSIMD_TARGET_GENOA
simsimd_cos_bf16_genoa(a, b, n, d);
#elif SIMSIMD_TARGET_HASWELL
simsimd_cos_bf16_haswell(a, b, n, d);
#elif SIMSIMD_TARGET_SVE_BF16
simsimd_cos_bf16_sve(a, b, n, d);
#elif SIMSIMD_TARGET_NEON_BF16
simsimd_cos_bf16_neon(a, b, n, d);
#else
simsimd_cos_bf16_serial(a, b, n, d);
#endif
}
SIMSIMD_PUBLIC void simsimd_cos_f32(simsimd_f32_t const *a, simsimd_f32_t const *b, simsimd_size_t n,
simsimd_distance_t *d) {
#if SIMSIMD_TARGET_SVE
simsimd_cos_f32_sve(a, b, n, d);
#elif SIMSIMD_TARGET_NEON
simsimd_cos_f32_neon(a, b, n, d);
#elif SIMSIMD_TARGET_SKYLAKE
simsimd_cos_f32_skylake(a, b, n, d);
#elif SIMSIMD_TARGET_HASWELL
simsimd_cos_f32_haswell(a, b, n, d);
#else
simsimd_cos_f32_serial(a, b, n, d);
#endif
}
SIMSIMD_PUBLIC void simsimd_cos_f64(simsimd_f64_t const *a, simsimd_f64_t const *b, simsimd_size_t n,
simsimd_distance_t *d) {
#if SIMSIMD_TARGET_SVE
simsimd_cos_f64_sve(a, b, n, d);
#elif SIMSIMD_TARGET_NEON
simsimd_cos_f64_neon(a, b, n, d);
#elif SIMSIMD_TARGET_SKYLAKE
simsimd_cos_f64_skylake(a, b, n, d);
#else
simsimd_cos_f64_serial(a, b, n, d);
#endif
}
SIMSIMD_PUBLIC void simsimd_l2sq_i8(simsimd_i8_t const *a, simsimd_i8_t const *b, simsimd_size_t n,
simsimd_distance_t *d) {
#if SIMSIMD_TARGET_NEON_I8
simsimd_l2sq_i8_neon(a, b, n, d);
#elif SIMSIMD_TARGET_ICE
simsimd_l2sq_i8_ice(a, b, n, d);
#elif SIMSIMD_TARGET_HASWELL
simsimd_l2sq_i8_haswell(a, b, n, d);
#else
simsimd_l2sq_i8_serial(a, b, n, d);
#endif
}
SIMSIMD_PUBLIC void simsimd_l2sq_u8(simsimd_u8_t const *a, simsimd_u8_t const *b, simsimd_size_t n,
simsimd_distance_t *d) {
#if SIMSIMD_TARGET_NEON_I8
simsimd_l2sq_u8_neon(a, b, n, d);
#elif SIMSIMD_TARGET_ICE
simsimd_l2sq_u8_ice(a, b, n, d);
#elif SIMSIMD_TARGET_HASWELL
simsimd_l2sq_u8_haswell(a, b, n, d);
#else
simsimd_l2sq_u8_serial(a, b, n, d);
#endif
}
SIMSIMD_PUBLIC void simsimd_l2sq_f16(simsimd_f16_t const *a, simsimd_f16_t const *b, simsimd_size_t n,
simsimd_distance_t *d) {
#if SIMSIMD_TARGET_SVE_F16
simsimd_l2sq_f16_sve(a, b, n, d);
#elif SIMSIMD_TARGET_NEON_F16
simsimd_l2sq_f16_neon(a, b, n, d);
#elif SIMSIMD_TARGET_SAPPHIRE
simsimd_l2sq_f16_sapphire(a, b, n, d);
#elif SIMSIMD_TARGET_HASWELL
simsimd_l2sq_f16_haswell(a, b, n, d);
#else
simsimd_l2sq_f16_serial(a, b, n, d);
#endif
}
SIMSIMD_PUBLIC void simsimd_l2sq_bf16(simsimd_bf16_t const *a, simsimd_bf16_t const *b, simsimd_size_t n,
simsimd_distance_t *d) {
#if SIMSIMD_TARGET_GENOA
simsimd_l2sq_bf16_genoa(a, b, n, d);
#elif SIMSIMD_TARGET_HASWELL
simsimd_l2sq_bf16_haswell(a, b, n, d);
#elif SIMSIMD_TARGET_SVE_BF16
simsimd_l2sq_bf16_sve(a, b, n, d);
#elif SIMSIMD_TARGET_NEON_BF16
simsimd_l2sq_bf16_neon(a, b, n, d);
#else
simsimd_l2sq_bf16_serial(a, b, n, d);
#endif
}
SIMSIMD_PUBLIC void simsimd_l2sq_f32(simsimd_f32_t const *a, simsimd_f32_t const *b, simsimd_size_t n,
simsimd_distance_t *d) {
#if SIMSIMD_TARGET_SVE
simsimd_l2sq_f32_sve(a, b, n, d);
#elif SIMSIMD_TARGET_NEON
simsimd_l2sq_f32_neon(a, b, n, d);
#elif SIMSIMD_TARGET_SKYLAKE
simsimd_l2sq_f32_skylake(a, b, n, d);
#elif SIMSIMD_TARGET_HASWELL
simsimd_l2sq_f32_haswell(a, b, n, d);
#else
simsimd_l2sq_f32_serial(a, b, n, d);
#endif
}
SIMSIMD_PUBLIC void simsimd_l2sq_f64(simsimd_f64_t const *a, simsimd_f64_t const *b, simsimd_size_t n,
simsimd_distance_t *d) {
#if SIMSIMD_TARGET_SVE
simsimd_l2sq_f64_sve(a, b, n, d);
#elif SIMSIMD_TARGET_NEON
simsimd_l2sq_f64_neon(a, b, n, d);
#elif SIMSIMD_TARGET_SKYLAKE
simsimd_l2sq_f64_skylake(a, b, n, d);
#else
simsimd_l2sq_f64_serial(a, b, n, d);
#endif
}
SIMSIMD_PUBLIC void simsimd_l2_i8(simsimd_i8_t const *a, simsimd_i8_t const *b, simsimd_size_t n,
simsimd_distance_t *d) {
#if SIMSIMD_TARGET_NEON_I8
simsimd_l2_i8_neon(a, b, n, d);
#elif SIMSIMD_TARGET_ICE
simsimd_l2_i8_ice(a, b, n, d);
#elif SIMSIMD_TARGET_HASWELL
simsimd_l2_i8_haswell(a, b, n, d);
#else
simsimd_l2_i8_serial(a, b, n, d);
#endif
}
SIMSIMD_PUBLIC void simsimd_l2_u8(simsimd_u8_t const *a, simsimd_u8_t const *b, simsimd_size_t n,
simsimd_distance_t *d) {
#if SIMSIMD_TARGET_NEON_I8
simsimd_l2_u8_neon(a, b, n, d);
#elif SIMSIMD_TARGET_ICE
simsimd_l2_u8_ice(a, b, n, d);
#elif SIMSIMD_TARGET_HASWELL
simsimd_l2_u8_haswell(a, b, n, d);
#else
simsimd_l2_u8_serial(a, b, n, d);
#endif
}
SIMSIMD_PUBLIC void simsimd_l2_f16(simsimd_f16_t const *a, simsimd_f16_t const *b, simsimd_size_t n,
simsimd_distance_t *d) {
#if SIMSIMD_TARGET_SVE_F16
simsimd_l2_f16_sve(a, b, n, d);
#elif SIMSIMD_TARGET_NEON_F16
simsimd_l2_f16_neon(a, b, n, d);
#elif SIMSIMD_TARGET_SAPPHIRE
simsimd_l2_f16_sapphire(a, b, n, d);
#elif SIMSIMD_TARGET_HASWELL
simsimd_l2_f16_haswell(a, b, n, d);
#else
simsimd_l2_f16_serial(a, b, n, d);
#endif
}
SIMSIMD_PUBLIC void simsimd_l2_bf16(simsimd_bf16_t const *a, simsimd_bf16_t const *b, simsimd_size_t n,
simsimd_distance_t *d) {
#if SIMSIMD_TARGET_GENOA
simsimd_l2_bf16_genoa(a, b, n, d);
#elif SIMSIMD_TARGET_HASWELL
simsimd_l2_bf16_haswell(a, b, n, d);
#elif SIMSIMD_TARGET_SVE_BF16
simsimd_l2_bf16_sve(a, b, n, d);
#elif SIMSIMD_TARGET_NEON_BF16
simsimd_l2_bf16_neon(a, b, n, d);
#else
simsimd_l2_bf16_serial(a, b, n, d);
#endif
}
SIMSIMD_PUBLIC void simsimd_l2_f32(simsimd_f32_t const *a, simsimd_f32_t const *b, simsimd_size_t n,
simsimd_distance_t *d) {
#if SIMSIMD_TARGET_SVE
simsimd_l2_f32_sve(a, b, n, d);
#elif SIMSIMD_TARGET_NEON
simsimd_l2_f32_neon(a, b, n, d);
#elif SIMSIMD_TARGET_SKYLAKE
simsimd_l2_f32_skylake(a, b, n, d);
#elif SIMSIMD_TARGET_HASWELL
simsimd_l2_f32_haswell(a, b, n, d);
#else
simsimd_l2_f32_serial(a, b, n, d);
#endif
}
SIMSIMD_PUBLIC void simsimd_l2_f64(simsimd_f64_t const *a, simsimd_f64_t const *b, simsimd_size_t n,
simsimd_distance_t *d) {
#if SIMSIMD_TARGET_SVE
simsimd_l2_f64_sve(a, b, n, d);
#elif SIMSIMD_TARGET_NEON
simsimd_l2_f64_neon(a, b, n, d);
#elif SIMSIMD_TARGET_SKYLAKE
simsimd_l2_f64_skylake(a, b, n, d);
#else
simsimd_l2_f64_serial(a, b, n, d);
#endif
}
/* Binary distances
* - Hamming distance: the number of positions at which the corresponding bits are different.
* - Jaccard distance: one minus the ratio of shared set bits (intersection) to the total set bits (union).
*
* @param a The first binary vector.
* @param b The second binary vector.
* @param n The number of 8-bit words in the vectors.
* @param d The output distance value.
*
* @note The output distance value is non-negative.
* @note The output distance value is zero if and only if the two vectors are identical.
* @note Defined only for binary data.
*/
SIMSIMD_PUBLIC void simsimd_hamming_b8(simsimd_b8_t const *a, simsimd_b8_t const *b, simsimd_size_t n,
simsimd_distance_t *d) {
#if SIMSIMD_TARGET_SVE
simsimd_hamming_b8_sve(a, b, n, d);
#elif SIMSIMD_TARGET_NEON
simsimd_hamming_b8_neon(a, b, n, d);
#elif SIMSIMD_TARGET_ICE
simsimd_hamming_b8_ice(a, b, n, d);
#elif SIMSIMD_TARGET_HASWELL
simsimd_hamming_b8_haswell(a, b, n, d);
#else
simsimd_hamming_b8_serial(a, b, n, d);
#endif
}
SIMSIMD_PUBLIC void simsimd_jaccard_b8(simsimd_b8_t const *a, simsimd_b8_t const *b, simsimd_size_t n,
simsimd_distance_t *d) {
#if SIMSIMD_TARGET_SVE
simsimd_jaccard_b8_sve(a, b, n, d);
#elif SIMSIMD_TARGET_NEON
simsimd_jaccard_b8_neon(a, b, n, d);
#elif SIMSIMD_TARGET_ICE
simsimd_jaccard_b8_ice(a, b, n, d);
#elif SIMSIMD_TARGET_HASWELL
simsimd_jaccard_b8_haswell(a, b, n, d);
#else
simsimd_jaccard_b8_serial(a, b, n, d);
#endif
}
/* Probability distributions
* - Jensen-Shannon divergence: a symmetric measure of the difference between two probability distributions.
* - Kullback-Leibler divergence: a measure of how one probability distribution diverges from a second.
*
* @param a The first discrete probability distribution.
* @param b The second discrete probability distribution.
* @param n The number of elements in the discrete distributions.
* @param d The output divergence value.
*
* @note The distributions are assumed to be normalized.
* @note The output divergence value is non-negative.
* @note The output divergence value is zero if and only if the two distributions are identical.
* @note Defined only for floating-point data types.
*/
SIMSIMD_PUBLIC void simsimd_kl_f16(simsimd_f16_t const *a, simsimd_f16_t const *b, simsimd_size_t n,
simsimd_distance_t *d) {
#if SIMSIMD_TARGET_NEON_F16
simsimd_kl_f16_neon(a, b, n, d);
#elif SIMSIMD_TARGET_HASWELL
simsimd_kl_f16_haswell(a, b, n, d);
#else
simsimd_kl_f16_serial(a, b, n, d);
#endif
}
SIMSIMD_PUBLIC void simsimd_kl_bf16(simsimd_bf16_t const *a, simsimd_bf16_t const *b, simsimd_size_t n,
simsimd_distance_t *d) {
simsimd_kl_bf16_serial(a, b, n, d);
}
SIMSIMD_PUBLIC void simsimd_kl_f32(simsimd_f32_t const *a, simsimd_f32_t const *b, simsimd_size_t n,
simsimd_distance_t *d) {
#if SIMSIMD_TARGET_NEON
simsimd_kl_f32_neon(a, b, n, d);
#elif SIMSIMD_TARGET_SKYLAKE
simsimd_kl_f32_skylake(a, b, n, d);
#else
simsimd_kl_f32_serial(a, b, n, d);
#endif
}
SIMSIMD_PUBLIC void simsimd_kl_f64(simsimd_f64_t const *a, simsimd_f64_t const *b, simsimd_size_t n,
simsimd_distance_t *d) {
simsimd_kl_f64_serial(a, b, n, d);
}
SIMSIMD_PUBLIC void simsimd_js_f16(simsimd_f16_t const *a, simsimd_f16_t const *b, simsimd_size_t n,
simsimd_distance_t *d) {
#if SIMSIMD_TARGET_NEON_F16
simsimd_js_f16_neon(a, b, n, d);
#elif SIMSIMD_TARGET_HASWELL
simsimd_js_f16_haswell(a, b, n, d);
#else
simsimd_js_f16_serial(a, b, n, d);
#endif
}
SIMSIMD_PUBLIC void simsimd_js_bf16(simsimd_bf16_t const *a, simsimd_bf16_t const *b, simsimd_size_t n,
simsimd_distance_t *d) {
simsimd_js_bf16_serial(a, b, n, d);
}
SIMSIMD_PUBLIC void simsimd_js_f32(simsimd_f32_t const *a, simsimd_f32_t const *b, simsimd_size_t n,
simsimd_distance_t *d) {
#if SIMSIMD_TARGET_NEON
simsimd_js_f32_neon(a, b, n, d);
#elif SIMSIMD_TARGET_SKYLAKE
simsimd_js_f32_skylake(a, b, n, d);
#else
simsimd_js_f32_serial(a, b, n, d);
#endif
}
SIMSIMD_PUBLIC void simsimd_js_f64(simsimd_f64_t const *a, simsimd_f64_t const *b, simsimd_size_t n,
simsimd_distance_t *d) {
simsimd_js_f64_serial(a, b, n, d);
}
/* Set operations
*
* @param a The first sorted array of integers.
* @param b The second sorted array of integers.
* @param a_length The number of elements in the first array.
* @param b_length The number of elements in the second array.
* @param d The output for the number of elements in the intersection.
*/
SIMSIMD_PUBLIC void simsimd_intersect_u16(simsimd_u16_t const *a, simsimd_u16_t const *b, simsimd_size_t a_length,
simsimd_size_t b_length, simsimd_distance_t *d) {
#if SIMSIMD_TARGET_SVE2
simsimd_intersect_u16_sve2(a, b, a_length, b_length, d);
#elif SIMSIMD_TARGET_NEON
simsimd_intersect_u16_neon(a, b, a_length, b_length, d);
#elif SIMSIMD_TARGET_TURIN
simsimd_intersect_u16_turin(a, b, a_length, b_length, d);
#elif SIMSIMD_TARGET_ICE
simsimd_intersect_u16_ice(a, b, a_length, b_length, d);
#else
simsimd_intersect_u16_serial(a, b, a_length, b_length, d);
#endif
}
SIMSIMD_PUBLIC void simsimd_intersect_u32(simsimd_u32_t const *a, simsimd_u32_t const *b, simsimd_size_t a_length,
simsimd_size_t b_length, simsimd_distance_t *d) {
#if SIMSIMD_TARGET_SVE2
simsimd_intersect_u32_sve2(a, b, a_length, b_length, d);
#elif SIMSIMD_TARGET_NEON
simsimd_intersect_u32_neon(a, b, a_length, b_length, d);
#elif SIMSIMD_TARGET_TURIN
simsimd_intersect_u32_turin(a, b, a_length, b_length, d);
#elif SIMSIMD_TARGET_ICE
simsimd_intersect_u32_ice(a, b, a_length, b_length, d);
#else
simsimd_intersect_u32_serial(a, b, a_length, b_length, d);
#endif
}
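/* Usage sketch (illustrative, not part of the library): counting matches between two sorted
 * arrays of `u16` identifiers. The identifiers are assumptions made for the example.
 *
 *     simsimd_u16_t a[4] = {1, 3, 5, 7}, b[3] = {3, 4, 5};
 *     simsimd_distance_t matches;
 *     simsimd_intersect_u16(a, b, 4, 3, &matches); // shared ids {3, 5} -> 2
 */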
/* Weighted set operations
*
* @param a The first sorted array of integers.
* @param b The second sorted array of integers.
* @param a_weights The weights for the first array.
* @param b_weights The weights for the second array.
* @param a_length The number of elements in the first array.
* @param b_length The number of elements in the second array.
* @param d The output for the weighted dot product accumulated over the intersection.
*/
SIMSIMD_PUBLIC void simsimd_spdot_counts_u16(simsimd_u16_t const *a, simsimd_u16_t const *b,
simsimd_i16_t const *a_weights, simsimd_i16_t const *b_weights,
simsimd_size_t a_length, simsimd_size_t b_length, simsimd_distance_t *d) {
#if SIMSIMD_TARGET_SVE2
simsimd_spdot_counts_u16_sve2(a, b, a_weights, b_weights, a_length, b_length, d);
#elif SIMSIMD_TARGET_TURIN
simsimd_spdot_counts_u16_turin(a, b, a_weights, b_weights, a_length, b_length, d);
#else
simsimd_spdot_counts_u16_serial(a, b, a_weights, b_weights, a_length, b_length, d);
#endif
}
SIMSIMD_PUBLIC void simsimd_spdot_weights_u16(simsimd_u16_t const *a, simsimd_u16_t const *b,
simsimd_bf16_t const *a_weights, simsimd_bf16_t const *b_weights,
simsimd_size_t a_length, simsimd_size_t b_length, simsimd_distance_t *d) {
#if SIMSIMD_TARGET_SVE2
simsimd_spdot_weights_u16_sve2(a, b, a_weights, b_weights, a_length, b_length, d);
#elif SIMSIMD_TARGET_TURIN
simsimd_spdot_weights_u16_turin(a, b, a_weights, b_weights, a_length, b_length, d);
#else
simsimd_spdot_weights_u16_serial(a, b, a_weights, b_weights, a_length, b_length, d);
#endif
}
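/* Usage sketch (illustrative, not part of the library): a sparse dot product over two sorted
 * `u16` id arrays with `i16` weights. The ids, the weights, and the expectation that products of
 * weights are accumulated only over matching ids are assumptions made for the example.
 *
 *     simsimd_u16_t ids_a[3] = {2, 4, 6}, ids_b[2] = {4, 6};
 *     simsimd_i16_t weights_a[3] = {1, 2, 3}, weights_b[2] = {5, 7};
 *     simsimd_distance_t product;
 *     simsimd_spdot_counts_u16(ids_a, ids_b, weights_a, weights_b, 3, 2, &product); // expected 2*5 + 3*7 = 31
 */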
/* Curved space distances
*
* @param a The first vector of floating point values.
* @param b The second vector of floating point values.
* @param c The metric tensor or covariance matrix.
* @param n The number of dimensions in the vectors.
* @param d The output distance or similarity value.
*/
SIMSIMD_PUBLIC void simsimd_bilinear_f64(simsimd_f64_t const *a, simsimd_f64_t const *b, simsimd_f64_t const *c,
simsimd_size_t n, simsimd_distance_t *d) {
#if SIMSIMD_TARGET_SKYLAKE
simsimd_bilinear_f64_skylake(a, b, c, n, d);
#else
simsimd_bilinear_f64_serial(a, b, c, n, d);
#endif
}
SIMSIMD_PUBLIC void simsimd_bilinear_f32(simsimd_f32_t const *a, simsimd_f32_t const *b, simsimd_f32_t const *c,
simsimd_size_t n, simsimd_distance_t *d) {
#if SIMSIMD_TARGET_SKYLAKE
simsimd_bilinear_f32_skylake(a, b, c, n, d);
#elif SIMSIMD_TARGET_NEON
simsimd_bilinear_f32_neon(a, b, c, n, d);
#else
simsimd_bilinear_f32_serial(a, b, c, n, d);
#endif
}
SIMSIMD_PUBLIC void simsimd_bilinear_f16(simsimd_f16_t const *a, simsimd_f16_t const *b, simsimd_f16_t const *c,
simsimd_size_t n, simsimd_distance_t *d) {
#if SIMSIMD_TARGET_SAPPHIRE
simsimd_bilinear_f16_sapphire(a, b, c, n, d);
#elif SIMSIMD_TARGET_HASWELL
simsimd_bilinear_f16_haswell(a, b, c, n, d);
#elif SIMSIMD_TARGET_NEON_F16
simsimd_bilinear_f16_neon(a, b, c, n, d);
#else
simsimd_bilinear_f16_serial(a, b, c, n, d);
#endif
}
SIMSIMD_PUBLIC void simsimd_bilinear_bf16(simsimd_bf16_t const *a, simsimd_bf16_t const *b, simsimd_bf16_t const *c,
simsimd_size_t n, simsimd_distance_t *d) {
#if SIMSIMD_TARGET_GENOA
simsimd_bilinear_bf16_genoa(a, b, c, n, d);
#elif SIMSIMD_TARGET_HASWELL
simsimd_bilinear_bf16_haswell(a, b, c, n, d);
#elif SIMSIMD_TARGET_NEON_BF16
simsimd_bilinear_bf16_neon(a, b, c, n, d);
#else
simsimd_bilinear_bf16_serial(a, b, c, n, d);
#endif
}
SIMSIMD_PUBLIC void simsimd_bilinear_f64c(simsimd_f64c_t const *a, simsimd_f64c_t const *b, simsimd_f64c_t const *c,
simsimd_size_t n, simsimd_distance_t *d) {
#if SIMSIMD_TARGET_SKYLAKE
simsimd_bilinear_f64c_skylake(a, b, c, n, d);
#else
simsimd_bilinear_f64c_serial(a, b, c, n, d);
#endif
}
SIMSIMD_PUBLIC void simsimd_bilinear_f32c(simsimd_f32c_t const *a, simsimd_f32c_t const *b, simsimd_f32c_t const *c,
simsimd_size_t n, simsimd_distance_t *d) {
#if SIMSIMD_TARGET_SKYLAKE
simsimd_bilinear_f32c_skylake(a, b, c, n, d);
#elif SIMSIMD_TARGET_NEON
simsimd_bilinear_f32c_neon(a, b, c, n, d);
#else
simsimd_bilinear_f32c_serial(a, b, c, n, d);
#endif
}
SIMSIMD_PUBLIC void simsimd_bilinear_f16c(simsimd_f16c_t const *a, simsimd_f16c_t const *b, simsimd_f16c_t const *c,
simsimd_size_t n, simsimd_distance_t *d) {
#if SIMSIMD_TARGET_SAPPHIRE
simsimd_bilinear_f16c_sapphire(a, b, c, n, d);
#elif SIMSIMD_TARGET_NEON_F16
simsimd_bilinear_f16c_neon(a, b, c, n, d);
#else
simsimd_bilinear_f16c_serial(a, b, c, n, d);
#endif
}
SIMSIMD_PUBLIC void simsimd_bilinear_bf16c(simsimd_bf16c_t const *a, simsimd_bf16c_t const *b, simsimd_bf16c_t const *c,
simsimd_size_t n, simsimd_distance_t *d) {
#if SIMSIMD_TARGET_GENOA
simsimd_bilinear_bf16c_genoa(a, b, c, n, d);
#elif SIMSIMD_TARGET_NEON_BF16
simsimd_bilinear_bf16c_neon(a, b, c, n, d);
#else
simsimd_bilinear_bf16c_serial(a, b, c, n, d);
#endif
}
SIMSIMD_PUBLIC void simsimd_mahalanobis_f64(simsimd_f64_t const *a, simsimd_f64_t const *b, simsimd_f64_t const *c,
simsimd_size_t n, simsimd_distance_t *d) {
#if SIMSIMD_TARGET_SKYLAKE
simsimd_mahalanobis_f64_skylake(a, b, c, n, d);
#else
simsimd_mahalanobis_f64_serial(a, b, c, n, d);
#endif
}
SIMSIMD_PUBLIC void simsimd_mahalanobis_f32(simsimd_f32_t const *a, simsimd_f32_t const *b, simsimd_f32_t const *c,
simsimd_size_t n, simsimd_distance_t *d) {
#if SIMSIMD_TARGET_SKYLAKE
simsimd_mahalanobis_f32_skylake(a, b, c, n, d);
#elif SIMSIMD_TARGET_NEON
simsimd_mahalanobis_f32_neon(a, b, c, n, d);
#else
simsimd_mahalanobis_f32_serial(a, b, c, n, d);
#endif
}
SIMSIMD_PUBLIC void simsimd_mahalanobis_f16(simsimd_f16_t const *a, simsimd_f16_t const *b, simsimd_f16_t const *c,
simsimd_size_t n, simsimd_distance_t *d) {
#if SIMSIMD_TARGET_SAPPHIRE
simsimd_mahalanobis_f16_sapphire(a, b, c, n, d);
#elif SIMSIMD_TARGET_HASWELL
simsimd_mahalanobis_f16_haswell(a, b, c, n, d);
#elif SIMSIMD_TARGET_NEON_F16
simsimd_mahalanobis_f16_neon(a, b, c, n, d);
#else
simsimd_mahalanobis_f16_serial(a, b, c, n, d);
#endif
}
SIMSIMD_PUBLIC void simsimd_mahalanobis_bf16(simsimd_bf16_t const *a, simsimd_bf16_t const *b, simsimd_bf16_t const *c,
simsimd_size_t n, simsimd_distance_t *d) {
#if SIMSIMD_TARGET_GENOA
simsimd_mahalanobis_bf16_genoa(a, b, c, n, d);
#elif SIMSIMD_TARGET_HASWELL
simsimd_mahalanobis_bf16_haswell(a, b, c, n, d);
#elif SIMSIMD_TARGET_NEON_BF16
simsimd_mahalanobis_bf16_neon(a, b, c, n, d);
#else
simsimd_mahalanobis_bf16_serial(a, b, c, n, d);
#endif
}
/* Elementwise operations
*
* @param a The first vector of integral or floating point values.
* @param b The second vector of integral or floating point values.
* @param c The third vector of integral or floating point values.
* @param n The number of dimensions in the vectors.
* @param alpha The first scaling factor.
 * @param beta The second scaling factor.
 * @param r The output vector of integral or floating point values.
*/
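/* A minimal sketch of the intended semantics, assuming (based on the names, not a normative
 * definition) that `wsum` computes r[i] = alpha * a[i] + beta * b[i] and `fma` computes
 * r[i] = alpha * a[i] * b[i] + beta * c[i]:
 *
 *     simsimd_f32_t a[3] = {1, 2, 3}, b[3] = {4, 5, 6}, c[3] = {1, 1, 1}, r[3];
 *     simsimd_wsum_f32(a, b, 3, 2.0, 0.5, r);   // r = {2*1 + 0.5*4, ...} = {4.0, 6.5, 9.0}
 *     simsimd_fma_f32(a, b, c, 3, 1.0, 2.0, r); // r = {1*4 + 2*1, ...} = {6, 12, 20}
 */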
SIMSIMD_PUBLIC void simsimd_wsum_f64(simsimd_f64_t const *a, simsimd_f64_t const *b, simsimd_size_t n,
simsimd_distance_t alpha, simsimd_distance_t beta, simsimd_f64_t *r) {
#if SIMSIMD_TARGET_SKYLAKE
simsimd_wsum_f64_skylake(a, b, n, alpha, beta, r);
#elif SIMSIMD_TARGET_HASWELL
simsimd_wsum_f64_haswell(a, b, n, alpha, beta, r);
#else
simsimd_wsum_f64_serial(a, b, n, alpha, beta, r);
#endif
}
SIMSIMD_PUBLIC void simsimd_wsum_f32(simsimd_f32_t const *a, simsimd_f32_t const *b, simsimd_size_t n,
simsimd_distance_t alpha, simsimd_distance_t beta, simsimd_f32_t *r) {
#if SIMSIMD_TARGET_SKYLAKE
simsimd_wsum_f32_skylake(a, b, n, alpha, beta, r);
#elif SIMSIMD_TARGET_HASWELL
simsimd_wsum_f32_haswell(a, b, n, alpha, beta, r);
#elif SIMSIMD_TARGET_NEON
simsimd_wsum_f32_neon(a, b, n, alpha, beta, r);
#else
simsimd_wsum_f32_serial(a, b, n, alpha, beta, r);
#endif
}
SIMSIMD_PUBLIC void simsimd_wsum_bf16(simsimd_bf16_t const *a, simsimd_bf16_t const *b, simsimd_size_t n,
simsimd_distance_t alpha, simsimd_distance_t beta, simsimd_bf16_t *r) {
#if SIMSIMD_TARGET_SKYLAKE
simsimd_wsum_bf16_skylake(a, b, n, alpha, beta, r);
#elif SIMSIMD_TARGET_HASWELL
simsimd_wsum_bf16_haswell(a, b, n, alpha, beta, r);
#elif SIMSIMD_TARGET_NEON_BF16
simsimd_wsum_bf16_neon(a, b, n, alpha, beta, r);
#else
simsimd_wsum_bf16_serial(a, b, n, alpha, beta, r);
#endif
}
SIMSIMD_PUBLIC void simsimd_wsum_f16(simsimd_f16_t const *a, simsimd_f16_t const *b, simsimd_size_t n,
simsimd_distance_t alpha, simsimd_distance_t beta, simsimd_f16_t *r) {
#if SIMSIMD_TARGET_SAPPHIRE
simsimd_wsum_f16_sapphire(a, b, n, alpha, beta, r);
#elif SIMSIMD_TARGET_HASWELL
simsimd_wsum_f16_haswell(a, b, n, alpha, beta, r);
#elif SIMSIMD_TARGET_NEON_F16
simsimd_wsum_f16_neon(a, b, n, alpha, beta, r);
#else
simsimd_wsum_f16_serial(a, b, n, alpha, beta, r);
#endif
}
SIMSIMD_PUBLIC void simsimd_wsum_i8(simsimd_i8_t const *a, simsimd_i8_t const *b, simsimd_size_t n,
simsimd_distance_t alpha, simsimd_distance_t beta, simsimd_i8_t *r) {
#if SIMSIMD_TARGET_SAPPHIRE
simsimd_wsum_i8_sapphire(a, b, n, alpha, beta, r);
#elif SIMSIMD_TARGET_HASWELL
simsimd_wsum_i8_haswell(a, b, n, alpha, beta, r);
#elif SIMSIMD_TARGET_NEON_F16
simsimd_wsum_i8_neon(a, b, n, alpha, beta, r);
#else
simsimd_wsum_i8_serial(a, b, n, alpha, beta, r);
#endif
}
SIMSIMD_PUBLIC void simsimd_wsum_u8(simsimd_u8_t const *a, simsimd_u8_t const *b, simsimd_size_t n,
simsimd_distance_t alpha, simsimd_distance_t beta, simsimd_u8_t *r) {
#if SIMSIMD_TARGET_SAPPHIRE
simsimd_wsum_u8_sapphire(a, b, n, alpha, beta, r);
#elif SIMSIMD_TARGET_HASWELL
simsimd_wsum_u8_haswell(a, b, n, alpha, beta, r);
#elif SIMSIMD_TARGET_NEON_F16
simsimd_wsum_u8_neon(a, b, n, alpha, beta, r);
#else
simsimd_wsum_u8_serial(a, b, n, alpha, beta, r);
#endif
}
SIMSIMD_PUBLIC void simsimd_fma_f64(simsimd_f64_t const *a, simsimd_f64_t const *b, simsimd_f64_t const *c,
simsimd_size_t n, simsimd_distance_t alpha, simsimd_distance_t beta,
simsimd_f64_t *r) {
#if SIMSIMD_TARGET_SKYLAKE
simsimd_fma_f64_skylake(a, b, c, n, alpha, beta, r);
#elif SIMSIMD_TARGET_HASWELL
simsimd_fma_f64_haswell(a, b, c, n, alpha, beta, r);
#else
simsimd_fma_f64_serial(a, b, c, n, alpha, beta, r);
#endif
}
SIMSIMD_PUBLIC void simsimd_fma_f32(simsimd_f32_t const *a, simsimd_f32_t const *b, simsimd_f32_t const *c,
simsimd_size_t n, simsimd_distance_t alpha, simsimd_distance_t beta,
simsimd_f32_t *r) {
#if SIMSIMD_TARGET_SKYLAKE
simsimd_fma_f32_skylake(a, b, c, n, alpha, beta, r);
#elif SIMSIMD_TARGET_HASWELL
simsimd_fma_f32_haswell(a, b, c, n, alpha, beta, r);
#elif SIMSIMD_TARGET_NEON
simsimd_fma_f32_neon(a, b, c, n, alpha, beta, r);
#else
simsimd_fma_f32_serial(a, b, c, n, alpha, beta, r);
#endif
}
SIMSIMD_PUBLIC void simsimd_fma_bf16(simsimd_bf16_t const *a, simsimd_bf16_t const *b, simsimd_bf16_t const *c,
simsimd_size_t n, simsimd_distance_t alpha, simsimd_distance_t beta,
simsimd_bf16_t *r) {
#if SIMSIMD_TARGET_SKYLAKE
simsimd_fma_bf16_skylake(a, b, c, n, alpha, beta, r);
#elif SIMSIMD_TARGET_HASWELL
simsimd_fma_bf16_haswell(a, b, c, n, alpha, beta, r);
#elif SIMSIMD_TARGET_NEON_BF16
simsimd_fma_bf16_neon(a, b, c, n, alpha, beta, r);
#else
simsimd_fma_bf16_serial(a, b, c, n, alpha, beta, r);
#endif
}
SIMSIMD_PUBLIC void simsimd_fma_f16(simsimd_f16_t const *a, simsimd_f16_t const *b, simsimd_f16_t const *c,
simsimd_size_t n, simsimd_distance_t alpha, simsimd_distance_t beta,
simsimd_f16_t *r) {
#if SIMSIMD_TARGET_SAPPHIRE
simsimd_fma_f16_sapphire(a, b, c, n, alpha, beta, r);
#elif SIMSIMD_TARGET_HASWELL
simsimd_fma_f16_haswell(a, b, c, n, alpha, beta, r);
#elif SIMSIMD_TARGET_NEON_F16
simsimd_fma_f16_neon(a, b, c, n, alpha, beta, r);
#else
simsimd_fma_f16_serial(a, b, c, n, alpha, beta, r);
#endif
}
SIMSIMD_PUBLIC void simsimd_fma_i8(simsimd_i8_t const *a, simsimd_i8_t const *b, simsimd_i8_t const *c,
simsimd_size_t n, simsimd_distance_t alpha, simsimd_distance_t beta,
simsimd_i8_t *r) {
#if SIMSIMD_TARGET_SAPPHIRE
simsimd_fma_i8_sapphire(a, b, c, n, alpha, beta, r);
#elif SIMSIMD_TARGET_HASWELL
simsimd_fma_i8_haswell(a, b, c, n, alpha, beta, r);
#elif SIMSIMD_TARGET_NEON_F16
simsimd_fma_i8_neon(a, b, c, n, alpha, beta, r);
#else
simsimd_fma_i8_serial(a, b, c, n, alpha, beta, r);
#endif
}
SIMSIMD_PUBLIC void simsimd_fma_u8(simsimd_u8_t const *a, simsimd_u8_t const *b, simsimd_u8_t const *c,
simsimd_size_t n, simsimd_distance_t alpha, simsimd_distance_t beta,
simsimd_u8_t *r) {
#if SIMSIMD_TARGET_SAPPHIRE
simsimd_fma_u8_sapphire(a, b, c, n, alpha, beta, r);
#elif SIMSIMD_TARGET_HASWELL
simsimd_fma_u8_haswell(a, b, c, n, alpha, beta, r);
#elif SIMSIMD_TARGET_NEON_F16
simsimd_fma_u8_neon(a, b, c, n, alpha, beta, r);
#else
simsimd_fma_u8_serial(a, b, c, n, alpha, beta, r);
#endif
}
#endif
#ifdef __cplusplus
}
#endif
#endif // SIMSIMD_H
simsimd-6.5.12/include/simsimd/sparse.h
/**
* @file sparse.h
* @brief SIMD-accelerated functions for Sparse Vectors.
* @author Ash Vardanian
* @date March 21, 2024
*
* Contains:
* - Set Intersection ~ Jaccard Distance
* - Sparse Dot Products, outputting the count and weighted product
*
* For datatypes:
* - u16: for vocabularies under 64 thousand tokens
* - u32: for vocabularies under 4 billion tokens
 * - u16 indices + i16 weights: for weighted word counts
 * - u16 indices + bf16 weights: for sparse matrices
*
* For hardware architectures:
* - x86: Ice Lake, Turin
* - Arm: SVE2
*
* Interestingly, to implement sparse distances and products, the most important function
 * is analogous to `std::set_intersection`, which outputs the intersection of two sorted
* sequences. The naive implementation of that function would look like:
*
* std::size_t intersection_size = 0;
* while (i != a_length && j != b_length) {
* scalar_t ai = a[i], bj = b[j];
* intersection_size += ai == bj;
* i += ai < bj;
* j += ai >= bj;
* }
*
 * Assuming we are dealing with sparse arrays, most of the time is spent evaluating
 * branches and skipping entries. So what if we could skip multiple entries at a time,
 * galloping ahead to the next chunk where an intersection is possible? For weighted arrays:
*
* double product = 0;
* while (i != a_length && j != b_length) {
* scalar_t ai = a[i], bj = b[j];
* product += ai == bj ? a_weights[i] * b_weights[j] : 0;
* i += ai < bj;
* j += ai >= bj;
* }
*
* x86 intrinsics: https://www.intel.com/content/www/us/en/docs/intrinsics-guide/
* Arm intrinsics: https://developer.arm.com/architectures/instruction-sets/intrinsics/
*/
#ifndef SIMSIMD_SPARSE_H
#define SIMSIMD_SPARSE_H
#include "types.h"
#ifdef __cplusplus
extern "C" {
#endif
/* Implements the serial set intersection algorithm, similar to `std::set_intersection` in the C++ STL,
 * but uses galloping logic if the arrays differ significantly in size.
 */
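/* A minimal usage sketch, assuming both input arrays are sorted in ascending order, as the
 * intersection logic in this header expects:
 *
 *     simsimd_u16_t a[] = {1, 3, 5, 7};
 *     simsimd_u16_t b[] = {3, 4, 5, 8};
 *     simsimd_distance_t result;
 *     simsimd_intersect_u16_serial(a, b, 4, 4, &result); // result == 2, for the shared {3, 5}
 *
 * The `spdot` variants additionally take per-element weights and export two values:
 * `results[0]` receives the intersection size and `results[1]` the weighted dot product.
 */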
SIMSIMD_PUBLIC void simsimd_intersect_u16_serial( //
simsimd_u16_t const *a, simsimd_u16_t const *b, //
simsimd_size_t a_length, simsimd_size_t b_length, //
simsimd_distance_t *results);
SIMSIMD_PUBLIC void simsimd_intersect_u32_serial( //
simsimd_u32_t const *a, simsimd_u32_t const *b, //
simsimd_size_t a_length, simsimd_size_t b_length, //
simsimd_distance_t *results);
SIMSIMD_PUBLIC void simsimd_spdot_counts_u16_serial( //
simsimd_u16_t const *a, simsimd_u16_t const *b, //
simsimd_i16_t const *a_weights, simsimd_i16_t const *b_weights, //
simsimd_size_t a_length, simsimd_size_t b_length, //
simsimd_distance_t *results);
SIMSIMD_PUBLIC void simsimd_spdot_weights_u16_serial( //
simsimd_u16_t const *a, simsimd_u16_t const *b, //
simsimd_bf16_t const *a_weights, simsimd_bf16_t const *b_weights, //
simsimd_size_t a_length, simsimd_size_t b_length, //
simsimd_distance_t *results);
/* Implements the most naive set intersection algorithm, similar to `std::set_intersection` in the C++ STL,
 * linearly enumerating the elements of both arrays.
 */
SIMSIMD_PUBLIC void simsimd_intersect_u16_accurate( //
simsimd_u16_t const *a, simsimd_u16_t const *b, //
simsimd_size_t a_length, simsimd_size_t b_length, //
simsimd_distance_t *results);
SIMSIMD_PUBLIC void simsimd_intersect_u32_accurate( //
simsimd_u32_t const *a, simsimd_u32_t const *b, //
simsimd_size_t a_length, simsimd_size_t b_length, //
simsimd_distance_t *results);
SIMSIMD_PUBLIC void simsimd_spdot_counts_u16_accurate( //
simsimd_u16_t const *a, simsimd_u16_t const *b, //
simsimd_i16_t const *a_weights, simsimd_i16_t const *b_weights, //
simsimd_size_t a_length, simsimd_size_t b_length, //
simsimd_distance_t *results);
SIMSIMD_PUBLIC void simsimd_spdot_weights_u16_accurate( //
simsimd_u16_t const *a, simsimd_u16_t const *b, //
simsimd_bf16_t const *a_weights, simsimd_bf16_t const *b_weights, //
simsimd_size_t a_length, simsimd_size_t b_length, //
simsimd_distance_t *results);
/* SIMD-powered backends for Arm SVE, mostly using 32-bit arithmetic over variable-length platform-defined word sizes.
* Designed for Arm Graviton 3, Microsoft Cobalt, as well as Nvidia Grace and newer Ampere Altra CPUs.
*/
SIMSIMD_PUBLIC void simsimd_intersect_u16_sve2( //
simsimd_u16_t const *a, simsimd_u16_t const *b, //
simsimd_size_t a_length, simsimd_size_t b_length, //
simsimd_distance_t *results);
SIMSIMD_PUBLIC void simsimd_intersect_u32_sve2( //
simsimd_u32_t const *a, simsimd_u32_t const *b, //
simsimd_size_t a_length, simsimd_size_t b_length, //
simsimd_distance_t *results);
SIMSIMD_PUBLIC void simsimd_spdot_counts_u16_sve2( //
simsimd_u16_t const *a, simsimd_u16_t const *b, //
simsimd_i16_t const *a_weights, simsimd_i16_t const *b_weights, //
simsimd_size_t a_length, simsimd_size_t b_length, //
simsimd_distance_t *results);
SIMSIMD_PUBLIC void simsimd_spdot_weights_u16_sve2( //
simsimd_u16_t const *a, simsimd_u16_t const *b, //
simsimd_bf16_t const *a_weights, simsimd_bf16_t const *b_weights, //
simsimd_size_t a_length, simsimd_size_t b_length, //
simsimd_distance_t *results);
/* SIMD-powered backends for various generations of AVX512 CPUs.
* Skylake is handy, as it supports masked loads and other operations, avoiding the need for the tail loop.
* Ice Lake, however, is needed even for the most basic kernels to perform integer matching.
*/
SIMSIMD_PUBLIC void simsimd_intersect_u16_ice( //
simsimd_u16_t const *a, simsimd_u16_t const *b, //
simsimd_size_t a_length, simsimd_size_t b_length, //
simsimd_distance_t *results);
SIMSIMD_PUBLIC void simsimd_intersect_u32_ice( //
simsimd_u32_t const *a, simsimd_u32_t const *b, //
simsimd_size_t a_length, simsimd_size_t b_length, //
simsimd_distance_t *results);
/* SIMD-powered backends for AMD Turin CPUs with cheap VP2INTERSECT instructions.
 * On the Intel side, only mobile Tiger Lake chips support them, and with prohibitively high latency.
*/
SIMSIMD_PUBLIC void simsimd_intersect_u16_turin( //
simsimd_u16_t const *a, simsimd_u16_t const *b, //
simsimd_size_t a_length, simsimd_size_t b_length, //
simsimd_distance_t *results);
SIMSIMD_PUBLIC void simsimd_intersect_u32_turin( //
simsimd_u32_t const *a, simsimd_u32_t const *b, //
simsimd_size_t a_length, simsimd_size_t b_length, //
simsimd_distance_t *results);
SIMSIMD_PUBLIC void simsimd_spdot_counts_u16_turin( //
simsimd_u16_t const *a, simsimd_u16_t const *b, //
simsimd_i16_t const *a_weights, simsimd_i16_t const *b_weights, //
simsimd_size_t a_length, simsimd_size_t b_length, //
simsimd_distance_t *results);
SIMSIMD_PUBLIC void simsimd_spdot_weights_u16_turin( //
simsimd_u16_t const *a, simsimd_u16_t const *b, //
simsimd_bf16_t const *a_weights, simsimd_bf16_t const *b_weights, //
simsimd_size_t a_length, simsimd_size_t b_length, //
simsimd_distance_t *results);
#define SIMSIMD_MAKE_INTERSECT_LINEAR(name, input_type, counter_type) \
SIMSIMD_PUBLIC void simsimd_intersect_##input_type##_##name( \
simsimd_##input_type##_t const *a, simsimd_##input_type##_t const *b, simsimd_size_t a_length, \
simsimd_size_t b_length, simsimd_distance_t *result) { \
simsimd_##counter_type##_t intersection_size = 0; \
simsimd_size_t i = 0, j = 0; \
while (i != a_length && j != b_length) { \
simsimd_##input_type##_t ai = a[i]; \
simsimd_##input_type##_t bj = b[j]; \
intersection_size += ai == bj; \
i += ai < bj; \
j += ai >= bj; \
} \
*result = intersection_size; \
}
SIMSIMD_MAKE_INTERSECT_LINEAR(accurate, u16, size) // simsimd_intersect_u16_accurate
SIMSIMD_MAKE_INTERSECT_LINEAR(accurate, u32, size) // simsimd_intersect_u32_accurate
#define SIMSIMD_MAKE_INTERSECT_WEIGHTED(name, variation, input_type, counter_type, weight_type, accumulator_type, \
load_and_convert) \
SIMSIMD_PUBLIC void simsimd_##variation##_##input_type##_##name( \
simsimd_##input_type##_t const *a, simsimd_##input_type##_t const *b, \
simsimd_##weight_type##_t const *a_weights, simsimd_##weight_type##_t const *b_weights, \
simsimd_size_t a_length, simsimd_size_t b_length, simsimd_distance_t *results) { \
simsimd_##counter_type##_t intersection_size = 0; \
simsimd_##accumulator_type##_t weights_product = 0; \
simsimd_size_t i = 0, j = 0; \
while (i != a_length && j != b_length) { \
simsimd_##input_type##_t ai = a[i]; \
simsimd_##input_type##_t bj = b[j]; \
int matches = ai == bj; \
simsimd_##accumulator_type##_t awi = load_and_convert(a_weights + i); \
        simsimd_##accumulator_type##_t bwj = load_and_convert(b_weights + j);                                     \
        weights_product += matches * awi * bwj;                                                                    \
intersection_size += matches; \
i += ai < bj; \
j += ai >= bj; \
} \
results[0] = intersection_size; \
results[1] = weights_product; \
}
SIMSIMD_MAKE_INTERSECT_WEIGHTED(accurate, spdot_counts, u16, size, i16, i64,
SIMSIMD_DEREFERENCE) // simsimd_spdot_counts_u16_accurate
SIMSIMD_MAKE_INTERSECT_WEIGHTED(accurate, spdot_weights, u16, size, bf16, f64,
SIMSIMD_BF16_TO_F32) // simsimd_spdot_weights_u16_accurate
#define SIMSIMD_MAKE_INTERSECT_GALLOPING(name, input_type, counter_type) \
SIMSIMD_PUBLIC simsimd_size_t simsimd_galloping_search_##input_type(simsimd_##input_type##_t const *array, \
simsimd_size_t start, simsimd_size_t length, \
simsimd_##input_type##_t val) { \
simsimd_size_t low = start; \
simsimd_size_t high = start + 1; \
while (high < length && array[high] < val) { \
low = high; \
high = (2 * high < length) ? 2 * high : length; \
} \
while (low < high) { \
simsimd_size_t mid = low + (high - low) / 2; \
if (array[mid] < val) { low = mid + 1; } \
else { high = mid; } \
} \
return low; \
} \
\
SIMSIMD_PUBLIC void simsimd_intersect_##input_type##_##name( \
simsimd_##input_type##_t const *shorter, simsimd_##input_type##_t const *longer, \
simsimd_size_t shorter_length, simsimd_size_t longer_length, simsimd_distance_t *result) { \
/* Swap arrays if necessary, as we want "longer" to be larger than "shorter" */ \
if (longer_length < shorter_length) { \
simsimd_##input_type##_t const *temp = shorter; \
shorter = longer; \
longer = temp; \
simsimd_size_t temp_length = shorter_length; \
shorter_length = longer_length; \
longer_length = temp_length; \
} \
\
/* Use the accurate implementation if galloping is not beneficial */ \
if (longer_length < 64 * shorter_length) { \
simsimd_intersect_##input_type##_accurate(shorter, longer, shorter_length, longer_length, result); \
return; \
} \
\
/* Perform galloping, shrinking the target range */ \
simsimd_##counter_type##_t intersection_size = 0; \
simsimd_size_t j = 0; \
for (simsimd_size_t i = 0; i < shorter_length; ++i) { \
simsimd_##input_type##_t shorter_i = shorter[i]; \
j = simsimd_galloping_search_##input_type(longer, j, longer_length, shorter_i); \
if (j < longer_length && longer[j] == shorter_i) { intersection_size++; } \
} \
*result = intersection_size; \
}
SIMSIMD_MAKE_INTERSECT_GALLOPING(serial, u16, size) // simsimd_intersect_u16_serial
SIMSIMD_MAKE_INTERSECT_GALLOPING(serial, u32, size) // simsimd_intersect_u32_serial
SIMSIMD_MAKE_INTERSECT_WEIGHTED(serial, spdot_counts, u16, size, i16, i32,
SIMSIMD_DEREFERENCE) // simsimd_spdot_counts_u16_serial
SIMSIMD_MAKE_INTERSECT_WEIGHTED(serial, spdot_weights, u16, size, bf16, f32,
SIMSIMD_BF16_TO_F32) // simsimd_spdot_weights_u16_serial
/* The AVX-512 implementations are inspired by the "Faster-Than-Native Alternatives
* for x86 VP2INTERSECT Instructions" paper by Guille Diez-Canas, 2022.
*
* https://github.com/mozonaut/vp2intersect
* https://arxiv.org/pdf/2112.06342.pdf
*
* For R&D purposes, it's important to keep the following latencies in mind:
*
* - `_mm512_permutex_epi64` - needs F - 3 cycles latency
* - `_mm512_shuffle_epi8` - needs BW - 1 cycle latency
* - `_mm512_permutexvar_epi16` - needs BW - 4-6 cycles latency
* - `_mm512_permutexvar_epi8` - needs VBMI - 3 cycles latency
*/
#if _SIMSIMD_TARGET_X86
#if SIMSIMD_TARGET_ICE
#pragma GCC push_options
#pragma GCC target("avx2", "avx512f", "avx512vl", "bmi2", "lzcnt", "popcnt", "avx512bw", "avx512vbmi2")
#pragma clang attribute push(__attribute__((target("avx2,avx512f,avx512vl,bmi2,lzcnt,popcnt,avx512bw,avx512vbmi2"))), \
apply_to = function)
/**
* @brief Analogous to `_mm512_2intersect_epi16_mask`, but compatible with Ice Lake CPUs,
* slightly faster than the native Tiger Lake implementation, but returns only one mask.
*/
SIMSIMD_INTERNAL simsimd_u32_t _simsimd_intersect_u16x32_ice(__m512i a, __m512i b) {
__m512i a1 = _mm512_alignr_epi32(a, a, 4);
__m512i a2 = _mm512_alignr_epi32(a, a, 8);
__m512i a3 = _mm512_alignr_epi32(a, a, 12);
__m512i b1 = _mm512_shuffle_epi32(b, _MM_PERM_ADCB);
__m512i b2 = _mm512_shuffle_epi32(b, _MM_PERM_BADC);
__m512i b3 = _mm512_shuffle_epi32(b, _MM_PERM_CBAD);
__m512i b01 = _mm512_shrdi_epi32(b, b, 16);
__m512i b11 = _mm512_shrdi_epi32(b1, b1, 16);
__m512i b21 = _mm512_shrdi_epi32(b2, b2, 16);
__m512i b31 = _mm512_shrdi_epi32(b3, b3, 16);
__mmask32 nm00 = _mm512_cmpneq_epi16_mask(a, b);
__mmask32 nm01 = _mm512_cmpneq_epi16_mask(a1, b);
__mmask32 nm02 = _mm512_cmpneq_epi16_mask(a2, b);
__mmask32 nm03 = _mm512_cmpneq_epi16_mask(a3, b);
__mmask32 nm10 = _mm512_mask_cmpneq_epi16_mask(nm00, a, b01);
__mmask32 nm11 = _mm512_mask_cmpneq_epi16_mask(nm01, a1, b01);
__mmask32 nm12 = _mm512_mask_cmpneq_epi16_mask(nm02, a2, b01);
__mmask32 nm13 = _mm512_mask_cmpneq_epi16_mask(nm03, a3, b01);
__mmask32 nm20 = _mm512_mask_cmpneq_epi16_mask(nm10, a, b1);
__mmask32 nm21 = _mm512_mask_cmpneq_epi16_mask(nm11, a1, b1);
__mmask32 nm22 = _mm512_mask_cmpneq_epi16_mask(nm12, a2, b1);
__mmask32 nm23 = _mm512_mask_cmpneq_epi16_mask(nm13, a3, b1);
__mmask32 nm30 = _mm512_mask_cmpneq_epi16_mask(nm20, a, b11);
__mmask32 nm31 = _mm512_mask_cmpneq_epi16_mask(nm21, a1, b11);
__mmask32 nm32 = _mm512_mask_cmpneq_epi16_mask(nm22, a2, b11);
__mmask32 nm33 = _mm512_mask_cmpneq_epi16_mask(nm23, a3, b11);
__mmask32 nm40 = _mm512_mask_cmpneq_epi16_mask(nm30, a, b2);
__mmask32 nm41 = _mm512_mask_cmpneq_epi16_mask(nm31, a1, b2);
__mmask32 nm42 = _mm512_mask_cmpneq_epi16_mask(nm32, a2, b2);
__mmask32 nm43 = _mm512_mask_cmpneq_epi16_mask(nm33, a3, b2);
__mmask32 nm50 = _mm512_mask_cmpneq_epi16_mask(nm40, a, b21);
__mmask32 nm51 = _mm512_mask_cmpneq_epi16_mask(nm41, a1, b21);
__mmask32 nm52 = _mm512_mask_cmpneq_epi16_mask(nm42, a2, b21);
__mmask32 nm53 = _mm512_mask_cmpneq_epi16_mask(nm43, a3, b21);
__mmask32 nm60 = _mm512_mask_cmpneq_epi16_mask(nm50, a, b3);
__mmask32 nm61 = _mm512_mask_cmpneq_epi16_mask(nm51, a1, b3);
__mmask32 nm62 = _mm512_mask_cmpneq_epi16_mask(nm52, a2, b3);
__mmask32 nm63 = _mm512_mask_cmpneq_epi16_mask(nm53, a3, b3);
__mmask32 nm70 = _mm512_mask_cmpneq_epi16_mask(nm60, a, b31);
__mmask32 nm71 = _mm512_mask_cmpneq_epi16_mask(nm61, a1, b31);
__mmask32 nm72 = _mm512_mask_cmpneq_epi16_mask(nm62, a2, b31);
__mmask32 nm73 = _mm512_mask_cmpneq_epi16_mask(nm63, a3, b31);
return ~(simsimd_u32_t)(nm70 & simsimd_u32_rol(nm71, 8) & simsimd_u32_rol(nm72, 16) & simsimd_u32_ror(nm73, 8));
}
/**
* @brief Analogous to `_mm512_2intersect_epi32`, but compatible with Ice Lake CPUs,
* slightly faster than the native Tiger Lake implementation, but returns only one mask.
*
* Some latencies to keep in mind:
*
* - `_mm512_shuffle_epi32` - "VPSHUFD (ZMM, ZMM, I8)":
* - 1 cycle latency on Ice Lake: 1*p5
* - 1 cycle latency on Genoa: 1*FP123
* - `_mm512_mask_cmpneq_epi32_mask` - "VPCMPD (K, ZMM, ZMM, I8)":
* - 3 cycle latency on Ice Lake: 1*p5
* - 1 cycle latency on Genoa: 1*FP01
* - `_mm512_alignr_epi32` - "VPALIGNR (ZMM, ZMM, ZMM, I8)":
* - 1 cycle latency on Ice Lake: 1*p5
* - 2 cycle latency on Genoa: 1*FP12
* - `_mm512_conflict_epi32` - "VPCONFLICTD (ZMM, ZMM)":
* - up to 26 cycles latency on Ice Lake: 11*p0+9*p05+17*p5
* - up to 7 cycle latency on Genoa: 1*FP01+1*FP12
*/
SIMSIMD_INTERNAL simsimd_u16_t _simsimd_intersect_u32x16_ice(__m512i a, __m512i b) {
__m512i a1 = _mm512_alignr_epi32(a, a, 4);
__m512i b1 = _mm512_shuffle_epi32(b, _MM_PERM_ADCB);
__mmask16 nm00 = _mm512_cmpneq_epi32_mask(a, b);
__m512i a2 = _mm512_alignr_epi32(a, a, 8);
__m512i a3 = _mm512_alignr_epi32(a, a, 12);
__mmask16 nm01 = _mm512_cmpneq_epi32_mask(a1, b);
__mmask16 nm02 = _mm512_cmpneq_epi32_mask(a2, b);
__mmask16 nm03 = _mm512_cmpneq_epi32_mask(a3, b);
__mmask16 nm10 = _mm512_mask_cmpneq_epi32_mask(nm00, a, b1);
__mmask16 nm11 = _mm512_mask_cmpneq_epi32_mask(nm01, a1, b1);
__m512i b2 = _mm512_shuffle_epi32(b, _MM_PERM_BADC);
__mmask16 nm12 = _mm512_mask_cmpneq_epi32_mask(nm02, a2, b1);
__mmask16 nm13 = _mm512_mask_cmpneq_epi32_mask(nm03, a3, b1);
__mmask16 nm20 = _mm512_mask_cmpneq_epi32_mask(nm10, a, b2);
__m512i b3 = _mm512_shuffle_epi32(b, _MM_PERM_CBAD);
__mmask16 nm21 = _mm512_mask_cmpneq_epi32_mask(nm11, a1, b2);
__mmask16 nm22 = _mm512_mask_cmpneq_epi32_mask(nm12, a2, b2);
__mmask16 nm23 = _mm512_mask_cmpneq_epi32_mask(nm13, a3, b2);
__mmask16 nm0 = _mm512_mask_cmpneq_epi32_mask(nm20, a, b3);
__mmask16 nm1 = _mm512_mask_cmpneq_epi32_mask(nm21, a1, b3);
__mmask16 nm2 = _mm512_mask_cmpneq_epi32_mask(nm22, a2, b3);
__mmask16 nm3 = _mm512_mask_cmpneq_epi32_mask(nm23, a3, b3);
return ~(simsimd_u16_t)(nm0 & simsimd_u16_rol(nm1, 4) & simsimd_u16_rol(nm2, 8) & simsimd_u16_ror(nm3, 4));
}
SIMSIMD_PUBLIC void simsimd_intersect_u16_ice( //
simsimd_u16_t const *a, simsimd_u16_t const *b, //
simsimd_size_t a_length, simsimd_size_t b_length, //
simsimd_distance_t *results) {
// The baseline implementation for very small arrays (2 registers or less) can be quite simple:
if (a_length < 64 && b_length < 64) {
simsimd_intersect_u16_serial(a, b, a_length, b_length, results);
return;
}
simsimd_u16_t const *const a_end = a + a_length;
simsimd_u16_t const *const b_end = b + b_length;
simsimd_size_t c = 0;
union vec_t {
__m512i zmm;
simsimd_u16_t u16[32];
simsimd_u8_t u8[64];
} a_vec, b_vec;
while (a + 32 <= a_end && b + 32 <= b_end) {
a_vec.zmm = _mm512_loadu_si512((__m512i const *)a);
b_vec.zmm = _mm512_loadu_si512((__m512i const *)b);
// Intersecting registers with `_simsimd_intersect_u16x32_ice` involves a lot of shuffling
// and comparisons, so we want to avoid it if the slices don't overlap at all..
simsimd_u16_t a_min;
simsimd_u16_t a_max = a_vec.u16[31];
simsimd_u16_t b_min = b_vec.u16[0];
simsimd_u16_t b_max = b_vec.u16[31];
// If the slices don't overlap, advance the appropriate pointer
while (a_max < b_min && a + 64 <= a_end) {
a += 32;
a_vec.zmm = _mm512_loadu_si512((__m512i const *)a);
a_max = a_vec.u16[31];
}
a_min = a_vec.u16[0];
while (b_max < a_min && b + 64 <= b_end) {
b += 32;
b_vec.zmm = _mm512_loadu_si512((__m512i const *)b);
b_max = b_vec.u16[31];
}
b_min = b_vec.u16[0];
__m512i a_last_broadcasted = _mm512_set1_epi16(*(short const *)&a_max);
__m512i b_last_broadcasted = _mm512_set1_epi16(*(short const *)&b_max);
__mmask32 a_step_mask = _mm512_cmple_epu16_mask(a_vec.zmm, b_last_broadcasted);
__mmask32 b_step_mask = _mm512_cmple_epu16_mask(b_vec.zmm, a_last_broadcasted);
a += 32 - _lzcnt_u32((simsimd_u32_t)a_step_mask);
b += 32 - _lzcnt_u32((simsimd_u32_t)b_step_mask);
// Now we are likely to have some overlap, so we can intersect the registers
__mmask32 a_matches = _simsimd_intersect_u16x32_ice(a_vec.zmm, b_vec.zmm);
// The paper also contained a very nice procedure for exporting the matches,
// but we don't need it here:
// _mm512_mask_compressstoreu_epi16(c, a_matches, a_vec);
c += _mm_popcnt_u32(a_matches); // MSVC has no `_popcnt32`
}
simsimd_intersect_u16_serial(a, b, a_end - a, b_end - b, results);
*results += c;
}
SIMSIMD_PUBLIC void simsimd_intersect_u32_ice( //
simsimd_u32_t const *a, simsimd_u32_t const *b, //
simsimd_size_t a_length, simsimd_size_t b_length, //
simsimd_distance_t *results) {
// The baseline implementation for very small arrays (2 registers or less) can be quite simple:
if (a_length < 32 && b_length < 32) {
simsimd_intersect_u32_serial(a, b, a_length, b_length, results);
return;
}
simsimd_u32_t const *const a_end = a + a_length;
simsimd_u32_t const *const b_end = b + b_length;
simsimd_size_t c = 0;
union vec_t {
__m512i zmm;
simsimd_u32_t u32[16];
simsimd_u8_t u8[64];
} a_vec, b_vec;
while (a + 16 <= a_end && b + 16 <= b_end) {
a_vec.zmm = _mm512_loadu_si512((__m512i const *)a);
b_vec.zmm = _mm512_loadu_si512((__m512i const *)b);
// Intersecting registers with `_simsimd_intersect_u32x16_ice` involves a lot of shuffling
// and comparisons, so we want to avoid it if the slices don't overlap at all..
simsimd_u32_t a_min;
simsimd_u32_t a_max = a_vec.u32[15];
simsimd_u32_t b_min = b_vec.u32[0];
simsimd_u32_t b_max = b_vec.u32[15];
// If the slices don't overlap, advance the appropriate pointer
while (a_max < b_min && a + 32 <= a_end) {
a += 16;
a_vec.zmm = _mm512_loadu_si512((__m512i const *)a);
a_max = a_vec.u32[15];
}
a_min = a_vec.u32[0];
while (b_max < a_min && b + 32 <= b_end) {
b += 16;
b_vec.zmm = _mm512_loadu_si512((__m512i const *)b);
b_max = b_vec.u32[15];
}
b_min = b_vec.u32[0];
__m512i a_last_broadcasted = _mm512_set1_epi32(*(int const *)&a_max);
__m512i b_last_broadcasted = _mm512_set1_epi32(*(int const *)&b_max);
__mmask16 a_step_mask = _mm512_cmple_epu32_mask(a_vec.zmm, b_last_broadcasted);
__mmask16 b_step_mask = _mm512_cmple_epu32_mask(b_vec.zmm, a_last_broadcasted);
a += 32 - _lzcnt_u32((simsimd_u32_t)a_step_mask);
b += 32 - _lzcnt_u32((simsimd_u32_t)b_step_mask);
// Now we are likely to have some overlap, so we can intersect the registers
__mmask16 a_matches = _simsimd_intersect_u32x16_ice(a_vec.zmm, b_vec.zmm);
// The paper also contained a very nice procedure for exporting the matches,
// but we don't need it here:
// _mm512_mask_compressstoreu_epi32(c, a_matches, a_vec);
c += _mm_popcnt_u32(a_matches); // MSVC has no `_popcnt32`
}
simsimd_intersect_u32_serial(a, b, a_end - a, b_end - b, results);
*results += c;
}
#pragma clang attribute pop
#pragma GCC pop_options
#endif // SIMSIMD_TARGET_ICE
#if SIMSIMD_TARGET_TURIN
#pragma GCC push_options
#pragma GCC target("avx2", "avx512f", "avx512vl", "bmi", "bmi2", "lzcnt", "popcnt", "avx512bw", "avx512vbmi2", \
"avx512bf16", "avx512vnni", "avx512vp2intersect", "avx512dq")
#pragma clang attribute push( \
__attribute__((target( \
"avx2,avx512f,avx512vl,bmi,bmi2,lzcnt,popcnt,avx512bw,avx512vbmi2,avx512bf16,avx512vnni,avx512vp2intersect,avx512dq"))), \
apply_to = function)
SIMSIMD_PUBLIC void simsimd_intersect_u16_turin( //
simsimd_u16_t const *a, simsimd_u16_t const *b, //
simsimd_size_t a_length, simsimd_size_t b_length, //
simsimd_distance_t *results) {
//! There is no such thing as `_mm512_2intersect_epi16`, only the 32-bit variant!
//! So instead of jumping through 32 entries at a time, like on Ice Lake, we will
//! step through 16 entries at a time.
simsimd_u16_t const *const a_end = a + a_length;
simsimd_u16_t const *const b_end = b + b_length;
simsimd_size_t c = 0;
union vec_t {
__m256i ymm;
simsimd_u16_t u16[16];
simsimd_u8_t u8[32];
} a_vec, b_vec;
// Broadcast index for last element (hoisted outside loop)
__m256i const last_idx = _mm256_set1_epi16(15);
while (a + 16 <= a_end && b + 16 <= b_end) {
a_vec.ymm = _mm256_lddqu_si256((__m256i const *)a);
b_vec.ymm = _mm256_lddqu_si256((__m256i const *)b);
// Intersect the registers
__m512i a_i32_vec = _mm512_cvtepu16_epi32(a_vec.ymm);
__m512i b_i32_vec = _mm512_cvtepu16_epi32(b_vec.ymm);
__mmask16 a_matches_any_in_b, b_matches_any_in_a;
_mm512_2intersect_epi32(a_i32_vec, b_i32_vec, &a_matches_any_in_b, &b_matches_any_in_a);
// The paper also contained a very nice procedure for exporting the matches,
// but we don't need it here:
// _mm512_mask_compressstoreu_epi16(c, a_matches_any_in_b, a_vec);
c += _mm_popcnt_u32(a_matches_any_in_b); // MSVC has no `_popcnt32`
__m256i a_last_broadcasted = _mm256_permutexvar_epi16(last_idx, a_vec.ymm);
__m256i b_last_broadcasted = _mm256_permutexvar_epi16(last_idx, b_vec.ymm);
__mmask16 a_step_mask = _mm256_cmple_epu16_mask(a_vec.ymm, b_last_broadcasted);
__mmask16 b_step_mask = _mm256_cmple_epu16_mask(b_vec.ymm, a_last_broadcasted);
a += _tzcnt_u32(~(simsimd_u32_t)a_step_mask | 0x10000);
b += _tzcnt_u32(~(simsimd_u32_t)b_step_mask | 0x10000);
}
simsimd_intersect_u16_serial(a, b, a_end - a, b_end - b, results);
*results += c;
}
SIMSIMD_PUBLIC void simsimd_intersect_u32_turin( //
simsimd_u32_t const *a, simsimd_u32_t const *b, //
simsimd_size_t a_length, simsimd_size_t b_length, //
simsimd_distance_t *results) {
simsimd_u32_t const *const a_end = a + a_length;
simsimd_u32_t const *const b_end = b + b_length;
simsimd_size_t c = 0;
union vec_t {
__m512i zmm;
simsimd_u32_t u32[16];
simsimd_u8_t u8[64];
} a_vec, b_vec;
// Broadcast index for last element (hoisted outside loop)
__m512i const last_idx = _mm512_set1_epi32(15);
while (a + 16 <= a_end && b + 16 <= b_end) {
a_vec.zmm = _mm512_loadu_si512((__m512i const *)a);
b_vec.zmm = _mm512_loadu_si512((__m512i const *)b);
// Intersect the registers
__mmask16 a_matches_any_in_b, b_matches_any_in_a;
_mm512_2intersect_epi32(a_vec.zmm, b_vec.zmm, &a_matches_any_in_b, &b_matches_any_in_a);
// The paper also contained a very nice procedure for exporting the matches,
// but we don't need it here:
// _mm512_mask_compressstoreu_epi32(c, a_matches_any_in_b, a_vec);
c += _mm_popcnt_u32(a_matches_any_in_b); // MSVC has no `_popcnt32`
// Pure SIMD broadcasts - no scalar extraction needed
__m512i a_last_broadcasted = _mm512_permutexvar_epi32(last_idx, a_vec.zmm);
__m512i b_last_broadcasted = _mm512_permutexvar_epi32(last_idx, b_vec.zmm);
__mmask16 a_step_mask = _mm512_cmple_epu32_mask(a_vec.zmm, b_last_broadcasted);
__mmask16 b_step_mask = _mm512_cmple_epu32_mask(b_vec.zmm, a_last_broadcasted);
a += _tzcnt_u32(~(simsimd_u32_t)a_step_mask | 0x10000);
b += _tzcnt_u32(~(simsimd_u32_t)b_step_mask | 0x10000);
}
simsimd_intersect_u32_serial(a, b, a_end - a, b_end - b, results);
*results += c;
}
SIMSIMD_PUBLIC void simsimd_spdot_weights_u16_turin( //
simsimd_u16_t const *a, simsimd_u16_t const *b, //
simsimd_bf16_t const *a_weights, simsimd_bf16_t const *b_weights, //
simsimd_size_t a_length, simsimd_size_t b_length, //
simsimd_distance_t *results) {
// The baseline implementation for very small arrays (2 registers or less) can be quite simple:
if (a_length < 64 && b_length < 64) {
simsimd_spdot_weights_u16_serial(a, b, a_weights, b_weights, a_length, b_length, results);
return;
}
//! There is no such thing as `_mm512_2intersect_epi16`, only the 32-bit variant!
//! So instead of jumping through 32 entries at a time, like on Ice Lake, we will
//! step through 16 entries at a time.
simsimd_u16_t const *const a_end = a + a_length;
simsimd_u16_t const *const b_end = b + b_length;
simsimd_size_t intersection_size = 0;
union vec_t {
__m256i ymm;
__m256 ymmps;
simsimd_u16_t u16[16];
simsimd_u8_t u8[32];
} a_vec, b_vec, product_vec;
product_vec.ymmps = _mm256_setzero_ps();
// Broadcast index for last element (hoisted outside loop)
__m256i const last_idx = _mm256_set1_epi16(15);
while (a + 16 <= a_end && b + 16 <= b_end) {
a_vec.ymm = _mm256_lddqu_si256((__m256i const *)a);
b_vec.ymm = _mm256_lddqu_si256((__m256i const *)b);
// Intersecting registers with `_mm512_2intersect_epi16_mask` involves a lot of shuffling
// and comparisons, so we want to avoid it if the slices don't overlap at all..
simsimd_u16_t a_min;
simsimd_u16_t a_max = a_vec.u16[15];
simsimd_u16_t b_min = b_vec.u16[0];
simsimd_u16_t b_max = b_vec.u16[15];
// If the slices don't overlap, advance the appropriate pointer
while (a_max < b_min && a + 32 <= a_end) {
a += 16, a_weights += 16;
a_vec.ymm = _mm256_lddqu_si256((__m256i const *)a);
a_max = a_vec.u16[15];
}
a_min = a_vec.u16[0];
while (b_max < a_min && b + 32 <= b_end) {
b += 16, b_weights += 16;
b_vec.ymm = _mm256_lddqu_si256((__m256i const *)b);
b_max = b_vec.u16[15];
}
b_min = b_vec.u16[0];
// Now we are likely to have some overlap, so we can intersect the registers
__m512i a_i32_vec = _mm512_cvtepu16_epi32(a_vec.ymm);
__m512i b_i32_vec = _mm512_cvtepu16_epi32(b_vec.ymm);
__mmask16 a_matches_any_in_b, b_matches_any_in_a;
_mm512_2intersect_epi32(a_i32_vec, b_i32_vec, &a_matches_any_in_b, &b_matches_any_in_a);
// The paper also contained a very nice procedure for exporting the matches,
// but we don't need it here:
// _mm512_mask_compressstoreu_epi16(intersection_size, a_matches_any_in_b, a_vec);
int a_matches_count_in_b = _mm_popcnt_u32(a_matches_any_in_b); // MSVC has no `_popcnt32`
intersection_size += a_matches_count_in_b;
// Load and shift all the relevant weights to the start of the vector before doing the dot product
if (a_matches_count_in_b) {
__m256i a_weights_vec = _mm256_lddqu_si256((__m256i const *)a_weights);
a_weights_vec = _mm256_maskz_compress_epi16(a_matches_any_in_b, a_weights_vec);
__m256i b_weights_vec = _mm256_lddqu_si256((__m256i const *)b_weights);
b_weights_vec = _mm256_maskz_compress_epi16(b_matches_any_in_a, b_weights_vec);
product_vec.ymmps = _mm256_dpbf16_ps(product_vec.ymmps, (__m256bh)a_weights_vec, (__m256bh)b_weights_vec);
}
__m256i a_last_broadcasted = _mm256_permutexvar_epi16(last_idx, a_vec.ymm);
__m256i b_last_broadcasted = _mm256_permutexvar_epi16(last_idx, b_vec.ymm);
__mmask16 a_step_mask = _mm256_cmple_epu16_mask(a_vec.ymm, b_last_broadcasted);
__mmask16 b_step_mask = _mm256_cmple_epu16_mask(b_vec.ymm, a_last_broadcasted);
simsimd_size_t a_step = _tzcnt_u32(~(simsimd_u32_t)a_step_mask | 0x10000);
simsimd_size_t b_step = _tzcnt_u32(~(simsimd_u32_t)b_step_mask | 0x10000);
a += a_step, a_weights += a_step;
b += b_step, b_weights += b_step;
}
simsimd_spdot_weights_u16_serial(a, b, a_weights, b_weights, a_end - a, b_end - b, results);
results[0] += intersection_size;
results[1] += _mm512_reduce_add_ps(_mm512_insertf32x8(_mm512_setzero_ps(), product_vec.ymmps, 0));
}
SIMSIMD_PUBLIC void simsimd_spdot_counts_u16_turin( //
simsimd_u16_t const *a, simsimd_u16_t const *b, //
simsimd_i16_t const *a_weights, simsimd_i16_t const *b_weights, //
simsimd_size_t a_length, simsimd_size_t b_length, //
simsimd_distance_t *results) {
// The baseline implementation for very small arrays (2 registers or less) can be quite simple:
if (a_length < 64 && b_length < 64) {
simsimd_spdot_counts_u16_serial(a, b, a_weights, b_weights, a_length, b_length, results);
return;
}
//! There is no such thing as `_mm512_2intersect_epi16`, only the 32-bit variant!
//! So instead of jumping through 32 entries at a time, like on Ice Lake, we will
//! step through 16 entries at a time.
simsimd_u16_t const *const a_end = a + a_length;
simsimd_u16_t const *const b_end = b + b_length;
simsimd_size_t intersection_size = 0;
union vec_t {
__m256i ymm;
simsimd_u16_t u16[16];
simsimd_u8_t u8[32];
} a_vec, b_vec, product_vec;
product_vec.ymm = _mm256_setzero_si256();
// Broadcast index for last element (hoisted outside loop)
__m256i const last_idx = _mm256_set1_epi16(15);
while (a + 16 <= a_end && b + 16 <= b_end) {
a_vec.ymm = _mm256_lddqu_si256((__m256i const *)a);
b_vec.ymm = _mm256_lddqu_si256((__m256i const *)b);
// Intersecting registers with `_mm512_2intersect_epi16_mask` involves a lot of shuffling
// and comparisons, so we want to avoid it if the slices don't overlap at all..
simsimd_u16_t a_min;
simsimd_u16_t a_max = a_vec.u16[15];
simsimd_u16_t b_min = b_vec.u16[0];
simsimd_u16_t b_max = b_vec.u16[15];
// If the slices don't overlap, advance the appropriate pointer
while (a_max < b_min && a + 32 <= a_end) {
a += 16, a_weights += 16;
a_vec.ymm = _mm256_lddqu_si256((__m256i const *)a);
a_max = a_vec.u16[15];
}
a_min = a_vec.u16[0];
while (b_max < a_min && b + 32 <= b_end) {
b += 16, b_weights += 16;
b_vec.ymm = _mm256_lddqu_si256((__m256i const *)b);
b_max = b_vec.u16[15];
}
b_min = b_vec.u16[0];
// Now we are likely to have some overlap, so we can intersect the registers
__m512i a_i32_vec = _mm512_cvtepu16_epi32(a_vec.ymm);
__m512i b_i32_vec = _mm512_cvtepu16_epi32(b_vec.ymm);
__mmask16 a_matches_any_in_b, b_matches_any_in_a;
_mm512_2intersect_epi32(a_i32_vec, b_i32_vec, &a_matches_any_in_b, &b_matches_any_in_a);
// The paper also contained a very nice procedure for exporting the matches,
// but we don't need it here:
// _mm512_mask_compressstoreu_epi16(intersection_size, a_matches_any_in_b, a_vec);
int a_matches_count_in_b = _mm_popcnt_u32(a_matches_any_in_b); // MSVC has no `_popcnt32`
intersection_size += a_matches_count_in_b;
// Load and shift all the relevant weights to the start of the vector before doing the dot product
if (a_matches_count_in_b) {
__m256i a_weights_vec = _mm256_lddqu_si256((__m256i const *)a_weights);
a_weights_vec = _mm256_maskz_compress_epi16(a_matches_any_in_b, a_weights_vec);
__m256i b_weights_vec = _mm256_lddqu_si256((__m256i const *)b_weights);
b_weights_vec = _mm256_maskz_compress_epi16(b_matches_any_in_a, b_weights_vec);
product_vec.ymm = _mm256_dpwssds_epi32(product_vec.ymm, a_weights_vec, b_weights_vec);
}
__m256i a_last_broadcasted = _mm256_permutexvar_epi16(last_idx, a_vec.ymm);
__m256i b_last_broadcasted = _mm256_permutexvar_epi16(last_idx, b_vec.ymm);
__mmask16 a_step_mask = _mm256_cmple_epu16_mask(a_vec.ymm, b_last_broadcasted);
__mmask16 b_step_mask = _mm256_cmple_epu16_mask(b_vec.ymm, a_last_broadcasted);
simsimd_size_t a_step = _tzcnt_u32(~(simsimd_u32_t)a_step_mask | 0x10000);
simsimd_size_t b_step = _tzcnt_u32(~(simsimd_u32_t)b_step_mask | 0x10000);
a += a_step, a_weights += a_step;
b += b_step, b_weights += b_step;
}
simsimd_spdot_counts_u16_serial(a, b, a_weights, b_weights, a_end - a, b_end - b, results);
results[0] += intersection_size;
results[1] += _mm512_reduce_add_epi32(_mm512_inserti64x4(_mm512_setzero_si512(), product_vec.ymm, 0));
}
#pragma clang attribute pop
#pragma GCC pop_options
#endif // SIMSIMD_TARGET_TURIN
#endif // _SIMSIMD_TARGET_X86
#if _SIMSIMD_TARGET_ARM
#if SIMSIMD_TARGET_NEON
#pragma GCC push_options
#pragma GCC target("arch=armv8-a")
#pragma clang attribute push(__attribute__((target("arch=armv8-a"))), apply_to = function)
/**
 * @brief Uses `vshrn` to produce a bitmask, similar to `movemask` in SSE: every 0x00 / 0xFF byte of
 *        the input comparison mask is narrowed into one 4-bit nibble of the returned 64-bit integer.
 * https://community.arm.com/arm-community-blogs/b/infrastructure-solutions-blog/posts/porting-x86-vector-bitmask-optimizations-to-arm-neon
*/
SIMSIMD_INTERNAL simsimd_u64_t _simsimd_u8_to_u4_neon(uint8x16_t vec) {
return vget_lane_u64(vreinterpret_u64_u8(vshrn_n_u16(vreinterpretq_u16_u8(vec), 4)), 0);
}
SIMSIMD_INTERNAL int _simsimd_clz_u64(simsimd_u64_t x) {
// On GCC and Clang use the builtin, otherwise use the generic implementation
#if defined(__GNUC__) || defined(__clang__)
return __builtin_clzll(x);
#else
int n = 0;
while ((x & 0x8000000000000000ull) == 0) n++, x <<= 1;
return n;
#endif
}
SIMSIMD_INTERNAL uint32x4_t _simsimd_intersect_u32x4_neon(uint32x4_t a, uint32x4_t b) {
uint32x4_t b1 = vextq_u32(b, b, 1);
uint32x4_t b2 = vextq_u32(b, b, 2);
uint32x4_t b3 = vextq_u32(b, b, 3);
uint32x4_t nm00 = vceqq_u32(a, b);
uint32x4_t nm01 = vceqq_u32(a, b1);
uint32x4_t nm02 = vceqq_u32(a, b2);
uint32x4_t nm03 = vceqq_u32(a, b3);
uint32x4_t nm = vorrq_u32(vorrq_u32(nm00, nm01), vorrq_u32(nm02, nm03));
return nm;
}
SIMSIMD_INTERNAL uint16x8_t _simsimd_intersect_u16x8_neon(uint16x8_t a, uint16x8_t b) {
uint16x8_t b1 = vextq_u16(b, b, 1);
uint16x8_t b2 = vextq_u16(b, b, 2);
uint16x8_t b3 = vextq_u16(b, b, 3);
uint16x8_t b4 = vextq_u16(b, b, 4);
uint16x8_t b5 = vextq_u16(b, b, 5);
uint16x8_t b6 = vextq_u16(b, b, 6);
uint16x8_t b7 = vextq_u16(b, b, 7);
uint16x8_t nm00 = vceqq_u16(a, b);
uint16x8_t nm01 = vceqq_u16(a, b1);
uint16x8_t nm02 = vceqq_u16(a, b2);
uint16x8_t nm03 = vceqq_u16(a, b3);
uint16x8_t nm04 = vceqq_u16(a, b4);
uint16x8_t nm05 = vceqq_u16(a, b5);
uint16x8_t nm06 = vceqq_u16(a, b6);
uint16x8_t nm07 = vceqq_u16(a, b7);
uint16x8_t nm = vorrq_u16(vorrq_u16(vorrq_u16(nm00, nm01), vorrq_u16(nm02, nm03)),
vorrq_u16(vorrq_u16(nm04, nm05), vorrq_u16(nm06, nm07)));
return nm;
}
SIMSIMD_PUBLIC void simsimd_intersect_u16_neon( //
simsimd_u16_t const *a, simsimd_u16_t const *b, //
simsimd_size_t a_length, simsimd_size_t b_length, //
simsimd_distance_t *results) {
// The baseline implementation for very small arrays (2 registers or less) can be quite simple:
if (a_length < 32 && b_length < 32) {
simsimd_intersect_u16_serial(a, b, a_length, b_length, results);
return;
}
simsimd_u16_t const *const a_end = a + a_length;
simsimd_u16_t const *const b_end = b + b_length;
union vec_t {
uint16x8_t u16x8;
simsimd_u16_t u16[8];
simsimd_u8_t u8[16];
} a_vec, b_vec, c_counts_vec;
c_counts_vec.u16x8 = vdupq_n_u16(0);
while (a + 8 <= a_end && b + 8 <= b_end) {
a_vec.u16x8 = vld1q_u16(a);
b_vec.u16x8 = vld1q_u16(b);
// Intersecting registers with `_simsimd_intersect_u16x8_neon` involves a lot of shuffling
// and comparisons, so we want to avoid it if the slices don't overlap at all..
simsimd_u16_t a_min;
simsimd_u16_t a_max = a_vec.u16[7];
simsimd_u16_t b_min = b_vec.u16[0];
simsimd_u16_t b_max = b_vec.u16[7];
// If the slices don't overlap, advance the appropriate pointer
while (a_max < b_min && a + 16 <= a_end) {
a += 8;
a_vec.u16x8 = vld1q_u16(a);
a_max = a_vec.u16[7];
}
a_min = a_vec.u16[0];
while (b_max < a_min && b + 16 <= b_end) {
b += 8;
b_vec.u16x8 = vld1q_u16(b);
b_max = b_vec.u16[7];
}
b_min = b_vec.u16[0];
// Now we are likely to have some overlap, so we can intersect the registers.
// We can do it by performing a population count at every cycle, but it's not the cheapest in terms of cycles.
//
// simsimd_u64_t a_matches = __builtin_popcountll(
// _simsimd_u8_to_u4_neon(vreinterpretq_u8_u16(
// _simsimd_intersect_u16x8_neon(a_vec.u16x8, b_vec.u16x8))));
// c += a_matches / 8;
//
        // Alternatively, we can transform the match-masks into "ones", accumulate them across iterations,
        // and merge them all together in the end.
uint16x8_t a_matches = _simsimd_intersect_u16x8_neon(a_vec.u16x8, b_vec.u16x8);
c_counts_vec.u16x8 = vaddq_u16(c_counts_vec.u16x8, vandq_u16(a_matches, vdupq_n_u16(1)));
// Counting leading zeros is tricky. On Arm we can use inline Assembly to get the result,
// but MSVC doesn't support that:
//
// SIMSIMD_INTERNAL int _simsimd_clz_u64(simsimd_u64_t value) {
// simsimd_u64_t result;
// __asm__("clz %x0, %x1" : "=r"(result) : "r"(value));
// return (int)result;
// }
//
// Alternatively, we can use the `vclz_u32` NEON intrinsic.
        // It would compute the number of leading zeros for both `a_step` and `b_step` in parallel.
uint16x8_t a_last_broadcasted = vdupq_n_u16(a_max);
uint16x8_t b_last_broadcasted = vdupq_n_u16(b_max);
simsimd_u64_t a_step = _simsimd_clz_u64(_simsimd_u8_to_u4_neon( //
vreinterpretq_u8_u16(vcleq_u16(a_vec.u16x8, b_last_broadcasted))));
simsimd_u64_t b_step = _simsimd_clz_u64(_simsimd_u8_to_u4_neon( //
vreinterpretq_u8_u16(vcleq_u16(b_vec.u16x8, a_last_broadcasted))));
a += (64 - a_step) / 8;
b += (64 - b_step) / 8;
}
simsimd_intersect_u16_serial(a, b, a_end - a, b_end - b, results);
*results += vaddvq_u16(c_counts_vec.u16x8);
}
SIMSIMD_PUBLIC void simsimd_intersect_u32_neon( //
simsimd_u32_t const *a, simsimd_u32_t const *b, //
simsimd_size_t a_length, simsimd_size_t b_length, //
simsimd_distance_t *results) {
// The baseline implementation for very small arrays (2 registers or less) can be quite simple:
if (a_length < 32 && b_length < 32) {
simsimd_intersect_u32_serial(a, b, a_length, b_length, results);
return;
}
simsimd_u32_t const *const a_end = a + a_length;
simsimd_u32_t const *const b_end = b + b_length;
union vec_t {
uint32x4_t u32x4;
simsimd_u32_t u32[4];
simsimd_u8_t u8[16];
} a_vec, b_vec, c_counts_vec;
c_counts_vec.u32x4 = vdupq_n_u32(0);
while (a + 4 <= a_end && b + 4 <= b_end) {
a_vec.u32x4 = vld1q_u32(a);
b_vec.u32x4 = vld1q_u32(b);
// Intersecting registers with `_simsimd_intersect_u32x4_neon` involves a lot of shuffling
// and comparisons, so we want to avoid it if the slices don't overlap at all..
simsimd_u32_t a_min;
simsimd_u32_t a_max = a_vec.u32[3];
simsimd_u32_t b_min = b_vec.u32[0];
simsimd_u32_t b_max = b_vec.u32[3];
// If the slices don't overlap, advance the appropriate pointer
while (a_max < b_min && a + 8 <= a_end) {
a += 4;
a_vec.u32x4 = vld1q_u32(a);
a_max = a_vec.u32[3];
}
a_min = a_vec.u32[0];
while (b_max < a_min && b + 8 <= b_end) {
b += 4;
b_vec.u32x4 = vld1q_u32(b);
b_max = b_vec.u32[3];
}
b_min = b_vec.u32[0];
// Now we are likely to have some overlap, so we can intersect the registers
// We can do it by performing a population count at every cycle, but it's not the cheapest in terms of cycles.
//
// simsimd_u64_t a_matches = __builtin_popcountll(
// _simsimd_u8_to_u4_neon(vreinterpretq_u8_u32(
// _simsimd_intersect_u32x4_neon(a_vec.u32x4, b_vec.u32x4))));
// c += a_matches / 16;
//
        // Alternatively, we can transform the match-masks into "ones", accumulate them across iterations,
        // and merge them all together in the end.
uint32x4_t a_matches = _simsimd_intersect_u32x4_neon(a_vec.u32x4, b_vec.u32x4);
c_counts_vec.u32x4 = vaddq_u32(c_counts_vec.u32x4, vandq_u32(a_matches, vdupq_n_u32(1)));
uint32x4_t a_last_broadcasted = vdupq_n_u32(a_max);
uint32x4_t b_last_broadcasted = vdupq_n_u32(b_max);
simsimd_u64_t a_step = _simsimd_clz_u64(_simsimd_u8_to_u4_neon( //
vreinterpretq_u8_u32(vcleq_u32(a_vec.u32x4, b_last_broadcasted))));
simsimd_u64_t b_step = _simsimd_clz_u64(_simsimd_u8_to_u4_neon( //
vreinterpretq_u8_u32(vcleq_u32(b_vec.u32x4, a_last_broadcasted))));
a += (64 - a_step) / 16;
b += (64 - b_step) / 16;
}
simsimd_intersect_u32_serial(a, b, a_end - a, b_end - b, results);
*results += vaddvq_u32(c_counts_vec.u32x4);
}
#pragma clang attribute pop
#pragma GCC pop_options
#endif // SIMSIMD_TARGET_NEON
/* SVE2 introduces many new integer-oriented instructions, extending some of the NEON functionality
* to variable-length SVE registers. Those include "compare multiple" intrinsics:
*
 * - `svmatch[_u16]`, which matches each scalar in the first vector against all members of a 128-bit lane in the second
 *   (see the sketch after this comment).
 * - `svhistcnt[_s32]_z`, which does something similar, performing an inclusive prefix scan.
 * - `svtbx[_u16]`, which does an extended table lookup.
*
* Other notable instructions:
*
* - `DUP`: Broadcast indexed predicate element
* https://developer.arm.com/documentation/ddi0602/2021-06/SVE-Instructions/DUP--predicate---Broadcast-indexed-predicate-element-?lang=en
* - `SCLAMP` and `UCLAMP`: clamp values, i.e. combined min+max
* https://developer.arm.com/documentation/ddi0602/2021-06/SVE-Instructions/SCLAMP--Signed-clamp-to-minimum-maximum-vector-?lang=en
* https://developer.arm.com/documentation/ddi0602/2021-06/SVE-Instructions/UCLAMP--Unsigned-clamp-to-minimum-maximum-vector-?lang=en
* - `TBLQ`: Table lookup quadword
* https://developer.arm.com/documentation/ddi0602/2022-12/SVE-Instructions/TBLQ--Programmable-table-lookup-within-each-quadword-vector-segment--zeroing--?lang=en
*
* Great resources for SVE2 intrinsics:
*
* > ARM’s Scalable Vector Extensions: A Critical Look at SVE2 For Integer Workloads
* https://gist.github.com/zingaburga/805669eb891c820bd220418ee3f0d6bd
*/
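/* A rough scalar sketch (not the actual implementation) of what `svmatch_u16` computes in the kernels
 * below, assuming 16-bit lanes so that every 128-bit segment holds 8 elements:
 *
 *     int elements = 8 * segments_in_register;      // e.g. 8 for a 128-bit SVE implementation
 *     for (int i = 0; i < elements; ++i) {
 *         int segment_start = (i / 8) * 8;
 *         // The predicate bit is set if the active element of `a_vec` matches
 *         // ANY element of `b_vec` within the same 128-bit segment:
 *         result[i] = 0;
 *         for (int j = segment_start; j < segment_start + 8; ++j)
 *             result[i] |= active[i] && (a_vec[i] == b_vec[j]);
 *     }
 *
 * The u16 kernels below then rotate `b_vec` across segments with `svext_u16` and repeat the match,
 * so every element of `a_vec` is eventually compared against every element of `b_vec`.
 */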
#if SIMSIMD_TARGET_SVE2
#pragma GCC push_options
#pragma GCC target("arch=armv8.2-a+sve+sve2")
#pragma clang attribute push(__attribute__((target("arch=armv8.2-a+sve+sve2"))), apply_to = function)
SIMSIMD_PUBLIC void simsimd_intersect_u16_sve2( //
simsimd_u16_t const *a, simsimd_u16_t const *b, //
simsimd_size_t a_length,
simsimd_size_t b_length, //
simsimd_distance_t *results) {
// A single SVE lane is 128 bits wide, so one lane fits 8 values.
simsimd_size_t const register_size = svcnth();
simsimd_size_t const lanes_count = register_size / 8;
simsimd_size_t a_idx = 0, b_idx = 0;
simsimd_size_t c = 0;
while (a_idx < a_length && b_idx < b_length) {
        // Load the next chunks of `a` and `b` under their progress masks
svbool_t a_progress = svwhilelt_b16_u64(a_idx, a_length);
svbool_t b_progress = svwhilelt_b16_u64(b_idx, b_length);
svuint16_t a_vec = svld1_u16(a_progress, a + a_idx);
svuint16_t b_vec = svld1_u16(b_progress, b + b_idx);
// Intersecting registers with `svmatch_u16` involves a lot of shuffling
// and comparisons, so we want to avoid it if the slices don't overlap at all..
simsimd_u16_t a_min;
simsimd_u16_t a_max = svlastb(a_progress, a_vec);
simsimd_u16_t b_min = svlasta(svpfalse_b(), b_vec);
simsimd_u16_t b_max = svlastb(b_progress, b_vec);
// If the slices don't overlap, advance the appropriate pointer
while (a_max < b_min && (a_idx + register_size) <= a_length) {
a_idx += register_size;
a_progress = svwhilelt_b16_u64(a_idx, a_length);
a_vec = svld1_u16(a_progress, a + a_idx);
a_max = svlastb(a_progress, a_vec);
}
a_min = svlasta(svpfalse_b(), a_vec);
while (b_max < a_min && (b_idx + register_size) <= b_length) {
b_idx += register_size;
b_progress = svwhilelt_b16_u64(b_idx, b_length);
b_vec = svld1_u16(b_progress, b + b_idx);
b_max = svlastb(b_progress, b_vec);
}
b_min = svlasta(svpfalse_b(), b_vec);
        // Before we evaluate the intersection size, which scrambles the order in `b_vec`,
        // let's estimate how far we will need to advance the pointers afterwards.
// For that, we don't even need to broadcast the values in SVE, as the whole
// register can be compared against a scalar:
//
// svuint16_t a_last_broadcasted = svdup_n_u16(a_max);
// svuint16_t b_last_broadcasted = svdup_n_u16(b_max);
svbool_t a_mask = svcmple_n_u16(a_progress, a_vec, b_max);
svbool_t b_mask = svcmple_n_u16(b_progress, b_vec, a_max);
simsimd_u64_t a_step = svcntp_b16(a_progress, a_mask);
simsimd_u64_t b_step = svcntp_b16(b_progress, b_mask);
// Compare `a_vec` with each lane of `b_vec`
svbool_t equal_mask = svmatch_u16(a_progress, a_vec, b_vec);
for (simsimd_size_t i = 1; i < lanes_count; i++) {
b_vec = svext_u16(b_vec, b_vec, 8);
equal_mask = svorr_z(svptrue_b16(), equal_mask, svmatch_u16(a_progress, a_vec, b_vec));
}
simsimd_size_t equal_count = svcntp_b16(svptrue_b16(), equal_mask);
// Advance
a_idx += a_step;
b_idx += b_step;
c += equal_count;
}
*results = c;
}
SIMSIMD_PUBLIC void simsimd_intersect_u32_sve2(simsimd_u32_t const *a, simsimd_u32_t const *b, simsimd_size_t a_length,
simsimd_size_t b_length, simsimd_distance_t *results) {
// A single SVE lane is 128 bits wide, so one lane fits 4 values.
simsimd_size_t const register_size = svcntw();
simsimd_size_t const lanes_count = register_size / 4;
simsimd_size_t a_idx = 0, b_idx = 0;
simsimd_size_t c = 0;
while (a_idx < a_length && b_idx < b_length) {
        // Load the next chunks of `a` and `b` under their progress masks
svbool_t a_progress = svwhilelt_b32_u64(a_idx, a_length);
svbool_t b_progress = svwhilelt_b32_u64(b_idx, b_length);
svuint32_t a_vec = svld1_u32(a_progress, a + a_idx);
svuint32_t b_vec = svld1_u32(b_progress, b + b_idx);
// Intersecting registers with `svmatch_u16` involves a lot of shuffling
// and comparisons, so we want to avoid it if the slices don't overlap at all..
simsimd_u32_t a_min;
simsimd_u32_t a_max = svlastb(a_progress, a_vec);
simsimd_u32_t b_min = svlasta(svpfalse_b(), b_vec);
simsimd_u32_t b_max = svlastb(b_progress, b_vec);
// If the slices don't overlap, advance the appropriate pointer
while (a_max < b_min && (a_idx + register_size) <= a_length) {
a_idx += register_size;
a_progress = svwhilelt_b32_u64(a_idx, a_length);
a_vec = svld1_u32(a_progress, a + a_idx);
a_max = svlastb(a_progress, a_vec);
}
a_min = svlasta(svpfalse_b(), a_vec);
while (b_max < a_min && (b_idx + register_size) <= b_length) {
b_idx += register_size;
b_progress = svwhilelt_b32_u64(b_idx, b_length);
b_vec = svld1_u32(b_progress, b + b_idx);
b_max = svlastb(b_progress, b_vec);
}
b_min = svlasta(svpfalse_b(), b_vec);
        // Before we evaluate the intersection size, which scrambles the order in `b_vec`,
        // let's estimate how far we will need to advance the pointers afterwards.
// For that, we don't even need to broadcast the values in SVE, as the whole
// register can be compared against a scalar:
//
// svuint32_t a_last_broadcasted = svdup_n_u32(a_max);
// svuint32_t b_last_broadcasted = svdup_n_u32(b_max);
svbool_t a_mask = svcmple_n_u32(a_progress, a_vec, b_max);
svbool_t b_mask = svcmple_n_u32(b_progress, b_vec, a_max);
simsimd_u64_t a_step = svcntp_b32(a_progress, a_mask);
simsimd_u64_t b_step = svcntp_b32(b_progress, b_mask);
// Comparing `a_vec` with each lane of `b_vec` can't be done with `svmatch`,
// the same way as in `simsimd_intersect_u16_sve2`, as that instruction is only
// available for 8-bit and 16-bit integers.
//
// svbool_t equal_mask = svpfalse_b();
// for (simsimd_size_t i = 0; i < register_size; i++) {
// equal_mask = svorr_z(svptrue_b32(), equal_mask, svcmpeq_u32(a_progress, a_vec, b_vec));
// b_vec = svext_u32(b_vec, b_vec, 1);
// }
// simsimd_size_t equal_count = svcntp_b32(a_progress, equal_mask);
//
// Alternatively, one can use histogram instructions, like `svhistcnt_u32_z`.
// They practically compute the prefix-matching count, which is equivalent to
// the lower triangle of the row-major intersection matrix.
// To compute the upper triangle, we can reverse (with `svrev_b32`) the order of
// elements and repeat the operation, accumulating the results for top and bottom.
// Let's look at 4x element registers as an example:
//
        // Given α = {A, B, C, D}, β = {X, Y, Z, W}:
//
// hist(α, β): hist(α_rev, β_rev):
//
// X Y Z W W Z Y X
// A 1 0 0 0 D 1 0 0 0
// B 1 1 0 0 C 1 1 0 0
// C 1 1 1 0 B 1 1 1 0
// D 1 1 1 1 A 1 1 1 1
//
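        // In plain C, the merged result is equivalent to asking, for every in-bounds element
        // of `a_vec`, whether it occurs anywhere in `b_vec`. A sketch, with `active_a` and
        // `active_b` standing for the number of in-bounds elements in each register:
        //
        //     simsimd_size_t equal_count = 0;
        //     for (simsimd_size_t ai = 0; ai != active_a; ++ai) {
        //         int found = 0;
        //         for (simsimd_size_t bi = 0; bi != active_b; ++bi) found |= a[a_idx + ai] == b[b_idx + bi];
        //         equal_count += found;
        //     }
        //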
svuint32_t hist_lower = svhistcnt_u32_z(a_progress, a_vec, b_vec);
svuint32_t a_rev_vec = svrev_u32(a_vec);
svuint32_t b_rev_vec = svrev_u32(b_vec);
svuint32_t hist_upper = svrev_u32(svhistcnt_u32_z(svptrue_b32(), a_rev_vec, b_rev_vec));
svuint32_t hist = svorr_u32_x(a_progress, hist_lower, hist_upper);
svbool_t equal_mask = svcmpne_n_u32(a_progress, hist, 0);
simsimd_size_t equal_count = svcntp_b32(a_progress, equal_mask);
// Advance
a_idx += a_step;
b_idx += b_step;
c += equal_count;
}
*results = c;
}
SIMSIMD_PUBLIC void simsimd_spdot_counts_u16_sve2( //
simsimd_u16_t const *a, simsimd_u16_t const *b, //
simsimd_i16_t const *a_weights, simsimd_i16_t const *b_weights, //
simsimd_size_t a_length, simsimd_size_t b_length, //
simsimd_distance_t *results) {
// A single SVE lane is 128 bits wide, so one lane fits 8 values.
simsimd_size_t const register_size = svcnth();
simsimd_size_t const lanes_count = register_size / 8;
simsimd_size_t a_idx = 0, b_idx = 0;
svint64_t product_vec = svdupq_n_s64(0, 0);
simsimd_size_t intersection_size = 0;
while (a_idx < a_length && b_idx < b_length) {
        // Load the next chunks of `a` and `b` under their progress masks
svbool_t a_progress = svwhilelt_b16_u64(a_idx, a_length);
svbool_t b_progress = svwhilelt_b16_u64(b_idx, b_length);
svuint16_t a_vec = svld1_u16(a_progress, a + a_idx);
svuint16_t b_vec = svld1_u16(b_progress, b + b_idx);
// Intersecting registers with `svmatch_u16` involves a lot of shuffling
        // and comparisons, so we want to avoid it if the slices don't overlap at all.
simsimd_u16_t a_min;
simsimd_u16_t a_max = svlastb(a_progress, a_vec);
simsimd_u16_t b_min = svlasta(svpfalse_b(), b_vec);
simsimd_u16_t b_max = svlastb(b_progress, b_vec);
// If the slices don't overlap, advance the appropriate pointer
while (a_max < b_min && (a_idx + register_size) <= a_length) {
a_idx += register_size;
a_progress = svwhilelt_b16_u64(a_idx, a_length);
a_vec = svld1_u16(a_progress, a + a_idx);
a_max = svlastb(a_progress, a_vec);
}
a_min = svlasta(svpfalse_b(), a_vec);
while (b_max < a_min && (b_idx + register_size) <= b_length) {
b_idx += register_size;
b_progress = svwhilelt_b16_u64(b_idx, b_length);
b_vec = svld1_u16(b_progress, b + b_idx);
b_max = svlastb(b_progress, b_vec);
}
b_min = svlasta(svpfalse_b(), b_vec);
        // Before we evaluate the intersection size, which rotates the contents of `b_vec`,
// let's estimate how much we will need to advance the pointers afterwards.
// For that, we don't even need to broadcast the values in SVE, as the whole
// register can be compared against a scalar:
//
// svuint16_t a_last_broadcasted = svdup_n_u16(a_max);
// svuint16_t b_last_broadcasted = svdup_n_u16(b_max);
svbool_t a_mask = svcmple_n_u16(a_progress, a_vec, b_max);
svbool_t b_mask = svcmple_n_u16(b_progress, b_vec, a_max);
simsimd_u64_t a_step = svcntp_b16(a_progress, a_mask);
simsimd_u64_t b_step = svcntp_b16(b_progress, b_mask);
// Compare `a_vec` with each lane of `b_vec`
svint16_t a_weights_vec = svld1_s16(a_progress, a_weights + a_idx);
svint16_t b_weights_vec = svld1_s16(b_progress, b_weights + b_idx);
for (simsimd_size_t i = 0; i < lanes_count; i++) {
svbool_t equal_mask = svmatch_u16(a_progress, a_vec, b_vec);
            svint16_t b_equal_weights_vec = svsel_s16(equal_mask, b_weights_vec, svdup_n_s16(0));
product_vec = svdot_s64(product_vec, a_weights_vec, b_equal_weights_vec);
b_vec = svext_u16(b_vec, b_vec, 8);
intersection_size += svcntp_b16(svptrue_b16(), equal_mask);
}
// Advance
a_idx += a_step;
b_idx += b_step;
}
results[0] = (simsimd_distance_t)intersection_size;
results[1] = svaddv_s64(svptrue_b64(), product_vec);
}
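// A minimal usage sketch for `simsimd_spdot_counts_u16_sve2` above (and the `bf16`-weighted
// variant below); the buffer names are illustrative. Both kernels report two values through
// the `results` array:
//
//     simsimd_distance_t results[2];
//     simsimd_spdot_counts_u16_sve2(a_keys, b_keys, a_weights, b_weights, a_count, b_count, results);
//     // results[0] - number of shared keys, results[1] - dot product of the matched keys' weights
//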
#pragma clang attribute pop
#pragma GCC pop_options
#endif // SIMSIMD_TARGET_SVE2
#if SIMSIMD_TARGET_SVE2 && SIMSIMD_TARGET_SVE_BF16
#pragma GCC push_options
#pragma GCC target("arch=armv8.6-a+sve+sve2+bf16")
#pragma clang attribute push(__attribute__((target("arch=armv8.6-a+sve+sve2+bf16"))), apply_to = function)
SIMSIMD_PUBLIC void simsimd_spdot_weights_u16_sve2( //
simsimd_u16_t const *a, simsimd_u16_t const *b, //
simsimd_bf16_t const *a_weights, simsimd_bf16_t const *b_weights, //
simsimd_size_t a_length, simsimd_size_t b_length, //
simsimd_distance_t *results) {
// A single SVE lane is 128 bits wide, so one lane fits 8 values.
simsimd_size_t const register_size = svcnth();
simsimd_size_t const lanes_count = register_size / 8;
simsimd_size_t a_idx = 0, b_idx = 0;
svfloat32_t product_vec = svdupq_n_f32(0.f, 0.f, 0.f, 0.f);
simsimd_size_t intersection_size = 0;
while (a_idx < a_length && b_idx < b_length) {
        // Load the next chunks of `a` and `b` under their progress masks
svbool_t a_progress = svwhilelt_b16_u64(a_idx, a_length);
svbool_t b_progress = svwhilelt_b16_u64(b_idx, b_length);
svuint16_t a_vec = svld1_u16(a_progress, a + a_idx);
svuint16_t b_vec = svld1_u16(b_progress, b + b_idx);
// Intersecting registers with `svmatch_u16` involves a lot of shuffling
        // and comparisons, so we want to avoid it if the slices don't overlap at all.
simsimd_u16_t a_min;
simsimd_u16_t a_max = svlastb(a_progress, a_vec);
simsimd_u16_t b_min = svlasta(svpfalse_b(), b_vec);
simsimd_u16_t b_max = svlastb(b_progress, b_vec);
// If the slices don't overlap, advance the appropriate pointer
while (a_max < b_min && (a_idx + register_size) <= a_length) {
a_idx += register_size;
a_progress = svwhilelt_b16_u64(a_idx, a_length);
a_vec = svld1_u16(a_progress, a + a_idx);
a_max = svlastb(a_progress, a_vec);
}
a_min = svlasta(svpfalse_b(), a_vec);
while (b_max < a_min && (b_idx + register_size) <= b_length) {
b_idx += register_size;
b_progress = svwhilelt_b16_u64(b_idx, b_length);
b_vec = svld1_u16(b_progress, b + b_idx);
b_max = svlastb(b_progress, b_vec);
}
b_min = svlasta(svpfalse_b(), b_vec);
        // Before we evaluate the intersection size, which rotates the contents of `b_vec`,
// let's estimate how much we will need to advance the pointers afterwards.
// For that, we don't even need to broadcast the values in SVE, as the whole
// register can be compared against a scalar:
//
// svuint16_t a_last_broadcasted = svdup_n_u16(a_max);
// svuint16_t b_last_broadcasted = svdup_n_u16(b_max);
svbool_t a_mask = svcmple_n_u16(a_progress, a_vec, b_max);
svbool_t b_mask = svcmple_n_u16(b_progress, b_vec, a_max);
simsimd_u64_t a_step = svcntp_b16(a_progress, a_mask);
simsimd_u64_t b_step = svcntp_b16(b_progress, b_mask);
// Compare `a_vec` with each lane of `b_vec`
svbfloat16_t a_weights_vec = svld1_bf16(a_progress, (__bf16 const *)a_weights + a_idx);
svbfloat16_t b_weights_vec = svld1_bf16(b_progress, (__bf16 const *)b_weights + b_idx);
for (simsimd_size_t i = 0; i < lanes_count; i++) {
svbool_t equal_mask = svmatch_u16(a_progress, a_vec, b_vec);
            //! The `svsel_bf16` intrinsic is broken in many compilers, not returning the correct type.
            //! So we reinterpret the floats as integers and apply `svsel_s16`, but the `svreinterpret_s16_bf16`
            //! and `svreinterpret_bf16_s16` intrinsics are not always properly defined either!
svint16_t b_equal_weights_vec =
svsel_s16(equal_mask, svreinterpret_s16_bf16(b_weights_vec), svdup_n_s16(0));
product_vec = svbfdot_f32(product_vec, a_weights_vec, svreinterpret_bf16_s16(b_equal_weights_vec));
b_vec = svext_u16(b_vec, b_vec, 8);
intersection_size += svcntp_b16(svptrue_b16(), equal_mask);
}
// Advance
a_idx += a_step;
b_idx += b_step;
}
results[0] = (simsimd_distance_t)intersection_size;
results[1] = svaddv_f32(svptrue_b32(), product_vec);
}
#pragma clang attribute pop
#pragma GCC pop_options
#endif // SIMSIMD_TARGET_SVE2 && SIMSIMD_TARGET_SVE_BF16
#endif // _SIMSIMD_TARGET_ARM
#ifdef __cplusplus
}
#endif
#endif
simsimd-6.5.12/include/simsimd/spatial.h 0000644 0000000 0000000 00000355717 10461020230 0016304 0 ustar 0000000 0000000 /**
* @file spatial.h
* @brief SIMD-accelerated Spatial Similarity Measures.
* @author Ash Vardanian
* @date March 14, 2023
*
* Contains:
* - L2 (Euclidean) regular and squared distance
* - Cosine (Angular) distance - @b not similarity!
*
* For datatypes:
* - 64-bit IEEE floating point numbers
* - 32-bit IEEE floating point numbers
* - 16-bit IEEE floating point numbers
* - 16-bit brain floating point numbers
* - 8-bit unsigned integral numbers
* - 8-bit signed integral numbers
* - 4-bit signed integral numbers
*
* For hardware architectures:
* - Arm: NEON, SVE
* - x86: Haswell, Skylake, Ice Lake, Genoa, Sapphire
*
* x86 intrinsics: https://www.intel.com/content/www/us/en/docs/intrinsics-guide/
* Arm intrinsics: https://developer.arm.com/architectures/instruction-sets/intrinsics/
*/
#ifndef SIMSIMD_SPATIAL_H
#define SIMSIMD_SPATIAL_H
#include "types.h"
#include "dot.h" // `_simsimd_reduce_f32x8_haswell`
#ifdef __cplusplus
extern "C" {
#endif
// clang-format off
/* Serial backends for all numeric types.
* By default they use 32-bit arithmetic, unless the arguments themselves contain 64-bit floats.
* For double-precision computation check out the "*_accurate" variants of those "*_serial" functions.
*/
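/* A minimal usage sketch with illustrative buffers `a`, `b` of length `n`; every declaration
 * below follows the same calling convention:
 *
 *     simsimd_distance_t distance;
 *     simsimd_cos_f32_serial(a, b, n, &distance);   // 32-bit accumulation
 *     simsimd_cos_f32_accurate(a, b, n, &distance); // 64-bit accumulation of the same inputs
 */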
SIMSIMD_PUBLIC void simsimd_l2_f64_serial(simsimd_f64_t const* a, simsimd_f64_t const* b, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_l2sq_f64_serial(simsimd_f64_t const* a, simsimd_f64_t const* b, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_cos_f64_serial(simsimd_f64_t const* a, simsimd_f64_t const* b, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_l2_f32_serial(simsimd_f32_t const* a, simsimd_f32_t const* b, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_l2sq_f32_serial(simsimd_f32_t const* a, simsimd_f32_t const* b, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_cos_f32_serial(simsimd_f32_t const* a, simsimd_f32_t const* b, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_l2_f16_serial(simsimd_f16_t const* a, simsimd_f16_t const* b, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_l2sq_f16_serial(simsimd_f16_t const* a, simsimd_f16_t const* b, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_cos_f16_serial(simsimd_f16_t const* a, simsimd_f16_t const* b, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_l2_bf16_serial(simsimd_bf16_t const* a, simsimd_bf16_t const* b, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_l2sq_bf16_serial(simsimd_bf16_t const* a, simsimd_bf16_t const* b, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_cos_bf16_serial(simsimd_bf16_t const* a, simsimd_bf16_t const* b, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_l2_i8_serial(simsimd_i8_t const* a, simsimd_i8_t const* b, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_l2sq_i8_serial(simsimd_i8_t const* a, simsimd_i8_t const* b, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_cos_i8_serial(simsimd_i8_t const* a, simsimd_i8_t const* b, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_l2_u8_serial(simsimd_u8_t const* a, simsimd_u8_t const* b, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_l2sq_u8_serial(simsimd_u8_t const* a, simsimd_u8_t const* b, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_cos_u8_serial(simsimd_u8_t const* a, simsimd_u8_t const* b, simsimd_size_t n, simsimd_distance_t* result);
/* Double-precision serial backends for all numeric types.
* For single-precision computation check out the "*_serial" counterparts of those "*_accurate" functions.
*/
SIMSIMD_PUBLIC void simsimd_l2_f32_accurate(simsimd_f32_t const* a, simsimd_f32_t const* b, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_l2sq_f32_accurate(simsimd_f32_t const* a, simsimd_f32_t const* b, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_cos_f32_accurate(simsimd_f32_t const* a, simsimd_f32_t const* b, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_l2_f16_accurate(simsimd_f16_t const* a, simsimd_f16_t const* b, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_l2sq_f16_accurate(simsimd_f16_t const* a, simsimd_f16_t const* b, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_cos_f16_accurate(simsimd_f16_t const* a, simsimd_f16_t const* b, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_l2_bf16_accurate(simsimd_bf16_t const* a, simsimd_bf16_t const* b, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_l2sq_bf16_accurate(simsimd_bf16_t const* a, simsimd_bf16_t const* b, simsimd_size_t n, simsimd_distance_t* result);
/* SIMD-powered backends for Arm NEON, mostly using 32-bit arithmetic over 128-bit words.
* By far the most portable backend, covering most Arm v8 devices, over a billion phones, and almost all
* server CPUs produced before 2023.
*/
SIMSIMD_PUBLIC void simsimd_l2_f64_neon(simsimd_f64_t const* a, simsimd_f64_t const* b, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_l2sq_f64_neon(simsimd_f64_t const* a, simsimd_f64_t const* b, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_cos_f64_neon(simsimd_f64_t const* a, simsimd_f64_t const* b, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_l2_f32_neon(simsimd_f32_t const* a, simsimd_f32_t const* b, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_l2sq_f32_neon(simsimd_f32_t const* a, simsimd_f32_t const* b, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_cos_f32_neon(simsimd_f32_t const* a, simsimd_f32_t const* b, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_l2_f16_neon(simsimd_f16_t const* a, simsimd_f16_t const* b, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_l2sq_f16_neon(simsimd_f16_t const* a, simsimd_f16_t const* b, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_cos_f16_neon(simsimd_f16_t const* a, simsimd_f16_t const* b, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_l2_bf16_neon(simsimd_bf16_t const* a, simsimd_bf16_t const* b, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_l2sq_bf16_neon(simsimd_bf16_t const* a, simsimd_bf16_t const* b, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_cos_bf16_neon(simsimd_bf16_t const* a, simsimd_bf16_t const* b, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_l2_i8_neon(simsimd_i8_t const* a, simsimd_i8_t const* b, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_l2sq_i8_neon(simsimd_i8_t const* a, simsimd_i8_t const* b, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_cos_i8_neon(simsimd_i8_t const* a, simsimd_i8_t const* b, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_l2_u8_neon(simsimd_u8_t const* a, simsimd_u8_t const* b, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_l2sq_u8_neon(simsimd_u8_t const* a, simsimd_u8_t const* b, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_cos_u8_neon(simsimd_u8_t const* a, simsimd_u8_t const* b, simsimd_size_t n, simsimd_distance_t* result);
/* SIMD-powered backends for Arm SVE, mostly using 32-bit arithmetic over variable-length platform-defined word sizes.
* Designed for Arm Graviton 3, Microsoft Cobalt, as well as Nvidia Grace and newer Ampere Altra CPUs.
*/
SIMSIMD_PUBLIC void simsimd_l2_f32_sve(simsimd_f32_t const* a, simsimd_f32_t const* b, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_l2sq_f32_sve(simsimd_f32_t const* a, simsimd_f32_t const* b, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_cos_f32_sve(simsimd_f32_t const* a, simsimd_f32_t const* b, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_l2_f16_sve(simsimd_f16_t const* a, simsimd_f16_t const* b, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_l2sq_f16_sve(simsimd_f16_t const* a, simsimd_f16_t const* b, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_cos_f16_sve(simsimd_f16_t const* a, simsimd_f16_t const* b, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_l2_bf16_sve(simsimd_bf16_t const* a, simsimd_bf16_t const* b, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_l2sq_bf16_sve(simsimd_bf16_t const* a, simsimd_bf16_t const* b, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_cos_bf16_sve(simsimd_bf16_t const* a, simsimd_bf16_t const* b, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_l2_f64_sve(simsimd_f64_t const* a, simsimd_f64_t const* b, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_l2sq_f64_sve(simsimd_f64_t const* a, simsimd_f64_t const* b, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_cos_f64_sve(simsimd_f64_t const* a, simsimd_f64_t const* b, simsimd_size_t n, simsimd_distance_t* result);
/* SIMD-powered backends for AVX2 CPUs of Haswell generation and newer, using 32-bit arithmetic over 256-bit words.
* First demonstrated in 2011, at least one Haswell-based processor was still being sold in 2022 — the Pentium G3420.
* Practically all modern x86 CPUs support AVX2, FMA, and F16C, making it a perfect baseline for SIMD algorithms.
 * On the other hand, there is no need to implement AVX2 versions of `f32` and `f64` functions, as those are
* properly vectorized by recent compilers.
*/
SIMSIMD_PUBLIC void simsimd_l2_i8_haswell(simsimd_i8_t const* a, simsimd_i8_t const* b, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_l2sq_i8_haswell(simsimd_i8_t const* a, simsimd_i8_t const* b, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_cos_i8_haswell(simsimd_i8_t const* a, simsimd_i8_t const* b, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_l2_u8_haswell(simsimd_u8_t const* a, simsimd_u8_t const* b, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_l2sq_u8_haswell(simsimd_u8_t const* a, simsimd_u8_t const* b, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_cos_u8_haswell(simsimd_u8_t const* a, simsimd_u8_t const* b, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_l2_f16_haswell(simsimd_f16_t const* a, simsimd_f16_t const* b, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_l2sq_f16_haswell(simsimd_f16_t const* a, simsimd_f16_t const* b, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_cos_f16_haswell(simsimd_f16_t const* a, simsimd_f16_t const* b, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_l2_bf16_haswell(simsimd_bf16_t const* a, simsimd_bf16_t const* b, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_l2sq_bf16_haswell(simsimd_bf16_t const* a, simsimd_bf16_t const* b, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_cos_bf16_haswell(simsimd_bf16_t const* a, simsimd_bf16_t const* b, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_l2_f32_haswell(simsimd_f32_t const* a, simsimd_f32_t const* b, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_l2sq_f32_haswell(simsimd_f32_t const* a, simsimd_f32_t const* b, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_cos_f32_haswell(simsimd_f32_t const* a, simsimd_f32_t const* b, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_l2_f64_haswell(simsimd_f64_t const* a, simsimd_f64_t const* b, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_l2sq_f64_haswell(simsimd_f64_t const* a, simsimd_f64_t const* b, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_cos_f64_haswell(simsimd_f64_t const* a, simsimd_f64_t const* b, simsimd_size_t n, simsimd_distance_t* result);
/* SIMD-powered backends for AVX512 CPUs of Skylake generation and newer, using 32-bit arithmetic over 512-bit words.
* Skylake was launched in 2015, and discontinued in 2019. Skylake had support for F, CD, VL, DQ, and BW extensions,
* as well as masked operations. This is enough to supersede auto-vectorization on `f32` and `f64` types.
*
* Sadly, we can't effectively interleave different kinds of arithmetic instructions to utilize more ports:
*
* > Like Intel server architectures since Skylake-X, SPR cores feature two 512-bit FMA units, and organize them in a similar fashion.
* > One 512-bit FMA unit is created by fusing two 256-bit ones on port 0 and port 1. The other is added to port 5, as a server-specific
* > core extension. The FMA units on port 0 and 1 are configured into 2×256-bit or 1×512-bit mode depending on whether 512-bit FMA
* > instructions are present in the scheduler. That means a mix of 256-bit and 512-bit FMA instructions will not achieve higher IPC
* > than executing 512-bit instructions alone.
*
* Source: https://chipsandcheese.com/p/a-peek-at-sapphire-rapids
*/
SIMSIMD_PUBLIC void simsimd_l2_f32_skylake(simsimd_f32_t const* a, simsimd_f32_t const* b, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_l2sq_f32_skylake(simsimd_f32_t const* a, simsimd_f32_t const* b, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_cos_f32_skylake(simsimd_f32_t const* a, simsimd_f32_t const* b, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_l2_f64_skylake(simsimd_f64_t const* a, simsimd_f64_t const* b, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_l2sq_f64_skylake(simsimd_f64_t const* a, simsimd_f64_t const* b, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_cos_f64_skylake(simsimd_f64_t const* a, simsimd_f64_t const* b, simsimd_size_t n, simsimd_distance_t* result);
/* SIMD-powered backends for AVX512 CPUs of Ice Lake generation and newer, using mixed arithmetic over 512-bit words.
* Ice Lake added VNNI, VPOPCNTDQ, IFMA, VBMI, VAES, GFNI, VBMI2, BITALG, VPCLMULQDQ, and other extensions for integral operations.
* Sapphire Rapids added tiled matrix operations, but we are most interested in the new mixed-precision FMA instructions.
*/
SIMSIMD_PUBLIC void simsimd_l2_i4x2_ice(simsimd_i4x2_t const* a, simsimd_i4x2_t const* b, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_l2sq_i4x2_ice(simsimd_i4x2_t const* a, simsimd_i4x2_t const* b, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_cos_i4x2_ice(simsimd_i4x2_t const* a, simsimd_i4x2_t const* b, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_l2_i8_ice(simsimd_i8_t const* a, simsimd_i8_t const* b, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_l2sq_i8_ice(simsimd_i8_t const* a, simsimd_i8_t const* b, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_cos_i8_ice(simsimd_i8_t const* a, simsimd_i8_t const* b, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_l2_u8_ice(simsimd_u8_t const* a, simsimd_u8_t const* b, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_l2sq_u8_ice(simsimd_u8_t const* a, simsimd_u8_t const* b, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_cos_u8_ice(simsimd_u8_t const* a, simsimd_u8_t const* b, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_l2_bf16_genoa(simsimd_bf16_t const* a, simsimd_bf16_t const* b, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_l2sq_bf16_genoa(simsimd_bf16_t const* a, simsimd_bf16_t const* b, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_cos_bf16_genoa(simsimd_bf16_t const* a, simsimd_bf16_t const* b, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_l2_f16_sapphire(simsimd_f16_t const* a, simsimd_f16_t const* b, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_l2sq_f16_sapphire(simsimd_f16_t const* a, simsimd_f16_t const* b, simsimd_size_t n, simsimd_distance_t* result);
SIMSIMD_PUBLIC void simsimd_cos_f16_sapphire(simsimd_f16_t const* a, simsimd_f16_t const* b, simsimd_size_t n, simsimd_distance_t* result);
/* SIMD-powered backends for AVX-INT8-VNNI extensions on Xeon 6 CPUs, including Sierra Forest and Granite Rapids.
 * These pack many "efficiency" cores into a single socket, avoiding heavy 512-bit operations and focusing on 256-bit ones.
*/
SIMSIMD_PUBLIC void simsimd_cos_i8_sierra(simsimd_i8_t const* a, simsimd_i8_t const* b, simsimd_size_t n, simsimd_distance_t* result);
// clang-format on
#define SIMSIMD_MAKE_L2SQ(name, input_type, accumulator_type, load_and_convert) \
SIMSIMD_PUBLIC void simsimd_l2sq_##input_type##_##name(simsimd_##input_type##_t const *a, \
simsimd_##input_type##_t const *b, simsimd_size_t n, \
simsimd_distance_t *result) { \
simsimd_##accumulator_type##_t d2 = 0; \
for (simsimd_size_t i = 0; i != n; ++i) { \
simsimd_##accumulator_type##_t ai = load_and_convert(a + i); \
simsimd_##accumulator_type##_t bi = load_and_convert(b + i); \
d2 += (ai - bi) * (ai - bi); \
} \
*result = d2; \
}
#define SIMSIMD_MAKE_L2(name, input_type, accumulator_type, load_and_convert) \
SIMSIMD_PUBLIC void simsimd_l2_##input_type##_##name(simsimd_##input_type##_t const *a, \
simsimd_##input_type##_t const *b, simsimd_size_t n, \
simsimd_distance_t *result) { \
simsimd_l2sq_##input_type##_##name(a, b, n, result); \
*result = SIMSIMD_SQRT(*result); \
}
#define SIMSIMD_MAKE_COS(name, input_type, accumulator_type, load_and_convert) \
SIMSIMD_PUBLIC void simsimd_cos_##input_type##_##name(simsimd_##input_type##_t const *a, \
simsimd_##input_type##_t const *b, simsimd_size_t n, \
simsimd_distance_t *result) { \
simsimd_##accumulator_type##_t ab = 0, a2 = 0, b2 = 0; \
for (simsimd_size_t i = 0; i != n; ++i) { \
simsimd_##accumulator_type##_t ai = load_and_convert(a + i); \
simsimd_##accumulator_type##_t bi = load_and_convert(b + i); \
ab += ai * bi; \
a2 += ai * ai; \
b2 += bi * bi; \
} \
if (a2 == 0 && b2 == 0) { *result = 0; } \
else if (ab == 0) { *result = 1; } \
else { \
simsimd_distance_t unclipped_result = 1 - ab * SIMSIMD_RSQRT(a2) * SIMSIMD_RSQRT(b2); \
*result = unclipped_result > 0 ? unclipped_result : 0; \
} \
}
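// For reference, `SIMSIMD_MAKE_L2SQ(serial, f64, f64, SIMSIMD_DEREFERENCE)` below expands
// roughly into the following function - a sketch of the preprocessor output, assuming
// `SIMSIMD_DEREFERENCE` simply dereferences its pointer argument:
//
//     SIMSIMD_PUBLIC void simsimd_l2sq_f64_serial(simsimd_f64_t const *a, simsimd_f64_t const *b,
//                                                 simsimd_size_t n, simsimd_distance_t *result) {
//         simsimd_f64_t d2 = 0;
//         for (simsimd_size_t i = 0; i != n; ++i) {
//             simsimd_f64_t ai = *(a + i), bi = *(b + i);
//             d2 += (ai - bi) * (ai - bi);
//         }
//         *result = d2;
//     }
//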
SIMSIMD_MAKE_COS(serial, f64, f64, SIMSIMD_DEREFERENCE) // simsimd_cos_f64_serial
SIMSIMD_MAKE_L2SQ(serial, f64, f64, SIMSIMD_DEREFERENCE) // simsimd_l2sq_f64_serial
SIMSIMD_MAKE_L2(serial, f64, f64, SIMSIMD_DEREFERENCE) // simsimd_l2_f64_serial
SIMSIMD_MAKE_COS(serial, f32, f32, SIMSIMD_DEREFERENCE) // simsimd_cos_f32_serial
SIMSIMD_MAKE_L2SQ(serial, f32, f32, SIMSIMD_DEREFERENCE) // simsimd_l2sq_f32_serial
SIMSIMD_MAKE_L2(serial, f32, f32, SIMSIMD_DEREFERENCE) // simsimd_l2_f32_serial
SIMSIMD_MAKE_COS(serial, f16, f32, SIMSIMD_F16_TO_F32) // simsimd_cos_f16_serial
SIMSIMD_MAKE_L2SQ(serial, f16, f32, SIMSIMD_F16_TO_F32) // simsimd_l2sq_f16_serial
SIMSIMD_MAKE_L2(serial, f16, f32, SIMSIMD_F16_TO_F32) // simsimd_l2_f16_serial
SIMSIMD_MAKE_COS(serial, bf16, f32, SIMSIMD_BF16_TO_F32) // simsimd_cos_bf16_serial
SIMSIMD_MAKE_L2SQ(serial, bf16, f32, SIMSIMD_BF16_TO_F32) // simsimd_l2sq_bf16_serial
SIMSIMD_MAKE_L2(serial, bf16, f32, SIMSIMD_BF16_TO_F32) // simsimd_l2_bf16_serial
SIMSIMD_MAKE_COS(serial, i8, i32, SIMSIMD_DEREFERENCE) // simsimd_cos_i8_serial
SIMSIMD_MAKE_L2SQ(serial, i8, i32, SIMSIMD_DEREFERENCE) // simsimd_l2sq_i8_serial
SIMSIMD_MAKE_L2(serial, i8, i32, SIMSIMD_DEREFERENCE) // simsimd_l2_i8_serial
SIMSIMD_MAKE_COS(serial, u8, i32, SIMSIMD_DEREFERENCE) // simsimd_cos_u8_serial
SIMSIMD_MAKE_L2SQ(serial, u8, i32, SIMSIMD_DEREFERENCE) // simsimd_l2sq_u8_serial
SIMSIMD_MAKE_L2(serial, u8, i32, SIMSIMD_DEREFERENCE) // simsimd_l2_u8_serial
SIMSIMD_MAKE_COS(accurate, f32, f64, SIMSIMD_DEREFERENCE) // simsimd_cos_f32_accurate
SIMSIMD_MAKE_L2SQ(accurate, f32, f64, SIMSIMD_DEREFERENCE) // simsimd_l2sq_f32_accurate
SIMSIMD_MAKE_L2(accurate, f32, f64, SIMSIMD_DEREFERENCE) // simsimd_l2_f32_accurate
SIMSIMD_MAKE_COS(accurate, f16, f64, SIMSIMD_F16_TO_F32) // simsimd_cos_f16_accurate
SIMSIMD_MAKE_L2SQ(accurate, f16, f64, SIMSIMD_F16_TO_F32) // simsimd_l2sq_f16_accurate
SIMSIMD_MAKE_L2(accurate, f16, f64, SIMSIMD_F16_TO_F32) // simsimd_l2_f16_accurate
SIMSIMD_MAKE_COS(accurate, bf16, f64, SIMSIMD_BF16_TO_F32) // simsimd_cos_bf16_accurate
SIMSIMD_MAKE_L2SQ(accurate, bf16, f64, SIMSIMD_BF16_TO_F32) // simsimd_l2sq_bf16_accurate
SIMSIMD_MAKE_L2(accurate, bf16, f64, SIMSIMD_BF16_TO_F32) // simsimd_l2_bf16_accurate
#if _SIMSIMD_TARGET_ARM
#if SIMSIMD_TARGET_NEON
#pragma GCC push_options
#pragma GCC target("arch=armv8-a+simd")
#pragma clang attribute push(__attribute__((target("arch=armv8-a+simd"))), apply_to = function)
SIMSIMD_INTERNAL simsimd_f32_t _simsimd_sqrt_f32_neon(simsimd_f32_t x) {
return vget_lane_f32(vsqrt_f32(vdup_n_f32(x)), 0);
}
SIMSIMD_INTERNAL simsimd_f64_t _simsimd_sqrt_f64_neon(simsimd_f64_t x) {
return vget_lane_f64(vsqrt_f64(vdup_n_f64(x)), 0);
}
SIMSIMD_INTERNAL simsimd_distance_t _simsimd_cos_normalize_f32_neon(simsimd_f32_t ab, simsimd_f32_t a2,
simsimd_f32_t b2) {
if (a2 == 0 && b2 == 0) return 0;
if (ab == 0) return 1;
simsimd_f32_t squares_arr[2] = {a2, b2};
float32x2_t squares = vld1_f32(squares_arr);
    // Unlike the x86 manuals, the Arm NEON manuals don't explicitly state the accuracy of their `rsqrt` approximation.
// Third-party research suggests that it's less accurate than SSE instructions, having an error of 1.5*2^-12.
// One or two rounds of Newton-Raphson refinement are recommended to improve the accuracy.
// https://github.com/lighttransport/embree-aarch64/issues/24
// https://github.com/lighttransport/embree-aarch64/blob/3f75f8cb4e553d13dced941b5fefd4c826835a6b/common/math/math.h#L137-L145
float32x2_t rsqrts = vrsqrte_f32(squares);
// Perform two rounds of Newton-Raphson refinement:
// https://en.wikipedia.org/wiki/Newton%27s_method
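    // `vrsqrts_f32(x, y)` returns (3 - x * y) / 2, so each line below performs the update
    // y' = y * (3 - (x * y) * y) / 2, the standard Newton-Raphson step for 1 / sqrt(x).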
rsqrts = vmul_f32(rsqrts, vrsqrts_f32(vmul_f32(squares, rsqrts), rsqrts));
rsqrts = vmul_f32(rsqrts, vrsqrts_f32(vmul_f32(squares, rsqrts), rsqrts));
vst1_f32(squares_arr, rsqrts);
simsimd_distance_t result = 1 - ab * squares_arr[0] * squares_arr[1];
return result > 0 ? result : 0;
}
SIMSIMD_INTERNAL simsimd_distance_t _simsimd_cos_normalize_f64_neon(simsimd_f64_t ab, simsimd_f64_t a2,
simsimd_f64_t b2) {
if (a2 == 0 && b2 == 0) return 0;
if (ab == 0) return 1;
simsimd_f64_t squares_arr[2] = {a2, b2};
float64x2_t squares = vld1q_f64(squares_arr);
    // Unlike the x86 manuals, the Arm NEON manuals don't explicitly state the accuracy of their `rsqrt` approximation.
// Third-party research suggests that it's less accurate than SSE instructions, having an error of 1.5*2^-12.
// One or two rounds of Newton-Raphson refinement are recommended to improve the accuracy.
// https://github.com/lighttransport/embree-aarch64/issues/24
// https://github.com/lighttransport/embree-aarch64/blob/3f75f8cb4e553d13dced941b5fefd4c826835a6b/common/math/math.h#L137-L145
float64x2_t rsqrts = vrsqrteq_f64(squares);
// Perform two rounds of Newton-Raphson refinement:
// https://en.wikipedia.org/wiki/Newton%27s_method
rsqrts = vmulq_f64(rsqrts, vrsqrtsq_f64(vmulq_f64(squares, rsqrts), rsqrts));
rsqrts = vmulq_f64(rsqrts, vrsqrtsq_f64(vmulq_f64(squares, rsqrts), rsqrts));
vst1q_f64(squares_arr, rsqrts);
simsimd_distance_t result = 1 - ab * squares_arr[0] * squares_arr[1];
return result > 0 ? result : 0;
}
SIMSIMD_PUBLIC void simsimd_l2_f32_neon(simsimd_f32_t const *a, simsimd_f32_t const *b, simsimd_size_t n,
simsimd_distance_t *result) {
simsimd_l2sq_f32_neon(a, b, n, result);
*result = _simsimd_sqrt_f64_neon(*result);
}
SIMSIMD_PUBLIC void simsimd_l2sq_f32_neon(simsimd_f32_t const *a, simsimd_f32_t const *b, simsimd_size_t n,
simsimd_distance_t *result) {
float32x4_t sum_vec = vdupq_n_f32(0);
simsimd_size_t i = 0;
for (; i + 4 <= n; i += 4) {
float32x4_t a_vec = vld1q_f32(a + i);
float32x4_t b_vec = vld1q_f32(b + i);
float32x4_t diff_vec = vsubq_f32(a_vec, b_vec);
sum_vec = vfmaq_f32(sum_vec, diff_vec, diff_vec);
}
simsimd_f32_t sum = vaddvq_f32(sum_vec);
for (; i < n; ++i) {
simsimd_f32_t diff = a[i] - b[i];
sum += diff * diff;
}
*result = sum;
}
SIMSIMD_PUBLIC void simsimd_cos_f32_neon(simsimd_f32_t const *a, simsimd_f32_t const *b, simsimd_size_t n,
simsimd_distance_t *result) {
float32x4_t ab_vec = vdupq_n_f32(0), a2_vec = vdupq_n_f32(0), b2_vec = vdupq_n_f32(0);
simsimd_size_t i = 0;
for (; i + 4 <= n; i += 4) {
float32x4_t a_vec = vld1q_f32(a + i);
float32x4_t b_vec = vld1q_f32(b + i);
ab_vec = vfmaq_f32(ab_vec, a_vec, b_vec);
a2_vec = vfmaq_f32(a2_vec, a_vec, a_vec);
b2_vec = vfmaq_f32(b2_vec, b_vec, b_vec);
}
simsimd_f32_t ab = vaddvq_f32(ab_vec), a2 = vaddvq_f32(a2_vec), b2 = vaddvq_f32(b2_vec);
for (; i < n; ++i) {
simsimd_f32_t ai = a[i], bi = b[i];
ab += ai * bi, a2 += ai * ai, b2 += bi * bi;
}
*result = _simsimd_cos_normalize_f64_neon(ab, a2, b2);
}
SIMSIMD_PUBLIC void simsimd_l2_f64_neon(simsimd_f64_t const *a, simsimd_f64_t const *b, simsimd_size_t n,
simsimd_distance_t *result) {
simsimd_l2sq_f64_neon(a, b, n, result);
*result = _simsimd_sqrt_f64_neon(*result);
}
SIMSIMD_PUBLIC void simsimd_l2sq_f64_neon(simsimd_f64_t const *a, simsimd_f64_t const *b, simsimd_size_t n,
simsimd_distance_t *result) {
float64x2_t sum_vec = vdupq_n_f64(0);
simsimd_size_t i = 0;
for (; i + 2 <= n; i += 2) {
float64x2_t a_vec = vld1q_f64(a + i);
float64x2_t b_vec = vld1q_f64(b + i);
float64x2_t diff_vec = vsubq_f64(a_vec, b_vec);
sum_vec = vfmaq_f64(sum_vec, diff_vec, diff_vec);
}
simsimd_f64_t sum = vaddvq_f64(sum_vec);
for (; i < n; ++i) {
simsimd_f64_t diff = a[i] - b[i];
sum += diff * diff;
}
*result = sum;
}
SIMSIMD_PUBLIC void simsimd_cos_f64_neon(simsimd_f64_t const *a, simsimd_f64_t const *b, simsimd_size_t n,
simsimd_distance_t *result) {
float64x2_t ab_vec = vdupq_n_f64(0), a2_vec = vdupq_n_f64(0), b2_vec = vdupq_n_f64(0);
simsimd_size_t i = 0;
for (; i + 2 <= n; i += 2) {
float64x2_t a_vec = vld1q_f64(a + i);
float64x2_t b_vec = vld1q_f64(b + i);
ab_vec = vfmaq_f64(ab_vec, a_vec, b_vec);
a2_vec = vfmaq_f64(a2_vec, a_vec, a_vec);
b2_vec = vfmaq_f64(b2_vec, b_vec, b_vec);
}
simsimd_f64_t ab = vaddvq_f64(ab_vec), a2 = vaddvq_f64(a2_vec), b2 = vaddvq_f64(b2_vec);
for (; i < n; ++i) {
simsimd_f64_t ai = a[i], bi = b[i];
ab += ai * bi, a2 += ai * ai, b2 += bi * bi;
}
*result = _simsimd_cos_normalize_f64_neon(ab, a2, b2);
}
#pragma clang attribute pop
#pragma GCC pop_options
#endif // SIMSIMD_TARGET_NEON
#if SIMSIMD_TARGET_NEON_F16
#pragma GCC push_options
#pragma GCC target("arch=armv8.2-a+simd+fp16")
#pragma clang attribute push(__attribute__((target("arch=armv8.2-a+simd+fp16"))), apply_to = function)
SIMSIMD_PUBLIC void simsimd_l2_f16_neon(simsimd_f16_t const *a, simsimd_f16_t const *b, simsimd_size_t n,
simsimd_distance_t *result) {
simsimd_l2sq_f16_neon(a, b, n, result);
*result = _simsimd_sqrt_f32_neon(*result);
}
SIMSIMD_PUBLIC void simsimd_l2sq_f16_neon(simsimd_f16_t const *a, simsimd_f16_t const *b, simsimd_size_t n,
simsimd_distance_t *result) {
float32x4_t a_vec, b_vec;
float32x4_t sum_vec = vdupq_n_f32(0);
simsimd_l2sq_f16_neon_cycle:
if (n < 4) {
a_vec = vcvt_f32_f16(_simsimd_partial_load_f16x4_neon(a, n));
b_vec = vcvt_f32_f16(_simsimd_partial_load_f16x4_neon(b, n));
n = 0;
}
else {
a_vec = vcvt_f32_f16(vld1_f16((simsimd_f16_for_arm_simd_t const *)a));
b_vec = vcvt_f32_f16(vld1_f16((simsimd_f16_for_arm_simd_t const *)b));
n -= 4, a += 4, b += 4;
}
float32x4_t diff_vec = vsubq_f32(a_vec, b_vec);
sum_vec = vfmaq_f32(sum_vec, diff_vec, diff_vec);
if (n) goto simsimd_l2sq_f16_neon_cycle;
*result = vaddvq_f32(sum_vec);
}
SIMSIMD_PUBLIC void simsimd_cos_f16_neon(simsimd_f16_t const *a, simsimd_f16_t const *b, simsimd_size_t n,
simsimd_distance_t *result) {
float32x4_t ab_vec = vdupq_n_f32(0), a2_vec = vdupq_n_f32(0), b2_vec = vdupq_n_f32(0);
float32x4_t a_vec, b_vec;
simsimd_cos_f16_neon_cycle:
if (n < 4) {
a_vec = vcvt_f32_f16(_simsimd_partial_load_f16x4_neon(a, n));
b_vec = vcvt_f32_f16(_simsimd_partial_load_f16x4_neon(b, n));
n = 0;
}
else {
a_vec = vcvt_f32_f16(vld1_f16((simsimd_f16_for_arm_simd_t const *)a));
b_vec = vcvt_f32_f16(vld1_f16((simsimd_f16_for_arm_simd_t const *)b));
n -= 4, a += 4, b += 4;
}
ab_vec = vfmaq_f32(ab_vec, a_vec, b_vec);
a2_vec = vfmaq_f32(a2_vec, a_vec, a_vec);
b2_vec = vfmaq_f32(b2_vec, b_vec, b_vec);
if (n) goto simsimd_cos_f16_neon_cycle;
simsimd_f32_t ab = vaddvq_f32(ab_vec), a2 = vaddvq_f32(a2_vec), b2 = vaddvq_f32(b2_vec);
*result = _simsimd_cos_normalize_f32_neon(ab, a2, b2);
}
#pragma clang attribute pop
#pragma GCC pop_options
#endif // SIMSIMD_TARGET_NEON_F16
#if SIMSIMD_TARGET_NEON_BF16
#pragma GCC push_options
#pragma GCC target("arch=armv8.6-a+simd+bf16")
#pragma clang attribute push(__attribute__((target("arch=armv8.6-a+simd+bf16"))), apply_to = function)
SIMSIMD_PUBLIC void simsimd_cos_bf16_neon(simsimd_bf16_t const *a, simsimd_bf16_t const *b, simsimd_size_t n,
simsimd_distance_t *result) {
// Similar to `simsimd_cos_i8_neon`, we can use the `BFMMLA` instruction through
// the `vbfmmlaq_f32` intrinsic to compute matrix products and later drop 1/4 of values.
// The only difference is that `zip` isn't provided for `bf16` and we need to reinterpret back
// and forth before zipping. Same as with integers, on modern Arm CPUs, this "smart"
// approach is actually slower by around 25%.
//
// float32x4_t products_low_vec = vdupq_n_f32(0.0f);
// float32x4_t products_high_vec = vdupq_n_f32(0.0f);
// for (; i + 8 <= n; i += 8) {
// bfloat16x8_t a_vec = vld1q_bf16((simsimd_bf16_for_arm_simd_t const*)a + i);
// bfloat16x8_t b_vec = vld1q_bf16((simsimd_bf16_for_arm_simd_t const*)b + i);
// int16x8_t a_vec_s16 = vreinterpretq_s16_bf16(a_vec);
// int16x8_t b_vec_s16 = vreinterpretq_s16_bf16(b_vec);
// int16x8x2_t y_w_vecs_s16 = vzipq_s16(a_vec_s16, b_vec_s16);
// bfloat16x8_t y_vec = vreinterpretq_bf16_s16(y_w_vecs_s16.val[0]);
// bfloat16x8_t w_vec = vreinterpretq_bf16_s16(y_w_vecs_s16.val[1]);
// bfloat16x4_t a_low = vget_low_bf16(a_vec);
// bfloat16x4_t b_low = vget_low_bf16(b_vec);
// bfloat16x4_t a_high = vget_high_bf16(a_vec);
// bfloat16x4_t b_high = vget_high_bf16(b_vec);
// bfloat16x8_t x_vec = vcombine_bf16(a_low, b_low);
// bfloat16x8_t v_vec = vcombine_bf16(a_high, b_high);
// products_low_vec = vbfmmlaq_f32(products_low_vec, x_vec, y_vec);
// products_high_vec = vbfmmlaq_f32(products_high_vec, v_vec, w_vec);
// }
// float32x4_t products_vec = vaddq_f32(products_high_vec, products_low_vec);
// simsimd_f32_t a2 = products_vec[0], ab = products_vec[1], b2 = products_vec[3];
//
// Another way of accomplishing the same thing is to process the odd and even elements separately,
// using special `vbfmlaltq_f32` and `vbfmlalbq_f32` intrinsics:
//
// ab_high_vec = vbfmlaltq_f32(ab_high_vec, a_vec, b_vec);
// ab_low_vec = vbfmlalbq_f32(ab_low_vec, a_vec, b_vec);
// a2_high_vec = vbfmlaltq_f32(a2_high_vec, a_vec, a_vec);
// a2_low_vec = vbfmlalbq_f32(a2_low_vec, a_vec, a_vec);
// b2_high_vec = vbfmlaltq_f32(b2_high_vec, b_vec, b_vec);
// b2_low_vec = vbfmlalbq_f32(b2_low_vec, b_vec, b_vec);
//
float32x4_t ab_vec = vdupq_n_f32(0);
float32x4_t a2_vec = vdupq_n_f32(0);
float32x4_t b2_vec = vdupq_n_f32(0);
bfloat16x8_t a_vec, b_vec;
simsimd_cos_bf16_neon_cycle:
if (n < 8) {
a_vec = _simsimd_partial_load_bf16x8_neon(a, n);
b_vec = _simsimd_partial_load_bf16x8_neon(b, n);
n = 0;
}
else {
a_vec = vld1q_bf16((simsimd_bf16_for_arm_simd_t const *)a);
b_vec = vld1q_bf16((simsimd_bf16_for_arm_simd_t const *)b);
n -= 8, a += 8, b += 8;
}
ab_vec = vbfdotq_f32(ab_vec, a_vec, b_vec);
a2_vec = vbfdotq_f32(a2_vec, a_vec, a_vec);
b2_vec = vbfdotq_f32(b2_vec, b_vec, b_vec);
if (n) goto simsimd_cos_bf16_neon_cycle;
// Avoid `simsimd_approximate_inverse_square_root` on Arm NEON
simsimd_f32_t ab = vaddvq_f32(ab_vec), a2 = vaddvq_f32(a2_vec), b2 = vaddvq_f32(b2_vec);
*result = _simsimd_cos_normalize_f32_neon(ab, a2, b2);
}
SIMSIMD_PUBLIC void simsimd_l2_bf16_neon(simsimd_bf16_t const *a, simsimd_bf16_t const *b, simsimd_size_t n,
simsimd_distance_t *result) {
simsimd_l2sq_bf16_neon(a, b, n, result);
*result = _simsimd_sqrt_f64_neon(*result);
}
SIMSIMD_PUBLIC void simsimd_l2sq_bf16_neon(simsimd_bf16_t const *a, simsimd_bf16_t const *b, simsimd_size_t n,
simsimd_distance_t *result) {
float32x4_t diff_high_vec, diff_low_vec;
float32x4_t sum_high_vec = vdupq_n_f32(0), sum_low_vec = vdupq_n_f32(0);
simsimd_l2sq_bf16_neon_cycle:
if (n < 8) {
bfloat16x8_t a_vec = _simsimd_partial_load_bf16x8_neon(a, n);
bfloat16x8_t b_vec = _simsimd_partial_load_bf16x8_neon(b, n);
diff_high_vec = vsubq_f32(vcvt_f32_bf16(vget_high_bf16(a_vec)), vcvt_f32_bf16(vget_high_bf16(b_vec)));
diff_low_vec = vsubq_f32(vcvt_f32_bf16(vget_low_bf16(a_vec)), vcvt_f32_bf16(vget_low_bf16(b_vec)));
n = 0;
}
else {
bfloat16x8_t a_vec = vld1q_bf16((simsimd_bf16_for_arm_simd_t const *)a);
bfloat16x8_t b_vec = vld1q_bf16((simsimd_bf16_for_arm_simd_t const *)b);
diff_high_vec = vsubq_f32(vcvt_f32_bf16(vget_high_bf16(a_vec)), vcvt_f32_bf16(vget_high_bf16(b_vec)));
diff_low_vec = vsubq_f32(vcvt_f32_bf16(vget_low_bf16(a_vec)), vcvt_f32_bf16(vget_low_bf16(b_vec)));
n -= 8, a += 8, b += 8;
}
sum_high_vec = vfmaq_f32(sum_high_vec, diff_high_vec, diff_high_vec);
sum_low_vec = vfmaq_f32(sum_low_vec, diff_low_vec, diff_low_vec);
if (n) goto simsimd_l2sq_bf16_neon_cycle;
*result = vaddvq_f32(vaddq_f32(sum_high_vec, sum_low_vec));
}
#pragma clang attribute pop
#pragma GCC pop_options
#endif // SIMSIMD_TARGET_NEON_BF16
#if SIMSIMD_TARGET_NEON_I8
#pragma GCC push_options
#pragma GCC target("arch=armv8.2-a+dotprod+i8mm")
#pragma clang attribute push(__attribute__((target("arch=armv8.2-a+dotprod+i8mm"))), apply_to = function)
SIMSIMD_PUBLIC void simsimd_l2_i8_neon(simsimd_i8_t const *a, simsimd_i8_t const *b, simsimd_size_t n,
simsimd_distance_t *result) {
simsimd_l2sq_i8_neon(a, b, n, result);
*result = _simsimd_sqrt_f32_neon(*result);
}
SIMSIMD_PUBLIC void simsimd_l2sq_i8_neon(simsimd_i8_t const *a, simsimd_i8_t const *b, simsimd_size_t n,
simsimd_distance_t *result) {
// The naive approach is to upcast 8-bit signed integers into 16-bit signed integers
// for subtraction, then multiply within 16-bit integers and accumulate the results
// into 32-bit integers. This approach is slow on modern Arm CPUs. On Graviton 4,
// that approach results in 17 GB/s of throughput, compared to 39 GB/s for `i8`
// dot-products.
//
    // Luckily we can use `vabdq_s8`, which nominally returns `i8` values, but the absolute
    // difference of two signed 8-bit integers always fits into 8 unsigned bits, e.g. |(-128) - 127| = 255,
    // so a simple reinterpret-cast to `u8` suffices. That approach boosts us to 33 GB/s of throughput.
uint32x4_t d2_vec = vdupq_n_u32(0);
simsimd_size_t i = 0;
for (; i + 16 <= n; i += 16) {
int8x16_t a_vec = vld1q_s8(a + i);
int8x16_t b_vec = vld1q_s8(b + i);
uint8x16_t d_vec = vreinterpretq_u8_s8(vabdq_s8(a_vec, b_vec));
d2_vec = vdotq_u32(d2_vec, d_vec, d_vec);
}
simsimd_u32_t d2 = vaddvq_u32(d2_vec);
for (; i < n; ++i) {
        simsimd_i32_t diff = (simsimd_i32_t)a[i] - b[i];
        d2 += (simsimd_u32_t)(diff * diff);
}
*result = d2;
}
SIMSIMD_PUBLIC void simsimd_cos_i8_neon(simsimd_i8_t const *a, simsimd_i8_t const *b, simsimd_size_t n,
simsimd_distance_t *result) {
simsimd_size_t i = 0;
// Variant 1.
    // If the 128-bit `vdotq_s32` intrinsic is unavailable, we can fall back to explicit widening arithmetic:
//
// int32x4_t ab_vec = vdupq_n_s32(0);
// int32x4_t a2_vec = vdupq_n_s32(0);
// int32x4_t b2_vec = vdupq_n_s32(0);
// for (simsimd_size_t i = 0; i != n; i += 8) {
// int16x8_t a_vec = vmovl_s8(vld1_s8(a + i));
// int16x8_t b_vec = vmovl_s8(vld1_s8(b + i));
// int16x8_t ab_part_vec = vmulq_s16(a_vec, b_vec);
// int16x8_t a2_part_vec = vmulq_s16(a_vec, a_vec);
// int16x8_t b2_part_vec = vmulq_s16(b_vec, b_vec);
// ab_vec = vaddq_s32(ab_vec, vaddq_s32(vmovl_s16(vget_high_s16(ab_part_vec)), //
// vmovl_s16(vget_low_s16(ab_part_vec))));
// a2_vec = vaddq_s32(a2_vec, vaddq_s32(vmovl_s16(vget_high_s16(a2_part_vec)), //
// vmovl_s16(vget_low_s16(a2_part_vec))));
// b2_vec = vaddq_s32(b2_vec, vaddq_s32(vmovl_s16(vget_high_s16(b2_part_vec)), //
// vmovl_s16(vget_low_s16(b2_part_vec))));
// }
//
// Variant 2.
// With the 128-bit `vdotq_s32` intrinsic, we can use the following code:
//
// for (; i + 16 <= n; i += 16) {
// int8x16_t a_vec = vld1q_s8(a + i);
// int8x16_t b_vec = vld1q_s8(b + i);
// ab_vec = vdotq_s32(ab_vec, a_vec, b_vec);
// a2_vec = vdotq_s32(a2_vec, a_vec, a_vec);
// b2_vec = vdotq_s32(b2_vec, b_vec, b_vec);
// }
//
// Variant 3.
// To use MMLA instructions, we need to reorganize the contents of the vectors.
// On input we have `a_vec` and `b_vec`:
//
// a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7], a[8], a[9], a[10], a[11], a[12], a[13], a[14], a[15]
// b[0], b[1], b[2], b[3], b[4], b[5], b[6], b[7], b[8], b[9], b[10], b[11], b[12], b[13], b[14], b[15]
//
// We will be multiplying matrices of size 2x8 and 8x2. So we need to perform a few shuffles:
//
// X =
// a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7],
// b[0], b[1], b[2], b[3], b[4], b[5], b[6], b[7]
// Y =
// a[0], b[0],
// a[1], b[1],
// a[2], b[2],
// a[3], b[3],
// a[4], b[4],
// a[5], b[5],
// a[6], b[6],
// a[7], b[7]
//
// V =
// a[8], a[9], a[10], a[11], a[12], a[13], a[14], a[15],
// b[8], b[9], b[10], b[11], b[12], b[13], b[14], b[15]
// W =
// a[8], b[8],
// a[9], b[9],
// a[10], b[10],
// a[11], b[11],
// a[12], b[12],
// a[13], b[13],
// a[14], b[14],
// a[15], b[15]
//
// Performing matrix multiplications we can aggregate into a matrix `products_low_vec` and `products_high_vec`:
//
// X * X, X * Y V * W, V * V
// Y * X, Y * Y W * W, W * V
//
// Of those values we need only 3/4, as the (X * Y) and (Y * X) are the same.
//
// int32x4_t products_low_vec = vdupq_n_s32(0), products_high_vec = vdupq_n_s32(0);
// int8x16_t a_low_b_low_vec, a_high_b_high_vec;
// for (; i + 16 <= n; i += 16) {
// int8x16_t a_vec = vld1q_s8(a + i);
// int8x16_t b_vec = vld1q_s8(b + i);
// int8x16x2_t y_w_vecs = vzipq_s8(a_vec, b_vec);
// int8x16_t x_vec = vcombine_s8(vget_low_s8(a_vec), vget_low_s8(b_vec));
// int8x16_t v_vec = vcombine_s8(vget_high_s8(a_vec), vget_high_s8(b_vec));
// products_low_vec = vmmlaq_s32(products_low_vec, x_vec, y_w_vecs.val[0]);
// products_high_vec = vmmlaq_s32(products_high_vec, v_vec, y_w_vecs.val[1]);
// }
// int32x4_t products_vec = vaddq_s32(products_high_vec, products_low_vec);
// simsimd_i32_t a2 = products_vec[0];
// simsimd_i32_t ab = products_vec[1];
// simsimd_i32_t b2 = products_vec[3];
//
// That solution is elegant, but it requires the additional `+i8mm` extension and is currently slower,
// at least on AWS Graviton 3.
int32x4_t ab_vec = vdupq_n_s32(0);
int32x4_t a2_vec = vdupq_n_s32(0);
int32x4_t b2_vec = vdupq_n_s32(0);
for (; i + 16 <= n; i += 16) {
int8x16_t a_vec = vld1q_s8(a + i);
int8x16_t b_vec = vld1q_s8(b + i);
ab_vec = vdotq_s32(ab_vec, a_vec, b_vec);
a2_vec = vdotq_s32(a2_vec, a_vec, a_vec);
b2_vec = vdotq_s32(b2_vec, b_vec, b_vec);
}
simsimd_i32_t ab = vaddvq_s32(ab_vec);
simsimd_i32_t a2 = vaddvq_s32(a2_vec);
simsimd_i32_t b2 = vaddvq_s32(b2_vec);
// Take care of the tail:
for (; i < n; ++i) {
simsimd_i32_t ai = a[i], bi = b[i];
ab += ai * bi, a2 += ai * ai, b2 += bi * bi;
}
*result = _simsimd_cos_normalize_f32_neon(ab, a2, b2);
}
SIMSIMD_PUBLIC void simsimd_l2_u8_neon(simsimd_u8_t const *a, simsimd_u8_t const *b, simsimd_size_t n,
simsimd_distance_t *result) {
simsimd_l2sq_u8_neon(a, b, n, result);
*result = _simsimd_sqrt_f32_neon(*result);
}
SIMSIMD_PUBLIC void simsimd_l2sq_u8_neon(simsimd_u8_t const *a, simsimd_u8_t const *b, simsimd_size_t n,
simsimd_distance_t *result) {
uint32x4_t d2_vec = vdupq_n_u32(0);
simsimd_size_t i = 0;
for (; i + 16 <= n; i += 16) {
uint8x16_t a_vec = vld1q_u8(a + i);
uint8x16_t b_vec = vld1q_u8(b + i);
uint8x16_t d_vec = vabdq_u8(a_vec, b_vec);
d2_vec = vdotq_u32(d2_vec, d_vec, d_vec);
}
simsimd_u32_t d2 = vaddvq_u32(d2_vec);
for (; i < n; ++i) {
        simsimd_i32_t diff = (simsimd_i32_t)a[i] - b[i];
        d2 += (simsimd_u32_t)(diff * diff);
}
*result = d2;
}
SIMSIMD_PUBLIC void simsimd_cos_u8_neon(simsimd_u8_t const *a, simsimd_u8_t const *b, simsimd_size_t n,
simsimd_distance_t *result) {
simsimd_size_t i = 0;
uint32x4_t ab_vec = vdupq_n_u32(0);
uint32x4_t a2_vec = vdupq_n_u32(0);
uint32x4_t b2_vec = vdupq_n_u32(0);
for (; i + 16 <= n; i += 16) {
uint8x16_t a_vec = vld1q_u8(a + i);
uint8x16_t b_vec = vld1q_u8(b + i);
ab_vec = vdotq_u32(ab_vec, a_vec, b_vec);
a2_vec = vdotq_u32(a2_vec, a_vec, a_vec);
b2_vec = vdotq_u32(b2_vec, b_vec, b_vec);
}
simsimd_u32_t ab = vaddvq_u32(ab_vec);
simsimd_u32_t a2 = vaddvq_u32(a2_vec);
simsimd_u32_t b2 = vaddvq_u32(b2_vec);
// Take care of the tail:
for (; i < n; ++i) {
simsimd_u32_t ai = a[i], bi = b[i];
ab += ai * bi, a2 += ai * ai, b2 += bi * bi;
}
*result = _simsimd_cos_normalize_f32_neon(ab, a2, b2);
}
#pragma clang attribute pop
#pragma GCC pop_options
#endif // SIMSIMD_TARGET_NEON_I8
#if SIMSIMD_TARGET_SVE
#pragma GCC push_options
#pragma GCC target("arch=armv8.2-a+sve")
#pragma clang attribute push(__attribute__((target("arch=armv8.2-a+sve"))), apply_to = function)
SIMSIMD_PUBLIC void simsimd_l2_f32_sve(simsimd_f32_t const *a, simsimd_f32_t const *b, simsimd_size_t n,
simsimd_distance_t *result) {
simsimd_l2sq_f32_sve(a, b, n, result);
*result = _simsimd_sqrt_f64_neon(*result);
}
SIMSIMD_PUBLIC void simsimd_l2sq_f32_sve(simsimd_f32_t const *a, simsimd_f32_t const *b, simsimd_size_t n,
simsimd_distance_t *result) {
simsimd_size_t i = 0;
svfloat32_t d2_vec = svdupq_n_f32(0.f, 0.f, 0.f, 0.f);
do {
svbool_t pg_vec = svwhilelt_b32((unsigned int)i, (unsigned int)n);
svfloat32_t a_vec = svld1_f32(pg_vec, a + i);
svfloat32_t b_vec = svld1_f32(pg_vec, b + i);
svfloat32_t a_minus_b_vec = svsub_f32_x(pg_vec, a_vec, b_vec);
d2_vec = svmla_f32_x(pg_vec, d2_vec, a_minus_b_vec, a_minus_b_vec);
i += svcntw();
} while (i < n);
simsimd_f32_t d2 = svaddv_f32(svptrue_b32(), d2_vec);
*result = d2;
}
SIMSIMD_PUBLIC void simsimd_cos_f32_sve(simsimd_f32_t const *a, simsimd_f32_t const *b, simsimd_size_t n,
simsimd_distance_t *result) {
simsimd_size_t i = 0;
svfloat32_t ab_vec = svdupq_n_f32(0.f, 0.f, 0.f, 0.f);
svfloat32_t a2_vec = svdupq_n_f32(0.f, 0.f, 0.f, 0.f);
svfloat32_t b2_vec = svdupq_n_f32(0.f, 0.f, 0.f, 0.f);
do {
svbool_t pg_vec = svwhilelt_b32((unsigned int)i, (unsigned int)n);
svfloat32_t a_vec = svld1_f32(pg_vec, a + i);
svfloat32_t b_vec = svld1_f32(pg_vec, b + i);
ab_vec = svmla_f32_x(pg_vec, ab_vec, a_vec, b_vec);
a2_vec = svmla_f32_x(pg_vec, a2_vec, a_vec, a_vec);
b2_vec = svmla_f32_x(pg_vec, b2_vec, b_vec, b_vec);
i += svcntw();
} while (i < n);
simsimd_f32_t ab = svaddv_f32(svptrue_b32(), ab_vec);
simsimd_f32_t a2 = svaddv_f32(svptrue_b32(), a2_vec);
simsimd_f32_t b2 = svaddv_f32(svptrue_b32(), b2_vec);
*result = _simsimd_cos_normalize_f64_neon(ab, a2, b2);
}
SIMSIMD_PUBLIC void simsimd_l2_f64_sve(simsimd_f64_t const *a, simsimd_f64_t const *b, simsimd_size_t n,
simsimd_distance_t *result) {
simsimd_l2sq_f64_sve(a, b, n, result);
*result = _simsimd_sqrt_f64_neon(*result);
}
SIMSIMD_PUBLIC void simsimd_l2sq_f64_sve(simsimd_f64_t const *a, simsimd_f64_t const *b, simsimd_size_t n,
simsimd_distance_t *result) {
simsimd_size_t i = 0;
svfloat64_t d2_vec = svdupq_n_f64(0.0, 0.0);
do {
svbool_t pg_vec = svwhilelt_b64((unsigned int)i, (unsigned int)n);
svfloat64_t a_vec = svld1_f64(pg_vec, a + i);
svfloat64_t b_vec = svld1_f64(pg_vec, b + i);
svfloat64_t a_minus_b_vec = svsub_f64_x(pg_vec, a_vec, b_vec);
d2_vec = svmla_f64_x(pg_vec, d2_vec, a_minus_b_vec, a_minus_b_vec);
i += svcntd();
} while (i < n);
simsimd_f64_t d2 = svaddv_f64(svptrue_b32(), d2_vec);
*result = d2;
}
SIMSIMD_PUBLIC void simsimd_cos_f64_sve(simsimd_f64_t const *a, simsimd_f64_t const *b, simsimd_size_t n,
simsimd_distance_t *result) {
simsimd_size_t i = 0;
svfloat64_t ab_vec = svdupq_n_f64(0.0, 0.0);
svfloat64_t a2_vec = svdupq_n_f64(0.0, 0.0);
svfloat64_t b2_vec = svdupq_n_f64(0.0, 0.0);
do {
svbool_t pg_vec = svwhilelt_b64((unsigned int)i, (unsigned int)n);
svfloat64_t a_vec = svld1_f64(pg_vec, a + i);
svfloat64_t b_vec = svld1_f64(pg_vec, b + i);
ab_vec = svmla_f64_x(pg_vec, ab_vec, a_vec, b_vec);
a2_vec = svmla_f64_x(pg_vec, a2_vec, a_vec, a_vec);
b2_vec = svmla_f64_x(pg_vec, b2_vec, b_vec, b_vec);
i += svcntd();
} while (i < n);
simsimd_f64_t ab = svaddv_f64(svptrue_b32(), ab_vec);
simsimd_f64_t a2 = svaddv_f64(svptrue_b32(), a2_vec);
simsimd_f64_t b2 = svaddv_f64(svptrue_b32(), b2_vec);
*result = _simsimd_cos_normalize_f64_neon(ab, a2, b2);
}
#pragma clang attribute pop
#pragma GCC pop_options
#endif // SIMSIMD_TARGET_SVE
#if SIMSIMD_TARGET_SVE_F16
#pragma GCC push_options
#pragma GCC target("arch=armv8.2-a+sve+fp16")
#pragma clang attribute push(__attribute__((target("arch=armv8.2-a+sve+fp16"))), apply_to = function)
SIMSIMD_PUBLIC void simsimd_l2_f16_sve(simsimd_f16_t const *a, simsimd_f16_t const *b, simsimd_size_t n,
simsimd_distance_t *result) {
simsimd_l2sq_f16_sve(a, b, n, result);
*result = _simsimd_sqrt_f32_neon(*result);
}
SIMSIMD_PUBLIC void simsimd_l2sq_f16_sve(simsimd_f16_t const *a_enum, simsimd_f16_t const *b_enum, simsimd_size_t n,
simsimd_distance_t *result) {
simsimd_size_t i = 0;
svfloat16_t d2_vec = svdupq_n_f16(0, 0, 0, 0, 0, 0, 0, 0);
simsimd_f16_for_arm_simd_t const *a = (simsimd_f16_for_arm_simd_t const *)(a_enum);
simsimd_f16_for_arm_simd_t const *b = (simsimd_f16_for_arm_simd_t const *)(b_enum);
do {
svbool_t pg_vec = svwhilelt_b16((unsigned int)i, (unsigned int)n);
svfloat16_t a_vec = svld1_f16(pg_vec, a + i);
svfloat16_t b_vec = svld1_f16(pg_vec, b + i);
svfloat16_t a_minus_b_vec = svsub_f16_x(pg_vec, a_vec, b_vec);
d2_vec = svmla_f16_x(pg_vec, d2_vec, a_minus_b_vec, a_minus_b_vec);
i += svcnth();
} while (i < n);
simsimd_f16_for_arm_simd_t d2_f16 = svaddv_f16(svptrue_b16(), d2_vec);
*result = d2_f16;
}
SIMSIMD_PUBLIC void simsimd_cos_f16_sve(simsimd_f16_t const *a_enum, simsimd_f16_t const *b_enum, simsimd_size_t n,
simsimd_distance_t *result) {
simsimd_size_t i = 0;
svfloat16_t ab_vec = svdupq_n_f16(0, 0, 0, 0, 0, 0, 0, 0);
svfloat16_t a2_vec = svdupq_n_f16(0, 0, 0, 0, 0, 0, 0, 0);
svfloat16_t b2_vec = svdupq_n_f16(0, 0, 0, 0, 0, 0, 0, 0);
simsimd_f16_for_arm_simd_t const *a = (simsimd_f16_for_arm_simd_t const *)(a_enum);
simsimd_f16_for_arm_simd_t const *b = (simsimd_f16_for_arm_simd_t const *)(b_enum);
do {
svbool_t pg_vec = svwhilelt_b16((unsigned int)i, (unsigned int)n);
svfloat16_t a_vec = svld1_f16(pg_vec, a + i);
svfloat16_t b_vec = svld1_f16(pg_vec, b + i);
ab_vec = svmla_f16_x(pg_vec, ab_vec, a_vec, b_vec);
a2_vec = svmla_f16_x(pg_vec, a2_vec, a_vec, a_vec);
b2_vec = svmla_f16_x(pg_vec, b2_vec, b_vec, b_vec);
i += svcnth();
} while (i < n);
simsimd_f16_for_arm_simd_t ab = svaddv_f16(svptrue_b16(), ab_vec);
simsimd_f16_for_arm_simd_t a2 = svaddv_f16(svptrue_b16(), a2_vec);
simsimd_f16_for_arm_simd_t b2 = svaddv_f16(svptrue_b16(), b2_vec);
*result = _simsimd_cos_normalize_f32_neon(ab, a2, b2);
}
#pragma clang attribute pop
#pragma GCC pop_options
#endif // SIMSIMD_TARGET_SVE_F16
#if SIMSIMD_TARGET_SVE_BF16
#pragma GCC push_options
#pragma GCC target("arch=armv8.2-a+sve+bf16")
#pragma clang attribute push(__attribute__((target("arch=armv8.2-a+sve+bf16"))), apply_to = function)
SIMSIMD_PUBLIC void simsimd_l2_bf16_sve(simsimd_bf16_t const *a, simsimd_bf16_t const *b, simsimd_size_t n,
simsimd_distance_t *result) {
simsimd_l2sq_bf16_sve(a, b, n, result);
*result = _simsimd_sqrt_f32_neon(*result);
}
SIMSIMD_PUBLIC void simsimd_l2sq_bf16_sve(simsimd_bf16_t const *a_enum, simsimd_bf16_t const *b_enum, simsimd_size_t n,
simsimd_distance_t *result) {
simsimd_size_t i = 0;
svfloat32_t d2_low_vec = svdupq_n_f32(0.f, 0.f, 0.f, 0.f);
svfloat32_t d2_high_vec = svdupq_n_f32(0.f, 0.f, 0.f, 0.f);
simsimd_u16_t const *a = (simsimd_u16_t const *)(a_enum);
simsimd_u16_t const *b = (simsimd_u16_t const *)(b_enum);
do {
svbool_t pg_vec = svwhilelt_b16((unsigned int)i, (unsigned int)n);
svuint16_t a_vec = svld1_u16(pg_vec, a + i);
svuint16_t b_vec = svld1_u16(pg_vec, b + i);
        // There is no `bf16` subtraction in SVE, so we upcast to `f32`: unpack the 16-bit words to `u32`
        // and shift left by 16, as a `bf16` is the top half of the matching `f32` bit pattern.
svbool_t pg_low_vec = svwhilelt_b32((unsigned int)(i), (unsigned int)n);
svbool_t pg_high_vec = svwhilelt_b32((unsigned int)(i + svcnth() / 2), (unsigned int)n);
svfloat32_t a_low_vec = svreinterpret_f32_u32(svlsl_n_u32_x(pg_low_vec, svunpklo_u32(a_vec), 16));
svfloat32_t a_high_vec = svreinterpret_f32_u32(svlsl_n_u32_x(pg_high_vec, svunpkhi_u32(a_vec), 16));
svfloat32_t b_low_vec = svreinterpret_f32_u32(svlsl_n_u32_x(pg_low_vec, svunpklo_u32(b_vec), 16));
svfloat32_t b_high_vec = svreinterpret_f32_u32(svlsl_n_u32_x(pg_high_vec, svunpkhi_u32(b_vec), 16));
svfloat32_t a_minus_b_low_vec = svsub_f32_x(pg_low_vec, a_low_vec, b_low_vec);
svfloat32_t a_minus_b_high_vec = svsub_f32_x(pg_high_vec, a_high_vec, b_high_vec);
d2_low_vec = svmla_f32_x(pg_vec, d2_low_vec, a_minus_b_low_vec, a_minus_b_low_vec);
d2_high_vec = svmla_f32_x(pg_vec, d2_high_vec, a_minus_b_high_vec, a_minus_b_high_vec);
i += svcnth();
} while (i < n);
simsimd_f32_t d2 = svaddv_f32(svptrue_b32(), d2_low_vec) + svaddv_f32(svptrue_b32(), d2_high_vec);
*result = d2;
}
SIMSIMD_PUBLIC void simsimd_cos_bf16_sve(simsimd_bf16_t const *a_enum, simsimd_bf16_t const *b_enum, simsimd_size_t n,
simsimd_distance_t *result) {
simsimd_size_t i = 0;
svfloat32_t ab_vec = svdupq_n_f32(0.f, 0.f, 0.f, 0.f);
svfloat32_t a2_vec = svdupq_n_f32(0.f, 0.f, 0.f, 0.f);
svfloat32_t b2_vec = svdupq_n_f32(0.f, 0.f, 0.f, 0.f);
simsimd_bf16_for_arm_simd_t const *a = (simsimd_bf16_for_arm_simd_t const *)(a_enum);
simsimd_bf16_for_arm_simd_t const *b = (simsimd_bf16_for_arm_simd_t const *)(b_enum);
do {
svbool_t pg_vec = svwhilelt_b16((unsigned int)i, (unsigned int)n);
svbfloat16_t a_vec = svld1_bf16(pg_vec, a + i);
svbfloat16_t b_vec = svld1_bf16(pg_vec, b + i);
ab_vec = svbfdot_f32(ab_vec, a_vec, b_vec);
a2_vec = svbfdot_f32(a2_vec, a_vec, a_vec);
b2_vec = svbfdot_f32(b2_vec, b_vec, b_vec);
i += svcnth();
} while (i < n);
simsimd_f32_t ab = svaddv_f32(svptrue_b32(), ab_vec);
simsimd_f32_t a2 = svaddv_f32(svptrue_b32(), a2_vec);
simsimd_f32_t b2 = svaddv_f32(svptrue_b32(), b2_vec);
*result = _simsimd_cos_normalize_f32_neon(ab, a2, b2);
}
#pragma clang attribute pop
#pragma GCC pop_options
#endif // SIMSIMD_TARGET_SVE_BF16
#endif // _SIMSIMD_TARGET_ARM
#if _SIMSIMD_TARGET_X86
#if SIMSIMD_TARGET_HASWELL
#pragma GCC push_options
#pragma GCC target("avx2")
#pragma clang attribute push(__attribute__((target("avx2"))), apply_to = function)
SIMSIMD_INTERNAL simsimd_f32_t _simsimd_sqrt_f32_haswell(simsimd_f32_t x) {
return _mm_cvtss_f32(_mm_sqrt_ps(_mm_set_ss(x)));
}
SIMSIMD_INTERNAL simsimd_f64_t _simsimd_sqrt_f64_haswell(simsimd_f64_t x) {
return _mm_cvtsd_f64(_mm_sqrt_pd(_mm_set_sd(x)));
}
SIMSIMD_INTERNAL simsimd_distance_t _simsimd_cos_normalize_f64_haswell(simsimd_f64_t ab, simsimd_f64_t a2,
simsimd_f64_t b2) {
// If both vectors have magnitude 0, the distance is 0.
if (a2 == 0 && b2 == 0) return 0;
    // If the dot-product is zero, either because one of the vectors is all zeros
    // (making the division ill-formed) or because the vectors are orthogonal, the distance is 1.
else if (ab == 0)
return 1;
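    // In scalar terms, this helper approximates: distance = 1 - ab / (sqrt(a2) * sqrt(b2)).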
// We want to avoid the `simsimd_approximate_inverse_square_root` due to high latency:
// https://web.archive.org/web/20210208132927/http://assemblyrequired.crashworks.org/timing-square-root/
// The latency of the native instruction is 4 cycles and it's broadly supported.
// For single-precision floats it has a maximum relative error of 1.5*2^-12.
// Higher precision isn't implemented on older CPUs. See `_simsimd_cos_normalize_f64_skylake` for that.
__m128d squares = _mm_set_pd(a2, b2);
__m128d rsqrts = _mm_cvtps_pd(_mm_rsqrt_ps(_mm_cvtpd_ps(squares)));
// Newton-Raphson iteration for reciprocal square root:
// https://en.wikipedia.org/wiki/Newton%27s_method
rsqrts = _mm_add_pd( //
_mm_mul_pd(_mm_set1_pd(1.5), rsqrts),
_mm_mul_pd(_mm_mul_pd(_mm_mul_pd(squares, _mm_set1_pd(-0.5)), rsqrts), _mm_mul_pd(rsqrts, rsqrts)));
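    // The expression above is the Newton-Raphson step y' = y * (1.5 - 0.5 * x * y * y),
    // distributed as y' = 1.5 * y - 0.5 * x * y^3; the same formula is spelled out in
    // `_simsimd_cos_normalize_f32_haswell` below.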
simsimd_f64_t a2_reciprocal = _mm_cvtsd_f64(_mm_unpackhi_pd(rsqrts, rsqrts));
simsimd_f64_t b2_reciprocal = _mm_cvtsd_f64(rsqrts);
simsimd_distance_t result = 1 - ab * a2_reciprocal * b2_reciprocal;
return result > 0 ? result : 0;
}
SIMSIMD_INTERNAL simsimd_distance_t _simsimd_cos_normalize_f32_haswell(simsimd_f32_t ab, simsimd_f32_t a2,
simsimd_f32_t b2) {
// If both vectors have magnitude 0, the distance is 0.
if (a2 == 0.0f && b2 == 0.0f) return 0.0f;
    // If the dot-product is zero, either because one of the vectors is all zeros
    // (making the division ill-formed) or because the vectors are orthogonal, the distance is 1.
else if (ab == 0.0f)
return 1.0f;
// Load the squares into an __m128 register for single-precision floating-point operations
__m128 squares = _mm_set_ps(a2, b2, a2, b2); // We replicate to make use of full register
// Compute the reciprocal square root of the squares using `_mm_rsqrt_ps` (single-precision)
__m128 rsqrts = _mm_rsqrt_ps(squares);
// Perform one iteration of Newton-Raphson refinement to improve the precision of rsqrt:
// Formula: y' = y * (1.5 - 0.5 * x * y * y)
__m128 half = _mm_set1_ps(0.5f);
__m128 three_halves = _mm_set1_ps(1.5f);
rsqrts =
_mm_mul_ps(rsqrts, _mm_sub_ps(three_halves, _mm_mul_ps(half, _mm_mul_ps(squares, _mm_mul_ps(rsqrts, rsqrts)))));
// Extract the reciprocal square roots of a2 and b2 from the __m128 register
simsimd_f32_t a2_reciprocal = _mm_cvtss_f32(_mm_shuffle_ps(rsqrts, rsqrts, _MM_SHUFFLE(0, 0, 0, 1)));
simsimd_f32_t b2_reciprocal = _mm_cvtss_f32(rsqrts);
// Calculate the cosine distance: 1 - ab * a2_reciprocal * b2_reciprocal
simsimd_distance_t result = 1.0f - ab * a2_reciprocal * b2_reciprocal;
return result > 0 ? result : 0;
}
#pragma clang attribute pop
#pragma GCC pop_options
#endif // SIMSIMD_TARGET_HASWELL
#endif // _SIMSIMD_TARGET_X86
#if _SIMSIMD_TARGET_X86
#if SIMSIMD_TARGET_HASWELL
#pragma GCC push_options
#pragma GCC target("avx2", "f16c", "fma")
#pragma clang attribute push(__attribute__((target("avx2,f16c,fma"))), apply_to = function)
SIMSIMD_PUBLIC void simsimd_l2_f16_haswell(simsimd_f16_t const *a, simsimd_f16_t const *b, simsimd_size_t n,
simsimd_distance_t *result) {
simsimd_l2sq_f16_haswell(a, b, n, result);
*result = _simsimd_sqrt_f32_haswell(*result);
}
SIMSIMD_PUBLIC void simsimd_l2sq_f16_haswell(simsimd_f16_t const *a, simsimd_f16_t const *b, simsimd_size_t n,
simsimd_distance_t *result) {
__m256 a_vec, b_vec;
__m256 d2_vec = _mm256_setzero_ps();
simsimd_l2sq_f16_haswell_cycle:
if (n < 8) {
a_vec = _simsimd_partial_load_f16x8_haswell(a, n);
b_vec = _simsimd_partial_load_f16x8_haswell(b, n);
n = 0;
}
else {
a_vec = _mm256_cvtph_ps(_mm_lddqu_si128((__m128i const *)a));
b_vec = _mm256_cvtph_ps(_mm_lddqu_si128((__m128i const *)b));
n -= 8, a += 8, b += 8;
}
__m256 d_vec = _mm256_sub_ps(a_vec, b_vec);
d2_vec = _mm256_fmadd_ps(d_vec, d_vec, d2_vec);
if (n) goto simsimd_l2sq_f16_haswell_cycle;
*result = _simsimd_reduce_f32x8_haswell(d2_vec);
}
SIMSIMD_PUBLIC void simsimd_cos_f16_haswell(simsimd_f16_t const *a, simsimd_f16_t const *b, simsimd_size_t n,
simsimd_distance_t *result) {
__m256 a_vec, b_vec;
__m256 ab_vec = _mm256_setzero_ps(), a2_vec = _mm256_setzero_ps(), b2_vec = _mm256_setzero_ps();
simsimd_cos_f16_haswell_cycle:
if (n < 8) {
a_vec = _simsimd_partial_load_f16x8_haswell(a, n);
b_vec = _simsimd_partial_load_f16x8_haswell(b, n);
n = 0;
}
else {
a_vec = _mm256_cvtph_ps(_mm_lddqu_si128((__m128i const *)a));
b_vec = _mm256_cvtph_ps(_mm_lddqu_si128((__m128i const *)b));
n -= 8, a += 8, b += 8;
}
ab_vec = _mm256_fmadd_ps(a_vec, b_vec, ab_vec);
a2_vec = _mm256_fmadd_ps(a_vec, a_vec, a2_vec);
b2_vec = _mm256_fmadd_ps(b_vec, b_vec, b2_vec);
if (n) goto simsimd_cos_f16_haswell_cycle;
simsimd_f32_t ab = _simsimd_reduce_f32x8_haswell(ab_vec);
simsimd_f32_t a2 = _simsimd_reduce_f32x8_haswell(a2_vec);
simsimd_f32_t b2 = _simsimd_reduce_f32x8_haswell(b2_vec);
*result = _simsimd_cos_normalize_f32_haswell(ab, a2, b2);
}
SIMSIMD_PUBLIC void simsimd_l2_bf16_haswell(simsimd_bf16_t const *a, simsimd_bf16_t const *b, simsimd_size_t n,
simsimd_distance_t *result) {
simsimd_l2sq_bf16_haswell(a, b, n, result);
*result = _simsimd_sqrt_f32_haswell(*result);
}
SIMSIMD_PUBLIC void simsimd_l2sq_bf16_haswell(simsimd_bf16_t const *a, simsimd_bf16_t const *b, simsimd_size_t n,
simsimd_distance_t *result) {
__m256 a_vec, b_vec;
__m256 d2_vec = _mm256_setzero_ps();
simsimd_l2sq_bf16_haswell_cycle:
if (n < 8) {
a_vec = _simsimd_bf16x8_to_f32x8_haswell(_simsimd_partial_load_bf16x8_haswell(a, n));
b_vec = _simsimd_bf16x8_to_f32x8_haswell(_simsimd_partial_load_bf16x8_haswell(b, n));
n = 0;
}
else {
a_vec = _simsimd_bf16x8_to_f32x8_haswell(_mm_lddqu_si128((__m128i const *)a));
b_vec = _simsimd_bf16x8_to_f32x8_haswell(_mm_lddqu_si128((__m128i const *)b));
n -= 8, a += 8, b += 8;
}
__m256 d_vec = _mm256_sub_ps(a_vec, b_vec);
d2_vec = _mm256_fmadd_ps(d_vec, d_vec, d2_vec);
if (n) goto simsimd_l2sq_bf16_haswell_cycle;
*result = _simsimd_reduce_f32x8_haswell(d2_vec);
}
SIMSIMD_PUBLIC void simsimd_cos_bf16_haswell(simsimd_bf16_t const *a, simsimd_bf16_t const *b, simsimd_size_t n,
simsimd_distance_t *result) {
__m256 a_vec, b_vec;
__m256 ab_vec = _mm256_setzero_ps(), a2_vec = _mm256_setzero_ps(), b2_vec = _mm256_setzero_ps();
simsimd_cos_bf16_haswell_cycle:
if (n < 8) {
a_vec = _simsimd_bf16x8_to_f32x8_haswell(_simsimd_partial_load_bf16x8_haswell(a, n));
b_vec = _simsimd_bf16x8_to_f32x8_haswell(_simsimd_partial_load_bf16x8_haswell(b, n));
n = 0;
}
else {
a_vec = _simsimd_bf16x8_to_f32x8_haswell(_mm_lddqu_si128((__m128i const *)a));
b_vec = _simsimd_bf16x8_to_f32x8_haswell(_mm_lddqu_si128((__m128i const *)b));
n -= 8, a += 8, b += 8;
}
ab_vec = _mm256_fmadd_ps(a_vec, b_vec, ab_vec);
a2_vec = _mm256_fmadd_ps(a_vec, a_vec, a2_vec);
b2_vec = _mm256_fmadd_ps(b_vec, b_vec, b2_vec);
if (n) goto simsimd_cos_bf16_haswell_cycle;
simsimd_f32_t ab = _simsimd_reduce_f32x8_haswell(ab_vec);
simsimd_f32_t a2 = _simsimd_reduce_f32x8_haswell(a2_vec);
simsimd_f32_t b2 = _simsimd_reduce_f32x8_haswell(b2_vec);
*result = _simsimd_cos_normalize_f32_haswell(ab, a2, b2);
}
SIMSIMD_PUBLIC void simsimd_l2_i8_haswell(simsimd_i8_t const *a, simsimd_i8_t const *b, simsimd_size_t n,
simsimd_distance_t *result) {
simsimd_l2sq_i8_haswell(a, b, n, result);
*result = _simsimd_sqrt_f32_haswell(*result);
}
SIMSIMD_PUBLIC void simsimd_l2sq_i8_haswell(simsimd_i8_t const *a, simsimd_i8_t const *b, simsimd_size_t n,
simsimd_distance_t *result) {
__m256i d2_i32_low_vec = _mm256_setzero_si256();
__m256i d2_i32_high_vec = _mm256_setzero_si256();
simsimd_size_t i = 0;
for (; i + 32 <= n; i += 32) {
__m256i a_i8_vec = _mm256_lddqu_si256((__m256i const *)(a + i));
__m256i b_i8_vec = _mm256_lddqu_si256((__m256i const *)(b + i));
// Sign extend `i8` to `i16`
__m256i a_i16_low_vec = _mm256_cvtepi8_epi16(_mm256_castsi256_si128(a_i8_vec));
__m256i a_i16_high_vec = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(a_i8_vec, 1));
__m256i b_i16_low_vec = _mm256_cvtepi8_epi16(_mm256_castsi256_si128(b_i8_vec));
__m256i b_i16_high_vec = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(b_i8_vec, 1));
// Subtract
        // The differences will be squared next, so their sign is irrelevant; each absolute
        // difference is at most 255 and its square, at most 65025, fits easily into the `i32` accumulators.
__m256i d_i16_low_vec = _mm256_sub_epi16(a_i16_low_vec, b_i16_low_vec);
__m256i d_i16_high_vec = _mm256_sub_epi16(a_i16_high_vec, b_i16_high_vec);
// Accumulate into `i32` vectors
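        // `_mm256_madd_epi16(d, d)` multiplies adjacent 16-bit lanes and adds each pair,
        // so every 32-bit lane receives d[2k]^2 + d[2k+1]^2: squaring and a partial
        // horizontal reduction in a single instruction.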
d2_i32_low_vec = _mm256_add_epi32(d2_i32_low_vec, _mm256_madd_epi16(d_i16_low_vec, d_i16_low_vec));
d2_i32_high_vec = _mm256_add_epi32(d2_i32_high_vec, _mm256_madd_epi16(d_i16_high_vec, d_i16_high_vec));
}
// Accumulate the 32-bit integers from `d2_i32_high_vec` and `d2_i32_low_vec`
int d2 = _simsimd_reduce_i32x8_haswell(_mm256_add_epi32(d2_i32_low_vec, d2_i32_high_vec));
// Take care of the tail:
for (; i < n; ++i) {
        int d = (int)(a[i]) - b[i];
        d2 += d * d;
}
*result = (simsimd_f64_t)d2;
}
SIMSIMD_PUBLIC void simsimd_cos_i8_haswell(simsimd_i8_t const *a, simsimd_i8_t const *b, simsimd_size_t n,
simsimd_distance_t *result) {
__m256i ab_i32_low_vec = _mm256_setzero_si256();
__m256i ab_i32_high_vec = _mm256_setzero_si256();
__m256i a2_i32_low_vec = _mm256_setzero_si256();
__m256i a2_i32_high_vec = _mm256_setzero_si256();
__m256i b2_i32_low_vec = _mm256_setzero_si256();
__m256i b2_i32_high_vec = _mm256_setzero_si256();
// AVX2 has no instructions for 8-bit signed integer dot-products,
// but it has a weird instruction for mixed signed-unsigned 8-bit dot-product.
// So we need to normalize the first vector to its absolute value,
// and shift the product sign into the second vector.
//
// __m256i a_i8_abs_vec = _mm256_abs_epi8(a_i8_vec);
// __m256i b_i8_flipped_vec = _mm256_sign_epi8(b_i8_vec, a_i8_vec);
// __m256i ab_i16_vec = _mm256_maddubs_epi16(a_i8_abs_vec, b_i8_flipped_vec);
//
// The problem with this approach, however, is the `-128` value in the second vector.
// Flipping its sign will do nothing, and the result will be incorrect.
// This can easily lead to noticeable numerical errors in the final result.
simsimd_size_t i = 0;
for (; i + 32 <= n; i += 32) {
__m256i a_i8_vec = _mm256_lddqu_si256((__m256i const *)(a + i));
__m256i b_i8_vec = _mm256_lddqu_si256((__m256i const *)(b + i));
// Unpack `int8` to `int16`
__m256i a_i16_low_vec = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(a_i8_vec, 0));
__m256i a_i16_high_vec = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(a_i8_vec, 1));
__m256i b_i16_low_vec = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(b_i8_vec, 0));
__m256i b_i16_high_vec = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(b_i8_vec, 1));
// Multiply and accumulate as `int16`, accumulate products as `int32`:
ab_i32_low_vec = _mm256_add_epi32(ab_i32_low_vec, _mm256_madd_epi16(a_i16_low_vec, b_i16_low_vec));
ab_i32_high_vec = _mm256_add_epi32(ab_i32_high_vec, _mm256_madd_epi16(a_i16_high_vec, b_i16_high_vec));
a2_i32_low_vec = _mm256_add_epi32(a2_i32_low_vec, _mm256_madd_epi16(a_i16_low_vec, a_i16_low_vec));
a2_i32_high_vec = _mm256_add_epi32(a2_i32_high_vec, _mm256_madd_epi16(a_i16_high_vec, a_i16_high_vec));
b2_i32_low_vec = _mm256_add_epi32(b2_i32_low_vec, _mm256_madd_epi16(b_i16_low_vec, b_i16_low_vec));
b2_i32_high_vec = _mm256_add_epi32(b2_i32_high_vec, _mm256_madd_epi16(b_i16_high_vec, b_i16_high_vec));
}
// Further reduce to a single sum for each vector
int ab = _simsimd_reduce_i32x8_haswell(_mm256_add_epi32(ab_i32_low_vec, ab_i32_high_vec));
int a2 = _simsimd_reduce_i32x8_haswell(_mm256_add_epi32(a2_i32_low_vec, a2_i32_high_vec));
int b2 = _simsimd_reduce_i32x8_haswell(_mm256_add_epi32(b2_i32_low_vec, b2_i32_high_vec));
// Take care of the tail:
for (; i < n; ++i) {
int ai = a[i], bi = b[i];
ab += ai * bi, a2 += ai * ai, b2 += bi * bi;
}
*result = _simsimd_cos_normalize_f32_haswell(ab, a2, b2);
}
SIMSIMD_PUBLIC void simsimd_l2_u8_haswell(simsimd_u8_t const *a, simsimd_u8_t const *b, simsimd_size_t n,
simsimd_distance_t *result) {
simsimd_l2sq_u8_haswell(a, b, n, result);
*result = _simsimd_sqrt_f32_haswell(*result);
}
SIMSIMD_PUBLIC void simsimd_l2sq_u8_haswell(simsimd_u8_t const *a, simsimd_u8_t const *b, simsimd_size_t n,
simsimd_distance_t *result) {
__m256i d2_i32_low_vec = _mm256_setzero_si256();
__m256i d2_i32_high_vec = _mm256_setzero_si256();
__m256i const zeros_vec = _mm256_setzero_si256();
simsimd_size_t i = 0;
for (; i + 32 <= n; i += 32) {
__m256i a_u8_vec = _mm256_lddqu_si256((__m256i const *)(a + i));
__m256i b_u8_vec = _mm256_lddqu_si256((__m256i const *)(b + i));
        // Subtracting unsigned vectors in AVX2 is done by saturating subtraction:
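        // One of the two saturating differences is always zero, so OR-ing them yields the
        // absolute difference: |a - b| == (a -sat b) | (b -sat a).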
__m256i d_u8_vec = _mm256_or_si256(_mm256_subs_epu8(a_u8_vec, b_u8_vec), _mm256_subs_epu8(b_u8_vec, a_u8_vec));
// Upcast `uint8` to `int16`. Unlike the signed version, we can use the unpacking
// instructions instead of extracts, as they are much faster and more efficient.
__m256i d_i16_low_vec = _mm256_unpacklo_epi8(d_u8_vec, zeros_vec);
__m256i d_i16_high_vec = _mm256_unpackhi_epi8(d_u8_vec, zeros_vec);
// Multiply and accumulate at `int16` level, accumulate at `int32` level:
d2_i32_low_vec = _mm256_add_epi32(d2_i32_low_vec, _mm256_madd_epi16(d_i16_low_vec, d_i16_low_vec));
d2_i32_high_vec = _mm256_add_epi32(d2_i32_high_vec, _mm256_madd_epi16(d_i16_high_vec, d_i16_high_vec));
}
// Accumulate the 32-bit integers from `d2_i32_high_vec` and `d2_i32_low_vec`
int d2 = _simsimd_reduce_i32x8_haswell(_mm256_add_epi32(d2_i32_low_vec, d2_i32_high_vec));
// Take care of the tail:
for (; i < n; ++i) {
        int d = (int)(a[i]) - b[i];
        d2 += d * d;
}
*result = (simsimd_f64_t)d2;
}
SIMSIMD_PUBLIC void simsimd_cos_u8_haswell(simsimd_u8_t const *a, simsimd_u8_t const *b, simsimd_size_t n,
simsimd_distance_t *result) {
__m256i ab_i32_low_vec = _mm256_setzero_si256();
__m256i ab_i32_high_vec = _mm256_setzero_si256();
__m256i a2_i32_low_vec = _mm256_setzero_si256();
__m256i a2_i32_high_vec = _mm256_setzero_si256();
__m256i b2_i32_low_vec = _mm256_setzero_si256();
__m256i b2_i32_high_vec = _mm256_setzero_si256();
__m256i const zeros_vec = _mm256_setzero_si256();
    // AVX2 has a mixed signed-unsigned 8-bit dot-product, `_mm256_maddubs_epi16`, but it treats
    // its second argument as signed and saturates the 16-bit pairwise sums, so it can't handle
    // unsigned inputs as large as 255 correctly. Instead, we upcast to 16-bit integers with
    // unpacking and use `_mm256_madd_epi16`, which accumulates pairs into 32-bit lanes without
    // saturation.
simsimd_size_t i = 0;
for (; i + 32 <= n; i += 32) {
__m256i a_u8_vec = _mm256_lddqu_si256((__m256i const *)(a + i));
__m256i b_u8_vec = _mm256_lddqu_si256((__m256i const *)(b + i));
// Upcast `uint8` to `int16`. Unlike the signed version, we can use the unpacking
// instructions instead of extracts, as they are much faster and more efficient.
__m256i a_i16_low_vec = _mm256_unpacklo_epi8(a_u8_vec, zeros_vec);
__m256i a_i16_high_vec = _mm256_unpackhi_epi8(a_u8_vec, zeros_vec);
__m256i b_i16_low_vec = _mm256_unpacklo_epi8(b_u8_vec, zeros_vec);
__m256i b_i16_high_vec = _mm256_unpackhi_epi8(b_u8_vec, zeros_vec);
// Multiply and accumulate as `int16`, accumulate products as `int32`
ab_i32_low_vec = _mm256_add_epi32(ab_i32_low_vec, _mm256_madd_epi16(a_i16_low_vec, b_i16_low_vec));
ab_i32_high_vec = _mm256_add_epi32(ab_i32_high_vec, _mm256_madd_epi16(a_i16_high_vec, b_i16_high_vec));
a2_i32_low_vec = _mm256_add_epi32(a2_i32_low_vec, _mm256_madd_epi16(a_i16_low_vec, a_i16_low_vec));
a2_i32_high_vec = _mm256_add_epi32(a2_i32_high_vec, _mm256_madd_epi16(a_i16_high_vec, a_i16_high_vec));
b2_i32_low_vec = _mm256_add_epi32(b2_i32_low_vec, _mm256_madd_epi16(b_i16_low_vec, b_i16_low_vec));
b2_i32_high_vec = _mm256_add_epi32(b2_i32_high_vec, _mm256_madd_epi16(b_i16_high_vec, b_i16_high_vec));
}
// Further reduce to a single sum for each vector
int ab = _simsimd_reduce_i32x8_haswell(_mm256_add_epi32(ab_i32_low_vec, ab_i32_high_vec));
int a2 = _simsimd_reduce_i32x8_haswell(_mm256_add_epi32(a2_i32_low_vec, a2_i32_high_vec));
int b2 = _simsimd_reduce_i32x8_haswell(_mm256_add_epi32(b2_i32_low_vec, b2_i32_high_vec));
// Take care of the tail:
for (; i < n; ++i) {
int ai = a[i], bi = b[i];
ab += ai * bi, a2 += ai * ai, b2 += bi * bi;
}
*result = _simsimd_cos_normalize_f32_haswell(ab, a2, b2);
}
SIMSIMD_PUBLIC void simsimd_l2_f32_haswell(simsimd_f32_t const *a, simsimd_f32_t const *b, simsimd_size_t n,
simsimd_distance_t *result) {
simsimd_l2sq_f32_haswell(a, b, n, result);
*result = _simsimd_sqrt_f32_haswell(*result);
}
SIMSIMD_PUBLIC void simsimd_l2sq_f32_haswell(simsimd_f32_t const *a, simsimd_f32_t const *b, simsimd_size_t n,
simsimd_distance_t *result) {
__m256 d2_vec = _mm256_setzero_ps();
simsimd_size_t i = 0;
for (; i + 8 <= n; i += 8) {
__m256 a_vec = _mm256_loadu_ps(a + i);
__m256 b_vec = _mm256_loadu_ps(b + i);
__m256 d_vec = _mm256_sub_ps(a_vec, b_vec);
d2_vec = _mm256_fmadd_ps(d_vec, d_vec, d2_vec);
}
simsimd_f64_t d2 = _simsimd_reduce_f32x8_haswell(d2_vec);
for (; i < n; ++i) {
float d = a[i] - b[i];
d2 += d * d;
}
*result = d2;
}
SIMSIMD_PUBLIC void simsimd_cos_f32_haswell(simsimd_f32_t const *a, simsimd_f32_t const *b, simsimd_size_t n,
simsimd_distance_t *result) {
__m256 ab_vec = _mm256_setzero_ps();
__m256 a2_vec = _mm256_setzero_ps();
__m256 b2_vec = _mm256_setzero_ps();
simsimd_size_t i = 0;
for (; i + 8 <= n; i += 8) {
__m256 a_vec = _mm256_loadu_ps(a + i);
__m256 b_vec = _mm256_loadu_ps(b + i);
ab_vec = _mm256_fmadd_ps(a_vec, b_vec, ab_vec);
a2_vec = _mm256_fmadd_ps(a_vec, a_vec, a2_vec);
b2_vec = _mm256_fmadd_ps(b_vec, b_vec, b2_vec);
}
simsimd_f64_t ab = _simsimd_reduce_f32x8_haswell(ab_vec);
simsimd_f64_t a2 = _simsimd_reduce_f32x8_haswell(a2_vec);
simsimd_f64_t b2 = _simsimd_reduce_f32x8_haswell(b2_vec);
for (; i < n; ++i) {
float ai = a[i], bi = b[i];
ab += ai * bi, a2 += ai * ai, b2 += bi * bi;
}
*result = _simsimd_cos_normalize_f64_haswell(ab, a2, b2);
}
SIMSIMD_PUBLIC void simsimd_l2_f64_haswell(simsimd_f64_t const *a, simsimd_f64_t const *b, simsimd_size_t n,
simsimd_distance_t *result) {
simsimd_l2sq_f64_haswell(a, b, n, result);
*result = _simsimd_sqrt_f64_haswell(*result);
}
SIMSIMD_PUBLIC void simsimd_l2sq_f64_haswell(simsimd_f64_t const *a, simsimd_f64_t const *b, simsimd_size_t n,
simsimd_distance_t *result) {
__m256d d2_vec = _mm256_setzero_pd();
simsimd_size_t i = 0;
for (; i + 4 <= n; i += 4) {
__m256d a_vec = _mm256_loadu_pd(a + i);
__m256d b_vec = _mm256_loadu_pd(b + i);
__m256d d_vec = _mm256_sub_pd(a_vec, b_vec);
d2_vec = _mm256_fmadd_pd(d_vec, d_vec, d2_vec);
}
simsimd_f64_t d2 = _simsimd_reduce_f64x4_haswell(d2_vec);
for (; i < n; ++i) {
simsimd_f64_t d = a[i] - b[i];
d2 += d * d;
}
*result = d2;
}
SIMSIMD_PUBLIC void simsimd_cos_f64_haswell(simsimd_f64_t const *a, simsimd_f64_t const *b, simsimd_size_t n,
simsimd_distance_t *result) {
__m256d ab_vec = _mm256_setzero_pd();
__m256d a2_vec = _mm256_setzero_pd();
__m256d b2_vec = _mm256_setzero_pd();
simsimd_size_t i = 0;
for (; i + 4 <= n; i += 4) {
__m256d a_vec = _mm256_loadu_pd(a + i);
__m256d b_vec = _mm256_loadu_pd(b + i);
ab_vec = _mm256_fmadd_pd(a_vec, b_vec, ab_vec);
a2_vec = _mm256_fmadd_pd(a_vec, a_vec, a2_vec);
b2_vec = _mm256_fmadd_pd(b_vec, b_vec, b2_vec);
}
simsimd_f64_t ab = _simsimd_reduce_f64x4_haswell(ab_vec);
simsimd_f64_t a2 = _simsimd_reduce_f64x4_haswell(a2_vec);
simsimd_f64_t b2 = _simsimd_reduce_f64x4_haswell(b2_vec);
for (; i < n; ++i) {
simsimd_f64_t ai = a[i], bi = b[i];
ab += ai * bi, a2 += ai * ai, b2 += bi * bi;
}
*result = _simsimd_cos_normalize_f64_haswell(ab, a2, b2);
}
#pragma clang attribute pop
#pragma GCC pop_options
#endif // SIMSIMD_TARGET_HASWELL
#if SIMSIMD_TARGET_SKYLAKE
#pragma GCC push_options
#pragma GCC target("avx2", "avx512f", "avx512bw", "avx512vl", "bmi2")
#pragma clang attribute push(__attribute__((target("avx2,avx512f,avx512bw,avx512vl,bmi2"))), apply_to = function)
SIMSIMD_PUBLIC void simsimd_l2_f32_skylake(simsimd_f32_t const *a, simsimd_f32_t const *b, simsimd_size_t n,
simsimd_distance_t *result) {
simsimd_l2sq_f32_skylake(a, b, n, result);
*result = _simsimd_sqrt_f64_haswell(*result);
}
SIMSIMD_PUBLIC void simsimd_l2sq_f32_skylake(simsimd_f32_t const *a, simsimd_f32_t const *b, simsimd_size_t n,
simsimd_distance_t *result) {
__m512 d2_vec = _mm512_setzero();
__m512 a_vec, b_vec;
simsimd_l2sq_f32_skylake_cycle:
if (n < 16) {
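        // `_bzhi_u32(0xFFFFFFFF, n)` zeroes all bits at positions `n` and above, leaving the
        // lowest `n` bits set, which is exactly the lane mask needed for the final partial load.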
__mmask16 mask = (__mmask16)_bzhi_u32(0xFFFFFFFF, n);
a_vec = _mm512_maskz_loadu_ps(mask, a);
b_vec = _mm512_maskz_loadu_ps(mask, b);
n = 0;
}
else {
a_vec = _mm512_loadu_ps(a);
b_vec = _mm512_loadu_ps(b);
a += 16, b += 16, n -= 16;
}
__m512 d_vec = _mm512_sub_ps(a_vec, b_vec);
d2_vec = _mm512_fmadd_ps(d_vec, d_vec, d2_vec);
if (n) goto simsimd_l2sq_f32_skylake_cycle;
*result = _simsimd_reduce_f32x16_skylake(d2_vec);
}
SIMSIMD_INTERNAL simsimd_distance_t _simsimd_cos_normalize_f64_skylake(simsimd_f64_t ab, simsimd_f64_t a2,
simsimd_f64_t b2) {
// If both vectors have magnitude 0, the distance is 0.
if (a2 == 0 && b2 == 0) return 0;
    // If the dot-product is zero, either because one of the vectors is all zeros
    // (making the division ill-formed) or because the vectors are orthogonal, the distance is 1.
else if (ab == 0)
return 1;
// We want to avoid the `simsimd_approximate_inverse_square_root` due to high latency:
// https://web.archive.org/web/20210208132927/http://assemblyrequired.crashworks.org/timing-square-root/
// The maximum relative error for this approximation is less than 2^-14, which is 6x lower than
// for single-precision floats in the `_simsimd_cos_normalize_f64_haswell` implementation.
// Mysteriously, MSVC has no `_mm_rsqrt14_pd` intrinsic, but has its masked variants,
// so let's use `_mm_maskz_rsqrt14_pd(0xFF, ...)` instead.
__m128d squares = _mm_set_pd(a2, b2);
__m128d rsqrts = _mm_maskz_rsqrt14_pd(0xFF, squares);
// Let's implement a single Newton-Raphson iteration to refine the result.
// This is how it affects downstream applications:
//
// +--------+------+----------+---------------------+---------------------+---------------------+
// | Metric | NDim | DType | Baseline Error | Old SimSIMD Error | New SimSIMD Error |
// +--------+------+----------+---------------------+---------------------+---------------------+
// | cosine | 1536 | bfloat16 | 1.89e-08 ± 1.59e-08 | 3.07e-07 ± 3.09e-07 | 3.53e-09 ± 2.70e-09 |
// | cosine | 1536 | float16 | 1.67e-02 ± 1.44e-02 | 2.68e-05 ± 1.95e-05 | 2.02e-05 ± 1.39e-05 |
// | cosine | 1536 | float32 | 2.21e-08 ± 1.65e-08 | 3.47e-07 ± 3.49e-07 | 3.77e-09 ± 2.84e-09 |
// | cosine | 1536 | float64 | 0.00e+00 ± 0.00e+00 | 3.80e-07 ± 4.50e-07 | 1.35e-11 ± 1.85e-11 |
// | cosine | 1536 | int8 | 0.00e+00 ± 0.00e+00 | 4.60e-04 ± 3.36e-04 | 4.20e-04 ± 4.88e-04 |
// +--------+------+----------+---------------------+---------------------+---------------------+
//
// https://en.wikipedia.org/wiki/Newton%27s_method
rsqrts = _mm_add_pd( //
_mm_mul_pd(_mm_set1_pd(1.5), rsqrts),
_mm_mul_pd(_mm_mul_pd(_mm_mul_pd(squares, _mm_set1_pd(-0.5)), rsqrts), _mm_mul_pd(rsqrts, rsqrts)));
simsimd_f64_t a2_reciprocal = _mm_cvtsd_f64(_mm_unpackhi_pd(rsqrts, rsqrts));
simsimd_f64_t b2_reciprocal = _mm_cvtsd_f64(rsqrts);
simsimd_distance_t result = 1 - ab * a2_reciprocal * b2_reciprocal;
return result > 0 ? result : 0;
}
SIMSIMD_PUBLIC void simsimd_cos_f32_skylake(simsimd_f32_t const *a, simsimd_f32_t const *b, simsimd_size_t n,
simsimd_distance_t *result) {
__m512 ab_vec = _mm512_setzero();
__m512 a2_vec = _mm512_setzero();
__m512 b2_vec = _mm512_setzero();
__m512 a_vec, b_vec;
simsimd_cos_f32_skylake_cycle:
if (n < 16) {
__mmask16 mask = (__mmask16)_bzhi_u32(0xFFFFFFFF, n);
a_vec = _mm512_maskz_loadu_ps(mask, a);
b_vec = _mm512_maskz_loadu_ps(mask, b);
n = 0;
}
else {
a_vec = _mm512_loadu_ps(a);
b_vec = _mm512_loadu_ps(b);
a += 16, b += 16, n -= 16;
}
ab_vec = _mm512_fmadd_ps(a_vec, b_vec, ab_vec);
a2_vec = _mm512_fmadd_ps(a_vec, a_vec, a2_vec);
b2_vec = _mm512_fmadd_ps(b_vec, b_vec, b2_vec);
if (n) goto simsimd_cos_f32_skylake_cycle;
simsimd_f64_t ab = _simsimd_reduce_f32x16_skylake(ab_vec);
simsimd_f64_t a2 = _simsimd_reduce_f32x16_skylake(a2_vec);
simsimd_f64_t b2 = _simsimd_reduce_f32x16_skylake(b2_vec);
*result = _simsimd_cos_normalize_f64_skylake(ab, a2, b2);
}
SIMSIMD_PUBLIC void simsimd_l2_f64_skylake(simsimd_f64_t const *a, simsimd_f64_t const *b, simsimd_size_t n,
simsimd_distance_t *result) {
simsimd_l2sq_f64_skylake(a, b, n, result);
*result = _simsimd_sqrt_f64_haswell(*result);
}
SIMSIMD_PUBLIC void simsimd_l2sq_f64_skylake(simsimd_f64_t const *a, simsimd_f64_t const *b, simsimd_size_t n,
simsimd_distance_t *result) {
__m512d d2_vec = _mm512_setzero_pd();
__m512d a_vec, b_vec;
simsimd_l2sq_f64_skylake_cycle:
if (n < 8) {
__mmask8 mask = (__mmask8)_bzhi_u32(0xFFFFFFFF, n);
a_vec = _mm512_maskz_loadu_pd(mask, a);
b_vec = _mm512_maskz_loadu_pd(mask, b);
n = 0;
}
else {
a_vec = _mm512_loadu_pd(a);
b_vec = _mm512_loadu_pd(b);
a += 8, b += 8, n -= 8;
}
__m512d d_vec = _mm512_sub_pd(a_vec, b_vec);
d2_vec = _mm512_fmadd_pd(d_vec, d_vec, d2_vec);
if (n) goto simsimd_l2sq_f64_skylake_cycle;
*result = _mm512_reduce_add_pd(d2_vec);
}
SIMSIMD_PUBLIC void simsimd_cos_f64_skylake(simsimd_f64_t const *a, simsimd_f64_t const *b, simsimd_size_t n,
simsimd_distance_t *result) {
__m512d ab_vec = _mm512_setzero_pd();
__m512d a2_vec = _mm512_setzero_pd();
__m512d b2_vec = _mm512_setzero_pd();
__m512d a_vec, b_vec;
simsimd_cos_f64_skylake_cycle:
if (n < 8) {
__mmask8 mask = (__mmask8)_bzhi_u32(0xFFFFFFFF, n);
a_vec = _mm512_maskz_loadu_pd(mask, a);
b_vec = _mm512_maskz_loadu_pd(mask, b);
n = 0;
}
else {
a_vec = _mm512_loadu_pd(a);
b_vec = _mm512_loadu_pd(b);
a += 8, b += 8, n -= 8;
}
ab_vec = _mm512_fmadd_pd(a_vec, b_vec, ab_vec);
a2_vec = _mm512_fmadd_pd(a_vec, a_vec, a2_vec);
b2_vec = _mm512_fmadd_pd(b_vec, b_vec, b2_vec);
if (n) goto simsimd_cos_f64_skylake_cycle;
simsimd_f64_t ab = _mm512_reduce_add_pd(ab_vec);
simsimd_f64_t a2 = _mm512_reduce_add_pd(a2_vec);
simsimd_f64_t b2 = _mm512_reduce_add_pd(b2_vec);
*result = _simsimd_cos_normalize_f64_skylake(ab, a2, b2);
}
#pragma clang attribute pop
#pragma GCC pop_options
#endif // SIMSIMD_TARGET_SKYLAKE
#if SIMSIMD_TARGET_GENOA
#pragma GCC push_options
#pragma GCC target("avx2", "avx512f", "avx512vl", "bmi2", "avx512bw", "avx512bf16")
#pragma clang attribute push(__attribute__((target("avx2,avx512f,avx512vl,bmi2,avx512bw,avx512bf16"))), \
apply_to = function)
SIMSIMD_INTERNAL __m512i _simsimd_substract_bf16x32_genoa(__m512i a_i16, __m512i b_i16) {
union {
__m512 fvec;
__m512i ivec;
simsimd_f32_t f32[16];
simsimd_u16_t u16[32];
simsimd_bf16_t bf16[32];
} d_odd, d_even, d, a_f32_even, b_f32_even, d_f32_even, a_f32_odd, b_f32_odd, d_f32_odd, a, b;
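    // Each union instance aliases one 512-bit value, so the same bits can be viewed as packed
    // integers or packed floats without extra moves; only a few of the declared views are used below.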
a.ivec = a_i16;
b.ivec = b_i16;
// There are several approaches to perform subtraction in `bf16`. The first one is:
//
// Perform a couple of casts - each is a bitshift. To convert `bf16` to `f32`,
// expand it to 32-bit integers, then shift the bits by 16 to the left.
// Then subtract as floats, and shift back. During expansion, we will double the space,
// and should use separate registers for top and bottom halves.
// Some compilers don't have `_mm512_extracti32x8_epi32`, so we use `_mm512_extracti64x4_epi64`:
//
// a_f32_bot.fvec = _mm512_castsi512_ps(_mm512_slli_epi32(
// _mm512_cvtepu16_epi32(_mm512_castsi512_si256(a_i16)), 16));
// b_f32_bot.fvec = _mm512_castsi512_ps(_mm512_slli_epi32(
// _mm512_cvtepu16_epi32(_mm512_castsi512_si256(b_i16)), 16));
// a_f32_top.fvec =_mm512_castsi512_ps(
// _mm512_slli_epi32(_mm512_cvtepu16_epi32(_mm512_extracti64x4_epi64(a_i16, 1)), 16));
// b_f32_top.fvec =_mm512_castsi512_ps(
// _mm512_slli_epi32(_mm512_cvtepu16_epi32(_mm512_extracti64x4_epi64(b_i16, 1)), 16));
// d_f32_top.fvec = _mm512_sub_ps(a_f32_top.fvec, b_f32_top.fvec);
// d_f32_bot.fvec = _mm512_sub_ps(a_f32_bot.fvec, b_f32_bot.fvec);
// d.ivec = _mm512_castsi256_si512(_mm512_cvtepi32_epi16(
// _mm512_srli_epi32(_mm512_castps_si512(d_f32_bot.fvec), 16)));
// d.ivec = _mm512_inserti64x4(d.ivec, _mm512_cvtepi32_epi16(
// _mm512_srli_epi32(_mm512_castps_si512(d_f32_top.fvec), 16)), 1);
//
    // Instead of using multiple shifts and an insertion, we can achieve a similar result with fewer expensive
// calls to `_mm512_permutex2var_epi16`, or a cheap `_mm512_mask_shuffle_epi8` and blend:
//
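    // Each 32-bit lane packs two `bf16` scalars. Masking with 0xFFFF0000 keeps the odd element,
    // already sitting in the top 16 bits, which is exactly its `f32` value, since `bf16` is the
    // truncated top half of `f32`. Shifting left by 16 does the same for the even element.
    // After the `f32` subtraction, the even results are shifted back down and blended with the
    // odd ones, so every 16-bit slot once again holds a `bf16` difference.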
a_f32_odd.ivec = _mm512_and_si512(a_i16, _mm512_set1_epi32(0xFFFF0000));
a_f32_even.ivec = _mm512_slli_epi32(a_i16, 16);
b_f32_odd.ivec = _mm512_and_si512(b_i16, _mm512_set1_epi32(0xFFFF0000));
b_f32_even.ivec = _mm512_slli_epi32(b_i16, 16);
d_f32_odd.fvec = _mm512_sub_ps(a_f32_odd.fvec, b_f32_odd.fvec);
d_f32_even.fvec = _mm512_sub_ps(a_f32_even.fvec, b_f32_even.fvec);
d_f32_even.ivec = _mm512_srli_epi32(d_f32_even.ivec, 16);
d.ivec = _mm512_mask_blend_epi16(0x55555555, d_f32_odd.ivec, d_f32_even.ivec);
return d.ivec;
}
SIMSIMD_PUBLIC void simsimd_l2_bf16_genoa(simsimd_bf16_t const *a, simsimd_bf16_t const *b, simsimd_size_t n,
simsimd_distance_t *result) {
simsimd_l2sq_bf16_genoa(a, b, n, result);
*result = _simsimd_sqrt_f32_haswell(*result);
}
SIMSIMD_PUBLIC void simsimd_l2sq_bf16_genoa(simsimd_bf16_t const *a, simsimd_bf16_t const *b, simsimd_size_t n,
simsimd_distance_t *result) {
__m512 d2_vec = _mm512_setzero_ps();
__m512i a_i16_vec, b_i16_vec, d_i16_vec;
simsimd_l2sq_bf16_genoa_cycle:
if (n < 32) {
__mmask32 mask = (__mmask32)_bzhi_u32(0xFFFFFFFF, n);
a_i16_vec = _mm512_maskz_loadu_epi16(mask, a);
b_i16_vec = _mm512_maskz_loadu_epi16(mask, b);
n = 0;
}
else {
a_i16_vec = _mm512_loadu_epi16(a);
b_i16_vec = _mm512_loadu_epi16(b);
a += 32, b += 32, n -= 32;
}
d_i16_vec = _simsimd_substract_bf16x32_genoa(a_i16_vec, b_i16_vec);
d2_vec = _mm512_dpbf16_ps(d2_vec, (__m512bh)(d_i16_vec), (__m512bh)(d_i16_vec));
if (n) goto simsimd_l2sq_bf16_genoa_cycle;
*result = _simsimd_reduce_f32x16_skylake(d2_vec);
}
SIMSIMD_PUBLIC void simsimd_cos_bf16_genoa(simsimd_bf16_t const *a, simsimd_bf16_t const *b, simsimd_size_t n,
simsimd_distance_t *result) {
__m512 ab_vec = _mm512_setzero_ps();
__m512 a2_vec = _mm512_setzero_ps();
__m512 b2_vec = _mm512_setzero_ps();
__m512i a_i16_vec, b_i16_vec;
simsimd_cos_bf16_genoa_cycle:
if (n < 32) {
__mmask32 mask = (__mmask32)_bzhi_u32(0xFFFFFFFF, n);
a_i16_vec = _mm512_maskz_loadu_epi16(mask, a);
b_i16_vec = _mm512_maskz_loadu_epi16(mask, b);
n = 0;
}
else {
a_i16_vec = _mm512_loadu_epi16(a);
b_i16_vec = _mm512_loadu_epi16(b);
a += 32, b += 32, n -= 32;
}
ab_vec = _mm512_dpbf16_ps(ab_vec, (__m512bh)(a_i16_vec), (__m512bh)(b_i16_vec));
a2_vec = _mm512_dpbf16_ps(a2_vec, (__m512bh)(a_i16_vec), (__m512bh)(a_i16_vec));
b2_vec = _mm512_dpbf16_ps(b2_vec, (__m512bh)(b_i16_vec), (__m512bh)(b_i16_vec));
if (n) goto simsimd_cos_bf16_genoa_cycle;
simsimd_f32_t ab = _simsimd_reduce_f32x16_skylake(ab_vec);
simsimd_f32_t a2 = _simsimd_reduce_f32x16_skylake(a2_vec);
simsimd_f32_t b2 = _simsimd_reduce_f32x16_skylake(b2_vec);
*result = _simsimd_cos_normalize_f32_haswell(ab, a2, b2);
}
#pragma clang attribute pop
#pragma GCC pop_options
#endif // SIMSIMD_TARGET_GENOA
#if SIMSIMD_TARGET_SAPPHIRE
#pragma GCC push_options
#pragma GCC target("avx2", "avx512f", "avx512vl", "bmi2", "avx512fp16")
#pragma clang attribute push(__attribute__((target("avx2,avx512f,avx512vl,bmi2,avx512fp16"))), apply_to = function)
SIMSIMD_PUBLIC void simsimd_l2_f16_sapphire(simsimd_f16_t const *a, simsimd_f16_t const *b, simsimd_size_t n,
simsimd_distance_t *result) {
simsimd_l2sq_f16_sapphire(a, b, n, result);
*result = _simsimd_sqrt_f32_haswell(*result);
}
SIMSIMD_PUBLIC void simsimd_l2sq_f16_sapphire(simsimd_f16_t const *a, simsimd_f16_t const *b, simsimd_size_t n,
simsimd_distance_t *result) {
__m512h d2_vec = _mm512_setzero_ph();
__m512i a_i16_vec, b_i16_vec;
simsimd_l2sq_f16_sapphire_cycle:
if (n < 32) {
__mmask32 mask = (__mmask32)_bzhi_u32(0xFFFFFFFF, n);
a_i16_vec = _mm512_maskz_loadu_epi16(mask, a);
b_i16_vec = _mm512_maskz_loadu_epi16(mask, b);
n = 0;
}
else {
a_i16_vec = _mm512_loadu_epi16(a);
b_i16_vec = _mm512_loadu_epi16(b);
a += 32, b += 32, n -= 32;
}
__m512h d_vec = _mm512_sub_ph(_mm512_castsi512_ph(a_i16_vec), _mm512_castsi512_ph(b_i16_vec));
d2_vec = _mm512_fmadd_ph(d_vec, d_vec, d2_vec);
if (n) goto simsimd_l2sq_f16_sapphire_cycle;
*result = _mm512_reduce_add_ph(d2_vec);
}
SIMSIMD_PUBLIC void simsimd_cos_f16_sapphire(simsimd_f16_t const *a, simsimd_f16_t const *b, simsimd_size_t n,
simsimd_distance_t *result) {
__m512h ab_vec = _mm512_setzero_ph();
__m512h a2_vec = _mm512_setzero_ph();
__m512h b2_vec = _mm512_setzero_ph();
__m512i a_i16_vec, b_i16_vec;
simsimd_cos_f16_sapphire_cycle:
if (n < 32) {
__mmask32 mask = (__mmask32)_bzhi_u32(0xFFFFFFFF, n);
a_i16_vec = _mm512_maskz_loadu_epi16(mask, a);
b_i16_vec = _mm512_maskz_loadu_epi16(mask, b);
n = 0;
}
else {
a_i16_vec = _mm512_loadu_epi16(a);
b_i16_vec = _mm512_loadu_epi16(b);
a += 32, b += 32, n -= 32;
}
ab_vec = _mm512_fmadd_ph(_mm512_castsi512_ph(a_i16_vec), _mm512_castsi512_ph(b_i16_vec), ab_vec);
a2_vec = _mm512_fmadd_ph(_mm512_castsi512_ph(a_i16_vec), _mm512_castsi512_ph(a_i16_vec), a2_vec);
b2_vec = _mm512_fmadd_ph(_mm512_castsi512_ph(b_i16_vec), _mm512_castsi512_ph(b_i16_vec), b2_vec);
if (n) goto simsimd_cos_f16_sapphire_cycle;
simsimd_f32_t ab = _mm512_reduce_add_ph(ab_vec);
simsimd_f32_t a2 = _mm512_reduce_add_ph(a2_vec);
simsimd_f32_t b2 = _mm512_reduce_add_ph(b2_vec);
*result = _simsimd_cos_normalize_f32_haswell(ab, a2, b2);
}
#pragma clang attribute pop
#pragma GCC pop_options
#endif // SIMSIMD_TARGET_SAPPHIRE
#if SIMSIMD_TARGET_ICE
#pragma GCC push_options
#pragma GCC target("avx2", "avx512f", "avx512vl", "bmi2", "avx512bw", "avx512vnni")
#pragma clang attribute push(__attribute__((target("avx2,avx512f,avx512vl,bmi2,avx512bw,avx512vnni"))), \
apply_to = function)
SIMSIMD_PUBLIC void simsimd_l2_i8_ice(simsimd_i8_t const *a, simsimd_i8_t const *b, simsimd_size_t n,
simsimd_distance_t *result) {
simsimd_l2sq_i8_ice(a, b, n, result);
*result = _simsimd_sqrt_f32_haswell(*result);
}
SIMSIMD_PUBLIC void simsimd_l2sq_i8_ice(simsimd_i8_t const *a, simsimd_i8_t const *b, simsimd_size_t n,
simsimd_distance_t *result) {
__m512i d2_i32_vec = _mm512_setzero_si512();
__m512i a_i16_vec, b_i16_vec, d_i16s_vec;
simsimd_l2sq_i8_ice_cycle:
if (n < 32) { // TODO: Avoid early i16 upcast to step through 64 values at a time
__mmask32 mask = (__mmask32)_bzhi_u32(0xFFFFFFFF, n);
a_i16_vec = _mm512_cvtepi8_epi16(_mm256_maskz_loadu_epi8(mask, a));
b_i16_vec = _mm512_cvtepi8_epi16(_mm256_maskz_loadu_epi8(mask, b));
n = 0;
}
else {
a_i16_vec = _mm512_cvtepi8_epi16(_mm256_lddqu_si256((__m256i const *)a));
b_i16_vec = _mm512_cvtepi8_epi16(_mm256_lddqu_si256((__m256i const *)b));
a += 32, b += 32, n -= 32;
}
d_i16s_vec = _mm512_sub_epi16(a_i16_vec, b_i16_vec);
d2_i32_vec = _mm512_dpwssd_epi32(d2_i32_vec, d_i16s_vec, d_i16s_vec);
if (n) goto simsimd_l2sq_i8_ice_cycle;
*result = _mm512_reduce_add_epi32(d2_i32_vec);
}
SIMSIMD_PUBLIC void simsimd_cos_i8_ice(simsimd_i8_t const *a, simsimd_i8_t const *b, simsimd_size_t n,
simsimd_distance_t *result) {
__m512i ab_i32_vec = _mm512_setzero_si512();
__m512i a2_i32_vec = _mm512_setzero_si512();
__m512i b2_i32_vec = _mm512_setzero_si512();
__m512i a_i16_vec, b_i16_vec;
simsimd_cos_i8_ice_cycle:
if (n < 32) {
__mmask32 mask = (__mmask32)_bzhi_u32(0xFFFFFFFF, n);
a_i16_vec = _mm512_cvtepi8_epi16(_mm256_maskz_loadu_epi8(mask, a));
b_i16_vec = _mm512_cvtepi8_epi16(_mm256_maskz_loadu_epi8(mask, b));
n = 0;
}
else {
a_i16_vec = _mm512_cvtepi8_epi16(_mm256_lddqu_si256((__m256i const *)a));
b_i16_vec = _mm512_cvtepi8_epi16(_mm256_lddqu_si256((__m256i const *)b));
a += 32, b += 32, n -= 32;
}
// We can't directly use the `_mm512_dpbusd_epi32` intrinsic everywhere,
// as it's asymmetric with respect to the sign of the input arguments:
//
// Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j]))
//
// To compute the squares, we could just drop the sign bit of the second argument.
// But this would lead to big-big problems on values like `-128`!
// For dot-products we don't have the luxury of optimizing the sign bit away.
// Assuming this is an approximate kernel (with reciprocal square root approximations)
// in the end, we can allow clamping the value to [-127, 127] range.
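    // As an illustration (values not from this kernel): the byte 0xFF encodes -1 as `int8_t`,
    // but as the first argument of `VPDPBUSD` it would be zero-extended to 255, so the signs
    // must be handled explicitly before any unsigned-by-signed dot-product can be reused.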
//
// On Ice Lake:
//
// 1. `VPDPBUSDS (ZMM, ZMM, ZMM)` can only execute on port 0, with 5 cycle latency.
// 2. `VPDPWSSDS (ZMM, ZMM, ZMM)` can also only execute on port 0, with 5 cycle latency.
// 3. `VPMADDWD (ZMM, ZMM, ZMM)` can execute on ports 0 and 5, with 5 cycle latency.
//
// On Zen4 Genoa:
//
// 1. `VPDPBUSDS (ZMM, ZMM, ZMM)` can execute on ports 0 and 1, with 4 cycle latency.
// 2. `VPDPWSSDS (ZMM, ZMM, ZMM)` can also execute on ports 0 and 1, with 4 cycle latency.
// 3. `VPMADDWD (ZMM, ZMM, ZMM)` can execute on ports 0 and 1, with 3 cycle latency.
//
    // The old solution was more complex and relied on 1. and 2.:
//
// a_i8_abs_vec = _mm512_abs_epi8(a_i8_vec);
// b_i8_abs_vec = _mm512_abs_epi8(b_i8_vec);
// a2_i32_vec = _mm512_dpbusds_epi32(a2_i32_vec, a_i8_abs_vec, a_i8_abs_vec);
// b2_i32_vec = _mm512_dpbusds_epi32(b2_i32_vec, b_i8_abs_vec, b_i8_abs_vec);
// ab_i32_low_vec = _mm512_dpwssds_epi32( //
// ab_i32_low_vec, //
// _mm512_cvtepi8_epi16(_mm512_castsi512_si256(a_i8_vec)), //
// _mm512_cvtepi8_epi16(_mm512_castsi512_si256(b_i8_vec)));
// ab_i32_high_vec = _mm512_dpwssds_epi32( //
// ab_i32_high_vec, //
// _mm512_cvtepi8_epi16(_mm512_extracti64x4_epi64(a_i8_vec, 1)), //
// _mm512_cvtepi8_epi16(_mm512_extracti64x4_epi64(b_i8_vec, 1)));
//
// The new solution is simpler and relies on 3.:
ab_i32_vec = _mm512_add_epi32(ab_i32_vec, _mm512_madd_epi16(a_i16_vec, b_i16_vec));
a2_i32_vec = _mm512_add_epi32(a2_i32_vec, _mm512_madd_epi16(a_i16_vec, a_i16_vec));
b2_i32_vec = _mm512_add_epi32(b2_i32_vec, _mm512_madd_epi16(b_i16_vec, b_i16_vec));
if (n) goto simsimd_cos_i8_ice_cycle;
int ab = _mm512_reduce_add_epi32(ab_i32_vec);
int a2 = _mm512_reduce_add_epi32(a2_i32_vec);
int b2 = _mm512_reduce_add_epi32(b2_i32_vec);
*result = _simsimd_cos_normalize_f32_haswell(ab, a2, b2);
}
SIMSIMD_PUBLIC void simsimd_l2_u8_ice(simsimd_u8_t const *a, simsimd_u8_t const *b, simsimd_size_t n,
simsimd_distance_t *result) {
simsimd_l2sq_u8_ice(a, b, n, result);
*result = _simsimd_sqrt_f32_haswell(*result);
}
SIMSIMD_PUBLIC void simsimd_l2sq_u8_ice(simsimd_u8_t const *a, simsimd_u8_t const *b, simsimd_size_t n,
simsimd_distance_t *result) {
__m512i d2_i32_low_vec = _mm512_setzero_si512();
__m512i d2_i32_high_vec = _mm512_setzero_si512();
__m512i const zeros_vec = _mm512_setzero_si512();
__m512i d_i16_low_vec, d_i16_high_vec;
__m512i a_u8_vec, b_u8_vec, d_u8_vec;
simsimd_l2sq_u8_ice_cycle:
if (n < 64) {
__mmask64 mask = (__mmask64)_bzhi_u64(0xFFFFFFFFFFFFFFFF, n);
a_u8_vec = _mm512_maskz_loadu_epi8(mask, a);
b_u8_vec = _mm512_maskz_loadu_epi8(mask, b);
n = 0;
}
else {
a_u8_vec = _mm512_loadu_si512(a);
b_u8_vec = _mm512_loadu_si512(b);
a += 64, b += 64, n -= 64;
}
    // Subtracting unsigned vectors in AVX-512 is done by saturating subtraction:
d_u8_vec = _mm512_or_si512(_mm512_subs_epu8(a_u8_vec, b_u8_vec), _mm512_subs_epu8(b_u8_vec, a_u8_vec));
d_i16_low_vec = _mm512_unpacklo_epi8(d_u8_vec, zeros_vec);
d_i16_high_vec = _mm512_unpackhi_epi8(d_u8_vec, zeros_vec);
// Multiply and accumulate at `int16` level, accumulate at `int32` level:
d2_i32_low_vec = _mm512_dpwssd_epi32(d2_i32_low_vec, d_i16_low_vec, d_i16_low_vec);
d2_i32_high_vec = _mm512_dpwssd_epi32(d2_i32_high_vec, d_i16_high_vec, d_i16_high_vec);
if (n) goto simsimd_l2sq_u8_ice_cycle;
*result = _mm512_reduce_add_epi32(_mm512_add_epi32(d2_i32_low_vec, d2_i32_high_vec));
}
SIMSIMD_PUBLIC void simsimd_cos_u8_ice(simsimd_u8_t const *a, simsimd_u8_t const *b, simsimd_size_t n,
simsimd_distance_t *result) {
__m512i ab_i32_low_vec = _mm512_setzero_si512();
__m512i ab_i32_high_vec = _mm512_setzero_si512();
__m512i a2_i32_low_vec = _mm512_setzero_si512();
__m512i a2_i32_high_vec = _mm512_setzero_si512();
__m512i b2_i32_low_vec = _mm512_setzero_si512();
__m512i b2_i32_high_vec = _mm512_setzero_si512();
__m512i const zeros_vec = _mm512_setzero_si512();
__m512i a_i16_low_vec, a_i16_high_vec, b_i16_low_vec, b_i16_high_vec;
__m512i a_u8_vec, b_u8_vec;
simsimd_cos_u8_ice_cycle:
if (n < 64) {
__mmask64 mask = (__mmask64)_bzhi_u64(0xFFFFFFFFFFFFFFFF, n);
a_u8_vec = _mm512_maskz_loadu_epi8(mask, a);
b_u8_vec = _mm512_maskz_loadu_epi8(mask, b);
n = 0;
}
else {
a_u8_vec = _mm512_loadu_si512(a);
b_u8_vec = _mm512_loadu_si512(b);
a += 64, b += 64, n -= 64;
}
// Upcast `uint8` to `int16`. Unlike the signed version, we can use the unpacking
// instructions instead of extracts, as they are much faster and more efficient.
a_i16_low_vec = _mm512_unpacklo_epi8(a_u8_vec, zeros_vec);
a_i16_high_vec = _mm512_unpackhi_epi8(a_u8_vec, zeros_vec);
b_i16_low_vec = _mm512_unpacklo_epi8(b_u8_vec, zeros_vec);
b_i16_high_vec = _mm512_unpackhi_epi8(b_u8_vec, zeros_vec);
// Multiply and accumulate as `int16`, accumulate products as `int32`:
ab_i32_low_vec = _mm512_dpwssds_epi32(ab_i32_low_vec, a_i16_low_vec, b_i16_low_vec);
ab_i32_high_vec = _mm512_dpwssds_epi32(ab_i32_high_vec, a_i16_high_vec, b_i16_high_vec);
a2_i32_low_vec = _mm512_dpwssds_epi32(a2_i32_low_vec, a_i16_low_vec, a_i16_low_vec);
a2_i32_high_vec = _mm512_dpwssds_epi32(a2_i32_high_vec, a_i16_high_vec, a_i16_high_vec);
b2_i32_low_vec = _mm512_dpwssds_epi32(b2_i32_low_vec, b_i16_low_vec, b_i16_low_vec);
b2_i32_high_vec = _mm512_dpwssds_epi32(b2_i32_high_vec, b_i16_high_vec, b_i16_high_vec);
if (n) goto simsimd_cos_u8_ice_cycle;
int ab = _mm512_reduce_add_epi32(_mm512_add_epi32(ab_i32_low_vec, ab_i32_high_vec));
int a2 = _mm512_reduce_add_epi32(_mm512_add_epi32(a2_i32_low_vec, a2_i32_high_vec));
int b2 = _mm512_reduce_add_epi32(_mm512_add_epi32(b2_i32_low_vec, b2_i32_high_vec));
*result = _simsimd_cos_normalize_f32_haswell(ab, a2, b2);
}
SIMSIMD_PUBLIC void simsimd_l2_i4x2_ice(simsimd_i4x2_t const *a, simsimd_i4x2_t const *b, simsimd_size_t n_words,
simsimd_distance_t *result) {
simsimd_l2sq_i4x2_ice(a, b, n_words, result);
*result = _simsimd_sqrt_f32_haswell(*result);
}
SIMSIMD_PUBLIC void simsimd_l2sq_i4x2_ice(simsimd_i4x2_t const *a, simsimd_i4x2_t const *b, simsimd_size_t n_words,
simsimd_distance_t *result) {
// While `int8_t` covers the range [-128, 127], `int4_t` covers only [-8, 7].
// The absolute difference between two 4-bit integers is at most 15 and it is always a `uint4_t` value!
    // Moreover, its square is at most 225, which fits into `uint8_t` and can be computed with a single
// lookup table. Accumulating those values is similar to checksumming, a piece of cake for SIMD!
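    // A worked example (with made-up values): the byte 0xA3 packs two consecutive `int4_t` scalars
    // of the same vector: the low nibble 0x3 decodes to +3 and the high nibble 0xA to -6 via the
    // sign-extension lookup below. If the matching elements of the other vector were +7 and +1,
    // the absolute differences 4 and 7 would square to 16 and 49 through the second lookup.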
__m512i const i4_to_i8_lookup_vec = _mm512_set_epi8( //
-1, -2, -3, -4, -5, -6, -7, -8, 7, 6, 5, 4, 3, 2, 1, 0, //
-1, -2, -3, -4, -5, -6, -7, -8, 7, 6, 5, 4, 3, 2, 1, 0, //
-1, -2, -3, -4, -5, -6, -7, -8, 7, 6, 5, 4, 3, 2, 1, 0, //
-1, -2, -3, -4, -5, -6, -7, -8, 7, 6, 5, 4, 3, 2, 1, 0);
__m512i const u4_squares_lookup_vec = _mm512_set_epi8( //
(char)225, (char)196, (char)169, (char)144, 121, 100, 81, 64, 49, 36, 25, 16, 9, 4, 1, 0, //
(char)225, (char)196, (char)169, (char)144, 121, 100, 81, 64, 49, 36, 25, 16, 9, 4, 1, 0, //
(char)225, (char)196, (char)169, (char)144, 121, 100, 81, 64, 49, 36, 25, 16, 9, 4, 1, 0, //
(char)225, (char)196, (char)169, (char)144, 121, 100, 81, 64, 49, 36, 25, 16, 9, 4, 1, 0);
/// The mask used to take the low nibble of each byte.
__m512i const i4_nibble_vec = _mm512_set1_epi8(0x0F);
// Temporaries:
__m512i a_i4x2_vec, b_i4x2_vec;
__m512i a_i8_low_vec, a_i8_high_vec, b_i8_low_vec, b_i8_high_vec;
__m512i d_u8_low_vec, d_u8_high_vec; //! Only the low 4 bits are actually used
__m512i d2_u8_low_vec, d2_u8_high_vec;
__m512i d2_u16_low_vec, d2_u16_high_vec;
// Accumulators:
__m512i d2_u32_vec = _mm512_setzero_si512();
simsimd_l2sq_i4x2_ice_cycle:
if (n_words < 64) {
__mmask64 mask = (__mmask64)_bzhi_u64(0xFFFFFFFFFFFFFFFF, n_words);
a_i4x2_vec = _mm512_maskz_loadu_epi8(mask, a);
b_i4x2_vec = _mm512_maskz_loadu_epi8(mask, b);
n_words = 0;
}
else {
a_i4x2_vec = _mm512_loadu_epi8(a);
b_i4x2_vec = _mm512_loadu_epi8(b);
a += 64, b += 64, n_words -= 64;
}
// Unpack the 4-bit values into 8-bit values with an empty top nibble.
a_i8_low_vec = _mm512_and_si512(a_i4x2_vec, i4_nibble_vec);
a_i8_high_vec = _mm512_and_si512(_mm512_srli_epi64(a_i4x2_vec, 4), i4_nibble_vec);
b_i8_low_vec = _mm512_and_si512(b_i4x2_vec, i4_nibble_vec);
b_i8_high_vec = _mm512_and_si512(_mm512_srli_epi64(b_i4x2_vec, 4), i4_nibble_vec);
a_i8_low_vec = _mm512_shuffle_epi8(i4_to_i8_lookup_vec, a_i8_low_vec);
a_i8_high_vec = _mm512_shuffle_epi8(i4_to_i8_lookup_vec, a_i8_high_vec);
b_i8_low_vec = _mm512_shuffle_epi8(i4_to_i8_lookup_vec, b_i8_low_vec);
b_i8_high_vec = _mm512_shuffle_epi8(i4_to_i8_lookup_vec, b_i8_high_vec);
// We can implement subtraction with a lookup table, or using `_mm512_sub_epi8`.
d_u8_low_vec = _mm512_abs_epi8(_mm512_sub_epi8(a_i8_low_vec, b_i8_low_vec));
d_u8_high_vec = _mm512_abs_epi8(_mm512_sub_epi8(a_i8_high_vec, b_i8_high_vec));
// Now we can use the lookup table to compute the squares of the 4-bit unsigned integers
// in the low nibbles of the `d_u8_low_vec` and `d_u8_high_vec` vectors.
d2_u8_low_vec = _mm512_shuffle_epi8(u4_squares_lookup_vec, d_u8_low_vec);
d2_u8_high_vec = _mm512_shuffle_epi8(u4_squares_lookup_vec, d_u8_high_vec);
    // To aggregate, we first widen the 8-bit squares to 16 bits, folding the low and high unpacked
    // halves together. After that, the 16-bit sums are widened further into the 32-bit accumulators.
d2_u16_low_vec = //
_mm512_add_epi16( //
_mm512_unpacklo_epi8(d2_u8_low_vec, _mm512_setzero_si512()),
_mm512_unpackhi_epi8(d2_u8_low_vec, _mm512_setzero_si512()));
d2_u16_high_vec = //
_mm512_add_epi16( //
_mm512_unpacklo_epi8(d2_u8_high_vec, _mm512_setzero_si512()),
_mm512_unpackhi_epi8(d2_u8_high_vec, _mm512_setzero_si512()));
    d2_u32_vec = _mm512_add_epi32(d2_u32_vec, _mm512_unpacklo_epi16(d2_u16_low_vec, _mm512_setzero_si512()));
    d2_u32_vec = _mm512_add_epi32(d2_u32_vec, _mm512_unpackhi_epi16(d2_u16_low_vec, _mm512_setzero_si512()));
    d2_u32_vec = _mm512_add_epi32(d2_u32_vec, _mm512_unpacklo_epi16(d2_u16_high_vec, _mm512_setzero_si512()));
    d2_u32_vec = _mm512_add_epi32(d2_u32_vec, _mm512_unpackhi_epi16(d2_u16_high_vec, _mm512_setzero_si512()));
if (n_words) goto simsimd_l2sq_i4x2_ice_cycle;
    // Finally, reduce the 32-bit partial sums to a single scalar.
int d2 = _mm512_reduce_add_epi32(d2_u32_vec);
*result = d2;
}
SIMSIMD_PUBLIC void simsimd_cos_i4x2_ice(simsimd_i4x2_t const *a, simsimd_i4x2_t const *b, simsimd_size_t n_words,
simsimd_distance_t *result) {
// We need to compose a lookup table for all the scalar products of 4-bit integers.
// While `int8_t` covers the range [-128, 127], `int4_t` covers only [-8, 7].
// Practically speaking, the product of two 4-bit signed integers is a 7-bit integer,
// as the maximum absolute value of the product is `abs(-8 * -8) == 64`.
//
    // To store the 2^7 = 128 possible values we only need 128 single-byte scalars,
    // or just 2x ZMM registers. In that case our lookup takes a single `vpermi2b` instruction,
    // easily invokable with the `_mm512_permutex2var_epi8` intrinsic, with a latency of 6 cycles on Sapphire Rapids.
// The problem is converting 2d indices of our symmetric matrix into 1d offsets in the dense array.
//
// Alternatively, we can take the entire symmetric (16 x 16) matrix of products,
// put into 4x ZMM registers, and use it with `_mm512_shuffle_epi8`, remembering
// that it can only lookup with 128-bit lanes (16x 8-bit values).
// That intrinsic has latency 1, but will need to be repeated and combined with
// multiple iterations of `_mm512_shuffle_i64x2` that has latency 3.
//
    // Alternatively, we can get down to 3 cycles per lookup with the `vpermb` instruction and the `_mm512_permutexvar_epi8` intrinsic.
// For that we can split our (16 x 16) matrix into 4x (8 x 8) submatrices, and use 4x ZMM registers.
//
// Still, all of those solutions are quite heavy compared to two parallel calls to `_mm512_dpbusds_epi32`
// for the dot product. But we can still use the `_mm512_permutexvar_epi8` to compute the squares of the
// 16 possible `int4_t` values faster.
//
// Here is how our `int4_t` range looks:
//
// dec: 0 1 2 3 4 5 6 7 -8 -7 -6 -5 -4 -3 -2 -1
// hex: 0 1 2 3 4 5 6 7 8 9 A B C D E F
//
// Squared:
//
// dec2: 0 1 4 9 16 25 36 49 64 49 36 25 16 9 4 1
// hex2: 0 1 4 9 10 19 24 31 40 31 24 19 10 9 4 1
//
// Broadcast it to every lane, so that: `square(x) == _mm512_shuffle_epi8(i4_squares_lookup_vec, x)`.
__m512i const i4_to_i8_lookup_vec = _mm512_set_epi8( //
-1, -2, -3, -4, -5, -6, -7, -8, 7, 6, 5, 4, 3, 2, 1, 0, //
-1, -2, -3, -4, -5, -6, -7, -8, 7, 6, 5, 4, 3, 2, 1, 0, //
-1, -2, -3, -4, -5, -6, -7, -8, 7, 6, 5, 4, 3, 2, 1, 0, //
-1, -2, -3, -4, -5, -6, -7, -8, 7, 6, 5, 4, 3, 2, 1, 0);
__m512i const i4_squares_lookup_vec = _mm512_set_epi8( //
1, 4, 9, 16, 25, 36, 49, 64, 49, 36, 25, 16, 9, 4, 1, 0, //
1, 4, 9, 16, 25, 36, 49, 64, 49, 36, 25, 16, 9, 4, 1, 0, //
1, 4, 9, 16, 25, 36, 49, 64, 49, 36, 25, 16, 9, 4, 1, 0, //
1, 4, 9, 16, 25, 36, 49, 64, 49, 36, 25, 16, 9, 4, 1, 0);
/// The mask used to take the low nibble of each byte.
__m512i const i4_nibble_vec = _mm512_set1_epi8(0x0F);
// Temporaries:
__m512i a_i4x2_vec, b_i4x2_vec;
__m512i a_i8_low_vec, a_i8_high_vec, b_i8_low_vec, b_i8_high_vec;
__m512i a2_u8_vec, b2_u8_vec;
// Accumulators:
__m512i a2_u16_low_vec = _mm512_setzero_si512();
__m512i a2_u16_high_vec = _mm512_setzero_si512();
__m512i b2_u16_low_vec = _mm512_setzero_si512();
__m512i b2_u16_high_vec = _mm512_setzero_si512();
__m512i ab_i32_low_vec = _mm512_setzero_si512();
__m512i ab_i32_high_vec = _mm512_setzero_si512();
simsimd_cos_i4x2_ice_cycle:
if (n_words < 64) {
__mmask64 mask = (__mmask64)_bzhi_u64(0xFFFFFFFFFFFFFFFF, n_words);
a_i4x2_vec = _mm512_maskz_loadu_epi8(mask, a);
b_i4x2_vec = _mm512_maskz_loadu_epi8(mask, b);
n_words = 0;
}
else {
a_i4x2_vec = _mm512_loadu_epi8(a);
b_i4x2_vec = _mm512_loadu_epi8(b);
a += 64, b += 64, n_words -= 64;
}
// Unpack the 4-bit values into 8-bit values with an empty top nibble.
// For now, they are not really 8-bit integers, as they are not sign-extended.
// That part will come later, using the `i4_to_i8_lookup_vec` lookup.
a_i8_low_vec = _mm512_and_si512(a_i4x2_vec, i4_nibble_vec);
a_i8_high_vec = _mm512_and_si512(_mm512_srli_epi64(a_i4x2_vec, 4), i4_nibble_vec);
b_i8_low_vec = _mm512_and_si512(b_i4x2_vec, i4_nibble_vec);
b_i8_high_vec = _mm512_and_si512(_mm512_srli_epi64(b_i4x2_vec, 4), i4_nibble_vec);
// Compute the squares of the 4-bit integers.
// For symmetry we could have used 4 registers, aka "a2_i8_low_vec", "a2_i8_high_vec", "b2_i8_low_vec",
// "b2_i8_high_vec". But the largest square value is just 64, so we can safely aggregate into 8-bit unsigned values.
a2_u8_vec = _mm512_add_epi8(_mm512_shuffle_epi8(i4_squares_lookup_vec, a_i8_low_vec),
_mm512_shuffle_epi8(i4_squares_lookup_vec, a_i8_high_vec));
b2_u8_vec = _mm512_add_epi8(_mm512_shuffle_epi8(i4_squares_lookup_vec, b_i8_low_vec),
_mm512_shuffle_epi8(i4_squares_lookup_vec, b_i8_high_vec));
    // We can safely aggregate into just 16-bit sums without overflow, if the vectors have fewer than:
    // (2 scalars / byte) * (64 bytes / register) * (256 non-overflowing 8-bit additions in 16-bit integers)
// = 32'768 dimensions.
//
    // We use saturating addition here, so an overflow saturates visibly instead of silently wrapping around.
a2_u16_low_vec = _mm512_adds_epu16(a2_u16_low_vec, _mm512_cvtepu8_epi16(_mm512_castsi512_si256(a2_u8_vec)));
a2_u16_high_vec = _mm512_adds_epu16(a2_u16_high_vec, _mm512_cvtepu8_epi16(_mm512_extracti64x4_epi64(a2_u8_vec, 1)));
    b2_u16_low_vec = _mm512_adds_epu16(b2_u16_low_vec, _mm512_cvtepu8_epi16(_mm512_castsi512_si256(b2_u8_vec)));
    b2_u16_high_vec = _mm512_adds_epu16(b2_u16_high_vec, _mm512_cvtepu8_epi16(_mm512_extracti64x4_epi64(b2_u8_vec, 1)));
// Time to perform the proper sign extension of the 4-bit integers to 8-bit integers.
a_i8_low_vec = _mm512_shuffle_epi8(i4_to_i8_lookup_vec, a_i8_low_vec);
a_i8_high_vec = _mm512_shuffle_epi8(i4_to_i8_lookup_vec, a_i8_high_vec);
b_i8_low_vec = _mm512_shuffle_epi8(i4_to_i8_lookup_vec, b_i8_low_vec);
b_i8_high_vec = _mm512_shuffle_epi8(i4_to_i8_lookup_vec, b_i8_high_vec);
    // The same trick won't work for the primary dot-product, as the signs of the two vectors'
    // components may differ. So we have to use two `_mm512_dpwssds_epi32`
// intrinsics instead, upcasting four chunks to 16-bit integers beforehand!
// Alternatively, we can flip the signs of the second argument and use `_mm512_dpbusds_epi32`,
// but it ends up taking more instructions.
ab_i32_low_vec = _mm512_dpwssds_epi32( //
ab_i32_low_vec, //
_mm512_cvtepi8_epi16(_mm512_castsi512_si256(a_i8_low_vec)), //
_mm512_cvtepi8_epi16(_mm512_castsi512_si256(b_i8_low_vec)));
ab_i32_low_vec = _mm512_dpwssds_epi32( //
ab_i32_low_vec, //
_mm512_cvtepi8_epi16(_mm512_extracti64x4_epi64(a_i8_low_vec, 1)), //
_mm512_cvtepi8_epi16(_mm512_extracti64x4_epi64(b_i8_low_vec, 1)));
ab_i32_high_vec = _mm512_dpwssds_epi32( //
ab_i32_high_vec, //
_mm512_cvtepi8_epi16(_mm512_castsi512_si256(a_i8_high_vec)), //
_mm512_cvtepi8_epi16(_mm512_castsi512_si256(b_i8_high_vec)));
ab_i32_high_vec = _mm512_dpwssds_epi32( //
ab_i32_high_vec, //
_mm512_cvtepi8_epi16(_mm512_extracti64x4_epi64(a_i8_high_vec, 1)), //
_mm512_cvtepi8_epi16(_mm512_extracti64x4_epi64(b_i8_high_vec, 1)));
if (n_words) goto simsimd_cos_i4x2_ice_cycle;
int ab = _mm512_reduce_add_epi32(_mm512_add_epi32(ab_i32_low_vec, ab_i32_high_vec));
unsigned short a2_u16[32], b2_u16[32];
_mm512_storeu_si512(a2_u16, _mm512_add_epi16(a2_u16_low_vec, a2_u16_high_vec));
_mm512_storeu_si512(b2_u16, _mm512_add_epi16(b2_u16_low_vec, b2_u16_high_vec));
unsigned int a2 = 0, b2 = 0;
for (int i = 0; i < 32; ++i) a2 += a2_u16[i], b2 += b2_u16[i];
*result = _simsimd_cos_normalize_f32_haswell(ab, a2, b2);
}
#pragma clang attribute pop
#pragma GCC pop_options
#endif // SIMSIMD_TARGET_ICE
#if SIMSIMD_TARGET_SIERRA
#pragma GCC push_options
#pragma GCC target("avx2", "bmi2", "avx2vnni")
#pragma clang attribute push(__attribute__((target("avx2,bmi2,avx2vnni"))), apply_to = function)
SIMSIMD_PUBLIC void simsimd_cos_i8_sierra(simsimd_i8_t const *a, simsimd_i8_t const *b, simsimd_size_t n,
simsimd_distance_t *result) {
__m256i ab_i32_vec = _mm256_setzero_si256();
__m256i a2_i32_vec = _mm256_setzero_si256();
__m256i b2_i32_vec = _mm256_setzero_si256();
simsimd_size_t i = 0;
for (; i + 32 <= n; i += 32) {
__m256i a_i8_vec = _mm256_lddqu_si256((__m256i const *)(a + i));
__m256i b_i8_vec = _mm256_lddqu_si256((__m256i const *)(b + i));
ab_i32_vec = _mm256_dpbssds_epi32(ab_i32_vec, a_i8_vec, b_i8_vec);
a2_i32_vec = _mm256_dpbssds_epi32(a2_i32_vec, a_i8_vec, a_i8_vec);
b2_i32_vec = _mm256_dpbssds_epi32(b2_i32_vec, b_i8_vec, b_i8_vec);
}
// Further reduce to a single sum for each vector
int ab = _simsimd_reduce_i32x8_haswell(ab_i32_vec);
int a2 = _simsimd_reduce_i32x8_haswell(a2_i32_vec);
int b2 = _simsimd_reduce_i32x8_haswell(b2_i32_vec);
// Take care of the tail:
for (; i < n; ++i) {
int ai = a[i], bi = b[i];
ab += ai * bi, a2 += ai * ai, b2 += bi * bi;
}
*result = _simsimd_cos_normalize_f32_haswell(ab, a2, b2);
}
#pragma clang attribute pop
#pragma GCC pop_options
#endif // SIMSIMD_TARGET_SIERRA
#endif // _SIMSIMD_TARGET_X86
#ifdef __cplusplus
}
#endif
#endif
simsimd-6.5.12/include/simsimd/types.h 0000644 0000000 0000000 00000063027 10461020230 0016001 0 ustar 0000000 0000000 /**
* @file types.h
* @brief Shared definitions for the SimSIMD library.
* @author Ash Vardanian
* @date October 2, 2023
*
* Defines:
* - Sized aliases for numeric types, like: `simsimd_i32_t` and `simsimd_f64_t`.
* - Macros for internal compiler/hardware checks, like: `_SIMSIMD_TARGET_ARM`.
* - Macros for feature controls, like: `SIMSIMD_TARGET_NEON`
*/
#ifndef SIMSIMD_TYPES_H
#define SIMSIMD_TYPES_H
// Inferring target OS: Windows, macOS, or Linux
#if defined(WIN32) || defined(_WIN32) || defined(__WIN32__) || defined(__NT__)
#define _SIMSIMD_DEFINED_WINDOWS 1
#elif defined(__APPLE__) && defined(__MACH__)
#define _SIMSIMD_DEFINED_APPLE 1
#elif defined(__linux__)
#define _SIMSIMD_DEFINED_LINUX 1
#endif
// Annotation for the public API symbols:
//
// - `SIMSIMD_PUBLIC` is used for functions that are part of the public API.
// - `SIMSIMD_INTERNAL` is used for internal helper functions with unstable APIs.
// - `SIMSIMD_DYNAMIC` is used for functions that are part of the public API, but are dispatched at runtime.
//
// On GCC we mark the functions as `nonnull` informing that none of the arguments can be `NULL`.
// Marking with `pure` and `const` isn't possible as outputting to a pointer is a "side effect".
#if defined(_WIN32) || defined(__CYGWIN__)
#define SIMSIMD_DYNAMIC __declspec(dllexport)
#define SIMSIMD_PUBLIC inline static
#define SIMSIMD_INTERNAL inline static
#elif defined(__GNUC__) || defined(__clang__)
#define SIMSIMD_DYNAMIC __attribute__((visibility("default"))) __attribute__((nonnull))
#define SIMSIMD_PUBLIC __attribute__((unused, nonnull)) inline static
#define SIMSIMD_INTERNAL __attribute__((always_inline)) inline static
#else
#define SIMSIMD_DYNAMIC
#define SIMSIMD_PUBLIC inline static
#define SIMSIMD_INTERNAL inline static
#endif
// Compiling for Arm: _SIMSIMD_TARGET_ARM
#if !defined(_SIMSIMD_TARGET_ARM)
#if defined(__aarch64__) || defined(_M_ARM64)
#define _SIMSIMD_TARGET_ARM 1
#else
#define _SIMSIMD_TARGET_ARM 0
#endif // defined(__aarch64__) || defined(_M_ARM64)
#endif // !defined(_SIMSIMD_TARGET_ARM)
// Compiling for x86: _SIMSIMD_TARGET_X86
#if !defined(_SIMSIMD_TARGET_X86)
#if defined(__x86_64__) || defined(_M_X64)
#define _SIMSIMD_TARGET_X86 1
#else
#define _SIMSIMD_TARGET_X86 0
#endif // defined(__x86_64__) || defined(_M_X64)
#endif // !defined(_SIMSIMD_TARGET_X86)
// Compiling for Arm: SIMSIMD_TARGET_NEON
#if !defined(SIMSIMD_TARGET_NEON) || (SIMSIMD_TARGET_NEON && !_SIMSIMD_TARGET_ARM)
#if defined(__ARM_NEON)
#define SIMSIMD_TARGET_NEON _SIMSIMD_TARGET_ARM
#else
#undef SIMSIMD_TARGET_NEON
#define SIMSIMD_TARGET_NEON 0
#endif // defined(__ARM_NEON)
#endif // !defined(SIMSIMD_TARGET_NEON) || ...
// Compiling for Arm: SIMSIMD_TARGET_NEON_I8
#if !defined(SIMSIMD_TARGET_NEON_I8) || (SIMSIMD_TARGET_NEON_I8 && !_SIMSIMD_TARGET_ARM)
#if defined(__ARM_NEON)
#define SIMSIMD_TARGET_NEON_I8 _SIMSIMD_TARGET_ARM
#else
#undef SIMSIMD_TARGET_NEON_I8
#define SIMSIMD_TARGET_NEON_I8 0
#endif // defined(__ARM_NEON)
#endif // !defined(SIMSIMD_TARGET_NEON_I8) || ...
// Compiling for Arm: SIMSIMD_TARGET_NEON_F16
#if !defined(SIMSIMD_TARGET_NEON_F16) || (SIMSIMD_TARGET_NEON_F16 && !_SIMSIMD_TARGET_ARM)
#if defined(__ARM_NEON)
#define SIMSIMD_TARGET_NEON_F16 _SIMSIMD_TARGET_ARM
#else
#undef SIMSIMD_TARGET_NEON_F16
#define SIMSIMD_TARGET_NEON_F16 0
#endif // defined(__ARM_NEON)
#endif // !defined(SIMSIMD_TARGET_NEON_F16) || ...
// Compiling for Arm: SIMSIMD_TARGET_NEON_BF16
#if !defined(SIMSIMD_TARGET_NEON_BF16) || (SIMSIMD_TARGET_NEON_BF16 && !_SIMSIMD_TARGET_ARM)
#if defined(__ARM_NEON)
#define SIMSIMD_TARGET_NEON_BF16 _SIMSIMD_TARGET_ARM
#else
#undef SIMSIMD_TARGET_NEON_BF16
#define SIMSIMD_TARGET_NEON_BF16 0
#endif // defined(__ARM_NEON)
#endif // !defined(SIMSIMD_TARGET_NEON_BF16) || ...
// Compiling for Arm: SIMSIMD_TARGET_SVE
#if !defined(SIMSIMD_TARGET_SVE) || (SIMSIMD_TARGET_SVE && !_SIMSIMD_TARGET_ARM)
#if defined(__ARM_FEATURE_SVE)
#define SIMSIMD_TARGET_SVE _SIMSIMD_TARGET_ARM
#else
#undef SIMSIMD_TARGET_SVE
#define SIMSIMD_TARGET_SVE 0
#endif // defined(__ARM_FEATURE_SVE)
#endif // !defined(SIMSIMD_TARGET_SVE) || ...
// Compiling for Arm: SIMSIMD_TARGET_SVE_I8
#if !defined(SIMSIMD_TARGET_SVE_I8) || (SIMSIMD_TARGET_SVE_I8 && !_SIMSIMD_TARGET_ARM)
#if defined(__ARM_FEATURE_SVE)
#define SIMSIMD_TARGET_SVE_I8 _SIMSIMD_TARGET_ARM
#else
#undef SIMSIMD_TARGET_SVE_I8
#define SIMSIMD_TARGET_SVE_I8 0
#endif // defined(__ARM_FEATURE_SVE)
#endif // !defined(SIMSIMD_TARGET_SVE_I8) || ...
// Compiling for Arm: SIMSIMD_TARGET_SVE_F16
#if !defined(SIMSIMD_TARGET_SVE_F16) || (SIMSIMD_TARGET_SVE_F16 && !_SIMSIMD_TARGET_ARM)
#if defined(__ARM_FEATURE_SVE)
#define SIMSIMD_TARGET_SVE_F16 _SIMSIMD_TARGET_ARM
#else
#undef SIMSIMD_TARGET_SVE_F16
#define SIMSIMD_TARGET_SVE_F16 0
#endif // defined(__ARM_FEATURE_SVE)
#endif // !defined(SIMSIMD_TARGET_SVE_F16) || ...
// Compiling for Arm: SIMSIMD_TARGET_SVE_BF16
#if !defined(SIMSIMD_TARGET_SVE_BF16) || (SIMSIMD_TARGET_SVE_BF16 && !_SIMSIMD_TARGET_ARM)
#if defined(__ARM_FEATURE_SVE)
#define SIMSIMD_TARGET_SVE_BF16 _SIMSIMD_TARGET_ARM
#else
#undef SIMSIMD_TARGET_SVE_BF16
#define SIMSIMD_TARGET_SVE_BF16 0
#endif // defined(__ARM_FEATURE_SVE)
#endif // !defined(SIMSIMD_TARGET_SVE_BF16) || ...
// Compiling for Arm: SIMSIMD_TARGET_SVE2
#if !defined(SIMSIMD_TARGET_SVE2) || (SIMSIMD_TARGET_SVE2 && !_SIMSIMD_TARGET_ARM)
#if defined(__ARM_FEATURE_SVE)
#define SIMSIMD_TARGET_SVE2 _SIMSIMD_TARGET_ARM
#else
#undef SIMSIMD_TARGET_SVE2
#define SIMSIMD_TARGET_SVE2 0
#endif // defined(__ARM_FEATURE_SVE)
#endif // !defined(SIMSIMD_TARGET_SVE2) || ...
// Compiling for x86: SIMSIMD_TARGET_HASWELL
//
// Starting with Ivy Bridge, Intel supports the `F16C` extensions for fast half-precision
// to single-precision floating-point conversions. On AMD those instructions
// are supported on all CPUs starting with the Jaguar family in 2013.
// Starting with Sandy Bridge, Intel adds basic AVX support in their CPUs and in 2013
// extends it with AVX2 in the Haswell generation. Moreover, Haswell adds FMA support.
#if !defined(SIMSIMD_TARGET_HASWELL) || (SIMSIMD_TARGET_HASWELL && !_SIMSIMD_TARGET_X86)
#if defined(__AVX2__) && defined(__FMA__) && defined(__F16C__)
#define SIMSIMD_TARGET_HASWELL 1
#else
#undef SIMSIMD_TARGET_HASWELL
#define SIMSIMD_TARGET_HASWELL 0
#endif // defined(__AVX2__)
#endif // !defined(SIMSIMD_TARGET_HASWELL) || ...
// Compiling for x86: SIMSIMD_TARGET_SKYLAKE, SIMSIMD_TARGET_ICE, SIMSIMD_TARGET_GENOA,
// SIMSIMD_TARGET_SAPPHIRE, SIMSIMD_TARGET_TURIN, SIMSIMD_TARGET_SIERRA
//
// To list all available macros for x86, take a recent compiler, like GCC 12 and run:
// gcc-12 -march=sapphirerapids -dM -E - < /dev/null | egrep "SSE|AVX" | sort
// On Arm machines you may want to check for other flags:
// gcc-12 -march=native -dM -E - < /dev/null | egrep "NEON|SVE|FP16|FMA" | sort
#if !defined(SIMSIMD_TARGET_SKYLAKE) || (SIMSIMD_TARGET_SKYLAKE && !_SIMSIMD_TARGET_X86)
#if defined(__AVX512F__) && defined(__AVX512CD__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && \
defined(__AVX512BW__)
#define SIMSIMD_TARGET_SKYLAKE 1
#else
#undef SIMSIMD_TARGET_SKYLAKE
#define SIMSIMD_TARGET_SKYLAKE 0
#endif
#endif // !defined(SIMSIMD_TARGET_SKYLAKE) || ...
#if !defined(SIMSIMD_TARGET_ICE) || (SIMSIMD_TARGET_ICE && !_SIMSIMD_TARGET_X86)
#if defined(__AVX512VNNI__) && defined(__AVX512IFMA__) && defined(__AVX512BITALG__) && defined(__AVX512VBMI2__) && \
defined(__AVX512VPOPCNTDQ__)
#define SIMSIMD_TARGET_ICE 1
#else
#undef SIMSIMD_TARGET_ICE
#define SIMSIMD_TARGET_ICE 0
#endif
#endif // !defined(SIMSIMD_TARGET_ICE) || ...
#if !defined(SIMSIMD_TARGET_GENOA) || (SIMSIMD_TARGET_GENOA && !_SIMSIMD_TARGET_X86)
#if defined(__AVX512BF16__)
#define SIMSIMD_TARGET_GENOA 1
#else
#undef SIMSIMD_TARGET_GENOA
#define SIMSIMD_TARGET_GENOA 0
#endif
#endif // !defined(SIMSIMD_TARGET_GENOA) || ...
#if !defined(SIMSIMD_TARGET_SAPPHIRE) || (SIMSIMD_TARGET_SAPPHIRE && !_SIMSIMD_TARGET_X86)
#if defined(__AVX512FP16__)
#define SIMSIMD_TARGET_SAPPHIRE 1
#else
#undef SIMSIMD_TARGET_SAPPHIRE
#define SIMSIMD_TARGET_SAPPHIRE 0
#endif
#endif // !defined(SIMSIMD_TARGET_SAPPHIRE) || ...
#if !defined(SIMSIMD_TARGET_TURIN) || (SIMSIMD_TARGET_TURIN && !_SIMSIMD_TARGET_X86)
#if defined(__AVX512VP2INTERSECT__)
#define SIMSIMD_TARGET_TURIN 1
#else
#undef SIMSIMD_TARGET_TURIN
#define SIMSIMD_TARGET_TURIN 0
#endif
#endif // !defined(SIMSIMD_TARGET_TURIN) || ...
#if !defined(SIMSIMD_TARGET_SIERRA) || (SIMSIMD_TARGET_SIERRA && !_SIMSIMD_TARGET_X86)
#if defined(__AVX2_VNNI__)
#define SIMSIMD_TARGET_SIERRA 1
#else
#undef SIMSIMD_TARGET_SIERRA
#define SIMSIMD_TARGET_SIERRA 0
#endif
#endif // !defined(SIMSIMD_TARGET_SIERRA) || ...
#if defined(_MSC_VER)
#include <intrin.h>
#else
#if SIMSIMD_TARGET_NEON
#include <arm_neon.h>
#endif
#if SIMSIMD_TARGET_SVE || SIMSIMD_TARGET_SVE2
#include <arm_sve.h>
#endif
#if SIMSIMD_TARGET_HASWELL || SIMSIMD_TARGET_SKYLAKE || SIMSIMD_TARGET_ICE || SIMSIMD_TARGET_GENOA || \
SIMSIMD_TARGET_SAPPHIRE || SIMSIMD_TARGET_TURIN
#include <immintrin.h>
#endif
#endif
#if !defined(SIMSIMD_SQRT)
#include <math.h>
#define SIMSIMD_SQRT(x) (sqrt(x))
#endif
#if !defined(SIMSIMD_RSQRT)
#include <math.h>
#define SIMSIMD_RSQRT(x) (1 / SIMSIMD_SQRT(x))
#endif
#if !defined(SIMSIMD_LOG)
#include <math.h>
#define SIMSIMD_LOG(x) (log(x))
#endif
// Copy 16 bits (2 bytes) from source to destination
#if defined(__GNUC__) || defined(__clang__)
#define SIMSIMD_COPY16(destination_ptr, source_ptr) __builtin_memcpy((destination_ptr), (source_ptr), 2)
#else
#include <string.h> /* fallback for exotic compilers */
#define SIMSIMD_COPY16(destination_ptr, source_ptr) memcpy((destination_ptr), (source_ptr), 2)
#endif
#if !defined(SIMSIMD_F32_DIVISION_EPSILON)
#define SIMSIMD_F32_DIVISION_EPSILON (1e-7)
#endif
#if !defined(SIMSIMD_F16_DIVISION_EPSILON)
#define SIMSIMD_F16_DIVISION_EPSILON (1e-3)
#endif
#ifdef __cplusplus
extern "C" {
#endif
typedef unsigned char simsimd_b8_t;
typedef unsigned char simsimd_i4x2_t;
typedef signed char simsimd_i8_t;
typedef unsigned char simsimd_u8_t;
typedef signed short simsimd_i16_t;
typedef unsigned short simsimd_u16_t;
typedef signed int simsimd_i32_t;
typedef unsigned int simsimd_u32_t;
typedef signed long long simsimd_i64_t;
typedef unsigned long long simsimd_u64_t;
typedef float simsimd_f32_t;
typedef double simsimd_f64_t;
typedef simsimd_u64_t simsimd_size_t;
typedef simsimd_f64_t simsimd_distance_t;
/** @brief Half-precision floating-point type.
*
* - GCC or Clang on 64-bit Arm: `__fp16`, may require `-mfp16-format` option.
* - GCC or Clang on 64-bit x86: `_Float16`.
* - Default: `unsigned short`.
*/
#if !defined(SIMSIMD_NATIVE_F16) || SIMSIMD_NATIVE_F16
#if (defined(__GNUC__) || defined(__clang__)) && (defined(__ARM_ARCH) || defined(__aarch64__)) && \
(defined(__ARM_FP16_FORMAT_IEEE))
#undef SIMSIMD_NATIVE_F16
#define SIMSIMD_NATIVE_F16 1
typedef __fp16 simsimd_f16_t;
#elif ((defined(__GNUC__) || defined(__clang__)) && (defined(__x86_64__) || defined(__i386__)) && \
(defined(__AVX512FP16__)))
typedef _Float16 simsimd_f16_t;
#undef SIMSIMD_NATIVE_F16
#define SIMSIMD_NATIVE_F16 1
#else // Unknown compiler or architecture
#if defined(__GNUC__) || defined(__clang__) // Some compilers don't support warning pragmas
#warning "Unknown compiler or architecture for float16."
#endif
#undef SIMSIMD_NATIVE_F16
#define SIMSIMD_NATIVE_F16 0
#endif // Unknown compiler or architecture
#endif // !SIMSIMD_NATIVE_F16
#if !SIMSIMD_NATIVE_F16
typedef unsigned short simsimd_f16_t;
#endif
#if !defined(SIMSIMD_NATIVE_BF16) || SIMSIMD_NATIVE_BF16
/**
* @brief Half-precision brain-float type.
*
* - GCC or Clang on 64-bit Arm: `__bf16`
* - GCC or Clang on 64-bit x86: `_BFloat16`.
* - Default: `unsigned short`.
*
* The compilers have added `__bf16` support in compliance with the x86-64 psABI spec.
* The motivation for this new special type is summed up as:
*
* Currently `__bfloat16` is a typedef of short, which creates a problem where the
* compiler does not raise any alarms if it is used to add, subtract, multiply or
* divide, but the result of the calculation is actually meaningless.
* To solve this problem, a real scalar type `__Bfloat16` needs to be introduced.
* It is mainly used for intrinsics, not available for C standard operators.
* `__Bfloat16` will also be used for movement like passing parameter, load and store,
* vector initialization, vector shuffle, and etc. It creates a need for a
* corresponding psABI.
*
* @warning Apple Clang has a hard time with bf16.
* https://developer.apple.com/documentation/xcode/writing-arm64-code-for-apple-platforms
* https://forums.developer.apple.com/forums/thread/726201
* https://www.phoronix.com/news/GCC-LLVM-bf16-BFloat16-Type
*/
#if (defined(__GNUC__) || defined(__clang__)) && (defined(__ARM_ARCH) || defined(__aarch64__)) && \
(defined(__ARM_BF16_FORMAT_ALTERNATIVE))
#undef SIMSIMD_NATIVE_BF16
#define SIMSIMD_NATIVE_BF16 1
typedef __bf16 simsimd_bf16_t;
#elif ((defined(__GNUC__) || defined(__clang__)) && (defined(__x86_64__) || defined(__i386__)) && \
(defined(__AVX512BF16__)))
typedef __bfloat16 simsimd_bf16_t;
#undef SIMSIMD_NATIVE_BF16
#define SIMSIMD_NATIVE_BF16 1
#else // Unknown compiler or architecture
#if defined(__GNUC__) || defined(__clang__) // Some compilers don't support warning pragmas
#warning "Unknown compiler or architecture for bfloat16."
#endif
#undef SIMSIMD_NATIVE_BF16
#define SIMSIMD_NATIVE_BF16 0
#endif // Unknown compiler or architecture
#endif // !SIMSIMD_NATIVE_BF16
#if !SIMSIMD_NATIVE_BF16
typedef unsigned short simsimd_bf16_t;
#endif
/**
* @brief Alias for the half-precision floating-point type on Arm.
*
* Clang and GCC bring the `float16_t` symbol when you compile for Aarch64.
* MSVC lacks it, and its `vld1_f16`-like intrinsics are in reality macros
* that cast to 16-bit integers internally, instead of using floats.
* Some of those are defined as aliases, so we use `#define` preprocessor
* directives instead of `typedef` to avoid errors.
*/
#if _SIMSIMD_TARGET_ARM
#if defined(_MSC_VER)
#define simsimd_f16_for_arm_simd_t simsimd_f16_t
#define simsimd_bf16_for_arm_simd_t simsimd_bf16_t
#else
#define simsimd_f16_for_arm_simd_t float16_t
#define simsimd_bf16_for_arm_simd_t bfloat16_t
#endif
#endif
/*
* Let's make sure the sizes of the types are as expected.
* In C the `_Static_assert` is only available with C11 and later.
*/
#define SIMSIMD_STATIC_ASSERT(cond, msg) typedef char static_assertion_##msg[(cond) ? 1 : -1]
SIMSIMD_STATIC_ASSERT(sizeof(simsimd_b8_t) == 1, simsimd_b8_t_must_be_1_byte);
SIMSIMD_STATIC_ASSERT(sizeof(simsimd_i4x2_t) == 1, simsimd_i4x2_t_must_be_1_byte);
SIMSIMD_STATIC_ASSERT(sizeof(simsimd_i8_t) == 1, simsimd_i8_t_must_be_1_byte);
SIMSIMD_STATIC_ASSERT(sizeof(simsimd_u8_t) == 1, simsimd_u8_t_must_be_1_byte);
SIMSIMD_STATIC_ASSERT(sizeof(simsimd_i16_t) == 2, simsimd_i16_t_must_be_2_bytes);
SIMSIMD_STATIC_ASSERT(sizeof(simsimd_u16_t) == 2, simsimd_u16_t_must_be_2_bytes);
SIMSIMD_STATIC_ASSERT(sizeof(simsimd_i32_t) == 4, simsimd_i32_t_must_be_4_bytes);
SIMSIMD_STATIC_ASSERT(sizeof(simsimd_u32_t) == 4, simsimd_u32_t_must_be_4_bytes);
SIMSIMD_STATIC_ASSERT(sizeof(simsimd_i64_t) == 8, simsimd_i64_t_must_be_8_bytes);
SIMSIMD_STATIC_ASSERT(sizeof(simsimd_u64_t) == 8, simsimd_u64_t_must_be_8_bytes);
SIMSIMD_STATIC_ASSERT(sizeof(simsimd_f32_t) == 4, simsimd_f32_t_must_be_4_bytes);
SIMSIMD_STATIC_ASSERT(sizeof(simsimd_f64_t) == 8, simsimd_f64_t_must_be_8_bytes);
SIMSIMD_STATIC_ASSERT(sizeof(simsimd_f16_t) == 2, simsimd_f16_t_must_be_2_bytes);
SIMSIMD_STATIC_ASSERT(sizeof(simsimd_bf16_t) == 2, simsimd_bf16_t_must_be_2_bytes);
#define SIMSIMD_DEREFERENCE(x) (*(x))
#define SIMSIMD_EXPORT(x, y) *(y) = x
/**
* @brief Returns the value of the half-precision floating-point number,
* potentially decompressed into single-precision.
*/
#if !defined(SIMSIMD_F16_TO_F32)
#if SIMSIMD_NATIVE_F16
#define SIMSIMD_F16_TO_F32(x) (SIMSIMD_DEREFERENCE(x))
#define SIMSIMD_F32_TO_F16(x, y) (SIMSIMD_EXPORT(x, y))
#else
#define SIMSIMD_F16_TO_F32(x) (simsimd_f16_to_f32(x))
#define SIMSIMD_F32_TO_F16(x, y) (simsimd_f32_to_f16(x, y))
#endif
#endif
/**
* @brief Returns the value of the half-precision brain floating-point number,
* potentially decompressed into single-precision.
*/
#if !defined(SIMSIMD_BF16_TO_F32)
#if SIMSIMD_NATIVE_BF16
#define SIMSIMD_BF16_TO_F32(x) (SIMSIMD_DEREFERENCE(x))
#define SIMSIMD_F32_TO_BF16(x, y) (SIMSIMD_EXPORT(x, y))
#else
#define SIMSIMD_BF16_TO_F32(x) (simsimd_bf16_to_f32(x))
#define SIMSIMD_F32_TO_BF16(x, y) (simsimd_f32_to_bf16(x, y))
#endif
#endif
#if !defined(SIMSIMD_F32_TO_I8)
#define SIMSIMD_F32_TO_I8(x, y) \
*(y) = (simsimd_i8_t)((x) > 127 ? 127 : ((x) < -128 ? -128 : (int)((x) + ((x) < 0 ? -0.5f : 0.5f))))
#endif
#if !defined(SIMSIMD_F32_TO_U8)
#define SIMSIMD_F32_TO_U8(x, y) \
*(y) = (simsimd_u8_t)((x) > 255 ? 255 : ((x) < 0 ? 0 : (int)((x) + ((x) < 0 ? -0.5f : 0.5f))))
#endif
#if !defined(SIMSIMD_F64_TO_I8)
#define SIMSIMD_F64_TO_I8(x, y) \
*(y) = (simsimd_i8_t)((x) > 127 ? 127 : ((x) < -128 ? -128 : (int)((x) + ((x) < 0 ? -0.5 : 0.5))))
#endif
#if !defined(SIMSIMD_F64_TO_U8)
#define SIMSIMD_F64_TO_U8(x, y) \
*(y) = (simsimd_u8_t)((x) > 255 ? 255 : ((x) < 0 ? 0 : (int)((x) + ((x) < 0 ? -0.5 : 0.5))))
#endif
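/* Informal examples for the rounding helpers above: they round half away from zero and clamp,
 * so SIMSIMD_F32_TO_I8 maps 3.6f -> 4, -3.6f -> -4, 200.f -> 127, -200.f -> -128,
 * and SIMSIMD_F32_TO_U8 maps 3.6f -> 4, -3.6f -> 0, 300.f -> 255. */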
/** @brief Convenience type for half-precision floating-point type conversions. */
typedef union {
unsigned i;
float f;
} simsimd_f32i32_t;
/** @brief Convenience type addressing the real and imaginary parts of a half-precision complex number. */
typedef struct {
simsimd_f16_t real;
simsimd_f16_t imag;
} simsimd_f16c_t;
/** @brief Convenience type addressing the real and imaginary parts of a half-precision brain-float complex number. */
typedef struct {
simsimd_bf16_t real;
simsimd_bf16_t imag;
} simsimd_bf16c_t;
/** @brief Convenience type addressing the real and imaginary parts of a single-precision complex number. */
typedef struct {
simsimd_f32_t real;
simsimd_f32_t imag;
} simsimd_f32c_t;
/** @brief Convenience type addressing the real and imaginary parts of a double-precision complex number. */
typedef struct {
simsimd_f64_t real;
simsimd_f64_t imag;
} simsimd_f64c_t;
/**
* @brief Computes `1/sqrt(x)` using the trick from Quake 3,
* replacing the magic numbers with the ones suggested by Jan Kadlec.
*
* Subsequent additions by hardware manufacturers have made this algorithm redundant for the most part.
* For example, on x86, Intel introduced the SSE instruction `rsqrtss` in 1999. In a 2009 benchmark on
* the Intel Core 2, this instruction took 0.85ns per float compared to 3.54ns for the fast inverse
* square root algorithm, and had less error. Carmack's Magic Number `rsqrt` had an average error
* of 0.0990%, while SSE `rsqrtss` had 0.0094%, a 10x improvement.
*
* https://web.archive.org/web/20210208132927/http://assemblyrequired.crashworks.org/timing-square-root/
* https://stackoverflow.com/a/41460625/2766161
*/
SIMSIMD_INTERNAL simsimd_f32_t simsimd_approximate_inverse_square_root(simsimd_f32_t number) {
simsimd_f32i32_t conv;
conv.f = number;
conv.i = 0x5F1FFFF9 - (conv.i >> 1);
// Refine using a Newton-Raphson step for better accuracy
conv.f *= 0.703952253f * (2.38924456f - number * conv.f * conv.f);
return conv.f;
}
/**
* @brief Approximates `sqrt(x)` using the fast inverse square root trick
* with adjustments for direct square root approximation.
*
* Similar to `rsqrt` approximation but multiplies by `number` to get `sqrt`.
* This technique is useful where `sqrt` approximation is needed in performance-critical code,
* though modern hardware provides optimized alternatives.
*/
SIMSIMD_INTERNAL simsimd_f32_t simsimd_approximate_square_root(simsimd_f32_t number) {
return number * simsimd_approximate_inverse_square_root(number);
}
/**
* @brief Computes `log(x)` using the Mercator series.
* The series converges to the natural logarithm for arguments x with -1 < x - 1 <= 1, i.e. x in (0, 2].
* Published in 1668 in "Logarithmotechnia".
*/
SIMSIMD_INTERNAL simsimd_f32_t simsimd_approximate_log(simsimd_f32_t number) {
simsimd_f32_t x = number - 1;
simsimd_f32_t x2 = x * x;
simsimd_f32_t x3 = x * x * x;
return x - x2 / 2 + x3 / 3;
}
/**
* @brief For compilers that don't natively support the `_Float16` type,
* upcasts contents into a more conventional `float`.
*
* @warning This function won't handle boundary conditions well.
*
* https://stackoverflow.com/a/60047308
* https://gist.github.com/milhidaka/95863906fe828198f47991c813dbe233
* https://github.com/OpenCyphal/libcanard/blob/636795f4bc395f56af8d2c61d3757b5e762bb9e5/canard.c#L811-L834
*/
SIMSIMD_INTERNAL simsimd_f32_t simsimd_f16_to_f32_implementation(simsimd_f16_t const *x_ptr) {
unsigned short x;
SIMSIMD_COPY16(&x, x_ptr);
unsigned int exponent = (x & 0x7C00) >> 10;
unsigned int mantissa = (x & 0x03FF) << 13;
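// Worked example for the normal-number path: the half 0x3C00 has exponent 15 and a zero mantissa,
// so the bits assembled below become (15 + 112) << 23 == 0x3F800000, i.e. 1.0f.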
simsimd_f32i32_t mantissa_conv;
mantissa_conv.f = (float)mantissa;
unsigned int v = (mantissa_conv.i) >> 23;
simsimd_f32i32_t conv;
conv.i = (x & 0x8000) << 16 | (exponent != 0) * ((exponent + 112) << 23 | mantissa) |
((exponent == 0) & (mantissa != 0)) * ((v - 37) << 23 | ((mantissa << (150 - v)) & 0x007FE000));
return conv.f;
}
/**
* @brief Compresses a `float` to an `f16` representation (IEEE-754 16-bit floating-point format).
*
* @warning This function won't handle boundary conditions well.
*
* https://stackoverflow.com/a/60047308
* https://gist.github.com/milhidaka/95863906fe828198f47991c813dbe233
* https://github.com/OpenCyphal/libcanard/blob/636795f4bc395f56af8d2c61d3757b5e762bb9e5/canard.c#L811-L834
*/
SIMSIMD_INTERNAL void simsimd_f32_to_f16_implementation(simsimd_f32_t x, simsimd_f16_t *result_ptr) {
simsimd_f32i32_t conv;
conv.f = x;
unsigned int b = conv.i + 0x00001000;
unsigned int e = (b & 0x7F800000) >> 23;
unsigned int m = b & 0x007FFFFF;
unsigned short result = ((b & 0x80000000) >> 16) | (e > 112) * ((((e - 112) << 10) & 0x7C00) | (m >> 13)) |
((e < 113) & (e > 101)) * ((((0x007FF000 + m) >> (125 - e)) + 1) >> 1) |
((e > 143) * 0x7FFF);
SIMSIMD_COPY16(result_ptr, &result);
}
/**
* @brief For compilers that don't natively support the `__bf16` type,
* upcasts contents into a more conventional `float`.
*
* https://stackoverflow.com/questions/55253233/convert-fp32-to-bfloat16-in-c/55254307#55254307
* https://cloud.google.com/blog/products/ai-machine-learning/bfloat16-the-secret-to-high-performance-on-cloud-tpus
*/
SIMSIMD_INTERNAL simsimd_f32_t simsimd_bf16_to_f32_implementation(simsimd_bf16_t const *x_ptr) {
unsigned short x;
SIMSIMD_COPY16(&x, x_ptr);
simsimd_f32i32_t conv;
conv.i = x << 16; // Zero extends the mantissa
return conv.f;
}
/**
* @brief Compresses a `float` to a `bf16` representation.
*
* https://stackoverflow.com/questions/55253233/convert-fp32-to-bfloat16-in-c/55254307#55254307
* https://cloud.google.com/blog/products/ai-machine-learning/bfloat16-the-secret-to-high-performance-on-cloud-tpus
*/
SIMSIMD_INTERNAL void simsimd_f32_to_bf16_implementation(simsimd_f32_t x, simsimd_bf16_t *result_ptr) {
simsimd_f32i32_t conv;
conv.f = x;
conv.i += 0x8000; // Rounding is optional
conv.i >>= 16;
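// Example: 1.005859375f has bits 0x3F80C000; adding 0x8000 and shifting right by 16
// yields 0x3F81, which decodes back to 1.0078125f, the nearest representable bf16.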
// Use an intermediate variable to ensure correct behavior on big-endian systems.
// Copying directly from `&conv.i` would copy the wrong bytes on big-endian,
// since the lower 16 bits are at offset 2, not offset 0.
unsigned short result = (unsigned short)conv.i;
SIMSIMD_COPY16(result_ptr, &result);
}
SIMSIMD_INTERNAL simsimd_u32_t simsimd_u32_rol(simsimd_u32_t x, int n) { return (x << n) | (x >> (32 - n)); }
SIMSIMD_INTERNAL simsimd_u16_t simsimd_u16_rol(simsimd_u16_t x, int n) { return (x << n) | (x >> (16 - n)); }
SIMSIMD_INTERNAL simsimd_u8_t simsimd_u8_rol(simsimd_u8_t x, int n) { return (x << n) | (x >> (8 - n)); }
SIMSIMD_INTERNAL simsimd_u32_t simsimd_u32_ror(simsimd_u32_t x, int n) { return (x >> n) | (x << (32 - n)); }
SIMSIMD_INTERNAL simsimd_u16_t simsimd_u16_ror(simsimd_u16_t x, int n) { return (x >> n) | (x << (16 - n)); }
SIMSIMD_INTERNAL simsimd_u8_t simsimd_u8_ror(simsimd_u8_t x, int n) { return (x >> n) | (x << (8 - n)); }
#if SIMSIMD_DYNAMIC_DISPATCH
/** @copydoc simsimd_f16_to_f32_implementation */
SIMSIMD_DYNAMIC simsimd_f32_t simsimd_f16_to_f32(simsimd_f16_t const *x_ptr);
/** @copydoc simsimd_f32_to_f16_implementation */
SIMSIMD_DYNAMIC void simsimd_f32_to_f16(simsimd_f32_t x, simsimd_f16_t *result_ptr);
/** @copydoc simsimd_bf16_to_f32_implementation */
SIMSIMD_DYNAMIC simsimd_f32_t simsimd_bf16_to_f32(simsimd_bf16_t const *x_ptr);
/** @copydoc simsimd_f32_to_bf16_implementation */
SIMSIMD_DYNAMIC void simsimd_f32_to_bf16(simsimd_f32_t x, simsimd_bf16_t *result_ptr);
#else // SIMSIMD_DYNAMIC_DISPATCH
/** @copydoc simsimd_f16_to_f32_implementation */
SIMSIMD_PUBLIC simsimd_f32_t simsimd_f16_to_f32(simsimd_f16_t const *x_ptr) {
return simsimd_f16_to_f32_implementation(x_ptr);
}
/** @copydoc simsimd_f32_to_f16_implementation */
SIMSIMD_PUBLIC void simsimd_f32_to_f16(simsimd_f32_t x, simsimd_f16_t *result_ptr) {
simsimd_f32_to_f16_implementation(x, result_ptr);
}
/** @copydoc simsimd_bf16_to_f32_implementation */
SIMSIMD_PUBLIC simsimd_f32_t simsimd_bf16_to_f32(simsimd_bf16_t const *x_ptr) {
return simsimd_bf16_to_f32_implementation(x_ptr);
}
/** @copydoc simsimd_f32_to_bf16_implementation */
SIMSIMD_PUBLIC void simsimd_f32_to_bf16(simsimd_f32_t x, simsimd_bf16_t *result_ptr) {
simsimd_f32_to_bf16_implementation(x, result_ptr);
}
#endif // SIMSIMD_DYNAMIC_DISPATCH
#ifdef __cplusplus
} // extern "C"
#endif
#endif
simsimd-6.5.12/rust/lib.rs 0000644 0000000 0000000 00000200555 10461020230 0013504 0 ustar 0000000 0000000 //! # SpatialSimilarity - Hardware-Accelerated Similarity Metrics and Distance Functions
//!
//! * Targets ARM NEON, SVE, x86 AVX2, AVX-512 (VNNI, FP16) hardware backends.
//! * Handles `f64` double- and `f32` single-precision, integral, and binary vectors.
//! * Exposes half-precision (`f16`) and brain floating point (`bf16`) types.
//! * Zero-dependency header-only C 99 library with bindings for Rust and other languages.
//!
//! ## Implemented distance functions include:
//!
//! * Euclidean (L2), inner product, and cosine (angular) spatial distances.
//! * Hamming (~ Manhattan) and Jaccard (~ Tanimoto) binary distances.
//! * Kullback-Leibler and Jensen-Shannon divergences for probability distributions.
//!
//! ## Example
//!
//! ```rust
//! use simsimd::SpatialSimilarity;
//!
//! let a = &[1, 2, 3];
//! let b = &[4, 5, 6];
//!
//! // Compute cosine distance
//! let cos_dist = i8::cos(a, b);
//!
//! // Compute dot product distance
//! let dot_product = i8::dot(a, b);
//!
//! // Compute squared Euclidean distance
//! let l2sq_dist = i8::l2sq(a, b);
//!
//! // Optimize performance by flushing denormals
//! simsimd::capabilities::flush_denormals();
//! ```
//!
//! ## Mixed Precision Support
//!
//! ```rust
//! use simsimd::{SpatialSimilarity, f16, bf16};
//!
//! // Work with half-precision floats
//! let half_a: Vec<f16> = vec![1.0, 2.0, 3.0].iter().map(|&x| f16::from_f32(x)).collect();
//! let half_b: Vec<f16> = vec![4.0, 5.0, 6.0].iter().map(|&x| f16::from_f32(x)).collect();
//! let half_cos_dist = f16::cos(&half_a, &half_b);
//!
//! // Work with brain floats
//! let brain_a: Vec<bf16> = vec![1.0, 2.0, 3.0].iter().map(|&x| bf16::from_f32(x)).collect();
//! let brain_b: Vec<bf16> = vec![4.0, 5.0, 6.0].iter().map(|&x| bf16::from_f32(x)).collect();
//! let brain_cos_dist = bf16::cos(&brain_a, &brain_b);
//!
//! // Direct bit manipulation
//! let half = f16::from_f32(3.14);
//! let bits = half.0; // Access raw u16 representation
//! let reconstructed = f16(bits);
//! ```
//!
//! ## Traits
//!
//! The `SpatialSimilarity` trait covers the following methods:
//!
//! - `cosine(a: &[Self], b: &[Self]) -> Option<Distance>`: Computes cosine distance (1 - similarity) between two slices.
//! - `dot(a: &[Self], b: &[Self]) -> Option<Distance>`: Computes dot product distance between two slices.
//! - `sqeuclidean(a: &[Self], b: &[Self]) -> Option<Distance>`: Computes squared Euclidean distance between two slices.
//!
//! The `BinarySimilarity` trait covers the following methods:
//!
//! - `hamming(a: &[Self], b: &[Self]) -> Option<Distance>`: Computes Hamming distance between two slices.
//! - `jaccard(a: &[Self], b: &[Self]) -> Option<Distance>`: Computes Jaccard distance between two slices.
//!
//! The `ProbabilitySimilarity` trait covers the following methods:
//!
//! - `jensenshannon(a: &[Self], b: &[Self]) -> Option<Distance>`: Computes Jensen-Shannon divergence between two slices.
//! - `kullbackleibler(a: &[Self], b: &[Self]) -> Option<Distance>`: Computes Kullback-Leibler divergence between two slices.
//!
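//! The convenience aliases delegate to the core methods, so the following minimal
//! sketch (with arbitrary values) always holds:
//!
//! ```rust
//! use simsimd::SpatialSimilarity;
//!
//! let a = &[3i8, 4, 5];
//! let b = &[6i8, 7, 8];
//! assert_eq!(i8::cosine(a, b), i8::cos(a, b));
//! assert_eq!(i8::sqeuclidean(a, b), i8::l2sq(a, b));
//! ```
//!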
#![allow(non_camel_case_types)]
#![cfg_attr(all(not(test), not(feature = "std")), no_std)]
pub type Distance = f64;
pub type ComplexProduct = (f64, f64);
/// Size type used in C FFI to match `simsimd_size_t` which is always `uint64_t`.
/// This is aliased to `u64` instead of `usize` to maintain ABI compatibility across
/// all platforms, including 32-bit architectures where `usize` is 32-bit but the
/// C library expects 64-bit size parameters.
///
/// TODO: In v7, change the C library to use `size_t` and this to `usize`.
type u64size = u64;
/// Compatibility function for pre 1.85 Rust versions lacking `f32::abs`.
#[inline(always)]
fn f32_abs_compat(x: f32) -> f32 {
f32::from_bits(x.to_bits() & 0x7FFF_FFFF)
}
#[link(name = "simsimd")]
extern "C" {
fn simsimd_dot_i8(a: *const i8, b: *const i8, c: u64size, d: *mut Distance);
fn simsimd_dot_f16(a: *const u16, b: *const u16, c: u64size, d: *mut Distance);
fn simsimd_dot_bf16(a: *const u16, b: *const u16, c: u64size, d: *mut Distance);
fn simsimd_dot_f32(a: *const f32, b: *const f32, c: u64size, d: *mut Distance);
fn simsimd_dot_f64(a: *const f64, b: *const f64, c: u64size, d: *mut Distance);
fn simsimd_dot_f16c(a: *const u16, b: *const u16, c: u64size, d: *mut Distance);
fn simsimd_dot_bf16c(a: *const u16, b: *const u16, c: u64size, d: *mut Distance);
fn simsimd_dot_f32c(a: *const f32, b: *const f32, c: u64size, d: *mut Distance);
fn simsimd_dot_f64c(a: *const f64, b: *const f64, c: u64size, d: *mut Distance);
fn simsimd_vdot_f16c(a: *const u16, b: *const u16, c: u64size, d: *mut Distance);
fn simsimd_vdot_bf16c(a: *const u16, b: *const u16, c: u64size, d: *mut Distance);
fn simsimd_vdot_f32c(a: *const f32, b: *const f32, c: u64size, d: *mut Distance);
fn simsimd_vdot_f64c(a: *const f64, b: *const f64, c: u64size, d: *mut Distance);
fn simsimd_cos_i8(a: *const i8, b: *const i8, c: u64size, d: *mut Distance);
fn simsimd_cos_f16(a: *const u16, b: *const u16, c: u64size, d: *mut Distance);
fn simsimd_cos_bf16(a: *const u16, b: *const u16, c: u64size, d: *mut Distance);
fn simsimd_cos_f32(a: *const f32, b: *const f32, c: u64size, d: *mut Distance);
fn simsimd_cos_f64(a: *const f64, b: *const f64, c: u64size, d: *mut Distance);
fn simsimd_l2sq_i8(a: *const i8, b: *const i8, c: u64size, d: *mut Distance);
fn simsimd_l2sq_f16(a: *const u16, b: *const u16, c: u64size, d: *mut Distance);
fn simsimd_l2sq_bf16(a: *const u16, b: *const u16, c: u64size, d: *mut Distance);
fn simsimd_l2sq_f32(a: *const f32, b: *const f32, c: u64size, d: *mut Distance);
fn simsimd_l2sq_f64(a: *const f64, b: *const f64, c: u64size, d: *mut Distance);
fn simsimd_l2_i8(a: *const i8, b: *const i8, c: u64size, d: *mut Distance);
fn simsimd_l2_f16(a: *const u16, b: *const u16, c: u64size, d: *mut Distance);
fn simsimd_l2_bf16(a: *const u16, b: *const u16, c: u64size, d: *mut Distance);
fn simsimd_l2_f32(a: *const f32, b: *const f32, c: u64size, d: *mut Distance);
fn simsimd_l2_f64(a: *const f64, b: *const f64, c: u64size, d: *mut Distance);
fn simsimd_hamming_b8(a: *const u8, b: *const u8, c: u64size, d: *mut Distance);
fn simsimd_jaccard_b8(a: *const u8, b: *const u8, c: u64size, d: *mut Distance);
fn simsimd_js_f16(a: *const u16, b: *const u16, c: u64size, d: *mut Distance);
fn simsimd_js_bf16(a: *const u16, b: *const u16, c: u64size, d: *mut Distance);
fn simsimd_js_f32(a: *const f32, b: *const f32, c: u64size, d: *mut Distance);
fn simsimd_js_f64(a: *const f64, b: *const f64, c: u64size, d: *mut Distance);
fn simsimd_kl_f16(a: *const u16, b: *const u16, c: u64size, d: *mut Distance);
fn simsimd_kl_bf16(a: *const u16, b: *const u16, c: u64size, d: *mut Distance);
fn simsimd_kl_f32(a: *const f32, b: *const f32, c: u64size, d: *mut Distance);
fn simsimd_kl_f64(a: *const f64, b: *const f64, c: u64size, d: *mut Distance);
fn simsimd_intersect_u16(
a: *const u16,
b: *const u16,
a_length: u64size,
b_length: u64size,
d: *mut Distance,
);
fn simsimd_intersect_u32(
a: *const u32,
b: *const u32,
a_length: u64size,
b_length: u64size,
d: *mut Distance,
);
fn simsimd_uses_neon() -> i32;
fn simsimd_uses_neon_f16() -> i32;
fn simsimd_uses_neon_bf16() -> i32;
fn simsimd_uses_neon_i8() -> i32;
fn simsimd_uses_sve() -> i32;
fn simsimd_uses_sve_f16() -> i32;
fn simsimd_uses_sve_bf16() -> i32;
fn simsimd_uses_sve_i8() -> i32;
fn simsimd_uses_haswell() -> i32;
fn simsimd_uses_skylake() -> i32;
fn simsimd_uses_ice() -> i32;
fn simsimd_uses_genoa() -> i32;
fn simsimd_uses_sapphire() -> i32;
fn simsimd_uses_turin() -> i32;
fn simsimd_uses_sierra() -> i32;
fn simsimd_flush_denormals() -> i32;
fn simsimd_uses_dynamic_dispatch() -> i32;
fn simsimd_f32_to_f16(f32_value: f32, result_ptr: *mut u16);
fn simsimd_f16_to_f32(f16_ptr: *const u16) -> f32;
fn simsimd_f32_to_bf16(f32_value: f32, result_ptr: *mut u16);
fn simsimd_bf16_to_f32(bf16_ptr: *const u16) -> f32;
}
/// A half-precision (16-bit) floating point number.
///
/// This type represents IEEE 754 half-precision binary floating-point format.
/// It provides conversion methods to and from f32, and the underlying u16
/// representation is publicly accessible for direct bit manipulation.
///
/// # Examples
///
/// ```
/// use simsimd::f16;
///
/// // Create from f32
/// let half = f16::from_f32(3.14);
///
/// // Convert back to f32
/// let float = half.to_f32();
///
/// // Direct access to bits
/// let bits = half.0;
/// ```
#[repr(transparent)]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct f16(pub u16);
impl f16 {
/// Positive zero.
pub const ZERO: Self = f16(0);
/// Positive one.
pub const ONE: Self = f16(0x3C00);
/// Negative one.
pub const NEG_ONE: Self = f16(0xBC00);
/// Converts an f32 to f16 representation.
///
/// # Examples
///
/// ```
/// use simsimd::f16;
/// let half = f16::from_f32(3.14159);
/// ```
#[inline(always)]
pub fn from_f32(value: f32) -> Self {
let mut result: u16 = 0;
unsafe { simsimd_f32_to_f16(value, &mut result) };
f16(result)
}
/// Converts the f16 to an f32.
///
/// # Examples
///
/// ```
/// use simsimd::f16;
/// let half = f16::from_f32(3.14159);
/// let float = half.to_f32();
/// ```
#[inline(always)]
pub fn to_f32(self) -> f32 {
unsafe { simsimd_f16_to_f32(&self.0) }
}
/// Returns true if this value is NaN.
#[inline(always)]
pub fn is_nan(self) -> bool {
self.to_f32().is_nan()
}
/// Returns true if this value is positive or negative infinity.
#[inline(always)]
pub fn is_infinite(self) -> bool {
self.to_f32().is_infinite()
}
/// Returns true if this number is neither infinite nor NaN.
#[inline(always)]
pub fn is_finite(self) -> bool {
self.to_f32().is_finite()
}
/// Returns the absolute value of self.
#[inline(always)]
pub fn abs(self) -> Self {
Self::from_f32(f32_abs_compat(self.to_f32()))
}
/// Returns the largest integer less than or equal to a number.
///
/// This method is only available when the `std` feature is enabled.
#[cfg(feature = "std")]
#[inline(always)]
pub fn floor(self) -> Self {
Self::from_f32(self.to_f32().floor())
}
/// Returns the smallest integer greater than or equal to a number.
///
/// This method is only available when the `std` feature is enabled.
#[cfg(feature = "std")]
#[inline(always)]
pub fn ceil(self) -> Self {
Self::from_f32(self.to_f32().ceil())
}
/// Returns the nearest integer to a number. Round half-way cases away from 0.0.
///
/// This method is only available when the `std` feature is enabled.
#[cfg(feature = "std")]
#[inline(always)]
pub fn round(self) -> Self {
Self::from_f32(self.to_f32().round())
}
}
#[cfg(feature = "std")]
impl core::fmt::Display for f16 {
fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
write!(f, "{}", self.to_f32())
}
}
impl core::ops::Add for f16 {
type Output = Self;
#[inline(always)]
fn add(self, rhs: Self) -> Self::Output {
Self::from_f32(self.to_f32() + rhs.to_f32())
}
}
impl core::ops::Sub for f16 {
type Output = Self;
#[inline(always)]
fn sub(self, rhs: Self) -> Self::Output {
Self::from_f32(self.to_f32() - rhs.to_f32())
}
}
impl core::ops::Mul for f16 {
type Output = Self;
#[inline(always)]
fn mul(self, rhs: Self) -> Self::Output {
Self::from_f32(self.to_f32() * rhs.to_f32())
}
}
impl core::ops::Div for f16 {
type Output = Self;
#[inline(always)]
fn div(self, rhs: Self) -> Self::Output {
Self::from_f32(self.to_f32() / rhs.to_f32())
}
}
impl core::ops::Neg for f16 {
type Output = Self;
#[inline(always)]
fn neg(self) -> Self::Output {
Self::from_f32(-self.to_f32())
}
}
impl core::cmp::PartialOrd for f16 {
#[inline(always)]
fn partial_cmp(&self, other: &Self) -> Option<core::cmp::Ordering> {
self.to_f32().partial_cmp(&other.to_f32())
}
}
/// A brain floating point (bfloat16) number.
///
/// This type represents Google's bfloat16 format, which truncates IEEE 754
/// single-precision to 16 bits by keeping the exponent bits but reducing
/// the mantissa. This provides a wider range than f16 but lower precision.
///
/// # Examples
///
/// ```
/// use simsimd::bf16;
///
/// // Create from f32
/// let brain_half = bf16::from_f32(3.14);
///
/// // Convert back to f32
/// let float = brain_half.to_f32();
///
/// // Direct access to bits
/// let bits = brain_half.0;
/// ```
#[repr(transparent)]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct bf16(pub u16);
impl bf16 {
/// Positive zero.
pub const ZERO: Self = bf16(0);
/// Positive one.
pub const ONE: Self = bf16(0x3F80);
/// Negative one.
pub const NEG_ONE: Self = bf16(0xBF80);
/// Converts an f32 to bf16 representation.
///
/// # Examples
///
/// ```
/// use simsimd::bf16;
/// let brain_half = bf16::from_f32(3.14159);
/// ```
#[inline(always)]
pub fn from_f32(value: f32) -> Self {
let mut result: u16 = 0;
unsafe { simsimd_f32_to_bf16(value, &mut result) };
bf16(result)
}
/// Converts the bf16 to an f32.
///
/// # Examples
///
/// ```
/// use simsimd::bf16;
/// let brain_half = bf16::from_f32(3.14159);
/// let float = brain_half.to_f32();
/// ```
#[inline(always)]
pub fn to_f32(self) -> f32 {
unsafe { simsimd_bf16_to_f32(&self.0) }
}
/// Returns true if this value is NaN.
#[inline(always)]
pub fn is_nan(self) -> bool {
self.to_f32().is_nan()
}
/// Returns true if this value is positive or negative infinity.
#[inline(always)]
pub fn is_infinite(self) -> bool {
self.to_f32().is_infinite()
}
/// Returns true if this number is neither infinite nor NaN.
#[inline(always)]
pub fn is_finite(self) -> bool {
self.to_f32().is_finite()
}
/// Returns the absolute value of self.
#[inline(always)]
pub fn abs(self) -> Self {
Self::from_f32(f32_abs_compat(self.to_f32()))
}
/// Returns the largest integer less than or equal to a number.
///
/// This method is only available when the `std` feature is enabled.
#[cfg(feature = "std")]
#[inline(always)]
pub fn floor(self) -> Self {
Self::from_f32(self.to_f32().floor())
}
/// Returns the smallest integer greater than or equal to a number.
///
/// This method is only available when the `std` feature is enabled.
#[cfg(feature = "std")]
#[inline(always)]
pub fn ceil(self) -> Self {
Self::from_f32(self.to_f32().ceil())
}
/// Returns the nearest integer to a number. Round half-way cases away from 0.0.
///
/// This method is only available when the `std` feature is enabled.
#[cfg(feature = "std")]
#[inline(always)]
pub fn round(self) -> Self {
Self::from_f32(self.to_f32().round())
}
}
#[cfg(feature = "std")]
impl core::fmt::Display for bf16 {
fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
write!(f, "{}", self.to_f32())
}
}
impl core::ops::Add for bf16 {
type Output = Self;
#[inline(always)]
fn add(self, rhs: Self) -> Self::Output {
Self::from_f32(self.to_f32() + rhs.to_f32())
}
}
impl core::ops::Sub for bf16 {
type Output = Self;
#[inline(always)]
fn sub(self, rhs: Self) -> Self::Output {
Self::from_f32(self.to_f32() - rhs.to_f32())
}
}
impl core::ops::Mul for bf16 {
type Output = Self;
#[inline(always)]
fn mul(self, rhs: Self) -> Self::Output {
Self::from_f32(self.to_f32() * rhs.to_f32())
}
}
impl core::ops::Div for bf16 {
type Output = Self;
#[inline(always)]
fn div(self, rhs: Self) -> Self::Output {
Self::from_f32(self.to_f32() / rhs.to_f32())
}
}
impl core::ops::Neg for bf16 {
type Output = Self;
#[inline(always)]
fn neg(self) -> Self::Output {
Self::from_f32(-self.to_f32())
}
}
impl core::cmp::PartialOrd for bf16 {
#[inline(always)]
fn partial_cmp(&self, other: &Self) -> Option<core::cmp::Ordering> {
self.to_f32().partial_cmp(&other.to_f32())
}
}
/// The `capabilities` module provides functions for detecting the hardware features
/// available on the current system.
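///
/// A minimal usage sketch (every function shown is declared in this module):
///
/// ```rust
/// println!("dynamic dispatch: {}", simsimd::capabilities::uses_dynamic_dispatch());
/// println!("NEON: {}", simsimd::capabilities::uses_neon());
/// println!("Skylake-level AVX-512: {}", simsimd::capabilities::uses_skylake());
/// ```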
pub mod capabilities {
pub fn uses_neon() -> bool {
unsafe { crate::simsimd_uses_neon() != 0 }
}
pub fn uses_neon_f16() -> bool {
unsafe { crate::simsimd_uses_neon_f16() != 0 }
}
pub fn uses_neon_bf16() -> bool {
unsafe { crate::simsimd_uses_neon_bf16() != 0 }
}
pub fn uses_neon_i8() -> bool {
unsafe { crate::simsimd_uses_neon_i8() != 0 }
}
pub fn uses_sve() -> bool {
unsafe { crate::simsimd_uses_sve() != 0 }
}
pub fn uses_sve_f16() -> bool {
unsafe { crate::simsimd_uses_sve_f16() != 0 }
}
pub fn uses_sve_bf16() -> bool {
unsafe { crate::simsimd_uses_sve_bf16() != 0 }
}
pub fn uses_sve_i8() -> bool {
unsafe { crate::simsimd_uses_sve_i8() != 0 }
}
pub fn uses_haswell() -> bool {
unsafe { crate::simsimd_uses_haswell() != 0 }
}
pub fn uses_skylake() -> bool {
unsafe { crate::simsimd_uses_skylake() != 0 }
}
pub fn uses_ice() -> bool {
unsafe { crate::simsimd_uses_ice() != 0 }
}
pub fn uses_genoa() -> bool {
unsafe { crate::simsimd_uses_genoa() != 0 }
}
pub fn uses_sapphire() -> bool {
unsafe { crate::simsimd_uses_sapphire() != 0 }
}
pub fn uses_turin() -> bool {
unsafe { crate::simsimd_uses_turin() != 0 }
}
pub fn uses_sierra() -> bool {
unsafe { crate::simsimd_uses_sierra() != 0 }
}
/// Flushes denormalized numbers to zero on the current CPU architecture.
///
/// This function should be called on each thread before any SIMD operations
/// to avoid performance penalties. When facing denormalized values,
/// Fused-Multiply-Add (FMA) operations can be up to 30x slower.
///
/// # Returns
///
/// Returns `true` if the operation was successful, `false` otherwise.
pub fn flush_denormals() -> bool {
unsafe { crate::simsimd_flush_denormals() != 0 }
}
/// Checks if the library is using dynamic dispatch for function selection.
///
/// # Returns
///
/// Returns `true` when the C backend is compiled with dynamic dispatch
/// (default for this crate via `build.rs`), otherwise `false`.
pub fn uses_dynamic_dispatch() -> bool {
unsafe { crate::simsimd_uses_dynamic_dispatch() != 0 }
}
}
/// `SpatialSimilarity` provides a set of trait methods for computing similarity
/// or distance between spatial data vectors in SIMD (Single Instruction, Multiple Data) context.
/// These methods can be used to calculate metrics like cosine distance, dot product,
/// and squared Euclidean distance between two slices of data.
///
/// Each method takes two slices of data (a and b) and returns an `Option<Distance>`.
/// The result is `None` if the slices are not of the same length, as these operations
/// require one-to-one correspondence between the elements of the slices.
/// Otherwise, it returns the computed similarity or distance as `Some(f64)`.
/// Convenience methods like `cosine`/`sqeuclidean` delegate to the core methods
/// `cos`/`l2sq` implemented by this trait.
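///
/// A minimal sketch with arbitrary `f32` values, showing the `None` on mismatched lengths:
///
/// ```rust
/// use simsimd::SpatialSimilarity;
///
/// let a: &[f32] = &[1.0, 0.0, 0.0];
/// let b: &[f32] = &[0.0, 1.0, 0.0];
/// assert!(f32::cosine(a, b).is_some());
/// assert_eq!(f32::cosine(a, &b[..2]), None); // lengths differ
/// ```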
pub trait SpatialSimilarity
where
Self: Sized,
{
/// Computes the cosine distance between two slices.
/// The cosine distance is 1 minus the cosine similarity between two non-zero vectors
/// of a dot product space that measures the cosine of the angle between them.
fn cos(a: &[Self], b: &[Self]) -> Option<Distance>;
/// Computes the inner product (also known as dot product) between two slices.
/// The dot product is the sum of the products of the corresponding entries
/// of the two sequences of numbers.
fn dot(a: &[Self], b: &[Self]) -> Option<Distance>;
/// Computes the squared Euclidean distance between two slices.
/// The squared Euclidean distance is the sum of the squared differences
/// between corresponding elements of the two slices.
fn l2sq(a: &[Self], b: &[Self]) -> Option<Distance>;
/// Computes the Euclidean distance between two slices.
/// The Euclidean distance is the square root of the
/// sum of the squared differences between corresponding
/// elements of the two slices.
fn l2(a: &[Self], b: &[Self]) -> Option<Distance>;
/// Computes the squared Euclidean distance between two slices.
/// The squared Euclidean distance is the sum of the squared differences
/// between corresponding elements of the two slices.
fn sqeuclidean(a: &[Self], b: &[Self]) -> Option<Distance> {
SpatialSimilarity::l2sq(a, b)
}
/// Computes the Euclidean distance between two slices.
/// The Euclidean distance is the square root of the
/// sum of the squared differences between corresponding
/// elements of the two slices.
fn euclidean(a: &[Self], b: &[Self]) -> Option<Distance> {
SpatialSimilarity::l2(a, b)
}
/// Computes the inner product (also known as dot product) between two slices.
/// The dot product is the sum of the products of the corresponding entries
/// of the two sequences of numbers.
fn inner(a: &[Self], b: &[Self]) -> Option<Distance> {
SpatialSimilarity::dot(a, b)
}
/// Computes the cosine distance between two slices.
/// The cosine distance is 1 minus the cosine similarity between two non-zero vectors
/// of a dot product space that measures the cosine of the angle between them.
fn cosine(a: &[Self], b: &[Self]) -> Option<Distance> {
SpatialSimilarity::cos(a, b)
}
}
/// `BinarySimilarity` provides trait methods for computing similarity metrics
/// that are commonly used with binary data vectors, such as Hamming distance
/// and Jaccard index.
///
/// The methods accept two slices of binary data and return an `Option<Distance>`
/// indicating the computed similarity or distance, with `None` returned if the
/// slices differ in length.
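///
/// A minimal sketch over byte-packed bitsets (arbitrary values):
///
/// ```rust
/// use simsimd::BinarySimilarity;
///
/// let a: &[u8] = &[0b1010_1010, 0xFF];
/// let b: &[u8] = &[0b0101_0101, 0xFF];
/// let bits = u8::hamming(a, b);
/// assert!(bits.is_some()); // the first byte differs in all 8 bits, the second in none
/// ```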
pub trait BinarySimilarity
where
Self: Sized,
{
/// Computes the Hamming distance between two binary data slices.
/// The Hamming distance between two strings of equal length is the number of
/// bits at which the corresponding values are different.
fn hamming(a: &[Self], b: &[Self]) -> Option<Distance>;
/// Computes the Jaccard index between two bitsets represented by binary data slices.
/// The Jaccard index, also known as the Jaccard similarity coefficient, is a statistic
/// used for gauging the similarity and diversity of sample sets.
fn jaccard(a: &[Self], b: &[Self]) -> Option<Distance>;
}
/// `ProbabilitySimilarity` provides trait methods for computing similarity or divergence
/// measures between probability distributions, such as the Jensen-Shannon divergence
/// and the Kullback-Leibler divergence.
///
/// These methods are particularly useful in contexts such as information theory and
/// machine learning, where one often needs to measure how one probability distribution
/// differs from a second, reference probability distribution.
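///
/// A minimal sketch with two short `f32` distributions (arbitrary values):
///
/// ```rust
/// use simsimd::ProbabilitySimilarity;
///
/// let p: &[f32] = &[0.1, 0.9];
/// let q: &[f32] = &[0.2, 0.8];
/// assert!(f32::kullbackleibler(p, q).is_some());
/// assert!(f32::jensenshannon(p, q).is_some());
/// ```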
pub trait ProbabilitySimilarity
where
Self: Sized,
{
/// Computes the Jensen-Shannon divergence between two probability distributions.
/// The Jensen-Shannon divergence is a method of measuring the similarity between
/// two probability distributions. It is based on the Kullback-Leibler divergence,
/// but is symmetric and always has a finite value.
fn jensenshannon(a: &[Self], b: &[Self]) -> Option<Distance>;
/// Computes the Kullback-Leibler divergence between two probability distributions.
/// The Kullback-Leibler divergence is a measure of how one probability distribution
/// diverges from a second, expected probability distribution.
fn kullbackleibler(a: &[Self], b: &[Self]) -> Option<Distance>;
}
/// `ComplexProducts` provides trait methods for computing products between
/// complex number vectors. This includes standard and Hermitian dot products.
pub trait ComplexProducts
where
Self: Sized,
{
/// Computes the dot product between two complex number vectors.
fn dot(a: &[Self], b: &[Self]) -> Option<ComplexProduct>;
/// Computes the Hermitian dot product (conjugate dot product) between two complex number vectors.
fn vdot(a: &[Self], b: &[Self]) -> Option<ComplexProduct>;
}
/// `Sparse` provides trait methods for sparse vectors.
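///
/// A minimal sketch with sorted `u32` identifier lists (arbitrary values):
///
/// ```rust
/// use simsimd::Sparse;
///
/// let a: &[u32] = &[1, 3, 5, 7];
/// let b: &[u32] = &[3, 4, 5, 6];
/// let shared = u32::intersect(a, b);
/// assert!(shared.is_some()); // the common elements are 3 and 5
/// ```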
pub trait Sparse
where
Self: Sized,
{
/// Computes the number of common elements between two sparse vectors.
/// Both vectors must be sorted in ascending order.
fn intersect(a: &[Self], b: &[Self]) -> Option<Distance>;
}
impl BinarySimilarity for u8 {
fn hamming(a: &[Self], b: &[Self]) -> Option<Distance> {
if a.len() != b.len() {
return None;
}
let mut distance_value: Distance = 0.0;
let distance_ptr: *mut Distance = &mut distance_value as *mut Distance;
unsafe { simsimd_hamming_b8(a.as_ptr(), b.as_ptr(), a.len() as u64size, distance_ptr) };
Some(distance_value)
}
fn jaccard(a: &[Self], b: &[Self]) -> Option<Distance> {
if a.len() != b.len() {
return None;
}
let mut distance_value: Distance = 0.0;
let distance_ptr: *mut Distance = &mut distance_value as *mut Distance;
unsafe { simsimd_jaccard_b8(a.as_ptr(), b.as_ptr(), a.len() as u64size, distance_ptr) };
Some(distance_value)
}
}
impl SpatialSimilarity for i8 {
fn cos(a: &[Self], b: &[Self]) -> Option<Distance> {
if a.len() != b.len() {
return None;
}
let mut distance_value: Distance = 0.0;
let distance_ptr: *mut Distance = &mut distance_value as *mut Distance;
unsafe { simsimd_cos_i8(a.as_ptr(), b.as_ptr(), a.len() as u64size, distance_ptr) };
Some(distance_value)
}
fn dot(a: &[Self], b: &[Self]) -> Option<Distance> {
if a.len() != b.len() {
return None;
}
let mut distance_value: Distance = 0.0;
let distance_ptr: *mut Distance = &mut distance_value as *mut Distance;
unsafe { simsimd_dot_i8(a.as_ptr(), b.as_ptr(), a.len() as u64size, distance_ptr) };
Some(distance_value)
}
fn l2sq(a: &[Self], b: &[Self]) -> Option<Distance> {
if a.len() != b.len() {
return None;
}
let mut distance_value: Distance = 0.0;
let distance_ptr: *mut Distance = &mut distance_value as *mut Distance;
unsafe { simsimd_l2sq_i8(a.as_ptr(), b.as_ptr(), a.len() as u64size, distance_ptr) };
Some(distance_value)
}
fn l2(a: &[Self], b: &[Self]) -> Option<Distance> {
if a.len() != b.len() {
return None;
}
let mut distance_value: Distance = 0.0;
let distance_ptr: *mut Distance = &mut distance_value as *mut Distance;
unsafe { simsimd_l2_i8(a.as_ptr(), b.as_ptr(), a.len() as u64size, distance_ptr) };
Some(distance_value)
}
}
impl Sparse for u16 {
fn intersect(a: &[Self], b: &[Self]) -> Option<Distance> {
let mut distance_value: Distance = 0.0;
let distance_ptr: *mut Distance = &mut distance_value as *mut Distance;
unsafe {
simsimd_intersect_u16(
a.as_ptr(),
b.as_ptr(),
a.len() as u64size,
b.len() as u64size,
distance_ptr,
)
};
Some(distance_value)
}
}
impl Sparse for u32 {
fn intersect(a: &[Self], b: &[Self]) -> Option<Distance> {
let mut distance_value: Distance = 0.0;
let distance_ptr: *mut Distance = &mut distance_value as *mut Distance;
unsafe {
simsimd_intersect_u32(
a.as_ptr(),
b.as_ptr(),
a.len() as u64size,
b.len() as u64size,
distance_ptr,
)
};
Some(distance_value)
}
}
impl SpatialSimilarity for f16 {
fn cos(a: &[Self], b: &[Self]) -> Option<Distance> {
if a.len() != b.len() {
return None;
}
// Explicitly cast `*const f16` to `*const u16`
let a_ptr = a.as_ptr() as *const u16;
let b_ptr = b.as_ptr() as *const u16;
let mut distance_value: Distance = 0.0;
let distance_ptr: *mut Distance = &mut distance_value as *mut Distance;
unsafe { simsimd_cos_f16(a_ptr, b_ptr, a.len() as u64size, distance_ptr) };
Some(distance_value)
}
fn dot(a: &[Self], b: &[Self]) -> Option<Distance> {
if a.len() != b.len() {
return None;
}
// Explicitly cast `*const f16` to `*const u16`
let a_ptr = a.as_ptr() as *const u16;
let b_ptr = b.as_ptr() as *const u16;
let mut distance_value: Distance = 0.0;
let distance_ptr: *mut Distance = &mut distance_value as *mut Distance;
unsafe { simsimd_dot_f16(a_ptr, b_ptr, a.len() as u64size, distance_ptr) };
Some(distance_value)
}
fn l2sq(a: &[Self], b: &[Self]) -> Option<Distance> {
if a.len() != b.len() {
return None;
}
// Explicitly cast `*const f16` to `*const u16`
let a_ptr = a.as_ptr() as *const u16;
let b_ptr = b.as_ptr() as *const u16;
let mut distance_value: Distance = 0.0;
let distance_ptr: *mut Distance = &mut distance_value as *mut Distance;
unsafe { simsimd_l2sq_f16(a_ptr, b_ptr, a.len() as u64size, distance_ptr) };
Some(distance_value)
}
fn l2(a: &[Self], b: &[Self]) -> Option<Distance> {
if a.len() != b.len() {
return None;
}
// Explicitly cast `*const f16` to `*const u16`
let a_ptr = a.as_ptr() as *const u16;
let b_ptr = b.as_ptr() as *const u16;
let mut distance_value: Distance = 0.0;
let distance_ptr: *mut Distance = &mut distance_value as *mut Distance;
unsafe { simsimd_l2_f16(a_ptr, b_ptr, a.len() as u64size, distance_ptr) };
Some(distance_value)
}
}
impl SpatialSimilarity for bf16 {
fn cos(a: &[Self], b: &[Self]) -> Option<Distance> {
if a.len() != b.len() {
return None;
}
// Explicitly cast `*const bf16` to `*const u16`
let a_ptr = a.as_ptr() as *const u16;
let b_ptr = b.as_ptr() as *const u16;
let mut distance_value: Distance = 0.0;
let distance_ptr: *mut Distance = &mut distance_value as *mut Distance;
unsafe { simsimd_cos_bf16(a_ptr, b_ptr, a.len() as u64size, distance_ptr) };
Some(distance_value)
}
fn dot(a: &[Self], b: &[Self]) -> Option<Distance> {
if a.len() != b.len() {
return None;
}
// Explicitly cast `*const bf16` to `*const u16`
let a_ptr = a.as_ptr() as *const u16;
let b_ptr = b.as_ptr() as *const u16;
let mut distance_value: Distance = 0.0;
let distance_ptr: *mut Distance = &mut distance_value as *mut Distance;
unsafe { simsimd_dot_bf16(a_ptr, b_ptr, a.len() as u64size, distance_ptr) };
Some(distance_value)
}
fn l2sq(a: &[Self], b: &[Self]) -> Option<Distance> {
if a.len() != b.len() {
return None;
}
// Explicitly cast `*const bf16` to `*const u16`
let a_ptr = a.as_ptr() as *const u16;
let b_ptr = b.as_ptr() as *const u16;
let mut distance_value: Distance = 0.0;
let distance_ptr: *mut Distance = &mut distance_value as *mut Distance;
unsafe { simsimd_l2sq_bf16(a_ptr, b_ptr, a.len() as u64size, distance_ptr) };
Some(distance_value)
}
fn l2(a: &[Self], b: &[Self]) -> Option<Distance> {
if a.len() != b.len() {
return None;
}
// Explicitly cast `*const bf16` to `*const u16`
let a_ptr = a.as_ptr() as *const u16;
let b_ptr = b.as_ptr() as *const u16;
let mut distance_value: Distance = 0.0;
let distance_ptr: *mut Distance = &mut distance_value as *mut Distance;
unsafe { simsimd_l2_bf16(a_ptr, b_ptr, a.len() as u64size, distance_ptr) };
Some(distance_value)
}
}
impl SpatialSimilarity for f32 {
fn cos(a: &[Self], b: &[Self]) -> Option<Distance> {
if a.len() != b.len() {
return None;
}
let mut distance_value: Distance = 0.0;
let distance_ptr: *mut Distance = &mut distance_value as *mut Distance;
unsafe { simsimd_cos_f32(a.as_ptr(), b.as_ptr(), a.len() as u64size, distance_ptr) };
Some(distance_value)
}
fn dot(a: &[Self], b: &[Self]) -> Option<Distance> {
if a.len() != b.len() {
return None;
}
let mut distance_value: Distance = 0.0;
let distance_ptr: *mut Distance = &mut distance_value as *mut Distance;
unsafe { simsimd_dot_f32(a.as_ptr(), b.as_ptr(), a.len() as u64size, distance_ptr) };
Some(distance_value)
}
fn l2sq(a: &[Self], b: &[Self]) -> Option<Distance> {
if a.len() != b.len() {
return None;
}
let mut distance_value: Distance = 0.0;
let distance_ptr: *mut Distance = &mut distance_value as *mut Distance;
unsafe { simsimd_l2sq_f32(a.as_ptr(), b.as_ptr(), a.len() as u64size, distance_ptr) };
Some(distance_value)
}
fn l2(a: &[Self], b: &[Self]) -> Option<Distance> {
if a.len() != b.len() {
return None;
}
let mut distance_value: Distance = 0.0;
let distance_ptr: *mut Distance = &mut distance_value as *mut Distance;
unsafe { simsimd_l2_f32(a.as_ptr(), b.as_ptr(), a.len() as u64size, distance_ptr) };
Some(distance_value)
}
}
impl SpatialSimilarity for f64 {
fn cos(a: &[Self], b: &[Self]) -> Option<Distance> {
if a.len() != b.len() {
return None;
}
let mut distance_value: Distance = 0.0;
let distance_ptr: *mut Distance = &mut distance_value as *mut Distance;
unsafe { simsimd_cos_f64(a.as_ptr(), b.as_ptr(), a.len() as u64size, distance_ptr) };
Some(distance_value)
}
fn dot(a: &[Self], b: &[Self]) -> Option<Distance> {
if a.len() != b.len() {
return None;
}
let mut distance_value: Distance = 0.0;
let distance_ptr: *mut Distance = &mut distance_value as *mut Distance;
unsafe { simsimd_dot_f64(a.as_ptr(), b.as_ptr(), a.len() as u64, distance_ptr) };
Some(distance_value)
}
fn l2sq(a: &[Self], b: &[Self]) -> Option<Distance> {
if a.len() != b.len() {
return None;
}
let mut distance_value: Distance = 0.0;
let distance_ptr: *mut Distance = &mut distance_value as *mut Distance;
unsafe { simsimd_l2sq_f64(a.as_ptr(), b.as_ptr(), a.len() as u64, distance_ptr) };
Some(distance_value)
}
fn l2(a: &[Self], b: &[Self]) -> Option<Distance> {
if a.len() != b.len() {
return None;
}
let mut distance_value: Distance = 0.0;
let distance_ptr: *mut Distance = &mut distance_value as *mut Distance;
unsafe { simsimd_l2_f64(a.as_ptr(), b.as_ptr(), a.len() as u64, distance_ptr) };
Some(distance_value)
}
}
impl ProbabilitySimilarity for f16 {
fn jensenshannon(a: &[Self], b: &[Self]) -> Option<Distance> {
if a.len() != b.len() {
return None;
}
// Explicitly cast `*const f16` to `*const u16`
let a_ptr = a.as_ptr() as *const u16;
let b_ptr = b.as_ptr() as *const u16;
let mut distance_value: Distance = 0.0;
let distance_ptr: *mut Distance = &mut distance_value as *mut Distance;
unsafe { simsimd_js_f16(a_ptr, b_ptr, a.len() as u64, distance_ptr) };
Some(distance_value)
}
fn kullbackleibler(a: &[Self], b: &[Self]) -> Option<Distance> {
if a.len() != b.len() {
return None;
}
// Explicitly cast `*const f16` to `*const u16`
let a_ptr = a.as_ptr() as *const u16;
let b_ptr = b.as_ptr() as *const u16;
let mut distance_value: Distance = 0.0;
let distance_ptr: *mut Distance = &mut distance_value as *mut Distance;
unsafe { simsimd_kl_f16(a_ptr, b_ptr, a.len() as u64, distance_ptr) };
Some(distance_value)
}
}
impl ProbabilitySimilarity for bf16 {
fn jensenshannon(a: &[Self], b: &[Self]) -> Option<Distance> {
if a.len() != b.len() {
return None;
}
// Explicitly cast `*const bf16` to `*const u16`
let a_ptr = a.as_ptr() as *const u16;
let b_ptr = b.as_ptr() as *const u16;
let mut distance_value: Distance = 0.0;
let distance_ptr: *mut Distance = &mut distance_value as *mut Distance;
unsafe { simsimd_js_bf16(a_ptr, b_ptr, a.len() as u64, distance_ptr) };
Some(distance_value)
}
fn kullbackleibler(a: &[Self], b: &[Self]) -> Option<Distance> {
if a.len() != b.len() {
return None;
}
// Explicitly cast `*const bf16` to `*const u16`
let a_ptr = a.as_ptr() as *const u16;
let b_ptr = b.as_ptr() as *const u16;
let mut distance_value: Distance = 0.0;
let distance_ptr: *mut Distance = &mut distance_value as *mut Distance;
unsafe { simsimd_kl_bf16(a_ptr, b_ptr, a.len() as u64, distance_ptr) };
Some(distance_value)
}
}
impl ProbabilitySimilarity for f32 {
fn jensenshannon(a: &[Self], b: &[Self]) -> Option<Distance> {
if a.len() != b.len() {
return None;
}
let mut distance_value: Distance = 0.0;
let distance_ptr: *mut Distance = &mut distance_value as *mut Distance;
unsafe { simsimd_js_f32(a.as_ptr(), b.as_ptr(), a.len() as u64, distance_ptr) };
Some(distance_value)
}
fn kullbackleibler(a: &[Self], b: &[Self]) -> Option<Distance> {
if a.len() != b.len() {
return None;
}
let mut distance_value: Distance = 0.0;
let distance_ptr: *mut Distance = &mut distance_value as *mut Distance;
unsafe { simsimd_kl_f32(a.as_ptr(), b.as_ptr(), a.len() as u64, distance_ptr) };
Some(distance_value)
}
}
impl ProbabilitySimilarity for f64 {
fn jensenshannon(a: &[Self], b: &[Self]) -> Option<Distance> {
if a.len() != b.len() {
return None;
}
let mut distance_value: Distance = 0.0;
let distance_ptr: *mut Distance = &mut distance_value as *mut Distance;
unsafe { simsimd_js_f64(a.as_ptr(), b.as_ptr(), a.len() as u64, distance_ptr) };
Some(distance_value)
}
fn kullbackleibler(a: &[Self], b: &[Self]) -> Option<Distance> {
if a.len() != b.len() {
return None;
}
let mut distance_value: Distance = 0.0;
let distance_ptr: *mut Distance = &mut distance_value as *mut Distance;
unsafe { simsimd_kl_f64(a.as_ptr(), b.as_ptr(), a.len() as u64, distance_ptr) };
Some(distance_value)
}
}
impl ComplexProducts for f16 {
fn dot(a: &[Self], b: &[Self]) -> Option<(Distance, Distance)> {
if a.len() != b.len() || a.len() % 2 != 0 {
return None;
}
// Prepare the output array where the real and imaginary parts will be stored
let mut product: [Distance; 2] = [0.0, 0.0];
let product_ptr: *mut Distance = &mut product[0] as *mut _;
// Explicitly cast `*const f16` to `*const u16`
let a_ptr = a.as_ptr() as *const u16;
let b_ptr = b.as_ptr() as *const u16;
// The C function expects the number of complex pairs, not the total number of f16 elements
unsafe { simsimd_dot_f16c(a_ptr, b_ptr, a.len() as u64 / 2, product_ptr) };
Some((product[0], product[1]))
}
fn vdot(a: &[Self], b: &[Self]) -> Option<(Distance, Distance)> {
if a.len() != b.len() || a.len() % 2 != 0 {
return None;
}
let mut product: [Distance; 2] = [0.0, 0.0];
let product_ptr: *mut Distance = &mut product[0] as *mut _;
let a_ptr = a.as_ptr() as *const u16;
let b_ptr = b.as_ptr() as *const u16;
// The C function expects the number of complex pairs, not the total number of f16 elements
unsafe { simsimd_vdot_f16c(a_ptr, b_ptr, a.len() as u64 / 2, product_ptr) };
Some((product[0], product[1]))
}
}
impl ComplexProducts for bf16 {
fn dot(a: &[Self], b: &[Self]) -> Option<(Distance, Distance)> {
if a.len() != b.len() || a.len() % 2 != 0 {
return None;
}
// Prepare the output array where the real and imaginary parts will be stored
let mut product: [Distance; 2] = [0.0, 0.0];
let product_ptr: *mut Distance = &mut product[0] as *mut _;
// Explicitly cast `*const bf16` to `*const u16`
let a_ptr = a.as_ptr() as *const u16;
let b_ptr = b.as_ptr() as *const u16;
// The C function expects the number of complex pairs, not the total number of bf16 elements
unsafe { simsimd_dot_bf16c(a_ptr, b_ptr, a.len() as u64 / 2, product_ptr) };
Some((product[0], product[1]))
}
fn vdot(a: &[Self], b: &[Self]) -> Option<(Distance, Distance)> {
if a.len() != b.len() || a.len() % 2 != 0 {
return None;
}
// Prepare the output array where the real and imaginary parts will be stored
let mut product: [Distance; 2] = [0.0, 0.0];
let product_ptr: *mut Distance = &mut product[0] as *mut _;
// Explicitly cast `*const bf16` to `*const u16`
let a_ptr = a.as_ptr() as *const u16;
let b_ptr = b.as_ptr() as *const u16;
// The C function expects the number of complex pairs, not the total number of bf16 elements
unsafe { simsimd_vdot_bf16c(a_ptr, b_ptr, a.len() as u64 / 2, product_ptr) };
Some((product[0], product[1]))
}
}
impl ComplexProducts for f32 {
fn dot(a: &[Self], b: &[Self]) -> Option<(Distance, Distance)> {
if a.len() != b.len() || a.len() % 2 != 0 {
return None;
}
let mut product: [Distance; 2] = [0.0, 0.0];
let product_ptr: *mut Distance = &mut product[0] as *mut _;
// The C function expects the number of complex pairs, not the total number of floats
unsafe { simsimd_dot_f32c(a.as_ptr(), b.as_ptr(), a.len() as u64 / 2, product_ptr) };
Some((product[0], product[1]))
}
fn vdot(a: &[Self], b: &[Self]) -> Option<(Distance, Distance)> {
if a.len() != b.len() || a.len() % 2 != 0 {
return None;
}
let mut product: [Distance; 2] = [0.0, 0.0];
let product_ptr: *mut Distance = &mut product[0] as *mut _;
// The C function expects the number of complex pairs, not the total number of floats
unsafe { simsimd_vdot_f32c(a.as_ptr(), b.as_ptr(), a.len() as u64 / 2, product_ptr) };
Some((product[0], product[1]))
}
}
impl ComplexProducts for f64 {
fn dot(a: &[Self], b: &[Self]) -> Option<(Distance, Distance)> {
if a.len() != b.len() || a.len() % 2 != 0 {
return None;
}
let mut product: [Distance; 2] = [0.0, 0.0];
let product_ptr: *mut Distance = &mut product[0] as *mut _;
// The C function expects the number of complex pairs, not the total number of floats
unsafe { simsimd_dot_f64c(a.as_ptr(), b.as_ptr(), a.len() as u64 / 2, product_ptr) };
Some((product[0], product[1]))
}
fn vdot(a: &[Self], b: &[Self]) -> Option<(Distance, Distance)> {
if a.len() != b.len() || a.len() % 2 != 0 {
return None;
}
let mut product: [Distance; 2] = [0.0, 0.0];
let product_ptr: *mut Distance = &mut product[0] as *mut _;
// The C function expects the number of complex pairs, not the total number of floats
unsafe { simsimd_vdot_f64c(a.as_ptr(), b.as_ptr(), a.len() as u64 / 2, product_ptr) };
Some((product[0], product[1]))
}
}
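// A note on the complex kernels above: slices are interpreted as interleaved (real, imaginary)
// pairs, e.g. `[1.0, 2.0, 3.0, 4.0]` packs the two complex numbers 1+2i and 3+4i. That is why
// odd lengths return `None` and the C functions receive `a.len() / 2` as the element count.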
#[cfg(test)]
mod tests {
use super::*;
use half::bf16 as HalfBF16;
use half::f16 as HalfF16;
#[test]
fn hardware_features_detection() {
let uses_arm = capabilities::uses_neon() || capabilities::uses_sve();
let uses_x86 = capabilities::uses_haswell()
|| capabilities::uses_skylake()
|| capabilities::uses_ice()
|| capabilities::uses_genoa()
|| capabilities::uses_sapphire()
|| capabilities::uses_turin();
// The CPU can't simultaneously support ARM and x86 SIMD extensions
if uses_arm {
assert!(!uses_x86);
}
if uses_x86 {
assert!(!uses_arm);
}
println!("- uses_neon: {}", capabilities::uses_neon());
println!("- uses_neon_f16: {}", capabilities::uses_neon_f16());
println!("- uses_neon_bf16: {}", capabilities::uses_neon_bf16());
println!("- uses_neon_i8: {}", capabilities::uses_neon_i8());
println!("- uses_sve: {}", capabilities::uses_sve());
println!("- uses_sve_f16: {}", capabilities::uses_sve_f16());
println!("- uses_sve_bf16: {}", capabilities::uses_sve_bf16());
println!("- uses_sve_i8: {}", capabilities::uses_sve_i8());
println!("- uses_haswell: {}", capabilities::uses_haswell());
println!("- uses_skylake: {}", capabilities::uses_skylake());
println!("- uses_ice: {}", capabilities::uses_ice());
println!("- uses_genoa: {}", capabilities::uses_genoa());
println!("- uses_sapphire: {}", capabilities::uses_sapphire());
println!("- uses_turin: {}", capabilities::uses_turin());
println!("- uses_sierra: {}", capabilities::uses_sierra());
}
//
fn assert_almost_equal(left: Distance, right: Distance, tolerance: Distance) {
let lower = right - tolerance;
let upper = right + tolerance;
assert!(left >= lower && left <= upper);
}
#[test]
fn cos_i8() {
let a = &[3, 97, 127];
let b = &[3, 97, 127];
if let Some(result) = SpatialSimilarity::cosine(a, b) {
assert_almost_equal(0.00012027938, result, 0.01);
}
}
#[test]
fn cos_f32() {
let a = &[1.0, 2.0, 3.0];
let b = &[4.0, 5.0, 6.0];
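// Expected value below: cosine distance = 1 - (a.b) / (|a||b|) = 1 - 32 / (sqrt(14) * sqrt(77)) ≈ 0.0254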
if let Some(result) = SpatialSimilarity::cosine(a, b) {
assert_almost_equal(0.025, result, 0.01);
}
}
#[test]
fn dot_i8() {
let a = &[1, 2, 3];
let b = &[4, 5, 6];
if let Some(result) = SpatialSimilarity::dot(a, b) {
assert_almost_equal(32.0, result, 0.01);
}
}
#[test]
fn dot_f32() {
let a = &[1.0, 2.0, 3.0];
let b = &[4.0, 5.0, 6.0];
if let Some(result) = SpatialSimilarity::dot(a, b) {
assert_almost_equal(32.0, result, 0.01);
}
}
#[test]
fn dot_f32_complex() {
// Let's consider these as complex numbers where every pair is (real, imaginary)
let a: &[f32; 4] = &[1.0, 2.0, 3.0, 4.0]; // Represents two complex numbers: 1+2i, 3+4i
let b: &[f32; 4] = &[5.0, 6.0, 7.0, 8.0]; // Represents two complex numbers: 5+6i, 7+8i
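// Worked arithmetic for the expected values: (1+2i)(5+6i) = -7+16i and (3+4i)(7+8i) = -11+52i,
// so the unconjugated complex dot product is -18 + 68i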
if let Some((real, imag)) = ComplexProducts::dot(a, b) {
assert_almost_equal(-18.0, real, 0.01);
assert_almost_equal(68.0, imag, 0.01);
}
}
#[test]
fn vdot_f32_complex() {
// Here we're assuming a similar setup to the previous test, but for the Hermitian (conjugate) dot product
let a: &[f32; 4] = &[1.0, 2.0, 3.0, 4.0]; // Represents two complex numbers: 1+2i, 3+4i
let b: &[f32; 4] = &[5.0, 6.0, 7.0, 8.0]; // Represents two complex numbers: 5+6i, 7+8i
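// Worked arithmetic: conjugating the first argument gives (1-2i)(5+6i) = 17-4i and
// (3-4i)(7+8i) = 53-4i, so the Hermitian dot product is 70 - 8i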
if let Some((real, imag)) = ComplexProducts::vdot(a, b) {
assert_almost_equal(70.0, real, 0.01);
assert_almost_equal(-8.0, imag, 0.01);
}
}
#[test]
fn l2sq_i8() {
let a = &[1, 2, 3];
let b = &[4, 5, 6];
if let Some(result) = SpatialSimilarity::sqeuclidean(a, b) {
assert_almost_equal(27.0, result, 0.01);
}
}
#[test]
fn l2sq_f32() {
let a = &[1.0, 2.0, 3.0];
let b = &[4.0, 5.0, 6.0];
if let Some(result) = SpatialSimilarity::sqeuclidean(a, b) {
assert_almost_equal(27.0, result, 0.01);
}
}
#[test]
fn l2_f32() {
let a: &[f32; 3] = &[1.0, 2.0, 3.0];
let b: &[f32; 3] = &[4.0, 5.0, 6.0];
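// Expected value: ||a - b|| = sqrt(9 + 9 + 9) = sqrt(27) ≈ 5.196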
if let Some(result) = SpatialSimilarity::euclidean(a, b) {
assert_almost_equal(5.2, result, 0.01);
}
}
#[test]
fn l2_f64() {
let a: &[f64; 3] = &[1.0, 2.0, 3.0];
let b: &[f64; 3] = &[4.0, 5.0, 6.0];
if let Some(result) = SpatialSimilarity::euclidean(a, b) {
assert_almost_equal(5.2, result, 0.01);
}
}
#[test]
fn l2_f16() {
let a_half: Vec<HalfF16> = vec![1.0, 2.0, 3.0]
.iter()
.map(|&x| HalfF16::from_f32(x))
.collect();
let b_half: Vec<HalfF16> = vec![4.0, 5.0, 6.0]
.iter()
.map(|&x| HalfF16::from_f32(x))
.collect();
let a_simsimd: &[f16] =
unsafe { std::slice::from_raw_parts(a_half.as_ptr() as *const f16, a_half.len()) };
let b_simsimd: &[f16] =
unsafe { std::slice::from_raw_parts(b_half.as_ptr() as *const f16, b_half.len()) };
if let Some(result) = SpatialSimilarity::euclidean(&a_simsimd, &b_simsimd) {
assert_almost_equal(5.2, result, 0.01);
}
}
#[test]
fn l2_i8() {
let a = &[1, 2, 3];
let b = &[4, 5, 6];
if let Some(result) = SpatialSimilarity::euclidean(a, b) {
assert_almost_equal(5.2, result, 0.01);
}
}
// Adding new tests for bit-level distances
#[test]
fn hamming_u8() {
let a = &[0b01010101, 0b11110000, 0b10101010];
let b = &[0b01010101, 0b11110000, 0b10101010];
if let Some(result) = BinarySimilarity::hamming(a, b) {
assert_almost_equal(0.0, result, 0.01);
}
}
#[test]
fn jaccard_u8() {
// For binary data, treat each byte as a set of bits
let a = &[0b11110000, 0b00001111, 0b10101010];
let b = &[0b11110000, 0b00001111, 0b01010101];
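// Worked bit counts: the first two bytes match exactly (8 shared bits), the third pair shares
// no bits and unions to 0b11111111, so the Jaccard distance is 1 - 8/16 = 0.5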
if let Some(result) = BinarySimilarity::jaccard(a, b) {
assert_almost_equal(0.5, result, 0.01);
}
}
// Adding new tests for probability similarities
#[test]
fn js_f32() {
let a: &[f32; 3] = &[0.1, 0.9, 0.0];
let b: &[f32; 3] = &[0.2, 0.8, 0.0];
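// The expected 0.099 is consistent with the Jensen-Shannon distance, i.e. the square root of
// 0.5*KL(a||m) + 0.5*KL(b||m) with m = (a+b)/2 = [0.15, 0.85, 0.0], which works out to
// sqrt(~0.00997) ≈ 0.0998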
if let Some(result) = ProbabilitySimilarity::jensenshannon(a, b) {
assert_almost_equal(0.099, result, 0.01);
}
}
#[test]
fn kl_f32() {
let a: &[f32; 3] = &[0.1, 0.9, 0.0];
let b: &[f32; 3] = &[0.2, 0.8, 0.0];
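// Expected value: KL(a||b) = 0.1*ln(0.1/0.2) + 0.9*ln(0.9/0.8) ≈ -0.0693 + 0.1060 ≈ 0.0367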
if let Some(result) = ProbabilitySimilarity::kullbackleibler(a, b) {
assert_almost_equal(0.036, result, 0.01);
}
}
#[test]
fn cos_f16_same() {
// Assuming these u16 values represent f16 bit patterns, and they are identical
let a_u16: &[u16] = &[15360, 16384, 17408]; // f16 bit patterns 0x3C00, 0x4000, 0x4400, i.e. 1.0, 2.0, 4.0
let b_u16: &[u16] = &[15360, 16384, 17408]; // Same as above for simplicity
// Reinterpret cast from &[u16] to &[f16]
let a_f16: &[f16] =
unsafe { std::slice::from_raw_parts(a_u16.as_ptr() as *const f16, a_u16.len()) };
let b_f16: &[f16] =
unsafe { std::slice::from_raw_parts(b_u16.as_ptr() as *const f16, b_u16.len()) };
if let Some(result) = SpatialSimilarity::cosine(a_f16, b_f16) {
assert_almost_equal(0.0, result, 0.01);
}
}
#[test]
fn cos_bf16_same() {
// Assuming these u16 values represent bf16 bit patterns, and they are identical
let a_u16: &[u16] = &[15360, 16384, 17408]; // Corresponding to some bf16 values
let b_u16: &[u16] = &[15360, 16384, 17408]; // Same as above for simplicity
// Reinterpret cast from &[u16] to &[bf16]
let a_bf16: &[bf16] =
unsafe { std::slice::from_raw_parts(a_u16.as_ptr() as *const bf16, a_u16.len()) };
let b_bf16: &[bf16] =
unsafe { std::slice::from_raw_parts(b_u16.as_ptr() as *const bf16, b_u16.len()) };
if let Some(result) = SpatialSimilarity::cosine(a_bf16, b_bf16) {
assert_almost_equal(0.0, result, 0.01);
}
}
#[test]
fn cos_f16_interop() {
let a_half: Vec<HalfF16> = vec![1.0, 2.0, 3.0]
.iter()
.map(|&x| HalfF16::from_f32(x))
.collect();
let b_half: Vec<HalfF16> = vec![4.0, 5.0, 6.0]
.iter()
.map(|&x| HalfF16::from_f32(x))
.collect();
// SAFETY: This is safe as long as the memory representations are guaranteed to be identical,
// which they are due to both being #[repr(transparent)] wrappers around u16.
let a_simsimd: &[f16] =
unsafe { std::slice::from_raw_parts(a_half.as_ptr() as *const f16, a_half.len()) };
let b_simsimd: &[f16] =
unsafe { std::slice::from_raw_parts(b_half.as_ptr() as *const f16, b_half.len()) };
// Use the reinterpret-casted slices with your SpatialSimilarity implementation
if let Some(result) = SpatialSimilarity::cosine(a_simsimd, b_simsimd) {
assert_almost_equal(0.025, result, 0.01);
}
}
#[test]
fn cos_bf16_interop() {
let a_half: Vec<HalfBF16> = vec![1.0, 2.0, 3.0]
.iter()
.map(|&x| HalfBF16::from_f32(x))
.collect();
let b_half: Vec<HalfBF16> = vec![4.0, 5.0, 6.0]
.iter()
.map(|&x| HalfBF16::from_f32(x))
.collect();
// SAFETY: This is safe as long as the memory representations are guaranteed to be identical,
// which they are due to both being #[repr(transparent)] wrappers around u16.
let a_simsimd: &[bf16] =
unsafe { std::slice::from_raw_parts(a_half.as_ptr() as *const bf16, a_half.len()) };
let b_simsimd: &[bf16] =
unsafe { std::slice::from_raw_parts(b_half.as_ptr() as *const bf16, b_half.len()) };
// Use the reinterpret-casted slices with your SpatialSimilarity implementation
if let Some(result) = SpatialSimilarity::cosine(a_simsimd, b_simsimd) {
assert_almost_equal(0.025, result, 0.01);
}
}
#[test]
fn intersect_u16() {
{
let a_u16: &[u16] = &[153, 16384, 17408];
let b_u16: &[u16] = &[7408, 15360, 16384];
if let Some(result) = Sparse::intersect(a_u16, b_u16) {
assert_almost_equal(1.0, result, 0.0001);
}
}
{
let a_u16: &[u16] = &[8, 153, 11638];
let b_u16: &[u16] = &[7408, 15360, 16384];
if let Some(result) = Sparse::intersect(a_u16, b_u16) {
assert_almost_equal(0.0, result, 0.0001);
}
}
}
#[test]
fn intersect_u32() {
{
let a_u32: &[u32] = &[11, 153];
let b_u32: &[u32] = &[11, 153, 7408, 16384];
if let Some(result) = Sparse::intersect(a_u32, b_u32) {
assert_almost_equal(2.0, result, 0.0001);
}
}
{
let a_u32: &[u32] = &[153, 7408, 11638];
let b_u32: &[u32] = &[153, 7408, 11638];
if let Some(result) = Sparse::intersect(a_u32, b_u32) {
assert_almost_equal(3.0, result, 0.0001);
}
}
}
/// Reference implementation of set intersection using Rust's standard library
fn reference_intersect<T: Ord>(a: &[T], b: &[T]) -> usize {
let mut a_iter = a.iter();
let mut b_iter = b.iter();
let mut a_current = a_iter.next();
let mut b_current = b_iter.next();
let mut count = 0;
while let (Some(a_val), Some(b_val)) = (a_current, b_current) {
match a_val.cmp(b_val) {
core::cmp::Ordering::Less => a_current = a_iter.next(),
core::cmp::Ordering::Greater => b_current = b_iter.next(),
core::cmp::Ordering::Equal => {
count += 1;
a_current = a_iter.next();
b_current = b_iter.next();
}
}
}
count
}
/// Generate test arrays with various sizes and patterns for intersection testing
/// Includes empty, small, medium, large arrays with different overlap characteristics
fn generate_intersection_test_arrays<T>() -> Vec<Vec<T>>
where
T: core::convert::TryFrom<i32> + Copy,
<T as core::convert::TryFrom<i32>>::Error: core::fmt::Debug,
{
vec![
// Empty array
vec![],
// Single element
vec![T::try_from(42).unwrap()],
// Very small arrays (< 16 elements) - tests serial fallback
vec![
T::try_from(1).unwrap(),
T::try_from(5).unwrap(),
T::try_from(10).unwrap(),
],
vec![
T::try_from(2).unwrap(),
T::try_from(4).unwrap(),
T::try_from(6).unwrap(),
T::try_from(8).unwrap(),
T::try_from(10).unwrap(),
T::try_from(12).unwrap(),
T::try_from(14).unwrap(),
],
// Small arrays (< 32 elements) - boundary case for Turin
(0..14).map(|x| T::try_from(x * 10).unwrap()).collect(),
(5..20).map(|x| T::try_from(x * 10).unwrap()).collect(),
// Medium arrays (32-64 elements) - tests one or two SIMD iterations
(0..40).map(|x| T::try_from(x * 2).unwrap()).collect(),
(10..50).map(|x| T::try_from(x * 2).unwrap()).collect(), // 50% overlap with previous
(0..45).map(|x| T::try_from(x * 3).unwrap()).collect(), // Different stride
// Large arrays (> 64 elements) - tests main SIMD loop
(0..100).map(|x| T::try_from(x * 2).unwrap()).collect(),
(50..150).map(|x| T::try_from(x * 2).unwrap()).collect(), // 50% overlap
(0..100).map(|x| T::try_from(x * 5).unwrap()).collect(), // Sparse overlap
(0..150)
.filter(|x| x % 7 == 0)
.map(|x| T::try_from(x).unwrap())
.collect(),
// Very large arrays (> 256 elements) - stress test
(0..500).map(|x| T::try_from(x * 3).unwrap()).collect(),
(100..600).map(|x| T::try_from(x * 3).unwrap()).collect(), // Large overlap
(0..600).map(|x| T::try_from(x * 7).unwrap()).collect(), // Minimal overlap
// Edge cases: no overlap at all
(0..50).map(|x| T::try_from(x * 2).unwrap()).collect(),
(1000..1050).map(|x| T::try_from(x * 2).unwrap()).collect(), // Completely disjoint
// Dense arrays at boundaries
(0..16).map(|x| T::try_from(x).unwrap()).collect(), // Exactly 16 elements
(0..32).map(|x| T::try_from(x).unwrap()).collect(), // Exactly 32 elements
(0..64).map(|x| T::try_from(x).unwrap()).collect(), // Exactly 64 elements
]
}
#[test]
fn intersect_u32_comprehensive() {
let test_arrays: Vec<Vec<u32>> = generate_intersection_test_arrays();
for (i, array_a) in test_arrays.iter().enumerate() {
for (j, array_b) in test_arrays.iter().enumerate() {
let expected = reference_intersect(array_a, array_b);
let result =
Sparse::intersect(array_a.as_slice(), array_b.as_slice()).unwrap() as usize;
assert_eq!(
expected,
result,
"Intersection mismatch for arrays[{}] (len={}) and arrays[{}] (len={})",
i,
array_a.len(),
j,
array_b.len()
);
}
}
}
#[test]
fn intersect_u16_comprehensive() {
let test_arrays: Vec<Vec<u16>> = generate_intersection_test_arrays();
for (i, array_a) in test_arrays.iter().enumerate() {
for (j, array_b) in test_arrays.iter().enumerate() {
let expected = reference_intersect(array_a, array_b);
let result =
Sparse::intersect(array_a.as_slice(), array_b.as_slice()).unwrap() as usize;
assert_eq!(
expected,
result,
"Intersection mismatch for arrays[{}] (len={}) and arrays[{}] (len={})",
i,
array_a.len(),
j,
array_b.len()
);
}
}
}
#[test]
fn intersect_edge_cases() {
// Test empty arrays
let empty: &[u32] = &[];
let non_empty: &[u32] = &[1, 2, 3];
assert_eq!(Sparse::intersect(empty, empty), Some(0.0));
assert_eq!(Sparse::intersect(empty, non_empty), Some(0.0));
assert_eq!(Sparse::intersect(non_empty, empty), Some(0.0));
// Test single element matches
assert_eq!(Sparse::intersect(&[42u32], &[42u32]), Some(1.0));
assert_eq!(Sparse::intersect(&[42u32], &[43u32]), Some(0.0));
// Test no overlap
let a: &[u32] = &[1, 2, 3, 4, 5];
let b: &[u32] = &[10, 20, 30, 40, 50];
assert_eq!(Sparse::intersect(a, b), Some(0.0));
// Test complete overlap
let c: &[u32] = &[10, 20, 30, 40, 50];
assert_eq!(Sparse::intersect(c, c), Some(5.0));
// Test one element at boundary (exactly at 16, 32, 64 element boundaries)
let boundary_16: Vec<u32> = (0..16).collect();
let boundary_32: Vec<u32> = (0..32).collect();
let boundary_64: Vec<u32> = (0..64).collect();
assert_eq!(Sparse::intersect(&boundary_16, &boundary_16), Some(16.0));
assert_eq!(Sparse::intersect(&boundary_32, &boundary_32), Some(32.0));
assert_eq!(Sparse::intersect(&boundary_64, &boundary_64), Some(64.0));
// Test partial overlap at boundaries
let first_half: Vec<u32> = (0..32).collect();
let second_half: Vec<u32> = (16..48).collect();
assert_eq!(Sparse::intersect(&first_half, &second_half), Some(16.0));
}
#[test]
fn f16_arithmetic() {
let a = f16::from_f32(3.5);
let b = f16::from_f32(2.0);
// Test basic arithmetic
assert!(((a + b).to_f32() - 5.5).abs() < 0.01);
assert!(((a - b).to_f32() - 1.5).abs() < 0.01);
assert!(((a * b).to_f32() - 7.0).abs() < 0.01);
assert!(((a / b).to_f32() - 1.75).abs() < 0.01);
assert!(((-a).to_f32() + 3.5).abs() < 0.01);
// Test constants
assert!(f16::ZERO.to_f32() == 0.0);
assert!((f16::ONE.to_f32() - 1.0).abs() < 0.01);
assert!((f16::NEG_ONE.to_f32() + 1.0).abs() < 0.01);
// Test comparisons
assert!(a > b);
assert!(!(a < b));
assert!(a == a);
// Test utility methods
assert!(((-a).abs().to_f32() - 3.5).abs() < 0.01);
assert!(a.is_finite());
assert!(!a.is_nan());
assert!(!a.is_infinite());
}
#[test]
fn bf16_arithmetic() {
let a = bf16::from_f32(3.5);
let b = bf16::from_f32(2.0);
// Test basic arithmetic
assert!(((a + b).to_f32() - 5.5).abs() < 0.1);
assert!(((a - b).to_f32() - 1.5).abs() < 0.1);
assert!(((a * b).to_f32() - 7.0).abs() < 0.1);
assert!(((a / b).to_f32() - 1.75).abs() < 0.1);
assert!(((-a).to_f32() + 3.5).abs() < 0.1);
// Test constants
assert!(bf16::ZERO.to_f32() == 0.0);
assert!((bf16::ONE.to_f32() - 1.0).abs() < 0.01);
assert!((bf16::NEG_ONE.to_f32() + 1.0).abs() < 0.01);
// Test comparisons
assert!(a > b);
assert!(!(a < b));
assert!(a == a);
// Test utility methods
assert!(((-a).abs().to_f32() - 3.5).abs() < 0.1);
assert!(a.is_finite());
assert!(!a.is_nan());
assert!(!a.is_infinite());
}
#[test]
fn bf16_dot() {
let brain_a: Vec<bf16> = vec![1.0, 2.0, 3.0, 1.0, 2.0]
.iter()
.map(|&x| bf16::from_f32(x))
.collect();
let brain_b: Vec<bf16> = vec![4.0, 5.0, 6.0, 4.0, 5.0]
.iter()
.map(|&x| bf16::from_f32(x))
.collect();
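// Expected value: 1*4 + 2*5 + 3*6 + 1*4 + 2*5 = 46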
if let Some(result) = <bf16 as SpatialSimilarity>::dot(&brain_a, &brain_b) {
assert_eq!(46.0, result);
}
}
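// An additional sanity sketch for the f64 complex kernels, mirroring `dot_f32_complex` above
// and assuming the same interleaved (real, imaginary) packing.
#[test]
fn dot_f64_complex() {
let a: &[f64; 4] = &[1.0, 2.0, 3.0, 4.0]; // Two complex numbers: 1+2i, 3+4i
let b: &[f64; 4] = &[5.0, 6.0, 7.0, 8.0]; // Two complex numbers: 5+6i, 7+8i
if let Some((real, imag)) = ComplexProducts::dot(a, b) {
// Same worked arithmetic as the f32 variant: (-7+16i) + (-11+52i) = -18+68i
assert_almost_equal(-18.0, real, 0.01);
assert_almost_equal(68.0, imag, 0.01);
}
}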
}