vsimd-0.8.0/.cargo_vcs_info.json

{
  "git": {
    "sha1": "d74c030d9dc4f3cae02146d1f497ff62726ef09a"
  },
  "path_in_vcs": "crates/vsimd"
}

vsimd-0.8.0/Cargo.toml

# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO
#
# When uploading crates to the registry Cargo will automatically
# "normalize" Cargo.toml files for maximal compatibility
# with all versions of Cargo and also rewrite `path` dependencies
# to registry (e.g., crates.io) dependencies.
#
# If you are reading this file be aware that the original Cargo.toml
# will likely look very different (and much more reasonable).
# See Cargo.toml.orig for the original contents.

[package]
edition = "2021"
rust-version = "1.63"
name = "vsimd"
version = "0.8.0"
description = "SIMD utilities"
readme = "README.md"
keywords = ["simd"]
categories = ["no-std"]
license = "MIT"
repository = "https://github.com/Nugine/simd"

[package.metadata.docs.rs]
all-features = true
rustdoc-args = ["--cfg", "docsrs"]

[dev-dependencies.const-str]
version = "0.5.3"

[dev-dependencies.rand]
version = "0.8.5"

[features]
alloc = []
detect = ["std"]
std = ["alloc"]
unstable = []

[target."cfg(target_arch=\"wasm32\")".dev-dependencies.getrandom]
version = "0.2.8"
features = ["js"]

[target."cfg(target_arch=\"wasm32\")".dev-dependencies.wasm-bindgen-test]
version = "0.3.33"

vsimd-0.8.0/Cargo.toml.orig

[package]
name = "vsimd"
version = "0.8.0"
edition = "2021"
description = "SIMD utilities"
license = "MIT"
repository = "https://github.com/Nugine/simd"
keywords = ["simd"]
categories = ["no-std"]
readme = "README.md"
rust-version = "1.63"

[package.metadata.docs.rs]
all-features = true
rustdoc-args = ["--cfg", "docsrs"]

[features]
alloc = []
std = ["alloc"]
detect = ["std"]
unstable = []

[dev-dependencies]
const-str = "0.5.3"
rand = "0.8.5"

[target.'cfg(target_arch="wasm32")'.dev-dependencies]
getrandom = { version = "0.2.8", features = ["js"] }
wasm-bindgen-test = "0.3.33"

vsimd-0.8.0/README.md

# vsimd

[![Crates.io](https://img.shields.io/crates/v/vsimd.svg)](https://crates.io/crates/vsimd)
[![Docs](https://docs.rs/vsimd/badge.svg)](https://docs.rs/vsimd/)
[![MIT licensed][mit-badge]][mit-url]

[mit-badge]: https://img.shields.io/badge/license-MIT-blue.svg
[mit-url]: ../../LICENSE

⚠️ This crate contains shared implementation details. Do not directly depend on it.
vsimd-0.8.0/src/algorithm.rs

pub const fn lookup(lut: &[u8; 16], x: u8) -> u8 {
    if x < 0x80 {
        lut[(x & 0x0f) as usize]
    } else {
        0
    }
}

pub const fn avgr(a: u8, b: u8) -> u8 {
    ((a as u16 + b as u16 + 1) >> 1) as u8
}

#[cfg(test)]
pub fn print_fn_table(is_primary: impl Fn(u8) -> bool, f: impl Fn(u8) -> u8) {
    print!(" 0 1 2 3 4 5 6 7 8 9 A B C D E F");

    for c in 0..=255u8 {
        let val = f(c);

        if c & 0x0f == 0 {
            println!();
            print!("{:x} | ", c >> 4);
        }

        if is_primary(c) {
            print!("\x1b[1;31m{val:0>2X}\x1b[0m ");
        } else if val >= 0x80 {
            print!("\x1b[1;36m{val:0>2X}\x1b[0m ");
        } else {
            print!("\x1b[1;32m{val:0>2X}\x1b[0m ");
        }
    }

    println!();
    println!();
}

#[cfg(test)]
pub fn i8_lt(a: i8, b: i8) -> u8 {
    if a < b {
        0xff
    } else {
        0x00
    }
}

vsimd-0.8.0/src/alsw.rs

// ALSW: Avgr, Lookup, Saturating_add, Wrapping_add
// Inspired by

use crate::algorithm::{avgr, lookup};
use crate::pod::POD;
use crate::table::u8x16xn_lookup;
use crate::vector::{V128, V256};
use crate::Scalable;

use core::ops::Not;

#[inline]
#[must_use]
pub const fn hash(hash_lut: &[u8; 16], c: u8) -> u8 {
    avgr(0xE0 | (c >> 3), lookup(hash_lut, c))
}

#[inline]
#[must_use]
pub const fn check(hash_lut: &[u8; 16], offset: &[u8; 16], c: u8) -> u8 {
    let h = hash(hash_lut, c);
    let o = lookup(offset, h);
    (c as i8).saturating_add(o as i8) as u8
}

#[inline]
#[must_use]
pub const fn decode(hash_lut: &[u8; 16], offset: &[u8; 16], c: u8) -> u8 {
    let h = hash(hash_lut, c);
    let o = lookup(offset, h);
    c.wrapping_add(o)
}

#[derive(Debug, Clone, Copy)]
pub struct AlswLut<V: POD> {
    pub hash: V,
    pub offset: V,
}

impl AlswLut<V128> {
    #[inline]
    #[must_use]
    pub const fn x2(self) -> AlswLut<V256> {
        AlswLut {
            hash: self.hash.x2(),
            offset: self.offset.x2(),
        }
    }
}

#[inline(always)]
pub fn check_ascii_xn<S: Scalable<V>, V: POD>(s: S, x: V, check: AlswLut<V>) -> bool {
    let shr3 = s.u32xn_shr::<3>(x);

    let h1 = s.u8xn_avgr(shr3, u8x16xn_lookup(s, check.hash, x));
    let o1 = u8x16xn_lookup(s, check.offset, h1);
    let c1 = s.i8xn_add_sat(x, o1);

    s.u8xn_highbit_any(c1).not()
}

#[inline(always)]
pub fn decode_ascii_xn<S: Scalable<V>, V: POD>(s: S, x: V, check: AlswLut<V>, decode: AlswLut<V>) -> (V, V) {
    let shr3 = s.u32xn_shr::<3>(x);

    let h1 = s.u8xn_avgr(shr3, u8x16xn_lookup(s, check.hash, x));
    let h2 = s.u8xn_avgr(shr3, u8x16xn_lookup(s, decode.hash, x));

    let o1 = u8x16xn_lookup(s, check.offset, h1);
    let o2 = u8x16xn_lookup(s, decode.offset, h2);

    let c1 = s.i8xn_add_sat(x, o1);
    let c2 = s.u8xn_add(x, o2);

    (c1, c2)
}
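// Editor's note (added commentary, not from the original source): the scalar
// functions above are the reference model for the SIMD kernels. `hash` mixes
// the high bits of a byte (`0xE0 | (c >> 3)`) with a 4-bit table lookup via a
// rounding average; `check` then adds a per-hash-bucket offset with
// *saturating* signed arithmetic so that exactly the valid bytes stay below
// 0x80, while `decode` adds the offset with *wrapping* arithmetic to map a
// valid byte to its decoded value. The `impl_alsw!` macro below derives the
// two 16-entry hash/offset tables from a type's `decode`, `check_hash` and
// `decode_hash` definitions at compile time.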
#[macro_export]
macro_rules! impl_alsw {
    ($spec:ty) => {
        impl $spec {
            const CHECK_HASH: [u8; 16] = {
                let mut arr = [0; 16];
                let mut i = 0;
                while i < 16 {
                    let x: u8 = Self::check_hash(i as u8);
                    arr[i] = (x << 1) - 1;
                    i += 1;
                }
                arr
            };

            const CHECK_OFFSET: [u8; 16] = {
                let mut arr = [0x80; 16];
                let mut c: u8 = 255;
                loop {
                    if Self::decode(c) != 0xff {
                        let h = $crate::alsw::hash(&Self::CHECK_HASH, c);
                        arr[(h & 0x0f) as usize] = 0u8.wrapping_sub(c);
                    }
                    if c == 0 {
                        break;
                    }
                    c -= 1;
                }
                arr
            };

            const DECODE_HASH: [u8; 16] = {
                let mut arr = [0; 16];
                let mut i = 0;
                while i < 16 {
                    let x: u8 = Self::decode_hash(i as u8);
                    arr[i] = (x << 1) - 1;
                    i += 1;
                }
                arr
            };

            const DECODE_OFFSET: [u8; 16] = {
                let mut arr = [0x80; 16];
                let mut c: u8 = 255;
                loop {
                    let idx = Self::decode(c);
                    if idx != 0xff {
                        let h = $crate::alsw::hash(&Self::DECODE_HASH, c);
                        arr[(h & 0x0f) as usize] = idx.wrapping_sub(c);
                    }
                    if c == 0 {
                        break;
                    }
                    c -= 1;
                }
                arr
            };

            #[inline]
            #[must_use]
            const fn check_lut() -> AlswLut<V128> {
                AlswLut {
                    hash: V128::from_bytes(Self::CHECK_HASH),
                    offset: V128::from_bytes(Self::CHECK_OFFSET),
                }
            }

            #[inline]
            #[must_use]
            const fn decode_lut() -> AlswLut<V128> {
                AlswLut {
                    hash: V128::from_bytes(Self::DECODE_HASH),
                    offset: V128::from_bytes(Self::DECODE_OFFSET),
                }
            }

            #[cfg(test)]
            fn test_check() {
                let hash = &Self::CHECK_HASH;
                let offset = &Self::CHECK_OFFSET;
                let check = |c: u8| $crate::alsw::check(hash, offset, c);
                for c in 0..=255u8 {
                    assert_eq!(check(c) < 0x80, Self::decode(c) != 0xff);
                }
            }

            #[cfg(test)]
            fn test_decode() {
                let hash = &Self::DECODE_HASH;
                let offset = &Self::DECODE_OFFSET;
                let decode = |c: u8| $crate::alsw::decode(hash, offset, c);
                for c in 0..=255u8 {
                    let idx = Self::decode(c);
                    if idx != 0xff {
                        assert_eq!(decode(c), idx);
                    }
                }
            }
        }
    };
}

vsimd-0.8.0/src/ascii.rs

use crate::pod::POD;
use crate::Scalable;

/// An enum type which represents the case of ascii letters.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum AsciiCase {
    /// a-z are lower case letters.
    Lower,
    /// A-Z are upper case letters.
    Upper,
}
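// Editor's note (added commentary): `convert_ascii_case` below is branchless.
// Subtracting `C + 0x80` shifts the 26 target letters into the signed range
// [-0x80, -0x80 + 26), so a single signed comparison yields an all-ones mask
// exactly for those letters; masking that with 0x20 and XOR-ing flips the
// ASCII case bit only where needed. The scalar `algorithm` test at the bottom
// of this file checks the same trick byte by byte.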
#[inline(always)]
fn convert_ascii_case<S: Scalable<V>, V: POD, const C: u8>(s: S, x: V) -> V {
    assert!(matches!(C, b'A' | b'a'));
    let x1 = s.u8xn_sub(x, s.u8xn_splat(C + 0x80));
    let x2 = s.i8xn_lt(x1, s.i8xn_splat(-0x80 + 26));
    let x3 = s.and(x2, s.u8xn_splat(0x20));
    s.xor(x, x3)
}

#[inline(always)]
pub fn to_ascii_lowercase<S: Scalable<V>, V: POD>(s: S, x: V) -> V {
    convert_ascii_case::<S, V, b'A'>(s, x)
}

#[inline(always)]
pub fn to_ascii_uppercase<S: Scalable<V>, V: POD>(s: S, x: V) -> V {
    convert_ascii_case::<S, V, b'a'>(s, x)
}

#[cfg(test)]
mod algorithm {
    use crate::algorithm::*;

    #[test]
    #[ignore]
    fn convert_case() {
        let convert = |c: u8, shift: u8| {
            let x1 = c.wrapping_sub(shift + 0x80);
            let x2 = i8_lt(x1 as i8, -0x80 + 26);
            let x3 = x2 & 0x20;
            c ^ x3
        };

        let to_upper = |c: u8| convert(c, b'a');
        let to_lower = |c: u8| convert(c, b'A');

        print_fn_table(|c| c.is_ascii_lowercase(), to_upper);
        print_fn_table(|c| c.is_ascii_uppercase(), to_lower);

        for c in 0..=255u8 {
            assert_eq!(to_upper(c), c.to_ascii_uppercase());
            assert_eq!(to_lower(c), c.to_ascii_lowercase());
        }
    }
}

vsimd-0.8.0/src/bswap.rs

use crate::pod::POD;
use crate::vector::{V128, V256};
use crate::SIMD256;

pub(crate) const SHUFFLE_U16X8: V128 = V128::from_bytes([
    0x01, 0x00, 0x03, 0x02, 0x05, 0x04, 0x07, 0x06, //
    0x09, 0x08, 0x0b, 0x0a, 0x0d, 0x0c, 0x0f, 0x0e, //
]);

pub(crate) const SHUFFLE_U32X4: V128 = V128::from_bytes([
    0x03, 0x02, 0x01, 0x00, 0x07, 0x06, 0x05, 0x04, //
    0x0b, 0x0a, 0x09, 0x08, 0x0f, 0x0e, 0x0d, 0x0c, //
]);

pub(crate) const SHUFFLE_U64X2: V128 = V128::from_bytes([
    0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00, //
    0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08, //
]);

pub(crate) const SHUFFLE_U16X16: V256 = SHUFFLE_U16X8.x2();
pub(crate) const SHUFFLE_U32X8: V256 = SHUFFLE_U32X4.x2();
pub(crate) const SHUFFLE_U64X4: V256 = SHUFFLE_U64X2.x2();

pub unsafe trait BSwap: POD {
    const LANES: usize;
    fn swap_single(x: Self) -> Self;
    fn swap_simd<S: SIMD256>(s: S, a: V256) -> V256;
}

unsafe impl BSwap for u16 {
    const LANES: usize = 16;

    #[inline(always)]
    fn swap_single(x: Self) -> Self {
        x.swap_bytes()
    }

    #[inline(always)]
    fn swap_simd<S: SIMD256>(s: S, a: V256) -> V256 {
        s.u16x16_bswap(a)
    }
}

unsafe impl BSwap for u32 {
    const LANES: usize = 8;

    #[inline(always)]
    fn swap_single(x: Self) -> Self {
        x.swap_bytes()
    }

    #[inline(always)]
    fn swap_simd<S: SIMD256>(s: S, a: V256) -> V256 {
        s.u32x8_bswap(a)
    }
}

unsafe impl BSwap for u64 {
    const LANES: usize = 4;

    #[inline(always)]
    fn swap_single(x: Self) -> Self {
        x.swap_bytes()
    }

    #[inline(always)]
    fn swap_simd<S: SIMD256>(s: S, a: V256) -> V256 {
        s.u64x4_bswap(a)
    }
}

#[inline(always)]
pub unsafe fn bswap_fallback<T>(mut src: *const T, len: usize, mut dst: *mut T)
where
    T: BSwap,
{
    let end = src.add(len);
    while src < end {
        let x = src.read();
        let y = <T as BSwap>::swap_single(x);
        dst.write(y);
        src = src.add(1);
        dst = dst.add(1);
    }
}

#[inline(always)]
pub unsafe fn bswap_simd<S: SIMD256, T>(s: S, mut src: *const T, mut len: usize, mut dst: *mut T)
where
    T: BSwap,
{
    let end = src.add(len / T::LANES * T::LANES);
    while src < end {
        let x = s.v256_load_unaligned(src.cast());
        let y = <T as BSwap>::swap_simd(s, x);
        s.v256_store_unaligned(dst.cast(), y);
        src = src.add(T::LANES);
        dst = dst.add(T::LANES);
    }
    len %= T::LANES;
    bswap_fallback(src, len, dst);
}
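// Editor's addition: a minimal sanity check of the scalar fallback path,
// assuming only the definitions above.
#[cfg(test)]
mod fallback_example {
    use super::bswap_fallback;

    #[test]
    fn u32_bswap() {
        let src: [u32; 2] = [0x1234_5678, 0x9abc_def0];
        let mut dst: [u32; 2] = [0; 2];
        // SAFETY: both pointers are valid for `len == 2` elements.
        unsafe { bswap_fallback(src.as_ptr(), 2, dst.as_mut_ptr()) };
        assert_eq!(dst, [0x7856_3412, 0xf0de_bc9a]);
    }
}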
vsimd-0.8.0/src/hex.rs

use crate::alsw::{self, AlswLut};
use crate::isa::{AVX2, NEON, SSSE3, WASM128};
use crate::mask::{u8x16_highbit_any, u8x32_highbit_any};
use crate::pod::POD;
use crate::vector::{V128, V256, V64};
use crate::{Scalable, SIMD128, SIMD256};

pub const UPPER_CHARSET: &[u8; 16] = b"0123456789ABCDEF";
pub const LOWER_CHARSET: &[u8; 16] = b"0123456789abcdef";

const fn parse_hex(x: u8) -> u8 {
    match x {
        b'0'..=b'9' => x - b'0',
        b'a'..=b'f' => x - b'a' + 10,
        b'A'..=b'F' => x - b'A' + 10,
        _ => 0xff,
    }
}

#[inline(always)]
#[must_use]
pub const fn unhex(x: u8) -> u8 {
    const UNHEX_TABLE: &[u8; 256] = &{
        let mut arr = [0; 256];
        let mut i = 0;
        while i < 256 {
            arr[i] = parse_hex(i as u8);
            i += 1;
        }
        arr
    };
    UNHEX_TABLE[x as usize]
}

#[inline(always)]
pub fn check_xn<S, V>(s: S, x: V) -> bool
where
    S: Scalable<V>,
    V: POD,
{
    let x1 = s.u8xn_sub(x, s.u8xn_splat(0x30 + 0x80));
    let x2 = s.u8xn_sub(s.and(x, s.u8xn_splat(0xdf)), s.u8xn_splat(0x41 + 0x80));
    let m1 = s.i8xn_lt(x1, s.i8xn_splat(-118));
    let m2 = s.i8xn_lt(x2, s.i8xn_splat(-122));
    s.mask8xn_all(s.or(m1, m2))
}

pub const ENCODE_UPPER_LUT: V256 = V256::double_bytes(*UPPER_CHARSET);
pub const ENCODE_LOWER_LUT: V256 = V256::double_bytes(*LOWER_CHARSET);

#[inline(always)]
pub fn encode_bytes16<S: SIMD256>(s: S, x: V128, lut: V256) -> V256 {
    let x = s.u16x16_from_u8x16(x);
    let hi = s.u16x16_shl::<8>(x);
    let lo = s.u16x16_shr::<4>(x);
    let values = s.v256_and(s.v256_or(hi, lo), s.u8x32_splat(0x0f));
    s.u8x16x2_swizzle(lut, values)
}

#[inline(always)]
pub fn encode_bytes32<S: SIMD256>(s: S, x: V256, lut: V256) -> (V256, V256) {
    let m = s.u8x32_splat(0x0f);
    let hi = s.v256_and(s.u16x16_shr::<4>(x), m);
    let lo = s.v256_and(x, m);

    let ac = s.u8x16x2_zip_lo(hi, lo);
    let bd = s.u8x16x2_zip_hi(hi, lo);
    let ab = s.v128x2_zip_lo(ac, bd);
    let cd = s.v128x2_zip_hi(ac, bd);

    let y1 = s.u8x16x2_swizzle(lut, ab);
    let y2 = s.u8x16x2_swizzle(lut, cd);
    (y1, y2)
}

struct HexAlsw;

impl HexAlsw {
    const fn decode(c: u8) -> u8 {
        parse_hex(c)
    }

    const fn check_hash(i: u8) -> u8 {
        match i {
            0 => 1,
            1..=6 => 1,
            7..=9 => 6,
            0xA..=0xF => 8,
            _ => unreachable!(),
        }
    }

    const fn decode_hash(i: u8) -> u8 {
        Self::check_hash(i)
    }
}

impl_alsw!(HexAlsw);

const HEX_ALSW_CHECK: AlswLut<V128> = HexAlsw::check_lut();
const HEX_ALSW_DECODE: AlswLut<V128> = HexAlsw::decode_lut();

const HEX_ALSW_CHECK_X2: AlswLut<V256> = HexAlsw::check_lut().x2();
const HEX_ALSW_DECODE_X2: AlswLut<V256> = HexAlsw::decode_lut().x2();

const DECODE_UZP1: V256 = V256::double_bytes([
    0x00, 0x02, 0x04, 0x06, 0x08, 0x0a, 0x0c, 0x0e, //
    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, //
]);

const DECODE_UZP2: V256 = V256::double_bytes([
    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, //
    0x00, 0x02, 0x04, 0x06, 0x08, 0x0a, 0x0c, 0x0e, //
]);

#[inline(always)]
fn merge_bits<S: Scalable<V>, V: POD>(s: S, x: V) -> V {
    // x:  {0000hhhh|0000llll} xn
    let x1 = s.u16xn_shl::<4>(x);
    // x1: {hhhh0000|llll0000} xn
    let x2 = s.u16xn_shr::<12>(x1);
    // x2: {0000llll|00000000} xn
    s.or(x1, x2)
    // {hhhhllll|????????} xn
}

#[inline(always)]
fn decode16<S: SIMD128>(s: S, x: V128) -> (V128, V128) {
    let (c1, c2) = alsw::decode_ascii_xn(s, x, HEX_ALSW_CHECK, HEX_ALSW_DECODE);
    (merge_bits(s, c2), c1)
}

#[inline(always)]
fn decode32<S: SIMD256>(s: S, x: V256) -> (V256, V256) {
    let (c1, c2) = alsw::decode_ascii_xn(s, x, HEX_ALSW_CHECK_X2, HEX_ALSW_DECODE_X2);
    (merge_bits(s, c2), c1)
}

#[allow(clippy::result_unit_err)]
#[inline(always)]
pub fn decode_ascii16<S: SIMD128>(s: S, x: V128) -> Result<V64, ()> {
    let (y, is_invalid) = decode16(s, x);
    let ans = if matches_isa!(S, SSSE3 | WASM128) {
        const UZP1: V128 = DECODE_UZP1.to_v128x2().0;
        s.u8x16_swizzle(y, UZP1).to_v64x2().0
    } else if matches_isa!(S, NEON) {
        let (a, b) = y.to_v64x2();
        s.u8x8_unzip_even(a, b)
    } else {
        unreachable!()
    };
    if u8x16_highbit_any(s, is_invalid) {
        Err(())
    } else {
        Ok(ans)
    }
}
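// Editor's addition: scalar examples of the `unhex` reference table defined
// above, assuming nothing beyond this file's definitions.
#[cfg(test)]
mod unhex_examples {
    use super::unhex;

    #[test]
    fn scalar_unhex() {
        assert_eq!(unhex(b'0'), 0);
        assert_eq!(unhex(b'9'), 9);
        assert_eq!(unhex(b'a'), 10);
        assert_eq!(unhex(b'F'), 15);
        // Any non-hex byte maps to the 0xff sentinel.
        assert_eq!(unhex(b'g'), 0xff);
    }
}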
#[allow(clippy::result_unit_err)]
#[inline(always)]
pub fn decode_ascii32<S: SIMD256>(s: S, x: V256) -> Result<V128, ()> {
    let (y, is_invalid) = decode32(s, x);
    let ans = if matches_isa!(S, SSSE3 | WASM128) {
        let (a, b) = s.u8x16x2_swizzle(y, DECODE_UZP1).to_v128x2();
        s.u64x2_zip_lo(a, b)
    } else if matches_isa!(S, NEON) {
        let (a, b) = y.to_v128x2();
        s.u8x16_unzip_even(a, b)
    } else {
        unreachable!()
    };
    if u8x32_highbit_any(s, is_invalid) {
        Err(())
    } else {
        Ok(ans)
    }
}

#[allow(clippy::result_unit_err)]
#[inline(always)]
pub fn decode_ascii32x2<S: SIMD256>(s: S, x: (V256, V256)) -> Result<V256, ()> {
    let (y1, is_invalid1) = decode32(s, x.0);
    let (y2, is_invalid2) = decode32(s, x.1);
    let is_invalid = s.v256_or(is_invalid1, is_invalid2);

    let ans = if matches_isa!(S, AVX2) {
        let ab = s.u8x16x2_swizzle(y1, DECODE_UZP1);
        let cd = s.u8x16x2_swizzle(y2, DECODE_UZP2);
        let acbd = s.v256_or(ab, cd);
        s.u64x4_permute::<0b_1101_1000>(acbd) // 0213
    } else if matches_isa!(S, SSSE3 | WASM128) {
        let ab = s.u8x16x2_swizzle(y1, DECODE_UZP1);
        let cd = s.u8x16x2_swizzle(y2, DECODE_UZP1);
        s.u64x4_unzip_even(ab, cd)
    } else if matches_isa!(S, NEON) {
        s.u8x32_unzip_even(y1, y2)
    } else {
        unreachable!()
    };

    if u8x32_highbit_any(s, is_invalid) {
        Err(())
    } else {
        Ok(ans)
    }
}

pub mod sse2 {
    use crate::isa::SSE2;
    use crate::vector::{V128, V64};
    use crate::SIMD128;

    #[inline(always)]
    #[must_use]
    pub fn decode_nibbles(s: SSE2, x: V128) -> (V128, V128) {
        // http://0x80.pl/notesen/2022-01-17-validating-hex-parse.html
        // Algorithm 3
        let t1 = s.u8x16_add(x, s.u8x16_splat(0xff - b'9'));
        let t2 = s.u8x16_sub_sat(t1, s.u8x16_splat(6));
        let t3 = s.u8x16_sub(t2, s.u8x16_splat(0xf0));
        let t4 = s.v128_and(x, s.u8x16_splat(0xdf));
        let t5 = s.u8x16_sub(t4, s.u8x16_splat(0x41));
        let t6 = s.u8x16_add_sat(t5, s.u8x16_splat(10));
        let t7 = s.u8x16_min(t3, t6);
        let t8 = s.u8x16_add_sat(t7, s.u8x16_splat(127 - 15));
        (t7, t8)
    }

    #[inline(always)]
    #[must_use]
    pub fn merge_bits(s: SSE2, x: V128) -> V64 {
        let lo = s.u16x8_shr::<8>(x);
        let hi = s.u16x8_shl::<4>(x);
        let t1 = s.v128_or(lo, hi);
        let t2 = s.v128_and(t1, s.u16x8_splat(0x00ff));
        let t3 = s.i16x8_packus(t2, s.v128_create_zero());
        t3.to_v64x2().0
    }

    pub const LOWER_OFFSET: V128 = V128::from_bytes([0x27; 16]);
    pub const UPPER_OFFSET: V128 = V128::from_bytes([0x07; 16]);

    #[inline(always)]
    #[must_use]
    pub fn encode16(s: SSE2, x: V128, offset: V128) -> (V128, V128) {
        let m = s.u8x16_splat(0x0f);
        let hi = s.v128_and(s.u16x8_shr::<4>(x), m);
        let lo = s.v128_and(x, m);

        let c1 = s.u8x16_splat(0x30);
        let h1 = s.u8x16_add(hi, c1);
        let l1 = s.u8x16_add(lo, c1);

        let c2 = s.u8x16_splat(0x39);
        let h2 = s.v128_and(s.i8x16_lt(c2, h1), offset);
        let l2 = s.v128_and(s.i8x16_lt(c2, l1), offset);

        let h3 = s.u8x16_add(h1, h2);
        let l3 = s.u8x16_add(l1, l2);

        let y1 = s.u8x16_zip_lo(h3, l3);
        let y2 = s.u8x16_zip_hi(h3, l3);
        (y1, y2)
    }
}

#[cfg(test)]
mod algorithm {
    use super::*;

    #[test]
    #[ignore]
    fn check() {
        fn is_hex_v1(c: u8) -> bool {
            matches!(c, b'0'..=b'9' | b'a'..=b'f' | b'A'..=b'F')
        }

        fn is_hex_v2(c: u8) -> bool {
            let x1 = c.wrapping_sub(0x30);
            let x2 = (c & 0xdf).wrapping_sub(0x41);
            x1 < 10 || x2 < 6
        }

        fn is_hex_v3(c: u8) -> bool {
            let x1 = c.wrapping_sub(0x30 + 0x80);
            let x2 = (c & 0xdf).wrapping_sub(0x41 + 0x80);
            ((x1 as i8) < -118) || ((x2 as i8) < -122)
        }

        for c in 0..=255_u8 {
            let (v1, v2, v3) = (is_hex_v1(c), is_hex_v2(c), is_hex_v3(c));
            assert_eq!(v1, v2);
            assert_eq!(v1, v3);
        }
    }

    #[test]
    #[ignore]
    fn hex_alsw() {
        HexAlsw::test_check();
        HexAlsw::test_decode();
    }
}

vsimd-0.8.0/src/isa.rs

use crate::{SIMD128, SIMD256, SIMD64};

pub unsafe trait InstructionSet: Copy + 'static {
    const ID: InstructionSetTypeId;
    const ARCH: bool;

    unsafe fn new() -> Self;
    fn is_enabled() -> bool;
}

#[inline(always)]
#[must_use]
pub fn detect<S: InstructionSet>() -> Option<S> {
    S::is_enabled().then(|| unsafe { S::new() })
}
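// Editor's note: a typical usage sketch for `detect` (an illustrative
// addition, not part of the original source):
//
//     use vsimd::isa::{self, AVX2};
//
//     if let Some(isa) = isa::detect::<AVX2>() {
//         // `isa` is a zero-sized token proving that AVX2 is usable,
//         // suitable for passing to the SIMD kernels in this crate.
//     }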
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum InstructionSetTypeId {
    Fallback,
    SSE2,
    SSSE3,
    SSE41,
    AVX2,
    NEON,
    WASM128,
}

#[doc(hidden)]
#[inline]
#[must_use]
pub const fn matches_isa_impl<S, U>() -> bool
where
    S: InstructionSet,
    U: InstructionSet,
{
    #[allow(clippy::enum_glob_use)]
    use InstructionSetTypeId::*;

    let (self_ty, super_ty) = (S::ID, U::ID);
    let inherits = match self_ty {
        Fallback => matches!(super_ty, Fallback),
        SSE2 => matches!(super_ty, Fallback | SSE2),
        SSSE3 => matches!(super_ty, Fallback | SSE2 | SSSE3),
        SSE41 => matches!(super_ty, Fallback | SSE2 | SSSE3 | SSE41),
        AVX2 => matches!(super_ty, Fallback | SSE2 | SSSE3 | SSE41 | AVX2),
        NEON => matches!(super_ty, Fallback | NEON),
        WASM128 => matches!(super_ty, Fallback | WASM128),
    };

    S::ARCH && U::ARCH && inherits
}

#[macro_export]
macro_rules! is_isa_type {
    ($self:ident, $isa:ident) => {{
        matches!(
            <$self as $crate::isa::InstructionSet>::ID,
            <$isa as $crate::isa::InstructionSet>::ID
        )
    }};
}

#[macro_export]
macro_rules! matches_isa {
    ($self:ident, $super:ident $(| $other:ident)*) => {{
        // TODO: inline const
        use $crate::isa::InstructionSet;
        struct MatchesISA<S>(S);
        impl<S: InstructionSet> MatchesISA<S> {
            const VALUE: bool = {
                $crate::isa::matches_isa_impl::<S, $super>()
                $(|| $crate::isa::matches_isa_impl::<S, $other>())*
            };
        }
        MatchesISA::<$self>::VALUE
    }};
}

#[derive(Debug, Clone, Copy)]
pub struct Fallback(());

unsafe impl InstructionSet for Fallback {
    const ID: InstructionSetTypeId = InstructionSetTypeId::Fallback;
    const ARCH: bool = true;

    #[inline(always)]
    unsafe fn new() -> Self {
        Self(())
    }

    #[inline(always)]
    fn is_enabled() -> bool {
        true
    }
}

#[allow(unused_macros)]
macro_rules! is_feature_detected {
    ($feature:tt) => {{
        #[cfg(target_feature = $feature)]
        {
            true
        }
        #[cfg(not(target_feature = $feature))]
        {
            #[cfg(feature = "detect")]
            {
                #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
                {
                    std::arch::is_x86_feature_detected!($feature)
                }
                #[cfg(target_arch = "arm")]
                {
                    std::arch::is_arm_feature_detected!($feature)
                }
                #[cfg(target_arch = "aarch64")]
                {
                    std::arch::is_aarch64_feature_detected!($feature)
                }
                #[cfg(not(any(
                    target_arch = "x86",
                    target_arch = "x86_64",
                    target_arch = "arm",
                    target_arch = "aarch64"
                )))]
                {
                    false
                }
            }
            #[cfg(not(feature = "detect"))]
            {
                false
            }
        }
    }};
}
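// Editor's note (added commentary): `is_feature_detected!` resolves in three
// tiers. If the target feature is enabled at compile time it is statically
// `true`; otherwise, with the `detect` feature on, it defers to the `std`
// runtime detection macro for the current architecture; in all remaining
// cases it evaluates to `false`.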
macro_rules! x86_is_enabled {
    ($feature:tt) => {{
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        {
            is_feature_detected!($feature)
        }
        #[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))]
        {
            false
        }
    }};
}

#[derive(Debug, Clone, Copy)]
pub struct SSE2(());

unsafe impl InstructionSet for SSE2 {
    const ID: InstructionSetTypeId = InstructionSetTypeId::SSE2;
    const ARCH: bool = cfg!(any(target_arch = "x86", target_arch = "x86_64"));

    #[inline(always)]
    unsafe fn new() -> Self {
        Self(())
    }

    #[inline(always)]
    fn is_enabled() -> bool {
        x86_is_enabled!("sse2")
    }
}

unsafe impl SIMD64 for SSE2 {}
unsafe impl SIMD128 for SSE2 {}
unsafe impl SIMD256 for SSE2 {}

#[derive(Debug, Clone, Copy)]
pub struct SSSE3(());

unsafe impl InstructionSet for SSSE3 {
    const ID: InstructionSetTypeId = InstructionSetTypeId::SSSE3;
    const ARCH: bool = cfg!(any(target_arch = "x86", target_arch = "x86_64"));

    #[inline(always)]
    unsafe fn new() -> Self {
        Self(())
    }

    #[inline(always)]
    fn is_enabled() -> bool {
        x86_is_enabled!("ssse3")
    }
}

unsafe impl SIMD64 for SSSE3 {}
unsafe impl SIMD128 for SSSE3 {}
unsafe impl SIMD256 for SSSE3 {}

#[derive(Debug, Clone, Copy)]
pub struct SSE41(());

unsafe impl InstructionSet for SSE41 {
    const ID: InstructionSetTypeId = InstructionSetTypeId::SSE41;
    const ARCH: bool = cfg!(any(target_arch = "x86", target_arch = "x86_64"));

    #[inline(always)]
    unsafe fn new() -> Self {
        Self(())
    }

    #[inline(always)]
    fn is_enabled() -> bool {
        x86_is_enabled!("sse4.1")
    }
}

unsafe impl SIMD64 for SSE41 {}
unsafe impl SIMD128 for SSE41 {}
unsafe impl SIMD256 for SSE41 {}

#[derive(Debug, Clone, Copy)]
pub struct AVX2(());

unsafe impl InstructionSet for AVX2 {
    const ID: InstructionSetTypeId = InstructionSetTypeId::AVX2;
    const ARCH: bool = cfg!(any(target_arch = "x86", target_arch = "x86_64"));

    #[inline(always)]
    unsafe fn new() -> Self {
        Self(())
    }

    #[inline(always)]
    fn is_enabled() -> bool {
        x86_is_enabled!("avx2")
    }
}

unsafe impl SIMD64 for AVX2 {}
unsafe impl SIMD128 for AVX2 {}
unsafe impl SIMD256 for AVX2 {}

#[allow(clippy::upper_case_acronyms)]
#[derive(Debug, Clone, Copy)]
pub struct NEON(());

unsafe impl InstructionSet for NEON {
    const ID: InstructionSetTypeId = InstructionSetTypeId::NEON;
    const ARCH: bool = cfg!(any(target_arch = "arm", target_arch = "aarch64"));

    #[inline(always)]
    unsafe fn new() -> Self {
        Self(())
    }

    #[inline(always)]
    fn is_enabled() -> bool {
        #[cfg(target_arch = "arm")]
        {
            #[cfg(feature = "unstable")]
            {
                is_feature_detected!("neon")
            }
            #[cfg(not(feature = "unstable"))]
            {
                false
            }
        }
        #[cfg(target_arch = "aarch64")]
        {
            is_feature_detected!("neon")
        }
        #[cfg(not(any(target_arch = "arm", target_arch = "aarch64")))]
        {
            false
        }
    }
}

unsafe impl SIMD64 for NEON {}
unsafe impl SIMD128 for NEON {}
unsafe impl SIMD256 for NEON {}

#[derive(Debug, Clone, Copy)]
pub struct WASM128(());

unsafe impl InstructionSet for WASM128 {
    const ID: InstructionSetTypeId = InstructionSetTypeId::WASM128;
    const ARCH: bool = cfg!(target_arch = "wasm32");

    #[inline(always)]
    unsafe fn new() -> Self {
        Self(())
    }

    #[inline(always)]
    fn is_enabled() -> bool {
        #[cfg(target_arch = "wasm32")]
        {
            is_feature_detected!("simd128")
        }
        #[cfg(not(target_arch = "wasm32"))]
        {
            false
        }
    }
}

unsafe impl SIMD64 for WASM128 {}
unsafe impl SIMD128 for WASM128 {}
unsafe impl SIMD256 for WASM128 {}
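// Editor's addition: a small sanity check of the ISA inheritance lattice
// encoded by `matches_isa_impl`. It is gated to x86 targets because `ARCH`
// is part of the predicate.
#[cfg(all(test, any(target_arch = "x86", target_arch = "x86_64")))]
mod matches_isa_example {
    use super::*;

    #[test]
    fn isa_lattice() {
        // AVX2 code may assume every SSE level below it.
        assert!(matches_isa!(AVX2, SSE2));
        assert!(matches_isa!(AVX2, SSSE3));
        // SSE2 code may not assume AVX2, and NEON is unrelated to x86.
        assert!(!matches_isa!(SSE2, AVX2));
        assert!(!matches_isa!(SSE2, NEON));
    }
}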
vsimd-0.8.0/src/lib.rs

//! ⚠️ This crate contains shared implementation details. Do not directly depend on it.
#![cfg_attr(not(any(test, feature = "std")), no_std)]
#![cfg_attr(
    feature = "unstable",
    feature(stdsimd),
    feature(arm_target_feature),
    feature(portable_simd),
    feature(inline_const),
    feature(array_chunks)
)]
#![cfg_attr(docsrs, feature(doc_cfg))]
#![cfg_attr(test, deny(warnings))]
//
#![deny(
    missing_debug_implementations,
    missing_docs,
    clippy::all,
    clippy::pedantic,
    clippy::cargo,
    clippy::missing_inline_in_public_items
)]
#![warn(clippy::todo)]
#![allow(
    clippy::inline_always,
    missing_docs,
    clippy::missing_safety_doc,
    clippy::missing_errors_doc,
    clippy::missing_panics_doc,
    clippy::cast_possible_truncation,
    clippy::cast_sign_loss,
    clippy::cast_possible_wrap,
    clippy::cast_lossless,
    clippy::verbose_bit_mask,
    clippy::module_name_repetitions,
    clippy::wildcard_imports,
    clippy::items_after_statements,
    clippy::match_same_arms,
    clippy::many_single_char_names
)]

#[cfg(feature = "alloc")]
extern crate alloc;

#[macro_use]
mod macros;

#[macro_use]
pub mod isa;

pub mod vector;

#[macro_use]
pub mod pod;
pub use self::pod::POD;

mod simulation;
mod unified;

mod simd64;
pub use self::simd64::SIMD64;

mod simd128;
pub use self::simd128::SIMD128;

#[macro_use]
mod simd256;
pub use self::simd256::SIMD256;

mod scalable;
pub use self::scalable::Scalable;

mod algorithm;
pub mod tools;

#[macro_use]
pub mod alsw;

pub mod ascii;
pub mod bswap;
pub mod hex;
pub mod mask;
pub mod native;
pub mod table;

#[cfg(feature = "unstable")]
pub mod unstable;

vsimd-0.8.0/src/macros.rs

#[macro_export]
macro_rules! item_group {
    ($($item:item)*) => {
        $($item)*
    }
}

macro_rules! debug_assert_ptr_align {
    ($ptr:expr, $align:literal) => {{
        let align: usize = $align;
        let ptr = <*const _>::cast::<()>($ptr);
        let addr = ptr as usize;
        debug_assert!(addr % align == 0)
    }};
}

#[macro_export]
macro_rules! shared_docs {
    () => {
        r#"
# Profile settings

To ensure maximum performance, the following [profile settings](https://doc.rust-lang.org/cargo/reference/profiles.html#profile-settings) are recommended when compiling this crate:

```toml
opt-level = 3
lto = "fat"
codegen-units = 1
```

# CPU feature detection

The feature flag `detect` is enabled by default.

When the feature flag `detect` is enabled, the APIs will *test at runtime* whether *the CPU (and OS)* supports the required instruction set. The runtime detection will be skipped if the fastest implementation is already available at compile-time.

When the feature flag `detect` is disabled, the APIs will *test at compile-time* whether *the compiler flags* support the required instruction set.

If the environment supports SIMD acceleration, the APIs will call SIMD functions under the hood. Otherwise, the APIs will call fallback functions.

When the feature flag `unstable` is enabled, this crate requires the nightly toolchain to compile.

# `no_std` support

You can disable the default features to use this crate in a `no_std` environment.

You can enable the feature flag `alloc` if the environment supports heap allocation.

Currently the feature flag `detect` depends on the standard library. Dynamic CPU feature detection is not available in `no_std` environments.
"#
    };
}
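// Editor's note (added commentary): `dispatch!` expands to a module `$name`
// containing one `#[target_feature]`-compiled entry point per listed target,
// plus an `auto` entry point that resolves the best implementation either
// statically or through a relaxed atomic ifunc cache. A call site looks
// roughly like the sketch below; the function names are hypothetical and
// `demo_simd` is expected to be generic over `S: SIMD256`:
//
//     vsimd::dispatch!(
//         name      = {demo},
//         signature = {pub unsafe fn(src: *const u8, len: usize) -> u32},
//         fallback  = {demo_fallback},
//         simd      = {demo_simd},
//         targets   = {"avx2", "ssse3", "sse2", "neon", "simd128"},
//         fastest   = {"avx2", "neon", "simd128"},
//     );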
#[macro_export]
macro_rules! dispatch {
    (
        name = {$name:ident},
        signature = {$vis:vis unsafe fn($($arg_name: ident: $arg_type: ty),*) -> $ret:ty},
        fallback = {$fallback_fn:path},
        simd = {$simd_fn:path},
        targets = {$($target:tt),+},
        fastest = {$($fastest:tt),*},
    ) => {
        $vis mod $name {
            #![allow(
                clippy::missing_safety_doc,
                clippy::must_use_candidate,
            )]

            use super::*;
            use $crate::SIMD256;

            #[allow(dead_code)]
            #[inline]
            $vis unsafe fn simd<S: SIMD256>(s: S $(,$arg_name: $arg_type)*) -> $ret {
                $simd_fn(s, $($arg_name),*)
            }

            $crate::dispatch!(
                @iter_compile,
                signature = {$vis unsafe fn($($arg_name: $arg_type),*) -> $ret},
                simd = {$simd_fn},
                targets = {$($target),+},
            );

            #[allow(unreachable_code)]
            #[cfg(not(all(feature = "detect", not(target_arch = "wasm32"))))] // auto_direct
            #[inline]
            $vis unsafe fn auto($($arg_name: $arg_type),*) -> $ret {
                $crate::dispatch!(
                    @iter_resolve_static,
                    targets = {$($target),+},
                    args = {$($arg_name),*},
                );
                $fallback_fn($($arg_name),*)
            }

            #[cfg(all(feature = "detect", not(target_arch = "wasm32")))] // auto_indirect
            $crate::item_group! {
                use core::sync::atomic::{AtomicPtr, Ordering::Relaxed};

                static IFUNC: AtomicPtr<()> = AtomicPtr::new(init_ifunc as *mut ());

                #[inline(always)]
                fn resolve() -> unsafe fn($($arg_type),*) -> $ret {
                    use $crate::isa::InstructionSet;
                    $crate::dispatch!(@iter_resolve_dynamic, targets = {$($target),+},);
                    $fallback_fn
                }

                #[inline]
                unsafe fn init_ifunc($($arg_name: $arg_type),*) -> $ret {
                    let f = resolve();
                    IFUNC.store(f as *mut (), Relaxed);
                    f($($arg_name),*)
                }

                #[allow(unreachable_code)]
                #[inline]
                $vis unsafe fn auto($($arg_name: $arg_type),*) -> $ret {
                    $crate::dispatch!(
                        @iter_resolve_static,
                        targets = {$($fastest),+},
                        args = {$($arg_name),*},
                    );
                    let f: unsafe fn($($arg_type),*) -> $ret = core::mem::transmute(IFUNC.load(Relaxed));
                    f($($arg_name),*)
                }
            }
        }
    };
    (
        @iter_resolve_static,
        targets = {$x:tt, $($xs:tt),+},
        args = {$($arg_name: ident),*},
    ) => {
        $crate::dispatch!(@resolve_static, $x, $($arg_name),*);
        $crate::dispatch!(@iter_resolve_static, targets = {$($xs),+}, args = {$($arg_name),*},);
    };
    (
        @iter_resolve_static,
        targets = {$x:tt},
        args = {$($arg_name: ident),*},
    ) => {
        $crate::dispatch!(@resolve_static, $x, $($arg_name),*);
    };
    (@resolve_static, "avx2", $($arg_name: ident),*) => {
        #[cfg(all(
            any(target_arch = "x86", target_arch = "x86_64"),
            target_feature = "avx2"
        ))]
        {
            return unsafe { avx2($($arg_name),*) }
        }
    };
    (@resolve_static, "sse4.1", $($arg_name: ident),*) => {
        #[cfg(all(
            any(target_arch = "x86", target_arch = "x86_64"),
            target_feature = "sse4.1"
        ))]
        {
            return unsafe { sse41($($arg_name),*) }
        }
    };
    (@resolve_static, "ssse3", $($arg_name: ident),*) => {
        #[cfg(all(
            any(target_arch = "x86", target_arch = "x86_64"),
            target_feature = "ssse3"
        ))]
        {
            return unsafe { ssse3($($arg_name),*) }
        }
    };
    (@resolve_static, "sse2", $($arg_name: ident),*) => {
        #[cfg(all(
            any(target_arch = "x86", target_arch = "x86_64"),
            target_feature = "sse2"
        ))]
        {
            return unsafe { sse2($($arg_name),*) }
        }
    };
    (@resolve_static, "neon", $($arg_name: ident),*) => {
        #[cfg(any(
            all(feature = "unstable", target_arch = "arm", target_feature = "neon"),
            all(target_arch = "aarch64", target_feature = "neon"),
        ))]
        {
            return unsafe { neon($($arg_name),*) }
        }
    };
    (@resolve_static, "simd128", $($arg_name: ident),*) => {
        #[cfg(all(
            target_arch = "wasm32",
            target_feature = "simd128",
        ))]
        {
            return unsafe { simd128($($arg_name),*) }
        }
    };
    (
        @iter_resolve_dynamic,
        targets = {$x:tt, $($xs:tt),+},
    ) => {
        $crate::dispatch!(@resolve_dynamic, $x);
        $crate::dispatch!(@iter_resolve_dynamic, targets = {$($xs),+},);
    };
    (
        @iter_resolve_dynamic,
        targets = {$x:tt},
    ) => {
        $crate::dispatch!(@resolve_dynamic, $x);
    };
    (@resolve_dynamic, "avx2") => {
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        if $crate::isa::AVX2::is_enabled() {
            return avx2;
        }
    };
    (@resolve_dynamic, "sse4.1") => {
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        if $crate::isa::SSE41::is_enabled() {
            return sse41;
        }
    };
    (@resolve_dynamic, "ssse3") => {
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        if $crate::isa::SSSE3::is_enabled() {
            return ssse3;
        }
    };
    (@resolve_dynamic, "sse2") => {
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        if $crate::isa::SSE2::is_enabled() {
            return sse2;
        }
    };
    (@resolve_dynamic, "neon") => {
        #[cfg(any(all(feature = "unstable", target_arch = "arm"), target_arch = "aarch64"))]
        if $crate::isa::NEON::is_enabled() {
            return neon;
        }
    };
    (@resolve_dynamic, "simd128") => {
        #[cfg(target_arch = "wasm32")]
        if $crate::isa::WASM128::is_enabled() {
            return simd128;
        }
    };
    (
        @iter_compile,
        signature = {$vis:vis unsafe fn($($arg_name: ident: $arg_type: ty),*) -> $ret:ty},
        simd = {$simd_fn:path},
        targets = {$x:tt, $($xs:tt),+},
    ) => {
        $crate::dispatch!(
            @compile,
            signature = {$vis unsafe fn($($arg_name: $arg_type),*) -> $ret},
            simd = {$simd_fn},
            target = {$x},
        );
        $crate::dispatch!(
            @iter_compile,
            signature = {$vis unsafe fn($($arg_name: $arg_type),*) -> $ret},
            simd = {$simd_fn},
            targets = {$($xs),+},
        );
    };
    (
        @iter_compile,
        signature = {$vis:vis unsafe fn($($arg_name: ident: $arg_type: ty),*) -> $ret:ty},
        simd = {$simd_fn:path},
        targets = {$x:tt},
    ) => {
        $crate::dispatch!(
            @compile,
            signature = {$vis unsafe fn($($arg_name: $arg_type),*) -> $ret},
            simd = {$simd_fn},
            target = {$x},
        );
    };
    (
        @compile,
        signature = {$vis:vis unsafe fn($($arg_name: ident: $arg_type: ty),*) -> $ret:ty},
        simd = {$simd_fn:path},
        target = {"avx2"},
    ) => {
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        #[inline]
        #[target_feature(enable = "avx2")]
        $vis unsafe fn avx2($($arg_name:$arg_type),*) -> $ret {
            use $crate::isa::{AVX2, InstructionSet as _};
            $simd_fn(AVX2::new() $(,$arg_name)*)
        }
    };
    (
        @compile,
        signature = {$vis:vis unsafe fn($($arg_name: ident: $arg_type: ty),*) -> $ret:ty},
        simd = {$simd_fn:path},
        target = {"sse4.1"},
    ) => {
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        #[inline]
        #[target_feature(enable = "sse4.1")]
        $vis unsafe fn sse41($($arg_name:$arg_type),*) -> $ret {
            use $crate::isa::{SSE41, InstructionSet as _};
            $simd_fn(SSE41::new() $(,$arg_name)*)
        }
    };
    (
        @compile,
        signature = {$vis:vis unsafe fn($($arg_name: ident: $arg_type: ty),*) -> $ret:ty},
        simd = {$simd_fn:path},
        target = {"ssse3"},
    ) => {
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        #[inline]
        #[target_feature(enable = "ssse3")]
        $vis unsafe fn ssse3($($arg_name:$arg_type),*) -> $ret {
            use $crate::isa::{SSSE3, InstructionSet as _};
            $simd_fn(SSSE3::new() $(,$arg_name)*)
        }
    };
    (
        @compile,
        signature = {$vis:vis unsafe fn($($arg_name: ident: $arg_type: ty),*) -> $ret:ty},
        simd = {$simd_fn:path},
        target = {"sse2"},
    ) => {
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        #[inline]
        #[target_feature(enable = "sse2")]
        $vis unsafe fn sse2($($arg_name:$arg_type),*) -> $ret {
            use $crate::isa::{SSE2, InstructionSet as _};
            $simd_fn(SSE2::new() $(,$arg_name)*)
        }
    };
    (
        @compile,
        signature = {$vis:vis unsafe fn($($arg_name: ident: $arg_type: ty),*) -> $ret:ty},
        simd = {$simd_fn:path},
        target = {"neon"},
    ) => {
        #[cfg(any(all(feature = "unstable", target_arch = "arm"), target_arch = "aarch64"))]
        #[inline]
        #[target_feature(enable = "neon")]
        $vis unsafe fn neon($($arg_name:$arg_type),*) -> $ret {
            use $crate::isa::{NEON, InstructionSet as _};
            $simd_fn(NEON::new() $(,$arg_name)*)
        }
    };
    (
        @compile,
        signature = {$vis:vis unsafe fn($($arg_name: ident: $arg_type: ty),*) -> $ret:ty},
        simd = {$simd_fn:path},
        target = {"simd128"},
    ) => {
        #[cfg(target_arch = "wasm32")]
        #[cfg_attr(not(vsimd_dump_symbols), inline)]
        #[target_feature(enable = "simd128")]
        $vis unsafe fn simd128($($arg_name:$arg_type),*) -> $ret {
            use $crate::isa::{WASM128, InstructionSet as _};
            $simd_fn(WASM128::new() $(,$arg_name)*)
        }
    }
}

vsimd-0.8.0/src/mask.rs

use crate::isa::{AVX2, NEON, SSE2, WASM128};
use crate::vector::{V128, V256};
use crate::{SIMD128, SIMD256};

use core::ops::Not;

#[inline(always)]
pub fn mask8x16_all<S: SIMD128>(s: S, x: V128) -> bool {
    if matches_isa!(S, SSE2 | WASM128) {
        return s.u8x16_bitmask(x) == u16::MAX;
    }
    if matches_isa!(S, NEON) {
        if cfg!(target_arch = "arm") {
            return s.u8x16_any_zero(x).not();
        }
        if cfg!(target_arch = "aarch64") {
            return s.u8x16_reduce_min(x) != 0;
        }
    }
    unreachable!()
}

#[inline(always)]
pub fn mask8x32_all<S: SIMD256>(s: S, x: V256) -> bool {
    if matches_isa!(S, AVX2) {
        return s.u8x32_bitmask(x) == u32::MAX;
    }
    if matches_isa!(S, SSE2 | WASM128 | NEON) {
        let x = x.to_v128x2();
        let x = s.v128_and(x.0, x.1);
        return mask8x16_all(s, x);
    }
    unreachable!()
}

#[inline(always)]
pub fn mask8x16_any<S: SIMD128>(s: S, x: V128) -> bool {
    if matches_isa!(S, SSE2 | WASM128) {
        return s.u8x16_bitmask(x) != 0;
    }
    if matches_isa!(S, NEON) {
        return s.v128_all_zero(x).not();
    }
    unreachable!()
}

#[inline(always)]
pub fn mask8x32_any<S: SIMD256>(s: S, x: V256) -> bool {
    if matches_isa!(S, AVX2) {
        return s.u8x32_bitmask(x) != 0;
    }
    if matches_isa!(S, SSE2 | WASM128 | NEON) {
        let x = x.to_v128x2();
        let x = s.v128_or(x.0, x.1);
        return mask8x16_any(s, x);
    }
    unreachable!()
}

#[inline(always)]
pub fn u8x16_highbit_all<S: SIMD128>(s: S, x: V128) -> bool {
    if matches_isa!(S, SSE2 | WASM128) {
        return s.u8x16_bitmask(x) == u16::MAX;
    }
    if matches_isa!(S, NEON) {
        if cfg!(target_arch = "arm") {
            return mask8x16_all(s, s.i8x16_lt(x, s.v128_create_zero()));
        }
        if cfg!(target_arch = "aarch64") {
            return s.u8x16_reduce_min(x) >= 0x80;
        }
    }
    unreachable!()
}

#[inline(always)]
pub fn u8x32_highbit_all<S: SIMD256>(s: S, x: V256) -> bool {
    if matches_isa!(S, AVX2) {
        return s.u8x32_bitmask(x) == u32::MAX;
    }
    if matches_isa!(S, SSE2 | WASM128 | NEON) {
        let x = x.to_v128x2();
        let x = s.v128_and(x.0, x.1);
        return u8x16_highbit_all(s, x);
    }
    unreachable!()
}

#[inline(always)]
pub fn u8x16_highbit_any<S: SIMD128>(s: S, x: V128) -> bool {
    if matches_isa!(S, SSE2 | WASM128) {
        return s.u8x16_bitmask(x) != 0;
    }
    if matches_isa!(S, NEON) {
        if cfg!(target_arch = "arm") {
            return mask8x16_any(s, s.i8x16_lt(x, s.v128_create_zero()));
        }
        if cfg!(target_arch = "aarch64") {
            return s.u8x16_reduce_max(x) >= 0x80;
        }
    }
    unreachable!()
}

#[inline(always)]
pub fn u8x32_highbit_any<S: SIMD256>(s: S, x: V256) -> bool {
    if matches_isa!(S, AVX2) {
        return s.u8x32_bitmask(x) != 0;
    }
    if matches_isa!(S, SSE2 | WASM128 | NEON) {
        let x = x.to_v128x2();
        let x = s.v128_or(x.0, x.1);
        return u8x16_highbit_any(s, x);
    }
    unreachable!()
}
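// Editor's note (added commentary): the predicates above pick one of two
// reduction strategies. On SSE2/AVX2 and WASM they compress the lane masks
// through `u8x16_bitmask`/`u8x32_bitmask` and compare the resulting integer;
// NEON has no cheap movemask, so A64 uses the horizontal `u8x16_reduce_min`/
// `u8x16_reduce_max` instructions and A32 folds lanes pairwise instead.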
"arm"), target_arch = "aarch64"))] Neon, #[cfg(target_arch = "wasm32")] Simd128, Fallback, } impl Native { #[inline] #[must_use] pub fn detect() -> Self { #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] { if is_feature_detected!("avx2") { return Self(Arch::Avx2); } if is_feature_detected!("sse4.1") { return Self(Arch::Sse41); } if is_feature_detected!("sse2") { return Self(Arch::Sse2); } } #[cfg(any(all(feature = "unstable", target_arch = "arm"), target_arch = "aarch64"))] { if is_feature_detected!("neon") { return Self(Arch::Neon); } } #[cfg(target_arch = "wasm32")] { if is_feature_detected!("simd128") { return Self(Arch::Simd128); } } Self(Arch::Fallback) } #[inline] pub fn exec(self, f: F) -> O where F: FnOnce() -> O, { #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] { match self.0 { Arch::Avx2 => unsafe { x86::avx2(f) }, Arch::Sse41 => unsafe { x86::sse41(f) }, Arch::Sse2 => unsafe { x86::sse2(f) }, Arch::Fallback => f(), } } #[cfg(any(all(feature = "unstable", target_arch = "arm"), target_arch = "aarch64"))] { match self.0 { Arch::Neon => unsafe { arm::neon(f) }, Arch::Fallback => f(), } } #[cfg(target_arch = "wasm32")] { match self.0 { Arch::Simd128 => unsafe { wasm::simd128(f) }, Arch::Fallback => f(), } } #[cfg(not(any( // any(target_arch = "x86", target_arch = "x86_64"), // any(all(feature = "unstable", target_arch = "arm"), target_arch = "aarch64"), // target_arch = "wasm32" // )))] { f() } } } #[allow(unused_macros)] macro_rules! generic_dispatch { ($name: ident, $feature: tt) => { #[inline] #[target_feature(enable = $feature)] pub unsafe fn $name(f: F) -> O where F: FnOnce() -> O, { f() } }; } #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] mod x86 { generic_dispatch!(avx2, "avx2"); generic_dispatch!(sse41, "sse4.1"); generic_dispatch!(sse2, "sse2"); } #[cfg(any(all(feature = "unstable", target_arch = "arm"), target_arch = "aarch64"))] mod arm { generic_dispatch!(neon, "neon"); } #[cfg(target_arch = "wasm32")] mod wasm { generic_dispatch!(simd128, "simd128"); } vsimd-0.8.0/src/pod.rs000064400000000000000000000023021046102023000126670ustar 00000000000000use crate::vector::{V128, V256, V512, V64}; pub unsafe trait POD: Copy + 'static { const ID: PodTypeId; } macro_rules! mark_pod { ($($ty:ident),*) => { $( unsafe impl POD for $ty { const ID: PodTypeId = PodTypeId::$ty; } )* }; } mark_pod!(u8, u16, u32, u64, u128, usize); mark_pod!(i8, i16, i32, i64, i128, isize); mark_pod!(f32, f64); mark_pod!(V64, V128, V256, V512); #[inline(always)] pub fn align(slice: &[T]) -> (&[T], &[U], &[T]) { unsafe { slice.align_to() } } #[allow(non_camel_case_types)] #[derive(Debug, Clone, Copy)] pub enum PodTypeId { u8, u16, u32, u64, u128, usize, i8, i16, i32, i64, i128, isize, f32, f64, V64, V128, V256, V512, } #[macro_export] macro_rules! 
#[macro_export]
macro_rules! is_pod_type {
    ($self:ident, $x:ident $(| $xs:ident)*) => {{
        // TODO: inline const
        use $crate::pod::POD;
        struct IsPodType<T>(T);
        impl<T: POD> IsPodType<T> {
            const VALUE: bool = {
                matches!(<T as POD>::ID, $crate::pod::PodTypeId::$x $(| $crate::pod::PodTypeId::$xs)*)
            };
        }
        IsPodType::<$self>::VALUE
    }};
}

vsimd-0.8.0/src/scalable.rs

use crate::isa::InstructionSet;
use crate::pod::POD;
use crate::vector::{V128, V256};
use crate::{mask::*, unified};
use crate::{SIMD128, SIMD256};

pub unsafe trait Scalable<V: POD>: InstructionSet {
    #[inline(always)]
    fn and(self, a: V, b: V) -> V {
        unified::and(self, a, b)
    }

    #[inline(always)]
    fn or(self, a: V, b: V) -> V {
        unified::or(self, a, b)
    }

    #[inline(always)]
    fn xor(self, a: V, b: V) -> V {
        unified::xor(self, a, b)
    }

    #[inline(always)]
    fn andnot(self, a: V, b: V) -> V {
        unified::andnot(self, a, b)
    }

    #[inline(always)]
    fn u8xn_splat(self, x: u8) -> V {
        unified::splat::<_, u8, _>(self, x)
    }

    #[inline(always)]
    fn i8xn_splat(self, x: i8) -> V {
        unified::splat::<_, i8, _>(self, x)
    }

    #[inline(always)]
    fn u32xn_splat(self, x: u32) -> V {
        unified::splat::<_, u32, _>(self, x)
    }

    #[inline(always)]
    fn u8xn_add(self, a: V, b: V) -> V {
        unified::add::<_, u8, _>(self, a, b)
    }

    #[inline(always)]
    fn u8xn_sub(self, a: V, b: V) -> V {
        unified::sub::<_, u8, _>(self, a, b)
    }

    #[inline(always)]
    fn u32xn_sub(self, a: V, b: V) -> V {
        unified::sub::<_, u32, _>(self, a, b)
    }

    #[inline(always)]
    fn u8xn_add_sat(self, a: V, b: V) -> V {
        unified::add_sat::<_, u8, _>(self, a, b)
    }

    #[inline(always)]
    fn i8xn_add_sat(self, a: V, b: V) -> V {
        unified::add_sat::<_, i8, _>(self, a, b)
    }

    #[inline(always)]
    fn u8xn_sub_sat(self, a: V, b: V) -> V {
        unified::sub_sat::<_, u8, _>(self, a, b)
    }

    #[inline(always)]
    fn u8xn_eq(self, a: V, b: V) -> V {
        unified::eq::<_, u8, _>(self, a, b)
    }

    #[inline(always)]
    fn i8xn_lt(self, a: V, b: V) -> V {
        unified::lt::<_, i8, _>(self, a, b)
    }

    #[inline(always)]
    fn u32xn_lt(self, a: V, b: V) -> V {
        unified::lt::<_, u32, _>(self, a, b)
    }

    #[inline(always)]
    fn u32xn_max(self, a: V, b: V) -> V {
        unified::max::<_, u32, _>(self, a, b)
    }

    fn u16xn_shl<const IMM8: i32>(self, a: V) -> V;
    fn u16xn_shr<const IMM8: i32>(self, a: V) -> V;
    fn u32xn_shr<const IMM8: i32>(self, a: V) -> V;

    fn u8xn_avgr(self, a: V, b: V) -> V;

    fn u8x16xn_swizzle(self, a: V, b: V) -> V;

    fn all_zero(self, a: V) -> bool;

    fn mask8xn_all(self, a: V) -> bool;
    fn mask8xn_any(self, a: V) -> bool;

    fn u8xn_highbit_all(self, a: V) -> bool;
    fn u8xn_highbit_any(self, a: V) -> bool;

    fn u16xn_bswap(self, a: V) -> V;
    fn u32xn_bswap(self, a: V) -> V;
    fn u64xn_bswap(self, a: V) -> V;
}
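// Editor's note (added commentary): `Scalable<V>` is the width-generic layer.
// Kernels such as the hex and ALSW routines are written once against it and
// are instantiated with `V = V128` on 128-bit ISAs or `V = V256` on AVX2
// through the two blanket impls below.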
unsafe impl<S> Scalable<V128> for S
where
    S: SIMD128,
{
    #[inline(always)]
    fn u16xn_shl<const IMM8: i32>(self, a: V128) -> V128 {
        self.u16x8_shl::<IMM8>(a)
    }

    #[inline(always)]
    fn u16xn_shr<const IMM8: i32>(self, a: V128) -> V128 {
        self.u16x8_shr::<IMM8>(a)
    }

    #[inline(always)]
    fn u32xn_shr<const IMM8: i32>(self, a: V128) -> V128 {
        self.u32x4_shr::<IMM8>(a)
    }

    #[inline(always)]
    fn u8xn_avgr(self, a: V128, b: V128) -> V128 {
        self.u8x16_avgr(a, b)
    }

    #[inline(always)]
    fn u8x16xn_swizzle(self, a: V128, b: V128) -> V128 {
        self.u8x16_swizzle(a, b)
    }

    #[inline(always)]
    fn all_zero(self, a: V128) -> bool {
        self.v128_all_zero(a)
    }

    #[inline(always)]
    fn mask8xn_all(self, a: V128) -> bool {
        mask8x16_all(self, a)
    }

    #[inline(always)]
    fn mask8xn_any(self, a: V128) -> bool {
        mask8x16_any(self, a)
    }

    #[inline(always)]
    fn u8xn_highbit_all(self, a: V128) -> bool {
        u8x16_highbit_all(self, a)
    }

    #[inline(always)]
    fn u8xn_highbit_any(self, a: V128) -> bool {
        u8x16_highbit_any(self, a)
    }

    #[inline(always)]
    fn u16xn_bswap(self, a: V128) -> V128 {
        self.u16x8_bswap(a)
    }

    #[inline(always)]
    fn u32xn_bswap(self, a: V128) -> V128 {
        self.u32x4_bswap(a)
    }

    #[inline(always)]
    fn u64xn_bswap(self, a: V128) -> V128 {
        self.u64x2_bswap(a)
    }
}

unsafe impl<S> Scalable<V256> for S
where
    S: SIMD256,
{
    #[inline(always)]
    fn u16xn_shl<const IMM8: i32>(self, a: V256) -> V256 {
        self.u16x16_shl::<IMM8>(a)
    }

    #[inline(always)]
    fn u16xn_shr<const IMM8: i32>(self, a: V256) -> V256 {
        self.u16x16_shr::<IMM8>(a)
    }

    #[inline(always)]
    fn u32xn_shr<const IMM8: i32>(self, a: V256) -> V256 {
        self.u32x8_shr::<IMM8>(a)
    }

    #[inline(always)]
    fn u8xn_avgr(self, a: V256, b: V256) -> V256 {
        self.u8x32_avgr(a, b)
    }

    #[inline(always)]
    fn u8x16xn_swizzle(self, a: V256, b: V256) -> V256 {
        self.u8x16x2_swizzle(a, b)
    }

    #[inline(always)]
    fn all_zero(self, a: V256) -> bool {
        self.v256_all_zero(a)
    }

    #[inline(always)]
    fn mask8xn_all(self, a: V256) -> bool {
        mask8x32_all(self, a)
    }

    #[inline(always)]
    fn mask8xn_any(self, a: V256) -> bool {
        mask8x32_any(self, a)
    }

    #[inline(always)]
    fn u8xn_highbit_all(self, a: V256) -> bool {
        u8x32_highbit_all(self, a)
    }

    #[inline(always)]
    fn u8xn_highbit_any(self, a: V256) -> bool {
        u8x32_highbit_any(self, a)
    }

    #[inline(always)]
    fn u16xn_bswap(self, a: V256) -> V256 {
        self.u16x16_bswap(a)
    }

    #[inline(always)]
    fn u32xn_bswap(self, a: V256) -> V256 {
        self.u32x8_bswap(a)
    }

    #[inline(always)]
    fn u64xn_bswap(self, a: V256) -> V256 {
        self.u64x4_bswap(a)
    }
}

vsimd-0.8.0/src/simd128.rs

use crate::isa::{NEON, SSE2, SSE41, WASM128};
use crate::unified;
use crate::vector::V128;
use crate::SIMD64;

#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
use crate::isa::SSSE3;

#[cfg(any(
    any(target_arch = "x86", target_arch = "x86_64"),
    any(all(feature = "unstable", target_arch = "arm"), target_arch = "aarch64"),
    target_arch = "wasm32"
))]
use core::mem::transmute as t;

#[cfg(target_arch = "x86")]
use core::arch::x86::*;

#[cfg(target_arch = "x86_64")]
use core::arch::x86_64::*;

#[cfg(all(feature = "unstable", target_arch = "arm"))]
use core::arch::arm::*;

#[cfg(target_arch = "aarch64")]
use core::arch::aarch64::*;

#[cfg(target_arch = "wasm32")]
use core::arch::wasm32::*;
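// Editor's reading (an assumption, not stated in the original source): the
// `T1`/`T2` markers in the doc comments below appear to grade translation
// cost per instruction set, with `T1` meaning the operation maps to a single
// native instruction and `T2` meaning a short multi-instruction sequence.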
"aarch64"))] if matches_isa!(Self, NEON) { return self.v128_store_unaligned(addr, a); } #[cfg(target_arch = "wasm32")] if matches_isa!(Self, WASM128) { return self.v128_store_unaligned(addr, a); } { let _ = (addr, a); unreachable!() } } /// T1: SSE2, NEON, WASM128 #[inline(always)] unsafe fn v128_store_unaligned(self, addr: *mut u8, a: V128) { if cfg!(miri) { return addr.cast::().write_unaligned(a); } #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] if matches_isa!(Self, SSE2) { return _mm_storeu_si128(addr.cast(), t(a)); } #[cfg(any(all(feature = "unstable", target_arch = "arm"), target_arch = "aarch64"))] if matches_isa!(Self, NEON) { return vst1q_u8(addr.cast(), t(a)); } #[cfg(target_arch = "wasm32")] if matches_isa!(Self, WASM128) { return v128_store(addr.cast(), t(a)); } { let _ = (addr, a); unreachable!() } } /// T1: SSE2, NEON, WASM128 #[inline(always)] fn v128_create_zero(self) -> V128 { #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] if matches_isa!(Self, SSE2) { return unsafe { t(_mm_setzero_si128()) }; } #[cfg(any(all(feature = "unstable", target_arch = "arm"), target_arch = "aarch64"))] if matches_isa!(Self, NEON) { return unsafe { t(vdupq_n_u8(0)) }; } #[cfg(target_arch = "wasm32")] if matches_isa!(Self, WASM128) { return unsafe { t(u8x16_splat(0)) }; } { unreachable!() } } /// T1: NEON, WASM128 /// /// T2: SSE2 #[inline(always)] fn v128_not(self, a: V128) -> V128 { #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] if matches_isa!(Self, SSE2) { return unsafe { let a = t(a); t(_mm_xor_si128(a, _mm_cmpeq_epi8(a, a))) }; } #[cfg(any(all(feature = "unstable", target_arch = "arm"), target_arch = "aarch64"))] if matches_isa!(Self, NEON) { return unsafe { t(vmvnq_u8(t(a))) }; } #[cfg(target_arch = "wasm32")] if matches_isa!(Self, WASM128) { return unsafe { t(v128_not(t(a))) }; } { let _ = a; unreachable!() } } /// T1: SSE2, NEON, WASM128 #[inline(always)] fn v128_and(self, a: V128, b: V128) -> V128 { unified::and(self, a, b) } /// T1: SSE2, NEON, WASM128 #[inline(always)] fn v128_or(self, a: V128, b: V128) -> V128 { unified::or(self, a, b) } /// T1: SSE2, NEON, WASM128 #[inline(always)] fn v128_xor(self, a: V128, b: V128) -> V128 { unified::xor(self, a, b) } /// T1: SSE2, NEON, WASM128 #[inline(always)] fn v128_andnot(self, a: V128, b: V128) -> V128 { unified::andnot(self, a, b) } /// T1: SSE41, NEON-A64, WASM128 /// /// T2: NEON-A32 #[inline(always)] fn v128_all_zero(self, a: V128) -> bool { #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] if matches_isa!(Self, SSE41) { return unsafe { let a = t(a); _mm_testz_si128(a, a) != 0 }; } #[cfg(all(feature = "unstable", target_arch = "arm"))] if matches_isa!(Self, NEON) { return unsafe { let a1: uint32x2x2_t = t(a); let a2: uint32x2_t = vorr_u32(a1.0, a1.1); vget_lane_u32::<0>(vpmax_u32(a2, a2)) == 0 }; } #[cfg(target_arch = "aarch64")] if matches_isa!(Self, NEON) { return unsafe { vmaxvq_u8(t(a)) == 0 }; } #[cfg(target_arch = "wasm32")] if matches_isa!(Self, WASM128) { return unsafe { !v128_any_true(t(a)) }; } { let _ = a; unreachable!() } } /// T1: SSE2, NEON, WASM128 #[inline(always)] fn u8x16_splat(self, x: u8) -> V128 { unified::splat(self, x) } /// T1: SSE2, NEON, WASM128 #[inline(always)] fn u16x8_splat(self, x: u16) -> V128 { unified::splat(self, x) } /// T1: SSE2, NEON, WASM128 #[inline(always)] fn u32x4_splat(self, x: u32) -> V128 { unified::splat(self, x) } /// T1: SSE2, NEON, WASM128 #[inline(always)] fn u64x2_splat(self, x: u64) -> V128 { unified::splat(self, x) } /// T1: SSE2, NEON, WASM128 
    /// T1: SSE2, NEON, WASM128
    #[inline(always)]
    fn i8x16_splat(self, x: i8) -> V128 {
        unified::splat(self, x)
    }

    /// T1: SSE2, NEON, WASM128
    #[inline(always)]
    fn i16x8_splat(self, x: i16) -> V128 {
        unified::splat(self, x)
    }

    /// T1: SSE2, NEON, WASM128
    #[inline(always)]
    fn i32x4_splat(self, x: i32) -> V128 {
        unified::splat(self, x)
    }

    /// T1: SSE2, NEON, WASM128
    #[inline(always)]
    fn i64x2_splat(self, x: i64) -> V128 {
        unified::splat(self, x)
    }

    /// T1: SSE2, NEON, WASM128
    #[inline(always)]
    fn u8x16_add(self, a: V128, b: V128) -> V128 {
        unified::add::<_, u8, _>(self, a, b)
    }

    /// T1: SSE2, NEON, WASM128
    #[inline(always)]
    fn u16x8_add(self, a: V128, b: V128) -> V128 {
        unified::add::<_, u16, _>(self, a, b)
    }

    /// T1: SSE2, NEON, WASM128
    #[inline(always)]
    fn u32x4_add(self, a: V128, b: V128) -> V128 {
        unified::add::<_, u32, _>(self, a, b)
    }

    /// T1: SSE2, NEON, WASM128
    #[inline(always)]
    fn u64x2_add(self, a: V128, b: V128) -> V128 {
        unified::add::<_, u64, _>(self, a, b)
    }

    /// T1: SSE2, NEON, WASM128
    #[inline(always)]
    fn u8x16_sub(self, a: V128, b: V128) -> V128 {
        unified::sub::<_, u8, _>(self, a, b)
    }

    /// T1: SSE2, NEON, WASM128
    #[inline(always)]
    fn u16x8_sub(self, a: V128, b: V128) -> V128 {
        unified::sub::<_, u16, _>(self, a, b)
    }

    /// T1: SSE2, NEON, WASM128
    #[inline(always)]
    fn u32x4_sub(self, a: V128, b: V128) -> V128 {
        unified::sub::<_, u32, _>(self, a, b)
    }

    /// T1: SSE2, NEON, WASM128
    #[inline(always)]
    fn u64x2_sub(self, a: V128, b: V128) -> V128 {
        unified::sub::<_, u64, _>(self, a, b)
    }

    /// T1: SSE2, NEON, WASM128
    #[inline(always)]
    fn u8x16_sub_sat(self, a: V128, b: V128) -> V128 {
        unified::sub_sat::<_, u8, _>(self, a, b)
    }

    /// T1: SSE2, NEON, WASM128
    #[inline(always)]
    fn u16x8_sub_sat(self, a: V128, b: V128) -> V128 {
        unified::sub_sat::<_, u16, _>(self, a, b)
    }

    /// T1: SSE2, NEON, WASM128
    #[inline(always)]
    fn i8x16_sub_sat(self, a: V128, b: V128) -> V128 {
        unified::sub_sat::<_, i8, _>(self, a, b)
    }

    /// T1: SSE2, NEON, WASM128
    #[inline(always)]
    fn i16x8_sub_sat(self, a: V128, b: V128) -> V128 {
        unified::sub_sat::<_, i16, _>(self, a, b)
    }

    /// T1: SSE2, NEON, WASM128
    #[inline(always)]
    fn i16x8_mul_lo(self, a: V128, b: V128) -> V128 {
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        if matches_isa!(Self, SSE2) {
            return unsafe { t(_mm_mullo_epi16(t(a), t(b))) };
        }
        #[cfg(any(all(feature = "unstable", target_arch = "arm"), target_arch = "aarch64"))]
        if matches_isa!(Self, NEON) {
            return unsafe { t(vmulq_s16(t(a), t(b))) };
        }
        #[cfg(target_arch = "wasm32")]
        if matches_isa!(Self, WASM128) {
            return unsafe { t(i16x8_mul(t(a), t(b))) };
        }
        {
            let _ = (a, b);
            unreachable!()
        }
    }

    /// T1: SSE41, NEON, WASM128
    #[inline(always)]
    fn i32x4_mul_lo(self, a: V128, b: V128) -> V128 {
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        if matches_isa!(Self, SSE41) {
            return unsafe { t(_mm_mullo_epi32(t(a), t(b))) };
        }
        #[cfg(any(all(feature = "unstable", target_arch = "arm"), target_arch = "aarch64"))]
        if matches_isa!(Self, NEON) {
            return unsafe { t(vmulq_s32(t(a), t(b))) };
        }
        #[cfg(target_arch = "wasm32")]
        if matches_isa!(Self, WASM128) {
            return unsafe { t(i32x4_mul(t(a), t(b))) };
        }
        {
            let _ = (a, b);
            unreachable!()
        }
    }

    /// T1: SSE2, NEON, WASM128
    #[inline(always)]
    fn u16x8_shl<const IMM8: i32>(self, a: V128) -> V128 {
        if cfg!(miri) {
            return crate::simulation::u16x8_shl(a, IMM8 as u8);
        }
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        if matches_isa!(Self, SSE2) {
            return unsafe { t(_mm_slli_epi16::<IMM8>(t(a))) };
        }
        #[cfg(any(all(feature = "unstable", target_arch = "arm"), target_arch = "aarch64"))]
        if matches_isa!(Self, NEON) {
            return unsafe { t(vshlq_n_u16::<IMM8>(t(a))) };
        }
        #[cfg(target_arch = "wasm32")]
        if matches_isa!(Self, WASM128) {
            return unsafe { t(u16x8_shl(t(a), IMM8 as u32)) };
        }
        {
            let _ = a;
            unreachable!()
        }
    }
    /// T1: SSE2, NEON, WASM128
    #[inline(always)]
    fn u32x4_shl<const IMM8: i32>(self, a: V128) -> V128 {
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        if matches_isa!(Self, SSE2) {
            return unsafe { t(_mm_slli_epi32::<IMM8>(t(a))) };
        }
        #[cfg(any(all(feature = "unstable", target_arch = "arm"), target_arch = "aarch64"))]
        if matches_isa!(Self, NEON) {
            return unsafe { t(vshlq_n_u32::<IMM8>(t(a))) };
        }
        #[cfg(target_arch = "wasm32")]
        if matches_isa!(Self, WASM128) {
            return unsafe { t(u32x4_shl(t(a), IMM8 as u32)) };
        }
        {
            let _ = a;
            unreachable!()
        }
    }

    /// T1: SSE2, NEON, WASM128
    #[inline(always)]
    fn u16x8_shr<const IMM8: i32>(self, a: V128) -> V128 {
        if cfg!(miri) {
            return crate::simulation::u16x8_shr(a, IMM8 as u8);
        }
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        if matches_isa!(Self, SSE2) {
            return unsafe { t(_mm_srli_epi16::<IMM8>(t(a))) };
        }
        #[cfg(any(all(feature = "unstable", target_arch = "arm"), target_arch = "aarch64"))]
        if matches_isa!(Self, NEON) {
            return unsafe { t(vshrq_n_u16::<IMM8>(t(a))) };
        }
        #[cfg(target_arch = "wasm32")]
        if matches_isa!(Self, WASM128) {
            return unsafe { t(u16x8_shr(t(a), IMM8 as u32)) };
        }
        {
            let _ = a;
            unreachable!()
        }
    }

    /// T1: SSE2, NEON, WASM128
    #[inline(always)]
    fn u32x4_shr<const IMM8: i32>(self, a: V128) -> V128 {
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        if matches_isa!(Self, SSE2) {
            return unsafe { t(_mm_srli_epi32::<IMM8>(t(a))) };
        }
        #[cfg(any(all(feature = "unstable", target_arch = "arm"), target_arch = "aarch64"))]
        if matches_isa!(Self, NEON) {
            return unsafe { t(vshrq_n_u32::<IMM8>(t(a))) };
        }
        #[cfg(target_arch = "wasm32")]
        if matches_isa!(Self, WASM128) {
            return unsafe { t(u32x4_shr(t(a), IMM8 as u32)) };
        }
        {
            let _ = a;
            unreachable!()
        }
    }

    /// T1: SSE2, NEON, WASM128
    #[inline(always)]
    fn u8x16_eq(self, a: V128, b: V128) -> V128 {
        unified::eq::<_, u8, _>(self, a, b)
    }

    /// T1: SSE2, NEON, WASM128
    #[inline(always)]
    fn u16x8_eq(self, a: V128, b: V128) -> V128 {
        unified::eq::<_, u16, _>(self, a, b)
    }

    /// T1: SSE2, NEON, WASM128
    #[inline(always)]
    fn u32x4_eq(self, a: V128, b: V128) -> V128 {
        unified::eq::<_, u32, _>(self, a, b)
    }

    /// T1: NEON, WASM128
    ///
    /// T2: SSE2
    #[inline(always)]
    fn u8x16_lt(self, a: V128, b: V128) -> V128 {
        unified::lt::<_, u8, _>(self, a, b)
    }

    /// T1: NEON, WASM128
    ///
    /// T2: SSE2
    #[inline(always)]
    fn u16x8_lt(self, a: V128, b: V128) -> V128 {
        unified::lt::<_, u16, _>(self, a, b)
    }

    /// T1: NEON, WASM128
    ///
    /// T2: SSE2
    #[inline(always)]
    fn u32x4_lt(self, a: V128, b: V128) -> V128 {
        unified::lt::<_, u32, _>(self, a, b)
    }

    /// T1: SSE2, NEON, WASM128
    #[inline(always)]
    fn i8x16_lt(self, a: V128, b: V128) -> V128 {
        unified::lt::<_, i8, _>(self, a, b)
    }

    /// T1: SSE2, NEON, WASM128
    #[inline(always)]
    fn i16x8_lt(self, a: V128, b: V128) -> V128 {
        unified::lt::<_, i16, _>(self, a, b)
    }

    /// T1: SSE2, NEON, WASM128
    #[inline(always)]
    fn i32x4_lt(self, a: V128, b: V128) -> V128 {
        unified::lt::<_, i32, _>(self, a, b)
    }

    /// T1: SSE2, NEON, WASM128
    #[inline(always)]
    fn u8x16_max(self, a: V128, b: V128) -> V128 {
        unified::max::<_, u8, _>(self, a, b)
    }

    /// T1: SSE41, NEON, WASM128
    #[inline(always)]
    fn u16x8_max(self, a: V128, b: V128) -> V128 {
        unified::max::<_, u16, _>(self, a, b)
    }

    /// T1: SSE41, NEON, WASM128
    #[inline(always)]
    fn u32x4_max(self, a: V128, b: V128) -> V128 {
        unified::max::<_, u32, _>(self, a, b)
    }

    /// T1: SSE41, NEON, WASM128
    #[inline(always)]
    fn i8x16_max(self, a: V128, b: V128) -> V128 {
        unified::max::<_, i8, _>(self, a, b)
    }
_>(self, a, b) } /// T1: SSE2, NEON, WASM128 #[inline(always)] fn i16x8_max(self, a: V128, b: V128) -> V128 { unified::max::<_, i16, _>(self, a, b) } /// T1: SSE41, NEON, WASM128 #[inline(always)] fn i32x4_max(self, a: V128, b: V128) -> V128 { unified::max::<_, i32, _>(self, a, b) } /// T1: SSE2, NEON, WASM128 #[inline(always)] fn u8x16_min(self, a: V128, b: V128) -> V128 { unified::min::<_, u8, _>(self, a, b) } /// T1: SSE41, NEON, WASM128 #[inline(always)] fn u16x8_min(self, a: V128, b: V128) -> V128 { unified::min::<_, u16, _>(self, a, b) } /// T1: SSE41, NEON, WASM128 #[inline(always)] fn u32x4_min(self, a: V128, b: V128) -> V128 { unified::min::<_, u32, _>(self, a, b) } /// T1: SSE41, NEON, WASM128 #[inline(always)] fn i8x16_min(self, a: V128, b: V128) -> V128 { unified::min::<_, i8, _>(self, a, b) } /// T1: SSE2, NEON, WASM128 #[inline(always)] fn i16x8_min(self, a: V128, b: V128) -> V128 { unified::min::<_, i16, _>(self, a, b) } /// T1: SSE41, NEON, WASM128 #[inline(always)] fn i32x4_min(self, a: V128, b: V128) -> V128 { unified::min::<_, i32, _>(self, a, b) } /// T1: SSSE3, NEON-A64, WASM128 /// /// T2: NEON-A32 #[inline(always)] fn u8x16_swizzle(self, a: V128, b: V128) -> V128 { #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] if matches_isa!(Self, SSSE3) { return unsafe { t(_mm_shuffle_epi8(t(a), t(b))) }; } #[cfg(all(feature = "unstable", target_arch = "arm"))] if matches_isa!(Self, NEON) { return unsafe { let (a, b) = (t(a), t(b)); let a = uint8x8x2_t(vget_low_u8(a), vget_high_u8(a)); let b = (vget_low_u8(b), vget_high_u8(b)); let c = (vtbl2_u8(a, b.0), vtbl2_u8(a, b.1)); t([c.0, c.1]) }; } #[cfg(target_arch = "aarch64")] if matches_isa!(Self, NEON) { return unsafe { t(vqtbl1q_u8(t(a), t(b))) }; } #[cfg(target_arch = "wasm32")] if matches_isa!(Self, WASM128) { return unsafe { t(u8x16_swizzle(t(a), t(b))) }; } { let _ = (a, b); unreachable!() } } /// T1: SSE41, NEON, WASM128 #[inline(always)] fn u16x8_bswap(self, a: V128) -> V128 { if matches_isa!(Self, SSE41 | WASM128) { return self.u8x16_swizzle(a, crate::bswap::SHUFFLE_U16X8); } #[cfg(any(all(feature = "unstable", target_arch = "arm"), target_arch = "aarch64"))] if matches_isa!(Self, NEON) { return unsafe { t(vrev16q_u8(t(a))) }; } { let _ = a; unreachable!() } } /// T1: SSE41, NEON, WASM128 #[inline(always)] fn u32x4_bswap(self, a: V128) -> V128 { if matches_isa!(Self, SSE41 | WASM128) { return self.u8x16_swizzle(a, crate::bswap::SHUFFLE_U32X4); } #[cfg(any(all(feature = "unstable", target_arch = "arm"), target_arch = "aarch64"))] if matches_isa!(Self, NEON) { return unsafe { t(vrev32q_u8(t(a))) }; } { let _ = a; unreachable!() } } /// T1: SSE41, NEON, WASM128 #[inline(always)] fn u64x2_bswap(self, a: V128) -> V128 { if matches_isa!(Self, SSE41 | WASM128) { return self.u8x16_swizzle(a, crate::bswap::SHUFFLE_U64X2); } #[cfg(any(all(feature = "unstable", target_arch = "arm"), target_arch = "aarch64"))] if matches_isa!(Self, NEON) { return unsafe { t(vrev64q_u8(t(a))) }; } { let _ = a; unreachable!() } } /// T1: NEON-A64, WASM128 /// /// T2: SSE2, NEON-A32 #[inline(always)] fn u8x16_any_zero(self, a: V128) -> bool { #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] if matches_isa!(Self, SSE2) { let is_zero = self.u8x16_eq(a, self.v128_create_zero()); return self.u8x16_bitmask(is_zero) != 0; } #[cfg(all(feature = "unstable", target_arch = "arm"))] if matches_isa!(Self, NEON) { return unsafe { let a: uint8x8x2_t = t(a); let a = vpmin_u8(a.0, a.1); let m: u64 = t(vtst_u8(a, a)); m != u64::MAX }; } #[cfg(target_arch 
= "aarch64")] if matches_isa!(Self, NEON) { return unsafe { vminvq_u8(t(a)) == 0 }; } #[cfg(target_arch = "wasm32")] if matches_isa!(Self, WASM128) { return unsafe { !u8x16_all_true(t(a)) }; } { let _ = a; unreachable!() } } /// T1: SSE2, WASM128 #[inline(always)] fn u8x16_bitmask(self, a: V128) -> u16 { if cfg!(miri) { return crate::simulation::u8x16_bitmask(a); } #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] if matches_isa!(Self, SSE2) { return unsafe { _mm_movemask_epi8(t(a)) as u16 }; } #[cfg(any(all(feature = "unstable", target_arch = "arm"), target_arch = "aarch64"))] if matches_isa!(Self, NEON) { unimplemented!() } #[cfg(target_arch = "wasm32")] if matches_isa!(Self, WASM128) { return unsafe { u8x16_bitmask(t(a)) }; } { let _ = a; unreachable!() } } /// T1: NEON-A64 #[inline(always)] fn u8x16_reduce_max(self, a: V128) -> u8 { if matches_isa!(Self, SSE41 | WASM128) { unimplemented!() } #[cfg(all(feature = "unstable", target_arch = "arm"))] if matches_isa!(Self, NEON) { unimplemented!() } #[cfg(target_arch = "aarch64")] if matches_isa!(Self, NEON) { return unsafe { vmaxvq_u8(t(a)) }; } { let _ = a; unreachable!() } } /// T1: NEON-A64 #[inline(always)] fn u8x16_reduce_min(self, a: V128) -> u8 { if matches_isa!(Self, SSE41 | WASM128) { unimplemented!() } #[cfg(all(feature = "unstable", target_arch = "arm"))] if matches_isa!(Self, NEON) { unimplemented!() } #[cfg(target_arch = "aarch64")] if matches_isa!(Self, NEON) { return unsafe { vminvq_u8(t(a)) }; } { let _ = a; unreachable!() } } /// T1: NEON /// /// T2: SSE2, WASM128 #[inline(always)] fn v128_bsl(self, a: V128, b: V128, c: V128) -> V128 { if matches_isa!(Self, SSE2 | WASM128) { return self.v128_xor(self.v128_and(self.v128_xor(b, c), a), c); } #[cfg(any(all(feature = "unstable", target_arch = "arm"), target_arch = "aarch64"))] if matches_isa!(Self, NEON) { return unsafe { t(vbslq_u8(t(a), t(b), t(c))) }; } { let _ = (a, b, c); unreachable!() } } /// T1: SSE2, NEON, WASM128 #[inline(always)] fn u8x16_zip_lo(self, a: V128, b: V128) -> V128 { #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] if matches_isa!(Self, SSE2) { return unsafe { t(_mm_unpacklo_epi8(t(a), t(b))) }; } #[cfg(all(feature = "unstable", target_arch = "arm"))] if matches_isa!(Self, NEON) { return unsafe { t(vzipq_u8(t(a), t(b)).0) }; } #[cfg(target_arch = "aarch64")] if matches_isa!(Self, NEON) { return unsafe { t(vzip1q_u8(t(a), t(b))) }; } #[cfg(target_arch = "wasm32")] if matches_isa!(Self, WASM128) { let (a, b) = unsafe { (t(a), t(b)) }; let ans = u8x16_shuffle::<0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23>(a, b); return unsafe { t(ans) }; } { let _ = (a, b); unreachable!() } } /// T1: SSE2, NEON, WASM128 #[inline(always)] fn u8x16_zip_hi(self, a: V128, b: V128) -> V128 { #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] if matches_isa!(Self, SSE2) { return unsafe { t(_mm_unpackhi_epi8(t(a), t(b))) }; } #[cfg(all(feature = "unstable", target_arch = "arm"))] if matches_isa!(Self, NEON) { return unsafe { t(vzipq_u8(t(a), t(b)).1) }; } #[cfg(target_arch = "aarch64")] if matches_isa!(Self, NEON) { return unsafe { t(vzip2q_u8(t(a), t(b))) }; } #[cfg(target_arch = "wasm32")] if matches_isa!(Self, WASM128) { let (a, b) = unsafe { (t(a), t(b)) }; let ans = u8x16_shuffle::<8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31>(a, b); return unsafe { t(ans) }; } { let _ = (a, b); unreachable!() } } /// T1: SSE2, NEON, WASM128 #[inline(always)] fn u16x8_zip_lo(self, a: V128, b: V128) -> V128 { #[cfg(any(target_arch = "x86", 
target_arch = "x86_64"))] if matches_isa!(Self, SSE2) { return unsafe { t(_mm_unpacklo_epi16(t(a), t(b))) }; } #[cfg(all(feature = "unstable", target_arch = "arm"))] if matches_isa!(Self, NEON) { return unsafe { t(vzipq_u16(t(a), t(b)).0) }; } #[cfg(target_arch = "aarch64")] if matches_isa!(Self, NEON) { return unsafe { t(vzip1q_u16(t(a), t(b))) }; } #[cfg(target_arch = "wasm32")] if matches_isa!(Self, WASM128) { let (a, b) = unsafe { (t(a), t(b)) }; let ans = u16x8_shuffle::<0, 8, 1, 9, 2, 10, 3, 11>(a, b); return unsafe { t(ans) }; } { let _ = (a, b); unreachable!() } } /// T1: SSE2, NEON, WASM128 #[inline(always)] fn u16x8_zip_hi(self, a: V128, b: V128) -> V128 { #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] if matches_isa!(Self, SSE2) { return unsafe { t(_mm_unpackhi_epi16(t(a), t(b))) }; } #[cfg(all(feature = "unstable", target_arch = "arm"))] if matches_isa!(Self, NEON) { return unsafe { t(vzipq_u16(t(a), t(b)).1) }; } #[cfg(target_arch = "aarch64")] if matches_isa!(Self, NEON) { return unsafe { t(vzip2q_u16(t(a), t(b))) }; } #[cfg(target_arch = "wasm32")] if matches_isa!(Self, WASM128) { let (a, b) = unsafe { (t(a), t(b)) }; let ans = u16x8_shuffle::<4, 12, 5, 13, 6, 14, 7, 15>(a, b); return unsafe { t(ans) }; } { let _ = (a, b); unreachable!() } } /// T1: SSE2, NEON, WASM128 #[inline(always)] fn u32x4_zip_lo(self, a: V128, b: V128) -> V128 { #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] if matches_isa!(Self, SSE2) { return unsafe { t(_mm_unpacklo_epi32(t(a), t(b))) }; } #[cfg(all(feature = "unstable", target_arch = "arm"))] if matches_isa!(Self, NEON) { return unsafe { t(vzipq_u32(t(a), t(b)).0) }; } #[cfg(target_arch = "aarch64")] if matches_isa!(Self, NEON) { return unsafe { t(vzip1q_u32(t(a), t(b))) }; } #[cfg(target_arch = "wasm32")] if matches_isa!(Self, WASM128) { let (a, b) = unsafe { (t(a), t(b)) }; let ans = u32x4_shuffle::<0, 4, 1, 5>(a, b); return unsafe { t(ans) }; } { let _ = (a, b); unreachable!() } } /// T1: SSE2, NEON, WASM128 #[inline(always)] fn u32x4_zip_hi(self, a: V128, b: V128) -> V128 { #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] if matches_isa!(Self, SSE2) { return unsafe { t(_mm_unpackhi_epi32(t(a), t(b))) }; } #[cfg(all(feature = "unstable", target_arch = "arm"))] if matches_isa!(Self, NEON) { return unsafe { t(vzipq_u32(t(a), t(b)).1) }; } #[cfg(target_arch = "aarch64")] if matches_isa!(Self, NEON) { return unsafe { t(vzip2q_u32(t(a), t(b))) }; } #[cfg(target_arch = "wasm32")] if matches_isa!(Self, WASM128) { let (a, b) = unsafe { (t(a), t(b)) }; let ans = u32x4_shuffle::<2, 6, 3, 7>(a, b); return unsafe { t(ans) }; } { let _ = (a, b); unreachable!() } } /// T1: SSE2, NEON, WASM128 #[inline(always)] fn u64x2_zip_lo(self, a: V128, b: V128) -> V128 { #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] if matches_isa!(Self, SSE2) { return unsafe { t(_mm_unpacklo_epi64(t(a), t(b))) }; } #[cfg(any(all(feature = "unstable", target_arch = "arm"), target_arch = "aarch64"))] if matches_isa!(Self, NEON) { return unsafe { let (a, b): ([u64; 2], [u64; 2]) = (t(a), t(b)); t([a[0], b[0]]) }; } #[cfg(target_arch = "wasm32")] if matches_isa!(Self, WASM128) { let (a, b) = unsafe { (t(a), t(b)) }; let ans = u64x2_shuffle::<0, 2>(a, b); return unsafe { t(ans) }; } { let _ = (a, b); unreachable!() } } /// T1: SSE2, NEON, WASM128 #[inline(always)] fn u64x2_zip_hi(self, a: V128, b: V128) -> V128 { #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] if matches_isa!(Self, SSE2) { return unsafe { t(_mm_unpackhi_epi64(t(a), t(b))) }; } 
#[cfg(any(all(feature = "unstable", target_arch = "arm"), target_arch = "aarch64"))] if matches_isa!(Self, NEON) { return unsafe { let (a, b): ([u64; 2], [u64; 2]) = (t(a), t(b)); t([a[1], b[1]]) }; } #[cfg(target_arch = "wasm32")] if matches_isa!(Self, WASM128) { let (a, b) = unsafe { (t(a), t(b)) }; let ans = u64x2_shuffle::<1, 3>(a, b); return unsafe { t(ans) }; } { let _ = (a, b); unreachable!() } } /// T1: NEON, WASM128 #[inline(always)] fn u8x16_unzip_even(self, a: V128, b: V128) -> V128 { #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] if matches_isa!(Self, SSE2) { unimplemented!() } #[cfg(all(feature = "unstable", target_arch = "arm"))] if matches_isa!(Self, NEON) { return unsafe { t(vuzpq_u8(t(a), t(b)).0) }; } #[cfg(target_arch = "aarch64")] if matches_isa!(Self, NEON) { return unsafe { t(vuzp1q_u8(t(a), t(b))) }; } #[cfg(target_arch = "wasm32")] if matches_isa!(Self, WASM128) { let (a, b) = unsafe { (t(a), t(b)) }; let ans = u8x16_shuffle::<0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30>(a, b); return unsafe { t(ans) }; } { let _ = (a, b); unreachable!() } } /// T1: NEON, WASM128 #[inline(always)] fn u8x16_unzip_odd(self, a: V128, b: V128) -> V128 { #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] if matches_isa!(Self, SSE2) { unimplemented!() } #[cfg(all(feature = "unstable", target_arch = "arm"))] if matches_isa!(Self, NEON) { return unsafe { t(vuzpq_u8(t(a), t(b)).1) }; } #[cfg(target_arch = "aarch64")] if matches_isa!(Self, NEON) { return unsafe { t(vuzp2q_u8(t(a), t(b))) }; } #[cfg(target_arch = "wasm32")] if matches_isa!(Self, WASM128) { let (a, b) = unsafe { (t(a), t(b)) }; let ans = u8x16_shuffle::<1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31>(a, b); return unsafe { t(ans) }; } { let _ = (a, b); unreachable!() } } /// T1: SSE2 #[inline(always)] fn u16x8_mul_hi(self, a: V128, b: V128) -> V128 { #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] if matches_isa!(Self, SSE2) { return unsafe { t(_mm_mulhi_epu16(t(a), t(b))) }; } if matches_isa!(Self, NEON | WASM128) { unimplemented!() } { let _ = (a, b); unreachable!() } } /// T1: SSE2 #[inline(always)] fn i16x8_mul_hi(self, a: V128, b: V128) -> V128 { #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] if matches_isa!(Self, SSE2) { return unsafe { t(_mm_mulhi_epi16(t(a), t(b))) }; } if matches_isa!(Self, NEON | WASM128) { unimplemented!() } { let _ = (a, b); unreachable!() } } /// T1: SSSE3 #[inline(always)] fn i16x8_maddubs(self, a: V128, b: V128) -> V128 { #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] if matches_isa!(Self, SSSE3) { return unsafe { t(_mm_maddubs_epi16(t(a), t(b))) }; } if matches_isa!(Self, NEON | WASM128) { unimplemented!() } { let _ = (a, b); unreachable!() } } /// T1: SSE41 #[inline(always)] fn u16x8_blend(self, a: V128, b: V128) -> V128 { #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] if matches_isa!(Self, SSE41) { return unsafe { t(_mm_blend_epi16::(t(a), t(b))) }; } if matches_isa!(Self, NEON | WASM128) { unimplemented!() } { let _ = (a, b); unreachable!() } } /// if highbit(c) { b } else { a } /// /// T1: SSE41 #[inline(always)] fn u8x16_blendv(self, a: V128, b: V128, c: V128) -> V128 { #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] if matches_isa!(Self, SSE41) { return unsafe { t(_mm_blendv_epi8(t(a), t(b), t(c))) }; } if matches_isa!(Self, NEON | WASM128) { unimplemented!() } { let _ = (a, b, c); unreachable!() } } /// T1: SSE2 #[inline(always)] fn i16x8_madd(self, a: V128, b: V128) -> V128 { #[cfg(any(target_arch = 
"x86", target_arch = "x86_64"))] if matches_isa!(Self, SSE2) { return unsafe { t(_mm_madd_epi16(t(a), t(b))) }; } if matches_isa!(Self, NEON | WASM128) { unimplemented!() } { let _ = (a, b); unreachable!() } } /// T1: SSE2, NEON, WASM128 #[inline(always)] fn u8x16_avgr(self, a: V128, b: V128) -> V128 { #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] if matches_isa!(Self, SSE2) { return unsafe { t(_mm_avg_epu8(t(a), t(b))) }; } #[cfg(any(all(feature = "unstable", target_arch = "arm"), target_arch = "aarch64"))] if matches_isa!(Self, NEON) { return unsafe { t(vrhaddq_u8(t(a), t(b))) }; } #[cfg(target_arch = "wasm32")] if matches_isa!(Self, WASM128) { return unsafe { t(u8x16_avgr(t(a), t(b))) }; } { let _ = (a, b); unreachable!() } } /// T1: SSE2, NEON, WASM128 #[inline(always)] fn i8x16_add_sat(self, a: V128, b: V128) -> V128 { unified::add_sat::<_, i8, _>(self, a, b) } /// T1: SSE2, NEON, WASM128 #[inline(always)] fn u8x16_add_sat(self, a: V128, b: V128) -> V128 { unified::add_sat::<_, u8, _>(self, a, b) } /// T1: SSE2 #[inline(always)] fn i16x8_packus(self, a: V128, b: V128) -> V128 { if cfg!(miri) { return crate::simulation::i16x8_packus(a, b); } #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] if matches_isa!(Self, SSE2) { return unsafe { t(_mm_packus_epi16(t(a), t(b))) }; } { let _ = (a, b); unreachable!() } } } vsimd-0.8.0/src/simd256.rs000064400000000000000000000632301046102023000133050ustar 00000000000000use crate::isa::{AVX2, NEON, SSE2, WASM128}; use crate::vector::{V128, V256}; use crate::{unified, SIMD128}; #[cfg(any( any(target_arch = "x86", target_arch = "x86_64"), any(all(feature = "unstable", target_arch = "arm"), target_arch = "aarch64"), target_arch = "wasm32" ))] use core::mem::transmute as t; #[cfg(target_arch = "x86")] use core::arch::x86::*; #[cfg(target_arch = "x86_64")] use core::arch::x86_64::*; #[cfg(all(feature = "unstable", target_arch = "arm"))] use core::arch::arm::*; #[cfg(target_arch = "aarch64")] use core::arch::aarch64::*; #[cfg(target_arch = "wasm32")] use core::arch::wasm32::*; #[macro_export] macro_rules! 
simd256_vop { ($s:expr, $f:expr, $a:expr) => {{ let s = $s; let f = $f; let a = $a.to_v128x2(); let b = (f(s, a.0), f(s, a.1)); V256::from_v128x2(b) }}; ($s:expr, $f:expr, $a:expr, $b:expr) => {{ let s = $s; let f = $f; let a = $a.to_v128x2(); let b = $b.to_v128x2(); let c = (f(s, a.0, b.0), f(s, a.1, b.1)); V256::from_v128x2(c) }}; ($s:expr, $f:expr, $a:expr, $b:expr, $c:expr) => {{ let s = $s; let f = $f; let a = $a.to_v128x2(); let b = $b.to_v128x2(); let c = $c.to_v128x2(); let d = (f(s, a.0, b.0, c.0), f(s, a.1, b.1, c.1)); V256::from_v128x2(d) }}; } pub unsafe trait SIMD256: SIMD128 { #[inline(always)] unsafe fn v256_load(self, addr: *const u8) -> V256 { debug_assert_ptr_align!(addr, 32); #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] if matches_isa!(Self, AVX2) { return t(_mm256_load_si256(addr.cast())); } #[cfg(any(all(feature = "unstable", target_arch = "arm"), target_arch = "aarch64"))] if matches_isa!(Self, NEON) { return t(vld1q_u8_x2(addr.cast())); } { let x0 = self.v128_load(addr); let x1 = self.v128_load(addr.add(16)); V256::from_v128x2((x0, x1)) } } #[inline(always)] unsafe fn v256_load_unaligned(self, addr: *const u8) -> V256 { #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] if matches_isa!(Self, AVX2) { return t(_mm256_loadu_si256(addr.cast())); } #[cfg(any(all(feature = "unstable", target_arch = "arm"), target_arch = "aarch64"))] if matches_isa!(Self, NEON) { return t(vld1q_u8_x2(addr.cast())); } { let x0 = self.v128_load_unaligned(addr); let x1 = self.v128_load_unaligned(addr.add(16)); V256::from_v128x2((x0, x1)) } } #[inline(always)] unsafe fn v256_store(self, addr: *mut u8, a: V256) { debug_assert_ptr_align!(addr, 32); #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] if matches_isa!(Self, AVX2) { return _mm256_store_si256(addr.cast(), t(a)); } #[cfg(any(all(feature = "unstable", target_arch = "arm"), target_arch = "aarch64"))] if matches_isa!(Self, NEON) { return vst1q_u8_x2(addr.cast(), t(a)); } { let a = a.to_v128x2(); self.v128_store(addr, a.0); self.v128_store(addr.add(16), a.1); } } #[inline(always)] unsafe fn v256_store_unaligned(self, addr: *mut u8, a: V256) { #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] if matches_isa!(Self, AVX2) { return _mm256_storeu_si256(addr.cast(), t(a)); } #[cfg(any(all(feature = "unstable", target_arch = "arm"), target_arch = "aarch64"))] if matches_isa!(Self, NEON) { return vst1q_u8_x2(addr.cast(), t(a)); } { let a = a.to_v128x2(); self.v128_store_unaligned(addr, a.0); self.v128_store_unaligned(addr.add(16), a.1); } } #[inline(always)] fn v256_create_zero(self) -> V256 { #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] if matches_isa!(Self, AVX2) { return unsafe { t(_mm256_setzero_si256()) }; } { self.v128_create_zero().x2() } } #[inline(always)] fn v256_not(self, a: V256) -> V256 { if matches_isa!(Self, AVX2) { return self.v256_xor(a, self.u8x32_eq(a, a)); } { simd256_vop!(self, Self::v128_not, a) } } #[inline(always)] fn v256_and(self, a: V256, b: V256) -> V256 { unified::and(self, a, b) } #[inline(always)] fn v256_or(self, a: V256, b: V256) -> V256 { unified::or(self, a, b) } #[inline(always)] fn v256_xor(self, a: V256, b: V256) -> V256 { unified::xor(self, a, b) } #[inline(always)] fn v256_andnot(self, a: V256, b: V256) -> V256 { #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] if matches_isa!(Self, AVX2) { return unsafe { t(_mm256_andnot_si256(t(b), t(a))) }; } { simd256_vop!(self, Self::v128_andnot, a, b) } } #[inline(always)] fn v256_all_zero(self, a: V256) -> bool { 
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] if matches_isa!(Self, AVX2) { return unsafe { let a = t(a); _mm256_testz_si256(a, a) != 0 }; } { let a = a.to_v128x2(); self.v128_all_zero(self.v128_or(a.0, a.1)) } } #[inline(always)] fn u8x32_splat(self, x: u8) -> V256 { unified::splat(self, x) } #[inline(always)] fn u16x16_splat(self, x: u16) -> V256 { unified::splat(self, x) } #[inline(always)] fn u32x8_splat(self, x: u32) -> V256 { unified::splat(self, x) } #[inline(always)] fn u64x4_splat(self, x: u64) -> V256 { unified::splat(self, x) } #[inline(always)] fn i8x32_splat(self, x: i8) -> V256 { unified::splat(self, x) } #[inline(always)] fn i16x16_splat(self, x: i16) -> V256 { unified::splat(self, x) } #[inline(always)] fn i32x8_splat(self, x: i32) -> V256 { unified::splat(self, x) } #[inline(always)] fn i64x4_splat(self, x: i64) -> V256 { unified::splat(self, x) } #[inline(always)] fn u8x32_add(self, a: V256, b: V256) -> V256 { unified::add::<_, u8, _>(self, a, b) } #[inline(always)] fn u16x16_add(self, a: V256, b: V256) -> V256 { unified::add::<_, u16, _>(self, a, b) } #[inline(always)] fn u32x8_add(self, a: V256, b: V256) -> V256 { unified::add::<_, u32, _>(self, a, b) } #[inline(always)] fn u64x4_add(self, a: V256, b: V256) -> V256 { unified::add::<_, u64, _>(self, a, b) } #[inline(always)] fn u8x32_sub(self, a: V256, b: V256) -> V256 { unified::sub::<_, u8, _>(self, a, b) } #[inline(always)] fn u16x16_sub(self, a: V256, b: V256) -> V256 { unified::sub::<_, u16, _>(self, a, b) } #[inline(always)] fn u32x8_sub(self, a: V256, b: V256) -> V256 { unified::sub::<_, u32, _>(self, a, b) } #[inline(always)] fn u64x4_sub(self, a: V256, b: V256) -> V256 { unified::sub::<_, u64, _>(self, a, b) } #[inline(always)] fn u8x32_sub_sat(self, a: V256, b: V256) -> V256 { unified::sub_sat::<_, u8, _>(self, a, b) } #[inline(always)] fn u16x16_sub_sat(self, a: V256, b: V256) -> V256 { unified::sub_sat::<_, u16, _>(self, a, b) } #[inline(always)] fn i8x32_sub_sat(self, a: V256, b: V256) -> V256 { unified::sub_sat::<_, i8, _>(self, a, b) } #[inline(always)] fn i16x16_sub_sat(self, a: V256, b: V256) -> V256 { unified::sub_sat::<_, i16, _>(self, a, b) } #[inline(always)] fn i16x16_mul_lo(self, a: V256, b: V256) -> V256 { #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] if matches_isa!(Self, AVX2) { return unsafe { t(_mm256_mullo_epi16(t(a), t(b))) }; } { simd256_vop!(self, Self::i16x8_mul_lo, a, b) } } #[inline(always)] fn i32x8_mul_lo(self, a: V256, b: V256) -> V256 { #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] if matches_isa!(Self, AVX2) { return unsafe { t(_mm256_mullo_epi32(t(a), t(b))) }; } { simd256_vop!(self, Self::i32x4_mul_lo, a, b) } } #[inline(always)] fn u16x16_shl(self, a: V256) -> V256 { #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] if matches_isa!(Self, AVX2) { return unsafe { t(_mm256_slli_epi16::(t(a))) }; } { simd256_vop!(self, Self::u16x8_shl::, a) } } #[inline(always)] fn u32x8_shl(self, a: V256) -> V256 { #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] if matches_isa!(Self, AVX2) { return unsafe { t(_mm256_slli_epi32::(t(a))) }; } { simd256_vop!(self, Self::u32x4_shl::, a) } } #[inline(always)] fn u16x16_shr(self, a: V256) -> V256 { #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] if matches_isa!(Self, AVX2) { return unsafe { t(_mm256_srli_epi16::(t(a))) }; } { simd256_vop!(self, Self::u16x8_shr::, a) } } #[inline(always)] fn u32x8_shr(self, a: V256) -> V256 { #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] if 
matches_isa!(Self, AVX2) { return unsafe { t(_mm256_srli_epi32::(t(a))) }; } { simd256_vop!(self, Self::u32x4_shr::, a) } } #[inline(always)] fn u8x32_eq(self, a: V256, b: V256) -> V256 { unified::eq::<_, u8, _>(self, a, b) } #[inline(always)] fn u16x16_eq(self, a: V256, b: V256) -> V256 { unified::eq::<_, u16, _>(self, a, b) } #[inline(always)] fn u32x8_eq(self, a: V256, b: V256) -> V256 { unified::eq::<_, u32, _>(self, a, b) } #[inline(always)] fn u8x32_lt(self, a: V256, b: V256) -> V256 { unified::lt::<_, u8, _>(self, a, b) } #[inline(always)] fn u16x16_lt(self, a: V256, b: V256) -> V256 { unified::lt::<_, u16, _>(self, a, b) } #[inline(always)] fn u32x8_lt(self, a: V256, b: V256) -> V256 { unified::lt::<_, u32, _>(self, a, b) } #[inline(always)] fn i8x32_lt(self, a: V256, b: V256) -> V256 { unified::lt::<_, i8, _>(self, a, b) } #[inline(always)] fn i16x16_lt(self, a: V256, b: V256) -> V256 { unified::lt::<_, i16, _>(self, a, b) } #[inline(always)] fn i32x8_lt(self, a: V256, b: V256) -> V256 { unified::lt::<_, i32, _>(self, a, b) } #[inline(always)] fn u8x32_max(self, a: V256, b: V256) -> V256 { unified::max::<_, u8, _>(self, a, b) } #[inline(always)] fn u16x16_max(self, a: V256, b: V256) -> V256 { unified::max::<_, u16, _>(self, a, b) } #[inline(always)] fn u32x8_max(self, a: V256, b: V256) -> V256 { unified::max::<_, u32, _>(self, a, b) } #[inline(always)] fn i8x32_max(self, a: V256, b: V256) -> V256 { unified::max::<_, i8, _>(self, a, b) } #[inline(always)] fn i16x16_max(self, a: V256, b: V256) -> V256 { unified::max::<_, i16, _>(self, a, b) } #[inline(always)] fn i32x8_max(self, a: V256, b: V256) -> V256 { unified::max::<_, i32, _>(self, a, b) } #[inline(always)] fn u8x32_min(self, a: V256, b: V256) -> V256 { unified::min::<_, u8, _>(self, a, b) } #[inline(always)] fn u16x16_min(self, a: V256, b: V256) -> V256 { unified::min::<_, u16, _>(self, a, b) } #[inline(always)] fn u32x8_min(self, a: V256, b: V256) -> V256 { unified::min::<_, u32, _>(self, a, b) } #[inline(always)] fn i8x32_min(self, a: V256, b: V256) -> V256 { unified::min::<_, i8, _>(self, a, b) } #[inline(always)] fn i16x16_min(self, a: V256, b: V256) -> V256 { unified::min::<_, i16, _>(self, a, b) } #[inline(always)] fn i32x8_min(self, a: V256, b: V256) -> V256 { unified::min::<_, i32, _>(self, a, b) } #[inline(always)] fn u8x16x2_swizzle(self, a: V256, b: V256) -> V256 { #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] if matches_isa!(Self, AVX2) { return unsafe { t(_mm256_shuffle_epi8(t(a), t(b))) }; } { simd256_vop!(self, Self::u8x16_swizzle, a, b) } } #[inline(always)] fn u16x16_bswap(self, a: V256) -> V256 { if matches_isa!(Self, AVX2) { return self.u8x16x2_swizzle(a, crate::bswap::SHUFFLE_U16X16); } { simd256_vop!(self, Self::u16x8_bswap, a) } } #[inline(always)] fn u32x8_bswap(self, a: V256) -> V256 { if matches_isa!(Self, AVX2) { return self.u8x16x2_swizzle(a, crate::bswap::SHUFFLE_U32X8); } { simd256_vop!(self, Self::u32x4_bswap, a) } } #[inline(always)] fn u64x4_bswap(self, a: V256) -> V256 { if matches_isa!(Self, AVX2) { return self.u8x16x2_swizzle(a, crate::bswap::SHUFFLE_U64X4); } { simd256_vop!(self, Self::u64x2_bswap, a) } } #[inline(always)] fn u8x32_swizzle(self, a: V256, b: V256) -> V256 { if matches_isa!(Self, SSE2 | WASM128) { let _ = (a, b); unimplemented!() } #[cfg(all(feature = "unstable", target_arch = "arm"))] if matches_isa!(Self, NEON) { let _ = (a, b); unimplemented!() } #[cfg(target_arch = "aarch64")] if matches_isa!(Self, NEON) { return unsafe { let (a, b): (uint8x16x2_t, 
uint8x16x2_t) = (t(a), t(b)); let c = (vqtbl2q_u8(a, b.0), vqtbl2q_u8(a, b.1)); t(uint8x16x2_t(c.0, c.1)) }; } { let _ = (a, b); unreachable!() } } #[inline(always)] fn u8x32_any_zero(self, a: V256) -> bool { if matches_isa!(Self, AVX2) { let is_zero = self.u8x32_eq(a, self.v256_create_zero()); return self.u8x32_bitmask(is_zero) != 0; } { let a = a.to_v128x2(); self.u8x16_any_zero(self.u8x16_min(a.0, a.1)) } } #[inline(always)] fn u8x32_bitmask(self, a: V256) -> u32 { #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] if matches_isa!(Self, AVX2) { return unsafe { _mm256_movemask_epi8(t(a)) as u32 }; } { let a = a.to_v128x2(); let m0 = self.u8x16_bitmask(a.0) as u32; let m1 = self.u8x16_bitmask(a.1) as u32; (m1 << 16) | m0 } } #[inline(always)] fn u8x32_reduce_max(self, a: V256) -> u8 { let a = a.to_v128x2(); self.u8x16_reduce_max(self.u8x16_max(a.0, a.1)) } #[inline(always)] fn u8x32_reduce_min(self, a: V256) -> u8 { let a = a.to_v128x2(); self.u8x16_reduce_min(self.u8x16_min(a.0, a.1)) } /// for each bit: if a == 1 { b } else { c } /// /// ans = ((b ^ c) & a) ^ c #[inline(always)] fn v256_bsl(self, a: V256, b: V256, c: V256) -> V256 { if matches_isa!(Self, NEON) { return simd256_vop!(self, Self::v128_bsl, a, b, c); } { self.v256_xor(self.v256_and(self.v256_xor(b, c), a), c) } } #[inline(always)] fn u16x16_from_u8x16(self, a: V128) -> V256 { #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] if matches_isa!(Self, AVX2) { return unsafe { t(_mm256_cvtepu8_epi16(t(a))) }; } #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] if matches_isa!(Self, SSE2) { let zero = self.v128_create_zero(); let lo = self.u8x16_zip_lo(a, zero); let hi = self.u8x16_zip_hi(a, zero); return V256::from_v128x2((lo, hi)); } #[cfg(any(all(feature = "unstable", target_arch = "arm"), target_arch = "aarch64"))] if matches_isa!(Self, NEON) { return unsafe { let a = t(a); let low = vmovl_u8(vget_low_u8(a)); let high = vmovl_u8(vget_high_u8(a)); t(uint16x8x2_t(low, high)) }; } #[cfg(target_arch = "wasm32")] if matches_isa!(Self, WASM128) { return unsafe { let a = t(a); let low = t(u16x8_extend_low_u8x16(a)); let high = t(u16x8_extend_high_u8x16(a)); V256::from_v128x2((low, high)) }; } { let _ = a; unreachable!() } } #[inline(always)] fn u8x16x2_zip_lo(self, a: V256, b: V256) -> V256 { #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] if matches_isa!(Self, AVX2) { return unsafe { t(_mm256_unpacklo_epi8(t(a), t(b))) }; } { simd256_vop!(self, Self::u8x16_zip_lo, a, b) } } #[inline(always)] fn u8x16x2_zip_hi(self, a: V256, b: V256) -> V256 { #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] if matches_isa!(Self, AVX2) { return unsafe { t(_mm256_unpackhi_epi8(t(a), t(b))) }; } { simd256_vop!(self, Self::u8x16_zip_hi, a, b) } } #[inline(always)] fn u16x8x2_zip_lo(self, a: V256, b: V256) -> V256 { #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] if matches_isa!(Self, AVX2) { return unsafe { t(_mm256_unpacklo_epi16(t(a), t(b))) }; } { simd256_vop!(self, Self::u16x8_zip_lo, a, b) } } #[inline(always)] fn u16x8x2_zip_hi(self, a: V256, b: V256) -> V256 { #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] if matches_isa!(Self, AVX2) { return unsafe { t(_mm256_unpackhi_epi16(t(a), t(b))) }; } { simd256_vop!(self, Self::u16x8_zip_hi, a, b) } } #[inline(always)] fn u32x4x2_zip_lo(self, a: V256, b: V256) -> V256 { #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] if matches_isa!(Self, AVX2) { return unsafe { t(_mm256_unpacklo_epi32(t(a), t(b))) }; } { simd256_vop!(self, 
                Self::u32x4_zip_lo, a, b) }
    }
    #[inline(always)]
    fn u32x4x2_zip_hi(self, a: V256, b: V256) -> V256 {
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        if matches_isa!(Self, AVX2) { return unsafe { t(_mm256_unpackhi_epi32(t(a), t(b))) }; }
        { simd256_vop!(self, Self::u32x4_zip_hi, a, b) }
    }
    #[inline(always)]
    fn u64x2x2_zip_lo(self, a: V256, b: V256) -> V256 {
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        if matches_isa!(Self, AVX2) { return unsafe { t(_mm256_unpacklo_epi64(t(a), t(b))) }; }
        { simd256_vop!(self, Self::u64x2_zip_lo, a, b) }
    }
    #[inline(always)]
    fn u64x2x2_zip_hi(self, a: V256, b: V256) -> V256 {
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        if matches_isa!(Self, AVX2) { return unsafe { t(_mm256_unpackhi_epi64(t(a), t(b))) }; }
        { simd256_vop!(self, Self::u64x2_zip_hi, a, b) }
    }
    #[inline(always)]
    fn v128x2_zip_lo(self, a: V256, b: V256) -> V256 {
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        if matches_isa!(Self, AVX2) { return unsafe { t(_mm256_permute2x128_si256::<0b0010_0000>(t(a), t(b))) }; }
        if matches_isa!(Self, SSE2 | NEON | WASM128) {
            let ((a, _), (c, _)) = (a.to_v128x2(), b.to_v128x2());
            return V256::from_v128x2((a, c));
        }
        { unreachable!() }
    }
    #[inline(always)]
    fn v128x2_zip_hi(self, a: V256, b: V256) -> V256 {
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        if matches_isa!(Self, AVX2) { return unsafe { t(_mm256_permute2x128_si256::<0b0011_0001>(t(a), t(b))) }; }
        if matches_isa!(Self, SSE2 | NEON | WASM128) {
            let ((_, b), (_, d)) = (a.to_v128x2(), b.to_v128x2());
            return V256::from_v128x2((b, d));
        }
        { unreachable!() }
    }
    #[inline(always)]
    fn u64x4_permute<const IMM8: i32>(self, a: V256) -> V256 {
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        if matches_isa!(Self, AVX2) { return unsafe { t(_mm256_permute4x64_epi64::<IMM8>(t(a))) }; }
        if matches_isa!(Self, SSE2 | NEON | WASM128) {
            let _ = a;
            unimplemented!()
        }
        { let _ = a; unreachable!() }
    }
    #[inline(always)]
    fn u8x32_unzip_even(self, a: V256, b: V256) -> V256 {
        if matches_isa!(Self, SSE2) { unimplemented!() }
        {
            let ((a, b), (c, d)) = (a.to_v128x2(), b.to_v128x2());
            let ab = self.u8x16_unzip_even(a, b);
            let cd = self.u8x16_unzip_even(c, d);
            V256::from_v128x2((ab, cd))
        }
    }
    #[inline(always)]
    fn u8x32_unzip_odd(self, a: V256, b: V256) -> V256 {
        if matches_isa!(Self, SSE2) { unimplemented!() }
        {
            let ((a, b), (c, d)) = (a.to_v128x2(), b.to_v128x2());
            let ab = self.u8x16_unzip_odd(a, b);
            let cd = self.u8x16_unzip_odd(c, d);
            V256::from_v128x2((ab, cd))
        }
    }
    #[inline(always)]
    fn u64x4_unzip_even(self, a: V256, b: V256) -> V256 {
        if matches_isa!(Self, AVX2) {
            let acbd = self.u64x2x2_zip_lo(a, b);
            let abcd = self.u64x4_permute::<0b_1101_1000>(acbd); // 0213
            return abcd;
        }
        {
            let ((a, b), (c, d)) = (a.to_v128x2(), b.to_v128x2());
            let ab = self.u64x2_zip_lo(a, b);
            let cd = self.u64x2_zip_lo(c, d);
            V256::from_v128x2((ab, cd))
        }
    }
    #[inline(always)]
    fn u64x4_unzip_odd(self, a: V256, b: V256) -> V256 {
        if matches_isa!(Self, AVX2) {
            let acbd = self.u64x2x2_zip_hi(a, b);
            let abcd = self.u64x4_permute::<0b_1101_1000>(acbd); // 0213
            return abcd;
        }
        {
            let ((a, b), (c, d)) = (a.to_v128x2(), b.to_v128x2());
            let ab = self.u64x2_zip_hi(a, b);
            let cd = self.u64x2_zip_hi(c, d);
            V256::from_v128x2((ab, cd))
        }
    }
    #[inline(always)]
    fn u16x16_mul_hi(self, a: V256, b: V256) -> V256 {
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        if matches_isa!(Self, AVX2) { return unsafe { t(_mm256_mulhi_epu16(t(a), t(b))) }; }
        { simd256_vop!(self, Self::u16x8_mul_hi, a, b) }
    }
    #[inline(always)]
    fn i16x16_mul_hi(self, a: V256, b: V256) -> V256 {
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        if matches_isa!(Self, AVX2) { return unsafe { t(_mm256_mulhi_epi16(t(a), t(b))) }; }
        { simd256_vop!(self, Self::i16x8_mul_hi, a, b) }
    }
    #[inline(always)]
    fn i16x16_maddubs(self, a: V256, b: V256) -> V256 {
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        if matches_isa!(Self, AVX2) { return unsafe { t(_mm256_maddubs_epi16(t(a), t(b))) }; }
        { simd256_vop!(self, Self::i16x8_maddubs, a, b) }
    }
    #[inline(always)]
    fn u32x8_blend<const IMM8: i32>(self, a: V256, b: V256) -> V256 {
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        if matches_isa!(Self, AVX2) { return unsafe { t(_mm256_blend_epi32::<IMM8>(t(a), t(b))) }; }
        if matches_isa!(Self, NEON | WASM128) { unimplemented!() }
        { let _ = (a, b); unreachable!() }
    }
    /// if highbit(c) { b } else { a }
    #[inline(always)]
    fn u8x32_blendv(self, a: V256, b: V256, c: V256) -> V256 {
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        if matches_isa!(Self, AVX2) { return unsafe { t(_mm256_blendv_epi8(t(a), t(b), t(c))) }; }
        if matches_isa!(Self, NEON | WASM128) { unimplemented!() }
        { simd256_vop!(self, Self::u8x16_blendv, a, b, c) }
    }
    #[inline(always)]
    fn i16x16_madd(self, a: V256, b: V256) -> V256 {
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        if matches_isa!(Self, AVX2) { return unsafe { t(_mm256_madd_epi16(t(a), t(b))) }; }
        { simd256_vop!(self, Self::i16x8_madd, a, b) }
    }
    #[inline(always)]
    fn u8x32_avgr(self, a: V256, b: V256) -> V256 {
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        if matches_isa!(Self, AVX2) { return unsafe { t(_mm256_avg_epu8(t(a), t(b))) }; }
        { simd256_vop!(self, Self::u8x16_avgr, a, b) }
    }
    #[inline(always)]
    fn i8x32_add_sat(self, a: V256, b: V256) -> V256 { unified::add_sat::<_, i8, _>(self, a, b) }
    #[inline(always)]
    fn u8x32_add_sat(self, a: V256, b: V256) -> V256 { unified::add_sat::<_, u8, _>(self, a, b) }
}
vsimd-0.8.0/src/simd64.rs000064400000000000000000000016661046102023000132270ustar 00000000000000
use crate::isa::InstructionSet;
use crate::vector::V64;

#[cfg(any(all(feature = "unstable", target_arch = "arm"), target_arch = "aarch64"))]
use crate::isa::NEON;
#[cfg(any(all(feature = "unstable", target_arch = "arm"), target_arch = "aarch64"))]
use core::mem::transmute as t;

#[cfg(all(feature = "unstable", target_arch = "arm"))]
use core::arch::arm::*;
#[cfg(target_arch = "aarch64")]
use core::arch::aarch64::*;

pub unsafe trait SIMD64: InstructionSet {
    #[inline(always)]
    #[must_use]
    fn u8x8_unzip_even(self, a: V64, b: V64) -> V64 {
        #[cfg(all(feature = "unstable", target_arch = "arm"))]
        if matches_isa!(Self, NEON) { return unsafe { t(vuzp_u8(t(a), t(b)).0) }; }
        #[cfg(target_arch = "aarch64")]
        if matches_isa!(Self, NEON) { return unsafe { t(vuzp1_u8(t(a), t(b))) }; }
        { let _ = (a, b); unreachable!() }
    }
}
vsimd-0.8.0/src/simulation.rs000064400000000000000000000035031046102023000142750ustar 00000000000000
use crate::vector::V128;
use core::mem::transmute as t;

#[cfg(miri)]
use core::cmp::{max, min};

// TODO: waiting for MIRI's support
#[cfg(miri)]
#[inline(always)]
pub fn u8x16_max(a: V128, b: V128) -> V128 {
    let (a, b) = (a.as_bytes(), b.as_bytes());
    let mut c = [0; 16];
    for i in 0..16 {
        c[i] = max(a[i], b[i]);
    }
    V128::from_bytes(c)
}

#[cfg(miri)]
#[inline(always)]
pub fn u8x16_min(a: V128, b: V128) -> V128 {
    let (a, b) = (a.as_bytes(), b.as_bytes());
    let mut c = [0; 16];
    for i in 0..16 {
        c[i] = min(a[i], b[i]);
    }
    V128::from_bytes(c)
}
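// Illustrative sanity check (an assumption added in this rewrite, not part of
// the original crate): each scalar routine in this module mirrors the lane-wise
// semantics of the intrinsic it simulates, so properties like these should hold
// when the suite runs under Miri. The module and test names are hypothetical.
#[cfg(all(test, miri))]
mod simulation_sketch {
    use super::*;

    #[test]
    fn bitmask_collects_high_bits() {
        // Lane i of the result must be exactly the high bit of byte i.
        assert_eq!(u8x16_bitmask(V128::from_bytes([0x80; 16])), u16::MAX);
        assert_eq!(u8x16_bitmask(V128::from_bytes([0x7f; 16])), 0);
    }
}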
#[allow(clippy::needless_range_loop)]
#[inline(always)]
pub fn u8x16_bitmask(a: V128) -> u16 {
    // FIXME: is it defined behavior?
    // https://github.com/rust-lang/miri/issues/2617
    // https://github.com/rust-lang/stdarch/issues/1347
    let a = a.as_bytes();
    let mut m: u16 = 0;
    for i in 0..16 {
        m |= ((a[i] >> 7) as u16) << i;
    }
    m
}

#[allow(clippy::needless_range_loop)]
#[inline(always)]
pub fn u16x8_shr(a: V128, imm8: u8) -> V128 {
    let mut a: [u16; 8] = unsafe { t(a) };
    for i in 0..8 {
        a[i] >>= imm8;
    }
    unsafe { t(a) }
}

#[allow(clippy::needless_range_loop)]
#[inline(always)]
pub fn u16x8_shl(a: V128, imm8: u8) -> V128 {
    let mut a: [u16; 8] = unsafe { t(a) };
    for i in 0..8 {
        a[i] <<= imm8;
    }
    unsafe { t(a) }
}

#[inline(always)]
pub fn i16x8_packus(a: V128, b: V128) -> V128 {
    let a: [i16; 8] = unsafe { t(a) };
    let b: [i16; 8] = unsafe { t(b) };
    let sat_u8 = |x: i16| {
        if x < 0 {
            0
        } else if x > 255 {
            255
        } else {
            x as u8
        }
    };
    let mut c: [u8; 16] = [0; 16];
    for i in 0..8 {
        c[i] = sat_u8(a[i]);
        c[i + 8] = sat_u8(b[i]);
    }
    V128::from_bytes(c)
}
vsimd-0.8.0/src/table.rs000064400000000000000000000006641046102023000132050ustar 00000000000000
use crate::isa::{NEON, SSSE3, WASM128};
use crate::pod::POD;
use crate::Scalable;

#[inline(always)]
pub fn u8x16xn_lookup<S, V>(s: S, lut: V, x: V) -> V
where
    S: Scalable<V>,
    V: POD,
{
    if matches_isa!(S, SSSE3) {
        return s.u8x16xn_swizzle(lut, x);
    }
    if matches_isa!(S, NEON | WASM128) {
        // keep the low 4 index bits plus the high "zero this lane" bit (pshufb semantics)
        let idx = s.and(x, s.u8xn_splat(0x8f));
        return s.u8x16xn_swizzle(lut, idx);
    }
    unreachable!()
}
vsimd-0.8.0/src/tools.rs000064400000000000000000000050721046102023000132540ustar 00000000000000
#[cfg(feature = "alloc")]
item_group! {
    use core::mem::MaybeUninit;
    use alloc::boxed::Box;
}

/// Allocates uninit bytes
///
/// # Safety
/// This function requires:
///
/// + `len > 0`
/// + `len <= isize::MAX`
///
#[cfg(feature = "alloc")]
#[inline]
#[must_use]
pub unsafe fn alloc_uninit_bytes(len: usize) -> Box<[MaybeUninit<u8>]> {
    #[allow(clippy::checked_conversions)]
    #[cfg(any(debug_assertions, miri))]
    {
        assert!(len > 0 && len <= (isize::MAX as usize));
    }
    use alloc::alloc::{alloc, handle_alloc_error, Layout};
    let layout = Layout::from_size_align_unchecked(len, 1);
    let p = alloc(layout);
    if p.is_null() {
        handle_alloc_error(layout)
    }
    let ptr = p.cast();
    Box::from_raw(core::ptr::slice_from_raw_parts_mut(ptr, len))
}

#[cfg(feature = "alloc")]
#[inline]
#[must_use]
pub unsafe fn assume_init(b: Box<[MaybeUninit<u8>]>) -> Box<[u8]> {
    let len = b.len();
    let ptr = Box::into_raw(b).cast::<u8>();
    Box::from_raw(core::ptr::slice_from_raw_parts_mut(ptr, len))
}

#[inline(always)]
pub unsafe fn read<T>(base: *const T, offset: usize) -> T {
    base.add(offset).read()
}

#[inline(always)]
pub unsafe fn write<T>(base: *mut T, offset: usize, value: T) {
    base.add(offset).write(value);
}

#[inline(always)]
pub unsafe fn slice<'a, T>(data: *const T, len: usize) -> &'a [T] {
    core::slice::from_raw_parts(data, len)
}

#[inline(always)]
pub unsafe fn slice_mut<'a, T>(data: *mut T, len: usize) -> &'a mut [T] {
    core::slice::from_raw_parts_mut(data, len)
}

#[inline(always)]
pub fn unroll<T>(slice: &[T], chunk_size: usize, mut f: impl FnMut(&T)) {
    let mut iter = slice.chunks_exact(chunk_size);
    for chunk in &mut iter {
        chunk.iter().for_each(&mut f);
    }
    iter.remainder().iter().for_each(&mut f);
}

#[inline(always)]
#[must_use]
pub fn is_same_type<A, B>() -> bool
where
    A: 'static,
    B: 'static,
{
    use core::any::TypeId;
    TypeId::of::<A>() == TypeId::of::<B>()
}

#[inline(always)]
pub fn slice_parts<T>(slice: &[T]) -> (*const T, usize) {
    let len = slice.len();
    let ptr = slice.as_ptr();
    (ptr, len)
}
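// Illustrative usage sketch (an assumption, not from the original source): the
// typical pairing of `alloc_uninit_bytes` with `assume_init` — allocate, fill
// every byte, then freeze. The helper name `fill_and_freeze` is hypothetical.
#[cfg(feature = "alloc")]
#[allow(dead_code)]
unsafe fn fill_and_freeze(len: usize, byte: u8) -> Box<[u8]> {
    // SAFETY: the caller guarantees `0 < len <= isize::MAX`.
    let mut buf = alloc_uninit_bytes(len);
    for slot in buf.iter_mut() {
        slot.write(byte); // initialize every byte before `assume_init`
    }
    // SAFETY: all bytes were initialized in the loop above.
    assume_init(buf)
}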
#[cfg(feature = "alloc")]
#[inline(always)]
#[must_use]
pub unsafe fn boxed_str(b: Box<[u8]>) -> Box<str> {
    let ptr = Box::into_raw(b);
    Box::from_raw(core::str::from_utf8_unchecked_mut(&mut *ptr))
}

#[allow(clippy::ptr_as_ptr)]
#[inline(always)]
#[cfg_attr(debug_assertions, track_caller)]
pub unsafe fn transmute_copy<A, B>(a: &A) -> B {
    debug_assert!(core::mem::size_of::<A>() == core::mem::size_of::<B>());
    *(a as *const A as *const B)
}
vsimd-0.8.0/src/unified.rs000064400000000000000000001137021046102023000135370ustar 00000000000000
#![allow(clippy::collapsible_if, clippy::too_many_lines)]

use crate::isa::InstructionSet;
use crate::pod::POD;
use crate::tools::transmute_copy as tc;
use crate::vector::{V128, V256};

#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
use crate::isa::{AVX2, SSE2, SSE41};
#[cfg(any(all(feature = "unstable", target_arch = "arm"), target_arch = "aarch64"))]
use crate::isa::NEON;
#[cfg(target_arch = "wasm32")]
use crate::isa::WASM128;

#[cfg(target_arch = "x86")]
use core::arch::x86::*;
#[cfg(target_arch = "x86_64")]
use core::arch::x86_64::*;
#[cfg(all(feature = "unstable", target_arch = "arm"))]
use core::arch::arm::*;
#[cfg(target_arch = "aarch64")]
use core::arch::aarch64::*;
#[cfg(target_arch = "wasm32")]
use core::arch::wasm32::*;

#[inline(always)]
pub fn splat<S: InstructionSet, T: POD, V: POD>(s: S, x: T) -> V {
    if is_pod_type!(V, V256) {
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        if matches_isa!(S, AVX2) {
            if is_pod_type!(T, u8 | i8) { return unsafe { tc(&_mm256_set1_epi8(tc(&x))) }; }
        }
        {
            let c = splat::<S, T, V128>(s, x).x2();
            return unsafe { tc(&c) };
        }
    }
    if is_pod_type!(V, V128) {
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        if matches_isa!(S, SSE2) {
            if is_pod_type!(T, u8 | i8) { return unsafe { tc(&_mm_set1_epi8(tc(&x))) }; }
            if is_pod_type!(T, u16 | i16) { return unsafe { tc(&_mm_set1_epi16(tc(&x))) }; }
            if is_pod_type!(T, u32 | i32) { return unsafe { tc(&_mm_set1_epi32(tc(&x))) }; }
            if is_pod_type!(T, u64 | i64) { return unsafe { tc(&_mm_set1_epi64x(tc(&x))) }; }
        }
        #[cfg(any(all(feature = "unstable", target_arch = "arm"), target_arch = "aarch64"))]
        if matches_isa!(S, NEON) {
            if is_pod_type!(T, u8 | i8) { return unsafe { tc(&vld1q_dup_u8(&tc(&x))) }; }
            if is_pod_type!(T, u16 | i16) { return unsafe { tc(&vld1q_dup_u16(&tc(&x))) }; }
            if is_pod_type!(T, u32 | i32) { return unsafe { tc(&vld1q_dup_u32(&tc(&x))) }; }
            if is_pod_type!(T, u64 | i64) { return unsafe { tc(&vld1q_dup_u64(&tc(&x))) }; }
        }
        #[cfg(target_arch = "wasm32")]
        if matches_isa!(S, WASM128) {
            if is_pod_type!(T, u8 | i8) { return unsafe { tc(&u8x16_splat(tc(&x))) }; }
            if is_pod_type!(T, u16 | i16) { return unsafe { tc(&u16x8_splat(tc(&x))) }; }
            if is_pod_type!(T, u32 | i32) { return unsafe { tc(&u32x4_splat(tc(&x))) }; }
            if is_pod_type!(T, u64 | i64) { return unsafe { tc(&u64x2_splat(tc(&x))) }; }
        }
    }
    {
        let _ = (s, x);
        unreachable!()
    }
}

#[inline(always)]
pub fn add<S: InstructionSet, T: POD, V: POD>(s: S, a: V, b: V) -> V {
    if is_pod_type!(V, V256) {
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        if matches_isa!(S, AVX2) {
            if is_pod_type!(T, u8 | i8) { return unsafe { tc(&_mm256_add_epi8(tc(&a), tc(&b))) }; }
            if is_pod_type!(T, u16 | i16) { return unsafe { tc(&_mm256_add_epi16(tc(&a), tc(&b))) }; }
            if is_pod_type!(T, u32 | i32) { return unsafe { tc(&_mm256_add_epi32(tc(&a), tc(&b))) }; }
            if is_pod_type!(T, u64 | i64) { return unsafe { tc(&_mm256_add_epi64(tc(&a), tc(&b))) }; }
        }
        {
            let (a, b): (V256, V256) = unsafe { (tc(&a), tc(&b)) };
            let (a, b) = (a.to_v128x2(), b.to_v128x2());
            let c0 = add::<S, T, V128>(s, a.0, b.0);
            let c1 = add::<S, T, V128>(s, a.1, b.1);
            return unsafe { tc(&V256::from_v128x2((c0, c1))) };
        }
    }
    if is_pod_type!(V, V128) {
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        if matches_isa!(S, SSE2) {
            if is_pod_type!(T, u8 | i8) { return unsafe { tc(&_mm_add_epi8(tc(&a), tc(&b))) }; }
            if is_pod_type!(T, u16 | i16) { return unsafe { tc(&_mm_add_epi16(tc(&a), tc(&b))) }; }
            if is_pod_type!(T, u32 | i32) { return unsafe { tc(&_mm_add_epi32(tc(&a), tc(&b))) }; }
            if is_pod_type!(T, u64 | i64) { return unsafe { tc(&_mm_add_epi64(tc(&a), tc(&b))) }; }
        }
        #[cfg(any(all(feature = "unstable", target_arch = "arm"), target_arch = "aarch64"))]
        if matches_isa!(S, NEON) {
            if is_pod_type!(T, u8 | i8) { return unsafe { tc(&vaddq_u8(tc(&a), tc(&b))) }; }
            if is_pod_type!(T, u16 | i16) { return unsafe { tc(&vaddq_u16(tc(&a), tc(&b))) }; }
            if is_pod_type!(T, u32 | i32) { return unsafe { tc(&vaddq_u32(tc(&a), tc(&b))) }; }
            if is_pod_type!(T, u64 | i64) { return unsafe { tc(&vaddq_u64(tc(&a), tc(&b))) }; }
        }
        #[cfg(target_arch = "wasm32")]
        if matches_isa!(S, WASM128) {
            if is_pod_type!(T, u8 | i8) { return unsafe { tc(&u8x16_add(tc(&a), tc(&b))) }; }
            if is_pod_type!(T, u16 | i16) { return unsafe { tc(&u16x8_add(tc(&a), tc(&b))) }; }
            if is_pod_type!(T, u32 | i32) { return unsafe { tc(&u32x4_add(tc(&a), tc(&b))) }; }
            if is_pod_type!(T, u64 | i64) { return unsafe { tc(&u64x2_add(tc(&a), tc(&b))) }; }
        }
    }
    {
        let _ = (s, a, b);
        unreachable!()
    }
}

#[inline(always)]
pub fn sub<S: InstructionSet, T: POD, V: POD>(s: S, a: V, b: V) -> V {
    if is_pod_type!(V, V256) {
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        if matches_isa!(S, AVX2) {
            if is_pod_type!(T, u8 | i8) { return unsafe { tc(&_mm256_sub_epi8(tc(&a), tc(&b))) }; }
            if is_pod_type!(T, u16 | i16) { return unsafe { tc(&_mm256_sub_epi16(tc(&a), tc(&b))) }; }
            if is_pod_type!(T, u32 | i32) { return unsafe { tc(&_mm256_sub_epi32(tc(&a), tc(&b))) }; }
            if is_pod_type!(T, u64 | i64) { return unsafe { tc(&_mm256_sub_epi64(tc(&a), tc(&b))) }; }
        }
        {
            let (a, b): (V256, V256) = unsafe { (tc(&a), tc(&b)) };
            let (a, b) = (a.to_v128x2(), b.to_v128x2());
            let c0 = sub::<S, T, V128>(s, a.0, b.0);
            let c1 = sub::<S, T, V128>(s, a.1, b.1);
            return unsafe { tc(&V256::from_v128x2((c0, c1))) };
        }
    }
    if is_pod_type!(V, V128) {
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        if matches_isa!(S, SSE2) {
            if is_pod_type!(T, u8 | i8) { return unsafe { tc(&_mm_sub_epi8(tc(&a), tc(&b))) }; }
            if is_pod_type!(T, u16 | i16) { return unsafe { tc(&_mm_sub_epi16(tc(&a), tc(&b))) }; }
            if is_pod_type!(T, u32 | i32) { return unsafe { tc(&_mm_sub_epi32(tc(&a), tc(&b))) }; }
            if is_pod_type!(T, u64 | i64) { return unsafe { tc(&_mm_sub_epi64(tc(&a), tc(&b))) }; }
        }
        #[cfg(any(all(feature = "unstable", target_arch = "arm"), target_arch = "aarch64"))]
        if matches_isa!(S, NEON) {
            if is_pod_type!(T, u8 | i8) { return unsafe { tc(&vsubq_u8(tc(&a), tc(&b))) }; }
            if is_pod_type!(T, u16 | i16) { return unsafe { tc(&vsubq_u16(tc(&a), tc(&b))) }; }
            if is_pod_type!(T, u32 | i32) { return unsafe { tc(&vsubq_u32(tc(&a), tc(&b))) }; }
            if is_pod_type!(T, u64 | i64) { return unsafe { tc(&vsubq_u64(tc(&a), tc(&b))) }; }
        }
        #[cfg(target_arch = "wasm32")]
        if matches_isa!(S, WASM128) {
            if is_pod_type!(T, u8 | i8) { return unsafe { tc(&u8x16_sub(tc(&a), tc(&b))) }; }
            if is_pod_type!(T, u16 | i16) { return unsafe { tc(&u16x8_sub(tc(&a), tc(&b))) }; }
            if is_pod_type!(T, u32 | i32) { return unsafe { tc(&u32x4_sub(tc(&a), tc(&b))) }; }
            if is_pod_type!(T, u64 | i64) { return unsafe { tc(&u64x2_sub(tc(&a), tc(&b))) }; }
        }
    }
    {
        let _ = (s, a, b);
        unreachable!()
    }
}
#[inline(always)]
pub fn eq<S: InstructionSet, T: POD, V: POD>(s: S, a: V, b: V) -> V {
    if is_pod_type!(V, V256) {
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        if matches_isa!(S, AVX2) {
            if is_pod_type!(T, u8 | i8) { return unsafe { tc(&_mm256_cmpeq_epi8(tc(&a), tc(&b))) }; }
            if is_pod_type!(T, u16 | i16) { return unsafe { tc(&_mm256_cmpeq_epi16(tc(&a), tc(&b))) }; }
            if is_pod_type!(T, u32 | i32) { return unsafe { tc(&_mm256_cmpeq_epi32(tc(&a), tc(&b))) }; }
            if is_pod_type!(T, u64 | i64) { return unsafe { tc(&_mm256_cmpeq_epi64(tc(&a), tc(&b))) }; }
        }
        {
            let (a, b): (V256, V256) = unsafe { (tc(&a), tc(&b)) };
            let (a, b) = (a.to_v128x2(), b.to_v128x2());
            let c0 = eq::<S, T, V128>(s, a.0, b.0);
            let c1 = eq::<S, T, V128>(s, a.1, b.1);
            return unsafe { tc(&V256::from_v128x2((c0, c1))) };
        }
    }
    if is_pod_type!(V, V128) {
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        if matches_isa!(S, SSE2) {
            if is_pod_type!(T, u8 | i8) { return unsafe { tc(&_mm_cmpeq_epi8(tc(&a), tc(&b))) }; }
            if is_pod_type!(T, u16 | i16) { return unsafe { tc(&_mm_cmpeq_epi16(tc(&a), tc(&b))) }; }
            if is_pod_type!(T, u32 | i32) { return unsafe { tc(&_mm_cmpeq_epi32(tc(&a), tc(&b))) }; }
        }
        #[cfg(any(all(feature = "unstable", target_arch = "arm"), target_arch = "aarch64"))]
        if matches_isa!(S, NEON) {
            if is_pod_type!(T, u8 | i8) { return unsafe { tc(&vceqq_u8(tc(&a), tc(&b))) }; }
            if is_pod_type!(T, u16 | i16) { return unsafe { tc(&vceqq_u16(tc(&a), tc(&b))) }; }
            if is_pod_type!(T, u32 | i32) { return unsafe { tc(&vceqq_u32(tc(&a), tc(&b))) }; }
            #[cfg(target_arch = "aarch64")]
            if is_pod_type!(T, u64 | i64) { return unsafe { tc(&vceqq_u64(tc(&a), tc(&b))) }; }
        }
        #[cfg(target_arch = "wasm32")]
        if matches_isa!(S, WASM128) {
            if is_pod_type!(T, u8 | i8) { return unsafe { tc(&u8x16_eq(tc(&a), tc(&b))) }; }
            if is_pod_type!(T, u16 | i16) { return unsafe { tc(&u16x8_eq(tc(&a), tc(&b))) }; }
            if is_pod_type!(T, u32 | i32) { return unsafe { tc(&u32x4_eq(tc(&a), tc(&b))) }; }
            if is_pod_type!(T, u64 | i64) { return unsafe { tc(&u64x2_eq(tc(&a), tc(&b))) }; }
        }
    }
    {
        let _ = (s, a, b);
        unreachable!()
    }
}

#[inline(always)]
pub fn lt<S: InstructionSet, T: POD, V: POD>(s: S, a: V, b: V) -> V {
    if is_pod_type!(V, V256) {
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        if matches_isa!(S, AVX2) {
            if is_pod_type!(T, i8) { return unsafe { tc(&_mm256_cmpgt_epi8(tc(&b), tc(&a))) }; }
            if is_pod_type!(T, i16) { return unsafe { tc(&_mm256_cmpgt_epi16(tc(&b), tc(&a))) }; }
            if is_pod_type!(T, i32) { return unsafe { tc(&_mm256_cmpgt_epi32(tc(&b), tc(&a))) }; }
            if is_pod_type!(T, i64) { return unsafe { tc(&_mm256_cmpgt_epi64(tc(&b), tc(&a))) }; }
            if is_pod_type!(T, u8) {
                return unsafe {
                    let (a, b) = (tc(&a), tc(&b));
                    let c = _mm256_cmpeq_epi8(a, _mm256_max_epu8(a, b));
                    tc(&_mm256_xor_si256(c, _mm256_cmpeq_epi8(a, a)))
                };
            }
            if is_pod_type!(T, u16) {
                return unsafe {
                    let (a, b) = (tc(&a), tc(&b));
                    let c = _mm256_cmpeq_epi16(a, _mm256_max_epu16(a, b));
                    tc(&_mm256_xor_si256(c, _mm256_cmpeq_epi16(a, a)))
                };
            }
            if is_pod_type!(T, u32) {
                return unsafe {
                    let (a, b) = (tc(&a), tc(&b));
                    let c = _mm256_cmpeq_epi32(a, _mm256_max_epu32(a, b));
                    tc(&_mm256_xor_si256(c, _mm256_cmpeq_epi32(a, a)))
                };
            }
        }
        {
            let (a, b): (V256, V256) = unsafe { (tc(&a), tc(&b)) };
            let (a, b) = (a.to_v128x2(), b.to_v128x2());
            let c0 = lt::<S, T, V128>(s, a.0, b.0);
            let c1 = lt::<S, T, V128>(s, a.1, b.1);
            return unsafe { tc(&V256::from_v128x2((c0, c1))) };
        }
    }
    if is_pod_type!(V, V128) {
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        if matches_isa!(S, SSE2) {
            if is_pod_type!(T, i8) { return unsafe { tc(&_mm_cmplt_epi8(tc(&a), tc(&b))) }; }
            if is_pod_type!(T, i16) { return unsafe { tc(&_mm_cmplt_epi16(tc(&a), tc(&b))) }; }
            if is_pod_type!(T, i32) { return unsafe { tc(&_mm_cmplt_epi32(tc(&a), tc(&b))) }; }
            if is_pod_type!(T, u8) {
                return unsafe {
                    let (a, b) = (tc(&a), tc(&b));
                    let c = _mm_cmpeq_epi8(a, _mm_max_epu8(a, b));
                    tc(&_mm_xor_si128(c, _mm_cmpeq_epi8(a, a)))
                };
            }
            if is_pod_type!(T, u16) {
                return unsafe {
                    let m = _mm_set1_epi16(i16::MIN);
                    let a = _mm_xor_si128(tc(&a), m);
                    let b = _mm_xor_si128(tc(&b), m);
                    tc(&_mm_cmplt_epi16(a, b))
                };
            }
            if is_pod_type!(T, u32) {
                return unsafe {
                    let m = _mm_set1_epi32(i32::MIN);
                    let a = _mm_xor_si128(tc(&a), m);
                    let b = _mm_xor_si128(tc(&b), m);
                    tc(&_mm_cmplt_epi32(a, b))
                };
            }
        }
        #[cfg(any(all(feature = "unstable", target_arch = "arm"), target_arch = "aarch64"))]
        if matches_isa!(S, NEON) {
            if is_pod_type!(T, i8) { return unsafe { tc(&vcltq_s8(tc(&a), tc(&b))) }; }
            if is_pod_type!(T, i16) { return unsafe { tc(&vcltq_s16(tc(&a), tc(&b))) }; }
            if is_pod_type!(T, i32) { return unsafe { tc(&vcltq_s32(tc(&a), tc(&b))) }; }
            #[cfg(target_arch = "aarch64")]
            if is_pod_type!(T, i64) { return unsafe { tc(&vcltq_s64(tc(&a), tc(&b))) }; }
            if is_pod_type!(T, u8) { return unsafe { tc(&vcltq_u8(tc(&a), tc(&b))) }; }
            if is_pod_type!(T, u16) { return unsafe { tc(&vcltq_u16(tc(&a), tc(&b))) }; }
            if is_pod_type!(T, u32) { return unsafe { tc(&vcltq_u32(tc(&a), tc(&b))) }; }
            #[cfg(target_arch = "aarch64")]
            if is_pod_type!(T, u64) { return unsafe { tc(&vcltq_u64(tc(&a), tc(&b))) }; }
        }
        #[cfg(target_arch = "wasm32")]
        if matches_isa!(S, WASM128) {
            if is_pod_type!(T, i8) { return unsafe { tc(&i8x16_lt(tc(&a), tc(&b))) }; }
            if is_pod_type!(T, i16) { return unsafe { tc(&i16x8_lt(tc(&a), tc(&b))) }; }
            if is_pod_type!(T, i32) { return unsafe { tc(&i32x4_lt(tc(&a), tc(&b))) }; }
            if is_pod_type!(T, i64) { return unsafe { tc(&i64x2_lt(tc(&a), tc(&b))) }; }
            if is_pod_type!(T, u8) { return unsafe { tc(&u8x16_lt(tc(&a), tc(&b))) }; }
            if is_pod_type!(T, u16) { return unsafe { tc(&u16x8_lt(tc(&a), tc(&b))) }; }
            if is_pod_type!(T, u32) { return unsafe { tc(&u32x4_lt(tc(&a), tc(&b))) }; }
            // if is_pod_type!(T, u64) {
            //     return unsafe { tc(&u64x2_lt(tc(&a), tc(&b))) };
            // }
        }
    }
    {
        let _ = (s, a, b);
        unreachable!()
    }
}
#[inline(always)]
pub fn add_sat<S: InstructionSet, T: POD, V: POD>(s: S, a: V, b: V) -> V {
    if is_pod_type!(V, V256) {
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        if matches_isa!(S, AVX2) {
            if is_pod_type!(T, i8) { return unsafe { tc(&_mm256_adds_epi8(tc(&a), tc(&b))) }; }
            if is_pod_type!(T, i16) { return unsafe { tc(&_mm256_adds_epi16(tc(&a), tc(&b))) }; }
            if is_pod_type!(T, u8) { return unsafe { tc(&_mm256_adds_epu8(tc(&a), tc(&b))) }; }
            if is_pod_type!(T, u16) { return unsafe { tc(&_mm256_adds_epu16(tc(&a), tc(&b))) }; }
        }
        {
            let (a, b): (V256, V256) = unsafe { (tc(&a), tc(&b)) };
            let (a, b) = (a.to_v128x2(), b.to_v128x2());
            let c0 = add_sat::<S, T, V128>(s, a.0, b.0);
            let c1 = add_sat::<S, T, V128>(s, a.1, b.1);
            return unsafe { tc(&V256::from_v128x2((c0, c1))) };
        }
    }
    if is_pod_type!(V, V128) {
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        if matches_isa!(S, SSE2) {
            if is_pod_type!(T, i8) { return unsafe { tc(&_mm_adds_epi8(tc(&a), tc(&b))) }; }
            if is_pod_type!(T, i16) { return unsafe { tc(&_mm_adds_epi16(tc(&a), tc(&b))) }; }
            if is_pod_type!(T, u8) { return unsafe { tc(&_mm_adds_epu8(tc(&a), tc(&b))) }; }
            if is_pod_type!(T, u16) { return unsafe { tc(&_mm_adds_epu16(tc(&a), tc(&b))) }; }
        }
        #[cfg(any(all(feature = "unstable", target_arch = "arm"), target_arch = "aarch64"))]
        if matches_isa!(S, NEON) {
            if is_pod_type!(T, i8) { return unsafe { tc(&vqaddq_s8(tc(&a), tc(&b))) }; }
            if is_pod_type!(T, i16) { return unsafe { tc(&vqaddq_s16(tc(&a), tc(&b))) }; }
            if is_pod_type!(T, i32) { return unsafe { tc(&vqaddq_s32(tc(&a), tc(&b))) }; }
            if is_pod_type!(T, u8) { return unsafe { tc(&vqaddq_u8(tc(&a), tc(&b))) }; }
            if is_pod_type!(T, u16) { return unsafe { tc(&vqaddq_u16(tc(&a), tc(&b))) }; }
            if is_pod_type!(T, u32) { return unsafe { tc(&vqaddq_u32(tc(&a), tc(&b))) }; }
        }
        #[cfg(target_arch = "wasm32")]
        if matches_isa!(S, WASM128) {
            if is_pod_type!(T, i8) { return unsafe { tc(&i8x16_add_sat(tc(&a), tc(&b))) }; }
            if is_pod_type!(T, i16) { return unsafe { tc(&i16x8_add_sat(tc(&a), tc(&b))) }; }
            if is_pod_type!(T, u8) { return unsafe { tc(&u8x16_add_sat(tc(&a), tc(&b))) }; }
            if is_pod_type!(T, u16) { return unsafe { tc(&u16x8_add_sat(tc(&a), tc(&b))) }; }
        }
    }
    {
        let _ = (s, a, b);
        unreachable!()
    }
}

#[inline(always)]
pub fn sub_sat<S: InstructionSet, T: POD, V: POD>(s: S, a: V, b: V) -> V {
    if is_pod_type!(V, V256) {
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        if matches_isa!(S, AVX2) {
            if is_pod_type!(T, i8) { return unsafe { tc(&_mm256_subs_epi8(tc(&a), tc(&b))) }; }
            if is_pod_type!(T, i16) { return unsafe { tc(&_mm256_subs_epi16(tc(&a), tc(&b))) }; }
            if is_pod_type!(T, u8) { return unsafe { tc(&_mm256_subs_epu8(tc(&a), tc(&b))) }; }
            if is_pod_type!(T, u16) { return unsafe { tc(&_mm256_subs_epu16(tc(&a), tc(&b))) }; }
        }
        {
            let (a, b): (V256, V256) = unsafe { (tc(&a), tc(&b)) };
            let (a, b) = (a.to_v128x2(), b.to_v128x2());
            let c0 = sub_sat::<S, T, V128>(s, a.0, b.0);
            let c1 = sub_sat::<S, T, V128>(s, a.1, b.1);
            return unsafe { tc(&V256::from_v128x2((c0, c1))) };
        }
    }
    if is_pod_type!(V, V128) {
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        if matches_isa!(S, SSE2) {
            if is_pod_type!(T, i8) { return unsafe { tc(&_mm_subs_epi8(tc(&a), tc(&b))) }; }
            if is_pod_type!(T, i16) { return unsafe { tc(&_mm_subs_epi16(tc(&a), tc(&b))) }; }
            if is_pod_type!(T, u8) { return unsafe { tc(&_mm_subs_epu8(tc(&a), tc(&b))) }; }
            if is_pod_type!(T, u16) { return unsafe { tc(&_mm_subs_epu16(tc(&a), tc(&b))) }; }
        }
        #[cfg(any(all(feature = "unstable", target_arch = "arm"), target_arch = "aarch64"))]
        if matches_isa!(S, NEON) {
            if is_pod_type!(T, i8) { return unsafe { tc(&vqsubq_s8(tc(&a), tc(&b))) }; }
            if is_pod_type!(T, i16) { return unsafe { tc(&vqsubq_s16(tc(&a), tc(&b))) }; }
            if is_pod_type!(T, i32) { return unsafe { tc(&vqsubq_s32(tc(&a), tc(&b))) }; }
            if is_pod_type!(T, u8) { return unsafe { tc(&vqsubq_u8(tc(&a), tc(&b))) }; }
            if is_pod_type!(T, u16) { return unsafe { tc(&vqsubq_u16(tc(&a), tc(&b))) }; }
            if is_pod_type!(T, u32) { return unsafe { tc(&vqsubq_u32(tc(&a), tc(&b))) }; }
        }
        #[cfg(target_arch = "wasm32")]
        if matches_isa!(S, WASM128) {
            if is_pod_type!(T, i8) { return unsafe { tc(&i8x16_sub_sat(tc(&a), tc(&b))) }; }
            if is_pod_type!(T, i16) { return unsafe { tc(&i16x8_sub_sat(tc(&a), tc(&b))) }; }
            if is_pod_type!(T, u8) { return unsafe { tc(&u8x16_sub_sat(tc(&a), tc(&b))) }; }
            if is_pod_type!(T, u16) { return unsafe { tc(&u16x8_sub_sat(tc(&a), tc(&b))) }; }
        }
    }
    {
        let _ = (s, a, b);
        unreachable!()
    }
}

pub fn max<S: InstructionSet, T: POD, V: POD>(s: S, a: V, b: V) -> V {
    if is_pod_type!(V, V256) {
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        if matches_isa!(S, AVX2) {
            if is_pod_type!(T, i8) { return unsafe { tc(&_mm256_max_epi8(tc(&a), tc(&b))) }; }
            if is_pod_type!(T, i16) { return unsafe { tc(&_mm256_max_epi16(tc(&a), tc(&b))) }; }
            if is_pod_type!(T, i32) { return unsafe { tc(&_mm256_max_epi32(tc(&a), tc(&b))) }; }
            if is_pod_type!(T, u8) { return unsafe { tc(&_mm256_max_epu8(tc(&a), tc(&b))) }; }
            if is_pod_type!(T, u16) { return unsafe { tc(&_mm256_max_epu16(tc(&a), tc(&b))) }; }
            if is_pod_type!(T, u32) { return unsafe { tc(&_mm256_max_epu32(tc(&a), tc(&b))) }; }
            if is_pod_type!(T, f32) { return unsafe { tc(&_mm256_max_ps(tc(&a), tc(&b))) }; }
            if is_pod_type!(T, f64) { return unsafe { tc(&_mm256_max_pd(tc(&a), tc(&b))) }; }
        }
        {
            let (a, b): (V256, V256) = unsafe { (tc(&a), tc(&b)) };
            let (a, b) = (a.to_v128x2(), b.to_v128x2());
            let c0 = max::<S, T, V128>(s, a.0, b.0);
            let c1 = max::<S, T, V128>(s, a.1, b.1);
            return unsafe { tc(&V256::from_v128x2((c0, c1))) };
        }
    }
    if is_pod_type!(V, V128) {
        #[cfg(miri)]
        {
            if is_pod_type!(T, u8) { return unsafe { tc(&crate::simulation::u8x16_max(tc(&a), tc(&b))) }; }
        }
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        if matches_isa!(S, SSE41) {
            if is_pod_type!(T, i8) { return unsafe { tc(&_mm_max_epi8(tc(&a), tc(&b))) }; }
            if is_pod_type!(T, i32) { return unsafe { tc(&_mm_max_epi32(tc(&a), tc(&b))) }; }
            if is_pod_type!(T, u16) { return unsafe { tc(&_mm_max_epu16(tc(&a), tc(&b))) }; }
            if is_pod_type!(T, u32) { return unsafe { tc(&_mm_max_epu32(tc(&a), tc(&b))) }; }
        }
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        if matches_isa!(S, SSE2) {
            if is_pod_type!(T, i16) { return unsafe { tc(&_mm_max_epi16(tc(&a), tc(&b))) }; }
            if is_pod_type!(T, u8) { return unsafe { tc(&_mm_max_epu8(tc(&a), tc(&b))) }; }
            if is_pod_type!(T, f32) { return unsafe { tc(&_mm_max_ps(tc(&a), tc(&b))) }; }
            if is_pod_type!(T, f64) { return unsafe { tc(&_mm_max_pd(tc(&a), tc(&b))) }; }
        }
        #[cfg(any(all(feature = "unstable", target_arch = "arm"), target_arch = "aarch64"))]
        if matches_isa!(S, NEON) {
            if is_pod_type!(T, i8) { return unsafe { tc(&vmaxq_s8(tc(&a), tc(&b))) }; }
            if is_pod_type!(T, i16) { return unsafe { tc(&vmaxq_s16(tc(&a), tc(&b))) }; }
            if is_pod_type!(T, i32) { return unsafe { tc(&vmaxq_s32(tc(&a), tc(&b))) }; }
            if is_pod_type!(T, u8) { return unsafe { tc(&vmaxq_u8(tc(&a), tc(&b))) }; }
            if is_pod_type!(T, u16) { return unsafe { tc(&vmaxq_u16(tc(&a), tc(&b))) }; }
            if is_pod_type!(T, u32) { return unsafe { tc(&vmaxq_u32(tc(&a), tc(&b))) }; }
            if is_pod_type!(T, f32) { return unsafe { tc(&vmaxq_f32(tc(&a), tc(&b))) }; }
        }
        #[cfg(target_arch = "wasm32")]
        if matches_isa!(S, WASM128) {
            if is_pod_type!(T, i8) { return unsafe { tc(&i8x16_max(tc(&a), tc(&b))) }; }
            if is_pod_type!(T, i16) { return unsafe { tc(&i16x8_max(tc(&a), tc(&b))) }; }
            if is_pod_type!(T, i32) { return unsafe { tc(&i32x4_max(tc(&a), tc(&b))) }; }
            if is_pod_type!(T, u8) { return unsafe { tc(&u8x16_max(tc(&a), tc(&b))) }; }
            if is_pod_type!(T, u16) { return unsafe { tc(&u16x8_max(tc(&a), tc(&b))) }; }
            if is_pod_type!(T, u32) { return unsafe { tc(&u32x4_max(tc(&a), tc(&b))) }; }
            if is_pod_type!(T, f32) { return unsafe { tc(&f32x4_max(tc(&a), tc(&b))) }; }
        }
    }
    {
        let _ = (s, a, b);
        unreachable!()
    }
}

#[inline(always)]
pub fn min<S: InstructionSet, T: POD, V: POD>(s: S, a: V, b: V) -> V {
    if is_pod_type!(V, V256) {
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        if matches_isa!(S, AVX2) {
            if is_pod_type!(T, i8) { return unsafe { tc(&_mm256_min_epi8(tc(&a), tc(&b))) }; }
            if is_pod_type!(T, i16) { return unsafe { tc(&_mm256_min_epi16(tc(&a), tc(&b))) }; }
            if is_pod_type!(T, i32) { return unsafe { tc(&_mm256_min_epi32(tc(&a), tc(&b))) }; }
            if is_pod_type!(T, u8) { return unsafe { tc(&_mm256_min_epu8(tc(&a), tc(&b))) }; }
            if is_pod_type!(T, u16) { return unsafe { tc(&_mm256_min_epu16(tc(&a), tc(&b))) }; }
            if is_pod_type!(T, u32) { return unsafe { tc(&_mm256_min_epu32(tc(&a), tc(&b))) }; }
            if is_pod_type!(T, f32) { return unsafe { tc(&_mm256_min_ps(tc(&a), tc(&b))) }; }
            if is_pod_type!(T, f64) { return unsafe { tc(&_mm256_min_pd(tc(&a), tc(&b))) }; }
        }
        {
            let (a, b): (V256, V256) = unsafe { (tc(&a), tc(&b)) };
            let (a, b) = (a.to_v128x2(), b.to_v128x2());
            let c0 =
min::(s, a.0, b.0); let c1 = min::(s, a.1, b.1); return unsafe { tc(&V256::from_v128x2((c0, c1))) }; } } if is_pod_type!(V, V128) { #[cfg(miri)] { if is_pod_type!(T, u8) { return unsafe { tc(&crate::simulation::u8x16_min(tc(&a), tc(&b))) }; } } #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] if matches_isa!(S, SSE41) { if is_pod_type!(T, i8) { return unsafe { tc(&_mm_min_epi8(tc(&a), tc(&b))) }; } if is_pod_type!(T, i32) { return unsafe { tc(&_mm_min_epi32(tc(&a), tc(&b))) }; } if is_pod_type!(T, u16) { return unsafe { tc(&_mm_min_epu16(tc(&a), tc(&b))) }; } if is_pod_type!(T, u32) { return unsafe { tc(&_mm_min_epu32(tc(&a), tc(&b))) }; } } #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] if matches_isa!(S, SSE2) { if is_pod_type!(T, i16) { return unsafe { tc(&_mm_min_epi16(tc(&a), tc(&b))) }; } if is_pod_type!(T, u8) { return unsafe { tc(&_mm_min_epu8(tc(&a), tc(&b))) }; } if is_pod_type!(T, f32) { return unsafe { tc(&_mm_min_ps(tc(&a), tc(&b))) }; } if is_pod_type!(T, f64) { return unsafe { tc(&_mm_min_pd(tc(&a), tc(&b))) }; } } #[cfg(any(all(feature = "unstable", target_arch = "arm"), target_arch = "aarch64"))] if matches_isa!(S, NEON) { if is_pod_type!(T, i8) { return unsafe { tc(&vminq_s8(tc(&a), tc(&b))) }; } if is_pod_type!(T, i16) { return unsafe { tc(&vminq_s16(tc(&a), tc(&b))) }; } if is_pod_type!(T, i32) { return unsafe { tc(&vminq_s32(tc(&a), tc(&b))) }; } if is_pod_type!(T, u8) { return unsafe { tc(&vminq_u8(tc(&a), tc(&b))) }; } if is_pod_type!(T, u16) { return unsafe { tc(&vminq_u16(tc(&a), tc(&b))) }; } if is_pod_type!(T, u32) { return unsafe { tc(&vminq_u32(tc(&a), tc(&b))) }; } if is_pod_type!(T, f32) { return unsafe { tc(&vminq_f32(tc(&a), tc(&b))) }; } } #[cfg(target_arch = "wasm32")] if matches_isa!(S, WASM128) { if is_pod_type!(T, i8) { return unsafe { tc(&i8x16_min(tc(&a), tc(&b))) }; } if is_pod_type!(T, i16) { return unsafe { tc(&i16x8_min(tc(&a), tc(&b))) }; } if is_pod_type!(T, i32) { return unsafe { tc(&i32x4_min(tc(&a), tc(&b))) }; } if is_pod_type!(T, u8) { return unsafe { tc(&u8x16_min(tc(&a), tc(&b))) }; } if is_pod_type!(T, u16) { return unsafe { tc(&u16x8_min(tc(&a), tc(&b))) }; } if is_pod_type!(T, u32) { return unsafe { tc(&u32x4_min(tc(&a), tc(&b))) }; } if is_pod_type!(T, f32) { return unsafe { tc(&f32x4_min(tc(&a), tc(&b))) }; } } } { let _ = (s, a, b); unreachable!() } } #[inline(always)] pub fn and(s: S, a: V, b: V) -> V where S: InstructionSet, V: POD, { if is_pod_type!(V, V256) { #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] if matches_isa!(S, AVX2) { return unsafe { tc(&_mm256_and_si256(tc(&a), tc(&b))) }; } { let (a, b): (V256, V256) = unsafe { (tc(&a), tc(&b)) }; let (a, b) = (a.to_v128x2(), b.to_v128x2()); let c0 = and::(s, a.0, b.0); let c1 = and::(s, a.1, b.1); return unsafe { tc(&V256::from_v128x2((c0, c1))) }; } } if is_pod_type!(V, V128) { #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] if matches_isa!(S, SSE2) { return unsafe { tc(&_mm_and_si128(tc(&a), tc(&b))) }; } #[cfg(any(all(feature = "unstable", target_arch = "arm"), target_arch = "aarch64"))] if matches_isa!(S, NEON) { return unsafe { tc(&vandq_u8(tc(&a), tc(&b))) }; } #[cfg(target_arch = "wasm32")] if matches_isa!(S, WASM128) { return unsafe { tc(&v128_and(tc(&a), tc(&b))) }; } } { let _ = (s, a, b); unreachable!() } } #[inline(always)] pub fn or(s: S, a: V, b: V) -> V where S: InstructionSet, V: POD, { if is_pod_type!(V, V256) { #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] if matches_isa!(S, AVX2) { return unsafe { 
tc(&_mm256_or_si256(tc(&a), tc(&b))) }; } { let (a, b): (V256, V256) = unsafe { (tc(&a), tc(&b)) }; let (a, b) = (a.to_v128x2(), b.to_v128x2()); let c0 = or::(s, a.0, b.0); let c1 = or::(s, a.1, b.1); return unsafe { tc(&V256::from_v128x2((c0, c1))) }; } } if is_pod_type!(V, V128) { #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] if matches_isa!(S, SSE2) { return unsafe { tc(&_mm_or_si128(tc(&a), tc(&b))) }; } #[cfg(any(all(feature = "unstable", target_arch = "arm"), target_arch = "aarch64"))] if matches_isa!(S, NEON) { return unsafe { tc(&vorrq_u8(tc(&a), tc(&b))) }; } #[cfg(target_arch = "wasm32")] if matches_isa!(S, WASM128) { return unsafe { tc(&v128_or(tc(&a), tc(&b))) }; } } { let _ = (s, a, b); unreachable!() } } #[inline(always)] pub fn xor(s: S, a: V, b: V) -> V where S: InstructionSet, V: POD, { if is_pod_type!(V, V256) { #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] if matches_isa!(S, AVX2) { return unsafe { tc(&_mm256_xor_si256(tc(&a), tc(&b))) }; } { let (a, b): (V256, V256) = unsafe { (tc(&a), tc(&b)) }; let (a, b) = (a.to_v128x2(), b.to_v128x2()); let c0 = xor::(s, a.0, b.0); let c1 = xor::(s, a.1, b.1); return unsafe { tc(&V256::from_v128x2((c0, c1))) }; } } if is_pod_type!(V, V128) { #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] if matches_isa!(S, SSE2) { return unsafe { tc(&_mm_xor_si128(tc(&a), tc(&b))) }; } #[cfg(any(all(feature = "unstable", target_arch = "arm"), target_arch = "aarch64"))] if matches_isa!(S, NEON) { return unsafe { tc(&veorq_u8(tc(&a), tc(&b))) }; } #[cfg(target_arch = "wasm32")] if matches_isa!(S, WASM128) { return unsafe { tc(&v128_xor(tc(&a), tc(&b))) }; } } { let _ = (s, a, b); unreachable!() } } #[inline(always)] pub fn andnot(s: S, a: V, b: V) -> V where S: InstructionSet, V: POD, { if is_pod_type!(V, V256) { #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] if matches_isa!(S, AVX2) { let (a, b) = (b, a); return unsafe { tc(&_mm256_andnot_si256(tc(&a), tc(&b))) }; } { let (a, b): (V256, V256) = unsafe { (tc(&a), tc(&b)) }; let (a, b) = (a.to_v128x2(), b.to_v128x2()); let c0 = andnot::(s, a.0, b.0); let c1 = andnot::(s, a.1, b.1); return unsafe { tc(&V256::from_v128x2((c0, c1))) }; } } if is_pod_type!(V, V128) { #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] if matches_isa!(S, SSE2) { let (a, b) = (b, a); return unsafe { tc(&_mm_andnot_si128(tc(&a), tc(&b))) }; } #[cfg(any(all(feature = "unstable", target_arch = "arm"), target_arch = "aarch64"))] if matches_isa!(S, NEON) { return unsafe { tc(&vbicq_u8(tc(&a), tc(&b))) }; } #[cfg(target_arch = "wasm32")] if matches_isa!(S, WASM128) { return unsafe { tc(&v128_andnot(tc(&a), tc(&b))) }; } } { let _ = (s, a, b); unreachable!() } } vsimd-0.8.0/src/unstable.rs000064400000000000000000000002611046102023000137240ustar 00000000000000use core::simd::*; #[inline(always)] pub fn splat(x: T) -> Simd where T: SimdElement, LaneCount: SupportedLaneCount, { Simd::splat(x) } vsimd-0.8.0/src/vector.rs000064400000000000000000000122051046102023000134120ustar 00000000000000use core::mem::transmute; // vectors should have `repr(simd)` if possible. #[cfg(feature = "unstable")] item_group! 
{ use core::simd::{u8x16, u8x32, u8x64, u8x8}; #[derive(Debug, Clone, Copy)] #[repr(transparent)] pub struct V64(u8x8); #[derive(Debug, Clone, Copy)] #[repr(transparent)] pub struct V128(u8x16); #[derive(Debug, Clone, Copy)] #[repr(transparent)] pub struct V256(u8x32); #[derive(Debug, Clone, Copy)] #[repr(transparent)] pub struct V512(u8x64); } #[cfg(all(not(feature = "unstable"), any(target_arch = "x86", target_arch = "x86_64")))] item_group! { #[cfg(target_arch = "x86")] use core::arch::x86::*; #[cfg(target_arch = "x86_64")] use core::arch::x86_64::*; #[derive(Debug, Clone, Copy)] #[repr(transparent)] pub struct V64(u64); #[derive(Debug, Clone, Copy)] #[repr(transparent)] pub struct V128(__m128i); #[derive(Debug, Clone, Copy)] #[repr(transparent)] pub struct V256(__m256i); #[derive(Debug, Clone, Copy)] #[repr(C, align(64))] pub struct V512(__m256i, __m256i); } #[cfg(all(not(feature = "unstable"), target_arch = "aarch64"))] item_group! { use core::arch::aarch64::*; #[derive(Debug, Clone, Copy)] #[repr(transparent)] pub struct V64(uint8x8_t); #[derive(Debug, Clone, Copy)] #[repr(transparent)] pub struct V128(uint8x16_t); #[derive(Debug, Clone, Copy)] #[repr(transparent)] pub struct V256(uint8x16x2_t); #[derive(Debug, Clone, Copy)] #[repr(transparent)] pub struct V512(uint8x16x4_t); } #[cfg(all(not(feature = "unstable"), target_arch = "wasm32"))] item_group! { #[cfg(target_arch = "wasm32")] use core::arch::wasm32::*; #[derive(Debug, Clone, Copy)] #[repr(transparent)] pub struct V64(u64); #[derive(Debug, Clone, Copy)] #[repr(transparent)] pub struct V128(v128); #[derive(Debug, Clone, Copy)] #[repr(C, align(32))] pub struct V256(v128, v128); #[derive(Debug, Clone, Copy)] #[repr(C, align(64))] pub struct V512(v128, v128, v128, v128); } #[cfg(all( not(feature = "unstable"), not(any( any(target_arch = "x86", target_arch = "x86_64"), target_arch = "aarch64", target_arch = "wasm32" )) ))] item_group! 
{ #[derive(Debug, Clone, Copy)] #[repr(C, align(8))] pub struct V64([u8; 8]); #[derive(Debug, Clone, Copy)] #[repr(C, align(16))] pub struct V128([u8; 16]); #[derive(Debug, Clone, Copy)] #[repr(C, align(32))] pub struct V256([u8; 32]); #[derive(Debug, Clone, Copy)] #[repr(C, align(64))] pub struct V512([u8; 64]); } impl V64 { #[inline(always)] #[must_use] pub const fn from_bytes(bytes: [u8; 8]) -> Self { unsafe { transmute(bytes) } } #[inline(always)] #[must_use] pub const fn as_bytes(&self) -> &[u8; 8] { unsafe { transmute(self) } } #[inline(always)] #[must_use] pub fn to_u64(self) -> u64 { unsafe { transmute(self) } } } impl V128 { #[inline(always)] #[must_use] pub const fn from_bytes(bytes: [u8; 16]) -> Self { unsafe { transmute(bytes) } } #[inline(always)] #[must_use] pub const fn as_bytes(&self) -> &[u8; 16] { unsafe { transmute(self) } } #[inline(always)] #[must_use] pub const fn to_v64x2(self) -> (V64, V64) { let x: [V64; 2] = unsafe { transmute(self) }; (x[0], x[1]) } #[inline(always)] #[must_use] pub const fn x2(self) -> V256 { unsafe { transmute([self, self]) } } } impl V256 { #[inline(always)] #[must_use] pub const fn from_bytes(bytes: [u8; 32]) -> Self { unsafe { transmute(bytes) } } #[inline(always)] #[must_use] pub const fn as_bytes(&self) -> &[u8; 32] { unsafe { transmute(self) } } #[inline(always)] #[must_use] pub const fn from_v128x2(x: (V128, V128)) -> Self { unsafe { transmute([x.0, x.1]) } } #[inline(always)] #[must_use] pub const fn to_v128x2(self) -> (V128, V128) { let x: [V128; 2] = unsafe { transmute(self) }; (x[0], x[1]) } #[inline(always)] #[must_use] pub const fn double_bytes(bytes: [u8; 16]) -> Self { unsafe { transmute([bytes, bytes]) } } #[inline(always)] #[must_use] pub const fn x2(self) -> V512 { unsafe { transmute([self, self]) } } } impl V512 { #[inline(always)] #[must_use] pub const fn from_bytes(bytes: [u8; 64]) -> Self { unsafe { transmute(bytes) } } #[inline(always)] #[must_use] pub const fn as_bytes(&self) -> &[u8; 64] { unsafe { transmute(self) } } #[inline(always)] #[must_use] pub const fn from_v256x2(x: (V256, V256)) -> Self { unsafe { transmute([x.0, x.1]) } } #[inline(always)] #[must_use] pub const fn to_v256x2(self) -> (V256, V256) { let x: [V256; 2] = unsafe { transmute(self) }; (x[0], x[1]) } #[inline(always)] #[must_use] pub const fn double_bytes(bytes: [u8; 32]) -> Self { unsafe { transmute([bytes, bytes]) } } } vsimd-0.8.0/tests/it.rs000064400000000000000000000030341046102023000130770ustar 00000000000000use vsimd::isa::detect; use vsimd::isa::{NEON, SSE2, WASM128}; use vsimd::vector::V128; use vsimd::SIMD128; use const_str::hex; #[cfg(not(miri))] #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] fn native_sum() { use vsimd::native::Native; let x: u32 = rand::random::() / 2; let y: u32 = rand::random::() / 2; const N: usize = 100; let a = [x; N]; let b = [y; N]; let mut c = [0; N]; Native::detect().exec(|| { assert!(a.len() == N && b.len() == N && c.len() == N); for i in 0..N { c[i] = a[i] + b[i]; } }); assert!(c.iter().copied().all(|z| z == x + y)); } #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] fn u8x16_any_zero() { fn f(a: [u8; 16]) -> bool { let a = V128::from_bytes(a); if let Some(s) = detect::() { return s.u8x16_any_zero(a); } if let Some(s) = detect::() { return s.u8x16_any_zero(a); } if let Some(s) = detect::() { return s.u8x16_any_zero(a); } a.as_bytes().iter().any(|&x| x == 0) } fn 
test(a: [u8; 16], expected: bool) { assert_eq!(f(a), expected); } test([0x00; 16], true); test([0xff; 16], false); test(hex!("00 01 02 03 04 05 06 07 08 09 0A 0B 0C 0D 0E 0F"), true); test(hex!("10 01 02 03 04 05 06 07 08 09 0A 0B 0C 0D 0E 0F"), false); }