aho_corasick/packed/
vector.rs

// NOTE: The descriptions for each of the vector methods on the traits below
// are pretty inscrutable. For this reason, there are tests for every method
// for every trait impl below. If you're confused about what an op does,
// consult its test. (They probably should be doc tests, but I couldn't figure
// out how to write them in a non-annoying way.)

use core::{
    fmt::Debug,
    panic::{RefUnwindSafe, UnwindSafe},
};

/// A trait for describing vector operations used by vectorized searchers.
///
/// The trait is highly constrained to the low level vector operations needed
/// for the specific algorithms used in this crate. In general, it was
/// invented mostly to be generic over x86's __m128i and __m256i types. At the
/// time of writing, it also supports wasm and aarch64 128-bit vector types.
///
/// # Safety
///
/// All methods are unsafe since they are intended to be implemented using
/// vendor intrinsics, which are also not safe. Callers must ensure that
/// the appropriate target features are enabled in the calling function,
/// and that the current CPU supports them. All implementations should
/// avoid marking the routines with `#[target_feature]` and instead mark
/// them as `#[inline(always)]` to ensure they get appropriately inlined.
/// (`inline(always)` cannot be used with `target_feature`.)
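///
/// A sketch of the intended calling convention (the caller below is
/// hypothetical; the real searchers in this crate follow the same shape):
///
/// ```text
/// #[target_feature(enable = "ssse3")]
/// unsafe fn contains_a(chunk: &[u8]) -> bool {
///     // Sound only if the caller has verified at runtime that SSSE3 is
///     // available and that chunk.len() >= 16.
///     let v = __m128i::load_unaligned(chunk.as_ptr());
///     !v.cmpeq(__m128i::splat(b'a')).is_zero()
/// }
/// ```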
pub(crate) trait Vector:
    Copy + Debug + Send + Sync + UnwindSafe + RefUnwindSafe
{
    /// The number of bits in the vector.
    const BITS: usize;
    /// The number of bytes in the vector. That is, this is the size of the
    /// vector in memory.
    const BYTES: usize;

    /// Create a vector with 8-bit lanes with the given byte repeated into each
    /// lane.
    ///
    /// # Safety
    ///
    /// Callers must ensure that this is okay to call in the current target for
    /// the current CPU.
    unsafe fn splat(byte: u8) -> Self;

    /// Read a vector-size number of bytes from the given pointer. The pointer
    /// does not need to be aligned.
    ///
    /// # Safety
    ///
    /// Callers must ensure that this is okay to call in the current target for
    /// the current CPU.
    ///
    /// Callers must guarantee that at least `BYTES` bytes are readable from
    /// `data`.
    unsafe fn load_unaligned(data: *const u8) -> Self;

    /// Returns true if and only if this vector has zero in all of its lanes.
    ///
    /// # Safety
    ///
    /// Callers must ensure that this is okay to call in the current target for
    /// the current CPU.
    unsafe fn is_zero(self) -> bool;

    /// Do an 8-bit pairwise equality check. If lane `i` is equal in this
    /// vector and the one given, then lane `i` in the resulting vector is set
    /// to `0xFF`. Otherwise, it is set to `0x00`.
    ///
    /// # Safety
    ///
    /// Callers must ensure that this is okay to call in the current target for
    /// the current CPU.
    unsafe fn cmpeq(self, vector2: Self) -> Self;

    /// Perform a bitwise 'and' of this vector and the one given and return
    /// the result.
    ///
    /// # Safety
    ///
    /// Callers must ensure that this is okay to call in the current target for
    /// the current CPU.
    unsafe fn and(self, vector2: Self) -> Self;

    /// Perform a bitwise 'or' of this vector and the one given and return
    /// the result.
    ///
    /// # Safety
    ///
    /// Callers must ensure that this is okay to call in the current target for
    /// the current CPU.
    #[allow(dead_code)] // unused, but useful enough to keep around?
    unsafe fn or(self, vector2: Self) -> Self;

    /// Shift each 8-bit lane in this vector to the right by the number of
    /// bits indicated by the `BITS` const parameter.
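    ///
    /// For example, shifting a lane containing `0b1011` right by 2 yields a
    /// lane containing `0b0010`. (See the tests below.)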
    ///
    /// # Safety
    ///
    /// Callers must ensure that this is okay to call in the current target for
    /// the current CPU.
    unsafe fn shift_8bit_lane_right<const BITS: i32>(self) -> Self;

    /// Shift this vector to the left by one byte and shift the most
    /// significant byte of `vector2` into the least significant position of
    /// this vector.
    ///
    /// Stated differently, this behaves as if `self` and `vector2` were
    /// concatenated into a `2 * Self::BITS` temporary buffer and then shifted
    /// right by `Self::BYTES - 1` bytes.
    ///
    /// With respect to the Teddy algorithm, `vector2` is usually a previous
    /// `Self::BYTES` chunk from the haystack and `self` is the chunk
    /// immediately following it. This permits combining the last byte
    /// from the previous chunk (`vector2`) with the first `Self::BYTES - 1`
    /// bytes from the current chunk. This permits aligning the result of
    /// various shuffles so that they can be and-ed together and a possible
    /// candidate discovered.
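    ///
    /// For example, using 4-byte vectors for brevity: if `self` is
    /// `[e, f, g, h]` and `vector2` is `[a, b, c, d]`, then the result is
    /// `[d, e, f, g]`.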
    ///
    /// # Safety
    ///
    /// Callers must ensure that this is okay to call in the current target for
    /// the current CPU.
    unsafe fn shift_in_one_byte(self, vector2: Self) -> Self;

    /// Shift this vector to the left by two bytes and shift the two most
    /// significant bytes of `vector2` into the least significant position of
    /// this vector.
    ///
    /// Stated differently, this behaves as if `self` and `vector2` were
    /// concatenated into a `2 * Self::BITS` temporary buffer and then shifted
    /// right by `Self::BYTES - 2` bytes.
    ///
    /// With respect to the Teddy algorithm, `vector2` is usually a previous
    /// `Self::BYTES` chunk from the haystack and `self` is the chunk
    /// immediately following it. This permits combining the last two bytes
    /// from the previous chunk (`vector2`) with the first `Self::BYTES - 2`
    /// bytes from the current chunk. This permits aligning the result of
    /// various shuffles so that they can be and-ed together and a possible
    /// candidate discovered.
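    ///
    /// For example, using 4-byte vectors for brevity: if `self` is
    /// `[e, f, g, h]` and `vector2` is `[a, b, c, d]`, then the result is
    /// `[c, d, e, f]`.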
    ///
    /// # Safety
    ///
    /// Callers must ensure that this is okay to call in the current target for
    /// the current CPU.
    unsafe fn shift_in_two_bytes(self, vector2: Self) -> Self;

    /// Shift this vector to the left by three bytes and shift the three most
    /// significant bytes of `vector2` into the least significant position of
    /// this vector.
    ///
    /// Stated differently, this behaves as if `self` and `vector2` were
    /// concatenated into a `2 * Self::BITS` temporary buffer and then shifted
    /// right by `Self::BYTES - 3` bytes.
    ///
    /// With respect to the Teddy algorithm, `vector2` is usually a previous
    /// `Self::BYTES` chunk from the haystack and `self` is the chunk
    /// immediately following it. This permits combining the last three bytes
    /// from the previous chunk (`vector2`) with the first `Self::BYTES - 3`
    /// bytes from the current chunk. This permits aligning the result of
    /// various shuffles so that they can be and-ed together and a possible
    /// candidate discovered.
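    ///
    /// For example, using 4-byte vectors for brevity: if `self` is
    /// `[e, f, g, h]` and `vector2` is `[a, b, c, d]`, then the result is
    /// `[b, c, d, e]`.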
    ///
    /// # Safety
    ///
    /// Callers must ensure that this is okay to call in the current target for
    /// the current CPU.
    unsafe fn shift_in_three_bytes(self, vector2: Self) -> Self;

    /// Shuffles the bytes in this vector according to the indices in each of
    /// the corresponding lanes in `indices`.
    ///
    /// If `i` is the index of a lane, `A` is this vector, `B` is `indices`
    /// and `C` is the resulting vector, then `C[i] = A[B[i]]`.
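    ///
    /// For example, using 4 lanes for brevity: if `A = [10, 20, 30, 40]` and
    /// `B = [3, 0, 2, 2]`, then `C = [40, 10, 30, 30]`.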
    ///
    /// # Safety
    ///
    /// Callers must ensure that this is okay to call in the current target for
    /// the current CPU.
    unsafe fn shuffle_bytes(self, indices: Self) -> Self;

    /// Call the provided function for each 64-bit lane in this vector. The
    /// given function is provided the lane index and lane value as a `u64`.
    ///
    /// If `f` returns `Some`, then iteration over the lanes is stopped and the
    /// value is returned. Otherwise, this returns `None`.
    ///
    /// # Notes
    ///
    /// Conceptually it would be nice if we could have an
    /// `unpack64(self) -> [u64; BITS / 64]` method, but defining that is
    /// tricky given Rust's [current support for const generics][support].
    /// And even if we could, it would be tricky to write generic code over
    /// it. (Not impossible. We could introduce another layer that requires
    /// `AsRef<[u64]>` or something.)
    ///
    /// [support]: https://github.com/rust-lang/rust/issues/60551
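    ///
    /// # Example
    ///
    /// A sketch of typical usage (hypothetical, with `v` being any `Vector`):
    /// find the byte offset of the first non-zero byte in `v`.
    ///
    /// ```text
    /// let offset = v.for_each_64bit_lane(|i, lane| {
    ///     if lane != 0 {
    ///         // Little endian: the least significant byte of a lane is
    ///         // the byte at the smallest offset.
    ///         Some(i * 8 + (lane.trailing_zeros() as usize) / 8)
    ///     } else {
    ///         None
    ///     }
    /// });
    /// ```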
    ///
    /// # Safety
    ///
    /// Callers must ensure that this is okay to call in the current target for
    /// the current CPU.
    unsafe fn for_each_64bit_lane<T>(
        self,
        f: impl FnMut(usize, u64) -> Option<T>,
    ) -> Option<T>;
}

/// This trait extends the `Vector` trait with additional operations to support
/// Fat Teddy.
///
/// Fat Teddy uses 16 buckets instead of 8, but reads half as many bytes (as
/// the vector size) instead of the full size of a vector per iteration. For
/// example, when using a 256-bit vector, Slim Teddy reads 32 bytes at a time
/// but Fat Teddy reads 16 bytes at a time.
///
/// Fat Teddy is useful when searching for a large number of literals.
/// The extra number of buckets spreads the literals out more and reduces
/// verification time.
///
/// Currently we only implement this for AVX on x86_64. It would be nice to
/// implement this for SSE on x86_64 and NEON on aarch64, with the latter two
/// only reading 8 bytes at a time. It's not clear how well it would work,
/// and there are some tricky things to figure out in terms of implementation.
/// The `half_shift_in_{one,two,three}_bytes` methods in particular are
/// probably the trickiest of the bunch. For AVX2, these are implemented by
/// taking advantage of the fact that `_mm256_alignr_epi8` operates on each
/// 128-bit half instead of the full 256-bit vector. (Whereas
/// `_mm_alignr_epi8` operates on the full 128-bit vector and not on each
/// 64-bit half.) I didn't do a careful survey of NEON to see if it could
/// easily support these operations.
pub(crate) trait FatVector: Vector {
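    /// The type of a vector that is half the size (in bits) of `Self`.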
    type Half: Vector;

    /// Read a half-vector-size number of bytes from the given pointer, and
    /// broadcast it across both halves of a full vector. The pointer does not
    /// need to be aligned.
    ///
    /// # Safety
    ///
    /// Callers must ensure that this is okay to call in the current target for
    /// the current CPU.
    ///
    /// Callers must guarantee that at least `Self::Half::BYTES` bytes are
    /// readable from `data`.
    unsafe fn load_half_unaligned(data: *const u8) -> Self;

    /// Like `Vector::shift_in_one_byte`, except this is done for each half
    /// of the vector instead.
    ///
    /// # Safety
    ///
    /// Callers must ensure that this is okay to call in the current target for
    /// the current CPU.
    unsafe fn half_shift_in_one_byte(self, vector2: Self) -> Self;

    /// Like `Vector::shift_in_two_bytes`, except this is done for each half
    /// of the vector instead.
    ///
    /// # Safety
    ///
    /// Callers must ensure that this is okay to call in the current target for
    /// the current CPU.
    unsafe fn half_shift_in_two_bytes(self, vector2: Self) -> Self;

    /// Like `Vector::shift_in_three_bytes`, except this is done for each half
    /// of the vector instead.
    ///
    /// # Safety
    ///
    /// Callers must ensure that this is okay to call in the current target for
    /// the current CPU.
    unsafe fn half_shift_in_three_bytes(self, vector2: Self) -> Self;

    /// Swap the 128-bit lanes in this vector.
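    ///
    /// For example, if this vector contains `[1, ..., 16, 17, ..., 32]`, then
    /// the result contains `[17, ..., 32, 1, ..., 16]`.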
    ///
    /// # Safety
    ///
    /// Callers must ensure that this is okay to call in the current target for
    /// the current CPU.
    unsafe fn swap_halves(self) -> Self;

    /// Unpack and interleave the 8-bit lanes from the low 128 bits of each
    /// vector and return the result.
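    ///
    /// For example, on AVX2, interleaving `[1, 2, ..., 32]` with
    /// `[33, 34, ..., 64]` yields
    /// `[1, 33, 2, 34, ..., 8, 40, 17, 49, ..., 24, 56]`. (See the tests
    /// below.)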
    ///
    /// # Safety
    ///
    /// Callers must ensure that this is okay to call in the current target for
    /// the current CPU.
    unsafe fn interleave_low_8bit_lanes(self, vector2: Self) -> Self;

    /// Unpack and interleave the 8-bit lanes from the high 128 bits of each
    /// vector and return the result.
    ///
    /// # Safety
    ///
    /// Callers must ensure that this is okay to call in the current target for
    /// the current CPU.
    unsafe fn interleave_high_8bit_lanes(self, vector2: Self) -> Self;

    /// Call the provided function for each 64-bit lane in the lower half
    /// of this vector and then in the other vector. The given function is
    /// provided the lane index and lane value as a `u64`. (The high 128 bits
    /// of each vector are ignored.)
    ///
    /// If `f` returns `Some`, then iteration over the lanes is stopped and the
    /// value is returned. Otherwise, this returns `None`.
    ///
    /// # Safety
    ///
    /// Callers must ensure that this is okay to call in the current target for
    /// the current CPU.
    unsafe fn for_each_low_64bit_lane<T>(
        self,
        vector2: Self,
        f: impl FnMut(usize, u64) -> Option<T>,
    ) -> Option<T>;
}

#[cfg(all(target_arch = "x86_64", target_feature = "sse2"))]
mod x86_64_ssse3 {
    use core::arch::x86_64::*;

    use crate::util::int::{I32, I8};

    use super::Vector;

    impl Vector for __m128i {
        const BITS: usize = 128;
        const BYTES: usize = 16;

        #[inline(always)]
        unsafe fn splat(byte: u8) -> __m128i {
            _mm_set1_epi8(i8::from_bits(byte))
        }

        #[inline(always)]
        unsafe fn load_unaligned(data: *const u8) -> __m128i {
            _mm_loadu_si128(data.cast::<__m128i>())
        }

        #[inline(always)]
        unsafe fn is_zero(self) -> bool {
            let cmp = self.cmpeq(Self::splat(0));
            _mm_movemask_epi8(cmp).to_bits() == 0xFFFF
        }

        #[inline(always)]
        unsafe fn cmpeq(self, vector2: Self) -> __m128i {
            _mm_cmpeq_epi8(self, vector2)
        }

        #[inline(always)]
        unsafe fn and(self, vector2: Self) -> __m128i {
            _mm_and_si128(self, vector2)
        }

        #[inline(always)]
        unsafe fn or(self, vector2: Self) -> __m128i {
            _mm_or_si128(self, vector2)
        }

        #[inline(always)]
        unsafe fn shift_8bit_lane_right<const BITS: i32>(self) -> Self {
            // Apparently there is no _mm_srli_epi8, so we emulate it by
            // shifting 16-bit integers and masking out the high nybble of each
            // 8-bit lane (since that nybble will contain bits from the low
            // nybble of the previous lane).
            let lomask = Self::splat(0xF);
            _mm_srli_epi16(self, BITS).and(lomask)
        }

        #[inline(always)]
        unsafe fn shift_in_one_byte(self, vector2: Self) -> Self {
            _mm_alignr_epi8(self, vector2, 15)
        }

        #[inline(always)]
        unsafe fn shift_in_two_bytes(self, vector2: Self) -> Self {
            _mm_alignr_epi8(self, vector2, 14)
        }

        #[inline(always)]
        unsafe fn shift_in_three_bytes(self, vector2: Self) -> Self {
            _mm_alignr_epi8(self, vector2, 13)
        }

        #[inline(always)]
        unsafe fn shuffle_bytes(self, indices: Self) -> Self {
            _mm_shuffle_epi8(self, indices)
        }

        #[inline(always)]
        unsafe fn for_each_64bit_lane<T>(
            self,
            mut f: impl FnMut(usize, u64) -> Option<T>,
        ) -> Option<T> {
            // We could just use _mm_extract_epi64 here, but that requires
            // SSE 4.1. It isn't necessarily a problem to just require SSE 4.1,
            // but everything else works with SSSE3 so we stick to that subset.
            let lanes: [u64; 2] = core::mem::transmute(self);
            if let Some(t) = f(0, lanes[0]) {
                return Some(t);
            }
            if let Some(t) = f(1, lanes[1]) {
                return Some(t);
            }
            None
        }
    }
}

#[cfg(all(target_arch = "x86_64", target_feature = "sse2"))]
mod x86_64_avx2 {
    use core::arch::x86_64::*;

    use crate::util::int::{I32, I64, I8};

    use super::{FatVector, Vector};

    impl Vector for __m256i {
        const BITS: usize = 256;
        const BYTES: usize = 32;

        #[inline(always)]
        unsafe fn splat(byte: u8) -> __m256i {
            _mm256_set1_epi8(i8::from_bits(byte))
        }

        #[inline(always)]
        unsafe fn load_unaligned(data: *const u8) -> __m256i {
            _mm256_loadu_si256(data.cast::<__m256i>())
        }

        #[inline(always)]
        unsafe fn is_zero(self) -> bool {
            let cmp = self.cmpeq(Self::splat(0));
            _mm256_movemask_epi8(cmp).to_bits() == 0xFFFFFFFF
        }

        #[inline(always)]
        unsafe fn cmpeq(self, vector2: Self) -> __m256i {
            _mm256_cmpeq_epi8(self, vector2)
        }

        #[inline(always)]
        unsafe fn and(self, vector2: Self) -> __m256i {
            _mm256_and_si256(self, vector2)
        }

        #[inline(always)]
        unsafe fn or(self, vector2: Self) -> __m256i {
            _mm256_or_si256(self, vector2)
        }

        #[inline(always)]
        unsafe fn shift_8bit_lane_right<const BITS: i32>(self) -> Self {
            let lomask = Self::splat(0xF);
            _mm256_srli_epi16(self, BITS).and(lomask)
        }

        #[inline(always)]
        unsafe fn shift_in_one_byte(self, vector2: Self) -> Self {
            // Credit goes to jneem for figuring this out:
            // https://github.com/jneem/teddy/blob/9ab5e899ad6ef6911aecd3cf1033f1abe6e1f66c/src/x86/teddy_simd.rs#L145-L184
            //
            // TL;DR avx2's PALIGNR instruction is actually just two 128-bit
            // PALIGNR instructions, which is not what we want, so we need to
            // do some extra shuffling.
            let v = _mm256_permute2x128_si256(vector2, self, 0x21);
            _mm256_alignr_epi8(self, v, 15)
        }

        #[inline(always)]
        unsafe fn shift_in_two_bytes(self, vector2: Self) -> Self {
            // Credit goes to jneem for figuring this out:
            // https://github.com/jneem/teddy/blob/9ab5e899ad6ef6911aecd3cf1033f1abe6e1f66c/src/x86/teddy_simd.rs#L145-L184
            //
            // TL;DR avx2's PALIGNR instruction is actually just two 128-bit
            // PALIGNR instructions, which is not what we want, so we need to
            // do some extra shuffling.
            let v = _mm256_permute2x128_si256(vector2, self, 0x21);
            _mm256_alignr_epi8(self, v, 14)
        }

        #[inline(always)]
        unsafe fn shift_in_three_bytes(self, vector2: Self) -> Self {
            // Credit goes to jneem for figuring this out:
            // https://github.com/jneem/teddy/blob/9ab5e899ad6ef6911aecd3cf1033f1abe6e1f66c/src/x86/teddy_simd.rs#L145-L184
            //
            // TL;DR avx2's PALIGNR instruction is actually just two 128-bit
            // PALIGNR instructions, which is not what we want, so we need to
            // do some extra shuffling.
            let v = _mm256_permute2x128_si256(vector2, self, 0x21);
            _mm256_alignr_epi8(self, v, 13)
        }

        #[inline(always)]
        unsafe fn shuffle_bytes(self, indices: Self) -> Self {
            _mm256_shuffle_epi8(self, indices)
        }

        #[inline(always)]
        unsafe fn for_each_64bit_lane<T>(
            self,
            mut f: impl FnMut(usize, u64) -> Option<T>,
        ) -> Option<T> {
            // NOTE: At one point in the past, I used transmute to do this to
            // get a `[u64; 4]`, but it turned out to lead to worse codegen
            // IIRC. I've tried it more recently, and it looks like that's no
            // longer the case. But since there's no difference, we stick with
            // the slightly more complicated but transmute-free version.
            let lane = _mm256_extract_epi64(self, 0).to_bits();
            if let Some(t) = f(0, lane) {
                return Some(t);
            }
            let lane = _mm256_extract_epi64(self, 1).to_bits();
            if let Some(t) = f(1, lane) {
                return Some(t);
            }
            let lane = _mm256_extract_epi64(self, 2).to_bits();
            if let Some(t) = f(2, lane) {
                return Some(t);
            }
            let lane = _mm256_extract_epi64(self, 3).to_bits();
            if let Some(t) = f(3, lane) {
                return Some(t);
            }
            None
        }
    }

    impl FatVector for __m256i {
        type Half = __m128i;

        #[inline(always)]
        unsafe fn load_half_unaligned(data: *const u8) -> Self {
            let half = Self::Half::load_unaligned(data);
            _mm256_broadcastsi128_si256(half)
        }

        #[inline(always)]
        unsafe fn half_shift_in_one_byte(self, vector2: Self) -> Self {
            _mm256_alignr_epi8(self, vector2, 15)
        }

        #[inline(always)]
        unsafe fn half_shift_in_two_bytes(self, vector2: Self) -> Self {
            _mm256_alignr_epi8(self, vector2, 14)
        }

        #[inline(always)]
        unsafe fn half_shift_in_three_bytes(self, vector2: Self) -> Self {
            _mm256_alignr_epi8(self, vector2, 13)
        }

        #[inline(always)]
        unsafe fn swap_halves(self) -> Self {
            _mm256_permute4x64_epi64(self, 0x4E)
        }

        #[inline(always)]
        unsafe fn interleave_low_8bit_lanes(self, vector2: Self) -> Self {
            _mm256_unpacklo_epi8(self, vector2)
        }

        #[inline(always)]
        unsafe fn interleave_high_8bit_lanes(self, vector2: Self) -> Self {
            _mm256_unpackhi_epi8(self, vector2)
        }

        #[inline(always)]
        unsafe fn for_each_low_64bit_lane<T>(
            self,
            vector2: Self,
            mut f: impl FnMut(usize, u64) -> Option<T>,
        ) -> Option<T> {
            let lane = _mm256_extract_epi64(self, 0).to_bits();
            if let Some(t) = f(0, lane) {
                return Some(t);
            }
            let lane = _mm256_extract_epi64(self, 1).to_bits();
            if let Some(t) = f(1, lane) {
                return Some(t);
            }
            let lane = _mm256_extract_epi64(vector2, 0).to_bits();
            if let Some(t) = f(2, lane) {
                return Some(t);
            }
            let lane = _mm256_extract_epi64(vector2, 1).to_bits();
            if let Some(t) = f(3, lane) {
                return Some(t);
            }
            None
        }
    }
}

#[cfg(all(
    target_arch = "aarch64",
    target_feature = "neon",
    target_endian = "little"
))]
mod aarch64_neon {
    use core::arch::aarch64::*;

    use super::Vector;

    impl Vector for uint8x16_t {
        const BITS: usize = 128;
        const BYTES: usize = 16;

        #[inline(always)]
        unsafe fn splat(byte: u8) -> uint8x16_t {
            vdupq_n_u8(byte)
        }

        #[inline(always)]
        unsafe fn load_unaligned(data: *const u8) -> uint8x16_t {
            vld1q_u8(data)
        }

        #[inline(always)]
        unsafe fn is_zero(self) -> bool {
            // Could also use vmaxvq_u8.
            // ... I tried that and couldn't observe any meaningful difference
            // in benchmarks.
            let maxes = vreinterpretq_u64_u8(vpmaxq_u8(self, self));
            vgetq_lane_u64(maxes, 0) == 0
        }

        #[inline(always)]
        unsafe fn cmpeq(self, vector2: Self) -> uint8x16_t {
            vceqq_u8(self, vector2)
        }

        #[inline(always)]
        unsafe fn and(self, vector2: Self) -> uint8x16_t {
            vandq_u8(self, vector2)
        }

        #[inline(always)]
        unsafe fn or(self, vector2: Self) -> uint8x16_t {
            vorrq_u8(self, vector2)
        }

        #[inline(always)]
        unsafe fn shift_8bit_lane_right<const BITS: i32>(self) -> Self {
            debug_assert!(BITS <= 7);
            vshrq_n_u8(self, BITS)
        }

        #[inline(always)]
        unsafe fn shift_in_one_byte(self, vector2: Self) -> Self {
            vextq_u8(vector2, self, 15)
        }

        #[inline(always)]
        unsafe fn shift_in_two_bytes(self, vector2: Self) -> Self {
            vextq_u8(vector2, self, 14)
        }

        #[inline(always)]
        unsafe fn shift_in_three_bytes(self, vector2: Self) -> Self {
            vextq_u8(vector2, self, 13)
        }

        #[inline(always)]
        unsafe fn shuffle_bytes(self, indices: Self) -> Self {
            vqtbl1q_u8(self, indices)
        }

        #[inline(always)]
        unsafe fn for_each_64bit_lane<T>(
            self,
            mut f: impl FnMut(usize, u64) -> Option<T>,
        ) -> Option<T> {
            let this = vreinterpretq_u64_u8(self);
            let lane = vgetq_lane_u64(this, 0);
            if let Some(t) = f(0, lane) {
                return Some(t);
            }
            let lane = vgetq_lane_u64(this, 1);
            if let Some(t) = f(1, lane) {
                return Some(t);
            }
            None
        }
    }
}

#[cfg(all(test, target_arch = "x86_64", target_feature = "sse2"))]
mod tests_x86_64_ssse3 {
    use core::arch::x86_64::*;

    use crate::util::int::{I32, U32};

    use super::*;

    fn is_runnable() -> bool {
        std::is_x86_feature_detected!("ssse3")
    }

    #[target_feature(enable = "ssse3")]
    unsafe fn load(lanes: [u8; 16]) -> __m128i {
        __m128i::load_unaligned(&lanes as *const u8)
    }

    #[target_feature(enable = "ssse3")]
    unsafe fn unload(v: __m128i) -> [u8; 16] {
        [
            _mm_extract_epi8(v, 0).to_bits().low_u8(),
            _mm_extract_epi8(v, 1).to_bits().low_u8(),
            _mm_extract_epi8(v, 2).to_bits().low_u8(),
            _mm_extract_epi8(v, 3).to_bits().low_u8(),
            _mm_extract_epi8(v, 4).to_bits().low_u8(),
            _mm_extract_epi8(v, 5).to_bits().low_u8(),
            _mm_extract_epi8(v, 6).to_bits().low_u8(),
            _mm_extract_epi8(v, 7).to_bits().low_u8(),
            _mm_extract_epi8(v, 8).to_bits().low_u8(),
            _mm_extract_epi8(v, 9).to_bits().low_u8(),
            _mm_extract_epi8(v, 10).to_bits().low_u8(),
            _mm_extract_epi8(v, 11).to_bits().low_u8(),
            _mm_extract_epi8(v, 12).to_bits().low_u8(),
            _mm_extract_epi8(v, 13).to_bits().low_u8(),
            _mm_extract_epi8(v, 14).to_bits().low_u8(),
            _mm_extract_epi8(v, 15).to_bits().low_u8(),
        ]
    }

    #[test]
    fn vector_splat() {
        #[target_feature(enable = "ssse3")]
        unsafe fn test() {
            let v = __m128i::splat(0xAF);
            assert_eq!(
                unload(v),
                [
                    0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF,
                    0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF
                ]
            );
        }
        if !is_runnable() {
            return;
        }
        unsafe { test() }
    }

    #[test]
    fn vector_is_zero() {
        #[target_feature(enable = "ssse3")]
        unsafe fn test() {
            let v = load([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
            assert!(!v.is_zero());
            let v = load([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
            assert!(v.is_zero());
        }
        if !is_runnable() {
            return;
        }
        unsafe { test() }
    }

    #[test]
    fn vector_cmpeq() {
        #[target_feature(enable = "ssse3")]
        unsafe fn test() {
            let v1 =
                load([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1]);
            let v2 =
                load([16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1]);
            assert_eq!(
                unload(v1.cmpeq(v2)),
                [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0xFF]
            );
        }
        if !is_runnable() {
            return;
        }
        unsafe { test() }
    }

    #[test]
    fn vector_and() {
        #[target_feature(enable = "ssse3")]
        unsafe fn test() {
            let v1 =
                load([0, 0, 0, 0, 0, 0b1001, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
            let v2 =
                load([0, 0, 0, 0, 0, 0b1010, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
            assert_eq!(
                unload(v1.and(v2)),
                [0, 0, 0, 0, 0, 0b1000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
            );
        }
        if !is_runnable() {
            return;
        }
        unsafe { test() }
    }

    #[test]
    fn vector_or() {
        #[target_feature(enable = "ssse3")]
        unsafe fn test() {
            let v1 =
                load([0, 0, 0, 0, 0, 0b1001, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
            let v2 =
                load([0, 0, 0, 0, 0, 0b1010, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
            assert_eq!(
                unload(v1.or(v2)),
                [0, 0, 0, 0, 0, 0b1011, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
            );
        }
        if !is_runnable() {
            return;
        }
        unsafe { test() }
    }

    #[test]
    fn vector_shift_8bit_lane_right() {
        #[target_feature(enable = "ssse3")]
        unsafe fn test() {
            let v = load([
                0, 0, 0, 0, 0b1011, 0b0101, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            ]);
            assert_eq!(
                unload(v.shift_8bit_lane_right::<2>()),
                [0, 0, 0, 0, 0b0010, 0b0001, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
            );
        }
        if !is_runnable() {
            return;
        }
        unsafe { test() }
    }

    #[test]
    fn vector_shift_in_one_byte() {
        #[target_feature(enable = "ssse3")]
        unsafe fn test() {
            let v1 =
                load([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]);
            let v2 = load([
                17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
            ]);
            assert_eq!(
                unload(v1.shift_in_one_byte(v2)),
                [32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
            );
        }
        if !is_runnable() {
            return;
        }
        unsafe { test() }
    }

    #[test]
    fn vector_shift_in_two_bytes() {
        #[target_feature(enable = "ssse3")]
        unsafe fn test() {
            let v1 =
                load([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]);
            let v2 = load([
                17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
            ]);
            assert_eq!(
                unload(v1.shift_in_two_bytes(v2)),
                [31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14],
            );
        }
        if !is_runnable() {
            return;
        }
        unsafe { test() }
    }

    #[test]
    fn vector_shift_in_three_bytes() {
        #[target_feature(enable = "ssse3")]
        unsafe fn test() {
            let v1 =
                load([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]);
            let v2 = load([
                17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
            ]);
            assert_eq!(
                unload(v1.shift_in_three_bytes(v2)),
                [30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13],
            );
        }
        if !is_runnable() {
            return;
        }
        unsafe { test() }
    }

    #[test]
    fn vector_shuffle_bytes() {
        #[target_feature(enable = "ssse3")]
        unsafe fn test() {
            let v1 =
                load([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]);
            let v2 =
                load([0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12]);
            assert_eq!(
                unload(v1.shuffle_bytes(v2)),
                [1, 1, 1, 1, 5, 5, 5, 5, 9, 9, 9, 9, 13, 13, 13, 13],
            );
        }
        if !is_runnable() {
            return;
        }
        unsafe { test() }
    }

    #[test]
    fn vector_for_each_64bit_lane() {
        #[target_feature(enable = "ssse3")]
        unsafe fn test() {
            let v = load([
                0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A,
                0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x10,
            ]);
            let mut lanes = [0u64; 2];
            v.for_each_64bit_lane(|i, lane| {
                lanes[i] = lane;
                None::<()>
            });
            assert_eq!(lanes, [0x0807060504030201, 0x100F0E0D0C0B0A09],);
        }
        if !is_runnable() {
            return;
        }
        unsafe { test() }
    }
}

#[cfg(all(test, target_arch = "x86_64", target_feature = "sse2"))]
mod tests_x86_64_avx2 {
    use core::arch::x86_64::*;

    use crate::util::int::{I32, U32};

    use super::*;

    fn is_runnable() -> bool {
        std::is_x86_feature_detected!("avx2")
    }

    #[target_feature(enable = "avx2")]
    unsafe fn load(lanes: [u8; 32]) -> __m256i {
        __m256i::load_unaligned(&lanes as *const u8)
    }

    #[target_feature(enable = "avx2")]
    unsafe fn load_half(lanes: [u8; 16]) -> __m256i {
        __m256i::load_half_unaligned(&lanes as *const u8)
    }

    #[target_feature(enable = "avx2")]
    unsafe fn unload(v: __m256i) -> [u8; 32] {
        [
            _mm256_extract_epi8(v, 0).to_bits().low_u8(),
            _mm256_extract_epi8(v, 1).to_bits().low_u8(),
            _mm256_extract_epi8(v, 2).to_bits().low_u8(),
            _mm256_extract_epi8(v, 3).to_bits().low_u8(),
            _mm256_extract_epi8(v, 4).to_bits().low_u8(),
            _mm256_extract_epi8(v, 5).to_bits().low_u8(),
            _mm256_extract_epi8(v, 6).to_bits().low_u8(),
            _mm256_extract_epi8(v, 7).to_bits().low_u8(),
            _mm256_extract_epi8(v, 8).to_bits().low_u8(),
            _mm256_extract_epi8(v, 9).to_bits().low_u8(),
            _mm256_extract_epi8(v, 10).to_bits().low_u8(),
            _mm256_extract_epi8(v, 11).to_bits().low_u8(),
            _mm256_extract_epi8(v, 12).to_bits().low_u8(),
            _mm256_extract_epi8(v, 13).to_bits().low_u8(),
            _mm256_extract_epi8(v, 14).to_bits().low_u8(),
            _mm256_extract_epi8(v, 15).to_bits().low_u8(),
            _mm256_extract_epi8(v, 16).to_bits().low_u8(),
            _mm256_extract_epi8(v, 17).to_bits().low_u8(),
            _mm256_extract_epi8(v, 18).to_bits().low_u8(),
            _mm256_extract_epi8(v, 19).to_bits().low_u8(),
            _mm256_extract_epi8(v, 20).to_bits().low_u8(),
            _mm256_extract_epi8(v, 21).to_bits().low_u8(),
            _mm256_extract_epi8(v, 22).to_bits().low_u8(),
            _mm256_extract_epi8(v, 23).to_bits().low_u8(),
            _mm256_extract_epi8(v, 24).to_bits().low_u8(),
            _mm256_extract_epi8(v, 25).to_bits().low_u8(),
            _mm256_extract_epi8(v, 26).to_bits().low_u8(),
            _mm256_extract_epi8(v, 27).to_bits().low_u8(),
            _mm256_extract_epi8(v, 28).to_bits().low_u8(),
            _mm256_extract_epi8(v, 29).to_bits().low_u8(),
            _mm256_extract_epi8(v, 30).to_bits().low_u8(),
            _mm256_extract_epi8(v, 31).to_bits().low_u8(),
        ]
    }

    #[test]
    fn vector_splat() {
        #[target_feature(enable = "avx2")]
        unsafe fn test() {
            let v = __m256i::splat(0xAF);
            assert_eq!(
                unload(v),
                [
                    0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF,
                    0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF,
                    0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF,
                    0xAF, 0xAF, 0xAF, 0xAF, 0xAF,
                ]
            );
        }
        if !is_runnable() {
            return;
        }
        unsafe { test() }
    }

    #[test]
    fn vector_is_zero() {
        #[target_feature(enable = "avx2")]
        unsafe fn test() {
            let v = load([
                0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            ]);
            assert!(!v.is_zero());
            let v = load([
                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            ]);
            assert!(v.is_zero());
        }
        if !is_runnable() {
            return;
        }
        unsafe { test() }
    }

    #[test]
    fn vector_cmpeq() {
        #[target_feature(enable = "avx2")]
        unsafe fn test() {
            let v1 = load([
                1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
                19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 1,
            ]);
            let v2 = load([
                32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18,
                17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1,
            ]);
            assert_eq!(
                unload(v1.cmpeq(v2)),
                [
                    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0xFF
                ]
            );
        }
        if !is_runnable() {
            return;
        }
        unsafe { test() }
    }

    #[test]
    fn vector_and() {
        #[target_feature(enable = "avx2")]
        unsafe fn test() {
            let v1 = load([
                0, 0, 0, 0, 0, 0b1001, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            ]);
            let v2 = load([
                0, 0, 0, 0, 0, 0b1010, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            ]);
            assert_eq!(
                unload(v1.and(v2)),
                [
                    0, 0, 0, 0, 0, 0b1000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                ]
            );
        }
        if !is_runnable() {
            return;
        }
        unsafe { test() }
    }

    #[test]
    fn vector_or() {
        #[target_feature(enable = "avx2")]
        unsafe fn test() {
            let v1 = load([
                0, 0, 0, 0, 0, 0b1001, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            ]);
            let v2 = load([
                0, 0, 0, 0, 0, 0b1010, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            ]);
            assert_eq!(
                unload(v1.or(v2)),
                [
                    0, 0, 0, 0, 0, 0b1011, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                ]
            );
        }
        if !is_runnable() {
            return;
        }
        unsafe { test() }
    }

    #[test]
    fn vector_shift_8bit_lane_right() {
        #[target_feature(enable = "avx2")]
        unsafe fn test() {
            let v = load([
                0, 0, 0, 0, 0b1011, 0b0101, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            ]);
            assert_eq!(
                unload(v.shift_8bit_lane_right::<2>()),
                [
                    0, 0, 0, 0, 0b0010, 0b0001, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                ]
            );
        }
        if !is_runnable() {
            return;
        }
        unsafe { test() }
    }

    #[test]
    fn vector_shift_in_one_byte() {
        #[target_feature(enable = "avx2")]
        unsafe fn test() {
            let v1 = load([
                1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
                19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
            ]);
            let v2 = load([
                33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
                48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62,
                63, 64,
            ]);
            assert_eq!(
                unload(v1.shift_in_one_byte(v2)),
                [
                    64, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
                    17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30,
                    31,
                ],
            );
        }
        if !is_runnable() {
            return;
        }
        unsafe { test() }
    }

    #[test]
    fn vector_shift_in_two_bytes() {
        #[target_feature(enable = "avx2")]
        unsafe fn test() {
            let v1 = load([
                1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
                19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
            ]);
            let v2 = load([
                33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
                48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62,
                63, 64,
            ]);
            assert_eq!(
                unload(v1.shift_in_two_bytes(v2)),
                [
                    63, 64, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
                    16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
                    30,
                ],
            );
        }
        if !is_runnable() {
            return;
        }
        unsafe { test() }
    }

    #[test]
    fn vector_shift_in_three_bytes() {
        #[target_feature(enable = "avx2")]
        unsafe fn test() {
            let v1 = load([
                1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
                19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
            ]);
            let v2 = load([
                33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
                48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62,
                63, 64,
            ]);
            assert_eq!(
                unload(v1.shift_in_three_bytes(v2)),
                [
                    62, 63, 64, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
                    15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28,
                    29,
                ],
            );
        }
        if !is_runnable() {
            return;
        }
        unsafe { test() }
    }

    #[test]
    fn vector_shuffle_bytes() {
        #[target_feature(enable = "avx2")]
        unsafe fn test() {
            let v1 = load([
                1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
                19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
            ]);
            let v2 = load([
                0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12, 16, 16,
                16, 16, 20, 20, 20, 20, 24, 24, 24, 24, 28, 28, 28, 28,
            ]);
            assert_eq!(
                unload(v1.shuffle_bytes(v2)),
                [
                    1, 1, 1, 1, 5, 5, 5, 5, 9, 9, 9, 9, 13, 13, 13, 13, 17,
                    17, 17, 17, 21, 21, 21, 21, 25, 25, 25, 25, 29, 29, 29,
                    29
                ],
            );
        }
        if !is_runnable() {
            return;
        }
        unsafe { test() }
    }

    #[test]
    fn vector_for_each_64bit_lane() {
        #[target_feature(enable = "avx2")]
        unsafe fn test() {
            let v = load([
                0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A,
                0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x10, 0x11, 0x12, 0x13, 0x14,
                0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E,
                0x1F, 0x20,
            ]);
            let mut lanes = [0u64; 4];
            v.for_each_64bit_lane(|i, lane| {
                lanes[i] = lane;
                None::<()>
            });
            assert_eq!(
                lanes,
                [
                    0x0807060504030201,
                    0x100F0E0D0C0B0A09,
                    0x1817161514131211,
                    0x201F1E1D1C1B1A19
                ]
            );
        }
        if !is_runnable() {
            return;
        }
        unsafe { test() }
    }

    #[test]
    fn fat_vector_half_shift_in_one_byte() {
        #[target_feature(enable = "avx2")]
        unsafe fn test() {
            let v1 = load_half([
                1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
            ]);
            let v2 = load_half([
                17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
            ]);
            assert_eq!(
                unload(v1.half_shift_in_one_byte(v2)),
                [
                    32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32,
                    1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
                ],
            );
        }
        if !is_runnable() {
            return;
        }
        unsafe { test() }
    }

    #[test]
    fn fat_vector_half_shift_in_two_bytes() {
        #[target_feature(enable = "avx2")]
        unsafe fn test() {
            let v1 = load_half([
                1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
            ]);
            let v2 = load_half([
                17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
            ]);
            assert_eq!(
                unload(v1.half_shift_in_two_bytes(v2)),
                [
                    31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 31,
                    32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
                ],
            );
        }
        if !is_runnable() {
            return;
        }
        unsafe { test() }
    }

    #[test]
    fn fat_vector_half_shift_in_three_bytes() {
        #[target_feature(enable = "avx2")]
        unsafe fn test() {
            let v1 = load_half([
                1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
            ]);
            let v2 = load_half([
                17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
            ]);
            assert_eq!(
                unload(v1.half_shift_in_three_bytes(v2)),
                [
                    30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 30,
                    31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
                ],
            );
        }
        if !is_runnable() {
            return;
        }
        unsafe { test() }
    }

    #[test]
    fn fat_vector_swap_halves() {
        #[target_feature(enable = "avx2")]
        unsafe fn test() {
            let v = load([
                1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
                19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
            ]);
            assert_eq!(
                unload(v.swap_halves()),
                [
                    17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30,
                    31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
                    16,
                ],
            );
        }
        if !is_runnable() {
            return;
        }
        unsafe { test() }
    }

    #[test]
    fn fat_vector_interleave_low_8bit_lanes() {
        #[target_feature(enable = "avx2")]
        unsafe fn test() {
            let v1 = load([
                1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
                19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
            ]);
            let v2 = load([
                33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
                48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62,
                63, 64,
            ]);
            assert_eq!(
                unload(v1.interleave_low_8bit_lanes(v2)),
                [
                    1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39, 8, 40,
                    17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55,
                    24, 56,
                ],
            );
        }
        if !is_runnable() {
            return;
        }
        unsafe { test() }
    }

    #[test]
    fn fat_vector_interleave_high_8bit_lanes() {
        #[target_feature(enable = "avx2")]
        unsafe fn test() {
            let v1 = load([
                1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
                19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
            ]);
            let v2 = load([
                33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
                48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62,
                63, 64,
            ]);
            assert_eq!(
                unload(v1.interleave_high_8bit_lanes(v2)),
                [
                    9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47, 16,
                    48, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31,
                    63, 32, 64,
                ],
            );
        }
        if !is_runnable() {
            return;
        }
        unsafe { test() }
    }

    #[test]
    fn fat_vector_for_each_low_64bit_lane() {
        #[target_feature(enable = "avx2")]
        unsafe fn test() {
            let v1 = load([
                0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A,
                0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x10, 0x11, 0x12, 0x13, 0x14,
                0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E,
                0x1F, 0x20,
            ]);
            let v2 = load([
                0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2A,
                0x2B, 0x2C, 0x2D, 0x2E, 0x2F, 0x30, 0x31, 0x32, 0x33, 0x34,
                0x35, 0x36, 0x37, 0x38, 0x39, 0x3A, 0x3B, 0x3C, 0x3D, 0x3E,
                0x3F, 0x40,
            ]);
            let mut lanes = [0u64; 4];
            v1.for_each_low_64bit_lane(v2, |i, lane| {
                lanes[i] = lane;
                None::<()>
            });
            assert_eq!(
                lanes,
                [
                    0x0807060504030201,
                    0x100F0E0D0C0B0A09,
                    0x2827262524232221,
                    0x302F2E2D2C2B2A29
                ]
            );
        }
        if !is_runnable() {
            return;
        }
        unsafe { test() }
    }
}

#[cfg(all(test, target_arch = "aarch64", target_feature = "neon"))]
mod tests_aarch64_neon {
    use core::arch::aarch64::*;

    use super::*;

    #[target_feature(enable = "neon")]
    unsafe fn load(lanes: [u8; 16]) -> uint8x16_t {
        uint8x16_t::load_unaligned(&lanes as *const u8)
    }

    #[target_feature(enable = "neon")]
    unsafe fn unload(v: uint8x16_t) -> [u8; 16] {
        [
            vgetq_lane_u8(v, 0),
            vgetq_lane_u8(v, 1),
            vgetq_lane_u8(v, 2),
            vgetq_lane_u8(v, 3),
            vgetq_lane_u8(v, 4),
            vgetq_lane_u8(v, 5),
            vgetq_lane_u8(v, 6),
            vgetq_lane_u8(v, 7),
            vgetq_lane_u8(v, 8),
            vgetq_lane_u8(v, 9),
            vgetq_lane_u8(v, 10),
            vgetq_lane_u8(v, 11),
            vgetq_lane_u8(v, 12),
            vgetq_lane_u8(v, 13),
            vgetq_lane_u8(v, 14),
            vgetq_lane_u8(v, 15),
        ]
    }

    // Example functions. These don't test the Vector traits, but rather
    // specific NEON instructions. They are basically little experiments I
    // wrote to figure out what an instruction does since their descriptions
    // are so dense. I decided to keep the experiments around as example tests
    // in case they're useful.

    #[test]
    fn example_vmaxvq_u8_non_zero() {
        #[target_feature(enable = "neon")]
        unsafe fn example() {
            let v = load([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
            assert_eq!(vmaxvq_u8(v), 1);
        }
        unsafe { example() }
    }

    #[test]
    fn example_vmaxvq_u8_zero() {
        #[target_feature(enable = "neon")]
        unsafe fn example() {
            let v = load([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
            assert_eq!(vmaxvq_u8(v), 0);
        }
        unsafe { example() }
    }
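
    // The two experiments above suggest why a horizontal max is relevant to
    // `is_zero`: a vector is all zeros if and only if its maximum lane is
    // zero.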

    #[test]
    fn example_vpmaxq_u8_non_zero() {
        #[target_feature(enable = "neon")]
        unsafe fn example() {
            let v = load([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
            let r = vpmaxq_u8(v, v);
            assert_eq!(
                unload(r),
                [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0]
            );
        }
        unsafe { example() }
    }

    #[test]
    fn example_vpmaxq_u8_self() {
        #[target_feature(enable = "neon")]
        unsafe fn example() {
            let v =
                load([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]);
            let r = vpmaxq_u8(v, v);
            assert_eq!(
                unload(r),
                [2, 4, 6, 8, 10, 12, 14, 16, 2, 4, 6, 8, 10, 12, 14, 16]
            );
        }
        unsafe { example() }
    }

    #[test]
    fn example_vpmaxq_u8_other() {
        #[target_feature(enable = "neon")]
        unsafe fn example() {
            let v1 =
                load([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]);
            let v2 = load([
                17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
            ]);
            let r = vpmaxq_u8(v1, v2);
            assert_eq!(
                unload(r),
                [2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32]
            );
        }
        unsafe { example() }
    }
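
    // An extra experiment in the same spirit, added during editing (it was
    // not part of the original suite): vextq_u8 concatenates two vectors and
    // extracts 16 contiguous bytes starting at a constant offset, with the
    // first operand supplying the low bytes. It is the kind of primitive one
    // would presumably reach for to implement the shift_in_* methods tested
    // further down.
    #[test]
    fn example_vextq_u8() {
        #[target_feature(enable = "neon")]
        unsafe fn example() {
            let v1 =
                load([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]);
            let v2 = load([
                17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
            ]);
            // Offset 3: bytes 3..16 of v1 followed by bytes 0..3 of v2.
            let r = vextq_u8(v1, v2, 3);
            assert_eq!(
                unload(r),
                [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]
            );
        }
        unsafe { example() }
    }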

    // Now we test the actual methods on the Vector trait.

    #[test]
    fn vector_splat() {
        #[target_feature(enable = "neon")]
        unsafe fn test() {
            let v = uint8x16_t::splat(0xAF);
            assert_eq!(
                unload(v),
                [
                    0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF,
                    0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF
                ]
            );
        }
        unsafe { test() }
    }

    #[test]
    fn vector_is_zero() {
        #[target_feature(enable = "neon")]
        unsafe fn test() {
            let v = load([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
            assert!(!v.is_zero());
            let v = load([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
            assert!(v.is_zero());
        }
        unsafe { test() }
    }

    #[test]
    fn vector_cmpeq() {
        #[target_feature(enable = "neon")]
        unsafe fn test() {
            let v1 =
                load([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1]);
            let v2 =
                load([16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1]);
            assert_eq!(
                unload(v1.cmpeq(v2)),
                [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0xFF]
            );
        }
        unsafe { test() }
    }

    #[test]
    fn vector_and() {
        #[target_feature(enable = "neon")]
        unsafe fn test() {
            let v1 =
                load([0, 0, 0, 0, 0, 0b1001, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
            let v2 =
                load([0, 0, 0, 0, 0, 0b1010, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
            assert_eq!(
                unload(v1.and(v2)),
                [0, 0, 0, 0, 0, 0b1000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
            );
        }
        unsafe { test() }
    }

    #[test]
    fn vector_or() {
        #[target_feature(enable = "neon")]
        unsafe fn test() {
            let v1 =
                load([0, 0, 0, 0, 0, 0b1001, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
            let v2 =
                load([0, 0, 0, 0, 0, 0b1010, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
            assert_eq!(
                unload(v1.or(v2)),
                [0, 0, 0, 0, 0, 0b1011, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
            );
        }
        unsafe { test() }
    }

    #[test]
    fn vector_shift_8bit_lane_right() {
        #[target_feature(enable = "neon")]
        unsafe fn test() {
            let v = load([
                0, 0, 0, 0, 0b1011, 0b0101, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            ]);
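            // Each lane is shifted logically: 0b1011 >> 2 == 0b0010 and
            // 0b0101 >> 2 == 0b0001, with zeroes shifted in from the left.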
            assert_eq!(
                unload(v.shift_8bit_lane_right::<2>()),
                [0, 0, 0, 0, 0b0010, 0b0001, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
            );
        }
        unsafe { test() }
    }

    #[test]
    fn vector_shift_in_one_byte() {
        #[target_feature(enable = "neon")]
        unsafe fn test() {
            let v1 =
                load([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]);
            let v2 = load([
                17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
            ]);
            assert_eq!(
                unload(v1.shift_in_one_byte(v2)),
                [32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
            );
        }
        unsafe { test() }
    }

    #[test]
    fn vector_shift_in_two_bytes() {
        #[target_feature(enable = "neon")]
        unsafe fn test() {
            let v1 =
                load([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]);
            let v2 = load([
                17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
            ]);
            assert_eq!(
                unload(v1.shift_in_two_bytes(v2)),
                [31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14],
            );
        }
        unsafe { test() }
    }

    #[test]
    fn vector_shift_in_three_bytes() {
        #[target_feature(enable = "neon")]
        unsafe fn test() {
            let v1 =
                load([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]);
            let v2 = load([
                17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
            ]);
            assert_eq!(
                unload(v1.shift_in_three_bytes(v2)),
                [30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13],
            );
        }
        unsafe { test() }
    }
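
    // A follow-up to example_vextq_u8 above (again added during editing, not
    // part of the original suite): the shift_in_* results match what a
    // single vextq_u8 over (v2, v1) produces, which is presumably how they
    // can be implemented. We only assert that the observable behavior
    // agrees, using shift_in_two_bytes as the representative case.
    #[test]
    fn example_vextq_u8_matches_shift_in_two_bytes() {
        #[target_feature(enable = "neon")]
        unsafe fn example() {
            let v1 =
                load([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]);
            let v2 = load([
                17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
            ]);
            // Last two bytes of v2, then the first fourteen bytes of v1.
            assert_eq!(
                unload(vextq_u8(v2, v1, 14)),
                unload(v1.shift_in_two_bytes(v2)),
            );
        }
        unsafe { example() }
    }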

    #[test]
    fn vector_shuffle_bytes() {
        #[target_feature(enable = "neon")]
        unsafe fn test() {
            let v1 =
                load([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]);
            let v2 =
                load([0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12]);
            assert_eq!(
                unload(v1.shuffle_bytes(v2)),
                [1, 1, 1, 1, 5, 5, 5, 5, 9, 9, 9, 9, 13, 13, 13, 13],
            );
        }
        unsafe { test() }
    }
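
    // One more experiment added during editing (not in the original suite):
    // shuffle_bytes behaves like NEON's single-table byte lookup,
    // vqtbl1q_u8, where each index selects a byte from the table and
    // out-of-range indices yield zero. This only demonstrates the
    // instruction; the trait impl may differ.
    #[test]
    fn example_vqtbl1q_u8() {
        #[target_feature(enable = "neon")]
        unsafe fn example() {
            let table =
                load([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]);
            let idx = load([
                0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 255, 255, 255, 255,
            ]);
            assert_eq!(
                unload(vqtbl1q_u8(table, idx)),
                [1, 1, 1, 1, 5, 5, 5, 5, 9, 9, 9, 9, 0, 0, 0, 0],
            );
        }
        unsafe { example() }
    }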

    #[test]
    fn vector_for_each_64bit_lane() {
        #[target_feature(enable = "neon")]
        unsafe fn test() {
            let v = load([
                0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A,
                0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x10,
            ]);
            let mut lanes = [0u64; 2];
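            // As in the fat vector test above, each u64 lane is assembled
            // little-endian from its eight bytes.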
            v.for_each_64bit_lane(|i, lane| {
                lanes[i] = lane;
                None::<()>
            });
            assert_eq!(lanes, [0x0807060504030201, 0x100F0E0D0C0B0A09]);
        }
        unsafe { test() }
    }
}