memchr/arch/generic/memchr.rs

/*!
Generic crate-internal routines for the `memchr` family of functions.
*/

// What follows is a vector algorithm generic over the specific vector
// type to detect the position of one, two or three needles in a haystack.
// From what I know, this is a "classic" algorithm, although I don't
// believe it has been published in any peer reviewed journal. I believe
// it can be found in places like glibc and Go's standard library. It
// appears to be well known and is elaborated on in more detail here:
// https://gms.tf/stdfind-and-memchr-optimizations.html
//
// While the routine below is fairly long and perhaps intimidating, the basic
// idea is actually very simple and can be expressed straightforwardly in
// pseudo code. The pseudo code below is written for 128 bit vectors, but the
// actual code below works for anything that implements the Vector trait.
//
//     needle = (n1 << 15) | (n1 << 14) | ... | (n1 << 1) | n1
//     // Note: shift amount is in bytes
//
//     while i <= haystack.len() - 16:
//       // A 16 byte vector. Each byte in chunk corresponds to a byte in
//       // the haystack.
//       chunk = haystack[i:i+16]
//       // Compare bytes in needle with bytes in chunk. The result is a 16
//       // byte chunk where each byte is 0xFF if the corresponding bytes
//       // in needle and chunk were equal, or 0x00 otherwise.
//       eqs = cmpeq(needle, chunk)
//       // Return a 32 bit integer where the most significant 16 bits
//       // are always 0 and the lower 16 bits correspond to whether the
//       // most significant bit in the corresponding byte in `eqs` is set.
//       // In other words, `mask as u16` has bit i set if and only if
//       // needle[i] == chunk[i].
//       mask = movemask(eqs)
//
//       // Mask is 0 if there is no match, and non-zero otherwise.
//       if mask != 0:
//         // trailing_zeros tells us the position of the least significant
//         // bit that is set.
//         return i + trailing_zeros(mask)
//
//     // haystack length may not be a multiple of 16, so search the rest.
//     while i < haystack.len():
//       if haystack[i] == n1:
//         return i
//
//     // No match found.
//     return NULL
//
// In fact, we could loosely translate the above code to Rust line-for-line
// (see the sketch at the end of this comment) and it would be a pretty fast
// algorithm. But, we pull out all the stops to go as fast as possible:
//
// 1. We use aligned loads. That is, we do some finagling to make sure our
//    primary loop not only proceeds in increments of 16 bytes, but that
//    the address of haystack's pointer that we dereference is aligned to
//    16 bytes. 16 is a magic number here because it is the size of an SSE2
//    128-bit vector. (For the AVX2 algorithm, 32 is the magic number.)
//    Therefore, to get aligned loads, our pointer's address must be evenly
//    divisible by 16.
// 2. Our primary loop proceeds 64 bytes at a time instead of 16. It's
//    kind of like loop unrolling, but we combine the equality comparisons
//    using a vector OR such that we only need to extract a single mask to
//    determine whether a match exists or not. If so, then we do some
//    book-keeping to determine the precise location but otherwise mush on.
// 3. We use our "chunk" comparison routine in as many places as possible,
//    even if it means using unaligned loads. In particular, if haystack
//    starts with an unaligned address, then we do an unaligned load to
//    search the first 16 bytes. We then start our primary loop at the
//    smallest subsequent aligned address, which will actually overlap with
//    previously searched bytes. But we're OK with that. We do a similar
//    dance at the end of our primary loop. Finally, to avoid a
//    byte-at-a-time loop at the end, we do a final 16 byte unaligned load
//    that may overlap with a previous load. This is OK because it converts
//    a loop into a small number of very fast vector instructions. The overlap
//    is OK because we know the place where the overlap occurs does not
//    contain a match.
//
// And that's pretty much all there is to it. Note that since the below is
// generic and since it's meant to be inlined into routines with a
// `#[target_feature(enable = "...")]` annotation, we must mark all routines as
// both unsafe and `#[inline(always)]`.
//
// The fact that the code below is generic does somewhat inhibit us. For
// example, I've noticed that introducing an uninlineable `#[cold]` function to
// handle the match case in the loop generates tighter assembly, but there is
// no way to do this in the generic code below because the generic code doesn't
// know what `target_feature` annotation to apply to the uninlineable function.
// We could make such functions part of the `Vector` trait, but we instead live
// with the slightly sub-optimal codegen for now since it doesn't seem to have
// a noticeable perf difference.
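//
// As a rough illustration of that "line-for-line" translation (a sketch only,
// not part of this module; `find_byte_basic` is a hypothetical name and the
// x86-64 SSE2 intrinsics are just one concrete instantiation of the `Vector`
// operations described above):
//
//     #[cfg(target_arch = "x86_64")]
//     unsafe fn find_byte_basic(n1: u8, haystack: &[u8]) -> Option<usize> {
//         use core::arch::x86_64::*;
//
//         let needle = _mm_set1_epi8(n1 as i8);
//         let mut i = 0;
//         while i + 16 <= haystack.len() {
//             // Unaligned 16 byte load of the current chunk.
//             let chunk = _mm_loadu_si128(haystack.as_ptr().add(i).cast());
//             let eqs = _mm_cmpeq_epi8(needle, chunk);
//             let mask = _mm_movemask_epi8(eqs) as u32;
//             if mask != 0 {
//                 return Some(i + mask.trailing_zeros() as usize);
//             }
//             i += 16;
//         }
//         // The haystack length may not be a multiple of 16, so finish the
//         // remainder a byte at a time.
//         haystack[i..].iter().position(|&b| b == n1).map(|j| i + j)
//     }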

use crate::{
    ext::Pointer,
    vector::{MoveMask, Vector},
};

/// Finds all occurrences of a single byte in a haystack.
#[derive(Clone, Copy, Debug)]
pub(crate) struct One<V> {
    s1: u8,
    v1: V,
}

impl<V: Vector> One<V> {
    /// The number of bytes we examine per each iteration of our search loop.
    const LOOP_SIZE: usize = 4 * V::BYTES;

    /// Create a new searcher that finds occurrences of the byte given.
    #[inline(always)]
    pub(crate) unsafe fn new(needle: u8) -> One<V> {
        One { s1: needle, v1: V::splat(needle) }
    }

    /// Returns the needle given to `One::new`.
    #[inline(always)]
    pub(crate) fn needle1(&self) -> u8 {
        self.s1
    }

    /// Return a pointer to the first occurrence of the needle in the given
    /// haystack. If no such occurrence exists, then `None` is returned.
    ///
    /// When a match is found, the pointer returned is guaranteed to be
    /// `>= start` and `< end`.
    ///
    /// # Safety
    ///
    /// * It must be the case that `start < end` and that the distance between
    /// them is at least equal to `V::BYTES`. That is, it must always be valid
    /// to do at least an unaligned load of `V` at `start`.
    /// * Both `start` and `end` must be valid for reads.
    /// * Both `start` and `end` must point to an initialized value.
    /// * Both `start` and `end` must point to the same allocated object and
    /// must either be in bounds or at most one byte past the end of the
    /// allocated object.
    /// * Both `start` and `end` must be _derived from_ a pointer to the same
    /// object.
    /// * The distance between `start` and `end` must not overflow `isize`.
    /// * The distance being in bounds must not rely on "wrapping around" the
    /// address space.
    #[inline(always)]
    pub(crate) unsafe fn find_raw(
        &self,
        start: *const u8,
        end: *const u8,
    ) -> Option<*const u8> {
        // If we want to support vectors bigger than 256 bits, we probably
        // need to move up to using a u64 for the masks used below. Currently
        // they are 32 bits, which means we're SOL for vectors that need masks
        // bigger than 32 bits. Overall unclear until there's a use case.
        debug_assert!(V::BYTES <= 32, "vector cannot be bigger than 32 bytes");

        let topos = V::Mask::first_offset;
        let len = end.distance(start);
        debug_assert!(
            len >= V::BYTES,
            "haystack has length {}, but must be at least {}",
            len,
            V::BYTES
        );

        // Search a possibly unaligned chunk at `start`. This covers any part
        // of the haystack prior to where aligned loads can start.
        if let Some(cur) = self.search_chunk(start, topos) {
            return Some(cur);
        }
        // Set `cur` to the first V-aligned pointer greater than `start`.
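        // As a worked example (assuming `V::ALIGN` is the low-bit mask
        // `V::BYTES - 1`, which is how it is used here): with `V::BYTES == 16`
        // and `start` at address 0x1003, `start.as_usize() & V::ALIGN` is 3,
        // so `cur` becomes 0x1003 + 13 = 0x1010, the next 16 byte boundary.
        // If `start` is already aligned, `cur` advances a full `V::BYTES`,
        // which is fine because the chunk at `start` was just searched above.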
        let mut cur = start.add(V::BYTES - (start.as_usize() & V::ALIGN));
        debug_assert!(cur > start && end.sub(V::BYTES) >= start);
        if len >= Self::LOOP_SIZE {
            while cur <= end.sub(Self::LOOP_SIZE) {
                debug_assert_eq!(0, cur.as_usize() % V::BYTES);

                let a = V::load_aligned(cur);
                let b = V::load_aligned(cur.add(1 * V::BYTES));
                let c = V::load_aligned(cur.add(2 * V::BYTES));
                let d = V::load_aligned(cur.add(3 * V::BYTES));
                let eqa = self.v1.cmpeq(a);
                let eqb = self.v1.cmpeq(b);
                let eqc = self.v1.cmpeq(c);
                let eqd = self.v1.cmpeq(d);
                let or1 = eqa.or(eqb);
                let or2 = eqc.or(eqd);
                let or3 = or1.or(or2);
                if or3.movemask_will_have_non_zero() {
                    let mask = eqa.movemask();
                    if mask.has_non_zero() {
                        return Some(cur.add(topos(mask)));
                    }

                    let mask = eqb.movemask();
                    if mask.has_non_zero() {
                        return Some(cur.add(1 * V::BYTES).add(topos(mask)));
                    }

                    let mask = eqc.movemask();
                    if mask.has_non_zero() {
                        return Some(cur.add(2 * V::BYTES).add(topos(mask)));
                    }

                    let mask = eqd.movemask();
                    debug_assert!(mask.has_non_zero());
                    return Some(cur.add(3 * V::BYTES).add(topos(mask)));
                }
                cur = cur.add(Self::LOOP_SIZE);
            }
        }
        // Handle any leftovers after the aligned loop above. We use unaligned
        // loads here, but I believe we are guaranteed that they are aligned
        // since `cur` is aligned.
        while cur <= end.sub(V::BYTES) {
            debug_assert!(end.distance(cur) >= V::BYTES);
            if let Some(cur) = self.search_chunk(cur, topos) {
                return Some(cur);
            }
            cur = cur.add(V::BYTES);
        }
        // Finally handle any remaining bytes less than the size of V. In this
        // case, our pointer may indeed be unaligned and the load may overlap
        // with the previous one. But that's okay since we know the previous
        // load didn't lead to a match (otherwise we wouldn't be here).
        if cur < end {
            debug_assert!(end.distance(cur) < V::BYTES);
            cur = cur.sub(V::BYTES - end.distance(cur));
            debug_assert_eq!(end.distance(cur), V::BYTES);
            return self.search_chunk(cur, topos);
        }
        None
    }

    /// Return a pointer to the last occurrence of the needle in the given
    /// haystack. If no such occurrence exists, then `None` is returned.
    ///
    /// When a match is found, the pointer returned is guaranteed to be
    /// `>= start` and `< end`.
    ///
    /// # Safety
    ///
    /// * It must be the case that `start < end` and that the distance between
    /// them is at least equal to `V::BYTES`. That is, it must always be valid
    /// to do at least an unaligned load of `V` at `start`.
    /// * Both `start` and `end` must be valid for reads.
    /// * Both `start` and `end` must point to an initialized value.
    /// * Both `start` and `end` must point to the same allocated object and
    /// must either be in bounds or at most one byte past the end of the
    /// allocated object.
    /// * Both `start` and `end` must be _derived from_ a pointer to the same
    /// object.
    /// * The distance between `start` and `end` must not overflow `isize`.
    /// * The distance being in bounds must not rely on "wrapping around" the
    /// address space.
    #[inline(always)]
    pub(crate) unsafe fn rfind_raw(
        &self,
        start: *const u8,
        end: *const u8,
    ) -> Option<*const u8> {
        // If we want to support vectors bigger than 256 bits, we probably
        // need to move up to using a u64 for the masks used below. Currently
        // they are 32 bits, which means we're SOL for vectors that need masks
        // bigger than 32 bits. Overall unclear until there's a use case.
        debug_assert!(V::BYTES <= 32, "vector cannot be bigger than 32 bytes");

        let topos = V::Mask::last_offset;
        let len = end.distance(start);
        debug_assert!(
            len >= V::BYTES,
            "haystack has length {}, but must be at least {}",
            len,
            V::BYTES
        );

        if let Some(cur) = self.search_chunk(end.sub(V::BYTES), topos) {
            return Some(cur);
        }
        let mut cur = end.sub(end.as_usize() & V::ALIGN);
        debug_assert!(start <= cur && cur <= end);
        if len >= Self::LOOP_SIZE {
            while cur >= start.add(Self::LOOP_SIZE) {
                debug_assert_eq!(0, cur.as_usize() % V::BYTES);

                cur = cur.sub(Self::LOOP_SIZE);
                let a = V::load_aligned(cur);
                let b = V::load_aligned(cur.add(1 * V::BYTES));
                let c = V::load_aligned(cur.add(2 * V::BYTES));
                let d = V::load_aligned(cur.add(3 * V::BYTES));
                let eqa = self.v1.cmpeq(a);
                let eqb = self.v1.cmpeq(b);
                let eqc = self.v1.cmpeq(c);
                let eqd = self.v1.cmpeq(d);
                let or1 = eqa.or(eqb);
                let or2 = eqc.or(eqd);
                let or3 = or1.or(or2);
                if or3.movemask_will_have_non_zero() {
                    let mask = eqd.movemask();
                    if mask.has_non_zero() {
                        return Some(cur.add(3 * V::BYTES).add(topos(mask)));
                    }

                    let mask = eqc.movemask();
                    if mask.has_non_zero() {
                        return Some(cur.add(2 * V::BYTES).add(topos(mask)));
                    }

                    let mask = eqb.movemask();
                    if mask.has_non_zero() {
                        return Some(cur.add(1 * V::BYTES).add(topos(mask)));
                    }

                    let mask = eqa.movemask();
                    debug_assert!(mask.has_non_zero());
                    return Some(cur.add(topos(mask)));
                }
            }
        }
        while cur >= start.add(V::BYTES) {
            debug_assert!(cur.distance(start) >= V::BYTES);
            cur = cur.sub(V::BYTES);
            if let Some(cur) = self.search_chunk(cur, topos) {
                return Some(cur);
            }
        }
        if cur > start {
            debug_assert!(cur.distance(start) < V::BYTES);
            return self.search_chunk(start, topos);
        }
        None
    }

    /// Return a count of all matching bytes in the given haystack.
    ///
    /// # Safety
    ///
    /// * It must be the case that `start < end` and that the distance between
    /// them is at least equal to `V::BYTES`. That is, it must always be valid
    /// to do at least an unaligned load of `V` at `start`.
    /// * Both `start` and `end` must be valid for reads.
    /// * Both `start` and `end` must point to an initialized value.
    /// * Both `start` and `end` must point to the same allocated object and
    /// must either be in bounds or at most one byte past the end of the
    /// allocated object.
    /// * Both `start` and `end` must be _derived from_ a pointer to the same
    /// object.
    /// * The distance between `start` and `end` must not overflow `isize`.
    /// * The distance being in bounds must not rely on "wrapping around" the
    /// address space.
    #[inline(always)]
    pub(crate) unsafe fn count_raw(
        &self,
        start: *const u8,
        end: *const u8,
    ) -> usize {
        debug_assert!(V::BYTES <= 32, "vector cannot be bigger than 32 bytes");

        let confirm = |b| b == self.needle1();
        let len = end.distance(start);
        debug_assert!(
            len >= V::BYTES,
            "haystack has length {}, but must be at least {}",
            len,
            V::BYTES
        );

        // Set `cur` to the first V-aligned pointer greater than `start`.
        let mut cur = start.add(V::BYTES - (start.as_usize() & V::ALIGN));
        // Count any matching bytes before we start our aligned loop.
        let mut count = count_byte_by_byte(start, cur, confirm);
        debug_assert!(cur > start && end.sub(V::BYTES) >= start);
        if len >= Self::LOOP_SIZE {
            while cur <= end.sub(Self::LOOP_SIZE) {
                debug_assert_eq!(0, cur.as_usize() % V::BYTES);

                let a = V::load_aligned(cur);
                let b = V::load_aligned(cur.add(1 * V::BYTES));
                let c = V::load_aligned(cur.add(2 * V::BYTES));
                let d = V::load_aligned(cur.add(3 * V::BYTES));
                let eqa = self.v1.cmpeq(a);
                let eqb = self.v1.cmpeq(b);
                let eqc = self.v1.cmpeq(c);
                let eqd = self.v1.cmpeq(d);
                count += eqa.movemask().count_ones();
                count += eqb.movemask().count_ones();
                count += eqc.movemask().count_ones();
                count += eqd.movemask().count_ones();
                cur = cur.add(Self::LOOP_SIZE);
            }
        }
        // Handle any leftovers after the aligned loop above. We use unaligned
        // loads here, but I believe we are guaranteed that they are aligned
        // since `cur` is aligned.
        while cur <= end.sub(V::BYTES) {
            debug_assert!(end.distance(cur) >= V::BYTES);
            let chunk = V::load_unaligned(cur);
            count += self.v1.cmpeq(chunk).movemask().count_ones();
            cur = cur.add(V::BYTES);
        }
        // And finally count any leftovers that weren't caught above.
        count += count_byte_by_byte(cur, end, confirm);
        count
    }

    /// Search `V::BYTES` starting at `cur` via an unaligned load.
    ///
    /// `mask_to_offset` should be a function that converts a `movemask` to
    /// an offset such that `cur.add(offset)` corresponds to a pointer to the
    /// match location if one is found. Generally it is expected to be either
    /// `V::Mask::first_offset` or `V::Mask::last_offset`, depending on
    /// whether one is implementing a forward or reverse search, respectively.
    ///
    /// # Safety
    ///
    /// `cur` must be a valid pointer and it must be valid to do an unaligned
    /// load of size `V::BYTES` at `cur`.
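    ///
    /// As an illustrative example (not tied to any particular vector size):
    /// if the only matching byte in the loaded chunk is at index 3, then the
    /// resulting movemask has only bit 3 set, both `first_offset` and
    /// `last_offset` report 3, and `cur.add(3)` points at the match.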
    #[inline(always)]
    unsafe fn search_chunk(
        &self,
        cur: *const u8,
        mask_to_offset: impl Fn(V::Mask) -> usize,
    ) -> Option<*const u8> {
        let chunk = V::load_unaligned(cur);
        let mask = self.v1.cmpeq(chunk).movemask();
        if mask.has_non_zero() {
            Some(cur.add(mask_to_offset(mask)))
        } else {
            None
        }
    }
}

/// Finds all occurrences of two bytes in a haystack.
///
/// That is, this reports matches of one of two possible bytes. For example,
/// searching for `a` or `b` in `afoobar` would report matches at offsets `0`,
/// `4` and `5`.
#[derive(Clone, Copy, Debug)]
pub(crate) struct Two<V> {
    s1: u8,
    s2: u8,
    v1: V,
    v2: V,
}

impl<V: Vector> Two<V> {
    /// The number of bytes we examine per each iteration of our search loop.
    const LOOP_SIZE: usize = 2 * V::BYTES;

    /// Create a new searcher that finds occurrences of the two needle bytes
    /// given.
    #[inline(always)]
    pub(crate) unsafe fn new(needle1: u8, needle2: u8) -> Two<V> {
        Two {
            s1: needle1,
            s2: needle2,
            v1: V::splat(needle1),
            v2: V::splat(needle2),
        }
    }

    /// Returns the first needle given to `Two::new`.
    #[inline(always)]
    pub(crate) fn needle1(&self) -> u8 {
        self.s1
    }

    /// Returns the second needle given to `Two::new`.
    #[inline(always)]
    pub(crate) fn needle2(&self) -> u8 {
        self.s2
    }

    /// Return a pointer to the first occurrence of one of the needles in the
    /// given haystack. If no such occurrence exists, then `None` is returned.
    ///
    /// When a match is found, the pointer returned is guaranteed to be
    /// `>= start` and `< end`.
    ///
    /// # Safety
    ///
    /// * It must be the case that `start < end` and that the distance between
    /// them is at least equal to `V::BYTES`. That is, it must always be valid
    /// to do at least an unaligned load of `V` at `start`.
    /// * Both `start` and `end` must be valid for reads.
    /// * Both `start` and `end` must point to an initialized value.
    /// * Both `start` and `end` must point to the same allocated object and
    /// must either be in bounds or at most one byte past the end of the
    /// allocated object.
    /// * Both `start` and `end` must be _derived from_ a pointer to the same
    /// object.
    /// * The distance between `start` and `end` must not overflow `isize`.
    /// * The distance being in bounds must not rely on "wrapping around" the
    /// address space.
    #[inline(always)]
    pub(crate) unsafe fn find_raw(
        &self,
        start: *const u8,
        end: *const u8,
    ) -> Option<*const u8> {
        // If we want to support vectors bigger than 256 bits, we probably
        // need to move up to using a u64 for the masks used below. Currently
        // they are 32 bits, which means we're SOL for vectors that need masks
        // bigger than 32 bits. Overall unclear until there's a use case.
        debug_assert!(V::BYTES <= 32, "vector cannot be bigger than 32 bytes");

        let topos = V::Mask::first_offset;
        let len = end.distance(start);
        debug_assert!(
            len >= V::BYTES,
            "haystack has length {}, but must be at least {}",
            len,
            V::BYTES
        );

        // Search a possibly unaligned chunk at `start`. This covers any part
        // of the haystack prior to where aligned loads can start.
        if let Some(cur) = self.search_chunk(start, topos) {
            return Some(cur);
        }
        // Set `cur` to the first V-aligned pointer greater than `start`.
        let mut cur = start.add(V::BYTES - (start.as_usize() & V::ALIGN));
        debug_assert!(cur > start && end.sub(V::BYTES) >= start);
        if len >= Self::LOOP_SIZE {
            while cur <= end.sub(Self::LOOP_SIZE) {
                debug_assert_eq!(0, cur.as_usize() % V::BYTES);

                let a = V::load_aligned(cur);
                let b = V::load_aligned(cur.add(V::BYTES));
                let eqa1 = self.v1.cmpeq(a);
                let eqb1 = self.v1.cmpeq(b);
                let eqa2 = self.v2.cmpeq(a);
                let eqb2 = self.v2.cmpeq(b);
                let or1 = eqa1.or(eqb1);
                let or2 = eqa2.or(eqb2);
                let or3 = or1.or(or2);
                if or3.movemask_will_have_non_zero() {
                    let mask = eqa1.movemask().or(eqa2.movemask());
                    if mask.has_non_zero() {
                        return Some(cur.add(topos(mask)));
                    }

                    let mask = eqb1.movemask().or(eqb2.movemask());
                    debug_assert!(mask.has_non_zero());
                    return Some(cur.add(V::BYTES).add(topos(mask)));
                }
                cur = cur.add(Self::LOOP_SIZE);
            }
        }
        // Handle any leftovers after the aligned loop above. We use unaligned
        // loads here, but I believe we are guaranteed that they are aligned
        // since `cur` is aligned.
        while cur <= end.sub(V::BYTES) {
            debug_assert!(end.distance(cur) >= V::BYTES);
            if let Some(cur) = self.search_chunk(cur, topos) {
                return Some(cur);
            }
            cur = cur.add(V::BYTES);
        }
        // Finally handle any remaining bytes less than the size of V. In this
        // case, our pointer may indeed be unaligned and the load may overlap
        // with the previous one. But that's okay since we know the previous
        // load didn't lead to a match (otherwise we wouldn't be here).
        if cur < end {
            debug_assert!(end.distance(cur) < V::BYTES);
            cur = cur.sub(V::BYTES - end.distance(cur));
            debug_assert_eq!(end.distance(cur), V::BYTES);
            return self.search_chunk(cur, topos);
        }
        None
    }

    /// Return a pointer to the last occurrence of one of the needles in the
    /// given haystack. If no such occurrence exists, then `None` is returned.
    ///
    /// When a match is found, the pointer returned is guaranteed to be
    /// `>= start` and `< end`.
    ///
    /// # Safety
    ///
    /// * It must be the case that `start < end` and that the distance between
    /// them is at least equal to `V::BYTES`. That is, it must always be valid
    /// to do at least an unaligned load of `V` at `start`.
    /// * Both `start` and `end` must be valid for reads.
    /// * Both `start` and `end` must point to an initialized value.
    /// * Both `start` and `end` must point to the same allocated object and
    /// must either be in bounds or at most one byte past the end of the
    /// allocated object.
    /// * Both `start` and `end` must be _derived from_ a pointer to the same
    /// object.
    /// * The distance between `start` and `end` must not overflow `isize`.
    /// * The distance being in bounds must not rely on "wrapping around" the
    /// address space.
    #[inline(always)]
    pub(crate) unsafe fn rfind_raw(
        &self,
        start: *const u8,
        end: *const u8,
    ) -> Option<*const u8> {
        // If we want to support vectors bigger than 256 bits, we probably
        // need to move up to using a u64 for the masks used below. Currently
        // they are 32 bits, which means we're SOL for vectors that need masks
        // bigger than 32 bits. Overall unclear until there's a use case.
        debug_assert!(V::BYTES <= 32, "vector cannot be bigger than 32 bytes");

        let topos = V::Mask::last_offset;
        let len = end.distance(start);
        debug_assert!(
            len >= V::BYTES,
            "haystack has length {}, but must be at least {}",
            len,
            V::BYTES
        );

        if let Some(cur) = self.search_chunk(end.sub(V::BYTES), topos) {
            return Some(cur);
        }
        let mut cur = end.sub(end.as_usize() & V::ALIGN);
        debug_assert!(start <= cur && cur <= end);
        if len >= Self::LOOP_SIZE {
            while cur >= start.add(Self::LOOP_SIZE) {
                debug_assert_eq!(0, cur.as_usize() % V::BYTES);

                cur = cur.sub(Self::LOOP_SIZE);
                let a = V::load_aligned(cur);
                let b = V::load_aligned(cur.add(V::BYTES));
                let eqa1 = self.v1.cmpeq(a);
                let eqb1 = self.v1.cmpeq(b);
                let eqa2 = self.v2.cmpeq(a);
                let eqb2 = self.v2.cmpeq(b);
                let or1 = eqa1.or(eqb1);
                let or2 = eqa2.or(eqb2);
                let or3 = or1.or(or2);
                if or3.movemask_will_have_non_zero() {
                    let mask = eqb1.movemask().or(eqb2.movemask());
                    if mask.has_non_zero() {
                        return Some(cur.add(V::BYTES).add(topos(mask)));
                    }

                    let mask = eqa1.movemask().or(eqa2.movemask());
                    debug_assert!(mask.has_non_zero());
                    return Some(cur.add(topos(mask)));
                }
            }
        }
        while cur >= start.add(V::BYTES) {
            debug_assert!(cur.distance(start) >= V::BYTES);
            cur = cur.sub(V::BYTES);
            if let Some(cur) = self.search_chunk(cur, topos) {
                return Some(cur);
            }
        }
        if cur > start {
            debug_assert!(cur.distance(start) < V::BYTES);
            return self.search_chunk(start, topos);
        }
        None
    }

    /// Search `V::BYTES` starting at `cur` via an unaligned load.
    ///
    /// `mask_to_offset` should be a function that converts a `movemask` to
    /// an offset such that `cur.add(offset)` corresponds to a pointer to the
    /// match location if one is found. Generally it is expected to be either
    /// `V::Mask::first_offset` or `V::Mask::last_offset`, depending on
    /// whether one is implementing a forward or reverse search, respectively.
    ///
    /// # Safety
    ///
    /// `cur` must be a valid pointer and it must be valid to do an unaligned
    /// load of size `V::BYTES` at `cur`.
    #[inline(always)]
    unsafe fn search_chunk(
        &self,
        cur: *const u8,
        mask_to_offset: impl Fn(V::Mask) -> usize,
    ) -> Option<*const u8> {
        let chunk = V::load_unaligned(cur);
        let eq1 = self.v1.cmpeq(chunk);
        let eq2 = self.v2.cmpeq(chunk);
        let mask = eq1.or(eq2).movemask();
        if mask.has_non_zero() {
            let mask1 = eq1.movemask();
            let mask2 = eq2.movemask();
            Some(cur.add(mask_to_offset(mask1.or(mask2))))
        } else {
            None
        }
    }
}

/// Finds all occurrences of three bytes in a haystack.
///
/// That is, this reports matches of one of three possible bytes. For
/// example, searching for `a`, `b` or `o` in `afoobar` would report matches
/// at offsets `0`, `2`, `3`, `4` and `5`.
#[derive(Clone, Copy, Debug)]
pub(crate) struct Three<V> {
    s1: u8,
    s2: u8,
    s3: u8,
    v1: V,
    v2: V,
    v3: V,
}

impl<V: Vector> Three<V> {
    /// The number of bytes we examine per each iteration of our search loop.
    const LOOP_SIZE: usize = 2 * V::BYTES;

    /// Create a new searcher that finds occurrences of the three needle
    /// bytes given.
    #[inline(always)]
    pub(crate) unsafe fn new(
        needle1: u8,
        needle2: u8,
        needle3: u8,
    ) -> Three<V> {
        Three {
            s1: needle1,
            s2: needle2,
            s3: needle3,
            v1: V::splat(needle1),
            v2: V::splat(needle2),
            v3: V::splat(needle3),
        }
    }

    /// Returns the first needle given to `Three::new`.
    #[inline(always)]
    pub(crate) fn needle1(&self) -> u8 {
        self.s1
    }

    /// Returns the second needle given to `Three::new`.
    #[inline(always)]
    pub(crate) fn needle2(&self) -> u8 {
        self.s2
    }

    /// Returns the third needle given to `Three::new`.
    #[inline(always)]
    pub(crate) fn needle3(&self) -> u8 {
        self.s3
    }

    /// Return a pointer to the first occurrence of one of the needles in the
    /// given haystack. If no such occurrence exists, then `None` is returned.
    ///
    /// When a match is found, the pointer returned is guaranteed to be
    /// `>= start` and `< end`.
    ///
    /// # Safety
    ///
    /// * It must be the case that `start < end` and that the distance between
    /// them is at least equal to `V::BYTES`. That is, it must always be valid
    /// to do at least an unaligned load of `V` at `start`.
    /// * Both `start` and `end` must be valid for reads.
    /// * Both `start` and `end` must point to an initialized value.
    /// * Both `start` and `end` must point to the same allocated object and
    /// must either be in bounds or at most one byte past the end of the
    /// allocated object.
    /// * Both `start` and `end` must be _derived from_ a pointer to the same
    /// object.
    /// * The distance between `start` and `end` must not overflow `isize`.
    /// * The distance being in bounds must not rely on "wrapping around" the
    /// address space.
    #[inline(always)]
    pub(crate) unsafe fn find_raw(
        &self,
        start: *const u8,
        end: *const u8,
    ) -> Option<*const u8> {
        // If we want to support vectors bigger than 256 bits, we probably
        // need to move up to using a u64 for the masks used below. Currently
        // they are 32 bits, which means we're SOL for vectors that need masks
        // bigger than 32 bits. Overall unclear until there's a use case.
        debug_assert!(V::BYTES <= 32, "vector cannot be bigger than 32 bytes");

        let topos = V::Mask::first_offset;
        let len = end.distance(start);
        debug_assert!(
            len >= V::BYTES,
            "haystack has length {}, but must be at least {}",
            len,
            V::BYTES
        );

        // Search a possibly unaligned chunk at `start`. This covers any part
        // of the haystack prior to where aligned loads can start.
        if let Some(cur) = self.search_chunk(start, topos) {
            return Some(cur);
        }
        // Set `cur` to the first V-aligned pointer greater than `start`.
        let mut cur = start.add(V::BYTES - (start.as_usize() & V::ALIGN));
        debug_assert!(cur > start && end.sub(V::BYTES) >= start);
        if len >= Self::LOOP_SIZE {
            while cur <= end.sub(Self::LOOP_SIZE) {
                debug_assert_eq!(0, cur.as_usize() % V::BYTES);

                let a = V::load_aligned(cur);
                let b = V::load_aligned(cur.add(V::BYTES));
                let eqa1 = self.v1.cmpeq(a);
                let eqb1 = self.v1.cmpeq(b);
                let eqa2 = self.v2.cmpeq(a);
                let eqb2 = self.v2.cmpeq(b);
                let eqa3 = self.v3.cmpeq(a);
                let eqb3 = self.v3.cmpeq(b);
                let or1 = eqa1.or(eqb1);
                let or2 = eqa2.or(eqb2);
                let or3 = eqa3.or(eqb3);
                let or4 = or1.or(or2);
                let or5 = or3.or(or4);
                if or5.movemask_will_have_non_zero() {
                    let mask = eqa1
                        .movemask()
                        .or(eqa2.movemask())
                        .or(eqa3.movemask());
                    if mask.has_non_zero() {
                        return Some(cur.add(topos(mask)));
                    }

                    let mask = eqb1
                        .movemask()
                        .or(eqb2.movemask())
                        .or(eqb3.movemask());
                    debug_assert!(mask.has_non_zero());
                    return Some(cur.add(V::BYTES).add(topos(mask)));
                }
                cur = cur.add(Self::LOOP_SIZE);
            }
        }
        // Handle any leftovers after the aligned loop above. We use unaligned
        // loads here, but I believe we are guaranteed that they are aligned
        // since `cur` is aligned.
        while cur <= end.sub(V::BYTES) {
            debug_assert!(end.distance(cur) >= V::BYTES);
            if let Some(cur) = self.search_chunk(cur, topos) {
                return Some(cur);
            }
            cur = cur.add(V::BYTES);
        }
        // Finally handle any remaining bytes less than the size of V. In this
        // case, our pointer may indeed be unaligned and the load may overlap
        // with the previous one. But that's okay since we know the previous
        // load didn't lead to a match (otherwise we wouldn't be here).
        if cur < end {
            debug_assert!(end.distance(cur) < V::BYTES);
            cur = cur.sub(V::BYTES - end.distance(cur));
            debug_assert_eq!(end.distance(cur), V::BYTES);
            return self.search_chunk(cur, topos);
        }
        None
    }

    /// Return a pointer to the last occurrence of one of the needles in the
    /// given haystack. If no such occurrence exists, then `None` is returned.
    ///
    /// When a match is found, the pointer returned is guaranteed to be
    /// `>= start` and `< end`.
    ///
    /// # Safety
    ///
    /// * It must be the case that `start < end` and that the distance between
    /// them is at least equal to `V::BYTES`. That is, it must always be valid
    /// to do at least an unaligned load of `V` at `start`.
    /// * Both `start` and `end` must be valid for reads.
    /// * Both `start` and `end` must point to an initialized value.
    /// * Both `start` and `end` must point to the same allocated object and
    /// must either be in bounds or at most one byte past the end of the
    /// allocated object.
    /// * Both `start` and `end` must be _derived from_ a pointer to the same
    /// object.
    /// * The distance between `start` and `end` must not overflow `isize`.
    /// * The distance being in bounds must not rely on "wrapping around" the
    /// address space.
    #[inline(always)]
    pub(crate) unsafe fn rfind_raw(
        &self,
        start: *const u8,
        end: *const u8,
    ) -> Option<*const u8> {
        // If we want to support vectors bigger than 256 bits, we probably
        // need to move up to using a u64 for the masks used below. Currently
        // they are 32 bits, which means we're SOL for vectors that need masks
        // bigger than 32 bits. Overall unclear until there's a use case.
        debug_assert!(V::BYTES <= 32, "vector cannot be bigger than 32 bytes");

        let topos = V::Mask::last_offset;
        let len = end.distance(start);
        debug_assert!(
            len >= V::BYTES,
            "haystack has length {}, but must be at least {}",
            len,
            V::BYTES
        );

        if let Some(cur) = self.search_chunk(end.sub(V::BYTES), topos) {
            return Some(cur);
        }
        let mut cur = end.sub(end.as_usize() & V::ALIGN);
        debug_assert!(start <= cur && cur <= end);
        if len >= Self::LOOP_SIZE {
            while cur >= start.add(Self::LOOP_SIZE) {
                debug_assert_eq!(0, cur.as_usize() % V::BYTES);

                cur = cur.sub(Self::LOOP_SIZE);
                let a = V::load_aligned(cur);
                let b = V::load_aligned(cur.add(V::BYTES));
                let eqa1 = self.v1.cmpeq(a);
                let eqb1 = self.v1.cmpeq(b);
                let eqa2 = self.v2.cmpeq(a);
                let eqb2 = self.v2.cmpeq(b);
                let eqa3 = self.v3.cmpeq(a);
                let eqb3 = self.v3.cmpeq(b);
                let or1 = eqa1.or(eqb1);
                let or2 = eqa2.or(eqb2);
                let or3 = eqa3.or(eqb3);
                let or4 = or1.or(or2);
                let or5 = or3.or(or4);
                if or5.movemask_will_have_non_zero() {
                    let mask = eqb1
                        .movemask()
                        .or(eqb2.movemask())
                        .or(eqb3.movemask());
                    if mask.has_non_zero() {
                        return Some(cur.add(V::BYTES).add(topos(mask)));
                    }

                    let mask = eqa1
                        .movemask()
                        .or(eqa2.movemask())
                        .or(eqa3.movemask());
                    debug_assert!(mask.has_non_zero());
                    return Some(cur.add(topos(mask)));
                }
            }
        }
        while cur >= start.add(V::BYTES) {
            debug_assert!(cur.distance(start) >= V::BYTES);
            cur = cur.sub(V::BYTES);
            if let Some(cur) = self.search_chunk(cur, topos) {
                return Some(cur);
            }
        }
        if cur > start {
            debug_assert!(cur.distance(start) < V::BYTES);
            return self.search_chunk(start, topos);
        }
        None
    }

    /// Search `V::BYTES` starting at `cur` via an unaligned load.
    ///
    /// `mask_to_offset` should be a function that converts a `movemask` to
    /// an offset such that `cur.add(offset)` corresponds to a pointer to the
    /// match location if one is found. Generally it is expected to be either
    /// `V::Mask::first_offset` or `V::Mask::last_offset`, depending on
    /// whether one is implementing a forward or reverse search, respectively.
    ///
    /// # Safety
    ///
    /// `cur` must be a valid pointer and it must be valid to do an unaligned
    /// load of size `V::BYTES` at `cur`.
    #[inline(always)]
    unsafe fn search_chunk(
        &self,
        cur: *const u8,
        mask_to_offset: impl Fn(V::Mask) -> usize,
    ) -> Option<*const u8> {
        let chunk = V::load_unaligned(cur);
        let eq1 = self.v1.cmpeq(chunk);
        let eq2 = self.v2.cmpeq(chunk);
        let eq3 = self.v3.cmpeq(chunk);
        let mask = eq1.or(eq2).or(eq3).movemask();
        if mask.has_non_zero() {
            let mask1 = eq1.movemask();
            let mask2 = eq2.movemask();
            let mask3 = eq3.movemask();
            Some(cur.add(mask_to_offset(mask1.or(mask2).or(mask3))))
        } else {
            None
        }
    }
}

/// An iterator over all occurrences of a set of bytes in a haystack.
///
/// This iterator implements the routines necessary to provide a
/// `DoubleEndedIterator` impl, which means it can also be used to find
/// occurrences in reverse order.
///
/// The lifetime parameters are as follows:
///
/// * `'h` refers to the lifetime of the haystack being searched.
///
/// This type is intended to be used to implement all iterators for the
/// `memchr` family of functions. It handles a tiny bit of marginally tricky
/// raw pointer math, but otherwise expects the caller to provide `find_raw`
/// and `rfind_raw` routines for each call of `next` and `next_back`,
/// respectively.
#[derive(Clone, Debug)]
pub(crate) struct Iter<'h> {
    /// The original starting point into the haystack. We use this to convert
    /// pointers to offsets.
    original_start: *const u8,
    /// The current starting point into the haystack. That is, where the next
    /// search will begin.
    start: *const u8,
    /// The current ending point into the haystack. That is, where the next
    /// reverse search will begin.
    end: *const u8,
    /// A marker for tracking the lifetime of the `original_start`, `start`
    /// and `end` pointers above, which all point into the haystack.
    haystack: core::marker::PhantomData<&'h [u8]>,
}

// SAFETY: Iter contains no shared references to anything that performs any
// interior mutations. Also, the lifetime guarantees that Iter will not outlive
// the haystack.
unsafe impl<'h> Send for Iter<'h> {}

// SAFETY: Iter performs no interior mutations, therefore no explicit
// synchronization is necessary. Also, the lifetime guarantees that Iter will
// not outlive the haystack.
unsafe impl<'h> Sync for Iter<'h> {}

impl<'h> Iter<'h> {
    /// Create a new generic memchr iterator.
    #[inline(always)]
    pub(crate) fn new(haystack: &'h [u8]) -> Iter<'h> {
        Iter {
            original_start: haystack.as_ptr(),
            start: haystack.as_ptr(),
            end: haystack.as_ptr().wrapping_add(haystack.len()),
            haystack: core::marker::PhantomData,
        }
    }

    /// Returns the next occurrence in the forward direction.
    ///
    /// # Safety
    ///
    /// Callers must ensure that if a pointer is returned from the closure
    /// provided, then it must be greater than or equal to the start pointer
    /// and less than the end pointer.
    #[inline(always)]
    pub(crate) unsafe fn next(
        &mut self,
        mut find_raw: impl FnMut(*const u8, *const u8) -> Option<*const u8>,
    ) -> Option<usize> {
        // SAFETY: Pointers are derived directly from the same &[u8] haystack.
        // We only ever modify start/end corresponding to a matching offset
        // found between start and end. Thus all changes to start/end maintain
        // our safety requirements.
        //
        // The only other assumption we rely on is that the pointer returned
        // by `find_raw` satisfies `self.start <= found < self.end`, and that
        // safety contract is forwarded to the caller.
        let found = find_raw(self.start, self.end)?;
        let result = found.distance(self.original_start);
        self.start = found.add(1);
        Some(result)
    }

    /// Returns the number of remaining elements in this iterator.
    #[inline(always)]
    pub(crate) fn count(
        self,
        mut count_raw: impl FnMut(*const u8, *const u8) -> usize,
    ) -> usize {
        // SAFETY: Pointers are derived directly from the same &[u8] haystack.
        // We only ever modify start/end corresponding to a matching offset
        // found between start and end. Thus all changes to start/end maintain
        // our safety requirements.
        count_raw(self.start, self.end)
    }

    /// Returns the next occurrence in reverse.
    ///
    /// # Safety
    ///
    /// Callers must ensure that if a pointer is returned from the closure
    /// provided, then it must be greater than or equal to the start pointer
    /// and less than the end pointer.
    #[inline(always)]
    pub(crate) unsafe fn next_back(
        &mut self,
        mut rfind_raw: impl FnMut(*const u8, *const u8) -> Option<*const u8>,
    ) -> Option<usize> {
        // SAFETY: Pointers are derived directly from the same &[u8] haystack.
        // We only ever modify start/end corresponding to a matching offset
        // found between start and end. Thus all changes to start/end maintain
        // our safety requirements.
        //
        // The only other assumption we rely on is that the pointer returned
        // by `rfind_raw` satisfies `self.start <= found < self.end`, and that
        // safety contract is forwarded to the caller.
        let found = rfind_raw(self.start, self.end)?;
        let result = found.distance(self.original_start);
        self.end = found;
        Some(result)
    }

    /// Provides an implementation of `Iterator::size_hint`.
    #[inline(always)]
    pub(crate) fn size_hint(&self) -> (usize, Option<usize>) {
        (0, Some(self.end.as_usize().saturating_sub(self.start.as_usize())))
    }
}
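
// As a rough usage sketch (illustrative only; `OneIter` and its fields are
// hypothetical names, not part of this module), a forward iterator built on
// `Iter` and `One` looks roughly like this. Note the scalar fallback for
// remainders shorter than `V::BYTES`, which `One::find_raw` requires:
//
//     struct OneIter<'a, 'h, V> {
//         searcher: &'a One<V>,
//         it: Iter<'h>,
//     }
//
//     impl<'a, 'h, V: Vector> Iterator for OneIter<'a, 'h, V> {
//         type Item = usize;
//
//         fn next(&mut self) -> Option<usize> {
//             let searcher = self.searcher;
//             // SAFETY: The closure only ever returns pointers that are
//             // `>= start` and `< end`.
//             unsafe {
//                 self.it.next(|s, e| {
//                     if e.distance(s) < V::BYTES {
//                         // Too short for a vector load: scalar fallback.
//                         fwd_byte_by_byte(s, e, |b| b == searcher.needle1())
//                     } else {
//                         searcher.find_raw(s, e)
//                     }
//                 })
//             }
//         }
//     }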

/// Search a slice using a function that operates on raw pointers.
///
/// Given a function to search a contiguous sequence of memory for the location
/// of a non-empty set of bytes, this will execute that search on a slice of
/// bytes. The pointer returned by the given function will be converted to an
/// offset relative to the starting point of the given slice. That is, if a
/// match is found, the offset returned by this routine is guaranteed to be a
/// valid index into `haystack`.
///
/// Callers may use this for a forward or reverse search.
///
/// # Safety
///
/// Callers must ensure that if a pointer is returned by `find_raw`, then the
/// pointer must be greater than or equal to the starting pointer and less than
/// the end pointer.
#[inline(always)]
pub(crate) unsafe fn search_slice_with_raw(
    haystack: &[u8],
    mut find_raw: impl FnMut(*const u8, *const u8) -> Option<*const u8>,
) -> Option<usize> {
    // SAFETY: We rely on `find_raw` to return a correct and valid pointer, but
    // otherwise, `start` and `end` are valid due to the guarantees provided by
    // a &[u8].
    let start = haystack.as_ptr();
    let end = start.add(haystack.len());
    let found = find_raw(start, end)?;
    Some(found.distance(start))
}
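
// As a rough usage sketch (illustrative only; assumes `haystack` is at least
// `V::BYTES` long so that `One::find_raw`'s safety contract is satisfied, for
// some concrete `V: Vector`):
//
//     let searcher = unsafe { One::<V>::new(b'z') };
//     let found = unsafe {
//         search_slice_with_raw(haystack, |s, e| searcher.find_raw(s, e))
//     };
//     // `found` is `Some(offset)` where `haystack[offset] == b'z'`, or
//     // `None` if `b'z'` does not occur in `haystack`.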

/// Performs a forward byte-at-a-time loop until either the scan reaches `end`
/// or until `confirm(*ptr)` returns `true`. If the former occurs, then `None`
/// is returned. If the latter occurs, then the pointer at which `confirm`
/// returns `true` is returned.
///
/// # Safety
///
/// Callers must provide valid pointers and they must satisfy `start <= end`.
#[inline(always)]
pub(crate) unsafe fn fwd_byte_by_byte<F: Fn(u8) -> bool>(
    start: *const u8,
    end: *const u8,
    confirm: F,
) -> Option<*const u8> {
    debug_assert!(start <= end);
    let mut ptr = start;
    while ptr < end {
        if confirm(*ptr) {
            return Some(ptr);
        }
        ptr = ptr.offset(1);
    }
    None
}

/// Performs a reverse byte-at-a-time loop until either the scan moves past
/// `start` or until `confirm(*ptr)` returns `true`. If the former occurs, then
/// `None` is returned. If the latter occurs, then the pointer at which
/// `confirm` returns `true` is returned.
///
/// # Safety
///
/// Callers must provide valid pointers and they must satisfy `start <= end`.
#[inline(always)]
pub(crate) unsafe fn rev_byte_by_byte<F: Fn(u8) -> bool>(
    start: *const u8,
    end: *const u8,
    confirm: F,
) -> Option<*const u8> {
    debug_assert!(start <= end);

    let mut ptr = end;
    while ptr > start {
        ptr = ptr.offset(-1);
        if confirm(*ptr) {
            return Some(ptr);
        }
    }
    None
}

/// Performs a forward byte-at-a-time loop until the scan reaches `end` and
/// returns the number of times `confirm(*ptr)` returns `true`.
///
/// # Safety
///
/// Callers must provide valid pointers and they must satisfy `start <= end`.
#[inline(always)]
pub(crate) unsafe fn count_byte_by_byte<F: Fn(u8) -> bool>(
    start: *const u8,
    end: *const u8,
    confirm: F,
) -> usize {
    debug_assert!(start <= end);
    let mut ptr = start;
    let mut count = 0;
    while ptr < end {
        if confirm(*ptr) {
            count += 1;
        }
        ptr = ptr.offset(1);
    }
    count
}