aho_corasick/util/
search.rs

1use core::ops::{Range, RangeBounds};
2
3use crate::util::primitives::PatternID;
4
5/// The configuration and the haystack to use for an Aho-Corasick search.
6///
7/// When executing a search, there are a few parameters one might want to
8/// configure:
9///
10/// * The haystack to search, provided to the [`Input::new`] constructor. This
11/// is the only required parameter.
12/// * The span _within_ the haystack to limit a search to. (The default
13/// is the entire haystack.) This is configured via [`Input::span`] or
14/// [`Input::range`].
15/// * Whether to run an unanchored (matches can occur anywhere after the
16/// start of the search) or anchored (matches can only occur beginning at
17/// the start of the search) search. Unanchored search is the default. This is
18/// configured via [`Input::anchored`].
19/// * Whether to quit the search as soon as a match has been found, regardless
20/// of the [`MatchKind`] that the searcher was built with. This is configured
21/// via [`Input::earliest`].
22///
23/// For most cases, the defaults for all optional parameters are appropriate.
24/// The utility of this type is that it keeps the default or common case simple
25/// while permitting tweaking parameters in more niche use cases while reusing
26/// the same search APIs.
27///
28/// # Valid bounds and search termination
29///
30/// An `Input` permits setting the bounds of a search via either
31/// [`Input::span`] or [`Input::range`]. The bounds set must be valid, or
32/// else a panic will occur. Bounds are valid if and only if:
33///
34/// * The bounds represent a valid range into the input's haystack.
35/// * **or** the end bound is a valid ending bound for the haystack *and*
36/// the start bound is exactly one greater than the end bound.
37///
38/// In the latter case, [`Input::is_done`] will return true and indicates any
39/// search receiving such an input should immediately return with no match.
40///
41/// Other than representing "search is complete," the `Input::span` and
42/// `Input::range` APIs are never necessary. Instead, callers can slice the
43/// haystack instead, e.g., with `&haystack[start..end]`. With that said, they
44/// can be more convenient than slicing because the match positions reported
45/// when using `Input::span` or `Input::range` are in terms of the original
46/// haystack. If you instead use `&haystack[start..end]`, then you'll need to
47/// add `start` to any match position returned in order for it to be a correct
48/// index into `haystack`.
49///
50/// # Example: `&str` and `&[u8]` automatically convert to an `Input`
51///
52/// There is a `From<&T> for Input` implementation for all `T: AsRef<[u8]>`.
53/// Additionally, the [`AhoCorasick`](crate::AhoCorasick) search APIs accept
54/// a `Into<Input>`. These two things combined together mean you can provide
55/// things like `&str` and `&[u8]` to search APIs when the defaults are
56/// suitable, but also an `Input` when they're not. For example:
57///
58/// ```
59/// use aho_corasick::{AhoCorasick, Anchored, Input, Match, StartKind};
60///
61/// // Build a searcher that supports both unanchored and anchored modes.
62/// let ac = AhoCorasick::builder()
63///     .start_kind(StartKind::Both)
64///     .build(&["abcd", "b"])
65///     .unwrap();
66/// let haystack = "abcd";
67///
68/// // A search using default parameters is unanchored. With standard
69/// // semantics, this finds `b` first.
70/// assert_eq!(
71///     Some(Match::must(1, 1..2)),
72///     ac.find(haystack),
73/// );
74/// // Using the same 'find' routine, we can provide an 'Input' explicitly
75/// // that is configured to do an anchored search. Since 'b' doesn't start
76/// // at the beginning of the search, it is not reported as a match.
77/// assert_eq!(
78///     Some(Match::must(0, 0..4)),
79///     ac.find(Input::new(haystack).anchored(Anchored::Yes)),
80/// );
81/// ```
82#[derive(Clone)]
83pub struct Input<'h> {
84    haystack: &'h [u8],
85    span: Span,
86    anchored: Anchored,
87    earliest: bool,
88}
89
90impl<'h> Input<'h> {
91    /// Create a new search configuration for the given haystack.
92    #[inline]
93    pub fn new<H: ?Sized + AsRef<[u8]>>(haystack: &'h H) -> Input<'h> {
94        Input {
95            haystack: haystack.as_ref(),
96            span: Span { start: 0, end: haystack.as_ref().len() },
97            anchored: Anchored::No,
98            earliest: false,
99        }
100    }
101
102    /// Set the span for this search.
103    ///
104    /// This routine is generic over how a span is provided. While
105    /// a [`Span`] may be given directly, one may also provide a
106    /// `std::ops::Range<usize>`. To provide anything supported by range
107    /// syntax, use the [`Input::range`] method.
108    ///
109    /// The default span is the entire haystack.
110    ///
111    /// Note that [`Input::range`] overrides this method and vice versa.
112    ///
113    /// # Panics
114    ///
115    /// This panics if the given span does not correspond to valid bounds in
116    /// the haystack or the termination of a search.
117    ///
118    /// # Example
119    ///
120    /// This example shows how the span of the search can impact whether a
121    /// match is reported or not.
122    ///
123    /// ```
124    /// use aho_corasick::{AhoCorasick, Input, MatchKind};
125    ///
126    /// let patterns = &["b", "abcd", "abc"];
127    /// let haystack = "abcd";
128    ///
129    /// let ac = AhoCorasick::builder()
130    ///     .match_kind(MatchKind::LeftmostFirst)
131    ///     .build(patterns)
132    ///     .unwrap();
133    /// let input = Input::new(haystack).span(0..3);
134    /// let mat = ac.try_find(input)?.expect("should have a match");
135    /// // Without the span stopping the search early, 'abcd' would be reported
136    /// // because it is the correct leftmost-first match.
137    /// assert_eq!("abc", &haystack[mat.span()]);
138    ///
139    /// # Ok::<(), Box<dyn std::error::Error>>(())
140    /// ```
141    #[inline]
142    pub fn span<S: Into<Span>>(mut self, span: S) -> Input<'h> {
143        self.set_span(span);
144        self
145    }
146
147    /// Like `Input::span`, but accepts any range instead.
148    ///
149    /// The default range is the entire haystack.
150    ///
151    /// Note that [`Input::span`] overrides this method and vice versa.
152    ///
153    /// # Panics
154    ///
155    /// This routine will panic if the given range could not be converted
156    /// to a valid [`Range`]. For example, this would panic when given
157    /// `0..=usize::MAX` since it cannot be represented using a half-open
158    /// interval in terms of `usize`.
159    ///
160    /// This routine also panics if the given range does not correspond to
161    /// valid bounds in the haystack or the termination of a search.
162    ///
163    /// # Example
164    ///
165    /// ```
166    /// use aho_corasick::Input;
167    ///
168    /// let input = Input::new("foobar");
169    /// assert_eq!(0..6, input.get_range());
170    ///
171    /// let input = Input::new("foobar").range(2..=4);
172    /// assert_eq!(2..5, input.get_range());
173    /// ```
174    #[inline]
175    pub fn range<R: RangeBounds<usize>>(mut self, range: R) -> Input<'h> {
176        self.set_range(range);
177        self
178    }
179
180    /// Sets the anchor mode of a search.
181    ///
182    /// When a search is anchored (via [`Anchored::Yes`]), a match must begin
183    /// at the start of a search. When a search is not anchored (that's
184    /// [`Anchored::No`]), searchers will look for a match anywhere in the
185    /// haystack.
186    ///
187    /// By default, the anchored mode is [`Anchored::No`].
188    ///
189    /// # Support for anchored searches
190    ///
191    /// Anchored or unanchored searches might not always be available,
192    /// depending on the type of searcher used and its configuration:
193    ///
194    /// * [`noncontiguous::NFA`](crate::nfa::noncontiguous::NFA) always
195    /// supports both unanchored and anchored searches.
196    /// * [`contiguous::NFA`](crate::nfa::contiguous::NFA) always supports both
197    /// unanchored and anchored searches.
198    /// * [`dfa::DFA`](crate::dfa::DFA) supports only unanchored
199    /// searches by default.
200    /// [`dfa::Builder::start_kind`](crate::dfa::Builder::start_kind) can
201    /// be used to change the default to supporting both kinds of searches
202    /// or even just anchored searches.
203    /// * [`AhoCorasick`](crate::AhoCorasick) inherits the same setup as a
204    /// `DFA`. Namely, it only supports unanchored searches by default, but
205    /// [`AhoCorasickBuilder::start_kind`](crate::AhoCorasickBuilder::start_kind)
206    /// can change this.
207    ///
208    /// If you try to execute a search using a `try_` ("fallible") method with
209    /// an unsupported anchor mode, then an error will be returned. For calls
210    /// to infallible search methods, a panic will result.
211    ///
212    /// # Example
213    ///
214    /// This demonstrates the differences between an anchored search and
215    /// an unanchored search. Notice that we build our `AhoCorasick` searcher
216    /// with [`StartKind::Both`] so that it supports both unanchored and
217    /// anchored searches simultaneously.
218    ///
219    /// ```
220    /// use aho_corasick::{
221    ///     AhoCorasick, Anchored, Input, MatchKind, StartKind,
222    /// };
223    ///
224    /// let patterns = &["bcd"];
225    /// let haystack = "abcd";
226    ///
227    /// let ac = AhoCorasick::builder()
228    ///     .start_kind(StartKind::Both)
229    ///     .build(patterns)
230    ///     .unwrap();
231    ///
232    /// // Note that 'Anchored::No' is the default, so it doesn't need to
233    /// // be explicitly specified here.
234    /// let input = Input::new(haystack);
235    /// let mat = ac.try_find(input)?.expect("should have a match");
236    /// assert_eq!("bcd", &haystack[mat.span()]);
237    ///
238    /// // While 'bcd' occurs in the haystack, it does not begin where our
239    /// // search begins, so no match is found.
240    /// let input = Input::new(haystack).anchored(Anchored::Yes);
241    /// assert_eq!(None, ac.try_find(input)?);
242    ///
243    /// // However, if we start our search where 'bcd' starts, then we will
244    /// // find a match.
245    /// let input = Input::new(haystack).range(1..).anchored(Anchored::Yes);
246    /// let mat = ac.try_find(input)?.expect("should have a match");
247    /// assert_eq!("bcd", &haystack[mat.span()]);
248    ///
249    /// # Ok::<(), Box<dyn std::error::Error>>(())
250    /// ```
251    #[inline]
252    pub fn anchored(mut self, mode: Anchored) -> Input<'h> {
253        self.set_anchored(mode);
254        self
255    }
256
257    /// Whether to execute an "earliest" search or not.
258    ///
259    /// When running a non-overlapping search, an "earliest" search will
260    /// return the match location as early as possible. For example, given
261    /// the patterns `abc` and `b`, and a haystack of `abc`, a normal
262    /// leftmost-first search will return `abc` as a match. But an "earliest"
263    /// search will return as soon as it is known that a match occurs, which
264    /// happens once `b` is seen.
265    ///
266    /// Note that when using [`MatchKind::Standard`], the "earliest" option
267    /// has no effect since standard semantics are already "earliest." Note
268    /// also that this has no effect in overlapping searches, since overlapping
269    /// searches also use standard semantics and report all possible matches.
270    ///
271    /// This is disabled by default.
272    ///
273    /// # Example
274    ///
275    /// This example shows the difference between "earliest" searching and
276    /// normal leftmost searching.
277    ///
278    /// ```
279    /// use aho_corasick::{AhoCorasick, Anchored, Input, MatchKind, StartKind};
280    ///
281    /// let patterns = &["abc", "b"];
282    /// let haystack = "abc";
283    ///
284    /// let ac = AhoCorasick::builder()
285    ///     .match_kind(MatchKind::LeftmostFirst)
286    ///     .build(patterns)
287    ///     .unwrap();
288    ///
289    /// // The normal leftmost-first match.
290    /// let input = Input::new(haystack);
291    /// let mat = ac.try_find(input)?.expect("should have a match");
292    /// assert_eq!("abc", &haystack[mat.span()]);
293    ///
294    /// // The "earliest" possible match, even if it isn't leftmost-first.
295    /// let input = Input::new(haystack).earliest(true);
296    /// let mat = ac.try_find(input)?.expect("should have a match");
297    /// assert_eq!("b", &haystack[mat.span()]);
298    ///
299    /// # Ok::<(), Box<dyn std::error::Error>>(())
300    /// ```
301    #[inline]
302    pub fn earliest(mut self, yes: bool) -> Input<'h> {
303        self.set_earliest(yes);
304        self
305    }
306
307    /// Set the span for this search configuration.
308    ///
309    /// This is like the [`Input::span`] method, except this mutates the
310    /// span in place.
311    ///
312    /// This routine is generic over how a span is provided. While
313    /// a [`Span`] may be given directly, one may also provide a
314    /// `std::ops::Range<usize>`.
315    ///
316    /// # Panics
317    ///
318    /// This panics if the given span does not correspond to valid bounds in
319    /// the haystack or the termination of a search.
320    ///
321    /// # Example
322    ///
323    /// ```
324    /// use aho_corasick::Input;
325    ///
326    /// let mut input = Input::new("foobar");
327    /// assert_eq!(0..6, input.get_range());
328    /// input.set_span(2..4);
329    /// assert_eq!(2..4, input.get_range());
330    /// ```
331    #[inline]
332    pub fn set_span<S: Into<Span>>(&mut self, span: S) {
333        let span = span.into();
334        assert!(
335            span.end <= self.haystack.len()
336                && span.start <= span.end.wrapping_add(1),
337            "invalid span {:?} for haystack of length {}",
338            span,
339            self.haystack.len(),
340        );
341        self.span = span;
342    }
343
344    /// Set the span for this search configuration given any range.
345    ///
346    /// This is like the [`Input::range`] method, except this mutates the
347    /// span in place.
348    ///
349    /// # Panics
350    ///
351    /// This routine will panic if the given range could not be converted
352    /// to a valid [`Range`]. For example, this would panic when given
353    /// `0..=usize::MAX` since it cannot be represented using a half-open
354    /// interval in terms of `usize`.
355    ///
356    /// This routine also panics if the given range does not correspond to
357    /// valid bounds in the haystack or the termination of a search.
358    ///
359    /// # Example
360    ///
361    /// ```
362    /// use aho_corasick::Input;
363    ///
364    /// let mut input = Input::new("foobar");
365    /// assert_eq!(0..6, input.get_range());
366    /// input.set_range(2..=4);
367    /// assert_eq!(2..5, input.get_range());
368    /// ```
369    #[inline]
370    pub fn set_range<R: RangeBounds<usize>>(&mut self, range: R) {
371        use core::ops::Bound;
372
373        // It's a little weird to convert ranges into spans, and then spans
374        // back into ranges when we actually slice the haystack. Because
375        // of that process, we always represent everything as a half-open
376        // internal. Therefore, handling things like m..=n is a little awkward.
377        let start = match range.start_bound() {
378            Bound::Included(&i) => i,
379            // Can this case ever happen? Range syntax doesn't support it...
380            Bound::Excluded(&i) => i.checked_add(1).unwrap(),
381            Bound::Unbounded => 0,
382        };
383        let end = match range.end_bound() {
384            Bound::Included(&i) => i.checked_add(1).unwrap(),
385            Bound::Excluded(&i) => i,
386            Bound::Unbounded => self.haystack().len(),
387        };
388        self.set_span(Span { start, end });
389    }
390
391    /// Set the starting offset for the span for this search configuration.
392    ///
393    /// This is a convenience routine for only mutating the start of a span
394    /// without having to set the entire span.
395    ///
396    /// # Panics
397    ///
398    /// This panics if the given span does not correspond to valid bounds in
399    /// the haystack or the termination of a search.
400    ///
401    /// # Example
402    ///
403    /// ```
404    /// use aho_corasick::Input;
405    ///
406    /// let mut input = Input::new("foobar");
407    /// assert_eq!(0..6, input.get_range());
408    /// input.set_start(5);
409    /// assert_eq!(5..6, input.get_range());
410    /// ```
411    #[inline]
412    pub fn set_start(&mut self, start: usize) {
413        self.set_span(Span { start, ..self.get_span() });
414    }
415
416    /// Set the ending offset for the span for this search configuration.
417    ///
418    /// This is a convenience routine for only mutating the end of a span
419    /// without having to set the entire span.
420    ///
421    /// # Panics
422    ///
423    /// This panics if the given span does not correspond to valid bounds in
424    /// the haystack or the termination of a search.
425    ///
426    /// # Example
427    ///
428    /// ```
429    /// use aho_corasick::Input;
430    ///
431    /// let mut input = Input::new("foobar");
432    /// assert_eq!(0..6, input.get_range());
433    /// input.set_end(5);
434    /// assert_eq!(0..5, input.get_range());
435    /// ```
436    #[inline]
437    pub fn set_end(&mut self, end: usize) {
438        self.set_span(Span { end, ..self.get_span() });
439    }
440
441    /// Set the anchor mode of a search.
442    ///
443    /// This is like [`Input::anchored`], except it mutates the search
444    /// configuration in place.
445    ///
446    /// # Example
447    ///
448    /// ```
449    /// use aho_corasick::{Anchored, Input};
450    ///
451    /// let mut input = Input::new("foobar");
452    /// assert_eq!(Anchored::No, input.get_anchored());
453    ///
454    /// input.set_anchored(Anchored::Yes);
455    /// assert_eq!(Anchored::Yes, input.get_anchored());
456    /// ```
457    #[inline]
458    pub fn set_anchored(&mut self, mode: Anchored) {
459        self.anchored = mode;
460    }
461
462    /// Set whether the search should execute in "earliest" mode or not.
463    ///
464    /// This is like [`Input::earliest`], except it mutates the search
465    /// configuration in place.
466    ///
467    /// # Example
468    ///
469    /// ```
470    /// use aho_corasick::Input;
471    ///
472    /// let mut input = Input::new("foobar");
473    /// assert!(!input.get_earliest());
474    /// input.set_earliest(true);
475    /// assert!(input.get_earliest());
476    /// ```
477    #[inline]
478    pub fn set_earliest(&mut self, yes: bool) {
479        self.earliest = yes;
480    }
481
482    /// Return a borrow of the underlying haystack as a slice of bytes.
483    ///
484    /// # Example
485    ///
486    /// ```
487    /// use aho_corasick::Input;
488    ///
489    /// let input = Input::new("foobar");
490    /// assert_eq!(b"foobar", input.haystack());
491    /// ```
492    #[inline]
493    pub fn haystack(&self) -> &[u8] {
494        self.haystack
495    }
496
497    /// Return the start position of this search.
498    ///
499    /// This is a convenience routine for `search.get_span().start()`.
500    ///
501    /// # Example
502    ///
503    /// ```
504    /// use aho_corasick::Input;
505    ///
506    /// let input = Input::new("foobar");
507    /// assert_eq!(0, input.start());
508    ///
509    /// let input = Input::new("foobar").span(2..4);
510    /// assert_eq!(2, input.start());
511    /// ```
512    #[inline]
513    pub fn start(&self) -> usize {
514        self.get_span().start
515    }
516
517    /// Return the end position of this search.
518    ///
519    /// This is a convenience routine for `search.get_span().end()`.
520    ///
521    /// # Example
522    ///
523    /// ```
524    /// use aho_corasick::Input;
525    ///
526    /// let input = Input::new("foobar");
527    /// assert_eq!(6, input.end());
528    ///
529    /// let input = Input::new("foobar").span(2..4);
530    /// assert_eq!(4, input.end());
531    /// ```
532    #[inline]
533    pub fn end(&self) -> usize {
534        self.get_span().end
535    }
536
537    /// Return the span for this search configuration.
538    ///
539    /// If one was not explicitly set, then the span corresponds to the entire
540    /// range of the haystack.
541    ///
542    /// # Example
543    ///
544    /// ```
545    /// use aho_corasick::{Input, Span};
546    ///
547    /// let input = Input::new("foobar");
548    /// assert_eq!(Span { start: 0, end: 6 }, input.get_span());
549    /// ```
550    #[inline]
551    pub fn get_span(&self) -> Span {
552        self.span
553    }
554
555    /// Return the span as a range for this search configuration.
556    ///
557    /// If one was not explicitly set, then the span corresponds to the entire
558    /// range of the haystack.
559    ///
560    /// # Example
561    ///
562    /// ```
563    /// use aho_corasick::Input;
564    ///
565    /// let input = Input::new("foobar");
566    /// assert_eq!(0..6, input.get_range());
567    /// ```
568    #[inline]
569    pub fn get_range(&self) -> Range<usize> {
570        self.get_span().range()
571    }
572
573    /// Return the anchored mode for this search configuration.
574    ///
575    /// If no anchored mode was set, then it defaults to [`Anchored::No`].
576    ///
577    /// # Example
578    ///
579    /// ```
580    /// use aho_corasick::{Anchored, Input};
581    ///
582    /// let mut input = Input::new("foobar");
583    /// assert_eq!(Anchored::No, input.get_anchored());
584    ///
585    /// input.set_anchored(Anchored::Yes);
586    /// assert_eq!(Anchored::Yes, input.get_anchored());
587    /// ```
588    #[inline]
589    pub fn get_anchored(&self) -> Anchored {
590        self.anchored
591    }
592
593    /// Return whether this search should execute in "earliest" mode.
594    ///
595    /// # Example
596    ///
597    /// ```
598    /// use aho_corasick::Input;
599    ///
600    /// let input = Input::new("foobar");
601    /// assert!(!input.get_earliest());
602    /// ```
603    #[inline]
604    pub fn get_earliest(&self) -> bool {
605        self.earliest
606    }
607
608    /// Return true if this input has been exhausted, which in turn means all
609    /// subsequent searches will return no matches.
610    ///
611    /// This occurs precisely when the start position of this search is greater
612    /// than the end position of the search.
613    ///
614    /// # Example
615    ///
616    /// ```
617    /// use aho_corasick::Input;
618    ///
619    /// let mut input = Input::new("foobar");
620    /// assert!(!input.is_done());
621    /// input.set_start(6);
622    /// assert!(!input.is_done());
623    /// input.set_start(7);
624    /// assert!(input.is_done());
625    /// ```
626    #[inline]
627    pub fn is_done(&self) -> bool {
628        self.get_span().start > self.get_span().end
629    }
630}
631
632impl<'h> core::fmt::Debug for Input<'h> {
633    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
634        let mut fmter = f.debug_struct("Input");
635        match core::str::from_utf8(self.haystack()) {
636            Ok(nice) => fmter.field("haystack", &nice),
637            Err(_) => fmter.field("haystack", &self.haystack()),
638        }
639        .field("span", &self.span)
640        .field("anchored", &self.anchored)
641        .field("earliest", &self.earliest)
642        .finish()
643    }
644}
645
646impl<'h, H: ?Sized + AsRef<[u8]>> From<&'h H> for Input<'h> {
647    #[inline]
648    fn from(haystack: &'h H) -> Input<'h> {
649        Input::new(haystack)
650    }
651}
652
653/// A representation of a range in a haystack.
654///
655/// A span corresponds to the starting and ending _byte offsets_ of a
656/// contiguous region of bytes. The starting offset is inclusive while the
657/// ending offset is exclusive. That is, a span is a half-open interval.
658///
659/// A span is used to report the offsets of a match, but it is also used to
660/// convey which region of a haystack should be searched via routines like
661/// [`Input::span`].
662///
663/// This is basically equivalent to a `std::ops::Range<usize>`, except this
664/// type implements `Copy` which makes it more ergonomic to use in the context
665/// of this crate. Indeed, `Span` exists only because `Range<usize>` does
666/// not implement `Copy`. Like a range, this implements `Index` for `[u8]`
667/// and `str`, and `IndexMut` for `[u8]`. For convenience, this also impls
668/// `From<Range>`, which means things like `Span::from(5..10)` work.
669///
670/// There are no constraints on the values of a span. It is, for example, legal
671/// to create a span where `start > end`.
672#[derive(Clone, Copy, Eq, Hash, PartialEq)]
673pub struct Span {
674    /// The start offset of the span, inclusive.
675    pub start: usize,
676    /// The end offset of the span, exclusive.
677    pub end: usize,
678}
679
680impl Span {
681    /// Returns this span as a range.
682    #[inline]
683    pub fn range(&self) -> Range<usize> {
684        Range::from(*self)
685    }
686
687    /// Returns true when this span is empty. That is, when `start >= end`.
688    #[inline]
689    pub fn is_empty(&self) -> bool {
690        self.start >= self.end
691    }
692
693    /// Returns the length of this span.
694    ///
695    /// This returns `0` in precisely the cases that `is_empty` returns `true`.
696    #[inline]
697    pub fn len(&self) -> usize {
698        self.end.saturating_sub(self.start)
699    }
700
701    /// Returns true when the given offset is contained within this span.
702    ///
703    /// Note that an empty span contains no offsets and will always return
704    /// false.
705    #[inline]
706    pub fn contains(&self, offset: usize) -> bool {
707        !self.is_empty() && self.start <= offset && offset <= self.end
708    }
709
710    /// Returns a new span with `offset` added to this span's `start` and `end`
711    /// values.
712    #[inline]
713    pub fn offset(&self, offset: usize) -> Span {
714        Span { start: self.start + offset, end: self.end + offset }
715    }
716}
717
718impl core::fmt::Debug for Span {
719    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
720        write!(f, "{}..{}", self.start, self.end)
721    }
722}
723
724impl core::ops::Index<Span> for [u8] {
725    type Output = [u8];
726
727    #[inline]
728    fn index(&self, index: Span) -> &[u8] {
729        &self[index.range()]
730    }
731}
732
733impl core::ops::IndexMut<Span> for [u8] {
734    #[inline]
735    fn index_mut(&mut self, index: Span) -> &mut [u8] {
736        &mut self[index.range()]
737    }
738}
739
740impl core::ops::Index<Span> for str {
741    type Output = str;
742
743    #[inline]
744    fn index(&self, index: Span) -> &str {
745        &self[index.range()]
746    }
747}
748
749impl From<Range<usize>> for Span {
750    #[inline]
751    fn from(range: Range<usize>) -> Span {
752        Span { start: range.start, end: range.end }
753    }
754}
755
756impl From<Span> for Range<usize> {
757    #[inline]
758    fn from(span: Span) -> Range<usize> {
759        Range { start: span.start, end: span.end }
760    }
761}
762
763impl PartialEq<Range<usize>> for Span {
764    #[inline]
765    fn eq(&self, range: &Range<usize>) -> bool {
766        self.start == range.start && self.end == range.end
767    }
768}
769
770impl PartialEq<Span> for Range<usize> {
771    #[inline]
772    fn eq(&self, span: &Span) -> bool {
773        self.start == span.start && self.end == span.end
774    }
775}
776
777/// The type of anchored search to perform.
778///
779/// If an Aho-Corasick searcher does not support the anchored mode selected,
780/// then the search will return an error or panic, depending on whether a
781/// fallible or an infallible routine was called.
782#[non_exhaustive]
783#[derive(Clone, Copy, Debug, Eq, PartialEq)]
784pub enum Anchored {
785    /// Run an unanchored search. This means a match may occur anywhere at or
786    /// after the start position of the search up until the end position of the
787    /// search.
788    No,
789    /// Run an anchored search. This means that a match must begin at the start
790    /// position of the search and end before the end position of the search.
791    Yes,
792}
793
794impl Anchored {
795    /// Returns true if and only if this anchor mode corresponds to an anchored
796    /// search.
797    ///
798    /// # Example
799    ///
800    /// ```
801    /// use aho_corasick::Anchored;
802    ///
803    /// assert!(!Anchored::No.is_anchored());
804    /// assert!(Anchored::Yes.is_anchored());
805    /// ```
806    #[inline]
807    pub fn is_anchored(&self) -> bool {
808        matches!(*self, Anchored::Yes)
809    }
810}
811
812/// A representation of a match reported by an Aho-Corasick searcher.
813///
814/// A match has two essential pieces of information: the [`PatternID`] that
815/// matches, and the [`Span`] of the match in a haystack.
816///
817/// The pattern is identified by an ID, which corresponds to its position
818/// (starting from `0`) relative to other patterns used to construct the
819/// corresponding searcher. If only a single pattern is provided, then all
820/// matches are guaranteed to have a pattern ID of `0`.
821///
822/// Every match reported by a searcher guarantees that its span has its start
823/// offset as less than or equal to its end offset.
824#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
825pub struct Match {
826    /// The pattern ID.
827    pattern: PatternID,
828    /// The underlying match span.
829    span: Span,
830}
831
832impl Match {
833    /// Create a new match from a pattern ID and a span.
834    ///
835    /// This constructor is generic over how a span is provided. While
836    /// a [`Span`] may be given directly, one may also provide a
837    /// `std::ops::Range<usize>`.
838    ///
839    /// # Panics
840    ///
841    /// This panics if `end < start`.
842    ///
843    /// # Example
844    ///
845    /// This shows how to create a match for the first pattern in an
846    /// Aho-Corasick searcher using convenient range syntax.
847    ///
848    /// ```
849    /// use aho_corasick::{Match, PatternID};
850    ///
851    /// let m = Match::new(PatternID::ZERO, 5..10);
852    /// assert_eq!(0, m.pattern().as_usize());
853    /// assert_eq!(5, m.start());
854    /// assert_eq!(10, m.end());
855    /// ```
856    #[inline]
857    pub fn new<S: Into<Span>>(pattern: PatternID, span: S) -> Match {
858        let span = span.into();
859        assert!(span.start <= span.end, "invalid match span");
860        Match { pattern, span }
861    }
862
863    /// Create a new match from a pattern ID and a byte offset span.
864    ///
865    /// This constructor is generic over how a span is provided. While
866    /// a [`Span`] may be given directly, one may also provide a
867    /// `std::ops::Range<usize>`.
868    ///
869    /// This is like [`Match::new`], but accepts a `usize` instead of a
870    /// [`PatternID`]. This panics if the given `usize` is not representable
871    /// as a `PatternID`.
872    ///
873    /// # Panics
874    ///
875    /// This panics if `end < start` or if `pattern > PatternID::MAX`.
876    ///
877    /// # Example
878    ///
879    /// This shows how to create a match for the third pattern in an
880    /// Aho-Corasick searcher using convenient range syntax.
881    ///
882    /// ```
883    /// use aho_corasick::Match;
884    ///
885    /// let m = Match::must(3, 5..10);
886    /// assert_eq!(3, m.pattern().as_usize());
887    /// assert_eq!(5, m.start());
888    /// assert_eq!(10, m.end());
889    /// ```
890    #[inline]
891    pub fn must<S: Into<Span>>(pattern: usize, span: S) -> Match {
892        Match::new(PatternID::must(pattern), span)
893    }
894
895    /// Returns the ID of the pattern that matched.
896    ///
897    /// The ID of a pattern is derived from the position in which it was
898    /// originally inserted into the corresponding searcher. The first pattern
899    /// has identifier `0`, and each subsequent pattern is `1`, `2` and so on.
900    #[inline]
901    pub fn pattern(&self) -> PatternID {
902        self.pattern
903    }
904
905    /// The starting position of the match.
906    ///
907    /// This is a convenience routine for `Match::span().start`.
908    #[inline]
909    pub fn start(&self) -> usize {
910        self.span().start
911    }
912
913    /// The ending position of the match.
914    ///
915    /// This is a convenience routine for `Match::span().end`.
916    #[inline]
917    pub fn end(&self) -> usize {
918        self.span().end
919    }
920
921    /// Returns the match span as a range.
922    ///
923    /// This is a convenience routine for `Match::span().range()`.
924    #[inline]
925    pub fn range(&self) -> core::ops::Range<usize> {
926        self.span().range()
927    }
928
929    /// Returns the span for this match.
930    #[inline]
931    pub fn span(&self) -> Span {
932        self.span
933    }
934
935    /// Returns true when the span in this match is empty.
936    ///
937    /// An empty match can only be returned when empty pattern is in the
938    /// Aho-Corasick searcher.
939    #[inline]
940    pub fn is_empty(&self) -> bool {
941        self.span().is_empty()
942    }
943
944    /// Returns the length of this match.
945    ///
946    /// This returns `0` in precisely the cases that `is_empty` returns `true`.
947    #[inline]
948    pub fn len(&self) -> usize {
949        self.span().len()
950    }
951
952    /// Returns a new match with `offset` added to its span's `start` and `end`
953    /// values.
954    #[inline]
955    pub fn offset(&self, offset: usize) -> Match {
956        Match {
957            pattern: self.pattern,
958            span: Span {
959                start: self.start() + offset,
960                end: self.end() + offset,
961            },
962        }
963    }
964}
965
966/// A knob for controlling the match semantics of an Aho-Corasick automaton.
967///
968/// There are two generally different ways that Aho-Corasick automatons can
969/// report matches. The first way is the "standard" approach that results from
970/// implementing most textbook explanations of Aho-Corasick. The second way is
971/// to report only the leftmost non-overlapping matches. The leftmost approach
972/// is in turn split into two different ways of resolving ambiguous matches:
973/// leftmost-first and leftmost-longest.
974///
975/// The `Standard` match kind is the default and is the only one that supports
976/// overlapping matches and stream searching. (Trying to find overlapping or
977/// streaming matches using leftmost match semantics will result in an error in
978/// fallible APIs and a panic when using infallibe APIs.) The `Standard` match
979/// kind will report matches as they are seen. When searching for overlapping
980/// matches, then all possible matches are reported. When searching for
981/// non-overlapping matches, the first match seen is reported. For example, for
982/// non-overlapping matches, given the patterns `abcd` and `b` and the haystack
983/// `abcdef`, only a match for `b` is reported since it is detected first. The
984/// `abcd` match is never reported since it overlaps with the `b` match.
985///
986/// In contrast, the leftmost match kind always prefers the leftmost match
987/// among all possible matches. Given the same example as above with `abcd` and
988/// `b` as patterns and `abcdef` as the haystack, the leftmost match is `abcd`
989/// since it begins before the `b` match, even though the `b` match is detected
990/// before the `abcd` match. In this case, the `b` match is not reported at all
991/// since it overlaps with the `abcd` match.
992///
993/// The difference between leftmost-first and leftmost-longest is in how they
994/// resolve ambiguous matches when there are multiple leftmost matches to
995/// choose from. Leftmost-first always chooses the pattern that was provided
996/// earliest, where as leftmost-longest always chooses the longest matching
997/// pattern. For example, given the patterns `a` and `ab` and the subject
998/// string `ab`, the leftmost-first match is `a` but the leftmost-longest match
999/// is `ab`. Conversely, if the patterns were given in reverse order, i.e.,
1000/// `ab` and `a`, then both the leftmost-first and leftmost-longest matches
1001/// would be `ab`. Stated differently, the leftmost-first match depends on the
1002/// order in which the patterns were given to the Aho-Corasick automaton.
1003/// Because of that, when leftmost-first matching is used, if a pattern `A`
1004/// that appears before a pattern `B` is a prefix of `B`, then it is impossible
1005/// to ever observe a match of `B`.
1006///
1007/// If you're not sure which match kind to pick, then stick with the standard
1008/// kind, which is the default. In particular, if you need overlapping or
1009/// streaming matches, then you _must_ use the standard kind. The leftmost
1010/// kinds are useful in specific circumstances. For example, leftmost-first can
1011/// be very useful as a way to implement match priority based on the order of
1012/// patterns given and leftmost-longest can be useful for dictionary searching
1013/// such that only the longest matching words are reported.
1014///
1015/// # Relationship with regular expression alternations
1016///
1017/// Understanding match semantics can be a little tricky, and one easy way
1018/// to conceptualize non-overlapping matches from an Aho-Corasick automaton
1019/// is to think about them as a simple alternation of literals in a regular
1020/// expression. For example, let's say we wanted to match the strings
1021/// `Sam` and `Samwise`, which would turn into the regex `Sam|Samwise`. It
1022/// turns out that regular expression engines have two different ways of
1023/// matching this alternation. The first way, leftmost-longest, is commonly
1024/// found in POSIX compatible implementations of regular expressions (such as
1025/// `grep`). The second way, leftmost-first, is commonly found in backtracking
1026/// implementations such as Perl. (Some regex engines, such as RE2 and Rust's
1027/// regex engine do not use backtracking, but still implement leftmost-first
1028/// semantics in an effort to match the behavior of dominant backtracking
1029/// regex engines such as those found in Perl, Ruby, Python, Javascript and
1030/// PHP.)
1031///
1032/// That is, when matching `Sam|Samwise` against `Samwise`, a POSIX regex
1033/// will match `Samwise` because it is the longest possible match, but a
1034/// Perl-like regex will match `Sam` since it appears earlier in the
1035/// alternation. Indeed, the regex `Sam|Samwise` in a Perl-like regex engine
1036/// will never match `Samwise` since `Sam` will always have higher priority.
1037/// Conversely, matching the regex `Samwise|Sam` against `Samwise` will lead to
1038/// a match of `Samwise` in both POSIX and Perl-like regexes since `Samwise` is
1039/// still longest match, but it also appears earlier than `Sam`.
1040///
1041/// The "standard" match semantics of Aho-Corasick generally don't correspond
1042/// to the match semantics of any large group of regex implementations, so
1043/// there's no direct analogy that can be made here. Standard match semantics
1044/// are generally useful for overlapping matches, or if you just want to see
1045/// matches as they are detected.
1046///
1047/// The main conclusion to draw from this section is that the match semantics
1048/// can be tweaked to precisely match either Perl-like regex alternations or
1049/// POSIX regex alternations.
1050#[non_exhaustive]
1051#[derive(Clone, Copy, Debug, Eq, PartialEq)]
1052pub enum MatchKind {
1053    /// Use standard match semantics, which support overlapping matches. When
1054    /// used with non-overlapping matches, matches are reported as they are
1055    /// seen.
1056    Standard,
1057    /// Use leftmost-first match semantics, which reports leftmost matches.
1058    /// When there are multiple possible leftmost matches, the match
1059    /// corresponding to the pattern that appeared earlier when constructing
1060    /// the automaton is reported.
1061    ///
1062    /// This does **not** support overlapping matches or stream searching. If
1063    /// this match kind is used, attempting to find overlapping matches or
1064    /// stream matches will fail.
1065    LeftmostFirst,
1066    /// Use leftmost-longest match semantics, which reports leftmost matches.
1067    /// When there are multiple possible leftmost matches, the longest match
1068    /// is chosen.
1069    ///
1070    /// This does **not** support overlapping matches or stream searching. If
1071    /// this match kind is used, attempting to find overlapping matches or
1072    /// stream matches will fail.
1073    LeftmostLongest,
1074}
1075
1076/// The default match kind is `MatchKind::Standard`.
1077impl Default for MatchKind {
1078    fn default() -> MatchKind {
1079        MatchKind::Standard
1080    }
1081}
1082
1083impl MatchKind {
1084    #[inline]
1085    pub(crate) fn is_standard(&self) -> bool {
1086        matches!(*self, MatchKind::Standard)
1087    }
1088
1089    #[inline]
1090    pub(crate) fn is_leftmost(&self) -> bool {
1091        matches!(*self, MatchKind::LeftmostFirst | MatchKind::LeftmostLongest)
1092    }
1093
1094    #[inline]
1095    pub(crate) fn is_leftmost_first(&self) -> bool {
1096        matches!(*self, MatchKind::LeftmostFirst)
1097    }
1098
1099    /// Convert this match kind into a packed match kind. If this match kind
1100    /// corresponds to standard semantics, then this returns None, since
1101    /// packed searching does not support standard semantics.
1102    #[inline]
1103    pub(crate) fn as_packed(&self) -> Option<crate::packed::MatchKind> {
1104        match *self {
1105            MatchKind::Standard => None,
1106            MatchKind::LeftmostFirst => {
1107                Some(crate::packed::MatchKind::LeftmostFirst)
1108            }
1109            MatchKind::LeftmostLongest => {
1110                Some(crate::packed::MatchKind::LeftmostLongest)
1111            }
1112        }
1113    }
1114}
1115
1116/// The kind of anchored starting configurations to support in an Aho-Corasick
1117/// searcher.
1118///
1119/// Depending on which searcher is used internally by
1120/// [`AhoCorasick`](crate::AhoCorasick), supporting both unanchored
1121/// and anchored searches can be quite costly. For this reason,
1122/// [`AhoCorasickBuilder::start_kind`](crate::AhoCorasickBuilder::start_kind)
1123/// can be used to configure whether your searcher supports unanchored,
1124/// anchored or both kinds of searches.
1125///
1126/// This searcher configuration knob works in concert with the search time
1127/// configuration [`Input::anchored`]. Namely, if one requests an unsupported
1128/// anchored mode, then the search will either panic or return an error,
1129/// depending on whether you're using infallible or fallibe APIs, respectively.
1130///
1131/// `AhoCorasick` by default only supports unanchored searches.
1132#[derive(Clone, Copy, Debug, Eq, PartialEq)]
1133pub enum StartKind {
1134    /// Support both anchored and unanchored searches.
1135    Both,
1136    /// Support only unanchored searches. Requesting an anchored search will
1137    /// return an error in fallible APIs and panic in infallible APIs.
1138    Unanchored,
1139    /// Support only anchored searches. Requesting an unanchored search will
1140    /// return an error in fallible APIs and panic in infallible APIs.
1141    Anchored,
1142}
1143
1144impl Default for StartKind {
1145    fn default() -> StartKind {
1146        StartKind::Unanchored
1147    }
1148}