aho_corasick/nfa/noncontiguous.rs

1/*!
2Provides a noncontiguous NFA implementation of Aho-Corasick.
3
4This is a low-level API that generally only needs to be used in niche
5circumstances. When possible, prefer using [`AhoCorasick`](crate::AhoCorasick)
6instead of a noncontiguous NFA directly. Using an `NFA` directly is typically
7only necessary when one needs access to the [`Automaton`] trait implementation.
8*/
9
10use alloc::{
11    collections::{BTreeSet, VecDeque},
12    vec,
13    vec::Vec,
14};
15
16use crate::{
17    automaton::Automaton,
18    util::{
19        alphabet::{ByteClassSet, ByteClasses},
20        error::{BuildError, MatchError},
21        prefilter::{self, opposite_ascii_case, Prefilter},
22        primitives::{IteratorIndexExt, PatternID, SmallIndex, StateID},
23        remapper::Remapper,
24        search::{Anchored, MatchKind},
25        special::Special,
26    },
27};
28
29/// A noncontiguous NFA implementation of Aho-Corasick.
30///
31/// When possible, prefer using [`AhoCorasick`](crate::AhoCorasick) instead of
32/// this type directly. Using an `NFA` directly is typically only necessary
33/// when one needs access to the [`Automaton`] trait implementation.
34///
35/// This NFA represents the "core" implementation of Aho-Corasick in this
/// crate. Namely, constructing this NFA involves building a trie and then
/// filling in the failure transitions between states, similar to what is
/// found in any standard textbook description of Aho-Corasick.
39///
40/// In order to minimize heap usage and to avoid additional construction costs,
41/// this implementation represents the transitions of all states as distinct
42/// sparse memory allocations. This is where it gets its name from. That is,
43/// this NFA has no contiguous memory allocation for its transition table. Each
44/// state gets its own allocation.
45///
46/// While the sparse representation keeps memory usage to somewhat reasonable
47/// levels, it is still quite large and also results in somewhat mediocre
48/// search performance. For this reason, it is almost always a good idea to
49/// use a [`contiguous::NFA`](crate::nfa::contiguous::NFA) instead. It is
50/// marginally slower to build, but has higher throughput and can sometimes use
51/// an order of magnitude less memory. The main reason to use a noncontiguous
52/// NFA is when you need the fastest possible construction time, or when a
/// contiguous NFA does not have the desired capacity. (A contiguous NFA
/// supports fewer total states than a noncontiguous NFA.)
55///
56/// # Example
57///
58/// This example shows how to build an `NFA` directly and use it to execute
59/// [`Automaton::try_find`]:
60///
61/// ```
62/// use aho_corasick::{
63///     automaton::Automaton,
64///     nfa::noncontiguous::NFA,
65///     Input, Match,
66/// };
67///
68/// let patterns = &["b", "abc", "abcd"];
69/// let haystack = "abcd";
70///
71/// let nfa = NFA::new(patterns).unwrap();
72/// assert_eq!(
73///     Some(Match::must(0, 1..2)),
74///     nfa.try_find(&Input::new(haystack))?,
75/// );
76/// # Ok::<(), Box<dyn std::error::Error>>(())
77/// ```
78///
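/// The match semantics can also be changed via a [`Builder`]. As a sketch of
/// the same search with leftmost-first semantics, the match for `abc` is
/// reported instead, since it starts earliest and appears before `abcd` in
/// the pattern list:
///
/// ```
/// use aho_corasick::{
///     automaton::Automaton,
///     nfa::noncontiguous::NFA,
///     Input, Match, MatchKind,
/// };
///
/// let patterns = &["b", "abc", "abcd"];
/// let haystack = "abcd";
///
/// let nfa = NFA::builder()
///     .match_kind(MatchKind::LeftmostFirst)
///     .build(patterns)
///     .unwrap();
/// assert_eq!(
///     Some(Match::must(1, 0..3)),
///     nfa.try_find(&Input::new(haystack))?,
/// );
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
///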
79/// It is also possible to implement your own version of `try_find`. See the
80/// [`Automaton`] documentation for an example.
81#[derive(Clone)]
82pub struct NFA {
83    /// The match semantics built into this NFA.
84    match_kind: MatchKind,
85    /// A set of states. Each state defines its own transitions, a fail
86    /// transition and a set of indices corresponding to matches.
87    ///
88    /// The first state is always the fail state, which is used only as a
    /// sentinel. Namely, in the final NFA, no transitions into the fail state
    /// exist. (Well, they do, but they aren't followed. Instead, the state's
91    /// failure transition is followed.)
92    ///
93    /// The second state (index 1) is always the dead state. Dead states are
94    /// in every automaton, but only used when leftmost-{first,longest} match
95    /// semantics are enabled. Specifically, they instruct search to stop
96    /// at specific points in order to report the correct match location. In
97    /// the standard Aho-Corasick construction, there are no transitions to
98    /// the dead state.
99    ///
100    /// The third state (index 2) is generally intended to be the starting or
101    /// "root" state.
102    states: Vec<State>,
103    /// Transitions stored in a sparse representation via a linked list.
104    ///
105    /// Each transition contains three pieces of information: the byte it
106    /// is defined for, the state it transitions to and a link to the next
107    /// transition in the same state (or `StateID::ZERO` if it is the last
108    /// transition).
109    ///
110    /// The first transition for each state is determined by `State::sparse`.
111    ///
112    /// Note that this contains a complete set of all transitions in this NFA,
113    /// including states that have a dense representation for transitions.
114    /// (Adding dense transitions for a state doesn't remove its sparse
115    /// transitions, since deleting transitions from this particular sparse
116    /// representation would be fairly expensive.)
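    ///
    /// As a sketch with hypothetical IDs: a state whose transitions are
    /// `b -> X` and `d -> Y` might be laid out as follows (transitions are
    /// kept sorted by byte; see `add_transition`):
    ///
    /// ```ignore
    /// State { sparse: 5, .. }
    /// sparse[5] = Transition { byte: b'b', next: X, link: 7 }
    /// sparse[7] = Transition { byte: b'd', next: Y, link: StateID::ZERO }
    /// ```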
117    sparse: Vec<Transition>,
118    /// Transitions stored in a dense representation.
119    ///
120    /// A state has a row in this table if and only if `State::dense` is
121    /// not equal to `StateID::ZERO`. When not zero, there are precisely
122    /// `NFA::byte_classes::alphabet_len()` entries beginning at `State::dense`
123    /// in this table.
124    ///
125    /// Generally a very small minority of states have a dense representation
126    /// since it uses so much memory.
127    dense: Vec<StateID>,
128    /// Matches stored in linked list for each state.
129    ///
130    /// Like sparse transitions, each match has a link to the next match in the
131    /// state.
132    ///
133    /// The first match for each state is determined by `State::matches`.
134    matches: Vec<Match>,
135    /// The length, in bytes, of each pattern in this NFA. This slice is
136    /// indexed by `PatternID`.
137    ///
138    /// The number of entries in this vector corresponds to the total number of
139    /// patterns in this automaton.
140    pattern_lens: Vec<SmallIndex>,
141    /// A prefilter for quickly skipping to candidate matches, if pertinent.
142    prefilter: Option<Prefilter>,
143    /// A set of equivalence classes in terms of bytes. We compute this while
144    /// building the NFA, but don't use it in the NFA's states. Instead, we
145    /// use this for building the DFA. We store it on the NFA since it's easy
146    /// to compute while visiting the patterns.
147    byte_classes: ByteClasses,
148    /// The length, in bytes, of the shortest pattern in this automaton. This
149    /// information is useful for detecting whether an automaton matches the
150    /// empty string or not.
151    min_pattern_len: usize,
152    /// The length, in bytes, of the longest pattern in this automaton. This
153    /// information is useful for keeping correct buffer sizes when searching
154    /// on streams.
155    max_pattern_len: usize,
156    /// The information required to deduce which states are "special" in this
157    /// NFA.
158    ///
159    /// Since the DEAD and FAIL states are always the first two states and
160    /// there are only ever two start states (which follow all of the match
161    /// states), it follows that we can determine whether a state is a fail,
162    /// dead, match or start with just a few comparisons on the ID itself:
163    ///
164    ///    is_dead(sid): sid == NFA::DEAD
165    ///    is_fail(sid): sid == NFA::FAIL
166    ///   is_match(sid): NFA::FAIL < sid && sid <= max_match_id
167    ///   is_start(sid): sid == start_unanchored_id || sid == start_anchored_id
168    ///
169    /// Note that this only applies to the NFA after it has been constructed.
170    /// During construction, the start states are the first ones added and the
    /// match states are interleaved with non-match states. Once all of the
172    /// states have been added, the states are shuffled such that the above
173    /// predicates hold.
174    special: Special,
175}
176
177impl NFA {
178    /// Create a new Aho-Corasick noncontiguous NFA using the default
179    /// configuration.
180    ///
181    /// Use a [`Builder`] if you want to change the configuration.
182    pub fn new<I, P>(patterns: I) -> Result<NFA, BuildError>
183    where
184        I: IntoIterator<Item = P>,
185        P: AsRef<[u8]>,
186    {
187        NFA::builder().build(patterns)
188    }
189
190    /// A convenience method for returning a new Aho-Corasick noncontiguous NFA
191    /// builder.
192    ///
193    /// This usually permits one to just import the `NFA` type.
194    pub fn builder() -> Builder {
195        Builder::new()
196    }
197}
198
199impl NFA {
200    /// The DEAD state is a sentinel state like the FAIL state. The DEAD state
201    /// instructs any search to stop and return any currently recorded match,
202    /// or no match otherwise. Generally speaking, it is impossible for an
203    /// unanchored standard search to enter a DEAD state. But an anchored
    /// search can, and so too can a leftmost search.
205    ///
206    /// We put DEAD before FAIL so that DEAD is always 0. We repeat this
    /// decision across the other Aho-Corasick automata, so that DEAD
208    /// states there are always 0 too. It's not that we need all of the
209    /// implementations to agree, but rather, the contiguous NFA and the DFA
210    /// use a sort of "premultiplied" state identifier where the only state
211    /// whose ID is always known and constant is the first state. Subsequent
212    /// state IDs depend on how much space has already been used in the
213    /// transition table.
214    pub(crate) const DEAD: StateID = StateID::new_unchecked(0);
    /// The FAIL state is the target of any transition that is not explicitly
    /// defined on a state. When one transitions into the FAIL
217    /// state, one must follow the previous state's failure transition before
218    /// doing the next state lookup. In this way, FAIL is more of a sentinel
219    /// than a state that one actually transitions into. In particular, it is
220    /// never exposed in the `Automaton` interface.
221    pub(crate) const FAIL: StateID = StateID::new_unchecked(1);
222
223    /// Returns the equivalence classes of bytes found while constructing
224    /// this NFA.
225    ///
226    /// Note that the NFA doesn't actually make use of these equivalence
227    /// classes. Instead, these are useful for building the DFA when desired.
228    pub(crate) fn byte_classes(&self) -> &ByteClasses {
229        &self.byte_classes
230    }
231
232    /// Returns a slice containing the length of each pattern in this searcher.
233    /// It is indexed by `PatternID` and has length `NFA::patterns_len`.
234    ///
235    /// This is exposed for convenience when building a contiguous NFA. But it
236    /// can be reconstructed from the `Automaton` API if necessary.
237    pub(crate) fn pattern_lens_raw(&self) -> &[SmallIndex] {
238        &self.pattern_lens
239    }
240
241    /// Returns a slice of all states in this non-contiguous NFA.
242    pub(crate) fn states(&self) -> &[State] {
243        &self.states
244    }
245
246    /// Returns the underlying "special" state information for this NFA.
247    pub(crate) fn special(&self) -> &Special {
248        &self.special
249    }
250
251    /// Swaps the states at `id1` and `id2`.
252    ///
253    /// This does not update the transitions of any state to account for the
254    /// state swap.
255    pub(crate) fn swap_states(&mut self, id1: StateID, id2: StateID) {
256        self.states.swap(id1.as_usize(), id2.as_usize());
257    }
258
259    /// Re-maps all state IDs in this NFA according to the `map` function
260    /// given.
261    pub(crate) fn remap(&mut self, map: impl Fn(StateID) -> StateID) {
262        let alphabet_len = self.byte_classes.alphabet_len();
263        for state in self.states.iter_mut() {
264            state.fail = map(state.fail);
265            let mut link = state.sparse;
266            while link != StateID::ZERO {
267                let t = &mut self.sparse[link];
268                t.next = map(t.next);
269                link = t.link;
270            }
271            if state.dense != StateID::ZERO {
272                let start = state.dense.as_usize();
273                for next in self.dense[start..][..alphabet_len].iter_mut() {
274                    *next = map(*next);
275                }
276            }
277        }
278    }
279
280    /// Iterate over all of the transitions for the given state ID.
281    pub(crate) fn iter_trans(
282        &self,
283        sid: StateID,
284    ) -> impl Iterator<Item = Transition> + '_ {
285        let mut link = self.states[sid].sparse;
286        core::iter::from_fn(move || {
287            if link == StateID::ZERO {
288                return None;
289            }
290            let t = self.sparse[link];
291            link = t.link;
292            Some(t)
293        })
294    }
295
296    /// Iterate over all of the matches for the given state ID.
297    pub(crate) fn iter_matches(
298        &self,
299        sid: StateID,
300    ) -> impl Iterator<Item = PatternID> + '_ {
301        let mut link = self.states[sid].matches;
302        core::iter::from_fn(move || {
303            if link == StateID::ZERO {
304                return None;
305            }
306            let m = self.matches[link];
307            link = m.link;
308            Some(m.pid)
309        })
310    }
311
312    /// Return the link following the one given. If the one given is the last
313    /// link for the given state, then return `None`.
314    ///
315    /// If no previous link is given, then this returns the first link in the
316    /// state, if one exists.
317    ///
318    /// This is useful for manually iterating over the transitions in a single
319    /// state without borrowing the NFA. This permits mutating other parts of
320    /// the NFA during iteration. Namely, one can access the transition pointed
321    /// to by the link via `self.sparse[link]`.
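    ///
    /// As a sketch, the intended idiom looks like this (it is used by
    /// `fill_failure_transitions` below):
    ///
    /// ```ignore
    /// let mut prev_link = None;
    /// while let Some(link) = self.next_link(sid, prev_link) {
    ///     prev_link = Some(link);
    ///     let t = self.sparse[link];
    ///     // ... mutate other parts of `self` here ...
    /// }
    /// ```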
322    fn next_link(
323        &self,
324        sid: StateID,
325        prev: Option<StateID>,
326    ) -> Option<StateID> {
327        let link =
328            prev.map_or(self.states[sid].sparse, |p| self.sparse[p].link);
329        if link == StateID::ZERO {
330            None
331        } else {
332            Some(link)
333        }
334    }
335
336    /// Follow the transition for the given byte in the given state. If no such
337    /// transition exists, then the FAIL state ID is returned.
338    #[inline(always)]
339    fn follow_transition(&self, sid: StateID, byte: u8) -> StateID {
340        let s = &self.states[sid];
341        // This is a special case that targets starting states and states
342        // near a start state. Namely, after the initial trie is constructed,
343        // we look for states close to the start state to convert to a dense
344        // representation for their transitions. This winds up using a lot more
345        // memory per state in exchange for faster transition lookups. But
346        // since we only do this for a small number of states (by default), the
347        // memory usage is usually minimal.
348        //
349        // This has *massive* benefit when executing searches because the
350        // unanchored starting state is by far the hottest state and is
351        // frequently visited. Moreover, the 'for' loop below that works
352        // decently on an actually sparse state is disastrous on a state that
353        // is nearly or completely dense.
354        if s.dense == StateID::ZERO {
355            self.follow_transition_sparse(sid, byte)
356        } else {
357            let class = usize::from(self.byte_classes.get(byte));
358            self.dense[s.dense.as_usize() + class]
359        }
360    }
361
362    /// Like `follow_transition`, but always uses the sparse representation.
363    #[inline(always)]
364    fn follow_transition_sparse(&self, sid: StateID, byte: u8) -> StateID {
365        for t in self.iter_trans(sid) {
366            if byte <= t.byte {
367                if byte == t.byte {
368                    return t.next;
369                }
370                break;
371            }
372        }
373        NFA::FAIL
374    }
375
376    /// Set the transition for the given byte to the state ID given.
377    ///
378    /// Note that one should not set transitions to the FAIL state. It is not
379    /// technically incorrect, but it wastes space. If a transition is not
380    /// defined, then it is automatically assumed to lead to the FAIL state.
381    fn add_transition(
382        &mut self,
383        prev: StateID,
384        byte: u8,
385        next: StateID,
386    ) -> Result<(), BuildError> {
387        if self.states[prev].dense != StateID::ZERO {
388            let dense = self.states[prev].dense;
389            let class = usize::from(self.byte_classes.get(byte));
390            self.dense[dense.as_usize() + class] = next;
391        }
392
393        let head = self.states[prev].sparse;
394        if head == StateID::ZERO || byte < self.sparse[head].byte {
395            let new_link = self.alloc_transition()?;
396            self.sparse[new_link] = Transition { byte, next, link: head };
397            self.states[prev].sparse = new_link;
398            return Ok(());
399        } else if byte == self.sparse[head].byte {
400            self.sparse[head].next = next;
401            return Ok(());
402        }
403
404        // We handled the only cases where the beginning of the transition
405        // chain needs to change. At this point, we now know that there is
406        // at least one entry in the transition chain and the byte for that
407        // transition is less than the byte for the transition we're adding.
408        let (mut link_prev, mut link_next) = (head, self.sparse[head].link);
409        while link_next != StateID::ZERO && byte > self.sparse[link_next].byte
410        {
411            link_prev = link_next;
412            link_next = self.sparse[link_next].link;
413        }
414        if link_next == StateID::ZERO || byte < self.sparse[link_next].byte {
415            let link = self.alloc_transition()?;
416            self.sparse[link] = Transition { byte, next, link: link_next };
417            self.sparse[link_prev].link = link;
418        } else {
419            assert_eq!(byte, self.sparse[link_next].byte);
420            self.sparse[link_next].next = next;
421        }
422        Ok(())
423    }
424
    /// This sets every possible transition (all 256 of them) for the given
    /// state to the given `next` value.
427    ///
428    /// This is useful for efficiently initializing start/dead states.
429    ///
430    /// # Panics
431    ///
432    /// This requires that the state has no transitions added to it already.
433    /// If it has any transitions, then this panics. It will also panic if
434    /// the state has been densified prior to calling this.
435    fn init_full_state(
436        &mut self,
437        prev: StateID,
438        next: StateID,
439    ) -> Result<(), BuildError> {
440        assert_eq!(
441            StateID::ZERO,
442            self.states[prev].dense,
443            "state must not be dense yet"
444        );
445        assert_eq!(
446            StateID::ZERO,
447            self.states[prev].sparse,
448            "state must have zero transitions"
449        );
450        let mut prev_link = StateID::ZERO;
451        for byte in 0..=255 {
452            let new_link = self.alloc_transition()?;
453            self.sparse[new_link] =
454                Transition { byte, next, link: StateID::ZERO };
455            if prev_link == StateID::ZERO {
456                self.states[prev].sparse = new_link;
457            } else {
458                self.sparse[prev_link].link = new_link;
459            }
460            prev_link = new_link;
461        }
462        Ok(())
463    }
464
465    /// Add a match for the given pattern ID to the state for the given ID.
466    fn add_match(
467        &mut self,
468        sid: StateID,
469        pid: PatternID,
470    ) -> Result<(), BuildError> {
471        let head = self.states[sid].matches;
472        let mut link = head;
473        while self.matches[link].link != StateID::ZERO {
474            link = self.matches[link].link;
475        }
476        let new_match_link = self.alloc_match()?;
477        self.matches[new_match_link].pid = pid;
478        if link == StateID::ZERO {
479            self.states[sid].matches = new_match_link;
480        } else {
481            self.matches[link].link = new_match_link;
482        }
483        Ok(())
484    }
485
486    /// Copy matches from the `src` state to the `dst` state. This is useful
    /// when a match state can be reached via a failure transition, in which
    /// case you'll want to copy the matches (if any) from the state reached
489    /// by the failure transition to the original state you were at.
490    fn copy_matches(
491        &mut self,
492        src: StateID,
493        dst: StateID,
494    ) -> Result<(), BuildError> {
495        let head_dst = self.states[dst].matches;
496        let mut link_dst = head_dst;
497        while self.matches[link_dst].link != StateID::ZERO {
498            link_dst = self.matches[link_dst].link;
499        }
500        let mut link_src = self.states[src].matches;
501        while link_src != StateID::ZERO {
502            let new_match_link =
503                StateID::new(self.matches.len()).map_err(|e| {
504                    BuildError::state_id_overflow(
505                        StateID::MAX.as_u64(),
506                        e.attempted(),
507                    )
508                })?;
509            self.matches.push(Match {
510                pid: self.matches[link_src].pid,
511                link: StateID::ZERO,
512            });
513            if link_dst == StateID::ZERO {
514                self.states[dst].matches = new_match_link;
515            } else {
516                self.matches[link_dst].link = new_match_link;
517            }
518
519            link_dst = new_match_link;
520            link_src = self.matches[link_src].link;
521        }
522        Ok(())
523    }
524
    /// Create a new entry in `NFA::sparse`, if there's room, and return that
526    /// entry's ID. If there's no room, then an error is returned.
527    fn alloc_transition(&mut self) -> Result<StateID, BuildError> {
528        let id = StateID::new(self.sparse.len()).map_err(|e| {
529            BuildError::state_id_overflow(StateID::MAX.as_u64(), e.attempted())
530        })?;
531        self.sparse.push(Transition::default());
532        Ok(id)
533    }
534
535    /// Create a new entry in `NFA::matches`, if there's room, and return that
536    /// entry's ID. If there's no room, then an error is returned.
537    fn alloc_match(&mut self) -> Result<StateID, BuildError> {
538        let id = StateID::new(self.matches.len()).map_err(|e| {
539            BuildError::state_id_overflow(StateID::MAX.as_u64(), e.attempted())
540        })?;
541        self.matches.push(Match::default());
542        Ok(id)
543    }
544
545    /// Create a new set of `N` transitions in this NFA's dense transition
    /// table. The ID returned corresponds to the index at which the `N`
547    /// transitions begin. So `id+0` is the first transition and `id+(N-1)` is
548    /// the last.
549    ///
550    /// `N` is determined via `NFA::byte_classes::alphabet_len`.
551    fn alloc_dense_state(&mut self) -> Result<StateID, BuildError> {
552        let id = StateID::new(self.dense.len()).map_err(|e| {
553            BuildError::state_id_overflow(StateID::MAX.as_u64(), e.attempted())
554        })?;
555        // We use FAIL because it's the correct default. If a state doesn't
556        // have a transition defined for every possible byte value, then the
557        // transition function should return NFA::FAIL.
558        self.dense.extend(
559            core::iter::repeat(NFA::FAIL)
560                .take(self.byte_classes.alphabet_len()),
561        );
562        Ok(id)
563    }
564
565    /// Allocate and add a fresh state to the underlying NFA and return its
566    /// ID (guaranteed to be one more than the ID of the previously allocated
567    /// state). If the ID would overflow `StateID`, then this returns an error.
568    fn alloc_state(&mut self, depth: usize) -> Result<StateID, BuildError> {
569        // This is OK because we error when building the trie if we see a
570        // pattern whose length cannot fit into a 'SmallIndex', and the longest
571        // possible depth corresponds to the length of the longest pattern.
572        let depth = SmallIndex::new(depth)
573            .expect("patterns longer than SmallIndex::MAX are not allowed");
574        let id = StateID::new(self.states.len()).map_err(|e| {
575            BuildError::state_id_overflow(StateID::MAX.as_u64(), e.attempted())
576        })?;
577        self.states.push(State {
578            sparse: StateID::ZERO,
579            dense: StateID::ZERO,
580            matches: StateID::ZERO,
581            fail: self.special.start_unanchored_id,
582            depth,
583        });
584        Ok(id)
585    }
586}
587
588// SAFETY: 'start_state' always returns a valid state ID, 'next_state' always
589// returns a valid state ID given a valid state ID. We otherwise claim that
590// all other methods are correct as well.
591unsafe impl Automaton for NFA {
592    #[inline(always)]
593    fn start_state(&self, anchored: Anchored) -> Result<StateID, MatchError> {
594        match anchored {
595            Anchored::No => Ok(self.special.start_unanchored_id),
596            Anchored::Yes => Ok(self.special.start_anchored_id),
597        }
598    }
599
600    #[inline(always)]
601    fn next_state(
602        &self,
603        anchored: Anchored,
604        mut sid: StateID,
605        byte: u8,
606    ) -> StateID {
607        // This terminates since:
608        //
609        // 1. state.fail never points to the FAIL state.
610        // 2. All state.fail values point to a state closer to the start state.
611        // 3. The start state has no transitions to the FAIL state.
612        loop {
613            let next = self.follow_transition(sid, byte);
614            if next != NFA::FAIL {
615                return next;
616            }
617            // For an anchored search, we never follow failure transitions
618            // because failure transitions lead us down a path to matching
619            // a *proper* suffix of the path we were on. Thus, it can only
620            // produce matches that appear after the beginning of the search.
621            if anchored.is_anchored() {
622                return NFA::DEAD;
623            }
624            sid = self.states[sid].fail();
625        }
626    }
627
628    #[inline(always)]
629    fn is_special(&self, sid: StateID) -> bool {
630        sid <= self.special.max_special_id
631    }
632
633    #[inline(always)]
634    fn is_dead(&self, sid: StateID) -> bool {
635        sid == NFA::DEAD
636    }
637
638    #[inline(always)]
639    fn is_match(&self, sid: StateID) -> bool {
640        // N.B. This returns true when sid==NFA::FAIL but that's okay because
641        // NFA::FAIL is not actually a valid state ID from the perspective of
642        // the Automaton trait. Namely, it is never returned by 'start_state'
643        // or by 'next_state'. So we don't need to care about it here.
644        !self.is_dead(sid) && sid <= self.special.max_match_id
645    }
646
647    #[inline(always)]
648    fn is_start(&self, sid: StateID) -> bool {
649        sid == self.special.start_unanchored_id
650            || sid == self.special.start_anchored_id
651    }
652
653    #[inline(always)]
654    fn match_kind(&self) -> MatchKind {
655        self.match_kind
656    }
657
658    #[inline(always)]
659    fn patterns_len(&self) -> usize {
660        self.pattern_lens.len()
661    }
662
663    #[inline(always)]
664    fn pattern_len(&self, pid: PatternID) -> usize {
665        self.pattern_lens[pid].as_usize()
666    }
667
668    #[inline(always)]
669    fn min_pattern_len(&self) -> usize {
670        self.min_pattern_len
671    }
672
673    #[inline(always)]
674    fn max_pattern_len(&self) -> usize {
675        self.max_pattern_len
676    }
677
678    #[inline(always)]
679    fn match_len(&self, sid: StateID) -> usize {
680        self.iter_matches(sid).count()
681    }
682
683    #[inline(always)]
684    fn match_pattern(&self, sid: StateID, index: usize) -> PatternID {
685        self.iter_matches(sid).nth(index).unwrap()
686    }
687
688    #[inline(always)]
689    fn memory_usage(&self) -> usize {
690        self.states.len() * core::mem::size_of::<State>()
691            + self.sparse.len() * core::mem::size_of::<Transition>()
692            + self.matches.len() * core::mem::size_of::<Match>()
693            + self.dense.len() * StateID::SIZE
694            + self.pattern_lens.len() * SmallIndex::SIZE
695            + self.prefilter.as_ref().map_or(0, |p| p.memory_usage())
696    }
697
698    #[inline(always)]
699    fn prefilter(&self) -> Option<&Prefilter> {
700        self.prefilter.as_ref()
701    }
702}
703
704/// A representation of a sparse NFA state for an Aho-Corasick automaton.
705///
706/// It contains the transitions to the next state, a failure transition for
707/// cases where there exists no other transition for the current input byte
708/// and the matches implied by visiting this state (if any).
709#[derive(Clone, Debug)]
710pub(crate) struct State {
    /// A pointer to `NFA::sparse` corresponding to the head of a linked list
712    /// containing all of the transitions for this state.
713    ///
714    /// This is `StateID::ZERO` if and only if this state has zero transitions.
715    sparse: StateID,
716    /// A pointer to a row of `N` transitions in `NFA::dense`. These
717    /// transitions correspond precisely to what is obtained by traversing
718    /// `sparse`, but permits constant time lookup.
719    ///
720    /// When this is zero (which is true for most states in the default
721    /// configuration), then this state has no dense representation.
722    ///
723    /// Note that `N` is equal to `NFA::byte_classes::alphabet_len()`. This is
724    /// typically much less than 256 (the maximum value).
725    dense: StateID,
726    /// A pointer to `NFA::matches` corresponding to the head of a linked list
727    /// containing all of the matches for this state.
728    ///
729    /// This is `StateID::ZERO` if and only if this state is not a match state.
730    matches: StateID,
731    /// The state that should be transitioned to if the current byte in the
732    /// haystack does not have a corresponding transition defined in this
733    /// state.
734    fail: StateID,
735    /// The depth of this state. Specifically, this is the distance from this
736    /// state to the starting state. (For the special sentinel states DEAD and
737    /// FAIL, their depth is always 0.) The depth of a starting state is 0.
738    ///
739    /// Note that depth is currently not used in this non-contiguous NFA. It
    /// may be in the future, but it is used in the contiguous NFA. Namely, it
741    /// permits an optimization where states near the starting state have their
742    /// transitions stored in a dense fashion, but all other states have their
743    /// transitions stored in a sparse fashion. (This non-contiguous NFA uses
744    /// a sparse representation for all states unconditionally.) In any case,
745    /// this is really the only convenient place to compute and store this
746    /// information, which we need when building the contiguous NFA.
747    depth: SmallIndex,
748}
749
750impl State {
751    /// Return true if and only if this state is a match state.
752    pub(crate) fn is_match(&self) -> bool {
753        self.matches != StateID::ZERO
754    }
755
756    /// Returns the failure transition for this state.
757    pub(crate) fn fail(&self) -> StateID {
758        self.fail
759    }
760
761    /// Returns the depth of this state. That is, the number of transitions
762    /// this state is from the start state of the NFA.
763    pub(crate) fn depth(&self) -> SmallIndex {
764        self.depth
765    }
766}
767
768/// A single transition in a non-contiguous NFA.
769#[derive(Clone, Copy, Default)]
770#[repr(packed)]
771pub(crate) struct Transition {
772    byte: u8,
773    next: StateID,
774    link: StateID,
775}
776
777impl Transition {
778    /// Return the byte for which this transition is defined.
779    pub(crate) fn byte(&self) -> u8 {
780        self.byte
781    }
782
783    /// Return the ID of the state that this transition points to.
784    pub(crate) fn next(&self) -> StateID {
785        self.next
786    }
787
788    /// Return the ID of the next transition.
789    fn link(&self) -> StateID {
790        self.link
791    }
792}
793
794impl core::fmt::Debug for Transition {
795    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
796        write!(
797            f,
798            "Transition(byte: {:X?}, next: {:?}, link: {:?})",
799            self.byte,
800            self.next().as_usize(),
801            self.link().as_usize()
802        )
803    }
804}
805
806/// A single match in a non-contiguous NFA.
807#[derive(Clone, Copy, Default)]
808struct Match {
809    pid: PatternID,
810    link: StateID,
811}
812
813impl Match {
814    /// Return the pattern ID for this match.
815    pub(crate) fn pattern(&self) -> PatternID {
816        self.pid
817    }
818
819    /// Return the ID of the next match.
820    fn link(&self) -> StateID {
821        self.link
822    }
823}
824
825impl core::fmt::Debug for Match {
826    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
827        write!(
828            f,
829            "Match(pid: {:?}, link: {:?})",
830            self.pattern().as_usize(),
831            self.link().as_usize()
832        )
833    }
834}
835
836/// A builder for configuring an Aho-Corasick noncontiguous NFA.
837///
838/// This builder has a subset of the options available to a
839/// [`AhoCorasickBuilder`](crate::AhoCorasickBuilder). Of the shared options,
840/// their behavior is identical.
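///
/// # Example
///
/// This example gives a sketch of building a noncontiguous NFA with
/// leftmost-longest match semantics:
///
/// ```
/// use aho_corasick::{
///     automaton::Automaton,
///     nfa::noncontiguous::NFA,
///     Input, Match, MatchKind,
/// };
///
/// let nfa = NFA::builder()
///     .match_kind(MatchKind::LeftmostLongest)
///     .build(&["b", "abc", "abcd"])
///     .unwrap();
/// assert_eq!(
///     Some(Match::must(2, 0..4)),
///     nfa.try_find(&Input::new("abcd"))?,
/// );
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```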
841#[derive(Clone, Debug)]
842pub struct Builder {
843    match_kind: MatchKind,
844    prefilter: bool,
845    ascii_case_insensitive: bool,
846    dense_depth: usize,
847}
848
849impl Default for Builder {
850    fn default() -> Builder {
851        Builder {
852            match_kind: MatchKind::default(),
853            prefilter: true,
854            ascii_case_insensitive: false,
855            dense_depth: 3,
856        }
857    }
858}
859
860impl Builder {
861    /// Create a new builder for configuring an Aho-Corasick noncontiguous NFA.
862    pub fn new() -> Builder {
863        Builder::default()
864    }
865
866    /// Build an Aho-Corasick noncontiguous NFA from the given iterator of
867    /// patterns.
868    ///
869    /// A builder may be reused to create more NFAs.
870    pub fn build<I, P>(&self, patterns: I) -> Result<NFA, BuildError>
871    where
872        I: IntoIterator<Item = P>,
873        P: AsRef<[u8]>,
874    {
875        debug!("building non-contiguous NFA");
876        let nfa = Compiler::new(self)?.compile(patterns)?;
877        debug!(
878            "non-contiguous NFA built, <states: {:?}, size: {:?}>",
879            nfa.states.len(),
880            nfa.memory_usage()
881        );
882        Ok(nfa)
883    }
884
885    /// Set the desired match semantics.
886    ///
887    /// See
888    /// [`AhoCorasickBuilder::match_kind`](crate::AhoCorasickBuilder::match_kind)
889    /// for more documentation and examples.
890    pub fn match_kind(&mut self, kind: MatchKind) -> &mut Builder {
891        self.match_kind = kind;
892        self
893    }
894
895    /// Enable ASCII-aware case insensitive matching.
896    ///
897    /// See
898    /// [`AhoCorasickBuilder::ascii_case_insensitive`](crate::AhoCorasickBuilder::ascii_case_insensitive)
899    /// for more documentation and examples.
900    pub fn ascii_case_insensitive(&mut self, yes: bool) -> &mut Builder {
901        self.ascii_case_insensitive = yes;
902        self
903    }
904
905    /// Set the limit on how many states use a dense representation for their
906    /// transitions. Other states will generally use a sparse representation.
907    ///
908    /// See
909    /// [`AhoCorasickBuilder::dense_depth`](crate::AhoCorasickBuilder::dense_depth)
910    /// for more documentation and examples.
911    pub fn dense_depth(&mut self, depth: usize) -> &mut Builder {
912        self.dense_depth = depth;
913        self
914    }
915
916    /// Enable heuristic prefilter optimizations.
917    ///
918    /// See
919    /// [`AhoCorasickBuilder::prefilter`](crate::AhoCorasickBuilder::prefilter)
920    /// for more documentation and examples.
921    pub fn prefilter(&mut self, yes: bool) -> &mut Builder {
922        self.prefilter = yes;
923        self
924    }
925}
926
927/// A compiler uses a builder configuration and builds up the NFA formulation
928/// of an Aho-Corasick automaton. This roughly corresponds to the standard
929/// formulation described in textbooks, with some tweaks to support leftmost
930/// searching.
931#[derive(Debug)]
932struct Compiler<'a> {
933    builder: &'a Builder,
934    prefilter: prefilter::Builder,
935    nfa: NFA,
936    byteset: ByteClassSet,
937}
938
939impl<'a> Compiler<'a> {
940    fn new(builder: &'a Builder) -> Result<Compiler<'a>, BuildError> {
941        let prefilter = prefilter::Builder::new(builder.match_kind)
942            .ascii_case_insensitive(builder.ascii_case_insensitive);
943        Ok(Compiler {
944            builder,
945            prefilter,
946            nfa: NFA {
947                match_kind: builder.match_kind,
948                states: vec![],
949                sparse: vec![],
950                dense: vec![],
951                matches: vec![],
952                pattern_lens: vec![],
953                prefilter: None,
954                byte_classes: ByteClasses::singletons(),
955                min_pattern_len: usize::MAX,
956                max_pattern_len: 0,
957                special: Special::zero(),
958            },
959            byteset: ByteClassSet::empty(),
960        })
961    }
962
963    fn compile<I, P>(mut self, patterns: I) -> Result<NFA, BuildError>
964    where
965        I: IntoIterator<Item = P>,
966        P: AsRef<[u8]>,
967    {
968        // Add dummy transition/match links, so that no valid link will point
969        // to another link at index 0.
970        self.nfa.sparse.push(Transition::default());
971        self.nfa.matches.push(Match::default());
972        // Add a dummy dense transition so that no states can have dense==0
973        // represent a valid pointer to dense transitions. This permits
974        // dense==0 to be a sentinel indicating "no dense transitions."
975        self.nfa.dense.push(NFA::DEAD);
976        // the dead state, only used for leftmost and fixed to id==0
977        self.nfa.alloc_state(0)?;
978        // the fail state, which is never entered and fixed to id==1
979        self.nfa.alloc_state(0)?;
980        // unanchored start state, initially fixed to id==2 but later shuffled
981        // to appear after all non-start match states.
982        self.nfa.special.start_unanchored_id = self.nfa.alloc_state(0)?;
983        // anchored start state, initially fixed to id==3 but later shuffled
984        // to appear after unanchored start state.
985        self.nfa.special.start_anchored_id = self.nfa.alloc_state(0)?;
986        // Initialize the unanchored starting state in order to make it dense,
987        // and thus make transition lookups on this state faster.
988        self.init_unanchored_start_state()?;
989        // Set all transitions on the DEAD state to point to itself. This way,
990        // the DEAD state can never be escaped. It MUST be used as a sentinel
991        // in any correct search.
992        self.add_dead_state_loop()?;
993        // Build the base trie from the given patterns.
994        self.build_trie(patterns)?;
995        self.nfa.states.shrink_to_fit();
996        // Turn our set of bytes into equivalent classes. This NFA
997        // implementation uses byte classes only for states that use a dense
998        // representation of transitions. (And that's why this comes before
999        // `self.densify()`, as the byte classes need to be set first.)
1000        self.nfa.byte_classes = self.byteset.byte_classes();
1001        // Add transitions (and maybe matches) to the anchored starting state.
1002        // The anchored starting state is used for anchored searches. The only
1003        // mechanical difference between it and the unanchored start state is
1004        // that missing transitions map to the DEAD state instead of the FAIL
1005        // state.
1006        self.set_anchored_start_state()?;
1007        // Rewrite transitions to the FAIL state on the unanchored start state
1008        // as self-transitions. This keeps the start state active at all times.
1009        self.add_unanchored_start_state_loop();
1010        // Make some (possibly zero) states use a dense representation for
1011        // transitions. It's important to do this right after the states
1012        // and non-failure transitions are solidified. That way, subsequent
1013        // accesses (particularly `fill_failure_transitions`) will benefit from
1014        // the faster transition lookup in densified states.
1015        self.densify()?;
1016        // The meat of the Aho-Corasick algorithm: compute and write failure
1017        // transitions. i.e., the state to move to when a transition isn't
1018        // defined in the current state. These are epsilon transitions and thus
1019        // make this formulation an NFA.
1020        self.fill_failure_transitions()?;
1021        // Handle a special case under leftmost semantics when at least one
1022        // of the patterns is the empty string.
1023        self.close_start_state_loop_for_leftmost();
1024        // Shuffle states so that we have DEAD, FAIL, MATCH, ..., START, START,
1025        // NON-MATCH, ... This permits us to very quickly query the type of
1026        // the state we're currently in during a search.
1027        self.shuffle();
1028        self.nfa.prefilter = self.prefilter.build();
1029        // Store the maximum ID of all *relevant* special states. Start states
1030        // are only relevant when we have a prefilter, otherwise, there is zero
1031        // reason to care about whether a state is a start state or not during
1032        // a search. Indeed, without a prefilter, we are careful to explicitly
1033        // NOT care about start states, otherwise the search can ping pong
1034        // between the unrolled loop and the handling of special-status states
1035        // and destroy perf.
1036        self.nfa.special.max_special_id = if self.nfa.prefilter.is_some() {
1037            // Why the anchored starting state? Because we always put it
1038            // after the unanchored starting state and it is therefore the
1039            // maximum. Why put unanchored followed by anchored? No particular
1040            // reason, but that's how the states are logically organized in the
1041            // Thompson NFA implementation found in regex-automata. ¯\_(ツ)_/¯
1042            self.nfa.special.start_anchored_id
1043        } else {
1044            self.nfa.special.max_match_id
1045        };
1046        self.nfa.sparse.shrink_to_fit();
1047        self.nfa.dense.shrink_to_fit();
1048        self.nfa.matches.shrink_to_fit();
1049        self.nfa.pattern_lens.shrink_to_fit();
1050        Ok(self.nfa)
1051    }
1052
1053    /// This sets up the initial prefix trie that makes up the Aho-Corasick
1054    /// automaton. Effectively, it creates the basic structure of the
1055    /// automaton, where every pattern given has a path from the start state to
1056    /// the end of the pattern.
1057    fn build_trie<I, P>(&mut self, patterns: I) -> Result<(), BuildError>
1058    where
1059        I: IntoIterator<Item = P>,
1060        P: AsRef<[u8]>,
1061    {
1062        'PATTERNS: for (i, pat) in patterns.into_iter().enumerate() {
1063            let pid = PatternID::new(i).map_err(|e| {
1064                BuildError::pattern_id_overflow(
1065                    PatternID::MAX.as_u64(),
1066                    e.attempted(),
1067                )
1068            })?;
1069            let pat = pat.as_ref();
1070            let patlen = SmallIndex::new(pat.len())
1071                .map_err(|_| BuildError::pattern_too_long(pid, pat.len()))?;
1072            self.nfa.min_pattern_len =
1073                core::cmp::min(self.nfa.min_pattern_len, pat.len());
1074            self.nfa.max_pattern_len =
1075                core::cmp::max(self.nfa.max_pattern_len, pat.len());
1076            assert_eq!(
1077                i,
1078                self.nfa.pattern_lens.len(),
1079                "expected number of patterns to match pattern ID"
1080            );
1081            self.nfa.pattern_lens.push(patlen);
1082            // We add the pattern to the prefilter here because the pattern
1083            // ID in the prefilter is determined with respect to the patterns
1084            // added to the prefilter. That is, it isn't the ID we have here,
1085            // but the one determined by its own accounting of patterns.
1086            // To ensure they line up, we add every pattern we see to the
1087            // prefilter, even if some patterns ultimately are impossible to
1088            // match (in leftmost-first semantics specifically).
1089            //
1090            // Another way of doing this would be to expose an API in the
1091            // prefilter to permit setting your own pattern IDs. Or to just use
1092            // our own map and go between them. But this case is sufficiently
1093            // rare that we don't bother and just make sure they're in sync.
1094            if self.builder.prefilter {
1095                self.prefilter.add(pat);
1096            }
1097
1098            let mut prev = self.nfa.special.start_unanchored_id;
1099            let mut saw_match = false;
1100            for (depth, &b) in pat.iter().enumerate() {
1101                // When leftmost-first match semantics are requested, we
1102                // specifically stop adding patterns when a previously added
1103                // pattern is a prefix of it. We avoid adding it because
1104                // leftmost-first semantics imply that the pattern can never
1105                // match. This is not just an optimization to save space! It
1106                // is necessary for correctness. In fact, this is the only
1107                // difference in the automaton between the implementations for
1108                // leftmost-first and leftmost-longest.
1109                saw_match = saw_match || self.nfa.states[prev].is_match();
1110                if self.builder.match_kind.is_leftmost_first() && saw_match {
1111                    // Skip to the next pattern immediately. This avoids
1112                    // incorrectly adding a match after this loop terminates.
1113                    continue 'PATTERNS;
1114                }
1115
1116                // Add this byte to our equivalence classes. These don't
1117                // get used while building the trie, but other Aho-Corasick
1118                // implementations may use them.
1119                self.byteset.set_range(b, b);
1120                if self.builder.ascii_case_insensitive {
1121                    let b = opposite_ascii_case(b);
1122                    self.byteset.set_range(b, b);
1123                }
1124
1125                // If the transition from prev using the current byte already
1126                // exists, then just move through it. Otherwise, add a new
1127                // state. We track the depth here so that we can determine
1128                // how to represent transitions. States near the start state
1129                // use a dense representation that uses more memory but is
1130                // faster. Other states use a sparse representation that uses
1131                // less memory but is slower.
1132                let next = self.nfa.follow_transition(prev, b);
1133                if next != NFA::FAIL {
1134                    prev = next;
1135                } else {
1136                    let next = self.nfa.alloc_state(depth)?;
1137                    self.nfa.add_transition(prev, b, next)?;
1138                    if self.builder.ascii_case_insensitive {
1139                        let b = opposite_ascii_case(b);
1140                        self.nfa.add_transition(prev, b, next)?;
1141                    }
1142                    prev = next;
1143                }
1144            }
1145            // Once the pattern has been added, log the match in the final
1146            // state that it reached.
1147            self.nfa.add_match(prev, pid)?;
1148        }
1149        Ok(())
1150    }
1151
1152    /// This routine creates failure transitions according to the standard
1153    /// textbook formulation of the Aho-Corasick algorithm, with a couple small
1154    /// tweaks to support "leftmost" semantics.
1155    ///
1156    /// Building failure transitions is the most interesting part of building
1157    /// the Aho-Corasick automaton, because they are what allow searches to
1158    /// be performed in linear time. Specifically, a failure transition is
1159    /// a single transition associated with each state that points back to
1160    /// the longest proper suffix of the pattern being searched. The failure
1161    /// transition is followed whenever there exists no transition on the
1162    /// current state for the current input byte. If there is no other proper
1163    /// suffix, then the failure transition points back to the starting state.
1164    ///
1165    /// For example, let's say we built an Aho-Corasick automaton with the
1166    /// following patterns: 'abcd' and 'cef'. The trie looks like this:
1167    ///
1168    /// ```ignore
1169    ///          a - S1 - b - S2 - c - S3 - d - S4*
1170    ///         /
1171    ///     S0 - c - S5 - e - S6 - f - S7*
1172    /// ```
1173    ///
1174    /// At this point, it should be fairly straight-forward to see how this
1175    /// trie can be used in a simplistic way. At any given position in the
1176    /// text we're searching (called the "subject" string), all we need to do
1177    /// is follow the transitions in the trie by consuming one transition for
1178    /// each byte in the subject string. If we reach a match state, then we can
1179    /// report that location as a match.
1180    ///
1181    /// The trick comes when searching a subject string like 'abcef'. We'll
1182    /// initially follow the transition from S0 to S1 and wind up in S3 after
    /// observing the 'c' byte. At this point, the next byte is 'e' but state
1184    /// S3 has no transition for 'e', so the search fails. We then would need
1185    /// to restart the search at the next position in 'abcef', which
1186    /// corresponds to 'b'. The match would fail, but the next search starting
1187    /// at 'c' would finally succeed. The problem with this approach is that
1188    /// we wind up searching the subject string potentially many times. In
1189    /// effect, this makes the algorithm have worst case `O(n * m)` complexity,
1190    /// where `n ~ len(subject)` and `m ~ len(all patterns)`. We would instead
1191    /// like to achieve a `O(n + m)` worst case complexity.
1192    ///
1193    /// This is where failure transitions come in. Instead of dying at S3 in
1194    /// the first search, the automaton can instruct the search to move to
1195    /// another part of the automaton that corresponds to a suffix of what
1196    /// we've seen so far. Recall that we've seen 'abc' in the subject string,
1197    /// and the automaton does indeed have a non-empty suffix, 'c', that could
1198    /// potentially lead to another match. Thus, the actual Aho-Corasick
1199    /// automaton for our patterns in this case looks like this:
1200    ///
1201    /// ```ignore
1202    ///          a - S1 - b - S2 - c - S3 - d - S4*
1203    ///         /                      /
1204    ///        /       ----------------
1205    ///       /       /
1206    ///     S0 - c - S5 - e - S6 - f - S7*
1207    /// ```
1208    ///
1209    /// That is, we have a failure transition from S3 to S5, which is followed
1210    /// exactly in cases when we are in state S3 but see any byte other than
1211    /// 'd' (that is, we've "failed" to find a match in this portion of our
1212    /// trie). We know we can transition back to S5 because we've already seen
1213    /// a 'c' byte, so we don't need to re-scan it. We can then pick back up
1214    /// with the search starting at S5 and complete our match.
1215    ///
1216    /// Adding failure transitions to a trie is fairly simple, but subtle. The
    /// key issue is that you might have multiple failure transitions that you
1218    /// need to follow. For example, look at the trie for the patterns
1219    /// 'abcd', 'b', 'bcd' and 'cd':
1220    ///
1221    /// ```ignore
1222    ///          - a - S1 - b - S2* - c - S3 - d - S4*
1223    ///         /               /         /
1224    ///        /         -------   -------
1225    ///       /         /         /
1226    ///     S0 --- b - S5* - c - S6 - d - S7*
1227    ///       \                  /
1228    ///        \         --------
1229    ///         \       /
1230    ///          - c - S8 - d - S9*
1231    /// ```
1232    ///
1233    /// The failure transitions for this trie are defined from S2 to S5,
1234    /// S3 to S6 and S6 to S8. Moreover, state S2 needs to track that it
1235    /// corresponds to a match, since its failure transition to S5 is itself
1236    /// a match state.
1237    ///
    /// Perhaps the simplest way to think about adding these failure transitions
1239    /// is recursively. That is, if you know the failure transitions for every
1240    /// possible previous state that could be visited (e.g., when computing the
1241    /// failure transition for S3, you already know the failure transitions
1242    /// for S0, S1 and S2), then you can simply follow the failure transition
1243    /// of the previous state and check whether the incoming transition is
1244    /// defined after following the failure transition.
1245    ///
1246    /// For example, when determining the failure state for S3, by our
1247    /// assumptions, we already know that there is a failure transition from
1248    /// S2 (the previous state) to S5. So we follow that transition and check
1249    /// whether the transition connecting S2 to S3 is defined. Indeed, it is,
1250    /// as there is a transition from S5 to S6 for the byte 'c'. If no such
1251    /// transition existed, we could keep following the failure transitions
1252    /// until we reach the start state, which is the failure transition for
1253    /// every state that has no corresponding proper suffix.
1254    ///
1255    /// We don't actually use recursion to implement this, but instead, use a
1256    /// breadth first search of the automaton. Our base case is the start
1257    /// state, whose failure transition is just a transition to itself.
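    ///
    /// In rough pseudo-code, the breadth first search looks like the sketch
    /// below. It ignores the leftmost special cases described next, and
    /// `transitions_of`, `fail_of`, `set_fail` and `follow` are stand-ins for
    /// operations on the NFA rather than real methods:
    ///
    /// ```ignore
    /// // The queue starts out seeded with every state that is one
    /// // transition away from the start state. Their failure transition is
    /// // the start state itself, which is already the default.
    /// while let Some(s) = queue.pop_front() {
    ///     for (byte, next) in transitions_of(s) {
    ///         let mut f = fail_of(s);
    ///         // This terminates because the start state defines a
    ///         // transition for every byte and so never leads to FAIL.
    ///         while follow(f, byte) == FAIL {
    ///             f = fail_of(f);
    ///         }
    ///         set_fail(next, follow(f, byte));
    ///         // If the new failure state is a match state, then 'next'
    ///         // inherits its matches. (See 'copy_matches'.)
    ///         queue.push_back(next);
    ///     }
    /// }
    /// ```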
1258    ///
1259    /// When building a leftmost automaton, we proceed as above, but only
1260    /// include a subset of failure transitions. Namely, we omit any failure
1261    /// transitions that appear after a match state in the trie. This is
1262    /// because failure transitions always point back to a proper suffix of
1263    /// what has been seen so far. Thus, following a failure transition after
1264    /// a match implies looking for a match that starts after the one that has
1265    /// already been seen, which is of course therefore not the leftmost match.
1266    ///
1267    /// N.B. I came up with this algorithm on my own, and after scouring all of
1268    /// the other AC implementations I know of (Perl, Snort, many on GitHub).
1269    /// I couldn't find any that implement leftmost semantics like this.
1270    /// Perl of course needs leftmost-first semantics, but they implement it
1271    /// with a seeming hack at *search* time instead of encoding it into the
1272    /// automaton. There are also a couple Java libraries that support leftmost
1273    /// longest semantics, but they do it by building a queue of matches at
1274    /// search time, which is even worse than what Perl is doing. ---AG
1275    fn fill_failure_transitions(&mut self) -> Result<(), BuildError> {
1276        let is_leftmost = self.builder.match_kind.is_leftmost();
1277        let start_uid = self.nfa.special.start_unanchored_id;
1278        // Initialize the queue for breadth first search with all transitions
1279        // out of the start state. We handle the start state specially because
1280        // we only want to follow non-self transitions. If we followed self
1281        // transitions, then this would never terminate.
1282        let mut queue = VecDeque::new();
1283        let mut seen = self.queued_set();
1284        let mut prev_link = None;
1285        while let Some(link) = self.nfa.next_link(start_uid, prev_link) {
1286            prev_link = Some(link);
1287            let t = self.nfa.sparse[link];
1288
1289            // Skip anything we've seen before and any self-transitions on the
1290            // start state.
1291            if start_uid == t.next() || seen.contains(t.next) {
1292                continue;
1293            }
1294            queue.push_back(t.next);
1295            seen.insert(t.next);
1296            // Under leftmost semantics, if a state immediately following
1297            // the start state is a match state, then we never want to
1298            // follow its failure transition since the failure transition
1299            // necessarily leads back to the start state, which we never
1300            // want to do for leftmost matching after a match has been
1301            // found.
1302            //
1303            // We apply the same logic to non-start states below as well.
1304            if is_leftmost && self.nfa.states[t.next].is_match() {
1305                self.nfa.states[t.next].fail = NFA::DEAD;
1306            }
1307        }
1308        while let Some(id) = queue.pop_front() {
1309            let mut prev_link = None;
1310            while let Some(link) = self.nfa.next_link(id, prev_link) {
1311                prev_link = Some(link);
1312                let t = self.nfa.sparse[link];
1313
1314                if seen.contains(t.next) {
1315                    // The only way to visit a duplicate state in a transition
1316                    // list is when ASCII case insensitivity is enabled. In
1317                    // this case, we want to skip it since it's redundant work.
1318                    // But it would also end up duplicating matches, which
1319                    // results in reporting duplicate matches in some cases.
1320                    // See the 'acasei010' regression test.
1321                    continue;
1322                }
1323                queue.push_back(t.next);
1324                seen.insert(t.next);
1325
1326                // As above for start states, under leftmost semantics, once
1327                // we see a match all subsequent states should have no failure
1328                // transitions because failure transitions always imply looking
1329                // for a match that is a suffix of what has been seen so far
1330                // (where "seen so far" corresponds to the string formed by
1331                // following the transitions from the start state to the
1332                // current state). Under leftmost semantics, we specifically do
1333                // not want to allow this to happen because we always want to
1334                // report the match found at the leftmost position.
1335                //
1336                // The difference between leftmost-first and leftmost-longest
1337                // occurs previously while we build the trie. For
1338                // leftmost-first, we simply omit any entries that would
1339                // otherwise require passing through a match state.
1340                //
1341                // Note that for correctness, the failure transition has to be
1342                // set to the dead state for ALL states following a match, not
1343                // just the match state itself. However, by setting the failure
1344                // transition to the dead state on all match states, the dead
1345                // state will automatically propagate to all subsequent states
1346                // via the failure state computation below.
1347                if is_leftmost && self.nfa.states[t.next].is_match() {
1348                    self.nfa.states[t.next].fail = NFA::DEAD;
1349                    continue;
1350                }
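                // Compute the failure transition for 't.next': walk up the
                // failure chain of the current state until we reach a state
                // with a transition defined for 't.byte', then follow it.
                // The walk bottoms out at the unanchored start state, which
                // by this point in construction has a transition defined for
                // every byte (see 'add_unanchored_start_state_loop'), so the
                // loop below always terminates.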
1351                let mut fail = self.nfa.states[id].fail;
1352                while self.nfa.follow_transition(fail, t.byte) == NFA::FAIL {
1353                    fail = self.nfa.states[fail].fail;
1354                }
1355                fail = self.nfa.follow_transition(fail, t.byte);
1356                self.nfa.states[t.next].fail = fail;
1357                self.nfa.copy_matches(fail, t.next)?;
1358            }
1359            // If the start state is a match state, then this automaton can
1360            // match the empty string. This implies all states are match states
1361            // since every position matches the empty string, so copy the
1362            // matches from the start state to every state. Strictly speaking,
1363            // this is only necessary for overlapping matches since each
1364            // non-empty non-start match state needs to report empty matches
1365            // in addition to its own. For the non-overlapping case, such
1366            // states only report the first match, which is never empty since
1367            // it isn't a start state.
1368            if !is_leftmost {
1369                self.nfa
1370                    .copy_matches(self.nfa.special.start_unanchored_id, id)?;
1371            }
1372        }
1373        Ok(())
1374    }
1375
1376    /// Shuffle the states so that they appear in this sequence:
1377    ///
1378    ///   DEAD, FAIL, MATCH..., START, START, NON-MATCH...
1379    ///
1380    /// The idea here is that if we know how special states are laid out in our
1381    /// transition table, then we can determine what "kind" of state we're in
1382    /// just by comparing our current state ID with a particular value. In this
1383    /// way, we avoid doing extra memory lookups.
1384    ///
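    /// For example, with this layout a search loop can classify the current
    /// state using nothing but ID comparisons. Roughly (this is only a
    /// sketch, not the actual search code):
    ///
    /// ```ignore
    /// if sid <= special.start_anchored_id {
    ///     // 'sid' is special in some way...
    ///     if sid == NFA::DEAD {
    ///         // ...the search is done
    ///     } else if sid <= special.max_match_id {
    ///         // ...'sid' is a match state
    ///     } else {
    ///         // ...'sid' is a start state (e.g., try a prefilter)
    ///     }
    /// }
    /// ```
    ///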
1385    /// Before shuffling begins, our states look something like this:
1386    ///
1387    ///   DEAD, FAIL, START, START, (MATCH | NON-MATCH)...
1388    ///
1389    /// So all we need to do is move all of the MATCH states so that they
1390    /// all appear before any NON-MATCH state, like so:
1391    ///
1392    ///   DEAD, FAIL, START, START, MATCH... NON-MATCH...
1393    ///
1394    /// Then it's just a simple matter of swapping the two START states with
1395    /// the last two MATCH states.
1396    ///
1397    /// (This is the same technique used for fully compiled DFAs in
1398    /// regex-automata.)
1399    fn shuffle(&mut self) {
1400        let old_start_uid = self.nfa.special.start_unanchored_id;
1401        let old_start_aid = self.nfa.special.start_anchored_id;
1402        assert!(old_start_uid < old_start_aid);
1403        assert_eq!(
1404            3,
1405            old_start_aid.as_usize(),
1406            "anchored start state should be at index 3"
1407        );
1408        // We implement shuffling by a sequence of pairwise swaps of states.
1409        // Since we have a number of things referencing states via their
1410        // IDs and swapping them changes their IDs, we need to record every
1411        // swap we make so that we can remap IDs. The remapper handles this
1412        // book-keeping for us.
1413        let mut remapper = Remapper::new(&self.nfa, 0);
1414        // The way we proceed here is by moving all match states so that
1415        // they directly follow the start states. So it will go: DEAD, FAIL,
1416        // START-UNANCHORED, START-ANCHORED, MATCH, ..., NON-MATCH, ...
1417        //
1418        // To do that, we proceed forward through all states after
1419        // START-ANCHORED and swap match states so that they appear before all
1420        // non-match states.
1421        let mut next_avail = StateID::from(4u8);
1422        for i in next_avail.as_usize()..self.nfa.states.len() {
1423            let sid = StateID::new(i).unwrap();
1424            if !self.nfa.states[sid].is_match() {
1425                continue;
1426            }
1427            remapper.swap(&mut self.nfa, sid, next_avail);
1428            // The key invariant here is that only non-match states exist
1429            // between 'next_avail' and 'sid' (with them being potentially
1430            // equivalent). Thus, incrementing 'next_avail' by 1 is guaranteed
1431            // to land on the leftmost non-match state. (Unless 'next_avail'
1432            // and 'sid' are equivalent, in which case, a swap will occur but
1433            // it is a no-op.)
1434            next_avail = StateID::new(next_avail.one_more()).unwrap();
1435        }
1436        // Now we'd like to move the start states to immediately following the
1437        // match states. (The start states may themselves be match states, but
1438        // we'll handle that later.) We arrange the states this way so that we
1439        // don't necessarily need to check whether a state is a start state or
1440        // not before checking whether a state is a match state. For example,
1441        // we'd like to be able to write this as our state machine loop:
1442        //
1443        //   sid = start()
1444        //   for byte in haystack:
1445        //     sid = next(sid, byte)
1446        //     if sid <= nfa.max_start_id:
1447        //       if sid <= nfa.max_dead_id:
1448        //         # search complete
1449        //       elif sid <= nfa.max_match_id:
1450        //         # found match
1451        //
1452        // The important context here is that we might not want to look for
1453        // start states at all. Namely, if a searcher doesn't have a prefilter,
1454        // then there is no reason to care about whether we're in a start state
1455        // or not. And indeed, if we did check for it, this very hot loop would
1456        // ping pong between the special state handling and the main state
1457        // transition logic. This in turn stalls the CPU by killing branch
1458        // prediction.
1459        //
1460        // So essentially, we really want to be able to "forget" that start
1461        // states even exist and this is why we put them at the end.
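        //
        // As a concrete example, suppose there are three match states (none
        // of which are start states). After the loop above, the layout is:
        //
        //   [DEAD, FAIL, START-U, START-A, M, M, M, N, ...]
        //
        // and 'next_avail == 7'. The two swaps below exchange indices 3 <-> 6
        // and 2 <-> 5, which yields:
        //
        //   [DEAD, FAIL, M, M, M, START-U, START-A, N, ...]
        //
        // with 'max_match_id == 4', 'start_unanchored_id == 5' and
        // 'start_anchored_id == 6'.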
1462        let new_start_aid =
1463            StateID::new(next_avail.as_usize().checked_sub(1).unwrap())
1464                .unwrap();
1465        remapper.swap(&mut self.nfa, old_start_aid, new_start_aid);
1466        let new_start_uid =
1467            StateID::new(next_avail.as_usize().checked_sub(2).unwrap())
1468                .unwrap();
1469        remapper.swap(&mut self.nfa, old_start_uid, new_start_uid);
1470        let new_max_match_id =
1471            StateID::new(next_avail.as_usize().checked_sub(3).unwrap())
1472                .unwrap();
1473        self.nfa.special.max_match_id = new_max_match_id;
1474        self.nfa.special.start_unanchored_id = new_start_uid;
1475        self.nfa.special.start_anchored_id = new_start_aid;
1476        // If one start state is a match state, then they both are.
1477        if self.nfa.states[self.nfa.special.start_anchored_id].is_match() {
1478            self.nfa.special.max_match_id = self.nfa.special.start_anchored_id;
1479        }
1480        remapper.remap(&mut self.nfa);
1481    }
1482
1483    /// Attempts to convert the transition representation of a subset of states
1484    /// in this NFA from sparse to dense. This can greatly improve search
1485    /// performance since states with a higher number of transitions tend to
1486    /// correlate with very active states.
1487    ///
1488    /// We generally only densify states that are close to the start state.
1489    /// These tend to be the most active states and thus benefit from a dense
1490    /// representation more than other states.
1491    ///
1492    /// This tends to best balance between memory usage and performance. In
1493    /// particular, the *vast majority* of all states in a typical Aho-Corasick
1494    /// automaton have only 1 transition and are usually farther from the start
1495    /// state and thus don't get densified.
1496    ///
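    /// Once a state has been densified, looking up its transition for a byte
    /// is just an index computation. Roughly (a sketch that mirrors how the
    /// dense table is filled below; the actual lookup lives elsewhere in this
    /// module):
    ///
    /// ```ignore
    /// let class = usize::from(nfa.byte_classes.get(byte));
    /// let next = nfa.dense[state.dense.as_usize() + class];
    /// ```
    ///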
1497    /// Note that this doesn't remove the sparse representation of transitions
1498    /// for states that are densified. It could be done, but actually removing
1499    /// entries from `NFA::sparse` is likely more expensive than it's worth.
1500    fn densify(&mut self) -> Result<(), BuildError> {
1501        for i in 0..self.nfa.states.len() {
1502            let sid = StateID::new(i).unwrap();
1503            // Don't bother densifying states that are only used as sentinels.
1504            if sid == NFA::DEAD || sid == NFA::FAIL {
1505                continue;
1506            }
1507            // Only densify states that are "close enough" to the start state.
1508            if self.nfa.states[sid].depth.as_usize()
1509                >= self.builder.dense_depth
1510            {
1511                continue;
1512            }
1513            let dense = self.nfa.alloc_dense_state()?;
1514            let mut prev_link = None;
1515            while let Some(link) = self.nfa.next_link(sid, prev_link) {
1516                prev_link = Some(link);
1517                let t = self.nfa.sparse[link];
1518
1519                let class = usize::from(self.nfa.byte_classes.get(t.byte));
1520                let index = dense.as_usize() + class;
1521                self.nfa.dense[index] = t.next;
1522            }
1523            self.nfa.states[sid].dense = dense;
1524        }
1525        Ok(())
1526    }
1527
1528    /// Returns a set that tracks queued states.
1529    ///
1530    /// This is only necessary when ASCII case insensitivity is enabled, since
1531    /// it is the only way to visit the same state twice. Otherwise, this
1532    /// returns an inert set that never adds anything and always reports
1533    /// `false` for every member test.
1534    fn queued_set(&self) -> QueuedSet {
1535        if self.builder.ascii_case_insensitive {
1536            QueuedSet::active()
1537        } else {
1538            QueuedSet::inert()
1539        }
1540    }
1541
1542    /// Initializes both the unanchored and anchored start states by making
1543    /// them dense. This is achieved by explicitly setting every transition
1544    /// to the FAIL state. This isn't necessary for correctness, since any
1545    /// missing transition is automatically assumed to be mapped to the FAIL
1546    /// state. We do this to make the start states dense, and thus in turn
1547    /// make transition lookups on them faster. (Which is worth doing because
1548    /// the unanchored start state is the most active state.)
1549    fn init_unanchored_start_state(&mut self) -> Result<(), BuildError> {
1550        let start_uid = self.nfa.special.start_unanchored_id;
1551        let start_aid = self.nfa.special.start_anchored_id;
1552        self.nfa.init_full_state(start_uid, NFA::FAIL)?;
1553        self.nfa.init_full_state(start_aid, NFA::FAIL)?;
1554        Ok(())
1555    }
1556
1557    /// Set up the anchored start state by copying all of the transitions and
1558    /// matches from the unanchored starting state with one change: the failure
1559    /// transition is changed to the DEAD state, so that for any undefined
1560    /// transitions, the search will stop.
1561    fn set_anchored_start_state(&mut self) -> Result<(), BuildError> {
1562        let start_uid = self.nfa.special.start_unanchored_id;
1563        let start_aid = self.nfa.special.start_anchored_id;
1564        let (mut uprev_link, mut aprev_link) = (None, None);
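        // Both start states were fully initialized (see
        // 'init_unanchored_start_state'), so their transition lists have the
        // same length and can be walked in lockstep. That's why any pairing
        // other than Some/Some or None/None below is unreachable.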
1565        loop {
1566            let unext = self.nfa.next_link(start_uid, uprev_link);
1567            let anext = self.nfa.next_link(start_aid, aprev_link);
1568            let (ulink, alink) = match (unext, anext) {
1569                (Some(ulink), Some(alink)) => (ulink, alink),
1570                (None, None) => break,
1571                _ => unreachable!(),
1572            };
1573            uprev_link = Some(ulink);
1574            aprev_link = Some(alink);
1575            self.nfa.sparse[alink].next = self.nfa.sparse[ulink].next;
1576        }
1577        self.nfa.copy_matches(start_uid, start_aid)?;
1578        // This is the main difference between the unanchored and anchored
1579        // starting states. If a lookup on an anchored starting state fails,
1580        // then the search should stop.
1581        //
1582        // N.B. This assumes that the loop on the unanchored starting state
1583        // hasn't been created yet.
1584        self.nfa.states[start_aid].fail = NFA::DEAD;
1585        Ok(())
1586    }
1587
1588    /// Rewrite transitions on the unanchored start state that point to the
1589    /// FAIL sentinel so that they loop back to the start state. This
1590    /// effectively permits the Aho-Corasick automaton to match at any
1591    /// position. It is also required for the next-state computation to
1592    /// terminate: finding the next state should never return `NFA::FAIL`.
1593    ///
1594    /// This must be done after building the initial trie, since trie
1595    /// construction depends on transitions to `NFA::FAIL` to determine
1596    /// whether a state already exists or not.
1597    fn add_unanchored_start_state_loop(&mut self) {
1598        let start_uid = self.nfa.special.start_unanchored_id;
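        // This only rewrites transitions that are explicitly present in the
        // start state's sparse list. That's sufficient because the start
        // state was made dense in 'init_unanchored_start_state': every byte
        // has an explicit entry, pointing at FAIL unless trie construction
        // overwrote it.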
1599        let mut prev_link = None;
1600        while let Some(link) = self.nfa.next_link(start_uid, prev_link) {
1601            prev_link = Some(link);
1602            if self.nfa.sparse[link].next() == NFA::FAIL {
1603                self.nfa.sparse[link].next = start_uid;
1604            }
1605        }
1606    }
1607
1608    /// Remove the start state loop by rewriting any transitions on the start
1609    /// state back to the start state with transitions to the dead state.
1610    ///
1611    /// The loop is only closed when two conditions are met: the start state
1612    /// is a match state and the match kind is leftmost-first or
1613    /// leftmost-longest.
1614    ///
1615    /// The reason for this is that under leftmost semantics, a start state
1616    /// that is also a match implies that we should never restart the search
1617    /// process. We allow normal transitions out of the start state, but if
1618    /// none exist, we transition to the dead state, which signals that
1619    /// searching should stop.
1620    fn close_start_state_loop_for_leftmost(&mut self) {
1621        let start_uid = self.nfa.special.start_unanchored_id;
1622        let start = &mut self.nfa.states[start_uid];
1623        let dense = start.dense;
1624        if self.builder.match_kind.is_leftmost() && start.is_match() {
1625            let mut prev_link = None;
1626            while let Some(link) = self.nfa.next_link(start_uid, prev_link) {
1627                prev_link = Some(link);
1628                if self.nfa.sparse[link].next() == start_uid {
1629                    self.nfa.sparse[link].next = NFA::DEAD;
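                    // If the start state was densified, its dense table has
                    // its own copy of this transition, so keep it in sync.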
1630                    if dense != StateID::ZERO {
1631                        let b = self.nfa.sparse[link].byte;
1632                        let class = usize::from(self.nfa.byte_classes.get(b));
1633                        self.nfa.dense[dense.as_usize() + class] = NFA::DEAD;
1634                    }
1635                }
1636            }
1637        }
1638    }
1639
1640    /// Sets all transitions on the dead state to point back to the dead state.
1641    /// Normally, missing transitions map back to the failure state, but the
1642    /// point of the dead state is to act as a sink that can never be escaped.
1643    fn add_dead_state_loop(&mut self) -> Result<(), BuildError> {
1644        self.nfa.init_full_state(NFA::DEAD, NFA::DEAD)?;
1645        Ok(())
1646    }
1647}
1648
1649/// A set of state identifiers used to avoid revisiting the same state multiple
1650/// times when filling in failure transitions.
1651///
1652/// This set has an "inert" and an "active" mode. When inert, the set never
1653/// stores anything and always returns `false` for every member test. This is
1654/// useful to avoid the performance and memory overhead of maintaining this
1655/// set when it is not needed.
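///
/// A rough usage sketch (where `sid` stands in for some `StateID`):
///
/// ```ignore
/// let mut active = QueuedSet::active();
/// active.insert(sid);
/// assert!(active.contains(sid));
///
/// let mut inert = QueuedSet::inert();
/// inert.insert(sid);
/// assert!(!inert.contains(sid));
/// ```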
1656#[derive(Debug)]
1657struct QueuedSet {
1658    set: Option<BTreeSet<StateID>>,
1659}
1660
1661impl QueuedSet {
1662    /// Return an inert set that returns `false` for every state ID membership
1663    /// test.
1664    fn inert() -> QueuedSet {
1665        QueuedSet { set: None }
1666    }
1667
1668    /// Return an active set that tracks state ID membership.
1669    fn active() -> QueuedSet {
1670        QueuedSet { set: Some(BTreeSet::new()) }
1671    }
1672
1673    /// Inserts the given state ID into this set. (If the set is inert, then
1674    /// this is a no-op.)
1675    fn insert(&mut self, state_id: StateID) {
1676        if let Some(ref mut set) = self.set {
1677            set.insert(state_id);
1678        }
1679    }
1680
1681    /// Returns true if and only if the given state ID is in this set. If the
1682    /// set is inert, this always returns false.
1683    fn contains(&self, state_id: StateID) -> bool {
1684        match self.set {
1685            None => false,
1686            Some(ref set) => set.contains(&state_id),
1687        }
1688    }
1689}
1690
1691impl core::fmt::Debug for NFA {
1692    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
1693        use crate::{
1694            automaton::{fmt_state_indicator, sparse_transitions},
1695            util::debug::DebugByte,
1696        };
1697
1698        writeln!(f, "noncontiguous::NFA(")?;
1699        for (sid, state) in self.states.iter().with_state_ids() {
1700            // The FAIL state doesn't actually have space for a state allocated
1701            // for it, so we have to treat it as a special case.
1702            if sid == NFA::FAIL {
1703                writeln!(f, "F {:06}:", sid.as_usize())?;
1704                continue;
1705            }
1706            fmt_state_indicator(f, self, sid)?;
1707            write!(
1708                f,
1709                "{:06}({:06}): ",
1710                sid.as_usize(),
1711                state.fail.as_usize()
1712            )?;
1713
1714            let it = sparse_transitions(
1715                self.iter_trans(sid).map(|t| (t.byte, t.next)),
1716            )
1717            .enumerate();
1718            for (i, (start, end, sid)) in it {
1719                if i > 0 {
1720                    write!(f, ", ")?;
1721                }
1722                if start == end {
1723                    write!(
1724                        f,
1725                        "{:?} => {:?}",
1726                        DebugByte(start),
1727                        sid.as_usize()
1728                    )?;
1729                } else {
1730                    write!(
1731                        f,
1732                        "{:?}-{:?} => {:?}",
1733                        DebugByte(start),
1734                        DebugByte(end),
1735                        sid.as_usize()
1736                    )?;
1737                }
1738            }
1739
1740            writeln!(f)?;
1741            if self.is_match(sid) {
1742                write!(f, "         matches: ")?;
1743                for (i, pid) in self.iter_matches(sid).enumerate() {
1744                    if i > 0 {
1745                        write!(f, ", ")?;
1746                    }
1747                    write!(f, "{}", pid.as_usize())?;
1748                }
1749                writeln!(f)?;
1750            }
1751        }
1752        writeln!(f, "match kind: {:?}", self.match_kind)?;
1753        writeln!(f, "prefilter: {:?}", self.prefilter.is_some())?;
1754        writeln!(f, "state length: {:?}", self.states.len())?;
1755        writeln!(f, "pattern length: {:?}", self.patterns_len())?;
1756        writeln!(f, "shortest pattern length: {:?}", self.min_pattern_len)?;
1757        writeln!(f, "longest pattern length: {:?}", self.max_pattern_len)?;
1758        writeln!(f, "memory usage: {:?}", self.memory_usage())?;
1759        writeln!(f, ")")?;
1760        Ok(())
1761    }
1762}