aho_corasick/packed/
api.rs

1use alloc::sync::Arc;
2
3use crate::{
4    packed::{pattern::Patterns, rabinkarp::RabinKarp, teddy},
5    util::search::{Match, Span},
6};
7
8/// This is a limit placed on the total number of patterns we're willing to try
9/// and match at once. As more sophisticated algorithms are added, this number
10/// may be increased.
11const PATTERN_LIMIT: usize = 128;
12
13/// A knob for controlling the match semantics of a packed multiple string
14/// searcher.
15///
16/// This differs from the [`MatchKind`](crate::MatchKind) type in the top-level
17/// crate module in that it doesn't support "standard" match semantics,
18/// and instead only supports leftmost-first or leftmost-longest. Namely,
19/// "standard" semantics cannot be easily supported by packed searchers.
20///
21/// For more information on the distinction between leftmost-first and
22/// leftmost-longest, see the docs on the top-level `MatchKind` type.
23///
24/// Unlike the top-level `MatchKind` type, the default match semantics for this
25/// type are leftmost-first.
26#[derive(Clone, Copy, Debug, Eq, PartialEq)]
27#[non_exhaustive]
28pub enum MatchKind {
29    /// Use leftmost-first match semantics, which reports leftmost matches.
30    /// When there are multiple possible leftmost matches, the match
31    /// corresponding to the pattern that appeared earlier when constructing
32    /// the automaton is reported.
33    ///
34    /// This is the default.
35    LeftmostFirst,
36    /// Use leftmost-longest match semantics, which reports leftmost matches.
37    /// When there are multiple possible leftmost matches, the longest match
38    /// is chosen.
39    LeftmostLongest,
40}
41
42impl Default for MatchKind {
43    fn default() -> MatchKind {
44        MatchKind::LeftmostFirst
45    }
46}
47
48/// The configuration for a packed multiple pattern searcher.
49///
50/// The configuration is currently limited only to being able to select the
51/// match semantics (leftmost-first or leftmost-longest) of a searcher. In the
52/// future, more knobs may be made available.
53///
54/// A configuration produces a [`packed::Builder`](Builder), which in turn can
55/// be used to construct a [`packed::Searcher`](Searcher) for searching.
56///
57/// # Example
58///
59/// This example shows how to use leftmost-longest semantics instead of the
60/// default (leftmost-first).
61///
62/// ```
63/// use aho_corasick::{packed::{Config, MatchKind}, PatternID};
64///
65/// # fn example() -> Option<()> {
66/// let searcher = Config::new()
67///     .match_kind(MatchKind::LeftmostLongest)
68///     .builder()
69///     .add("foo")
70///     .add("foobar")
71///     .build()?;
72/// let matches: Vec<PatternID> = searcher
73///     .find_iter("foobar")
74///     .map(|mat| mat.pattern())
75///     .collect();
76/// assert_eq!(vec![PatternID::must(1)], matches);
77/// # Some(()) }
78/// # if cfg!(all(feature = "std", any(
79/// #     target_arch = "x86_64", target_arch = "aarch64",
80/// # ))) {
81/// #     example().unwrap()
82/// # } else {
83/// #     assert!(example().is_none());
84/// # }
85/// ```
86#[derive(Clone, Debug)]
87pub struct Config {
88    kind: MatchKind,
89    force: Option<ForceAlgorithm>,
90    only_teddy_fat: Option<bool>,
91    only_teddy_256bit: Option<bool>,
92    heuristic_pattern_limits: bool,
93}
94
95/// An internal option for forcing the use of a particular packed algorithm.
96///
97/// When an algorithm is forced, if a searcher could not be constructed for it,
98/// then no searcher will be returned even if an alternative algorithm would
99/// work.
100#[derive(Clone, Debug)]
101enum ForceAlgorithm {
102    Teddy,
103    RabinKarp,
104}
105
106impl Default for Config {
107    fn default() -> Config {
108        Config::new()
109    }
110}
111
112impl Config {
113    /// Create a new default configuration. A default configuration uses
114    /// leftmost-first match semantics.
115    pub fn new() -> Config {
116        Config {
117            kind: MatchKind::LeftmostFirst,
118            force: None,
119            only_teddy_fat: None,
120            only_teddy_256bit: None,
121            heuristic_pattern_limits: true,
122        }
123    }
124
125    /// Create a packed builder from this configuration. The builder can be
126    /// used to accumulate patterns and create a [`Searcher`] from them.
127    pub fn builder(&self) -> Builder {
128        Builder::from_config(self.clone())
129    }
130
131    /// Set the match semantics for this configuration.
132    pub fn match_kind(&mut self, kind: MatchKind) -> &mut Config {
133        self.kind = kind;
134        self
135    }
136
137    /// An undocumented method for forcing the use of the Teddy algorithm.
138    ///
139    /// This is only exposed for more precise testing and benchmarks. Callers
140    /// should not use it as it is not part of the API stability guarantees of
141    /// this crate.
142    #[doc(hidden)]
143    pub fn only_teddy(&mut self, yes: bool) -> &mut Config {
144        if yes {
145            self.force = Some(ForceAlgorithm::Teddy);
146        } else {
147            self.force = None;
148        }
149        self
150    }
151
152    /// An undocumented method for forcing the use of the Fat Teddy algorithm.
153    ///
154    /// This is only exposed for more precise testing and benchmarks. Callers
155    /// should not use it as it is not part of the API stability guarantees of
156    /// this crate.
157    #[doc(hidden)]
158    pub fn only_teddy_fat(&mut self, yes: Option<bool>) -> &mut Config {
159        self.only_teddy_fat = yes;
160        self
161    }
162
163    /// An undocumented method for forcing the use of SSE (`Some(false)`) or
164    /// AVX (`Some(true)`) algorithms.
165    ///
166    /// This is only exposed for more precise testing and benchmarks. Callers
167    /// should not use it as it is not part of the API stability guarantees of
168    /// this crate.
169    #[doc(hidden)]
170    pub fn only_teddy_256bit(&mut self, yes: Option<bool>) -> &mut Config {
171        self.only_teddy_256bit = yes;
172        self
173    }
174
175    /// An undocumented method for forcing the use of the Rabin-Karp algorithm.
176    ///
177    /// This is only exposed for more precise testing and benchmarks. Callers
178    /// should not use it as it is not part of the API stability guarantees of
179    /// this crate.
180    #[doc(hidden)]
181    pub fn only_rabin_karp(&mut self, yes: bool) -> &mut Config {
182        if yes {
183            self.force = Some(ForceAlgorithm::RabinKarp);
184        } else {
185            self.force = None;
186        }
187        self
188    }
189
190    /// Request that heuristic limitations on the number of patterns be
191    /// employed. This useful to disable for benchmarking where one wants to
192    /// explore how Teddy performs on large number of patterns even if the
193    /// heuristics would otherwise refuse construction.
194    ///
195    /// This is enabled by default.
196    pub fn heuristic_pattern_limits(&mut self, yes: bool) -> &mut Config {
197        self.heuristic_pattern_limits = yes;
198        self
199    }
200}
201
202/// A builder for constructing a packed searcher from a collection of patterns.
203///
204/// # Example
205///
206/// This example shows how to use a builder to construct a searcher. By
207/// default, leftmost-first match semantics are used.
208///
209/// ```
210/// use aho_corasick::{packed::{Builder, MatchKind}, PatternID};
211///
212/// # fn example() -> Option<()> {
213/// let searcher = Builder::new()
214///     .add("foobar")
215///     .add("foo")
216///     .build()?;
217/// let matches: Vec<PatternID> = searcher
218///     .find_iter("foobar")
219///     .map(|mat| mat.pattern())
220///     .collect();
221/// assert_eq!(vec![PatternID::ZERO], matches);
222/// # Some(()) }
223/// # if cfg!(all(feature = "std", any(
224/// #     target_arch = "x86_64", target_arch = "aarch64",
225/// # ))) {
226/// #     example().unwrap()
227/// # } else {
228/// #     assert!(example().is_none());
229/// # }
230/// ```
231#[derive(Clone, Debug)]
232pub struct Builder {
233    /// The configuration of this builder and subsequent matcher.
234    config: Config,
235    /// Set to true if the builder detects that a matcher cannot be built.
236    inert: bool,
237    /// The patterns provided by the caller.
238    patterns: Patterns,
239}
240
241impl Builder {
242    /// Create a new builder for constructing a multi-pattern searcher. This
243    /// constructor uses the default configuration.
244    pub fn new() -> Builder {
245        Builder::from_config(Config::new())
246    }
247
248    fn from_config(config: Config) -> Builder {
249        Builder { config, inert: false, patterns: Patterns::new() }
250    }
251
252    /// Build a searcher from the patterns added to this builder so far.
253    pub fn build(&self) -> Option<Searcher> {
254        if self.inert || self.patterns.is_empty() {
255            return None;
256        }
257        let mut patterns = self.patterns.clone();
258        patterns.set_match_kind(self.config.kind);
259        let patterns = Arc::new(patterns);
260        let rabinkarp = RabinKarp::new(&patterns);
261        // Effectively, we only want to return a searcher if we can use Teddy,
262        // since Teddy is our only fast packed searcher at the moment.
263        // Rabin-Karp is only used when searching haystacks smaller than what
264        // Teddy can support. Thus, the only way to get a Rabin-Karp searcher
265        // is to force it using undocumented APIs (for tests/benchmarks).
266        let (search_kind, minimum_len) = match self.config.force {
267            None | Some(ForceAlgorithm::Teddy) => {
268                debug!("trying to build Teddy packed matcher");
269                let teddy = match self.build_teddy(Arc::clone(&patterns)) {
270                    None => return None,
271                    Some(teddy) => teddy,
272                };
273                let minimum_len = teddy.minimum_len();
274                (SearchKind::Teddy(teddy), minimum_len)
275            }
276            Some(ForceAlgorithm::RabinKarp) => {
277                debug!("using Rabin-Karp packed matcher");
278                (SearchKind::RabinKarp, 0)
279            }
280        };
281        Some(Searcher { patterns, rabinkarp, search_kind, minimum_len })
282    }
283
284    fn build_teddy(&self, patterns: Arc<Patterns>) -> Option<teddy::Searcher> {
285        teddy::Builder::new()
286            .only_256bit(self.config.only_teddy_256bit)
287            .only_fat(self.config.only_teddy_fat)
288            .heuristic_pattern_limits(self.config.heuristic_pattern_limits)
289            .build(patterns)
290    }
291
292    /// Add the given pattern to this set to match.
293    ///
294    /// The order in which patterns are added is significant. Namely, when
295    /// using leftmost-first match semantics, then when multiple patterns can
296    /// match at a particular location, the pattern that was added first is
297    /// used as the match.
298    ///
299    /// If the number of patterns added exceeds the amount supported by packed
300    /// searchers, then the builder will stop accumulating patterns and render
301    /// itself inert. At this point, constructing a searcher will always return
302    /// `None`.
303    pub fn add<P: AsRef<[u8]>>(&mut self, pattern: P) -> &mut Builder {
304        if self.inert {
305            return self;
306        } else if self.patterns.len() >= PATTERN_LIMIT {
307            self.inert = true;
308            self.patterns.reset();
309            return self;
310        }
311        // Just in case PATTERN_LIMIT increases beyond u16::MAX.
312        assert!(self.patterns.len() <= core::u16::MAX as usize);
313
314        let pattern = pattern.as_ref();
315        if pattern.is_empty() {
316            self.inert = true;
317            self.patterns.reset();
318            return self;
319        }
320        self.patterns.add(pattern);
321        self
322    }
323
324    /// Add the given iterator of patterns to this set to match.
325    ///
326    /// The iterator must yield elements that can be converted into a `&[u8]`.
327    ///
328    /// The order in which patterns are added is significant. Namely, when
329    /// using leftmost-first match semantics, then when multiple patterns can
330    /// match at a particular location, the pattern that was added first is
331    /// used as the match.
332    ///
333    /// If the number of patterns added exceeds the amount supported by packed
334    /// searchers, then the builder will stop accumulating patterns and render
335    /// itself inert. At this point, constructing a searcher will always return
336    /// `None`.
337    pub fn extend<I, P>(&mut self, patterns: I) -> &mut Builder
338    where
339        I: IntoIterator<Item = P>,
340        P: AsRef<[u8]>,
341    {
342        for p in patterns {
343            self.add(p);
344        }
345        self
346    }
347
348    /// Returns the number of patterns added to this builder.
349    pub fn len(&self) -> usize {
350        self.patterns.len()
351    }
352
353    /// Returns the length, in bytes, of the shortest pattern added.
354    pub fn minimum_len(&self) -> usize {
355        self.patterns.minimum_len()
356    }
357}
358
359impl Default for Builder {
360    fn default() -> Builder {
361        Builder::new()
362    }
363}
364
365/// A packed searcher for quickly finding occurrences of multiple patterns.
366///
367/// If callers need more flexible construction, or if one wants to change the
368/// match semantics (either leftmost-first or leftmost-longest), then one can
369/// use the [`Config`] and/or [`Builder`] types for more fine grained control.
370///
371/// # Example
372///
373/// This example shows how to create a searcher from an iterator of patterns.
374/// By default, leftmost-first match semantics are used.
375///
376/// ```
377/// use aho_corasick::{packed::{MatchKind, Searcher}, PatternID};
378///
379/// # fn example() -> Option<()> {
380/// let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?;
381/// let matches: Vec<PatternID> = searcher
382///     .find_iter("foobar")
383///     .map(|mat| mat.pattern())
384///     .collect();
385/// assert_eq!(vec![PatternID::ZERO], matches);
386/// # Some(()) }
387/// # if cfg!(all(feature = "std", any(
388/// #     target_arch = "x86_64", target_arch = "aarch64",
389/// # ))) {
390/// #     example().unwrap()
391/// # } else {
392/// #     assert!(example().is_none());
393/// # }
394/// ```
395#[derive(Clone, Debug)]
396pub struct Searcher {
397    patterns: Arc<Patterns>,
398    rabinkarp: RabinKarp,
399    search_kind: SearchKind,
400    minimum_len: usize,
401}
402
403#[derive(Clone, Debug)]
404enum SearchKind {
405    Teddy(teddy::Searcher),
406    RabinKarp,
407}
408
409impl Searcher {
410    /// A convenience function for constructing a searcher from an iterator
411    /// of things that can be converted to a `&[u8]`.
412    ///
413    /// If a searcher could not be constructed (either because of an
414    /// unsupported CPU or because there are too many patterns), then `None`
415    /// is returned.
416    ///
417    /// # Example
418    ///
419    /// Basic usage:
420    ///
421    /// ```
422    /// use aho_corasick::{packed::{MatchKind, Searcher}, PatternID};
423    ///
424    /// # fn example() -> Option<()> {
425    /// let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?;
426    /// let matches: Vec<PatternID> = searcher
427    ///     .find_iter("foobar")
428    ///     .map(|mat| mat.pattern())
429    ///     .collect();
430    /// assert_eq!(vec![PatternID::ZERO], matches);
431    /// # Some(()) }
432    /// # if cfg!(all(feature = "std", any(
433    /// #     target_arch = "x86_64", target_arch = "aarch64",
434    /// # ))) {
435    /// #     example().unwrap()
436    /// # } else {
437    /// #     assert!(example().is_none());
438    /// # }
439    /// ```
440    pub fn new<I, P>(patterns: I) -> Option<Searcher>
441    where
442        I: IntoIterator<Item = P>,
443        P: AsRef<[u8]>,
444    {
445        Builder::new().extend(patterns).build()
446    }
447
448    /// A convenience function for calling `Config::new()`.
449    ///
450    /// This is useful for avoiding an additional import.
451    pub fn config() -> Config {
452        Config::new()
453    }
454
455    /// A convenience function for calling `Builder::new()`.
456    ///
457    /// This is useful for avoiding an additional import.
458    pub fn builder() -> Builder {
459        Builder::new()
460    }
461
462    /// Return the first occurrence of any of the patterns in this searcher,
463    /// according to its match semantics, in the given haystack. The `Match`
464    /// returned will include the identifier of the pattern that matched, which
465    /// corresponds to the index of the pattern (starting from `0`) in which it
466    /// was added.
467    ///
468    /// # Example
469    ///
470    /// Basic usage:
471    ///
472    /// ```
473    /// use aho_corasick::{packed::{MatchKind, Searcher}, PatternID};
474    ///
475    /// # fn example() -> Option<()> {
476    /// let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?;
477    /// let mat = searcher.find("foobar")?;
478    /// assert_eq!(PatternID::ZERO, mat.pattern());
479    /// assert_eq!(0, mat.start());
480    /// assert_eq!(6, mat.end());
481    /// # Some(()) }
482    /// # if cfg!(all(feature = "std", any(
483    /// #     target_arch = "x86_64", target_arch = "aarch64",
484    /// # ))) {
485    /// #     example().unwrap()
486    /// # } else {
487    /// #     assert!(example().is_none());
488    /// # }
489    /// ```
490    #[inline]
491    pub fn find<B: AsRef<[u8]>>(&self, haystack: B) -> Option<Match> {
492        let haystack = haystack.as_ref();
493        self.find_in(haystack, Span::from(0..haystack.len()))
494    }
495
496    /// Return the first occurrence of any of the patterns in this searcher,
497    /// according to its match semantics, in the given haystack starting from
498    /// the given position.
499    ///
500    /// The `Match` returned will include the identifier of the pattern that
501    /// matched, which corresponds to the index of the pattern (starting from
502    /// `0`) in which it was added. The offsets in the `Match` will be relative
503    /// to the start of `haystack` (and not `at`).
504    ///
505    /// # Example
506    ///
507    /// Basic usage:
508    ///
509    /// ```
510    /// use aho_corasick::{packed::{MatchKind, Searcher}, PatternID, Span};
511    ///
512    /// # fn example() -> Option<()> {
513    /// let haystack = "foofoobar";
514    /// let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?;
515    /// let mat = searcher.find_in(haystack, Span::from(3..haystack.len()))?;
516    /// assert_eq!(PatternID::ZERO, mat.pattern());
517    /// assert_eq!(3, mat.start());
518    /// assert_eq!(9, mat.end());
519    /// # Some(()) }
520    /// # if cfg!(all(feature = "std", any(
521    /// #     target_arch = "x86_64", target_arch = "aarch64",
522    /// # ))) {
523    /// #     example().unwrap()
524    /// # } else {
525    /// #     assert!(example().is_none());
526    /// # }
527    /// ```
528    #[inline]
529    pub fn find_in<B: AsRef<[u8]>>(
530        &self,
531        haystack: B,
532        span: Span,
533    ) -> Option<Match> {
534        let haystack = haystack.as_ref();
535        match self.search_kind {
536            SearchKind::Teddy(ref teddy) => {
537                if haystack[span].len() < teddy.minimum_len() {
538                    return self.find_in_slow(haystack, span);
539                }
540                teddy.find(&haystack[..span.end], span.start)
541            }
542            SearchKind::RabinKarp => {
543                self.rabinkarp.find_at(&haystack[..span.end], span.start)
544            }
545        }
546    }
547
548    /// Return an iterator of non-overlapping occurrences of the patterns in
549    /// this searcher, according to its match semantics, in the given haystack.
550    ///
551    /// # Example
552    ///
553    /// Basic usage:
554    ///
555    /// ```
556    /// use aho_corasick::{packed::{MatchKind, Searcher}, PatternID};
557    ///
558    /// # fn example() -> Option<()> {
559    /// let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?;
560    /// let matches: Vec<PatternID> = searcher
561    ///     .find_iter("foobar fooba foofoo")
562    ///     .map(|mat| mat.pattern())
563    ///     .collect();
564    /// assert_eq!(vec![
565    ///     PatternID::must(0),
566    ///     PatternID::must(1),
567    ///     PatternID::must(1),
568    ///     PatternID::must(1),
569    /// ], matches);
570    /// # Some(()) }
571    /// # if cfg!(all(feature = "std", any(
572    /// #     target_arch = "x86_64", target_arch = "aarch64",
573    /// # ))) {
574    /// #     example().unwrap()
575    /// # } else {
576    /// #     assert!(example().is_none());
577    /// # }
578    /// ```
579    #[inline]
580    pub fn find_iter<'a, 'b, B: ?Sized + AsRef<[u8]>>(
581        &'a self,
582        haystack: &'b B,
583    ) -> FindIter<'a, 'b> {
584        let haystack = haystack.as_ref();
585        let span = Span::from(0..haystack.len());
586        FindIter { searcher: self, haystack, span }
587    }
588
589    /// Returns the match kind used by this packed searcher.
590    ///
591    /// # Examples
592    ///
593    /// Basic usage:
594    ///
595    /// ```
596    /// use aho_corasick::packed::{MatchKind, Searcher};
597    ///
598    /// # fn example() -> Option<()> {
599    /// let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?;
600    /// // leftmost-first is the default.
601    /// assert_eq!(&MatchKind::LeftmostFirst, searcher.match_kind());
602    /// # Some(()) }
603    /// # if cfg!(all(feature = "std", any(
604    /// #     target_arch = "x86_64", target_arch = "aarch64",
605    /// # ))) {
606    /// #     example().unwrap()
607    /// # } else {
608    /// #     assert!(example().is_none());
609    /// # }
610    /// ```
611    #[inline]
612    pub fn match_kind(&self) -> &MatchKind {
613        self.patterns.match_kind()
614    }
615
616    /// Returns the minimum length of a haystack that is required in order for
617    /// packed searching to be effective.
618    ///
619    /// In some cases, the underlying packed searcher may not be able to search
620    /// very short haystacks. When that occurs, the implementation will defer
621    /// to a slower non-packed searcher (which is still generally faster than
622    /// Aho-Corasick for a small number of patterns). However, callers may
623    /// want to avoid ever using the slower variant, which one can do by
624    /// never passing a haystack shorter than the minimum length returned by
625    /// this method.
626    #[inline]
627    pub fn minimum_len(&self) -> usize {
628        self.minimum_len
629    }
630
631    /// Returns the approximate total amount of heap used by this searcher, in
632    /// units of bytes.
633    #[inline]
634    pub fn memory_usage(&self) -> usize {
635        self.patterns.memory_usage()
636            + self.rabinkarp.memory_usage()
637            + self.search_kind.memory_usage()
638    }
639
640    /// Use a slow (non-packed) searcher.
641    ///
642    /// This is useful when a packed searcher could be constructed, but could
643    /// not be used to search a specific haystack. For example, if Teddy was
644    /// built but the haystack is smaller than ~34 bytes, then Teddy might not
645    /// be able to run.
646    fn find_in_slow(&self, haystack: &[u8], span: Span) -> Option<Match> {
647        self.rabinkarp.find_at(&haystack[..span.end], span.start)
648    }
649}
650
651impl SearchKind {
652    fn memory_usage(&self) -> usize {
653        match *self {
654            SearchKind::Teddy(ref ted) => ted.memory_usage(),
655            SearchKind::RabinKarp => 0,
656        }
657    }
658}
659
660/// An iterator over non-overlapping matches from a packed searcher.
661///
662/// The lifetime `'s` refers to the lifetime of the underlying [`Searcher`],
663/// while the lifetime `'h` refers to the lifetime of the haystack being
664/// searched.
665#[derive(Debug)]
666pub struct FindIter<'s, 'h> {
667    searcher: &'s Searcher,
668    haystack: &'h [u8],
669    span: Span,
670}
671
672impl<'s, 'h> Iterator for FindIter<'s, 'h> {
673    type Item = Match;
674
675    fn next(&mut self) -> Option<Match> {
676        if self.span.start > self.span.end {
677            return None;
678        }
679        match self.searcher.find_in(&self.haystack, self.span) {
680            None => None,
681            Some(m) => {
682                self.span.start = m.end();
683                Some(m)
684            }
685        }
686    }
687}