regex/
builders.rs

1#![allow(warnings)]
2
3// This module defines an internal builder that encapsulates all interaction
4// with meta::Regex construction, and then 4 public API builders that wrap
5// around it. The docs are essentially repeated on each of the 4 public
6// builders, with tweaks to the examples as needed.
7//
8// The reason why there are so many builders is partially because of a misstep
9// in the initial API design: the builder constructor takes in the pattern
10// strings instead of using the `build` method to accept the pattern strings.
11// This means `new` has a different signature for each builder. It probably
// would have been nicer to use one builder with `fn new()`, and then add
13// `build(pat)` and `build_many(pats)` constructors.
14//
15// The other reason is because I think the `bytes` module should probably
16// have its own builder type. That way, it is completely isolated from the
17// top-level API.
18//
19// If I could do it again, I'd probably have a `regex::Builder` and a
20// `regex::bytes::Builder`. Each would have `build` and `build_set` (or
21// `build_many`) methods for constructing a single pattern `Regex` and a
22// multi-pattern `RegexSet`, respectively.
23
24use alloc::{
25    string::{String, ToString},
26    sync::Arc,
27    vec,
28    vec::Vec,
29};
30
31use regex_automata::{
32    meta, nfa::thompson::WhichCaptures, util::syntax, MatchKind,
33};
34
35use crate::error::Error;
36
/// A builder for constructing a `Regex`, `bytes::Regex`, `RegexSet` or a
/// `bytes::RegexSet`.
///
/// This is essentially the implementation of the four different builder types
/// in the public API: `RegexBuilder`, `bytes::RegexBuilder`, `RegexSetBuilder`
/// and `bytes::RegexSetBuilder`.
#[derive(Clone, Debug)]
struct Builder {
    /// Owned copies of the pattern strings given to `Builder::new`.
    pats: Vec<String>,
    /// Base configuration for the meta regex engine. It records the size
    /// limits and the line terminator set on this builder. Each build method
    /// clones it and layers the match kind and UTF-8 mode on top.
    metac: meta::Config,
    /// Base syntax configuration. It records the pattern flags (case
    /// insensitivity, multi-line, Unicode, etc.), the line terminator and the
    /// nest limit set on this builder. Each build method copies it and sets
    /// the UTF-8 mode on top.
    syntaxc: syntax::Config,
}
49
50impl Default for Builder {
51    fn default() -> Builder {
52        let metac = meta::Config::new()
53            .nfa_size_limit(Some(10 * (1 << 20)))
54            .hybrid_cache_capacity(2 * (1 << 20));
55        Builder { pats: vec![], metac, syntaxc: syntax::Config::default() }
56    }
57}
58
59impl Builder {
60    fn new<I, S>(patterns: I) -> Builder
61    where
62        S: AsRef<str>,
63        I: IntoIterator<Item = S>,
64    {
65        let mut b = Builder::default();
66        b.pats.extend(patterns.into_iter().map(|p| p.as_ref().to_string()));
67        b
68    }
69
70    fn build_one_string(&self) -> Result<crate::Regex, Error> {
71        assert_eq!(1, self.pats.len());
72        let metac = self
73            .metac
74            .clone()
75            .match_kind(MatchKind::LeftmostFirst)
76            .utf8_empty(true);
77        let syntaxc = self.syntaxc.clone().utf8(true);
78        let pattern = Arc::from(self.pats[0].as_str());
79        meta::Builder::new()
80            .configure(metac)
81            .syntax(syntaxc)
82            .build(&pattern)
83            .map(|meta| crate::Regex { meta, pattern })
84            .map_err(Error::from_meta_build_error)
85    }
86
87    fn build_one_bytes(&self) -> Result<crate::bytes::Regex, Error> {
88        assert_eq!(1, self.pats.len());
89        let metac = self
90            .metac
91            .clone()
92            .match_kind(MatchKind::LeftmostFirst)
93            .utf8_empty(false);
94        let syntaxc = self.syntaxc.clone().utf8(false);
95        let pattern = Arc::from(self.pats[0].as_str());
96        meta::Builder::new()
97            .configure(metac)
98            .syntax(syntaxc)
99            .build(&pattern)
100            .map(|meta| crate::bytes::Regex { meta, pattern })
101            .map_err(Error::from_meta_build_error)
102    }
103
104    fn build_many_string(&self) -> Result<crate::RegexSet, Error> {
105        let metac = self
106            .metac
107            .clone()
108            .match_kind(MatchKind::All)
109            .utf8_empty(true)
110            .which_captures(WhichCaptures::None);
111        let syntaxc = self.syntaxc.clone().utf8(true);
112        let patterns = Arc::from(self.pats.as_slice());
113        meta::Builder::new()
114            .configure(metac)
115            .syntax(syntaxc)
116            .build_many(&patterns)
117            .map(|meta| crate::RegexSet { meta, patterns })
118            .map_err(Error::from_meta_build_error)
119    }
120
121    fn build_many_bytes(&self) -> Result<crate::bytes::RegexSet, Error> {
122        let metac = self
123            .metac
124            .clone()
125            .match_kind(MatchKind::All)
126            .utf8_empty(false)
127            .which_captures(WhichCaptures::None);
128        let syntaxc = self.syntaxc.clone().utf8(false);
129        let patterns = Arc::from(self.pats.as_slice());
130        meta::Builder::new()
131            .configure(metac)
132            .syntax(syntaxc)
133            .build_many(&patterns)
134            .map(|meta| crate::bytes::RegexSet { meta, patterns })
135            .map_err(Error::from_meta_build_error)
136    }
137
138    fn case_insensitive(&mut self, yes: bool) -> &mut Builder {
139        self.syntaxc = self.syntaxc.case_insensitive(yes);
140        self
141    }
142
143    fn multi_line(&mut self, yes: bool) -> &mut Builder {
144        self.syntaxc = self.syntaxc.multi_line(yes);
145        self
146    }
147
148    fn dot_matches_new_line(&mut self, yes: bool) -> &mut Builder {
149        self.syntaxc = self.syntaxc.dot_matches_new_line(yes);
150        self
151    }
152
153    fn crlf(&mut self, yes: bool) -> &mut Builder {
154        self.syntaxc = self.syntaxc.crlf(yes);
155        self
156    }
157
158    fn line_terminator(&mut self, byte: u8) -> &mut Builder {
159        self.metac = self.metac.clone().line_terminator(byte);
160        self.syntaxc = self.syntaxc.line_terminator(byte);
161        self
162    }
163
164    fn swap_greed(&mut self, yes: bool) -> &mut Builder {
165        self.syntaxc = self.syntaxc.swap_greed(yes);
166        self
167    }
168
169    fn ignore_whitespace(&mut self, yes: bool) -> &mut Builder {
170        self.syntaxc = self.syntaxc.ignore_whitespace(yes);
171        self
172    }
173
174    fn unicode(&mut self, yes: bool) -> &mut Builder {
175        self.syntaxc = self.syntaxc.unicode(yes);
176        self
177    }
178
179    fn octal(&mut self, yes: bool) -> &mut Builder {
180        self.syntaxc = self.syntaxc.octal(yes);
181        self
182    }
183
184    fn size_limit(&mut self, limit: usize) -> &mut Builder {
185        self.metac = self.metac.clone().nfa_size_limit(Some(limit));
186        self
187    }
188
189    fn dfa_size_limit(&mut self, limit: usize) -> &mut Builder {
190        self.metac = self.metac.clone().hybrid_cache_capacity(limit);
191        self
192    }
193
194    fn nest_limit(&mut self, limit: u32) -> &mut Builder {
195        self.syntaxc = self.syntaxc.nest_limit(limit);
196        self
197    }
198}
199
200pub(crate) mod string {
201    use crate::{error::Error, Regex, RegexSet};
202
203    use super::Builder;
204
    /// A configurable builder for a [`Regex`].
    ///
    /// This builder can be used to programmatically set flags such as `i`
    /// (case insensitive) and `x` (for verbose mode). This builder can also be
    /// used to configure things like the line terminator and a size limit on
    /// the compiled regular expression.
    #[derive(Clone, Debug)]
    pub struct RegexBuilder {
        /// The internal builder that holds the pattern and all configuration.
        /// Every method on `RegexBuilder` delegates to it.
        builder: Builder,
    }
215
216    impl RegexBuilder {
217        /// Create a new builder with a default configuration for the given
218        /// pattern.
219        ///
220        /// If the pattern is invalid or exceeds the configured size limits,
221        /// then an error will be returned when [`RegexBuilder::build`] is
222        /// called.
223        pub fn new(pattern: &str) -> RegexBuilder {
224            RegexBuilder { builder: Builder::new([pattern]) }
225        }
226
        /// Compiles the pattern given to `RegexBuilder::new` with the
        /// configuration set on this builder.
        ///
        /// If the pattern isn't a valid regex or if a configured size limit
        /// was exceeded, then an error is returned.
        pub fn build(&self) -> Result<Regex, Error> {
            // The internal builder asserts that it holds exactly one
            // pattern, which is what `RegexBuilder::new` gives it.
            self.builder.build_one_string()
        }
235
        /// This configures Unicode mode for the entire pattern.
        ///
        /// Enabling Unicode mode does a number of things:
        ///
        /// * Most fundamentally, it causes the fundamental atom of matching
        /// to be a single codepoint. When Unicode mode is disabled, it's a
        /// single byte. For example, when Unicode mode is enabled, `.` will
        /// match `💩` once, whereas it will match 4 times when Unicode mode
        /// is disabled. (Since the UTF-8 encoding of `💩` is 4 bytes long.)
        /// * Case insensitive matching uses Unicode simple case folding rules.
        /// * Unicode character classes like `\p{Letter}` and `\p{Greek}` are
        /// available.
        /// * Perl character classes are Unicode aware. That is, `\w`, `\s` and
        /// `\d`.
        /// * The word boundary assertions, `\b` and `\B`, use the Unicode
        /// definition of a word character.
        ///
        /// Note that if Unicode mode is disabled, then the regex will fail to
        /// compile if it could match invalid UTF-8. For example, when Unicode
        /// mode is disabled, then since `.` matches any byte (except for
        /// `\n`), then it can match invalid UTF-8 and thus building a regex
        /// from it will fail. Another example is `\w` and `\W`. Since `\w` can
        /// only match ASCII bytes when Unicode mode is disabled, it's allowed.
        /// But `\W` can match more than ASCII bytes, including invalid UTF-8,
        /// and so it is not allowed. This restriction can be lifted only by
        /// using a [`bytes::Regex`](crate::bytes::Regex).
        ///
        /// For more details on the Unicode support in this crate, see the
        /// [Unicode section](crate#unicode) in this crate's top-level
        /// documentation.
        ///
        /// The default for this is `true`.
        ///
        /// # Example
        ///
        /// ```
        /// use regex::RegexBuilder;
        ///
        /// let re = RegexBuilder::new(r"\w")
        ///     .unicode(false)
        ///     .build()
        ///     .unwrap();
        /// // Normally Greek letters would be included in \w, but since
        /// // Unicode mode is disabled, it only matches ASCII letters.
        /// assert!(!re.is_match("δ"));
        ///
        /// let re = RegexBuilder::new(r"s")
        ///     .case_insensitive(true)
        ///     .unicode(false)
        ///     .build()
        ///     .unwrap();
        /// // Normally 'ſ' is included when searching for 's' case
        /// // insensitively due to Unicode's simple case folding rules. But
        /// // when Unicode mode is disabled, only ASCII case insensitive rules
        /// // are used.
        /// assert!(!re.is_match("ſ"));
        /// ```
        pub fn unicode(&mut self, yes: bool) -> &mut RegexBuilder {
            // Forward the flag to the internal builder, then return `self`
            // so that calls can be chained.
            self.builder.unicode(yes);
            self
        }
297
        /// This configures whether to enable case insensitive matching for the
        /// entire pattern.
        ///
        /// This setting can also be configured using the inline flag `i`
        /// in the pattern. For example, `(?i:foo)` matches `foo` case
        /// insensitively while `(?-i:foo)` matches `foo` case sensitively.
        ///
        /// The default for this is `false`.
        ///
        /// # Example
        ///
        /// ```
        /// use regex::RegexBuilder;
        ///
        /// let re = RegexBuilder::new(r"foo(?-i:bar)quux")
        ///     .case_insensitive(true)
        ///     .build()
        ///     .unwrap();
        /// assert!(re.is_match("FoObarQuUx"));
        /// // Even though case insensitive matching is enabled in the builder,
        /// // it can be locally disabled within the pattern. In this case,
        /// // `bar` is matched case sensitively.
        /// assert!(!re.is_match("fooBARquux"));
        /// ```
        pub fn case_insensitive(&mut self, yes: bool) -> &mut RegexBuilder {
            // Forward the flag to the internal builder, then return `self`
            // so that calls can be chained.
            self.builder.case_insensitive(yes);
            self
        }
326
        /// This configures multi-line mode for the entire pattern.
        ///
        /// Enabling multi-line mode changes the behavior of the `^` and `$`
        /// anchor assertions. Instead of only matching at the beginning and
        /// end of a haystack, respectively, multi-line mode causes them to
        /// match at the beginning and end of a line *in addition* to the
        /// beginning and end of a haystack. More precisely, `^` will match at
        /// the position immediately following a `\n` and `$` will match at the
        /// position immediately preceding a `\n`.
        ///
        /// The behavior of this option can be impacted by other settings too:
        ///
        /// * The [`RegexBuilder::line_terminator`] option changes `\n` above
        /// to any ASCII byte.
        /// * The [`RegexBuilder::crlf`] option changes the line terminator to
        /// be either `\r` or `\n`, but never at the position between a `\r`
        /// and `\n`.
        ///
        /// This setting can also be configured using the inline flag `m` in
        /// the pattern.
        ///
        /// The default for this is `false`.
        ///
        /// # Example
        ///
        /// ```
        /// use regex::RegexBuilder;
        ///
        /// let re = RegexBuilder::new(r"^foo$")
        ///     .multi_line(true)
        ///     .build()
        ///     .unwrap();
        /// assert_eq!(Some(1..4), re.find("\nfoo\n").map(|m| m.range()));
        /// ```
        pub fn multi_line(&mut self, yes: bool) -> &mut RegexBuilder {
            // Forward the flag to the internal builder, then return `self`
            // so that calls can be chained.
            self.builder.multi_line(yes);
            self
        }
365
        /// This configures dot-matches-new-line mode for the entire pattern.
        ///
        /// Perhaps surprisingly, the default behavior for `.` is not to match
        /// any character, but rather, to match any character except for the
        /// line terminator (which is `\n` by default). When this mode is
        /// enabled, the behavior changes such that `.` truly matches any
        /// character.
        ///
        /// This setting can also be configured using the inline flag `s` in
        /// the pattern. For example, `(?s:.)` and `\p{any}` are equivalent
        /// regexes.
        ///
        /// The default for this is `false`.
        ///
        /// # Example
        ///
        /// ```
        /// use regex::RegexBuilder;
        ///
        /// let re = RegexBuilder::new(r"foo.bar")
        ///     .dot_matches_new_line(true)
        ///     .build()
        ///     .unwrap();
        /// let hay = "foo\nbar";
        /// assert_eq!(Some("foo\nbar"), re.find(hay).map(|m| m.as_str()));
        /// ```
        pub fn dot_matches_new_line(
            &mut self,
            yes: bool,
        ) -> &mut RegexBuilder {
            // Forward the flag to the internal builder, then return `self`
            // so that calls can be chained.
            self.builder.dot_matches_new_line(yes);
            self
        }
399
        /// This configures CRLF mode for the entire pattern.
        ///
        /// When CRLF mode is enabled, both `\r` ("carriage return" or CR for
        /// short) and `\n` ("line feed" or LF for short) are treated as line
        /// terminators. This results in the following:
        ///
        /// * Unless dot-matches-new-line mode is enabled, `.` will now match
        /// any character except for `\n` and `\r`.
        /// * When multi-line mode is enabled, `^` will match immediately
        /// following a `\n` or a `\r`. Similarly, `$` will match immediately
        /// preceding a `\n` or a `\r`. Neither `^` nor `$` will ever match
        /// between `\r` and `\n`.
        ///
        /// This setting can also be configured using the inline flag `R` in
        /// the pattern.
        ///
        /// The default for this is `false`.
        ///
        /// # Example
        ///
        /// ```
        /// use regex::RegexBuilder;
        ///
        /// let re = RegexBuilder::new(r"^foo$")
        ///     .multi_line(true)
        ///     .crlf(true)
        ///     .build()
        ///     .unwrap();
        /// let hay = "\r\nfoo\r\n";
        /// // If CRLF mode weren't enabled here, then '$' wouldn't match
        /// // immediately after 'foo', and thus no match would be found.
        /// assert_eq!(Some("foo"), re.find(hay).map(|m| m.as_str()));
        /// ```
        ///
        /// This example demonstrates that `^` will never match at a position
        /// between `\r` and `\n`. (`$` will similarly not match between a `\r`
        /// and a `\n`.)
        ///
        /// ```
        /// use regex::RegexBuilder;
        ///
        /// let re = RegexBuilder::new(r"^")
        ///     .multi_line(true)
        ///     .crlf(true)
        ///     .build()
        ///     .unwrap();
        /// let hay = "\r\n\r\n";
        /// let ranges: Vec<_> = re.find_iter(hay).map(|m| m.range()).collect();
        /// assert_eq!(ranges, vec![0..0, 2..2, 4..4]);
        /// ```
        pub fn crlf(&mut self, yes: bool) -> &mut RegexBuilder {
            // Forward the flag to the internal builder, then return `self`
            // so that calls can be chained.
            self.builder.crlf(yes);
            self
        }
454
        /// Configures the line terminator to be used by the regex.
        ///
        /// The line terminator is relevant in two ways for a particular regex:
        ///
        /// * When dot-matches-new-line mode is *not* enabled (the default),
        /// then `.` will match any character except for the configured line
        /// terminator.
        /// * When multi-line mode is enabled (not the default), then `^` and
        /// `$` will match immediately after and before, respectively, a line
        /// terminator.
        ///
        /// In both cases, if CRLF mode is enabled in a particular context,
        /// then it takes precedence over any configured line terminator.
        ///
        /// This option cannot be configured from within the pattern.
        ///
        /// The default line terminator is `\n`.
        ///
        /// # Example
        ///
        /// This shows how to treat the NUL byte as a line terminator. This can
        /// be a useful heuristic when searching binary data.
        ///
        /// ```
        /// use regex::RegexBuilder;
        ///
        /// let re = RegexBuilder::new(r"^foo$")
        ///     .multi_line(true)
        ///     .line_terminator(b'\x00')
        ///     .build()
        ///     .unwrap();
        /// let hay = "\x00foo\x00";
        /// assert_eq!(Some(1..4), re.find(hay).map(|m| m.range()));
        /// ```
        ///
        /// This example shows that the behavior of `.` is impacted by this
        /// setting as well:
        ///
        /// ```
        /// use regex::RegexBuilder;
        ///
        /// let re = RegexBuilder::new(r".")
        ///     .line_terminator(b'\x00')
        ///     .build()
        ///     .unwrap();
        /// assert!(re.is_match("\n"));
        /// assert!(!re.is_match("\x00"));
        /// ```
        ///
        /// This shows that building a regex will fail if the byte given
        /// is not ASCII and the pattern could result in matching invalid
        /// UTF-8. This is because any singular non-ASCII byte is not valid
        /// UTF-8, and it is not permitted for a [`Regex`] to match invalid
        /// UTF-8. (It is permissible to use a non-ASCII byte when building a
        /// [`bytes::Regex`](crate::bytes::Regex).)
        ///
        /// ```
        /// use regex::RegexBuilder;
        ///
        /// assert!(RegexBuilder::new(r".").line_terminator(0x80).build().is_err());
        /// // Note that using a non-ASCII byte isn't enough on its own to
        /// // cause regex compilation to fail. You actually have to make use
        /// // of it in the regex in a way that leads to matching invalid
        /// // UTF-8. If you don't, then regex compilation will succeed!
        /// assert!(RegexBuilder::new(r"a").line_terminator(0x80).build().is_ok());
        /// ```
        pub fn line_terminator(&mut self, byte: u8) -> &mut RegexBuilder {
            // Forward the byte to the internal builder, which records it in
            // both its meta and syntax configurations. Return `self` so that
            // calls can be chained.
            self.builder.line_terminator(byte);
            self
        }
525
        /// This configures swap-greed mode for the entire pattern.
        ///
        /// When swap-greed mode is enabled, patterns like `a+` will become
        /// non-greedy and patterns like `a+?` will become greedy. In other
        /// words, the meanings of `a+` and `a+?` are switched.
        ///
        /// This setting can also be configured using the inline flag `U` in
        /// the pattern.
        ///
        /// The default for this is `false`.
        ///
        /// # Example
        ///
        /// ```
        /// use regex::RegexBuilder;
        ///
        /// let re = RegexBuilder::new(r"a+")
        ///     .swap_greed(true)
        ///     .build()
        ///     .unwrap();
        /// assert_eq!(Some("a"), re.find("aaa").map(|m| m.as_str()));
        /// ```
        pub fn swap_greed(&mut self, yes: bool) -> &mut RegexBuilder {
            // Forward the flag to the internal builder, then return `self`
            // so that calls can be chained.
            self.builder.swap_greed(yes);
            self
        }
552
        /// This configures verbose mode for the entire pattern.
        ///
        /// When enabled, whitespace will be treated as insignificant in the
        /// pattern and `#` can be used to start a comment until the next new
        /// line.
        ///
        /// Normally, in most places in a pattern, whitespace is treated
        /// literally. For example ` +` will match one or more ASCII whitespace
        /// characters.
        ///
        /// When verbose mode is enabled, `\#` can be used to match a literal
        /// `#` and `\ ` can be used to match a literal ASCII whitespace
        /// character.
        ///
        /// Verbose mode is useful for permitting regexes to be formatted and
        /// broken up more nicely. This may make them more easily readable.
        ///
        /// This setting can also be configured using the inline flag `x` in
        /// the pattern.
        ///
        /// The default for this is `false`.
        ///
        /// # Example
        ///
        /// ```
        /// use regex::RegexBuilder;
        ///
        /// let pat = r"
        ///     \b
        ///     (?<first>\p{Uppercase}\w*)  # always start with uppercase letter
        ///     [\s--\n]+                   # whitespace should separate names
        ///     (?: # middle name can be an initial!
        ///         (?:(?<initial>\p{Uppercase})\.|(?<middle>\p{Uppercase}\w*))
        ///         [\s--\n]+
        ///     )?
        ///     (?<last>\p{Uppercase}\w*)
        ///     \b
        /// ";
        /// let re = RegexBuilder::new(pat)
        ///     .ignore_whitespace(true)
        ///     .build()
        ///     .unwrap();
        ///
        /// let caps = re.captures("Harry Potter").unwrap();
        /// assert_eq!("Harry", &caps["first"]);
        /// assert_eq!("Potter", &caps["last"]);
        ///
        /// let caps = re.captures("Harry J. Potter").unwrap();
        /// assert_eq!("Harry", &caps["first"]);
        /// // Since a middle name/initial isn't required for an overall match,
        /// // we can't assume that 'initial' or 'middle' will be populated!
        /// assert_eq!(Some("J"), caps.name("initial").map(|m| m.as_str()));
        /// assert_eq!(None, caps.name("middle").map(|m| m.as_str()));
        /// assert_eq!("Potter", &caps["last"]);
        ///
        /// let caps = re.captures("Harry James Potter").unwrap();
        /// assert_eq!("Harry", &caps["first"]);
        /// // Since a middle name/initial isn't required for an overall match,
        /// // we can't assume that 'initial' or 'middle' will be populated!
        /// assert_eq!(None, caps.name("initial").map(|m| m.as_str()));
        /// assert_eq!(Some("James"), caps.name("middle").map(|m| m.as_str()));
        /// assert_eq!("Potter", &caps["last"]);
        /// ```
        pub fn ignore_whitespace(&mut self, yes: bool) -> &mut RegexBuilder {
            // Forward the flag to the internal builder, then return `self`
            // so that calls can be chained.
            self.builder.ignore_whitespace(yes);
            self
        }
620
        /// This configures octal mode for the entire pattern.
        ///
        /// Octal syntax is a little-known way of uttering Unicode codepoints
        /// in a pattern. For example, `a`, `\x61`, `\u0061` and `\141` are all
        /// equivalent patterns, where the last example shows octal syntax.
        ///
        /// While supporting octal syntax isn't in and of itself a problem,
        /// it does make good error messages harder. That is, in PCRE based
        /// regex engines, syntax like `\1` invokes a backreference, which is
        /// explicitly unsupported in this library. However, many users expect
        /// backreferences to be supported. Therefore, when octal support
        /// is disabled, the error message will explicitly mention that
        /// backreferences aren't supported.
        ///
        /// The default for this is `false`.
        ///
        /// # Example
        ///
        /// ```
        /// use regex::RegexBuilder;
        ///
        /// // Normally this pattern would not compile, with an error message
        /// // about backreferences not being supported. But with octal mode
        /// // enabled, octal escape sequences work.
        /// let re = RegexBuilder::new(r"\141")
        ///     .octal(true)
        ///     .build()
        ///     .unwrap();
        /// assert!(re.is_match("a"));
        /// ```
        pub fn octal(&mut self, yes: bool) -> &mut RegexBuilder {
            // Forward the flag to the internal builder, then return `self`
            // so that calls can be chained.
            self.builder.octal(yes);
            self
        }
655
        /// Sets the approximate size limit, in bytes, of the compiled regex.
        ///
        /// This roughly corresponds to the amount of heap memory, in
        /// bytes, occupied by a single regex. If the regex would otherwise
        /// approximately exceed this limit, then compiling that regex will
        /// fail.
        ///
        /// The main utility of a method like this is to avoid compiling
        /// regexes that use an unexpected amount of resources, such as
        /// time and memory. Even if the memory usage of a large regex is
        /// acceptable, its search time may not be. Namely, worst case time
        /// complexity for search is `O(m * n)`, where `m ~ len(pattern)` and
        /// `n ~ len(haystack)`. That is, search time depends, in part, on the
        /// size of the compiled regex. This means that putting a limit on the
        /// size of the regex limits how much a regex can impact search time.
        ///
        /// For more information about regex size limits, see the section on
        /// [untrusted inputs](crate#untrusted-input) in the top-level crate
        /// documentation.
        ///
        /// The default for this is some reasonable number that permits most
        /// patterns to compile successfully.
        ///
        /// # Example
        ///
        /// ```
        /// # if !cfg!(target_pointer_width = "64") { return; } // see #1041
        /// use regex::RegexBuilder;
        ///
        /// // It may surprise you how big some seemingly small patterns can
        /// // be! Since \w is Unicode aware, this generates a regex that can
        /// // match approximately 140,000 distinct codepoints.
        /// assert!(RegexBuilder::new(r"\w").size_limit(45_000).build().is_err());
        /// ```
        pub fn size_limit(&mut self, bytes: usize) -> &mut RegexBuilder {
            // Forward the limit to the internal builder, which applies it as
            // the NFA size limit of its meta configuration. Return `self` so
            // that calls can be chained.
            self.builder.size_limit(bytes);
            self
        }
694
        /// Set the approximate capacity, in bytes, of the cache of transitions
        /// used by the lazy DFA.
        ///
        /// While the lazy DFA isn't always used, it tends to be the most
        /// commonly used regex engine in default configurations. It tends to
        /// adopt the performance profile of a fully built DFA, but without the
        /// downside of taking worst case exponential time to build.
        ///
        /// The downside is that it needs to keep a cache of transitions and
        /// states that are built while running a search, and this cache
        /// can fill up. When it fills up, the cache will reset itself. Any
        /// previously generated states and transitions will then need to be
        /// re-generated. If this happens too many times, then this library
        /// will bail out of using the lazy DFA and switch to a different regex
        /// engine.
        ///
        /// If your regex provokes this particular downside of the lazy DFA,
        /// then it may be beneficial to increase its cache capacity. This will
        /// potentially reduce the frequency of cache resetting (ideally to
        /// `0`). While it won't fix all potential performance problems with
        /// the lazy DFA, increasing the cache capacity does fix some.
        ///
        /// There is no easy way to determine, a priori, whether increasing
        /// this cache capacity will help. In general, the larger your regex,
        /// the more cache it's likely to use. But that isn't an ironclad rule.
        /// For example, a regex like `[01]*1[01]{N}` would normally produce a
        /// fully built DFA that is exponential in size with respect to `N`.
        /// The lazy DFA will prevent exponential space blow-up, but its cache
        /// is likely to fill up, even when it's large and even for smallish
        /// values of `N`.
        ///
        /// If you aren't sure whether this helps or not, it is sensible to
        /// set this to some arbitrarily large number in testing, such as
        /// `usize::MAX`. Namely, this represents the amount of capacity that
        /// *may* be used. It's probably not a good idea to use `usize::MAX` in
        /// production though, since it implies there are no controls on heap
        /// memory used by this library during a search. In effect, set it to
        /// whatever you're willing to allocate for a single regex search.
        pub fn dfa_size_limit(&mut self, bytes: usize) -> &mut RegexBuilder {
            // Forward the capacity to the internal builder, which applies it
            // as the hybrid cache capacity of its meta configuration. Return
            // `self` so that calls can be chained.
            self.builder.dfa_size_limit(bytes);
            self
        }
737
738        /// Set the nesting limit for this parser.
739        ///
740        /// The nesting limit controls how deep the abstract syntax tree is
741        /// allowed to be. If the AST exceeds the given limit (e.g., with too
742        /// many nested groups), then an error is returned by the parser.
743        ///
744        /// The purpose of this limit is to act as a heuristic to prevent stack
745        /// overflow for consumers that do structural induction on an AST using
746        /// explicit recursion. While this crate never does this (instead using
747        /// constant stack space and moving the call stack to the heap), other
748        /// crates may.
749        ///
750        /// This limit is not checked until the entire AST is parsed.
751        /// Therefore, if callers want to put a limit on the amount of heap
752        /// space used, then they should impose a limit on the length, in
753        /// bytes, of the concrete pattern string. In particular, this is
754        /// viable since this parser implementation will limit itself to heap
755        /// space proportional to the length of the pattern string. See also
756        /// the [untrusted inputs](crate#untrusted-input) section in the
757        /// top-level crate documentation for more information about this.
758        ///
759        /// Note that a nest limit of `0` will return a nest limit error for
760        /// most patterns but not all. For example, a nest limit of `0` permits
761        /// `a` but not `ab`, since `ab` requires an explicit concatenation,
762        /// which results in a nest depth of `1`. In general, a nest limit is
763        /// not something that manifests in an obvious way in the concrete
764        /// syntax, therefore, it should not be used in a granular way.
765        ///
766        /// # Example
767        ///
768        /// ```
769        /// use regex::RegexBuilder;
770        ///
771        /// assert!(RegexBuilder::new(r"a").nest_limit(0).build().is_ok());
772        /// assert!(RegexBuilder::new(r"ab").nest_limit(0).build().is_err());
773        /// ```
774        pub fn nest_limit(&mut self, limit: u32) -> &mut RegexBuilder {
775            self.builder.nest_limit(limit);
776            self
777        }
778    }
779
780    /// A configurable builder for a [`RegexSet`].
781    ///
782    /// This builder can be used to programmatically set flags such as
783    /// `i` (case insensitive) and `x` (for verbose mode). This builder
784    /// can also be used to configure things like the line terminator
785    /// and a size limit on the compiled regular expression.
786    #[derive(Clone, Debug)]
787    pub struct RegexSetBuilder {
788        builder: Builder,
789    }
790
791    impl RegexSetBuilder {
792        /// Create a new builder with a default configuration for the given
793        /// patterns.
794        ///
795        /// If the patterns are invalid or exceed the configured size limits,
796        /// then an error will be returned when [`RegexSetBuilder::build`] is
797        /// called.
798        pub fn new<I, S>(patterns: I) -> RegexSetBuilder
799        where
800            I: IntoIterator<Item = S>,
801            S: AsRef<str>,
802        {
803            RegexSetBuilder { builder: Builder::new(patterns) }
804        }
805
806        /// Compiles the patterns given to `RegexSetBuilder::new` with the
807        /// configuration set on this builder.
808        ///
809        /// If the patterns aren't valid regexes or if a configured size limit
810        /// was exceeded, then an error is returned.
811        pub fn build(&self) -> Result<RegexSet, Error> {
812            self.builder.build_many_string()
813        }
814
        /// This configures Unicode mode for all of the patterns.
816        ///
817        /// Enabling Unicode mode does a number of things:
818        ///
819        /// * Most fundamentally, it causes the fundamental atom of matching
820        /// to be a single codepoint. When Unicode mode is disabled, it's a
821        /// single byte. For example, when Unicode mode is enabled, `.` will
        /// match `💩` once, whereas it will match 4 times when Unicode mode
823        /// is disabled. (Since the UTF-8 encoding of `💩` is 4 bytes long.)
824        /// * Case insensitive matching uses Unicode simple case folding rules.
825        /// * Unicode character classes like `\p{Letter}` and `\p{Greek}` are
826        /// available.
827        /// * Perl character classes are Unicode aware. That is, `\w`, `\s` and
828        /// `\d`.
829        /// * The word boundary assertions, `\b` and `\B`, use the Unicode
830        /// definition of a word character.
831        ///
832        /// Note that if Unicode mode is disabled, then the regex will fail to
833        /// compile if it could match invalid UTF-8. For example, when Unicode
834        /// mode is disabled, then since `.` matches any byte (except for
835        /// `\n`), then it can match invalid UTF-8 and thus building a regex
836        /// from it will fail. Another example is `\w` and `\W`. Since `\w` can
837        /// only match ASCII bytes when Unicode mode is disabled, it's allowed.
838        /// But `\W` can match more than ASCII bytes, including invalid UTF-8,
839        /// and so it is not allowed. This restriction can be lifted only by
840        /// using a [`bytes::RegexSet`](crate::bytes::RegexSet).
841        ///
842        /// For more details on the Unicode support in this crate, see the
843        /// [Unicode section](crate#unicode) in this crate's top-level
844        /// documentation.
845        ///
846        /// The default for this is `true`.
847        ///
848        /// # Example
849        ///
850        /// ```
851        /// use regex::RegexSetBuilder;
852        ///
853        /// let re = RegexSetBuilder::new([r"\w"])
854        ///     .unicode(false)
855        ///     .build()
856        ///     .unwrap();
857        /// // Normally greek letters would be included in \w, but since
858        /// // Unicode mode is disabled, it only matches ASCII letters.
859        /// assert!(!re.is_match("δ"));
860        ///
861        /// let re = RegexSetBuilder::new([r"s"])
862        ///     .case_insensitive(true)
863        ///     .unicode(false)
864        ///     .build()
865        ///     .unwrap();
866        /// // Normally 'ſ' is included when searching for 's' case
867        /// // insensitively due to Unicode's simple case folding rules. But
868        /// // when Unicode mode is disabled, only ASCII case insensitive rules
869        /// // are used.
870        /// assert!(!re.is_match("ſ"));
871        /// ```
872        pub fn unicode(&mut self, yes: bool) -> &mut RegexSetBuilder {
873            self.builder.unicode(yes);
874            self
875        }
876
877        /// This configures whether to enable case insensitive matching for all
878        /// of the patterns.
879        ///
880        /// This setting can also be configured using the inline flag `i`
881        /// in the pattern. For example, `(?i:foo)` matches `foo` case
882        /// insensitively while `(?-i:foo)` matches `foo` case sensitively.
883        ///
884        /// The default for this is `false`.
885        ///
886        /// # Example
887        ///
888        /// ```
889        /// use regex::RegexSetBuilder;
890        ///
891        /// let re = RegexSetBuilder::new([r"foo(?-i:bar)quux"])
892        ///     .case_insensitive(true)
893        ///     .build()
894        ///     .unwrap();
895        /// assert!(re.is_match("FoObarQuUx"));
896        /// // Even though case insensitive matching is enabled in the builder,
897        /// // it can be locally disabled within the pattern. In this case,
898        /// // `bar` is matched case sensitively.
899        /// assert!(!re.is_match("fooBARquux"));
900        /// ```
901        pub fn case_insensitive(&mut self, yes: bool) -> &mut RegexSetBuilder {
902            self.builder.case_insensitive(yes);
903            self
904        }
905
906        /// This configures multi-line mode for all of the patterns.
907        ///
908        /// Enabling multi-line mode changes the behavior of the `^` and `$`
909        /// anchor assertions. Instead of only matching at the beginning and
910        /// end of a haystack, respectively, multi-line mode causes them to
911        /// match at the beginning and end of a line *in addition* to the
912        /// beginning and end of a haystack. More precisely, `^` will match at
913        /// the position immediately following a `\n` and `$` will match at the
914        /// position immediately preceding a `\n`.
915        ///
916        /// The behavior of this option can be impacted by other settings too:
917        ///
918        /// * The [`RegexSetBuilder::line_terminator`] option changes `\n`
919        /// above to any ASCII byte.
920        /// * The [`RegexSetBuilder::crlf`] option changes the line terminator
921        /// to be either `\r` or `\n`, but never at the position between a `\r`
922        /// and `\n`.
923        ///
924        /// This setting can also be configured using the inline flag `m` in
925        /// the pattern.
926        ///
927        /// The default for this is `false`.
928        ///
929        /// # Example
930        ///
931        /// ```
932        /// use regex::RegexSetBuilder;
933        ///
934        /// let re = RegexSetBuilder::new([r"^foo$"])
935        ///     .multi_line(true)
936        ///     .build()
937        ///     .unwrap();
938        /// assert!(re.is_match("\nfoo\n"));
939        /// ```
940        pub fn multi_line(&mut self, yes: bool) -> &mut RegexSetBuilder {
941            self.builder.multi_line(yes);
942            self
943        }
944
        /// This configures dot-matches-new-line mode for all of the patterns.
946        ///
947        /// Perhaps surprisingly, the default behavior for `.` is not to match
948        /// any character, but rather, to match any character except for the
949        /// line terminator (which is `\n` by default). When this mode is
950        /// enabled, the behavior changes such that `.` truly matches any
951        /// character.
952        ///
953        /// This setting can also be configured using the inline flag `s` in
954        /// the pattern. For example, `(?s:.)` and `\p{any}` are equivalent
955        /// regexes.
956        ///
957        /// The default for this is `false`.
958        ///
959        /// # Example
960        ///
961        /// ```
962        /// use regex::RegexSetBuilder;
963        ///
964        /// let re = RegexSetBuilder::new([r"foo.bar"])
965        ///     .dot_matches_new_line(true)
966        ///     .build()
967        ///     .unwrap();
968        /// let hay = "foo\nbar";
969        /// assert!(re.is_match(hay));
970        /// ```
971        pub fn dot_matches_new_line(
972            &mut self,
973            yes: bool,
974        ) -> &mut RegexSetBuilder {
975            self.builder.dot_matches_new_line(yes);
976            self
977        }
978
979        /// This configures CRLF mode for all of the patterns.
980        ///
981        /// When CRLF mode is enabled, both `\r` ("carriage return" or CR for
982        /// short) and `\n` ("line feed" or LF for short) are treated as line
983        /// terminators. This results in the following:
984        ///
985        /// * Unless dot-matches-new-line mode is enabled, `.` will now match
986        /// any character except for `\n` and `\r`.
987        /// * When multi-line mode is enabled, `^` will match immediately
988        /// following a `\n` or a `\r`. Similarly, `$` will match immediately
989        /// preceding a `\n` or a `\r`. Neither `^` nor `$` will ever match
990        /// between `\r` and `\n`.
991        ///
992        /// This setting can also be configured using the inline flag `R` in
993        /// the pattern.
994        ///
995        /// The default for this is `false`.
996        ///
997        /// # Example
998        ///
999        /// ```
1000        /// use regex::RegexSetBuilder;
1001        ///
1002        /// let re = RegexSetBuilder::new([r"^foo$"])
1003        ///     .multi_line(true)
1004        ///     .crlf(true)
1005        ///     .build()
1006        ///     .unwrap();
1007        /// let hay = "\r\nfoo\r\n";
1008        /// // If CRLF mode weren't enabled here, then '$' wouldn't match
1009        /// // immediately after 'foo', and thus no match would be found.
1010        /// assert!(re.is_match(hay));
1011        /// ```
1012        ///
1013        /// This example demonstrates that `^` will never match at a position
1014        /// between `\r` and `\n`. (`$` will similarly not match between a `\r`
1015        /// and a `\n`.)
1016        ///
1017        /// ```
1018        /// use regex::RegexSetBuilder;
1019        ///
1020        /// let re = RegexSetBuilder::new([r"^\n"])
1021        ///     .multi_line(true)
1022        ///     .crlf(true)
1023        ///     .build()
1024        ///     .unwrap();
1025        /// assert!(!re.is_match("\r\n"));
1026        /// ```
1027        pub fn crlf(&mut self, yes: bool) -> &mut RegexSetBuilder {
1028            self.builder.crlf(yes);
1029            self
1030        }
1031
1032        /// Configures the line terminator to be used by the regex.
1033        ///
1034        /// The line terminator is relevant in two ways for a particular regex:
1035        ///
1036        /// * When dot-matches-new-line mode is *not* enabled (the default),
1037        /// then `.` will match any character except for the configured line
1038        /// terminator.
1039        /// * When multi-line mode is enabled (not the default), then `^` and
1040        /// `$` will match immediately after and before, respectively, a line
1041        /// terminator.
1042        ///
1043        /// In both cases, if CRLF mode is enabled in a particular context,
1044        /// then it takes precedence over any configured line terminator.
1045        ///
1046        /// This option cannot be configured from within the pattern.
1047        ///
1048        /// The default line terminator is `\n`.
1049        ///
1050        /// # Example
1051        ///
1052        /// This shows how to treat the NUL byte as a line terminator. This can
1053        /// be a useful heuristic when searching binary data.
1054        ///
1055        /// ```
1056        /// use regex::RegexSetBuilder;
1057        ///
1058        /// let re = RegexSetBuilder::new([r"^foo$"])
1059        ///     .multi_line(true)
1060        ///     .line_terminator(b'\x00')
1061        ///     .build()
1062        ///     .unwrap();
1063        /// let hay = "\x00foo\x00";
1064        /// assert!(re.is_match(hay));
1065        /// ```
1066        ///
1067        /// This example shows that the behavior of `.` is impacted by this
1068        /// setting as well:
1069        ///
1070        /// ```
1071        /// use regex::RegexSetBuilder;
1072        ///
1073        /// let re = RegexSetBuilder::new([r"."])
1074        ///     .line_terminator(b'\x00')
1075        ///     .build()
1076        ///     .unwrap();
1077        /// assert!(re.is_match("\n"));
1078        /// assert!(!re.is_match("\x00"));
1079        /// ```
1080        ///
1081        /// This shows that building a regex will fail if the byte given
1082        /// is not ASCII and the pattern could result in matching invalid
1083        /// UTF-8. This is because any singular non-ASCII byte is not valid
1084        /// UTF-8, and it is not permitted for a [`RegexSet`] to match invalid
1085        /// UTF-8. (It is permissible to use a non-ASCII byte when building a
1086        /// [`bytes::RegexSet`](crate::bytes::RegexSet).)
1087        ///
1088        /// ```
1089        /// use regex::RegexSetBuilder;
1090        ///
1091        /// assert!(
1092        ///     RegexSetBuilder::new([r"."])
1093        ///         .line_terminator(0x80)
1094        ///         .build()
1095        ///         .is_err()
1096        /// );
1097        /// // Note that using a non-ASCII byte isn't enough on its own to
1098        /// // cause regex compilation to fail. You actually have to make use
1099        /// // of it in the regex in a way that leads to matching invalid
1100        /// // UTF-8. If you don't, then regex compilation will succeed!
1101        /// assert!(
1102        ///     RegexSetBuilder::new([r"a"])
1103        ///         .line_terminator(0x80)
1104        ///         .build()
1105        ///         .is_ok()
1106        /// );
1107        /// ```
1108        pub fn line_terminator(&mut self, byte: u8) -> &mut RegexSetBuilder {
1109            self.builder.line_terminator(byte);
1110            self
1111        }
1112
1113        /// This configures swap-greed mode for all of the patterns.
1114        ///
1115        /// When swap-greed mode is enabled, patterns like `a+` will become
1116        /// non-greedy and patterns like `a+?` will become greedy. In other
1117        /// words, the meanings of `a+` and `a+?` are switched.
1118        ///
1119        /// This setting can also be configured using the inline flag `U` in
1120        /// the pattern.
1121        ///
1122        /// Note that this is generally not useful for a `RegexSet` since a
1123        /// `RegexSet` can only report whether a pattern matches or not. Since
1124        /// greediness never impacts whether a match is found or not (only the
1125        /// offsets of the match), it follows that whether parts of a pattern
1126        /// are greedy or not doesn't matter for a `RegexSet`.
1127        ///
1128        /// The default for this is `false`.
1129        pub fn swap_greed(&mut self, yes: bool) -> &mut RegexSetBuilder {
1130            self.builder.swap_greed(yes);
1131            self
1132        }
1133
1134        /// This configures verbose mode for all of the patterns.
1135        ///
        /// When enabled, whitespace will be treated as insignificant in the
1137        /// pattern and `#` can be used to start a comment until the next new
1138        /// line.
1139        ///
1140        /// Normally, in most places in a pattern, whitespace is treated
1141        /// literally. For example ` +` will match one or more ASCII whitespace
1142        /// characters.
1143        ///
1144        /// When verbose mode is enabled, `\#` can be used to match a literal
1145        /// `#` and `\ ` can be used to match a literal ASCII whitespace
1146        /// character.
1147        ///
1148        /// Verbose mode is useful for permitting regexes to be formatted and
1149        /// broken up more nicely. This may make them more easily readable.
1150        ///
1151        /// This setting can also be configured using the inline flag `x` in
1152        /// the pattern.
1153        ///
1154        /// The default for this is `false`.
1155        ///
1156        /// # Example
1157        ///
1158        /// ```
1159        /// use regex::RegexSetBuilder;
1160        ///
1161        /// let pat = r"
1162        ///     \b
1163        ///     (?<first>\p{Uppercase}\w*)  # always start with uppercase letter
1164        ///     [\s--\n]+                   # whitespace should separate names
1165        ///     (?: # middle name can be an initial!
1166        ///         (?:(?<initial>\p{Uppercase})\.|(?<middle>\p{Uppercase}\w*))
1167        ///         [\s--\n]+
1168        ///     )?
1169        ///     (?<last>\p{Uppercase}\w*)
1170        ///     \b
1171        /// ";
1172        /// let re = RegexSetBuilder::new([pat])
1173        ///     .ignore_whitespace(true)
1174        ///     .build()
1175        ///     .unwrap();
1176        /// assert!(re.is_match("Harry Potter"));
1177        /// assert!(re.is_match("Harry J. Potter"));
1178        /// assert!(re.is_match("Harry James Potter"));
1179        /// assert!(!re.is_match("harry J. Potter"));
1180        /// ```
1181        pub fn ignore_whitespace(
1182            &mut self,
1183            yes: bool,
1184        ) -> &mut RegexSetBuilder {
1185            self.builder.ignore_whitespace(yes);
1186            self
1187        }
1188
1189        /// This configures octal mode for all of the patterns.
1190        ///
1191        /// Octal syntax is a little-known way of uttering Unicode codepoints
1192        /// in a pattern. For example, `a`, `\x61`, `\u0061` and `\141` are all
1193        /// equivalent patterns, where the last example shows octal syntax.
1194        ///
1195        /// While supporting octal syntax isn't in and of itself a problem,
1196        /// it does make good error messages harder. That is, in PCRE based
1197        /// regex engines, syntax like `\1` invokes a backreference, which is
        /// explicitly unsupported by this library. However, many users expect
1199        /// backreferences to be supported. Therefore, when octal support
1200        /// is disabled, the error message will explicitly mention that
1201        /// backreferences aren't supported.
1202        ///
1203        /// The default for this is `false`.
1204        ///
1205        /// # Example
1206        ///
1207        /// ```
1208        /// use regex::RegexSetBuilder;
1209        ///
1210        /// // Normally this pattern would not compile, with an error message
1211        /// // about backreferences not being supported. But with octal mode
1212        /// // enabled, octal escape sequences work.
1213        /// let re = RegexSetBuilder::new([r"\141"])
1214        ///     .octal(true)
1215        ///     .build()
1216        ///     .unwrap();
1217        /// assert!(re.is_match("a"));
1218        /// ```
1219        pub fn octal(&mut self, yes: bool) -> &mut RegexSetBuilder {
1220            self.builder.octal(yes);
1221            self
1222        }
1223
1224        /// Sets the approximate size limit, in bytes, of the compiled regex.
1225        ///
        /// This roughly corresponds to the amount of heap memory, in
1227        /// bytes, occupied by a single regex. If the regex would otherwise
1228        /// approximately exceed this limit, then compiling that regex will
1229        /// fail.
1230        ///
1231        /// The main utility of a method like this is to avoid compiling
1232        /// regexes that use an unexpected amount of resources, such as
1233        /// time and memory. Even if the memory usage of a large regex is
1234        /// acceptable, its search time may not be. Namely, worst case time
1235        /// complexity for search is `O(m * n)`, where `m ~ len(pattern)` and
1236        /// `n ~ len(haystack)`. That is, search time depends, in part, on the
1237        /// size of the compiled regex. This means that putting a limit on the
1238        /// size of the regex limits how much a regex can impact search time.
1239        ///
1240        /// For more information about regex size limits, see the section on
1241        /// [untrusted inputs](crate#untrusted-input) in the top-level crate
1242        /// documentation.
1243        ///
1244        /// The default for this is some reasonable number that permits most
1245        /// patterns to compile successfully.
1246        ///
1247        /// # Example
1248        ///
1249        /// ```
1250        /// # if !cfg!(target_pointer_width = "64") { return; } // see #1041
1251        /// use regex::RegexSetBuilder;
1252        ///
1253        /// // It may surprise you how big some seemingly small patterns can
1254        /// // be! Since \w is Unicode aware, this generates a regex that can
1255        /// // match approximately 140,000 distinct codepoints.
1256        /// assert!(
1257        ///     RegexSetBuilder::new([r"\w"])
1258        ///         .size_limit(45_000)
1259        ///         .build()
1260        ///         .is_err()
1261        /// );
1262        /// ```
1263        pub fn size_limit(&mut self, bytes: usize) -> &mut RegexSetBuilder {
1264            self.builder.size_limit(bytes);
1265            self
1266        }
1267
1268        /// Set the approximate capacity, in bytes, of the cache of transitions
1269        /// used by the lazy DFA.
1270        ///
        /// While the lazy DFA isn't always used, it tends to be the most
        /// commonly used regex engine in default configurations. It tends to
        /// adopt the performance profile of a fully built DFA, but without the
1274        /// downside of taking worst case exponential time to build.
1275        ///
1276        /// The downside is that it needs to keep a cache of transitions and
1277        /// states that are built while running a search, and this cache
1278        /// can fill up. When it fills up, the cache will reset itself. Any
1279        /// previously generated states and transitions will then need to be
1280        /// re-generated. If this happens too many times, then this library
1281        /// will bail out of using the lazy DFA and switch to a different regex
1282        /// engine.
1283        ///
1284        /// If your regex provokes this particular downside of the lazy DFA,
1285        /// then it may be beneficial to increase its cache capacity. This will
1286        /// potentially reduce the frequency of cache resetting (ideally to
1287        /// `0`). While it won't fix all potential performance problems with
1288        /// the lazy DFA, increasing the cache capacity does fix some.
1289        ///
1290        /// There is no easy way to determine, a priori, whether increasing
1291        /// this cache capacity will help. In general, the larger your regex,
1292        /// the more cache it's likely to use. But that isn't an ironclad rule.
1293        /// For example, a regex like `[01]*1[01]{N}` would normally produce a
        /// fully built DFA that is exponential in size with respect to `N`.
        /// The lazy DFA will prevent exponential space blow-up, but its cache
1296        /// is likely to fill up, even when it's large and even for smallish
1297        /// values of `N`.
1298        ///
1299        /// If you aren't sure whether this helps or not, it is sensible to
1300        /// set this to some arbitrarily large number in testing, such as
1301        /// `usize::MAX`. Namely, this represents the amount of capacity that
1302        /// *may* be used. It's probably not a good idea to use `usize::MAX` in
1303        /// production though, since it implies there are no controls on heap
1304        /// memory used by this library during a search. In effect, set it to
1305        /// whatever you're willing to allocate for a single regex search.
1306        pub fn dfa_size_limit(
1307            &mut self,
1308            bytes: usize,
1309        ) -> &mut RegexSetBuilder {
1310            self.builder.dfa_size_limit(bytes);
1311            self
1312        }
1313
1314        /// Set the nesting limit for this parser.
1315        ///
1316        /// The nesting limit controls how deep the abstract syntax tree is
1317        /// allowed to be. If the AST exceeds the given limit (e.g., with too
1318        /// many nested groups), then an error is returned by the parser.
1319        ///
1320        /// The purpose of this limit is to act as a heuristic to prevent stack
1321        /// overflow for consumers that do structural induction on an AST using
1322        /// explicit recursion. While this crate never does this (instead using
1323        /// constant stack space and moving the call stack to the heap), other
1324        /// crates may.
1325        ///
1326        /// This limit is not checked until the entire AST is parsed.
1327        /// Therefore, if callers want to put a limit on the amount of heap
1328        /// space used, then they should impose a limit on the length, in
1329        /// bytes, of the concrete pattern string. In particular, this is
1330        /// viable since this parser implementation will limit itself to heap
1331        /// space proportional to the length of the pattern string. See also
1332        /// the [untrusted inputs](crate#untrusted-input) section in the
1333        /// top-level crate documentation for more information about this.
1334        ///
1335        /// Note that a nest limit of `0` will return a nest limit error for
1336        /// most patterns but not all. For example, a nest limit of `0` permits
1337        /// `a` but not `ab`, since `ab` requires an explicit concatenation,
1338        /// which results in a nest depth of `1`. In general, a nest limit is
1339        /// not something that manifests in an obvious way in the concrete
1340        /// syntax, therefore, it should not be used in a granular way.
1341        ///
1342        /// # Example
1343        ///
1344        /// ```
1345        /// use regex::RegexSetBuilder;
1346        ///
1347        /// assert!(RegexSetBuilder::new([r"a"]).nest_limit(0).build().is_ok());
1348        /// assert!(RegexSetBuilder::new([r"ab"]).nest_limit(0).build().is_err());
1349        /// ```
1350        pub fn nest_limit(&mut self, limit: u32) -> &mut RegexSetBuilder {
1351            self.builder.nest_limit(limit);
1352            self
1353        }
1354    }
1355}
1356
1357pub(crate) mod bytes {
1358    use crate::{
1359        bytes::{Regex, RegexSet},
1360        error::Error,
1361    };
1362
1363    use super::Builder;
1364
1365    /// A configurable builder for a [`Regex`].
1366    ///
1367    /// This builder can be used to programmatically set flags such as `i`
1368    /// (case insensitive) and `x` (for verbose mode). This builder can also be
1369    /// used to configure things like the line terminator and a size limit on
1370    /// the compiled regular expression.
1371    #[derive(Clone, Debug)]
1372    pub struct RegexBuilder {
1373        builder: Builder,
1374    }
1375
1376    impl RegexBuilder {
1377        /// Create a new builder with a default configuration for the given
1378        /// pattern.
1379        ///
1380        /// If the pattern is invalid or exceeds the configured size limits,
1381        /// then an error will be returned when [`RegexBuilder::build`] is
1382        /// called.
1383        pub fn new(pattern: &str) -> RegexBuilder {
1384            RegexBuilder { builder: Builder::new([pattern]) }
1385        }
1386
1387        /// Compiles the pattern given to `RegexBuilder::new` with the
1388        /// configuration set on this builder.
1389        ///
1390        /// If the pattern isn't a valid regex or if a configured size limit
1391        /// was exceeded, then an error is returned.
1392        pub fn build(&self) -> Result<Regex, Error> {
1393            self.builder.build_one_bytes()
1394        }
1395
1396        /// This configures Unicode mode for the entire pattern.
1397        ///
1398        /// Enabling Unicode mode does a number of things:
1399        ///
1400        /// * Most fundamentally, it causes the fundamental atom of matching
1401        /// to be a single codepoint. When Unicode mode is disabled, it's a
1402        /// single byte. For example, when Unicode mode is enabled, `.` will
        /// match `💩` once, whereas it will match 4 times when Unicode mode
1404        /// is disabled. (Since the UTF-8 encoding of `💩` is 4 bytes long.)
1405        /// * Case insensitive matching uses Unicode simple case folding rules.
1406        /// * Unicode character classes like `\p{Letter}` and `\p{Greek}` are
1407        /// available.
1408        /// * Perl character classes are Unicode aware. That is, `\w`, `\s` and
1409        /// `\d`.
1410        /// * The word boundary assertions, `\b` and `\B`, use the Unicode
1411        /// definition of a word character.
1412        ///
1413        /// Note that unlike the top-level `Regex` for searching `&str`, it
1414        /// is permitted to disable Unicode mode even if the resulting pattern
1415        /// could match invalid UTF-8. For example, `(?-u:.)` is not a valid
1416        /// pattern for a top-level `Regex`, but is valid for a `bytes::Regex`.
1417        ///
1418        /// For more details on the Unicode support in this crate, see the
1419        /// [Unicode section](crate#unicode) in this crate's top-level
1420        /// documentation.
1421        ///
1422        /// The default for this is `true`.
1423        ///
1424        /// # Example
1425        ///
1426        /// ```
1427        /// use regex::bytes::RegexBuilder;
1428        ///
1429        /// let re = RegexBuilder::new(r"\w")
1430        ///     .unicode(false)
1431        ///     .build()
1432        ///     .unwrap();
1433        /// // Normally greek letters would be included in \w, but since
1434        /// // Unicode mode is disabled, it only matches ASCII letters.
1435        /// assert!(!re.is_match("δ".as_bytes()));
1436        ///
1437        /// let re = RegexBuilder::new(r"s")
1438        ///     .case_insensitive(true)
1439        ///     .unicode(false)
1440        ///     .build()
1441        ///     .unwrap();
1442        /// // Normally 'ſ' is included when searching for 's' case
1443        /// // insensitively due to Unicode's simple case folding rules. But
1444        /// // when Unicode mode is disabled, only ASCII case insensitive rules
1445        /// // are used.
1446        /// assert!(!re.is_match("ſ".as_bytes()));
1447        /// ```
1448        ///
1449        /// Since this builder is for constructing a [`bytes::Regex`](Regex),
1450        /// one can disable Unicode mode even if it would match invalid UTF-8:
1451        ///
1452        /// ```
1453        /// use regex::bytes::RegexBuilder;
1454        ///
1455        /// let re = RegexBuilder::new(r".")
1456        ///     .unicode(false)
1457        ///     .build()
1458        ///     .unwrap();
1459        /// // Normally greek letters would be included in \w, but since
1460        /// // Unicode mode is disabled, it only matches ASCII letters.
1461        /// assert!(re.is_match(b"\xFF"));
1462        /// ```
1463        pub fn unicode(&mut self, yes: bool) -> &mut RegexBuilder {
1464            self.builder.unicode(yes);
1465            self
1466        }
1467
1468        /// This configures whether to enable case insensitive matching for the
1469        /// entire pattern.
1470        ///
1471        /// This setting can also be configured using the inline flag `i`
1472        /// in the pattern. For example, `(?i:foo)` matches `foo` case
1473        /// insensitively while `(?-i:foo)` matches `foo` case sensitively.
1474        ///
1475        /// The default for this is `false`.
1476        ///
1477        /// # Example
1478        ///
1479        /// ```
1480        /// use regex::bytes::RegexBuilder;
1481        ///
1482        /// let re = RegexBuilder::new(r"foo(?-i:bar)quux")
1483        ///     .case_insensitive(true)
1484        ///     .build()
1485        ///     .unwrap();
1486        /// assert!(re.is_match(b"FoObarQuUx"));
1487        /// // Even though case insensitive matching is enabled in the builder,
1488        /// // it can be locally disabled within the pattern. In this case,
1489        /// // `bar` is matched case sensitively.
1490        /// assert!(!re.is_match(b"fooBARquux"));
1491        /// ```
1492        pub fn case_insensitive(&mut self, yes: bool) -> &mut RegexBuilder {
1493            self.builder.case_insensitive(yes);
1494            self // returned so that configuration calls can be chained
1495        }
1496
1497        /// This configures multi-line mode for the entire pattern.
1498        ///
1499        /// Enabling multi-line mode changes the behavior of the `^` and `$`
1500        /// anchor assertions. Instead of only matching at the beginning and
1501        /// end of a haystack, respectively, multi-line mode causes them to
1502        /// match at the beginning and end of a line *in addition* to the
1503        /// beginning and end of a haystack. More precisely, `^` will match at
1504        /// the position immediately following a `\n` and `$` will match at the
1505        /// position immediately preceding a `\n`.
1506        ///
1507        /// The behavior of this option can be impacted by other settings too:
1508        ///
1509        /// * The [`RegexBuilder::line_terminator`] option changes `\n` above
1510        /// to any byte (non-ASCII bytes require Unicode mode to be disabled).
1511        /// * The [`RegexBuilder::crlf`] option changes the line terminator to
1512        /// be either `\r` or `\n`, but never at the position between a `\r`
1513        /// and `\n`.
1514        ///
1515        /// This setting can also be configured using the inline flag `m` in
1516        /// the pattern.
1517        ///
1518        /// The default for this is `false`.
1519        ///
1520        /// # Example
1521        ///
1522        /// ```
1523        /// use regex::bytes::RegexBuilder;
1524        ///
1525        /// let re = RegexBuilder::new(r"^foo$")
1526        ///     .multi_line(true)
1527        ///     .build()
1528        ///     .unwrap();
1529        /// assert_eq!(Some(1..4), re.find(b"\nfoo\n").map(|m| m.range()));
1530        /// ```
1531        pub fn multi_line(&mut self, yes: bool) -> &mut RegexBuilder {
1532            self.builder.multi_line(yes);
1533            self
1534        }
1535
1536        /// This configures dot-matches-new-line mode for the entire pattern.
1537        ///
1538        /// Perhaps surprisingly, the default behavior for `.` is not to match
1539        /// any character, but rather, to match any character except for the
1540        /// line terminator (which is `\n` by default). When this mode is
1541        /// enabled, the behavior changes such that `.` truly matches any
1542        /// character.
1543        ///
1544        /// This setting can also be configured using the inline flag `s` in
1545        /// the pattern. For example, `(?s:.)` and `\p{any}` are equivalent
1546        /// regexes.
1547        ///
1548        /// The default for this is `false`.
1549        ///
1550        /// # Example
1551        ///
1552        /// ```
1553        /// use regex::bytes::RegexBuilder;
1554        ///
1555        /// let re = RegexBuilder::new(r"foo.bar")
1556        ///     .dot_matches_new_line(true)
1557        ///     .build()
1558        ///     .unwrap();
1559        /// let hay = b"foo\nbar";
1560        /// assert_eq!(Some(&b"foo\nbar"[..]), re.find(hay).map(|m| m.as_bytes()));
1561        /// ```
1562        pub fn dot_matches_new_line(
1563            &mut self,
1564            yes: bool,
1565        ) -> &mut RegexBuilder {
1566            self.builder.dot_matches_new_line(yes);
1567            self // returned so that configuration calls can be chained
1568        }
1569
1570        /// This configures CRLF mode for the entire pattern.
1571        ///
1572        /// When CRLF mode is enabled, both `\r` ("carriage return" or CR for
1573        /// short) and `\n` ("line feed" or LF for short) are treated as line
1574        /// terminators. This results in the following:
1575        ///
1576        /// * Unless dot-matches-new-line mode is enabled, `.` will now match
1577        /// any character except for `\n` and `\r`.
1578        /// * When multi-line mode is enabled, `^` will match immediately
1579        /// following a `\n` or a `\r`. Similarly, `$` will match immediately
1580        /// preceding a `\n` or a `\r`. Neither `^` nor `$` will ever match
1581        /// between `\r` and `\n`.
1582        ///
1583        /// This setting can also be configured using the inline flag `R` in
1584        /// the pattern.
1585        ///
1586        /// The default for this is `false`.
1587        ///
1588        /// # Example
1589        ///
1590        /// ```
1591        /// use regex::bytes::RegexBuilder;
1592        ///
1593        /// let re = RegexBuilder::new(r"^foo$")
1594        ///     .multi_line(true)
1595        ///     .crlf(true)
1596        ///     .build()
1597        ///     .unwrap();
1598        /// let hay = b"\r\nfoo\r\n";
1599        /// // If CRLF mode weren't enabled here, then '$' wouldn't match
1600        /// // immediately after 'foo', and thus no match would be found.
1601        /// assert_eq!(Some(&b"foo"[..]), re.find(hay).map(|m| m.as_bytes()));
1602        /// ```
1603        ///
1604        /// This example demonstrates that `^` will never match at a position
1605        /// between `\r` and `\n`. (`$` will similarly not match between a `\r`
1606        /// and a `\n`.)
1607        ///
1608        /// ```
1609        /// use regex::bytes::RegexBuilder;
1610        ///
1611        /// let re = RegexBuilder::new(r"^")
1612        ///     .multi_line(true)
1613        ///     .crlf(true)
1614        ///     .build()
1615        ///     .unwrap();
1616        /// let hay = b"\r\n\r\n";
1617        /// let ranges: Vec<_> = re.find_iter(hay).map(|m| m.range()).collect();
1618        /// assert_eq!(ranges, vec![0..0, 2..2, 4..4]);
1619        /// ```
1620        pub fn crlf(&mut self, yes: bool) -> &mut RegexBuilder {
1621            self.builder.crlf(yes);
1622            self // returned so that configuration calls can be chained
1623        }
1624
1625        /// Configures the line terminator to be used by the regex.
1626        ///
1627        /// The line terminator is relevant in two ways for a particular regex:
1628        ///
1629        /// * When dot-matches-new-line mode is *not* enabled (the default),
1630        /// then `.` will match any character except for the configured line
1631        /// terminator.
1632        /// * When multi-line mode is enabled (not the default), then `^` and
1633        /// `$` will match immediately after and before, respectively, a line
1634        /// terminator.
1635        ///
1636        /// In both cases, if CRLF mode is enabled in a particular context,
1637        /// then it takes precedence over any configured line terminator.
1638        ///
1639        /// This option cannot be configured from within the pattern.
1640        ///
1641        /// The default line terminator is `\n`.
1642        ///
1643        /// # Example
1644        ///
1645        /// This shows how to treat the NUL byte as a line terminator. This can
1646        /// be a useful heuristic when searching binary data.
1647        ///
1648        /// ```
1649        /// use regex::bytes::RegexBuilder;
1650        ///
1651        /// let re = RegexBuilder::new(r"^foo$")
1652        ///     .multi_line(true)
1653        ///     .line_terminator(b'\x00')
1654        ///     .build()
1655        ///     .unwrap();
1656        /// let hay = b"\x00foo\x00";
1657        /// assert_eq!(Some(1..4), re.find(hay).map(|m| m.range()));
1658        /// ```
1659        ///
1660        /// This example shows that the behavior of `.` is impacted by this
1661        /// setting as well:
1662        ///
1663        /// ```
1664        /// use regex::bytes::RegexBuilder;
1665        ///
1666        /// let re = RegexBuilder::new(r".")
1667        ///     .line_terminator(b'\x00')
1668        ///     .build()
1669        ///     .unwrap();
1670        /// assert!(re.is_match(b"\n"));
1671        /// assert!(!re.is_match(b"\x00"));
1672        /// ```
1673        ///
1674        /// This shows that building a regex will work even when the byte
1675        /// given is not ASCII. This is unlike the top-level `Regex` API where
1676        /// matching invalid UTF-8 is not allowed.
1677        ///
1678        /// Note though that you must disable Unicode mode. This is required
1679        /// because Unicode mode requires matching one codepoint at a time,
1680        /// and there is no way to match a non-ASCII byte as if it were a
1681        /// codepoint.
1682        ///
1683        /// ```
1684        /// use regex::bytes::RegexBuilder;
1685        ///
1686        /// assert!(
1687        ///     RegexBuilder::new(r".")
1688        ///         .unicode(false)
1689        ///         .line_terminator(0x80)
1690        ///         .build()
1691        ///         .is_ok(),
1692        /// );
1693        /// ```
1694        pub fn line_terminator(&mut self, byte: u8) -> &mut RegexBuilder {
1695            self.builder.line_terminator(byte);
1696            self // returned so that configuration calls can be chained
1697        }
1698
1699        /// This configures swap-greed mode for the entire pattern.
1700        ///
1701        /// When swap-greed mode is enabled, patterns like `a+` will become
1702        /// non-greedy and patterns like `a+?` will become greedy. In other
1703        /// words, the meanings of `a+` and `a+?` are switched.
1704        ///
1705        /// This setting can also be configured using the inline flag `U` in
1706        /// the pattern.
1707        ///
1708        /// The default for this is `false`.
1709        ///
1710        /// # Example
1711        ///
1712        /// ```
1713        /// use regex::bytes::RegexBuilder;
1714        ///
1715        /// let re = RegexBuilder::new(r"a+")
1716        ///     .swap_greed(true)
1717        ///     .build()
1718        ///     .unwrap();
1719        /// assert_eq!(Some(&b"a"[..]), re.find(b"aaa").map(|m| m.as_bytes()));
1720        /// ```
1721        pub fn swap_greed(&mut self, yes: bool) -> &mut RegexBuilder {
1722            self.builder.swap_greed(yes);
1723            self // returned so that configuration calls can be chained
1724        }
1725
1726        /// This configures verbose mode for the entire pattern.
1727        ///
1728        /// When enabled, whitespace will be treated as insignificant in the
1729        /// pattern and `#` can be used to start a comment until the next new
1730        /// line.
1731        ///
1732        /// Normally, in most places in a pattern, whitespace is treated
1733        /// literally. For example ` +` will match one or more ASCII whitespace
1734        /// characters.
1735        ///
1736        /// When verbose mode is enabled, `\#` can be used to match a literal
1737        /// `#` and `\ ` can be used to match a literal ASCII whitespace
1738        /// character.
1739        ///
1740        /// Verbose mode is useful for permitting regexes to be formatted and
1741        /// broken up more nicely. This may make them more easily readable.
1742        ///
1743        /// This setting can also be configured using the inline flag `x` in
1744        /// the pattern.
1745        ///
1746        /// The default for this is `false`.
1747        ///
1748        /// # Example
1749        ///
1750        /// ```
1751        /// use regex::bytes::RegexBuilder;
1752        ///
1753        /// let pat = r"
1754        ///     \b
1755        ///     (?<first>\p{Uppercase}\w*)  # always start with uppercase letter
1756        ///     [\s--\n]+                   # whitespace should separate names
1757        ///     (?: # middle name can be an initial!
1758        ///         (?:(?<initial>\p{Uppercase})\.|(?<middle>\p{Uppercase}\w*))
1759        ///         [\s--\n]+
1760        ///     )?
1761        ///     (?<last>\p{Uppercase}\w*)
1762        ///     \b
1763        /// ";
1764        /// let re = RegexBuilder::new(pat)
1765        ///     .ignore_whitespace(true)
1766        ///     .build()
1767        ///     .unwrap();
1768        ///
1769        /// let caps = re.captures(b"Harry Potter").unwrap();
1770        /// assert_eq!(&b"Harry"[..], &caps["first"]);
1771        /// assert_eq!(&b"Potter"[..], &caps["last"]);
1772        ///
1773        /// let caps = re.captures(b"Harry J. Potter").unwrap();
1774        /// assert_eq!(&b"Harry"[..], &caps["first"]);
1775        /// // Since a middle name/initial isn't required for an overall match,
1776        /// // we can't assume that 'initial' or 'middle' will be populated!
1777        /// assert_eq!(
1778        ///     Some(&b"J"[..]),
1779        ///     caps.name("initial").map(|m| m.as_bytes()),
1780        /// );
1781        /// assert_eq!(None, caps.name("middle").map(|m| m.as_bytes()));
1782        /// assert_eq!(&b"Potter"[..], &caps["last"]);
1783        ///
1784        /// let caps = re.captures(b"Harry James Potter").unwrap();
1785        /// assert_eq!(&b"Harry"[..], &caps["first"]);
1786        /// // Since a middle name/initial isn't required for an overall match,
1787        /// // we can't assume that 'initial' or 'middle' will be populated!
1788        /// assert_eq!(None, caps.name("initial").map(|m| m.as_bytes()));
1789        /// assert_eq!(
1790        ///     Some(&b"James"[..]),
1791        ///     caps.name("middle").map(|m| m.as_bytes()),
1792        /// );
1793        /// assert_eq!(&b"Potter"[..], &caps["last"]);
1794        /// ```
1795        pub fn ignore_whitespace(&mut self, yes: bool) -> &mut RegexBuilder {
1796            self.builder.ignore_whitespace(yes);
1797            self
1798        }
1799
1800        /// This configures octal mode for the entire pattern.
1801        ///
1802        /// Octal syntax is a little-known way of uttering Unicode codepoints
1803        /// in a pattern. For example, `a`, `\x61`, `\u0061` and `\141` are all
1804        /// equivalent patterns, where the last example shows octal syntax.
1805        ///
1806        /// While supporting octal syntax isn't in and of itself a problem,
1807        /// it does make good error messages harder. That is, in PCRE based
1808        /// regex engines, syntax like `\1` invokes a backreference, which is
1809        /// explicitly unsupported by this library. However, many users expect
1810        /// backreferences to be supported. Therefore, when octal support
1811        /// is disabled, the error message will explicitly mention that
1812        /// backreferences aren't supported.
1813        ///
1814        /// The default for this is `false`.
1815        ///
1816        /// # Example
1817        ///
1818        /// ```
1819        /// use regex::bytes::RegexBuilder;
1820        ///
1821        /// // Normally this pattern would not compile, with an error message
1822        /// // about backreferences not being supported. But with octal mode
1823        /// // enabled, octal escape sequences work.
1824        /// let re = RegexBuilder::new(r"\141")
1825        ///     .octal(true)
1826        ///     .build()
1827        ///     .unwrap();
1828        /// assert!(re.is_match(b"a"));
1829        /// ```
1830        pub fn octal(&mut self, yes: bool) -> &mut RegexBuilder {
1831            self.builder.octal(yes);
1832            self
1833        }
1834
1835        /// Sets the approximate size limit, in bytes, of the compiled regex.
1836        ///
1837        /// This roughly corresponds to the amount of heap memory, in
1838        /// bytes, occupied by a single regex. If the regex would otherwise
1839        /// approximately exceed this limit, then compiling that regex will
1840        /// fail.
1841        ///
1842        /// The main utility of a method like this is to avoid compiling
1843        /// regexes that use an unexpected amount of resources, such as
1844        /// time and memory. Even if the memory usage of a large regex is
1845        /// acceptable, its search time may not be. Namely, worst case time
1846        /// complexity for search is `O(m * n)`, where `m ~ len(pattern)` and
1847        /// `n ~ len(haystack)`. That is, search time depends, in part, on the
1848        /// size of the compiled regex. This means that putting a limit on the
1849        /// size of the regex limits how much a regex can impact search time.
1850        ///
1851        /// For more information about regex size limits, see the section on
1852        /// [untrusted inputs](crate#untrusted-input) in the top-level crate
1853        /// documentation.
1854        ///
1855        /// The default for this is some reasonable number that permits most
1856        /// patterns to compile successfully.
1857        ///
1858        /// # Example
1859        ///
1860        /// ```
1861        /// # if !cfg!(target_pointer_width = "64") { return; } // see #1041
1862        /// use regex::bytes::RegexBuilder;
1863        ///
1864        /// // It may surprise you how big some seemingly small patterns can
1865        /// // be! Since \w is Unicode aware, this generates a regex that can
1866        /// // match approximately 140,000 distinct codepoints.
1867        /// assert!(RegexBuilder::new(r"\w").size_limit(45_000).build().is_err());
1868        /// ```
1869        pub fn size_limit(&mut self, bytes: usize) -> &mut RegexBuilder {
1870            self.builder.size_limit(bytes);
1871            self
1872        }
1873
1874        /// Set the approximate capacity, in bytes, of the cache of transitions
1875        /// used by the lazy DFA.
1876        ///
1877        /// While the lazy DFA isn't always used, it tends to be the most
1878        /// commonly used regex engine in default configurations. It tends to
1879        /// adopt the performance profile of a fully built DFA, but without the
1880        /// downside of taking worst case exponential time to build.
1881        ///
1882        /// The downside is that it needs to keep a cache of transitions and
1883        /// states that are built while running a search, and this cache
1884        /// can fill up. When it fills up, the cache will reset itself. Any
1885        /// previously generated states and transitions will then need to be
1886        /// re-generated. If this happens too many times, then this library
1887        /// will bail out of using the lazy DFA and switch to a different regex
1888        /// engine.
1889        ///
1890        /// If your regex provokes this particular downside of the lazy DFA,
1891        /// then it may be beneficial to increase its cache capacity. This will
1892        /// potentially reduce the frequency of cache resetting (ideally to
1893        /// `0`). While it won't fix all potential performance problems with
1894        /// the lazy DFA, increasing the cache capacity does fix some.
1895        ///
1896        /// There is no easy way to determine, a priori, whether increasing
1897        /// this cache capacity will help. In general, the larger your regex,
1898        /// the more cache it's likely to use. But that isn't an ironclad rule.
1899        /// For example, a regex like `[01]*1[01]{N}` would normally produce a
1900        /// fully built DFA that is exponential in size with respect to `N`.
1901        /// The lazy DFA will prevent exponential space blow-up, but its cache
1902        /// is likely to fill up, even when it's large and even for smallish
1903        /// values of `N`.
1904        ///
1905        /// If you aren't sure whether this helps or not, it is sensible to
1906        /// set this to some arbitrarily large number in testing, such as
1907        /// `usize::MAX`. Namely, this represents the amount of capacity that
1908        /// *may* be used. It's probably not a good idea to use `usize::MAX` in
1909        /// production though, since it implies there are no controls on heap
1910        /// memory used by this library during a search. In effect, set it to
1911        /// whatever you're willing to allocate for a single regex search.
1912        pub fn dfa_size_limit(&mut self, bytes: usize) -> &mut RegexBuilder {
1913            self.builder.dfa_size_limit(bytes);
1914            self
1915        }
1916
1917        /// Set the nesting limit for this parser.
1918        ///
1919        /// The nesting limit controls how deep the abstract syntax tree is
1920        /// allowed to be. If the AST exceeds the given limit (e.g., with too
1921        /// many nested groups), then an error is returned by the parser.
1922        ///
1923        /// The purpose of this limit is to act as a heuristic to prevent stack
1924        /// overflow for consumers that do structural induction on an AST using
1925        /// explicit recursion. While this crate never does this (instead using
1926        /// constant stack space and moving the call stack to the heap), other
1927        /// crates may.
1928        ///
1929        /// This limit is not checked until the entire AST is parsed.
1930        /// Therefore, if callers want to put a limit on the amount of heap
1931        /// space used, then they should impose a limit on the length, in
1932        /// bytes, of the concrete pattern string. In particular, this is
1933        /// viable since this parser implementation will limit itself to heap
1934        /// space proportional to the length of the pattern string. See also
1935        /// the [untrusted inputs](crate#untrusted-input) section in the
1936        /// top-level crate documentation for more information about this.
1937        ///
1938        /// Note that a nest limit of `0` will return a nest limit error for
1939        /// most patterns but not all. For example, a nest limit of `0` permits
1940        /// `a` but not `ab`, since `ab` requires an explicit concatenation,
1941        /// which results in a nest depth of `1`. In general, a nest limit is
1942        /// not something that manifests in an obvious way in the concrete
1943        /// syntax, therefore, it should not be used in a granular way.
1944        ///
1945        /// # Example
1946        ///
1947        /// ```
1948        /// use regex::bytes::RegexBuilder;
1949        ///
1950        /// assert!(RegexBuilder::new(r"a").nest_limit(0).build().is_ok());
1951        /// assert!(RegexBuilder::new(r"ab").nest_limit(0).build().is_err());
1952        /// ```
1953        pub fn nest_limit(&mut self, limit: u32) -> &mut RegexBuilder {
1954            self.builder.nest_limit(limit);
1955            self // returned so that configuration calls can be chained
1956        }
1957    }
1958
1959    /// A configurable builder for a [`RegexSet`].
1960    ///
1961    /// This builder can be used to programmatically set flags such as `i`
1962    /// (case insensitive) and `x` (for verbose mode). This builder can also be
1963    /// used to configure things like the line terminator and a size limit on
1964    /// the compiled regular expression.
1965    #[derive(Clone, Debug)]
1966    pub struct RegexSetBuilder {
1967        builder: Builder, // internal implementation shared by the public builders
1968    }
1969
1970    impl RegexSetBuilder {
1971        /// Create a new builder with a default configuration for the given
1972        /// patterns. Any iterable of string-like (`AsRef<str>`) items works.
1973        ///
1974        /// If the patterns are invalid or exceed the configured size limits,
1975        /// then an error will be returned when [`RegexSetBuilder::build`] is
1976        /// called.
1977        pub fn new<I, S>(patterns: I) -> RegexSetBuilder
1978        where
1979            I: IntoIterator<Item = S>,
1980            S: AsRef<str>,
1981        {
1982            RegexSetBuilder { builder: Builder::new(patterns) }
1983        }
1984
1985        /// Compiles the patterns given to [`RegexSetBuilder::new`] with the
1986        /// configuration set on this builder.
1987        ///
1988        /// If the patterns aren't valid regexes or if a configured size limit
1989        /// was exceeded, then an error is returned.
1990        pub fn build(&self) -> Result<RegexSet, Error> {
1991            self.builder.build_many_bytes()
1992        }
1993
1994        /// This configures Unicode mode for all of the patterns.
1995        ///
1996        /// Enabling Unicode mode does a number of things:
1997        ///
1998        /// * Most fundamentally, it causes the fundamental atom of matching
1999        /// to be a single codepoint. When Unicode mode is disabled, it's a
2000        /// single byte. For example, when Unicode mode is enabled, `.` will
2001        /// match `💩` once, whereas it will match 4 times when Unicode mode
2002        /// is disabled. (Since the UTF-8 encoding of `💩` is 4 bytes long.)
2003        /// * Case insensitive matching uses Unicode simple case folding rules.
2004        /// * Unicode character classes like `\p{Letter}` and `\p{Greek}` are
2005        /// available.
2006        /// * Perl character classes are Unicode aware. That is, `\w`, `\s` and
2007        /// `\d`.
2008        /// * The word boundary assertions, `\b` and `\B`, use the Unicode
2009        /// definition of a word character.
2010        ///
2011        /// Note that unlike the top-level `RegexSet` for searching `&str`,
2012        /// it is permitted to disable Unicode mode even if the resulting
2013        /// pattern could match invalid UTF-8. For example, `(?-u:.)` is not
2014        /// a valid pattern for a top-level `RegexSet`, but is valid for a
2015        /// `bytes::RegexSet`.
2016        ///
2017        /// For more details on the Unicode support in this crate, see the
2018        /// [Unicode section](crate#unicode) in this crate's top-level
2019        /// documentation.
2020        ///
2021        /// The default for this is `true`.
2022        ///
2023        /// # Example
2024        ///
2025        /// ```
2026        /// use regex::bytes::RegexSetBuilder;
2027        ///
2028        /// let re = RegexSetBuilder::new([r"\w"])
2029        ///     .unicode(false)
2030        ///     .build()
2031        ///     .unwrap();
2032        /// // Normally greek letters would be included in \w, but since
2033        /// // Unicode mode is disabled, it only matches ASCII letters.
2034        /// assert!(!re.is_match("δ".as_bytes()));
2035        ///
2036        /// let re = RegexSetBuilder::new([r"s"])
2037        ///     .case_insensitive(true)
2038        ///     .unicode(false)
2039        ///     .build()
2040        ///     .unwrap();
2041        /// // Normally 'ſ' is included when searching for 's' case
2042        /// // insensitively due to Unicode's simple case folding rules. But
2043        /// // when Unicode mode is disabled, only ASCII case insensitive rules
2044        /// // are used.
2045        /// assert!(!re.is_match("ſ".as_bytes()));
2046        /// ```
2047        ///
2048        /// Since this builder is for constructing a
2049        /// [`bytes::RegexSet`](RegexSet), one can disable Unicode mode even if
2050        /// it would match invalid UTF-8:
2051        ///
2052        /// ```
2053        /// use regex::bytes::RegexSetBuilder;
2054        ///
2055        /// let re = RegexSetBuilder::new([r"."])
2056        ///     .unicode(false)
2057        ///     .build()
2058        ///     .unwrap();
2059        /// // With Unicode mode disabled, `.` matches any single byte other
2060        /// // than `\n`, including bytes like `\xFF` that are invalid UTF-8.
2061        /// assert!(re.is_match(b"\xFF"));
2062        /// ```
2063        pub fn unicode(&mut self, yes: bool) -> &mut RegexSetBuilder {
2064            self.builder.unicode(yes);
2065            self
2066        }
2067
2068        /// This configures whether to enable case insensitive matching for all
2069        /// of the patterns.
2070        ///
2071        /// This setting can also be configured using the inline flag `i`
2072        /// in the pattern. For example, `(?i:foo)` matches `foo` case
2073        /// insensitively while `(?-i:foo)` matches `foo` case sensitively.
2074        ///
2075        /// The default for this is `false`.
2076        ///
2077        /// # Example
2078        ///
2079        /// ```
2080        /// use regex::bytes::RegexSetBuilder;
2081        ///
2082        /// let re = RegexSetBuilder::new([r"foo(?-i:bar)quux"])
2083        ///     .case_insensitive(true)
2084        ///     .build()
2085        ///     .unwrap();
2086        /// assert!(re.is_match(b"FoObarQuUx"));
2087        /// // Even though case insensitive matching is enabled in the builder,
2088        /// // it can be locally disabled within the pattern. In this case,
2089        /// // `bar` is matched case sensitively.
2090        /// assert!(!re.is_match(b"fooBARquux"));
2091        /// ```
2092        pub fn case_insensitive(&mut self, yes: bool) -> &mut RegexSetBuilder {
2093            self.builder.case_insensitive(yes);
2094            self // returned so that configuration calls can be chained
2095        }
2096
2097        /// This configures multi-line mode for all of the patterns.
2098        ///
2099        /// Enabling multi-line mode changes the behavior of the `^` and `$`
2100        /// anchor assertions. Instead of only matching at the beginning and
2101        /// end of a haystack, respectively, multi-line mode causes them to
2102        /// match at the beginning and end of a line *in addition* to the
2103        /// beginning and end of a haystack. More precisely, `^` will match at
2104        /// the position immediately following a `\n` and `$` will match at the
2105        /// position immediately preceding a `\n`.
2106        ///
2107        /// The behavior of this option can be impacted by other settings too:
2108        ///
2109        /// * The [`RegexSetBuilder::line_terminator`] option changes `\n`
2110        /// above to any ASCII byte.
2111        /// * The [`RegexSetBuilder::crlf`] option changes the line terminator
2112        /// to be either `\r` or `\n`, but never at the position between a `\r`
2113        /// and `\n`.
2114        ///
2115        /// This setting can also be configured using the inline flag `m` in
2116        /// the pattern.
2117        ///
2118        /// The default for this is `false`.
2119        ///
2120        /// # Example
2121        ///
2122        /// ```
2123        /// use regex::bytes::RegexSetBuilder;
2124        ///
2125        /// let re = RegexSetBuilder::new([r"^foo$"])
2126        ///     .multi_line(true)
2127        ///     .build()
2128        ///     .unwrap();
2129        /// assert!(re.is_match(b"\nfoo\n"));
2130        /// ```
2131        pub fn multi_line(&mut self, yes: bool) -> &mut RegexSetBuilder {
2132            self.builder.multi_line(yes);
2133            self // returned so that configuration calls can be chained
2134        }
2135
2136        /// This configures dot-matches-new-line mode for all of the patterns.
2137        ///
2138        /// Perhaps surprisingly, the default behavior for `.` is not to match
2139        /// any character, but rather, to match any character except for the
2140        /// line terminator (which is `\n` by default). When this mode is
2141        /// enabled, the behavior changes such that `.` truly matches any
2142        /// character.
2143        ///
2144        /// This setting can also be configured using the inline flag `s` in
2145        /// the pattern. For example, `(?s:.)` and `\p{any}` are equivalent
2146        /// regexes.
2147        ///
2148        /// The default for this is `false`.
2149        ///
2150        /// # Example
2151        ///
2152        /// ```
2153        /// use regex::bytes::RegexSetBuilder;
2154        ///
2155        /// let re = RegexSetBuilder::new([r"foo.bar"])
2156        ///     .dot_matches_new_line(true)
2157        ///     .build()
2158        ///     .unwrap();
2159        /// let hay = b"foo\nbar";
2160        /// assert!(re.is_match(hay));
2161        /// ```
2162        pub fn dot_matches_new_line(
2163            &mut self,
2164            yes: bool,
2165        ) -> &mut RegexSetBuilder {
2166            self.builder.dot_matches_new_line(yes);
2167            self
2168        }
2169
2170        /// This configures CRLF mode for all of the patterns.
2171        ///
2172        /// When CRLF mode is enabled, both `\r` ("carriage return" or CR for
2173        /// short) and `\n` ("line feed" or LF for short) are treated as line
2174        /// terminators. This results in the following:
2175        ///
2176        /// * Unless dot-matches-new-line mode is enabled, `.` will now match
2177        ///   any character except for `\n` and `\r`.
2178        /// * When multi-line mode is enabled, `^` will match immediately
2179        ///   following a `\n` or a `\r`. Similarly, `$` will match immediately
2180        ///   preceding a `\n` or a `\r`. Neither `^` nor `$` will ever match
2181        ///   between `\r` and `\n`.
2182        ///
2183        /// This setting can also be configured using the inline flag `R` in
2184        /// the pattern.
2185        ///
2186        /// The default for this is `false`.
2187        ///
2188        /// # Example
2189        ///
2190        /// ```
2191        /// use regex::bytes::RegexSetBuilder;
2192        ///
2193        /// let re = RegexSetBuilder::new([r"^foo$"])
2194        ///     .multi_line(true)
2195        ///     .crlf(true)
2196        ///     .build()
2197        ///     .unwrap();
2198        /// let hay = b"\r\nfoo\r\n";
2199        /// // If CRLF mode weren't enabled here, then '$' wouldn't match
2200        /// // immediately after 'foo', and thus no match would be found.
2201        /// assert!(re.is_match(hay));
2202        /// ```
2203        ///
2204        /// This example demonstrates that `^` will never match at a position
2205        /// between `\r` and `\n`. (`$` will similarly not match between a `\r`
2206        /// and a `\n`.)
2207        ///
2208        /// ```
2209        /// use regex::bytes::RegexSetBuilder;
2210        ///
2211        /// let re = RegexSetBuilder::new([r"^\n"])
2212        ///     .multi_line(true)
2213        ///     .crlf(true)
2214        ///     .build()
2215        ///     .unwrap();
2216        /// assert!(!re.is_match(b"\r\n"));
2217        /// ```
2218        pub fn crlf(&mut self, yes: bool) -> &mut RegexSetBuilder {
2219            self.builder.crlf(yes);
2220            self
2221        }
2222
2223        /// Configures the line terminator to be used by all of the patterns.
2224        ///
2225        /// The line terminator is relevant in two ways for a particular regex:
2226        ///
2227        /// * When dot-matches-new-line mode is *not* enabled (the default),
2228        ///   then `.` will match any character except for the configured line
2229        ///   terminator.
2230        /// * When multi-line mode is enabled (not the default), then `^` and
2231        ///   `$` will match immediately after and before, respectively, a line
2232        ///   terminator.
2233        ///
2234        /// In both cases, if CRLF mode is enabled in a particular context,
2235        /// then it takes precedence over any configured line terminator.
2236        ///
2237        /// This option cannot be configured from within the pattern.
2238        ///
2239        /// The default line terminator is `\n`.
2240        ///
2241        /// # Example
2242        ///
2243        /// This shows how to treat the NUL byte as a line terminator. This can
2244        /// be a useful heuristic when searching binary data.
2245        ///
2246        /// ```
2247        /// use regex::bytes::RegexSetBuilder;
2248        ///
2249        /// let re = RegexSetBuilder::new([r"^foo$"])
2250        ///     .multi_line(true)
2251        ///     .line_terminator(b'\x00')
2252        ///     .build()
2253        ///     .unwrap();
2254        /// let hay = b"\x00foo\x00";
2255        /// assert!(re.is_match(hay));
2256        /// ```
2257        ///
2258        /// This example shows that the behavior of `.` is impacted by this
2259        /// setting as well:
2260        ///
2261        /// ```
2262        /// use regex::bytes::RegexSetBuilder;
2263        ///
2264        /// let re = RegexSetBuilder::new([r"."])
2265        ///     .line_terminator(b'\x00')
2266        ///     .build()
2267        ///     .unwrap();
2268        /// assert!(re.is_match(b"\n"));
2269        /// assert!(!re.is_match(b"\x00"));
2270        /// ```
2271        ///
2272        /// This shows that building a regex will work even when the byte given
2273        /// is not ASCII. This is unlike the top-level `RegexSet` API where
2274        /// matching invalid UTF-8 is not allowed.
2275        ///
2276        /// Note though that you must disable Unicode mode. This is required
2277        /// because Unicode mode requires matching one codepoint at a time,
2278        /// and there is no way to match a non-ASCII byte as if it were a
2279        /// codepoint.
2280        ///
2281        /// ```
2282        /// use regex::bytes::RegexSetBuilder;
2283        ///
2284        /// assert!(
2285        ///     RegexSetBuilder::new([r"."])
2286        ///         .unicode(false)
2287        ///         .line_terminator(0x80)
2288        ///         .build()
2289        ///         .is_ok(),
2290        /// );
2291        /// ```
2292        pub fn line_terminator(&mut self, byte: u8) -> &mut RegexSetBuilder {
2293            self.builder.line_terminator(byte);
2294            self
2295        }
2296
2297        /// This configures swap-greed mode for all of the patterns.
2298        ///
2299        /// When swap-greed mode is enabled, patterns like `a+` will become
2300        /// non-greedy and patterns like `a+?` will become greedy. In other
2301        /// words, the meanings of `a+` and `a+?` are switched.
2302        ///
2303        /// This setting can also be configured using the inline flag `U` in
2304        /// the pattern.
2305        ///
2306        /// Note that this is generally not useful for a `RegexSet`, since a
2307        /// `RegexSet` can only report whether a pattern matches or not.
2308        /// Greediness never impacts whether a match is found (only the
2309        /// offsets of the match), so whether parts of a pattern are greedy
2310        /// or not doesn't matter for a `RegexSet`.
2311        ///
2312        /// The default for this is `false`.
2313        pub fn swap_greed(&mut self, yes: bool) -> &mut RegexSetBuilder {
2314            self.builder.swap_greed(yes);
2315            self
2316        }
2317
2318        /// This configures verbose mode for all of the patterns.
2319        ///
2320        /// When enabled, whitespace will be treated as insignificant in the
2321        /// pattern and `#` can be used to start a comment until the next new
2322        /// line.
2323        ///
2324        /// Normally, in most places in a pattern, whitespace is treated
2325        /// literally. For example ` +` will match one or more ASCII whitespace
2326        /// characters.
2327        ///
2328        /// When verbose mode is enabled, `\#` can be used to match a literal
2329        /// `#` and `\ ` can be used to match a literal ASCII whitespace
2330        /// character.
2331        ///
2332        /// Verbose mode is useful for permitting regexes to be formatted and
2333        /// broken up more nicely. This may make them more easily readable.
2334        ///
2335        /// This setting can also be configured using the inline flag `x` in
2336        /// the pattern.
2337        ///
2338        /// The default for this is `false`.
2339        ///
2340        /// # Example
2341        ///
2342        /// ```
2343        /// use regex::bytes::RegexSetBuilder;
2344        ///
2345        /// let pat = r"
2346        ///     \b
2347        ///     (?<first>\p{Uppercase}\w*)  # always start with uppercase letter
2348        ///     [\s--\n]+                   # whitespace should separate names
2349        ///     (?: # middle name can be an initial!
2350        ///         (?:(?<initial>\p{Uppercase})\.|(?<middle>\p{Uppercase}\w*))
2351        ///         [\s--\n]+
2352        ///     )?
2353        ///     (?<last>\p{Uppercase}\w*)
2354        ///     \b
2355        /// ";
2356        /// let re = RegexSetBuilder::new([pat])
2357        ///     .ignore_whitespace(true)
2358        ///     .build()
2359        ///     .unwrap();
2360        /// assert!(re.is_match(b"Harry Potter"));
2361        /// assert!(re.is_match(b"Harry J. Potter"));
2362        /// assert!(re.is_match(b"Harry James Potter"));
2363        /// assert!(!re.is_match(b"harry J. Potter"));
2364        /// ```
2365        pub fn ignore_whitespace(
2366            &mut self,
2367            yes: bool,
2368        ) -> &mut RegexSetBuilder {
2369            self.builder.ignore_whitespace(yes);
2370            self
2371        }
2372
2373        /// This configures octal mode for all of the patterns.
2374        ///
2375        /// Octal syntax is a little-known way of uttering Unicode codepoints
2376        /// in a pattern. For example, `a`, `\x61`, `\u0061` and `\141` are all
2377        /// equivalent patterns, where the last example shows octal syntax.
2378        ///
2379        /// While supporting octal syntax isn't in and of itself a problem,
2380        /// it does make good error messages harder. That is, in PCRE-based
2381        /// regex engines, syntax like `\1` invokes a backreference, which is
2382        /// explicitly unsupported by this library. However, many users expect
2383        /// backreferences to be supported. Therefore, when octal support
2384        /// is disabled, the error message will explicitly mention that
2385        /// backreferences aren't supported.
2386        ///
2387        /// The default for this is `false`.
2388        ///
2389        /// # Example
2390        ///
2391        /// ```
2392        /// use regex::bytes::RegexSetBuilder;
2393        ///
2394        /// // Normally this pattern would not compile, with an error message
2395        /// // about backreferences not being supported. But with octal mode
2396        /// // enabled, octal escape sequences work.
2397        /// let re = RegexSetBuilder::new([r"\141"])
2398        ///     .octal(true)
2399        ///     .build()
2400        ///     .unwrap();
2401        /// assert!(re.is_match(b"a"));
2402        /// ```
2403        pub fn octal(&mut self, yes: bool) -> &mut RegexSetBuilder {
2404            self.builder.octal(yes);
2405            self
2406        }
2407
2408        /// Sets the approximate size limit, in bytes, of the compiled regex.
2409        ///
2410        /// This roughly corresponds to the amount of heap memory, in
2411        /// bytes, occupied by a single regex. If the regex would otherwise
2412        /// approximately exceed this limit, then compiling that regex will
2413        /// fail.
2414        ///
2415        /// The main utility of a method like this is to avoid compiling
2416        /// regexes that use an unexpected amount of resources, such as
2417        /// time and memory. Even if the memory usage of a large regex is
2418        /// acceptable, its search time may not be. Namely, worst case time
2419        /// complexity for search is `O(m * n)`, where `m ~ len(pattern)` and
2420        /// `n ~ len(haystack)`. That is, search time depends, in part, on the
2421        /// size of the compiled regex. This means that putting a limit on the
2422        /// size of the regex limits how much a regex can impact search time.
2423        ///
2424        /// For more information about regex size limits, see the section on
2425        /// [untrusted inputs](crate#untrusted-input) in the top-level crate
2426        /// documentation.
2427        ///
2428        /// The default for this is some reasonable number that permits most
2429        /// patterns to compile successfully.
2430        ///
2431        /// # Example
2432        ///
2433        /// ```
2434        /// # if !cfg!(target_pointer_width = "64") { return; } // see #1041
2435        /// use regex::bytes::RegexSetBuilder;
2436        ///
2437        /// // It may surprise you how big some seemingly small patterns can
2438        /// // be! Since \w is Unicode aware, this generates a regex that can
2439        /// // match approximately 140,000 distinct codepoints.
2440        /// assert!(
2441        ///     RegexSetBuilder::new([r"\w"])
2442        ///         .size_limit(45_000)
2443        ///         .build()
2444        ///         .is_err()
2445        /// );
2446        /// ```
2447        pub fn size_limit(&mut self, bytes: usize) -> &mut RegexSetBuilder {
2448            self.builder.size_limit(bytes);
2449            self
2450        }
2451
2452        /// Set the approximate capacity, in bytes, of the cache of transitions
2453        /// used by the lazy DFA.
2454        ///
2455        /// While the lazy DFA isn't always used, it tends to be the most
2456        /// commonly used regex engine in default configurations. It tends to
2457        /// adopt the performance profile of a fully built DFA, but without the
2458        /// downside of taking worst case exponential time to build.
2459        ///
2460        /// The downside is that it needs to keep a cache of transitions and
2461        /// states that are built while running a search, and this cache
2462        /// can fill up. When it fills up, the cache will reset itself. Any
2463        /// previously generated states and transitions will then need to be
2464        /// re-generated. If this happens too many times, then this library
2465        /// will bail out of using the lazy DFA and switch to a different regex
2466        /// engine.
2467        ///
2468        /// If your regex provokes this particular downside of the lazy DFA,
2469        /// then it may be beneficial to increase its cache capacity. This will
2470        /// potentially reduce the frequency of cache resetting (ideally to
2471        /// `0`). While it won't fix all potential performance problems with
2472        /// the lazy DFA, increasing the cache capacity does fix some.
2473        ///
2474        /// There is no easy way to determine, a priori, whether increasing
2475        /// this cache capacity will help. In general, the larger your regex,
2476        /// the more cache it's likely to use. But that isn't an ironclad rule.
2477        /// For example, a regex like `[01]*1[01]{N}` would normally produce a
2478        /// fully built DFA that is exponential in size with respect to `N`.
2479        /// The lazy DFA will prevent exponential space blow-up, but its cache
2480        /// is likely to fill up, even when it's large and even for smallish
2481        /// values of `N`.
2482        ///
2483        /// If you aren't sure whether this helps or not, it is sensible to
2484        /// set this to some arbitrarily large number in testing, such as
2485        /// `usize::MAX`. Namely, this represents the amount of capacity that
2486        /// *may* be used. It's probably not a good idea to use `usize::MAX` in
2487        /// production though, since it implies there are no controls on heap
2488        /// memory used by this library during a search. In effect, set it to
2489        /// whatever you're willing to allocate for a single regex search.
2490        pub fn dfa_size_limit(
2491            &mut self,
2492            bytes: usize,
2493        ) -> &mut RegexSetBuilder {
2494            self.builder.dfa_size_limit(bytes);
2495            self
2496        }
2497
2498        /// Set the nesting limit for the pattern parser.
2499        ///
2500        /// The nesting limit controls how deep the abstract syntax tree is
2501        /// allowed to be. If the AST exceeds the given limit (e.g., with too
2502        /// many nested groups), then an error is returned by the parser.
2503        ///
2504        /// The purpose of this limit is to act as a heuristic to prevent stack
2505        /// overflow for consumers that do structural induction on an AST using
2506        /// explicit recursion. While this crate never does this (instead using
2507        /// constant stack space and moving the call stack to the heap), other
2508        /// crates may.
2509        ///
2510        /// This limit is not checked until the entire AST is parsed.
2511        /// Therefore, if callers want to put a limit on the amount of heap
2512        /// space used, then they should impose a limit on the length, in
2513        /// bytes, of the concrete pattern string. In particular, this is
2514        /// viable since this parser implementation will limit itself to heap
2515        /// space proportional to the length of the pattern string. See also
2516        /// the [untrusted inputs](crate#untrusted-input) section in the
2517        /// top-level crate documentation for more information about this.
2518        ///
2519        /// Note that a nest limit of `0` will return a nest limit error for
2520        /// most patterns but not all. For example, a nest limit of `0` permits
2521        /// `a` but not `ab`, since `ab` requires an explicit concatenation,
2522        /// which results in a nest depth of `1`. In general, a nest limit is
2523        /// not something that manifests in an obvious way in the concrete
2524        /// syntax. Therefore, it should not be used in a granular way.
2525        ///
2526        /// # Example
2527        ///
2528        /// ```
2529        /// use regex::bytes::RegexSetBuilder;
2530        ///
2531        /// assert!(RegexSetBuilder::new([r"a"]).nest_limit(0).build().is_ok());
2532        /// assert!(RegexSetBuilder::new([r"ab"]).nest_limit(0).build().is_err());
2533        /// ```
2534        pub fn nest_limit(&mut self, limit: u32) -> &mut RegexSetBuilder {
2535            self.builder.nest_limit(limit);
2536            self
2537        }
2538    }
2539}
regex/builders.rs

regex/
builders.rs