regex_syntax/hir/
mod.rs

1/*!
2Defines a high-level intermediate (HIR) representation for regular expressions.
3
4The HIR is represented by the [`Hir`] type, and it principally constructed via
5[translation](translate) from an [`Ast`](crate::ast::Ast). Alternatively, users
6may use the smart constructors defined on `Hir` to build their own by hand. The
7smart constructors simultaneously simplify and "optimize" the HIR, and are also
8the same routines used by translation.
9
10Most regex engines only have an HIR like this, and usually construct it
11directly from the concrete syntax. This crate however first parses the
12concrete syntax into an `Ast`, and only then creates the HIR from the `Ast`,
13as mentioned above. It's done this way to facilitate better error reporting,
14and to have a structured representation of a regex that faithfully represents
15its concrete syntax. Namely, while an `Hir` value can be converted back to an
16equivalent regex pattern string, it is unlikely to look like the original due
17to its simplified structure.
18*/
19
20use core::{char, cmp};
21
22use alloc::{
23    boxed::Box,
24    format,
25    string::{String, ToString},
26    vec,
27    vec::Vec,
28};
29
30use crate::{
31    ast::Span,
32    hir::interval::{Interval, IntervalSet, IntervalSetIter},
33    unicode,
34};
35
36pub use crate::{
37    hir::visitor::{visit, Visitor},
38    unicode::CaseFoldError,
39};
40
41mod interval;
42pub mod literal;
43pub mod print;
44pub mod translate;
45mod visitor;
46
47/// An error that can occur while translating an `Ast` to a `Hir`.
48#[derive(Clone, Debug, Eq, PartialEq)]
49pub struct Error {
50    /// The kind of error.
51    kind: ErrorKind,
52    /// The original pattern that the translator's Ast was parsed from. Every
53    /// span in an error is a valid range into this string.
54    pattern: String,
55    /// The span of this error, derived from the Ast given to the translator.
56    span: Span,
57}
58
59impl Error {
60    /// Return the type of this error.
61    pub fn kind(&self) -> &ErrorKind {
62        &self.kind
63    }
64
65    /// The original pattern string in which this error occurred.
66    ///
67    /// Every span reported by this error is reported in terms of this string.
68    pub fn pattern(&self) -> &str {
69        &self.pattern
70    }
71
72    /// Return the span at which this error occurred.
73    pub fn span(&self) -> &Span {
74        &self.span
75    }
76}
77
78/// The type of an error that occurred while building an `Hir`.
79///
80/// This error type is marked as `non_exhaustive`. This means that adding a
81/// new variant is not considered a breaking change.
82#[non_exhaustive]
83#[derive(Clone, Debug, Eq, PartialEq)]
84pub enum ErrorKind {
85    /// This error occurs when a Unicode feature is used when Unicode
86    /// support is disabled. For example `(?-u:\pL)` would trigger this error.
87    UnicodeNotAllowed,
88    /// This error occurs when translating a pattern that could match a byte
89    /// sequence that isn't UTF-8 and `utf8` was enabled.
90    InvalidUtf8,
91    /// This error occurs when one uses a non-ASCII byte for a line terminator,
92    /// but where Unicode mode is enabled and UTF-8 mode is disabled.
93    InvalidLineTerminator,
94    /// This occurs when an unrecognized Unicode property name could not
95    /// be found.
96    UnicodePropertyNotFound,
97    /// This occurs when an unrecognized Unicode property value could not
98    /// be found.
99    UnicodePropertyValueNotFound,
100    /// This occurs when a Unicode-aware Perl character class (`\w`, `\s` or
101    /// `\d`) could not be found. This can occur when the `unicode-perl`
102    /// crate feature is not enabled.
103    UnicodePerlClassNotFound,
104    /// This occurs when the Unicode simple case mapping tables are not
105    /// available, and the regular expression required Unicode aware case
106    /// insensitivity.
107    UnicodeCaseUnavailable,
108}
109
110#[cfg(feature = "std")]
111impl std::error::Error for Error {}
112
113impl core::fmt::Display for Error {
114    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
115        crate::error::Formatter::from(self).fmt(f)
116    }
117}
118
119impl core::fmt::Display for ErrorKind {
120    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
121        use self::ErrorKind::*;
122
123        let msg = match *self {
124            UnicodeNotAllowed => "Unicode not allowed here",
125            InvalidUtf8 => "pattern can match invalid UTF-8",
126            InvalidLineTerminator => "invalid line terminator, must be ASCII",
127            UnicodePropertyNotFound => "Unicode property not found",
128            UnicodePropertyValueNotFound => "Unicode property value not found",
129            UnicodePerlClassNotFound => {
130                "Unicode-aware Perl class not found \
131                 (make sure the unicode-perl feature is enabled)"
132            }
133            UnicodeCaseUnavailable => {
134                "Unicode-aware case insensitivity matching is not available \
135                 (make sure the unicode-case feature is enabled)"
136            }
137        };
138        f.write_str(msg)
139    }
140}
141
142/// A high-level intermediate representation (HIR) for a regular expression.
143///
144/// An HIR value is a combination of a [`HirKind`] and a set of [`Properties`].
145/// An `HirKind` indicates what kind of regular expression it is (a literal,
146/// a repetition, a look-around assertion, etc.), where as a `Properties`
147/// describes various facts about the regular expression. For example, whether
148/// it matches UTF-8 or if it matches the empty string.
149///
150/// The HIR of a regular expression represents an intermediate step between
151/// its abstract syntax (a structured description of the concrete syntax) and
152/// an actual regex matcher. The purpose of HIR is to make regular expressions
153/// easier to analyze. In particular, the AST is much more complex than the
154/// HIR. For example, while an AST supports arbitrarily nested character
155/// classes, the HIR will flatten all nested classes into a single set. The HIR
156/// will also "compile away" every flag present in the concrete syntax. For
157/// example, users of HIR expressions never need to worry about case folding;
158/// it is handled automatically by the translator (e.g., by translating
159/// `(?i:A)` to `[aA]`).
160///
161/// The specific type of an HIR expression can be accessed via its `kind`
162/// or `into_kind` methods. This extra level of indirection exists for two
163/// reasons:
164///
165/// 1. Construction of an HIR expression *must* use the constructor methods on
166/// this `Hir` type instead of building the `HirKind` values directly. This
167/// permits construction to enforce invariants like "concatenations always
168/// consist of two or more sub-expressions."
169/// 2. Every HIR expression contains attributes that are defined inductively,
170/// and can be computed cheaply during the construction process. For example,
171/// one such attribute is whether the expression must match at the beginning of
172/// the haystack.
173///
174/// In particular, if you have an `HirKind` value, then there is intentionally
175/// no way to build an `Hir` value from it. You instead need to do case
176/// analysis on the `HirKind` value and build the `Hir` value using its smart
177/// constructors.
178///
179/// # UTF-8
180///
181/// If the HIR was produced by a translator with
182/// [`TranslatorBuilder::utf8`](translate::TranslatorBuilder::utf8) enabled,
183/// then the HIR is guaranteed to match UTF-8 exclusively for all non-empty
184/// matches.
185///
186/// For empty matches, those can occur at any position. It is the
187/// responsibility of the regex engine to determine whether empty matches are
188/// permitted between the code units of a single codepoint.
189///
190/// # Stack space
191///
192/// This type defines its own destructor that uses constant stack space and
193/// heap space proportional to the size of the HIR.
194///
195/// Also, an `Hir`'s `fmt::Display` implementation prints an HIR as a regular
196/// expression pattern string, and uses constant stack space and heap space
197/// proportional to the size of the `Hir`. The regex it prints is guaranteed to
198/// be _semantically_ equivalent to the original concrete syntax, but it may
199/// look very different. (And potentially not practically readable by a human.)
200///
201/// An `Hir`'s `fmt::Debug` implementation currently does not use constant
202/// stack space. The implementation will also suppress some details (such as
203/// the `Properties` inlined into every `Hir` value to make it less noisy).
204#[derive(Clone, Eq, PartialEq)]
205pub struct Hir {
206    /// The underlying HIR kind.
207    kind: HirKind,
208    /// Analysis info about this HIR, computed during construction.
209    props: Properties,
210}
211
212/// Methods for accessing the underlying `HirKind` and `Properties`.
213impl Hir {
214    /// Returns a reference to the underlying HIR kind.
215    pub fn kind(&self) -> &HirKind {
216        &self.kind
217    }
218
219    /// Consumes ownership of this HIR expression and returns its underlying
220    /// `HirKind`.
221    pub fn into_kind(mut self) -> HirKind {
222        core::mem::replace(&mut self.kind, HirKind::Empty)
223    }
224
225    /// Returns the properties computed for this `Hir`.
226    pub fn properties(&self) -> &Properties {
227        &self.props
228    }
229
230    /// Splits this HIR into its constituent parts.
231    ///
232    /// This is useful because `let Hir { kind, props } = hir;` does not work
233    /// because of `Hir`'s custom `Drop` implementation.
234    fn into_parts(mut self) -> (HirKind, Properties) {
235        (
236            core::mem::replace(&mut self.kind, HirKind::Empty),
237            core::mem::replace(&mut self.props, Properties::empty()),
238        )
239    }
240}
241
242/// Smart constructors for HIR values.
243///
244/// These constructors are called "smart" because they do inductive work or
245/// simplifications. For example, calling `Hir::repetition` with a repetition
246/// like `a{0}` will actually return a `Hir` with a `HirKind::Empty` kind
247/// since it is equivalent to an empty regex. Another example is calling
248/// `Hir::concat(vec![expr])`. Instead of getting a `HirKind::Concat`, you'll
249/// just get back the original `expr` since it's precisely equivalent.
250///
251/// Smart constructors enable maintaining invariants about the HIR data type
252/// while also simulanteously keeping the representation as simple as possible.
253impl Hir {
254    /// Returns an empty HIR expression.
255    ///
256    /// An empty HIR expression always matches, including the empty string.
257    #[inline]
258    pub fn empty() -> Hir {
259        let props = Properties::empty();
260        Hir { kind: HirKind::Empty, props }
261    }
262
263    /// Returns an HIR expression that can never match anything. That is,
264    /// the size of the set of strings in the language described by the HIR
265    /// returned is `0`.
266    ///
267    /// This is distinct from [`Hir::empty`] in that the empty string matches
268    /// the HIR returned by `Hir::empty`. That is, the set of strings in the
269    /// language describe described by `Hir::empty` is non-empty.
270    ///
271    /// Note that currently, the HIR returned uses an empty character class to
272    /// indicate that nothing can match. An equivalent expression that cannot
273    /// match is an empty alternation, but all such "fail" expressions are
274    /// normalized (via smart constructors) to empty character classes. This is
275    /// because empty character classes can be spelled in the concrete syntax
276    /// of a regex (e.g., `\P{any}` or `(?-u:[^\x00-\xFF])` or `[a&&b]`), but
277    /// empty alternations cannot.
278    #[inline]
279    pub fn fail() -> Hir {
280        let class = Class::Bytes(ClassBytes::empty());
281        let props = Properties::class(&class);
282        // We can't just call Hir::class here because it defers to Hir::fail
283        // in order to canonicalize the Hir value used to represent "cannot
284        // match."
285        Hir { kind: HirKind::Class(class), props }
286    }
287
288    /// Creates a literal HIR expression.
289    ///
290    /// This accepts anything that can be converted into a `Box<[u8]>`.
291    ///
292    /// Note that there is no mechanism for storing a `char` or a `Box<str>`
293    /// in an HIR. Everything is "just bytes." Whether a `Literal` (or
294    /// any HIR node) matches valid UTF-8 exclusively can be queried via
295    /// [`Properties::is_utf8`].
296    ///
297    /// # Example
298    ///
299    /// This example shows that concatenations of `Literal` HIR values will
300    /// automatically get flattened and combined together. So for example, even
301    /// if you concat multiple `Literal` values that are themselves not valid
302    /// UTF-8, they might add up to valid UTF-8. This also demonstrates just
303    /// how "smart" Hir's smart constructors are.
304    ///
305    /// ```
306    /// use regex_syntax::hir::{Hir, HirKind, Literal};
307    ///
308    /// let literals = vec![
309    ///     Hir::literal([0xE2]),
310    ///     Hir::literal([0x98]),
311    ///     Hir::literal([0x83]),
312    /// ];
313    /// // Each literal, on its own, is invalid UTF-8.
314    /// assert!(literals.iter().all(|hir| !hir.properties().is_utf8()));
315    ///
316    /// let concat = Hir::concat(literals);
317    /// // But the concatenation is valid UTF-8!
318    /// assert!(concat.properties().is_utf8());
319    ///
320    /// // And also notice that the literals have been concatenated into a
321    /// // single `Literal`, to the point where there is no explicit `Concat`!
322    /// let expected = HirKind::Literal(Literal(Box::from("☃".as_bytes())));
323    /// assert_eq!(&expected, concat.kind());
324    /// ```
325    ///
326    /// # Example: building a literal from a `char`
327    ///
328    /// This example shows how to build a single `Hir` literal from a `char`
329    /// value. Since a [`Literal`] is just bytes, we just need to UTF-8
330    /// encode a `char` value:
331    ///
332    /// ```
333    /// use regex_syntax::hir::{Hir, HirKind, Literal};
334    ///
335    /// let ch = '☃';
336    /// let got = Hir::literal(ch.encode_utf8(&mut [0; 4]).as_bytes());
337    ///
338    /// let expected = HirKind::Literal(Literal(Box::from("☃".as_bytes())));
339    /// assert_eq!(&expected, got.kind());
340    /// ```
341    #[inline]
342    pub fn literal<B: Into<Box<[u8]>>>(lit: B) -> Hir {
343        let bytes = lit.into();
344        if bytes.is_empty() {
345            return Hir::empty();
346        }
347
348        let lit = Literal(bytes);
349        let props = Properties::literal(&lit);
350        Hir { kind: HirKind::Literal(lit), props }
351    }
352
353    /// Creates a class HIR expression. The class may either be defined over
354    /// ranges of Unicode codepoints or ranges of raw byte values.
355    ///
356    /// Note that an empty class is permitted. An empty class is equivalent to
357    /// `Hir::fail()`.
358    #[inline]
359    pub fn class(class: Class) -> Hir {
360        if class.is_empty() {
361            return Hir::fail();
362        } else if let Some(bytes) = class.literal() {
363            return Hir::literal(bytes);
364        }
365        let props = Properties::class(&class);
366        Hir { kind: HirKind::Class(class), props }
367    }
368
369    /// Creates a look-around assertion HIR expression.
370    #[inline]
371    pub fn look(look: Look) -> Hir {
372        let props = Properties::look(look);
373        Hir { kind: HirKind::Look(look), props }
374    }
375
376    /// Creates a repetition HIR expression.
377    #[inline]
378    pub fn repetition(mut rep: Repetition) -> Hir {
379        // If the sub-expression of a repetition can only match the empty
380        // string, then we force its maximum to be at most 1.
381        if rep.sub.properties().maximum_len() == Some(0) {
382            rep.min = cmp::min(rep.min, 1);
383            rep.max = rep.max.map(|n| cmp::min(n, 1)).or(Some(1));
384        }
385        // The regex 'a{0}' is always equivalent to the empty regex. This is
386        // true even when 'a' is an expression that never matches anything
387        // (like '\P{any}').
388        //
389        // Additionally, the regex 'a{1}' is always equivalent to 'a'.
390        if rep.min == 0 && rep.max == Some(0) {
391            return Hir::empty();
392        } else if rep.min == 1 && rep.max == Some(1) {
393            return *rep.sub;
394        }
395        let props = Properties::repetition(&rep);
396        Hir { kind: HirKind::Repetition(rep), props }
397    }
398
399    /// Creates a capture HIR expression.
400    ///
401    /// Note that there is no explicit HIR value for a non-capturing group.
402    /// Since a non-capturing group only exists to override precedence in the
403    /// concrete syntax and since an HIR already does its own grouping based on
404    /// what is parsed, there is no need to explicitly represent non-capturing
405    /// groups in the HIR.
406    #[inline]
407    pub fn capture(capture: Capture) -> Hir {
408        let props = Properties::capture(&capture);
409        Hir { kind: HirKind::Capture(capture), props }
410    }
411
412    /// Returns the concatenation of the given expressions.
413    ///
414    /// This attempts to flatten and simplify the concatenation as appropriate.
415    ///
416    /// # Example
417    ///
418    /// This shows a simple example of basic flattening of both concatenations
419    /// and literals.
420    ///
421    /// ```
422    /// use regex_syntax::hir::Hir;
423    ///
424    /// let hir = Hir::concat(vec![
425    ///     Hir::concat(vec![
426    ///         Hir::literal([b'a']),
427    ///         Hir::literal([b'b']),
428    ///         Hir::literal([b'c']),
429    ///     ]),
430    ///     Hir::concat(vec![
431    ///         Hir::literal([b'x']),
432    ///         Hir::literal([b'y']),
433    ///         Hir::literal([b'z']),
434    ///     ]),
435    /// ]);
436    /// let expected = Hir::literal("abcxyz".as_bytes());
437    /// assert_eq!(expected, hir);
438    /// ```
439    pub fn concat(subs: Vec<Hir>) -> Hir {
440        // We rebuild the concatenation by simplifying it. Would be nice to do
441        // it in place, but that seems a little tricky?
442        let mut new = vec![];
443        // This gobbles up any adjacent literals in a concatenation and smushes
444        // them together. Basically, when we see a literal, we add its bytes
445        // to 'prior_lit', and whenever we see anything else, we first take
446        // any bytes in 'prior_lit' and add it to the 'new' concatenation.
447        let mut prior_lit: Option<Vec<u8>> = None;
448        for sub in subs {
449            let (kind, props) = sub.into_parts();
450            match kind {
451                HirKind::Literal(Literal(bytes)) => {
452                    if let Some(ref mut prior_bytes) = prior_lit {
453                        prior_bytes.extend_from_slice(&bytes);
454                    } else {
455                        prior_lit = Some(bytes.to_vec());
456                    }
457                }
458                // We also flatten concats that are direct children of another
459                // concat. We only need to do this one level deep since
460                // Hir::concat is the only way to build concatenations, and so
461                // flattening happens inductively.
462                HirKind::Concat(subs2) => {
463                    for sub2 in subs2 {
464                        let (kind2, props2) = sub2.into_parts();
465                        match kind2 {
466                            HirKind::Literal(Literal(bytes)) => {
467                                if let Some(ref mut prior_bytes) = prior_lit {
468                                    prior_bytes.extend_from_slice(&bytes);
469                                } else {
470                                    prior_lit = Some(bytes.to_vec());
471                                }
472                            }
473                            kind2 => {
474                                if let Some(prior_bytes) = prior_lit.take() {
475                                    new.push(Hir::literal(prior_bytes));
476                                }
477                                new.push(Hir { kind: kind2, props: props2 });
478                            }
479                        }
480                    }
481                }
482                // We can just skip empty HIRs.
483                HirKind::Empty => {}
484                kind => {
485                    if let Some(prior_bytes) = prior_lit.take() {
486                        new.push(Hir::literal(prior_bytes));
487                    }
488                    new.push(Hir { kind, props });
489                }
490            }
491        }
492        if let Some(prior_bytes) = prior_lit.take() {
493            new.push(Hir::literal(prior_bytes));
494        }
495        if new.is_empty() {
496            return Hir::empty();
497        } else if new.len() == 1 {
498            return new.pop().unwrap();
499        }
500        let props = Properties::concat(&new);
501        Hir { kind: HirKind::Concat(new), props }
502    }
503
504    /// Returns the alternation of the given expressions.
505    ///
506    /// This flattens and simplifies the alternation as appropriate. This may
507    /// include factoring out common prefixes or even rewriting the alternation
508    /// as a character class.
509    ///
510    /// Note that an empty alternation is equivalent to `Hir::fail()`. (It
511    /// is not possible for one to write an empty alternation, or even an
512    /// alternation with a single sub-expression, in the concrete syntax of a
513    /// regex.)
514    ///
515    /// # Example
516    ///
517    /// This is a simple example showing how an alternation might get
518    /// simplified.
519    ///
520    /// ```
521    /// use regex_syntax::hir::{Hir, Class, ClassUnicode, ClassUnicodeRange};
522    ///
523    /// let hir = Hir::alternation(vec![
524    ///     Hir::literal([b'a']),
525    ///     Hir::literal([b'b']),
526    ///     Hir::literal([b'c']),
527    ///     Hir::literal([b'd']),
528    ///     Hir::literal([b'e']),
529    ///     Hir::literal([b'f']),
530    /// ]);
531    /// let expected = Hir::class(Class::Unicode(ClassUnicode::new([
532    ///     ClassUnicodeRange::new('a', 'f'),
533    /// ])));
534    /// assert_eq!(expected, hir);
535    /// ```
536    ///
537    /// And another example showing how common prefixes might get factored
538    /// out.
539    ///
540    /// ```
541    /// use regex_syntax::hir::{Hir, Class, ClassUnicode, ClassUnicodeRange};
542    ///
543    /// let hir = Hir::alternation(vec![
544    ///     Hir::concat(vec![
545    ///         Hir::literal("abc".as_bytes()),
546    ///         Hir::class(Class::Unicode(ClassUnicode::new([
547    ///             ClassUnicodeRange::new('A', 'Z'),
548    ///         ]))),
549    ///     ]),
550    ///     Hir::concat(vec![
551    ///         Hir::literal("abc".as_bytes()),
552    ///         Hir::class(Class::Unicode(ClassUnicode::new([
553    ///             ClassUnicodeRange::new('a', 'z'),
554    ///         ]))),
555    ///     ]),
556    /// ]);
557    /// let expected = Hir::concat(vec![
558    ///     Hir::literal("abc".as_bytes()),
559    ///     Hir::alternation(vec![
560    ///         Hir::class(Class::Unicode(ClassUnicode::new([
561    ///             ClassUnicodeRange::new('A', 'Z'),
562    ///         ]))),
563    ///         Hir::class(Class::Unicode(ClassUnicode::new([
564    ///             ClassUnicodeRange::new('a', 'z'),
565    ///         ]))),
566    ///     ]),
567    /// ]);
568    /// assert_eq!(expected, hir);
569    /// ```
570    ///
571    /// Note that these sorts of simplifications are not guaranteed.
572    pub fn alternation(subs: Vec<Hir>) -> Hir {
573        // We rebuild the alternation by simplifying it. We proceed similarly
574        // as the concatenation case. But in this case, there's no literal
575        // simplification happening. We're just flattening alternations.
576        let mut new = Vec::with_capacity(subs.len());
577        for sub in subs {
578            let (kind, props) = sub.into_parts();
579            match kind {
580                HirKind::Alternation(subs2) => {
581                    new.extend(subs2);
582                }
583                kind => {
584                    new.push(Hir { kind, props });
585                }
586            }
587        }
588        if new.is_empty() {
589            return Hir::fail();
590        } else if new.len() == 1 {
591            return new.pop().unwrap();
592        }
593        // Now that it's completely flattened, look for the special case of
594        // 'char1|char2|...|charN' and collapse that into a class. Note that
595        // we look for 'char' first and then bytes. The issue here is that if
596        // we find both non-ASCII codepoints and non-ASCII singleton bytes,
597        // then it isn't actually possible to smush them into a single class.
598        // (Because classes are either "all codepoints" or "all bytes." You
599        // can have a class that both matches non-ASCII but valid UTF-8 and
600        // invalid UTF-8.) So we look for all chars and then all bytes, and
601        // don't handle anything else.
602        if let Some(singletons) = singleton_chars(&new) {
603            let it = singletons
604                .into_iter()
605                .map(|ch| ClassUnicodeRange { start: ch, end: ch });
606            return Hir::class(Class::Unicode(ClassUnicode::new(it)));
607        }
608        if let Some(singletons) = singleton_bytes(&new) {
609            let it = singletons
610                .into_iter()
611                .map(|b| ClassBytesRange { start: b, end: b });
612            return Hir::class(Class::Bytes(ClassBytes::new(it)));
613        }
614        // Similar to singleton chars, we can also look for alternations of
615        // classes. Those can be smushed into a single class.
616        if let Some(cls) = class_chars(&new) {
617            return Hir::class(cls);
618        }
619        if let Some(cls) = class_bytes(&new) {
620            return Hir::class(cls);
621        }
622        // Factor out a common prefix if we can, which might potentially
623        // simplify the expression and unlock other optimizations downstream.
624        // It also might generally make NFA matching and DFA construction
625        // faster by reducing the scope of branching in the regex.
626        new = match lift_common_prefix(new) {
627            Ok(hir) => return hir,
628            Err(unchanged) => unchanged,
629        };
630        let props = Properties::alternation(&new);
631        Hir { kind: HirKind::Alternation(new), props }
632    }
633
634    /// Returns an HIR expression for `.`.
635    ///
636    /// * [`Dot::AnyChar`] maps to `(?su-R:.)`.
637    /// * [`Dot::AnyByte`] maps to `(?s-Ru:.)`.
638    /// * [`Dot::AnyCharExceptLF`] maps to `(?u-Rs:.)`.
639    /// * [`Dot::AnyCharExceptCRLF`] maps to `(?Ru-s:.)`.
640    /// * [`Dot::AnyByteExceptLF`] maps to `(?-Rsu:.)`.
641    /// * [`Dot::AnyByteExceptCRLF`] maps to `(?R-su:.)`.
642    ///
643    /// # Example
644    ///
645    /// Note that this is a convenience routine for constructing the correct
646    /// character class based on the value of `Dot`. There is no explicit "dot"
647    /// HIR value. It is just an abbreviation for a common character class.
648    ///
649    /// ```
650    /// use regex_syntax::hir::{Hir, Dot, Class, ClassBytes, ClassBytesRange};
651    ///
652    /// let hir = Hir::dot(Dot::AnyByte);
653    /// let expected = Hir::class(Class::Bytes(ClassBytes::new([
654    ///     ClassBytesRange::new(0x00, 0xFF),
655    /// ])));
656    /// assert_eq!(expected, hir);
657    /// ```
658    #[inline]
659    pub fn dot(dot: Dot) -> Hir {
660        match dot {
661            Dot::AnyChar => Hir::class(Class::Unicode(ClassUnicode::new([
662                ClassUnicodeRange::new('\0', '\u{10FFFF}'),
663            ]))),
664            Dot::AnyByte => Hir::class(Class::Bytes(ClassBytes::new([
665                ClassBytesRange::new(b'\0', b'\xFF'),
666            ]))),
667            Dot::AnyCharExcept(ch) => {
668                let mut cls =
669                    ClassUnicode::new([ClassUnicodeRange::new(ch, ch)]);
670                cls.negate();
671                Hir::class(Class::Unicode(cls))
672            }
673            Dot::AnyCharExceptLF => {
674                Hir::class(Class::Unicode(ClassUnicode::new([
675                    ClassUnicodeRange::new('\0', '\x09'),
676                    ClassUnicodeRange::new('\x0B', '\u{10FFFF}'),
677                ])))
678            }
679            Dot::AnyCharExceptCRLF => {
680                Hir::class(Class::Unicode(ClassUnicode::new([
681                    ClassUnicodeRange::new('\0', '\x09'),
682                    ClassUnicodeRange::new('\x0B', '\x0C'),
683                    ClassUnicodeRange::new('\x0E', '\u{10FFFF}'),
684                ])))
685            }
686            Dot::AnyByteExcept(byte) => {
687                let mut cls =
688                    ClassBytes::new([ClassBytesRange::new(byte, byte)]);
689                cls.negate();
690                Hir::class(Class::Bytes(cls))
691            }
692            Dot::AnyByteExceptLF => {
693                Hir::class(Class::Bytes(ClassBytes::new([
694                    ClassBytesRange::new(b'\0', b'\x09'),
695                    ClassBytesRange::new(b'\x0B', b'\xFF'),
696                ])))
697            }
698            Dot::AnyByteExceptCRLF => {
699                Hir::class(Class::Bytes(ClassBytes::new([
700                    ClassBytesRange::new(b'\0', b'\x09'),
701                    ClassBytesRange::new(b'\x0B', b'\x0C'),
702                    ClassBytesRange::new(b'\x0E', b'\xFF'),
703                ])))
704            }
705        }
706    }
707}
708
709/// The underlying kind of an arbitrary [`Hir`] expression.
710///
711/// An `HirKind` is principally useful for doing case analysis on the type
712/// of a regular expression. If you're looking to build new `Hir` values,
713/// then you _must_ use the smart constructors defined on `Hir`, like
714/// [`Hir::repetition`], to build new `Hir` values. The API intentionally does
715/// not expose any way of building an `Hir` directly from an `HirKind`.
716#[derive(Clone, Debug, Eq, PartialEq)]
717pub enum HirKind {
718    /// The empty regular expression, which matches everything, including the
719    /// empty string.
720    Empty,
721    /// A literalstring that matches exactly these bytes.
722    Literal(Literal),
723    /// A single character class that matches any of the characters in the
724    /// class. A class can either consist of Unicode scalar values as
725    /// characters, or it can use bytes.
726    ///
727    /// A class may be empty. In which case, it matches nothing.
728    Class(Class),
729    /// A look-around assertion. A look-around match always has zero length.
730    Look(Look),
731    /// A repetition operation applied to a sub-expression.
732    Repetition(Repetition),
733    /// A capturing group, which contains a sub-expression.
734    Capture(Capture),
735    /// A concatenation of expressions.
736    ///
737    /// A concatenation matches only if each of its sub-expressions match one
738    /// after the other.
739    ///
740    /// Concatenations are guaranteed by `Hir`'s smart constructors to always
741    /// have at least two sub-expressions.
742    Concat(Vec<Hir>),
743    /// An alternation of expressions.
744    ///
745    /// An alternation matches only if at least one of its sub-expressions
746    /// match. If multiple sub-expressions match, then the leftmost is
747    /// preferred.
748    ///
749    /// Alternations are guaranteed by `Hir`'s smart constructors to always
750    /// have at least two sub-expressions.
751    Alternation(Vec<Hir>),
752}
753
754impl HirKind {
755    /// Returns a slice of this kind's sub-expressions, if any.
756    pub fn subs(&self) -> &[Hir] {
757        use core::slice::from_ref;
758
759        match *self {
760            HirKind::Empty
761            | HirKind::Literal(_)
762            | HirKind::Class(_)
763            | HirKind::Look(_) => &[],
764            HirKind::Repetition(Repetition { ref sub, .. }) => from_ref(sub),
765            HirKind::Capture(Capture { ref sub, .. }) => from_ref(sub),
766            HirKind::Concat(ref subs) => subs,
767            HirKind::Alternation(ref subs) => subs,
768        }
769    }
770}
771
772impl core::fmt::Debug for Hir {
773    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
774        self.kind.fmt(f)
775    }
776}
777
778/// Print a display representation of this Hir.
779///
780/// The result of this is a valid regular expression pattern string.
781///
782/// This implementation uses constant stack space and heap space proportional
783/// to the size of the `Hir`.
784impl core::fmt::Display for Hir {
785    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
786        crate::hir::print::Printer::new().print(self, f)
787    }
788}
789
790/// The high-level intermediate representation of a literal.
791///
792/// A literal corresponds to `0` or more bytes that should be matched
793/// literally. The smart constructors defined on `Hir` will automatically
794/// concatenate adjacent literals into one literal, and will even automatically
795/// replace empty literals with `Hir::empty()`.
796///
797/// Note that despite a literal being represented by a sequence of bytes, its
798/// `Debug` implementation will attempt to print it as a normal string. (That
799/// is, not a sequence of decimal numbers.)
800#[derive(Clone, Eq, PartialEq)]
801pub struct Literal(pub Box<[u8]>);
802
803impl core::fmt::Debug for Literal {
804    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
805        crate::debug::Bytes(&self.0).fmt(f)
806    }
807}
808
809/// The high-level intermediate representation of a character class.
810///
811/// A character class corresponds to a set of characters. A character is either
812/// defined by a Unicode scalar value or a byte.
813///
814/// A character class, regardless of its character type, is represented by a
815/// sequence of non-overlapping non-adjacent ranges of characters.
816///
817/// There are no guarantees about which class variant is used. Generally
818/// speaking, the Unicode variat is used whenever a class needs to contain
819/// non-ASCII Unicode scalar values. But the Unicode variant can be used even
820/// when Unicode mode is disabled. For example, at the time of writing, the
821/// regex `(?-u:a|\xc2\xa0)` will compile down to HIR for the Unicode class
822/// `[a\u00A0]` due to optimizations.
823///
824/// Note that `Bytes` variant may be produced even when it exclusively matches
825/// valid UTF-8. This is because a `Bytes` variant represents an intention by
826/// the author of the regular expression to disable Unicode mode, which in turn
827/// impacts the semantics of case insensitive matching. For example, `(?i)k`
828/// and `(?i-u)k` will not match the same set of strings.
829#[derive(Clone, Eq, PartialEq)]
830pub enum Class {
831    /// A set of characters represented by Unicode scalar values.
832    Unicode(ClassUnicode),
833    /// A set of characters represented by arbitrary bytes (one byte per
834    /// character).
835    Bytes(ClassBytes),
836}
837
838impl Class {
839    /// Apply Unicode simple case folding to this character class, in place.
840    /// The character class will be expanded to include all simple case folded
841    /// character variants.
842    ///
843    /// If this is a byte oriented character class, then this will be limited
844    /// to the ASCII ranges `A-Z` and `a-z`.
845    ///
846    /// # Panics
847    ///
848    /// This routine panics when the case mapping data necessary for this
849    /// routine to complete is unavailable. This occurs when the `unicode-case`
850    /// feature is not enabled and the underlying class is Unicode oriented.
851    ///
852    /// Callers should prefer using `try_case_fold_simple` instead, which will
853    /// return an error instead of panicking.
854    pub fn case_fold_simple(&mut self) {
855        match *self {
856            Class::Unicode(ref mut x) => x.case_fold_simple(),
857            Class::Bytes(ref mut x) => x.case_fold_simple(),
858        }
859    }
860
861    /// Apply Unicode simple case folding to this character class, in place.
862    /// The character class will be expanded to include all simple case folded
863    /// character variants.
864    ///
865    /// If this is a byte oriented character class, then this will be limited
866    /// to the ASCII ranges `A-Z` and `a-z`.
867    ///
868    /// # Error
869    ///
870    /// This routine returns an error when the case mapping data necessary
871    /// for this routine to complete is unavailable. This occurs when the
872    /// `unicode-case` feature is not enabled and the underlying class is
873    /// Unicode oriented.
874    pub fn try_case_fold_simple(
875        &mut self,
876    ) -> core::result::Result<(), CaseFoldError> {
877        match *self {
878            Class::Unicode(ref mut x) => x.try_case_fold_simple()?,
879            Class::Bytes(ref mut x) => x.case_fold_simple(),
880        }
881        Ok(())
882    }
883
884    /// Negate this character class in place.
885    ///
886    /// After completion, this character class will contain precisely the
887    /// characters that weren't previously in the class.
888    pub fn negate(&mut self) {
889        match *self {
890            Class::Unicode(ref mut x) => x.negate(),
891            Class::Bytes(ref mut x) => x.negate(),
892        }
893    }
894
895    /// Returns true if and only if this character class will only ever match
896    /// valid UTF-8.
897    ///
898    /// A character class can match invalid UTF-8 only when the following
899    /// conditions are met:
900    ///
901    /// 1. The translator was configured to permit generating an expression
902    ///    that can match invalid UTF-8. (By default, this is disabled.)
903    /// 2. Unicode mode (via the `u` flag) was disabled either in the concrete
904    ///    syntax or in the parser builder. By default, Unicode mode is
905    ///    enabled.
906    pub fn is_utf8(&self) -> bool {
907        match *self {
908            Class::Unicode(_) => true,
909            Class::Bytes(ref x) => x.is_ascii(),
910        }
911    }
912
913    /// Returns the length, in bytes, of the smallest string matched by this
914    /// character class.
915    ///
916    /// For non-empty byte oriented classes, this always returns `1`. For
917    /// non-empty Unicode oriented classes, this can return `1`, `2`, `3` or
918    /// `4`. For empty classes, `None` is returned. It is impossible for `0` to
919    /// be returned.
920    ///
921    /// # Example
922    ///
923    /// This example shows some examples of regexes and their corresponding
924    /// minimum length, if any.
925    ///
926    /// ```
927    /// use regex_syntax::{hir::Properties, parse};
928    ///
929    /// // The empty string has a min length of 0.
930    /// let hir = parse(r"")?;
931    /// assert_eq!(Some(0), hir.properties().minimum_len());
932    /// // As do other types of regexes that only match the empty string.
933    /// let hir = parse(r"^$\b\B")?;
934    /// assert_eq!(Some(0), hir.properties().minimum_len());
935    /// // A regex that can match the empty string but match more is still 0.
936    /// let hir = parse(r"a*")?;
937    /// assert_eq!(Some(0), hir.properties().minimum_len());
938    /// // A regex that matches nothing has no minimum defined.
939    /// let hir = parse(r"[a&&b]")?;
940    /// assert_eq!(None, hir.properties().minimum_len());
941    /// // Character classes usually have a minimum length of 1.
942    /// let hir = parse(r"\w")?;
943    /// assert_eq!(Some(1), hir.properties().minimum_len());
944    /// // But sometimes Unicode classes might be bigger!
945    /// let hir = parse(r"\p{Cyrillic}")?;
946    /// assert_eq!(Some(2), hir.properties().minimum_len());
947    ///
948    /// # Ok::<(), Box<dyn std::error::Error>>(())
949    /// ```
950    pub fn minimum_len(&self) -> Option<usize> {
951        match *self {
952            Class::Unicode(ref x) => x.minimum_len(),
953            Class::Bytes(ref x) => x.minimum_len(),
954        }
955    }
956
957    /// Returns the length, in bytes, of the longest string matched by this
958    /// character class.
959    ///
960    /// For non-empty byte oriented classes, this always returns `1`. For
961    /// non-empty Unicode oriented classes, this can return `1`, `2`, `3` or
962    /// `4`. For empty classes, `None` is returned. It is impossible for `0` to
963    /// be returned.
964    ///
965    /// # Example
966    ///
967    /// This example shows some examples of regexes and their corresponding
968    /// maximum length, if any.
969    ///
970    /// ```
971    /// use regex_syntax::{hir::Properties, parse};
972    ///
973    /// // The empty string has a max length of 0.
974    /// let hir = parse(r"")?;
975    /// assert_eq!(Some(0), hir.properties().maximum_len());
976    /// // As do other types of regexes that only match the empty string.
977    /// let hir = parse(r"^$\b\B")?;
978    /// assert_eq!(Some(0), hir.properties().maximum_len());
979    /// // A regex that matches nothing has no maximum defined.
980    /// let hir = parse(r"[a&&b]")?;
981    /// assert_eq!(None, hir.properties().maximum_len());
982    /// // Bounded repeats work as you expect.
983    /// let hir = parse(r"x{2,10}")?;
984    /// assert_eq!(Some(10), hir.properties().maximum_len());
985    /// // An unbounded repeat means there is no maximum.
986    /// let hir = parse(r"x{2,}")?;
987    /// assert_eq!(None, hir.properties().maximum_len());
988    /// // With Unicode enabled, \w can match up to 4 bytes!
989    /// let hir = parse(r"\w")?;
990    /// assert_eq!(Some(4), hir.properties().maximum_len());
991    /// // Without Unicode enabled, \w matches at most 1 byte.
992    /// let hir = parse(r"(?-u)\w")?;
993    /// assert_eq!(Some(1), hir.properties().maximum_len());
994    ///
995    /// # Ok::<(), Box<dyn std::error::Error>>(())
996    /// ```
997    pub fn maximum_len(&self) -> Option<usize> {
998        match *self {
999            Class::Unicode(ref x) => x.maximum_len(),
1000            Class::Bytes(ref x) => x.maximum_len(),
1001        }
1002    }
1003
1004    /// Returns true if and only if this character class is empty. That is,
1005    /// it has no elements.
1006    ///
1007    /// An empty character can never match anything, including an empty string.
1008    pub fn is_empty(&self) -> bool {
1009        match *self {
1010            Class::Unicode(ref x) => x.ranges().is_empty(),
1011            Class::Bytes(ref x) => x.ranges().is_empty(),
1012        }
1013    }
1014
1015    /// If this class consists of exactly one element (whether a codepoint or a
1016    /// byte), then return it as a literal byte string.
1017    ///
1018    /// If this class is empty or contains more than one element, then `None`
1019    /// is returned.
1020    pub fn literal(&self) -> Option<Vec<u8>> {
1021        match *self {
1022            Class::Unicode(ref x) => x.literal(),
1023            Class::Bytes(ref x) => x.literal(),
1024        }
1025    }
1026}
1027
1028impl core::fmt::Debug for Class {
1029    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
1030        use crate::debug::Byte;
1031
1032        let mut fmter = f.debug_set();
1033        match *self {
1034            Class::Unicode(ref cls) => {
1035                for r in cls.ranges().iter() {
1036                    fmter.entry(&(r.start..=r.end));
1037                }
1038            }
1039            Class::Bytes(ref cls) => {
1040                for r in cls.ranges().iter() {
1041                    fmter.entry(&(Byte(r.start)..=Byte(r.end)));
1042                }
1043            }
1044        }
1045        fmter.finish()
1046    }
1047}
1048
1049/// A set of characters represented by Unicode scalar values.
1050#[derive(Clone, Debug, Eq, PartialEq)]
1051pub struct ClassUnicode {
1052    set: IntervalSet<ClassUnicodeRange>,
1053}
1054
1055impl ClassUnicode {
1056    /// Create a new class from a sequence of ranges.
1057    ///
1058    /// The given ranges do not need to be in any specific order, and ranges
1059    /// may overlap. Ranges will automatically be sorted into a canonical
1060    /// non-overlapping order.
1061    pub fn new<I>(ranges: I) -> ClassUnicode
1062    where
1063        I: IntoIterator<Item = ClassUnicodeRange>,
1064    {
1065        ClassUnicode { set: IntervalSet::new(ranges) }
1066    }
1067
1068    /// Create a new class with no ranges.
1069    ///
1070    /// An empty class matches nothing. That is, it is equivalent to
1071    /// [`Hir::fail`].
1072    pub fn empty() -> ClassUnicode {
1073        ClassUnicode::new(vec![])
1074    }
1075
1076    /// Add a new range to this set.
1077    pub fn push(&mut self, range: ClassUnicodeRange) {
1078        self.set.push(range);
1079    }
1080
1081    /// Return an iterator over all ranges in this class.
1082    ///
1083    /// The iterator yields ranges in ascending order.
1084    pub fn iter(&self) -> ClassUnicodeIter<'_> {
1085        ClassUnicodeIter(self.set.iter())
1086    }
1087
1088    /// Return the underlying ranges as a slice.
1089    pub fn ranges(&self) -> &[ClassUnicodeRange] {
1090        self.set.intervals()
1091    }
1092
1093    /// Expand this character class such that it contains all case folded
1094    /// characters, according to Unicode's "simple" mapping. For example, if
1095    /// this class consists of the range `a-z`, then applying case folding will
1096    /// result in the class containing both the ranges `a-z` and `A-Z`.
1097    ///
1098    /// # Panics
1099    ///
1100    /// This routine panics when the case mapping data necessary for this
1101    /// routine to complete is unavailable. This occurs when the `unicode-case`
1102    /// feature is not enabled.
1103    ///
1104    /// Callers should prefer using `try_case_fold_simple` instead, which will
1105    /// return an error instead of panicking.
1106    pub fn case_fold_simple(&mut self) {
1107        self.set
1108            .case_fold_simple()
1109            .expect("unicode-case feature must be enabled");
1110    }
1111
1112    /// Expand this character class such that it contains all case folded
1113    /// characters, according to Unicode's "simple" mapping. For example, if
1114    /// this class consists of the range `a-z`, then applying case folding will
1115    /// result in the class containing both the ranges `a-z` and `A-Z`.
1116    ///
1117    /// # Error
1118    ///
1119    /// This routine returns an error when the case mapping data necessary
1120    /// for this routine to complete is unavailable. This occurs when the
1121    /// `unicode-case` feature is not enabled.
1122    pub fn try_case_fold_simple(
1123        &mut self,
1124    ) -> core::result::Result<(), CaseFoldError> {
1125        self.set.case_fold_simple()
1126    }
1127
1128    /// Negate this character class.
1129    ///
1130    /// For all `c` where `c` is a Unicode scalar value, if `c` was in this
1131    /// set, then it will not be in this set after negation.
1132    pub fn negate(&mut self) {
1133        self.set.negate();
1134    }
1135
1136    /// Union this character class with the given character class, in place.
1137    pub fn union(&mut self, other: &ClassUnicode) {
1138        self.set.union(&other.set);
1139    }
1140
1141    /// Intersect this character class with the given character class, in
1142    /// place.
1143    pub fn intersect(&mut self, other: &ClassUnicode) {
1144        self.set.intersect(&other.set);
1145    }
1146
1147    /// Subtract the given character class from this character class, in place.
1148    pub fn difference(&mut self, other: &ClassUnicode) {
1149        self.set.difference(&other.set);
1150    }
1151
1152    /// Compute the symmetric difference of the given character classes, in
1153    /// place.
1154    ///
1155    /// This computes the symmetric difference of two character classes. This
1156    /// removes all elements in this class that are also in the given class,
1157    /// but all adds all elements from the given class that aren't in this
1158    /// class. That is, the class will contain all elements in either class,
1159    /// but will not contain any elements that are in both classes.
1160    pub fn symmetric_difference(&mut self, other: &ClassUnicode) {
1161        self.set.symmetric_difference(&other.set);
1162    }
1163
1164    /// Returns true if and only if this character class will either match
1165    /// nothing or only ASCII bytes. Stated differently, this returns false
1166    /// if and only if this class contains a non-ASCII codepoint.
1167    pub fn is_ascii(&self) -> bool {
1168        self.set.intervals().last().map_or(true, |r| r.end <= '\x7F')
1169    }
1170
1171    /// Returns the length, in bytes, of the smallest string matched by this
1172    /// character class.
1173    ///
1174    /// Returns `None` when the class is empty.
1175    pub fn minimum_len(&self) -> Option<usize> {
1176        let first = self.ranges().get(0)?;
1177        // Correct because c1 < c2 implies c1.len_utf8() < c2.len_utf8().
1178        Some(first.start.len_utf8())
1179    }
1180
1181    /// Returns the length, in bytes, of the longest string matched by this
1182    /// character class.
1183    ///
1184    /// Returns `None` when the class is empty.
1185    pub fn maximum_len(&self) -> Option<usize> {
1186        let last = self.ranges().last()?;
1187        // Correct because c1 < c2 implies c1.len_utf8() < c2.len_utf8().
1188        Some(last.end.len_utf8())
1189    }
1190
1191    /// If this class consists of exactly one codepoint, then return it as
1192    /// a literal byte string.
1193    ///
1194    /// If this class is empty or contains more than one codepoint, then `None`
1195    /// is returned.
1196    pub fn literal(&self) -> Option<Vec<u8>> {
1197        let rs = self.ranges();
1198        if rs.len() == 1 && rs[0].start == rs[0].end {
1199            Some(rs[0].start.encode_utf8(&mut [0; 4]).to_string().into_bytes())
1200        } else {
1201            None
1202        }
1203    }
1204
1205    /// If this class consists of only ASCII ranges, then return its
1206    /// corresponding and equivalent byte class.
1207    pub fn to_byte_class(&self) -> Option<ClassBytes> {
1208        if !self.is_ascii() {
1209            return None;
1210        }
1211        Some(ClassBytes::new(self.ranges().iter().map(|r| {
1212            // Since we are guaranteed that our codepoint range is ASCII, the
1213            // 'u8::try_from' calls below are guaranteed to be correct.
1214            ClassBytesRange {
1215                start: u8::try_from(r.start).unwrap(),
1216                end: u8::try_from(r.end).unwrap(),
1217            }
1218        })))
1219    }
1220}
1221
1222/// An iterator over all ranges in a Unicode character class.
1223///
1224/// The lifetime `'a` refers to the lifetime of the underlying class.
1225#[derive(Debug)]
1226pub struct ClassUnicodeIter<'a>(IntervalSetIter<'a, ClassUnicodeRange>);
1227
1228impl<'a> Iterator for ClassUnicodeIter<'a> {
1229    type Item = &'a ClassUnicodeRange;
1230
1231    fn next(&mut self) -> Option<&'a ClassUnicodeRange> {
1232        self.0.next()
1233    }
1234}
1235
1236/// A single range of characters represented by Unicode scalar values.
1237///
1238/// The range is closed. That is, the start and end of the range are included
1239/// in the range.
1240#[derive(Clone, Copy, Default, Eq, PartialEq, PartialOrd, Ord)]
1241pub struct ClassUnicodeRange {
1242    start: char,
1243    end: char,
1244}
1245
1246impl core::fmt::Debug for ClassUnicodeRange {
1247    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
1248        let start = if !self.start.is_whitespace() && !self.start.is_control()
1249        {
1250            self.start.to_string()
1251        } else {
1252            format!("0x{:X}", u32::from(self.start))
1253        };
1254        let end = if !self.end.is_whitespace() && !self.end.is_control() {
1255            self.end.to_string()
1256        } else {
1257            format!("0x{:X}", u32::from(self.end))
1258        };
1259        f.debug_struct("ClassUnicodeRange")
1260            .field("start", &start)
1261            .field("end", &end)
1262            .finish()
1263    }
1264}
1265
1266impl Interval for ClassUnicodeRange {
1267    type Bound = char;
1268
1269    #[inline]
1270    fn lower(&self) -> char {
1271        self.start
1272    }
1273    #[inline]
1274    fn upper(&self) -> char {
1275        self.end
1276    }
1277    #[inline]
1278    fn set_lower(&mut self, bound: char) {
1279        self.start = bound;
1280    }
1281    #[inline]
1282    fn set_upper(&mut self, bound: char) {
1283        self.end = bound;
1284    }
1285
1286    /// Apply simple case folding to this Unicode scalar value range.
1287    ///
1288    /// Additional ranges are appended to the given vector. Canonical ordering
1289    /// is *not* maintained in the given vector.
1290    fn case_fold_simple(
1291        &self,
1292        ranges: &mut Vec<ClassUnicodeRange>,
1293    ) -> Result<(), unicode::CaseFoldError> {
1294        let mut folder = unicode::SimpleCaseFolder::new()?;
1295        if !folder.overlaps(self.start, self.end) {
1296            return Ok(());
1297        }
1298        let (start, end) = (u32::from(self.start), u32::from(self.end));
1299        for cp in (start..=end).filter_map(char::from_u32) {
1300            for &cp_folded in folder.mapping(cp) {
1301                ranges.push(ClassUnicodeRange::new(cp_folded, cp_folded));
1302            }
1303        }
1304        Ok(())
1305    }
1306}
1307
1308impl ClassUnicodeRange {
1309    /// Create a new Unicode scalar value range for a character class.
1310    ///
1311    /// The returned range is always in a canonical form. That is, the range
1312    /// returned always satisfies the invariant that `start <= end`.
1313    pub fn new(start: char, end: char) -> ClassUnicodeRange {
1314        ClassUnicodeRange::create(start, end)
1315    }
1316
1317    /// Return the start of this range.
1318    ///
1319    /// The start of a range is always less than or equal to the end of the
1320    /// range.
1321    pub fn start(&self) -> char {
1322        self.start
1323    }
1324
1325    /// Return the end of this range.
1326    ///
1327    /// The end of a range is always greater than or equal to the start of the
1328    /// range.
1329    pub fn end(&self) -> char {
1330        self.end
1331    }
1332
1333    /// Returns the number of codepoints in this range.
1334    pub fn len(&self) -> usize {
1335        let diff = 1 + u32::from(self.end) - u32::from(self.start);
1336        // This is likely to panic in 16-bit targets since a usize can only fit
1337        // 2^16. It's not clear what to do here, other than to return an error
1338        // when building a Unicode class that contains a range whose length
1339        // overflows usize. (Which, to be honest, is probably quite common on
1340        // 16-bit targets. For example, this would imply that '.' and '\p{any}'
1341        // would be impossible to build.)
1342        usize::try_from(diff).expect("char class len fits in usize")
1343    }
1344}
1345
1346/// A set of characters represented by arbitrary bytes.
1347///
1348/// Each byte corresponds to one character.
1349#[derive(Clone, Debug, Eq, PartialEq)]
1350pub struct ClassBytes {
1351    set: IntervalSet<ClassBytesRange>,
1352}
1353
1354impl ClassBytes {
1355    /// Create a new class from a sequence of ranges.
1356    ///
1357    /// The given ranges do not need to be in any specific order, and ranges
1358    /// may overlap. Ranges will automatically be sorted into a canonical
1359    /// non-overlapping order.
1360    pub fn new<I>(ranges: I) -> ClassBytes
1361    where
1362        I: IntoIterator<Item = ClassBytesRange>,
1363    {
1364        ClassBytes { set: IntervalSet::new(ranges) }
1365    }
1366
1367    /// Create a new class with no ranges.
1368    ///
1369    /// An empty class matches nothing. That is, it is equivalent to
1370    /// [`Hir::fail`].
1371    pub fn empty() -> ClassBytes {
1372        ClassBytes::new(vec![])
1373    }
1374
1375    /// Add a new range to this set.
1376    pub fn push(&mut self, range: ClassBytesRange) {
1377        self.set.push(range);
1378    }
1379
1380    /// Return an iterator over all ranges in this class.
1381    ///
1382    /// The iterator yields ranges in ascending order.
1383    pub fn iter(&self) -> ClassBytesIter<'_> {
1384        ClassBytesIter(self.set.iter())
1385    }
1386
1387    /// Return the underlying ranges as a slice.
1388    pub fn ranges(&self) -> &[ClassBytesRange] {
1389        self.set.intervals()
1390    }
1391
1392    /// Expand this character class such that it contains all case folded
1393    /// characters. For example, if this class consists of the range `a-z`,
1394    /// then applying case folding will result in the class containing both the
1395    /// ranges `a-z` and `A-Z`.
1396    ///
1397    /// Note that this only applies ASCII case folding, which is limited to the
1398    /// characters `a-z` and `A-Z`.
1399    pub fn case_fold_simple(&mut self) {
1400        self.set.case_fold_simple().expect("ASCII case folding never fails");
1401    }
1402
1403    /// Negate this byte class.
1404    ///
1405    /// For all `b` where `b` is a any byte, if `b` was in this set, then it
1406    /// will not be in this set after negation.
1407    pub fn negate(&mut self) {
1408        self.set.negate();
1409    }
1410
1411    /// Union this byte class with the given byte class, in place.
1412    pub fn union(&mut self, other: &ClassBytes) {
1413        self.set.union(&other.set);
1414    }
1415
1416    /// Intersect this byte class with the given byte class, in place.
1417    pub fn intersect(&mut self, other: &ClassBytes) {
1418        self.set.intersect(&other.set);
1419    }
1420
1421    /// Subtract the given byte class from this byte class, in place.
1422    pub fn difference(&mut self, other: &ClassBytes) {
1423        self.set.difference(&other.set);
1424    }
1425
1426    /// Compute the symmetric difference of the given byte classes, in place.
1427    ///
1428    /// This computes the symmetric difference of two byte classes. This
1429    /// removes all elements in this class that are also in the given class,
1430    /// but all adds all elements from the given class that aren't in this
1431    /// class. That is, the class will contain all elements in either class,
1432    /// but will not contain any elements that are in both classes.
1433    pub fn symmetric_difference(&mut self, other: &ClassBytes) {
1434        self.set.symmetric_difference(&other.set);
1435    }
1436
1437    /// Returns true if and only if this character class will either match
1438    /// nothing or only ASCII bytes. Stated differently, this returns false
1439    /// if and only if this class contains a non-ASCII byte.
1440    pub fn is_ascii(&self) -> bool {
1441        self.set.intervals().last().map_or(true, |r| r.end <= 0x7F)
1442    }
1443
1444    /// Returns the length, in bytes, of the smallest string matched by this
1445    /// character class.
1446    ///
1447    /// Returns `None` when the class is empty.
1448    pub fn minimum_len(&self) -> Option<usize> {
1449        if self.ranges().is_empty() {
1450            None
1451        } else {
1452            Some(1)
1453        }
1454    }
1455
1456    /// Returns the length, in bytes, of the longest string matched by this
1457    /// character class.
1458    ///
1459    /// Returns `None` when the class is empty.
1460    pub fn maximum_len(&self) -> Option<usize> {
1461        if self.ranges().is_empty() {
1462            None
1463        } else {
1464            Some(1)
1465        }
1466    }
1467
1468    /// If this class consists of exactly one byte, then return it as
1469    /// a literal byte string.
1470    ///
1471    /// If this class is empty or contains more than one byte, then `None`
1472    /// is returned.
1473    pub fn literal(&self) -> Option<Vec<u8>> {
1474        let rs = self.ranges();
1475        if rs.len() == 1 && rs[0].start == rs[0].end {
1476            Some(vec![rs[0].start])
1477        } else {
1478            None
1479        }
1480    }
1481
1482    /// If this class consists of only ASCII ranges, then return its
1483    /// corresponding and equivalent Unicode class.
1484    pub fn to_unicode_class(&self) -> Option<ClassUnicode> {
1485        if !self.is_ascii() {
1486            return None;
1487        }
1488        Some(ClassUnicode::new(self.ranges().iter().map(|r| {
1489            // Since we are guaranteed that our byte range is ASCII, the
1490            // 'char::from' calls below are correct and will not erroneously
1491            // convert a raw byte value into its corresponding codepoint.
1492            ClassUnicodeRange {
1493                start: char::from(r.start),
1494                end: char::from(r.end),
1495            }
1496        })))
1497    }
1498}
1499
1500/// An iterator over all ranges in a byte character class.
1501///
1502/// The lifetime `'a` refers to the lifetime of the underlying class.
1503#[derive(Debug)]
1504pub struct ClassBytesIter<'a>(IntervalSetIter<'a, ClassBytesRange>);
1505
1506impl<'a> Iterator for ClassBytesIter<'a> {
1507    type Item = &'a ClassBytesRange;
1508
1509    fn next(&mut self) -> Option<&'a ClassBytesRange> {
1510        self.0.next()
1511    }
1512}
1513
1514/// A single range of characters represented by arbitrary bytes.
1515///
1516/// The range is closed. That is, the start and end of the range are included
1517/// in the range.
1518#[derive(Clone, Copy, Default, Eq, PartialEq, PartialOrd, Ord)]
1519pub struct ClassBytesRange {
1520    start: u8,
1521    end: u8,
1522}
1523
1524impl Interval for ClassBytesRange {
1525    type Bound = u8;
1526
1527    #[inline]
1528    fn lower(&self) -> u8 {
1529        self.start
1530    }
1531    #[inline]
1532    fn upper(&self) -> u8 {
1533        self.end
1534    }
1535    #[inline]
1536    fn set_lower(&mut self, bound: u8) {
1537        self.start = bound;
1538    }
1539    #[inline]
1540    fn set_upper(&mut self, bound: u8) {
1541        self.end = bound;
1542    }
1543
1544    /// Apply simple case folding to this byte range. Only ASCII case mappings
1545    /// (for a-z) are applied.
1546    ///
1547    /// Additional ranges are appended to the given vector. Canonical ordering
1548    /// is *not* maintained in the given vector.
1549    fn case_fold_simple(
1550        &self,
1551        ranges: &mut Vec<ClassBytesRange>,
1552    ) -> Result<(), unicode::CaseFoldError> {
1553        if !ClassBytesRange::new(b'a', b'z').is_intersection_empty(self) {
1554            let lower = cmp::max(self.start, b'a');
1555            let upper = cmp::min(self.end, b'z');
1556            ranges.push(ClassBytesRange::new(lower - 32, upper - 32));
1557        }
1558        if !ClassBytesRange::new(b'A', b'Z').is_intersection_empty(self) {
1559            let lower = cmp::max(self.start, b'A');
1560            let upper = cmp::min(self.end, b'Z');
1561            ranges.push(ClassBytesRange::new(lower + 32, upper + 32));
1562        }
1563        Ok(())
1564    }
1565}
1566
1567impl ClassBytesRange {
1568    /// Create a new byte range for a character class.
1569    ///
1570    /// The returned range is always in a canonical form. That is, the range
1571    /// returned always satisfies the invariant that `start <= end`.
1572    pub fn new(start: u8, end: u8) -> ClassBytesRange {
1573        ClassBytesRange::create(start, end)
1574    }
1575
1576    /// Return the start of this range.
1577    ///
1578    /// The start of a range is always less than or equal to the end of the
1579    /// range.
1580    pub fn start(&self) -> u8 {
1581        self.start
1582    }
1583
1584    /// Return the end of this range.
1585    ///
1586    /// The end of a range is always greater than or equal to the start of the
1587    /// range.
1588    pub fn end(&self) -> u8 {
1589        self.end
1590    }
1591
1592    /// Returns the number of bytes in this range.
1593    pub fn len(&self) -> usize {
1594        usize::from(self.end.checked_sub(self.start).unwrap())
1595            .checked_add(1)
1596            .unwrap()
1597    }
1598}
1599
1600impl core::fmt::Debug for ClassBytesRange {
1601    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
1602        f.debug_struct("ClassBytesRange")
1603            .field("start", &crate::debug::Byte(self.start))
1604            .field("end", &crate::debug::Byte(self.end))
1605            .finish()
1606    }
1607}
1608
1609/// The high-level intermediate representation for a look-around assertion.
1610///
1611/// An assertion match is always zero-length. Also called an "empty match."
1612#[derive(Clone, Copy, Debug, Eq, PartialEq)]
1613pub enum Look {
1614    /// Match the beginning of text. Specifically, this matches at the starting
1615    /// position of the input.
1616    Start = 1 << 0,
1617    /// Match the end of text. Specifically, this matches at the ending
1618    /// position of the input.
1619    End = 1 << 1,
1620    /// Match the beginning of a line or the beginning of text. Specifically,
1621    /// this matches at the starting position of the input, or at the position
1622    /// immediately following a `\n` character.
1623    StartLF = 1 << 2,
1624    /// Match the end of a line or the end of text. Specifically, this matches
1625    /// at the end position of the input, or at the position immediately
1626    /// preceding a `\n` character.
1627    EndLF = 1 << 3,
1628    /// Match the beginning of a line or the beginning of text. Specifically,
1629    /// this matches at the starting position of the input, or at the position
1630    /// immediately following either a `\r` or `\n` character, but never after
1631    /// a `\r` when a `\n` follows.
1632    StartCRLF = 1 << 4,
1633    /// Match the end of a line or the end of text. Specifically, this matches
1634    /// at the end position of the input, or at the position immediately
1635    /// preceding a `\r` or `\n` character, but never before a `\n` when a `\r`
1636    /// precedes it.
1637    EndCRLF = 1 << 5,
1638    /// Match an ASCII-only word boundary. That is, this matches a position
1639    /// where the left adjacent character and right adjacent character
1640    /// correspond to a word and non-word or a non-word and word character.
1641    WordAscii = 1 << 6,
1642    /// Match an ASCII-only negation of a word boundary.
1643    WordAsciiNegate = 1 << 7,
1644    /// Match a Unicode-aware word boundary. That is, this matches a position
1645    /// where the left adjacent character and right adjacent character
1646    /// correspond to a word and non-word or a non-word and word character.
1647    WordUnicode = 1 << 8,
1648    /// Match a Unicode-aware negation of a word boundary.
1649    WordUnicodeNegate = 1 << 9,
1650    /// Match the start of an ASCII-only word boundary. That is, this matches a
1651    /// position at either the beginning of the haystack or where the previous
1652    /// character is not a word character and the following character is a word
1653    /// character.
1654    WordStartAscii = 1 << 10,
1655    /// Match the end of an ASCII-only word boundary. That is, this matches
1656    /// a position at either the end of the haystack or where the previous
1657    /// character is a word character and the following character is not a word
1658    /// character.
1659    WordEndAscii = 1 << 11,
1660    /// Match the start of a Unicode word boundary. That is, this matches a
1661    /// position at either the beginning of the haystack or where the previous
1662    /// character is not a word character and the following character is a word
1663    /// character.
1664    WordStartUnicode = 1 << 12,
1665    /// Match the end of a Unicode word boundary. That is, this matches a
1666    /// position at either the end of the haystack or where the previous
1667    /// character is a word character and the following character is not a word
1668    /// character.
1669    WordEndUnicode = 1 << 13,
1670    /// Match the start half of an ASCII-only word boundary. That is, this
1671    /// matches a position at either the beginning of the haystack or where the
1672    /// previous character is not a word character.
1673    WordStartHalfAscii = 1 << 14,
1674    /// Match the end half of an ASCII-only word boundary. That is, this
1675    /// matches a position at either the end of the haystack or where the
1676    /// following character is not a word character.
1677    WordEndHalfAscii = 1 << 15,
1678    /// Match the start half of a Unicode word boundary. That is, this matches
1679    /// a position at either the beginning of the haystack or where the
1680    /// previous character is not a word character.
1681    WordStartHalfUnicode = 1 << 16,
1682    /// Match the end half of a Unicode word boundary. That is, this matches
1683    /// a position at either the end of the haystack or where the following
1684    /// character is not a word character.
1685    WordEndHalfUnicode = 1 << 17,
1686}
1687
1688impl Look {
1689    /// Flip the look-around assertion to its equivalent for reverse searches.
1690    /// For example, `StartLF` gets translated to `EndLF`.
1691    ///
1692    /// Some assertions, such as `WordUnicode`, remain the same since they
1693    /// match the same positions regardless of the direction of the search.
1694    #[inline]
1695    pub const fn reversed(self) -> Look {
1696        match self {
1697            Look::Start => Look::End,
1698            Look::End => Look::Start,
1699            Look::StartLF => Look::EndLF,
1700            Look::EndLF => Look::StartLF,
1701            Look::StartCRLF => Look::EndCRLF,
1702            Look::EndCRLF => Look::StartCRLF,
1703            Look::WordAscii => Look::WordAscii,
1704            Look::WordAsciiNegate => Look::WordAsciiNegate,
1705            Look::WordUnicode => Look::WordUnicode,
1706            Look::WordUnicodeNegate => Look::WordUnicodeNegate,
1707            Look::WordStartAscii => Look::WordEndAscii,
1708            Look::WordEndAscii => Look::WordStartAscii,
1709            Look::WordStartUnicode => Look::WordEndUnicode,
1710            Look::WordEndUnicode => Look::WordStartUnicode,
1711            Look::WordStartHalfAscii => Look::WordEndHalfAscii,
1712            Look::WordEndHalfAscii => Look::WordStartHalfAscii,
1713            Look::WordStartHalfUnicode => Look::WordEndHalfUnicode,
1714            Look::WordEndHalfUnicode => Look::WordStartHalfUnicode,
1715        }
1716    }
1717
1718    /// Return the underlying representation of this look-around enumeration
1719    /// as an integer. Giving the return value to the [`Look::from_repr`]
1720    /// constructor is guaranteed to return the same look-around variant that
1721    /// one started with within a semver compatible release of this crate.
1722    #[inline]
1723    pub const fn as_repr(self) -> u32 {
1724        // AFAIK, 'as' is the only way to zero-cost convert an int enum to an
1725        // actual int.
1726        self as u32
1727    }
1728
1729    /// Given the underlying representation of a `Look` value, return the
1730    /// corresponding `Look` value if the representation is valid. Otherwise
1731    /// `None` is returned.
1732    #[inline]
1733    pub const fn from_repr(repr: u32) -> Option<Look> {
1734        match repr {
1735            0b00_0000_0000_0000_0001 => Some(Look::Start),
1736            0b00_0000_0000_0000_0010 => Some(Look::End),
1737            0b00_0000_0000_0000_0100 => Some(Look::StartLF),
1738            0b00_0000_0000_0000_1000 => Some(Look::EndLF),
1739            0b00_0000_0000_0001_0000 => Some(Look::StartCRLF),
1740            0b00_0000_0000_0010_0000 => Some(Look::EndCRLF),
1741            0b00_0000_0000_0100_0000 => Some(Look::WordAscii),
1742            0b00_0000_0000_1000_0000 => Some(Look::WordAsciiNegate),
1743            0b00_0000_0001_0000_0000 => Some(Look::WordUnicode),
1744            0b00_0000_0010_0000_0000 => Some(Look::WordUnicodeNegate),
1745            0b00_0000_0100_0000_0000 => Some(Look::WordStartAscii),
1746            0b00_0000_1000_0000_0000 => Some(Look::WordEndAscii),
1747            0b00_0001_0000_0000_0000 => Some(Look::WordStartUnicode),
1748            0b00_0010_0000_0000_0000 => Some(Look::WordEndUnicode),
1749            0b00_0100_0000_0000_0000 => Some(Look::WordStartHalfAscii),
1750            0b00_1000_0000_0000_0000 => Some(Look::WordEndHalfAscii),
1751            0b01_0000_0000_0000_0000 => Some(Look::WordStartHalfUnicode),
1752            0b10_0000_0000_0000_0000 => Some(Look::WordEndHalfUnicode),
1753            _ => None,
1754        }
1755    }
1756
1757    /// Returns a convenient single codepoint representation of this
1758    /// look-around assertion. Each assertion is guaranteed to be represented
1759    /// by a distinct character.
1760    ///
1761    /// This is useful for succinctly representing a look-around assertion in
1762    /// human friendly but succinct output intended for a programmer working on
1763    /// regex internals.
1764    #[inline]
1765    pub const fn as_char(self) -> char {
1766        match self {
1767            Look::Start => 'A',
1768            Look::End => 'z',
1769            Look::StartLF => '^',
1770            Look::EndLF => '$',
1771            Look::StartCRLF => 'r',
1772            Look::EndCRLF => 'R',
1773            Look::WordAscii => 'b',
1774            Look::WordAsciiNegate => 'B',
1775            Look::WordUnicode => '𝛃',
1776            Look::WordUnicodeNegate => '𝚩',
1777            Look::WordStartAscii => '<',
1778            Look::WordEndAscii => '>',
1779            Look::WordStartUnicode => '〈',
1780            Look::WordEndUnicode => '〉',
1781            Look::WordStartHalfAscii => '◁',
1782            Look::WordEndHalfAscii => '▷',
1783            Look::WordStartHalfUnicode => '◀',
1784            Look::WordEndHalfUnicode => '▶',
1785        }
1786    }
1787}
1788
1789/// The high-level intermediate representation for a capturing group.
1790///
1791/// A capturing group always has an index and a child expression. It may
1792/// also have a name associated with it (e.g., `(?P<foo>\w)`), but it's not
1793/// necessary.
1794///
1795/// Note that there is no explicit representation of a non-capturing group
1796/// in a `Hir`. Instead, non-capturing grouping is handled automatically by
1797/// the recursive structure of the `Hir` itself.
1798#[derive(Clone, Debug, Eq, PartialEq)]
1799pub struct Capture {
1800    /// The capture index of the capture.
1801    pub index: u32,
1802    /// The name of the capture, if it exists.
1803    pub name: Option<Box<str>>,
1804    /// The expression inside the capturing group, which may be empty.
1805    pub sub: Box<Hir>,
1806}
1807
1808/// The high-level intermediate representation of a repetition operator.
1809///
1810/// A repetition operator permits the repetition of an arbitrary
1811/// sub-expression.
1812#[derive(Clone, Debug, Eq, PartialEq)]
1813pub struct Repetition {
1814    /// The minimum range of the repetition.
1815    ///
1816    /// Note that special cases like `?`, `+` and `*` all get translated into
1817    /// the ranges `{0,1}`, `{1,}` and `{0,}`, respectively.
1818    ///
1819    /// When `min` is zero, this expression can match the empty string
1820    /// regardless of what its sub-expression is.
1821    pub min: u32,
1822    /// The maximum range of the repetition.
1823    ///
1824    /// Note that when `max` is `None`, `min` acts as a lower bound but where
1825    /// there is no upper bound. For something like `x{5}` where the min and
1826    /// max are equivalent, `min` will be set to `5` and `max` will be set to
1827    /// `Some(5)`.
1828    pub max: Option<u32>,
1829    /// Whether this repetition operator is greedy or not. A greedy operator
1830    /// will match as much as it can. A non-greedy operator will match as
1831    /// little as it can.
1832    ///
1833    /// Typically, operators are greedy by default and are only non-greedy when
1834    /// a `?` suffix is used, e.g., `(expr)*` is greedy while `(expr)*?` is
1835    /// not. However, this can be inverted via the `U` "ungreedy" flag.
1836    pub greedy: bool,
1837    /// The expression being repeated.
1838    pub sub: Box<Hir>,
1839}
1840
1841impl Repetition {
1842    /// Returns a new repetition with the same `min`, `max` and `greedy`
1843    /// values, but with its sub-expression replaced with the one given.
1844    pub fn with(&self, sub: Hir) -> Repetition {
1845        Repetition {
1846            min: self.min,
1847            max: self.max,
1848            greedy: self.greedy,
1849            sub: Box::new(sub),
1850        }
1851    }
1852}
1853
1854/// A type describing the different flavors of `.`.
1855///
1856/// This type is meant to be used with [`Hir::dot`], which is a convenience
1857/// routine for building HIR values derived from the `.` regex.
1858#[non_exhaustive]
1859#[derive(Clone, Copy, Debug, Eq, PartialEq)]
1860pub enum Dot {
1861    /// Matches the UTF-8 encoding of any Unicode scalar value.
1862    ///
1863    /// This is equivalent to `(?su:.)` and also `\p{any}`.
1864    AnyChar,
1865    /// Matches any byte value.
1866    ///
1867    /// This is equivalent to `(?s-u:.)` and also `(?-u:[\x00-\xFF])`.
1868    AnyByte,
1869    /// Matches the UTF-8 encoding of any Unicode scalar value except for the
1870    /// `char` given.
1871    ///
1872    /// This is equivalent to using `(?u-s:.)` with the line terminator set
1873    /// to a particular ASCII byte. (Because of peculiarities in the regex
1874    /// engines, a line terminator must be a single byte. It follows that when
1875    /// UTF-8 mode is enabled, this single byte must also be a Unicode scalar
1876    /// value. That is, ti must be ASCII.)
1877    ///
1878    /// (This and `AnyCharExceptLF` both exist because of legacy reasons.
1879    /// `AnyCharExceptLF` will be dropped in the next breaking change release.)
1880    AnyCharExcept(char),
1881    /// Matches the UTF-8 encoding of any Unicode scalar value except for `\n`.
1882    ///
1883    /// This is equivalent to `(?u-s:.)` and also `[\p{any}--\n]`.
1884    AnyCharExceptLF,
1885    /// Matches the UTF-8 encoding of any Unicode scalar value except for `\r`
1886    /// and `\n`.
1887    ///
1888    /// This is equivalent to `(?uR-s:.)` and also `[\p{any}--\r\n]`.
1889    AnyCharExceptCRLF,
1890    /// Matches any byte value except for the `u8` given.
1891    ///
1892    /// This is equivalent to using `(?-us:.)` with the line terminator set
1893    /// to a particular ASCII byte. (Because of peculiarities in the regex
1894    /// engines, a line terminator must be a single byte. It follows that when
1895    /// UTF-8 mode is enabled, this single byte must also be a Unicode scalar
1896    /// value. That is, ti must be ASCII.)
1897    ///
1898    /// (This and `AnyByteExceptLF` both exist because of legacy reasons.
1899    /// `AnyByteExceptLF` will be dropped in the next breaking change release.)
1900    AnyByteExcept(u8),
1901    /// Matches any byte value except for `\n`.
1902    ///
1903    /// This is equivalent to `(?-su:.)` and also `(?-u:[[\x00-\xFF]--\n])`.
1904    AnyByteExceptLF,
1905    /// Matches any byte value except for `\r` and `\n`.
1906    ///
1907    /// This is equivalent to `(?R-su:.)` and also `(?-u:[[\x00-\xFF]--\r\n])`.
1908    AnyByteExceptCRLF,
1909}
1910
1911/// A custom `Drop` impl is used for `HirKind` such that it uses constant stack
1912/// space but heap space proportional to the depth of the total `Hir`.
1913impl Drop for Hir {
1914    fn drop(&mut self) {
1915        use core::mem;
1916
1917        match *self.kind() {
1918            HirKind::Empty
1919            | HirKind::Literal(_)
1920            | HirKind::Class(_)
1921            | HirKind::Look(_) => return,
1922            HirKind::Capture(ref x) if x.sub.kind.subs().is_empty() => return,
1923            HirKind::Repetition(ref x) if x.sub.kind.subs().is_empty() => {
1924                return
1925            }
1926            HirKind::Concat(ref x) if x.is_empty() => return,
1927            HirKind::Alternation(ref x) if x.is_empty() => return,
1928            _ => {}
1929        }
1930
1931        let mut stack = vec![mem::replace(self, Hir::empty())];
1932        while let Some(mut expr) = stack.pop() {
1933            match expr.kind {
1934                HirKind::Empty
1935                | HirKind::Literal(_)
1936                | HirKind::Class(_)
1937                | HirKind::Look(_) => {}
1938                HirKind::Capture(ref mut x) => {
1939                    stack.push(mem::replace(&mut x.sub, Hir::empty()));
1940                }
1941                HirKind::Repetition(ref mut x) => {
1942                    stack.push(mem::replace(&mut x.sub, Hir::empty()));
1943                }
1944                HirKind::Concat(ref mut x) => {
1945                    stack.extend(x.drain(..));
1946                }
1947                HirKind::Alternation(ref mut x) => {
1948                    stack.extend(x.drain(..));
1949                }
1950            }
1951        }
1952    }
1953}
1954
1955/// A type that collects various properties of an HIR value.
1956///
1957/// Properties are always scalar values and represent meta data that is
1958/// computed inductively on an HIR value. Properties are defined for all
1959/// HIR values.
1960///
1961/// All methods on a `Properties` value take constant time and are meant to
1962/// be cheap to call.
1963#[derive(Clone, Debug, Eq, PartialEq)]
1964pub struct Properties(Box<PropertiesI>);
1965
1966/// The property definition. It is split out so that we can box it, and
1967/// there by make `Properties` use less stack size. This is kind-of important
1968/// because every HIR value has a `Properties` attached to it.
1969///
1970/// This does have the unfortunate consequence that creating any HIR value
1971/// always leads to at least one alloc for properties, but this is generally
1972/// true anyway (for pretty much all HirKinds except for look-arounds).
1973#[derive(Clone, Debug, Eq, PartialEq)]
1974struct PropertiesI {
1975    minimum_len: Option<usize>,
1976    maximum_len: Option<usize>,
1977    look_set: LookSet,
1978    look_set_prefix: LookSet,
1979    look_set_suffix: LookSet,
1980    look_set_prefix_any: LookSet,
1981    look_set_suffix_any: LookSet,
1982    utf8: bool,
1983    explicit_captures_len: usize,
1984    static_explicit_captures_len: Option<usize>,
1985    literal: bool,
1986    alternation_literal: bool,
1987}
1988
1989impl Properties {
1990    /// Returns the length (in bytes) of the smallest string matched by this
1991    /// HIR.
1992    ///
1993    /// A return value of `0` is possible and occurs when the HIR can match an
1994    /// empty string.
1995    ///
1996    /// `None` is returned when there is no minimum length. This occurs in
1997    /// precisely the cases where the HIR matches nothing. i.e., The language
1998    /// the regex matches is empty. An example of such a regex is `\P{any}`.
1999    #[inline]
2000    pub fn minimum_len(&self) -> Option<usize> {
2001        self.0.minimum_len
2002    }
2003
2004    /// Returns the length (in bytes) of the longest string matched by this
2005    /// HIR.
2006    ///
2007    /// A return value of `0` is possible and occurs when nothing longer than
2008    /// the empty string is in the language described by this HIR.
2009    ///
2010    /// `None` is returned when there is no longest matching string. This
2011    /// occurs when the HIR matches nothing or when there is no upper bound on
2012    /// the length of matching strings. Example of such regexes are `\P{any}`
2013    /// (matches nothing) and `a+` (has no upper bound).
2014    #[inline]
2015    pub fn maximum_len(&self) -> Option<usize> {
2016        self.0.maximum_len
2017    }
2018
2019    /// Returns a set of all look-around assertions that appear at least once
2020    /// in this HIR value.
2021    #[inline]
2022    pub fn look_set(&self) -> LookSet {
2023        self.0.look_set
2024    }
2025
2026    /// Returns a set of all look-around assertions that appear as a prefix for
2027    /// this HIR value. That is, the set returned corresponds to the set of
2028    /// assertions that must be passed before matching any bytes in a haystack.
2029    ///
2030    /// For example, `hir.look_set_prefix().contains(Look::Start)` returns true
2031    /// if and only if the HIR is fully anchored at the start.
2032    #[inline]
2033    pub fn look_set_prefix(&self) -> LookSet {
2034        self.0.look_set_prefix
2035    }
2036
2037    /// Returns a set of all look-around assertions that appear as a _possible_
2038    /// prefix for this HIR value. That is, the set returned corresponds to the
2039    /// set of assertions that _may_ be passed before matching any bytes in a
2040    /// haystack.
2041    ///
2042    /// For example, `hir.look_set_prefix_any().contains(Look::Start)` returns
2043    /// true if and only if it's possible for the regex to match through a
2044    /// anchored assertion before consuming any input.
2045    #[inline]
2046    pub fn look_set_prefix_any(&self) -> LookSet {
2047        self.0.look_set_prefix_any
2048    }
2049
2050    /// Returns a set of all look-around assertions that appear as a suffix for
2051    /// this HIR value. That is, the set returned corresponds to the set of
2052    /// assertions that must be passed in order to be considered a match after
2053    /// all other consuming HIR expressions.
2054    ///
2055    /// For example, `hir.look_set_suffix().contains(Look::End)` returns true
2056    /// if and only if the HIR is fully anchored at the end.
2057    #[inline]
2058    pub fn look_set_suffix(&self) -> LookSet {
2059        self.0.look_set_suffix
2060    }
2061
2062    /// Returns a set of all look-around assertions that appear as a _possible_
2063    /// suffix for this HIR value. That is, the set returned corresponds to the
2064    /// set of assertions that _may_ be passed before matching any bytes in a
2065    /// haystack.
2066    ///
2067    /// For example, `hir.look_set_suffix_any().contains(Look::End)` returns
2068    /// true if and only if it's possible for the regex to match through a
2069    /// anchored assertion at the end of a match without consuming any input.
2070    #[inline]
2071    pub fn look_set_suffix_any(&self) -> LookSet {
2072        self.0.look_set_suffix_any
2073    }
2074
2075    /// Return true if and only if the corresponding HIR will always match
2076    /// valid UTF-8.
2077    ///
2078    /// When this returns false, then it is possible for this HIR expression to
2079    /// match invalid UTF-8, including by matching between the code units of
2080    /// a single UTF-8 encoded codepoint.
2081    ///
2082    /// Note that this returns true even when the corresponding HIR can match
2083    /// the empty string. Since an empty string can technically appear between
2084    /// UTF-8 code units, it is possible for a match to be reported that splits
2085    /// a codepoint which could in turn be considered matching invalid UTF-8.
2086    /// However, it is generally assumed that such empty matches are handled
2087    /// specially by the search routine if it is absolutely required that
2088    /// matches not split a codepoint.
2089    ///
2090    /// # Example
2091    ///
2092    /// This code example shows the UTF-8 property of a variety of patterns.
2093    ///
2094    /// ```
2095    /// use regex_syntax::{ParserBuilder, parse};
2096    ///
2097    /// // Examples of 'is_utf8() == true'.
2098    /// assert!(parse(r"a")?.properties().is_utf8());
2099    /// assert!(parse(r"[^a]")?.properties().is_utf8());
2100    /// assert!(parse(r".")?.properties().is_utf8());
2101    /// assert!(parse(r"\W")?.properties().is_utf8());
2102    /// assert!(parse(r"\b")?.properties().is_utf8());
2103    /// assert!(parse(r"\B")?.properties().is_utf8());
2104    /// assert!(parse(r"(?-u)\b")?.properties().is_utf8());
2105    /// assert!(parse(r"(?-u)\B")?.properties().is_utf8());
2106    /// // Unicode mode is enabled by default, and in
2107    /// // that mode, all \x hex escapes are treated as
2108    /// // codepoints. So this actually matches the UTF-8
2109    /// // encoding of U+00FF.
2110    /// assert!(parse(r"\xFF")?.properties().is_utf8());
2111    ///
2112    /// // Now we show examples of 'is_utf8() == false'.
2113    /// // The only way to do this is to force the parser
2114    /// // to permit invalid UTF-8, otherwise all of these
2115    /// // would fail to parse!
2116    /// let parse = |pattern| {
2117    ///     ParserBuilder::new().utf8(false).build().parse(pattern)
2118    /// };
2119    /// assert!(!parse(r"(?-u)[^a]")?.properties().is_utf8());
2120    /// assert!(!parse(r"(?-u).")?.properties().is_utf8());
2121    /// assert!(!parse(r"(?-u)\W")?.properties().is_utf8());
2122    /// // Conversely to the equivalent example above,
2123    /// // when Unicode mode is disabled, \x hex escapes
2124    /// // are treated as their raw byte values.
2125    /// assert!(!parse(r"(?-u)\xFF")?.properties().is_utf8());
2126    /// // Note that just because we disabled UTF-8 in the
2127    /// // parser doesn't mean we still can't use Unicode.
2128    /// // It is enabled by default, so \xFF is still
2129    /// // equivalent to matching the UTF-8 encoding of
2130    /// // U+00FF by default.
2131    /// assert!(parse(r"\xFF")?.properties().is_utf8());
2132    /// // Even though we use raw bytes that individually
2133    /// // are not valid UTF-8, when combined together, the
2134    /// // overall expression *does* match valid UTF-8!
2135    /// assert!(parse(r"(?-u)\xE2\x98\x83")?.properties().is_utf8());
2136    ///
2137    /// # Ok::<(), Box<dyn std::error::Error>>(())
2138    /// ```
2139    #[inline]
2140    pub fn is_utf8(&self) -> bool {
2141        self.0.utf8
2142    }
2143
2144    /// Returns the total number of explicit capturing groups in the
2145    /// corresponding HIR.
2146    ///
2147    /// Note that this does not include the implicit capturing group
2148    /// corresponding to the entire match that is typically included by regex
2149    /// engines.
2150    ///
2151    /// # Example
2152    ///
2153    /// This method will return `0` for `a` and `1` for `(a)`:
2154    ///
2155    /// ```
2156    /// use regex_syntax::parse;
2157    ///
2158    /// assert_eq!(0, parse("a")?.properties().explicit_captures_len());
2159    /// assert_eq!(1, parse("(a)")?.properties().explicit_captures_len());
2160    ///
2161    /// # Ok::<(), Box<dyn std::error::Error>>(())
2162    /// ```
2163    #[inline]
2164    pub fn explicit_captures_len(&self) -> usize {
2165        self.0.explicit_captures_len
2166    }
2167
2168    /// Returns the total number of explicit capturing groups that appear in
2169    /// every possible match.
2170    ///
2171    /// If the number of capture groups can vary depending on the match, then
2172    /// this returns `None`. That is, a value is only returned when the number
2173    /// of matching groups is invariant or "static."
2174    ///
2175    /// Note that this does not include the implicit capturing group
2176    /// corresponding to the entire match.
2177    ///
2178    /// # Example
2179    ///
2180    /// This shows a few cases where a static number of capture groups is
2181    /// available and a few cases where it is not.
2182    ///
2183    /// ```
2184    /// use regex_syntax::parse;
2185    ///
2186    /// let len = |pattern| {
2187    ///     parse(pattern).map(|h| {
2188    ///         h.properties().static_explicit_captures_len()
2189    ///     })
2190    /// };
2191    ///
2192    /// assert_eq!(Some(0), len("a")?);
2193    /// assert_eq!(Some(1), len("(a)")?);
2194    /// assert_eq!(Some(1), len("(a)|(b)")?);
2195    /// assert_eq!(Some(2), len("(a)(b)|(c)(d)")?);
2196    /// assert_eq!(None, len("(a)|b")?);
2197    /// assert_eq!(None, len("a|(b)")?);
2198    /// assert_eq!(None, len("(b)*")?);
2199    /// assert_eq!(Some(1), len("(b)+")?);
2200    ///
2201    /// # Ok::<(), Box<dyn std::error::Error>>(())
2202    /// ```
2203    #[inline]
2204    pub fn static_explicit_captures_len(&self) -> Option<usize> {
2205        self.0.static_explicit_captures_len
2206    }
2207
2208    /// Return true if and only if this HIR is a simple literal. This is
2209    /// only true when this HIR expression is either itself a `Literal` or a
2210    /// concatenation of only `Literal`s.
2211    ///
2212    /// For example, `f` and `foo` are literals, but `f+`, `(foo)`, `foo()` and
2213    /// the empty string are not (even though they contain sub-expressions that
2214    /// are literals).
2215    #[inline]
2216    pub fn is_literal(&self) -> bool {
2217        self.0.literal
2218    }
2219
2220    /// Return true if and only if this HIR is either a simple literal or an
2221    /// alternation of simple literals. This is only
2222    /// true when this HIR expression is either itself a `Literal` or a
2223    /// concatenation of only `Literal`s or an alternation of only `Literal`s.
2224    ///
2225    /// For example, `f`, `foo`, `a|b|c`, and `foo|bar|baz` are alternation
2226    /// literals, but `f+`, `(foo)`, `foo()`, and the empty pattern are not
2227    /// (even though that contain sub-expressions that are literals).
2228    #[inline]
2229    pub fn is_alternation_literal(&self) -> bool {
2230        self.0.alternation_literal
2231    }
2232
2233    /// Returns the total amount of heap memory usage, in bytes, used by this
2234    /// `Properties` value.
2235    #[inline]
2236    pub fn memory_usage(&self) -> usize {
2237        core::mem::size_of::<PropertiesI>()
2238    }
2239
2240    /// Returns a new set of properties that corresponds to the union of the
2241    /// iterator of properties given.
2242    ///
2243    /// This is useful when one has multiple `Hir` expressions and wants
2244    /// to combine them into a single alternation without constructing the
2245    /// corresponding `Hir`. This routine provides a way of combining the
2246    /// properties of each `Hir` expression into one set of properties
2247    /// representing the union of those expressions.
2248    ///
2249    /// # Example: union with HIRs that never match
2250    ///
2251    /// This example shows that unioning properties together with one that
2252    /// represents a regex that never matches will "poison" certain attributes,
2253    /// like the minimum and maximum lengths.
2254    ///
2255    /// ```
2256    /// use regex_syntax::{hir::Properties, parse};
2257    ///
2258    /// let hir1 = parse("ab?c?")?;
2259    /// assert_eq!(Some(1), hir1.properties().minimum_len());
2260    /// assert_eq!(Some(3), hir1.properties().maximum_len());
2261    ///
2262    /// let hir2 = parse(r"[a&&b]")?;
2263    /// assert_eq!(None, hir2.properties().minimum_len());
2264    /// assert_eq!(None, hir2.properties().maximum_len());
2265    ///
2266    /// let hir3 = parse(r"wxy?z?")?;
2267    /// assert_eq!(Some(2), hir3.properties().minimum_len());
2268    /// assert_eq!(Some(4), hir3.properties().maximum_len());
2269    ///
2270    /// let unioned = Properties::union([
2271    ///		hir1.properties(),
2272    ///		hir2.properties(),
2273    ///		hir3.properties(),
2274    ///	]);
2275    /// assert_eq!(None, unioned.minimum_len());
2276    /// assert_eq!(None, unioned.maximum_len());
2277    ///
2278    /// # Ok::<(), Box<dyn std::error::Error>>(())
2279    /// ```
2280    ///
2281    /// The maximum length can also be "poisoned" by a pattern that has no
2282    /// upper bound on the length of a match. The minimum length remains
2283    /// unaffected:
2284    ///
2285    /// ```
2286    /// use regex_syntax::{hir::Properties, parse};
2287    ///
2288    /// let hir1 = parse("ab?c?")?;
2289    /// assert_eq!(Some(1), hir1.properties().minimum_len());
2290    /// assert_eq!(Some(3), hir1.properties().maximum_len());
2291    ///
2292    /// let hir2 = parse(r"a+")?;
2293    /// assert_eq!(Some(1), hir2.properties().minimum_len());
2294    /// assert_eq!(None, hir2.properties().maximum_len());
2295    ///
2296    /// let hir3 = parse(r"wxy?z?")?;
2297    /// assert_eq!(Some(2), hir3.properties().minimum_len());
2298    /// assert_eq!(Some(4), hir3.properties().maximum_len());
2299    ///
2300    /// let unioned = Properties::union([
2301    ///		hir1.properties(),
2302    ///		hir2.properties(),
2303    ///		hir3.properties(),
2304    ///	]);
2305    /// assert_eq!(Some(1), unioned.minimum_len());
2306    /// assert_eq!(None, unioned.maximum_len());
2307    ///
2308    /// # Ok::<(), Box<dyn std::error::Error>>(())
2309    /// ```
2310    pub fn union<I, P>(props: I) -> Properties
2311    where
2312        I: IntoIterator<Item = P>,
2313        P: core::borrow::Borrow<Properties>,
2314    {
2315        let mut it = props.into_iter().peekable();
2316        // While empty alternations aren't possible, we still behave as if they
2317        // are. When we have an empty alternate, then clearly the look-around
2318        // prefix and suffix is empty. Otherwise, it is the intersection of all
2319        // prefixes and suffixes (respectively) of the branches.
2320        let fix = if it.peek().is_none() {
2321            LookSet::empty()
2322        } else {
2323            LookSet::full()
2324        };
2325        // And also, an empty alternate means we have 0 static capture groups,
2326        // but we otherwise start with the number corresponding to the first
2327        // alternate. If any subsequent alternate has a different number of
2328        // static capture groups, then we overall have a variation and not a
2329        // static number of groups.
2330        let static_explicit_captures_len =
2331            it.peek().and_then(|p| p.borrow().static_explicit_captures_len());
2332        // The base case is an empty alternation, which matches nothing.
2333        // Note though that empty alternations aren't possible, because the
2334        // Hir::alternation smart constructor rewrites those as empty character
2335        // classes.
2336        let mut props = PropertiesI {
2337            minimum_len: None,
2338            maximum_len: None,
2339            look_set: LookSet::empty(),
2340            look_set_prefix: fix,
2341            look_set_suffix: fix,
2342            look_set_prefix_any: LookSet::empty(),
2343            look_set_suffix_any: LookSet::empty(),
2344            utf8: true,
2345            explicit_captures_len: 0,
2346            static_explicit_captures_len,
2347            literal: false,
2348            alternation_literal: true,
2349        };
2350        let (mut min_poisoned, mut max_poisoned) = (false, false);
2351        // Handle properties that need to visit every child hir.
2352        for prop in it {
2353            let p = prop.borrow();
2354            props.look_set.set_union(p.look_set());
2355            props.look_set_prefix.set_intersect(p.look_set_prefix());
2356            props.look_set_suffix.set_intersect(p.look_set_suffix());
2357            props.look_set_prefix_any.set_union(p.look_set_prefix_any());
2358            props.look_set_suffix_any.set_union(p.look_set_suffix_any());
2359            props.utf8 = props.utf8 && p.is_utf8();
2360            props.explicit_captures_len = props
2361                .explicit_captures_len
2362                .saturating_add(p.explicit_captures_len());
2363            if props.static_explicit_captures_len
2364                != p.static_explicit_captures_len()
2365            {
2366                props.static_explicit_captures_len = None;
2367            }
2368            props.alternation_literal =
2369                props.alternation_literal && p.is_literal();
2370            if !min_poisoned {
2371                if let Some(xmin) = p.minimum_len() {
2372                    if props.minimum_len.map_or(true, |pmin| xmin < pmin) {
2373                        props.minimum_len = Some(xmin);
2374                    }
2375                } else {
2376                    props.minimum_len = None;
2377                    min_poisoned = true;
2378                }
2379            }
2380            if !max_poisoned {
2381                if let Some(xmax) = p.maximum_len() {
2382                    if props.maximum_len.map_or(true, |pmax| xmax > pmax) {
2383                        props.maximum_len = Some(xmax);
2384                    }
2385                } else {
2386                    props.maximum_len = None;
2387                    max_poisoned = true;
2388                }
2389            }
2390        }
2391        Properties(Box::new(props))
2392    }
2393}
2394
2395impl Properties {
2396    /// Create a new set of HIR properties for an empty regex.
2397    fn empty() -> Properties {
2398        let inner = PropertiesI {
2399            minimum_len: Some(0),
2400            maximum_len: Some(0),
2401            look_set: LookSet::empty(),
2402            look_set_prefix: LookSet::empty(),
2403            look_set_suffix: LookSet::empty(),
2404            look_set_prefix_any: LookSet::empty(),
2405            look_set_suffix_any: LookSet::empty(),
2406            // It is debatable whether an empty regex always matches at valid
2407            // UTF-8 boundaries. Strictly speaking, at a byte oriented view,
2408            // it is clearly false. There are, for example, many empty strings
2409            // between the bytes encoding a '☃'.
2410            //
2411            // However, when Unicode mode is enabled, the fundamental atom
2412            // of matching is really a codepoint. And in that scenario, an
2413            // empty regex is defined to only match at valid UTF-8 boundaries
2414            // and to never split a codepoint. It just so happens that this
2415            // enforcement is somewhat tricky to do for regexes that match
2416            // the empty string inside regex engines themselves. It usually
2417            // requires some layer above the regex engine to filter out such
2418            // matches.
2419            //
2420            // In any case, 'true' is really the only coherent option. If it
2421            // were false, for example, then 'a*' would also need to be false
2422            // since it too can match the empty string.
2423            utf8: true,
2424            explicit_captures_len: 0,
2425            static_explicit_captures_len: Some(0),
2426            literal: false,
2427            alternation_literal: false,
2428        };
2429        Properties(Box::new(inner))
2430    }
2431
2432    /// Create a new set of HIR properties for a literal regex.
2433    fn literal(lit: &Literal) -> Properties {
2434        let inner = PropertiesI {
2435            minimum_len: Some(lit.0.len()),
2436            maximum_len: Some(lit.0.len()),
2437            look_set: LookSet::empty(),
2438            look_set_prefix: LookSet::empty(),
2439            look_set_suffix: LookSet::empty(),
2440            look_set_prefix_any: LookSet::empty(),
2441            look_set_suffix_any: LookSet::empty(),
2442            utf8: core::str::from_utf8(&lit.0).is_ok(),
2443            explicit_captures_len: 0,
2444            static_explicit_captures_len: Some(0),
2445            literal: true,
2446            alternation_literal: true,
2447        };
2448        Properties(Box::new(inner))
2449    }
2450
2451    /// Create a new set of HIR properties for a character class.
2452    fn class(class: &Class) -> Properties {
2453        let inner = PropertiesI {
2454            minimum_len: class.minimum_len(),
2455            maximum_len: class.maximum_len(),
2456            look_set: LookSet::empty(),
2457            look_set_prefix: LookSet::empty(),
2458            look_set_suffix: LookSet::empty(),
2459            look_set_prefix_any: LookSet::empty(),
2460            look_set_suffix_any: LookSet::empty(),
2461            utf8: class.is_utf8(),
2462            explicit_captures_len: 0,
2463            static_explicit_captures_len: Some(0),
2464            literal: false,
2465            alternation_literal: false,
2466        };
2467        Properties(Box::new(inner))
2468    }
2469
2470    /// Create a new set of HIR properties for a look-around assertion.
2471    fn look(look: Look) -> Properties {
2472        let inner = PropertiesI {
2473            minimum_len: Some(0),
2474            maximum_len: Some(0),
2475            look_set: LookSet::singleton(look),
2476            look_set_prefix: LookSet::singleton(look),
2477            look_set_suffix: LookSet::singleton(look),
2478            look_set_prefix_any: LookSet::singleton(look),
2479            look_set_suffix_any: LookSet::singleton(look),
2480            // This requires a little explanation. Basically, we don't consider
2481            // matching an empty string to be equivalent to matching invalid
2482            // UTF-8, even though technically matching every empty string will
2483            // split the UTF-8 encoding of a single codepoint when treating a
2484            // UTF-8 encoded string as a sequence of bytes. Our defense here is
2485            // that in such a case, a codepoint should logically be treated as
2486            // the fundamental atom for matching, and thus the only valid match
2487            // points are between codepoints and not bytes.
2488            //
2489            // More practically, this is true here because it's also true
2490            // for 'Hir::empty()', otherwise something like 'a*' would be
2491            // considered to match invalid UTF-8. That in turn makes this
2492            // property borderline useless.
2493            utf8: true,
2494            explicit_captures_len: 0,
2495            static_explicit_captures_len: Some(0),
2496            literal: false,
2497            alternation_literal: false,
2498        };
2499        Properties(Box::new(inner))
2500    }
2501
2502    /// Create a new set of HIR properties for a repetition.
2503    fn repetition(rep: &Repetition) -> Properties {
2504        let p = rep.sub.properties();
2505        let minimum_len = p.minimum_len().map(|child_min| {
2506            let rep_min = usize::try_from(rep.min).unwrap_or(usize::MAX);
2507            child_min.saturating_mul(rep_min)
2508        });
2509        let maximum_len = rep.max.and_then(|rep_max| {
2510            let rep_max = usize::try_from(rep_max).ok()?;
2511            let child_max = p.maximum_len()?;
2512            child_max.checked_mul(rep_max)
2513        });
2514
2515        let mut inner = PropertiesI {
2516            minimum_len,
2517            maximum_len,
2518            look_set: p.look_set(),
2519            look_set_prefix: LookSet::empty(),
2520            look_set_suffix: LookSet::empty(),
2521            look_set_prefix_any: p.look_set_prefix_any(),
2522            look_set_suffix_any: p.look_set_suffix_any(),
2523            utf8: p.is_utf8(),
2524            explicit_captures_len: p.explicit_captures_len(),
2525            static_explicit_captures_len: p.static_explicit_captures_len(),
2526            literal: false,
2527            alternation_literal: false,
2528        };
2529        // If the repetition operator can match the empty string, then its
2530        // lookset prefix and suffixes themselves remain empty since they are
2531        // no longer required to match.
2532        if rep.min > 0 {
2533            inner.look_set_prefix = p.look_set_prefix();
2534            inner.look_set_suffix = p.look_set_suffix();
2535        }
2536        // If the static captures len of the sub-expression is not known or
2537        // is greater than zero, then it automatically propagates to the
2538        // repetition, regardless of the repetition. Otherwise, it might
2539        // change, but only when the repetition can match 0 times.
2540        if rep.min == 0
2541            && inner.static_explicit_captures_len.map_or(false, |len| len > 0)
2542        {
2543            // If we require a match 0 times, then our captures len is
2544            // guaranteed to be zero. Otherwise, if we *can* match the empty
2545            // string, then it's impossible to know how many captures will be
2546            // in the resulting match.
2547            if rep.max == Some(0) {
2548                inner.static_explicit_captures_len = Some(0);
2549            } else {
2550                inner.static_explicit_captures_len = None;
2551            }
2552        }
2553        Properties(Box::new(inner))
2554    }
2555
2556    /// Create a new set of HIR properties for a capture.
2557    fn capture(capture: &Capture) -> Properties {
2558        let p = capture.sub.properties();
2559        Properties(Box::new(PropertiesI {
2560            explicit_captures_len: p.explicit_captures_len().saturating_add(1),
2561            static_explicit_captures_len: p
2562                .static_explicit_captures_len()
2563                .map(|len| len.saturating_add(1)),
2564            literal: false,
2565            alternation_literal: false,
2566            ..*p.0.clone()
2567        }))
2568    }
2569
2570    /// Create a new set of HIR properties for a concatenation.
2571    fn concat(concat: &[Hir]) -> Properties {
2572        // The base case is an empty concatenation, which matches the empty
2573        // string. Note though that empty concatenations aren't possible,
2574        // because the Hir::concat smart constructor rewrites those as
2575        // Hir::empty.
2576        let mut props = PropertiesI {
2577            minimum_len: Some(0),
2578            maximum_len: Some(0),
2579            look_set: LookSet::empty(),
2580            look_set_prefix: LookSet::empty(),
2581            look_set_suffix: LookSet::empty(),
2582            look_set_prefix_any: LookSet::empty(),
2583            look_set_suffix_any: LookSet::empty(),
2584            utf8: true,
2585            explicit_captures_len: 0,
2586            static_explicit_captures_len: Some(0),
2587            literal: true,
2588            alternation_literal: true,
2589        };
2590        // Handle properties that need to visit every child hir.
2591        for x in concat.iter() {
2592            let p = x.properties();
2593            props.look_set.set_union(p.look_set());
2594            props.utf8 = props.utf8 && p.is_utf8();
2595            props.explicit_captures_len = props
2596                .explicit_captures_len
2597                .saturating_add(p.explicit_captures_len());
2598            props.static_explicit_captures_len = p
2599                .static_explicit_captures_len()
2600                .and_then(|len1| {
2601                    Some((len1, props.static_explicit_captures_len?))
2602                })
2603                .and_then(|(len1, len2)| Some(len1.saturating_add(len2)));
2604            props.literal = props.literal && p.is_literal();
2605            props.alternation_literal =
2606                props.alternation_literal && p.is_alternation_literal();
2607            if let Some(minimum_len) = props.minimum_len {
2608                match p.minimum_len() {
2609                    None => props.minimum_len = None,
2610                    Some(len) => {
2611                        // We use saturating arithmetic here because the
2612                        // minimum is just a lower bound. We can't go any
2613                        // higher than what our number types permit.
2614                        props.minimum_len =
2615                            Some(minimum_len.saturating_add(len));
2616                    }
2617                }
2618            }
2619            if let Some(maximum_len) = props.maximum_len {
2620                match p.maximum_len() {
2621                    None => props.maximum_len = None,
2622                    Some(len) => {
2623                        props.maximum_len = maximum_len.checked_add(len)
2624                    }
2625                }
2626            }
2627        }
2628        // Handle the prefix properties, which only requires visiting
2629        // child exprs until one matches more than the empty string.
2630        let mut it = concat.iter();
2631        while let Some(x) = it.next() {
2632            props.look_set_prefix.set_union(x.properties().look_set_prefix());
2633            props
2634                .look_set_prefix_any
2635                .set_union(x.properties().look_set_prefix_any());
2636            if x.properties().maximum_len().map_or(true, |x| x > 0) {
2637                break;
2638            }
2639        }
2640        // Same thing for the suffix properties, but in reverse.
2641        let mut it = concat.iter().rev();
2642        while let Some(x) = it.next() {
2643            props.look_set_suffix.set_union(x.properties().look_set_suffix());
2644            props
2645                .look_set_suffix_any
2646                .set_union(x.properties().look_set_suffix_any());
2647            if x.properties().maximum_len().map_or(true, |x| x > 0) {
2648                break;
2649            }
2650        }
2651        Properties(Box::new(props))
2652    }
2653
2654    /// Create a new set of HIR properties for a concatenation.
2655    fn alternation(alts: &[Hir]) -> Properties {
2656        Properties::union(alts.iter().map(|hir| hir.properties()))
2657    }
2658}
2659
2660/// A set of look-around assertions.
2661///
2662/// This is useful for efficiently tracking look-around assertions. For
2663/// example, an [`Hir`] provides properties that return `LookSet`s.
2664#[derive(Clone, Copy, Default, Eq, PartialEq)]
2665pub struct LookSet {
2666    /// The underlying representation this set is exposed to make it possible
2667    /// to store it somewhere efficiently. The representation is that
2668    /// of a bitset, where each assertion occupies bit `i` where `i =
2669    /// Look::as_repr()`.
2670    ///
2671    /// Note that users of this internal representation must permit the full
2672    /// range of `u16` values to be represented. For example, even if the
2673    /// current implementation only makes use of the 10 least significant bits,
2674    /// it may use more bits in a future semver compatible release.
2675    pub bits: u32,
2676}
2677
2678impl LookSet {
2679    /// Create an empty set of look-around assertions.
2680    #[inline]
2681    pub fn empty() -> LookSet {
2682        LookSet { bits: 0 }
2683    }
2684
2685    /// Create a full set of look-around assertions.
2686    ///
2687    /// This set contains all possible look-around assertions.
2688    #[inline]
2689    pub fn full() -> LookSet {
2690        LookSet { bits: !0 }
2691    }
2692
2693    /// Create a look-around set containing the look-around assertion given.
2694    ///
2695    /// This is a convenience routine for creating an empty set and inserting
2696    /// one look-around assertions.
2697    #[inline]
2698    pub fn singleton(look: Look) -> LookSet {
2699        LookSet::empty().insert(look)
2700    }
2701
2702    /// Returns the total number of look-around assertions in this set.
2703    #[inline]
2704    pub fn len(self) -> usize {
2705        // OK because max value always fits in a u8, which in turn always
2706        // fits in a usize, regardless of target.
2707        usize::try_from(self.bits.count_ones()).unwrap()
2708    }
2709
2710    /// Returns true if and only if this set is empty.
2711    #[inline]
2712    pub fn is_empty(self) -> bool {
2713        self.len() == 0
2714    }
2715
2716    /// Returns true if and only if the given look-around assertion is in this
2717    /// set.
2718    #[inline]
2719    pub fn contains(self, look: Look) -> bool {
2720        self.bits & look.as_repr() != 0
2721    }
2722
2723    /// Returns true if and only if this set contains any anchor assertions.
2724    /// This includes both "start/end of haystack" and "start/end of line."
2725    #[inline]
2726    pub fn contains_anchor(&self) -> bool {
2727        self.contains_anchor_haystack() || self.contains_anchor_line()
2728    }
2729
2730    /// Returns true if and only if this set contains any "start/end of
2731    /// haystack" anchors. This doesn't include "start/end of line" anchors.
2732    #[inline]
2733    pub fn contains_anchor_haystack(&self) -> bool {
2734        self.contains(Look::Start) || self.contains(Look::End)
2735    }
2736
2737    /// Returns true if and only if this set contains any "start/end of line"
2738    /// anchors. This doesn't include "start/end of haystack" anchors. This
2739    /// includes both `\n` line anchors and CRLF (`\r\n`) aware line anchors.
2740    #[inline]
2741    pub fn contains_anchor_line(&self) -> bool {
2742        self.contains(Look::StartLF)
2743            || self.contains(Look::EndLF)
2744            || self.contains(Look::StartCRLF)
2745            || self.contains(Look::EndCRLF)
2746    }
2747
2748    /// Returns true if and only if this set contains any "start/end of line"
2749    /// anchors that only treat `\n` as line terminators. This does not include
2750    /// haystack anchors or CRLF aware line anchors.
2751    #[inline]
2752    pub fn contains_anchor_lf(&self) -> bool {
2753        self.contains(Look::StartLF) || self.contains(Look::EndLF)
2754    }
2755
2756    /// Returns true if and only if this set contains any "start/end of line"
2757    /// anchors that are CRLF-aware. This doesn't include "start/end of
2758    /// haystack" or "start/end of line-feed" anchors.
2759    #[inline]
2760    pub fn contains_anchor_crlf(&self) -> bool {
2761        self.contains(Look::StartCRLF) || self.contains(Look::EndCRLF)
2762    }
2763
2764    /// Returns true if and only if this set contains any word boundary or
2765    /// negated word boundary assertions. This include both Unicode and ASCII
2766    /// word boundaries.
2767    #[inline]
2768    pub fn contains_word(self) -> bool {
2769        self.contains_word_unicode() || self.contains_word_ascii()
2770    }
2771
2772    /// Returns true if and only if this set contains any Unicode word boundary
2773    /// or negated Unicode word boundary assertions.
2774    #[inline]
2775    pub fn contains_word_unicode(self) -> bool {
2776        self.contains(Look::WordUnicode)
2777            || self.contains(Look::WordUnicodeNegate)
2778            || self.contains(Look::WordStartUnicode)
2779            || self.contains(Look::WordEndUnicode)
2780            || self.contains(Look::WordStartHalfUnicode)
2781            || self.contains(Look::WordEndHalfUnicode)
2782    }
2783
2784    /// Returns true if and only if this set contains any ASCII word boundary
2785    /// or negated ASCII word boundary assertions.
2786    #[inline]
2787    pub fn contains_word_ascii(self) -> bool {
2788        self.contains(Look::WordAscii)
2789            || self.contains(Look::WordAsciiNegate)
2790            || self.contains(Look::WordStartAscii)
2791            || self.contains(Look::WordEndAscii)
2792            || self.contains(Look::WordStartHalfAscii)
2793            || self.contains(Look::WordEndHalfAscii)
2794    }
2795
2796    /// Returns an iterator over all of the look-around assertions in this set.
2797    #[inline]
2798    pub fn iter(self) -> LookSetIter {
2799        LookSetIter { set: self }
2800    }
2801
2802    /// Return a new set that is equivalent to the original, but with the given
2803    /// assertion added to it. If the assertion is already in the set, then the
2804    /// returned set is equivalent to the original.
2805    #[inline]
2806    pub fn insert(self, look: Look) -> LookSet {
2807        LookSet { bits: self.bits | look.as_repr() }
2808    }
2809
2810    /// Updates this set in place with the result of inserting the given
2811    /// assertion into this set.
2812    #[inline]
2813    pub fn set_insert(&mut self, look: Look) {
2814        *self = self.insert(look);
2815    }
2816
2817    /// Return a new set that is equivalent to the original, but with the given
2818    /// assertion removed from it. If the assertion is not in the set, then the
2819    /// returned set is equivalent to the original.
2820    #[inline]
2821    pub fn remove(self, look: Look) -> LookSet {
2822        LookSet { bits: self.bits & !look.as_repr() }
2823    }
2824
2825    /// Updates this set in place with the result of removing the given
2826    /// assertion from this set.
2827    #[inline]
2828    pub fn set_remove(&mut self, look: Look) {
2829        *self = self.remove(look);
2830    }
2831
2832    /// Returns a new set that is the result of subtracting the given set from
2833    /// this set.
2834    #[inline]
2835    pub fn subtract(self, other: LookSet) -> LookSet {
2836        LookSet { bits: self.bits & !other.bits }
2837    }
2838
2839    /// Updates this set in place with the result of subtracting the given set
2840    /// from this set.
2841    #[inline]
2842    pub fn set_subtract(&mut self, other: LookSet) {
2843        *self = self.subtract(other);
2844    }
2845
2846    /// Returns a new set that is the union of this and the one given.
2847    #[inline]
2848    pub fn union(self, other: LookSet) -> LookSet {
2849        LookSet { bits: self.bits | other.bits }
2850    }
2851
2852    /// Updates this set in place with the result of unioning it with the one
2853    /// given.
2854    #[inline]
2855    pub fn set_union(&mut self, other: LookSet) {
2856        *self = self.union(other);
2857    }
2858
2859    /// Returns a new set that is the intersection of this and the one given.
2860    #[inline]
2861    pub fn intersect(self, other: LookSet) -> LookSet {
2862        LookSet { bits: self.bits & other.bits }
2863    }
2864
2865    /// Updates this set in place with the result of intersecting it with the
2866    /// one given.
2867    #[inline]
2868    pub fn set_intersect(&mut self, other: LookSet) {
2869        *self = self.intersect(other);
2870    }
2871
2872    /// Return a `LookSet` from the slice given as a native endian 32-bit
2873    /// integer.
2874    ///
2875    /// # Panics
2876    ///
2877    /// This panics if `slice.len() < 4`.
2878    #[inline]
2879    pub fn read_repr(slice: &[u8]) -> LookSet {
2880        let bits = u32::from_ne_bytes(slice[..4].try_into().unwrap());
2881        LookSet { bits }
2882    }
2883
2884    /// Write a `LookSet` as a native endian 32-bit integer to the beginning
2885    /// of the slice given.
2886    ///
2887    /// # Panics
2888    ///
2889    /// This panics if `slice.len() < 4`.
2890    #[inline]
2891    pub fn write_repr(self, slice: &mut [u8]) {
2892        let raw = self.bits.to_ne_bytes();
2893        slice[0] = raw[0];
2894        slice[1] = raw[1];
2895        slice[2] = raw[2];
2896        slice[3] = raw[3];
2897    }
2898}
2899
2900impl core::fmt::Debug for LookSet {
2901    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
2902        if self.is_empty() {
2903            return write!(f, "∅");
2904        }
2905        for look in self.iter() {
2906            write!(f, "{}", look.as_char())?;
2907        }
2908        Ok(())
2909    }
2910}
2911
2912/// An iterator over all look-around assertions in a [`LookSet`].
2913///
2914/// This iterator is created by [`LookSet::iter`].
2915#[derive(Clone, Debug)]
2916pub struct LookSetIter {
2917    set: LookSet,
2918}
2919
2920impl Iterator for LookSetIter {
2921    type Item = Look;
2922
2923    #[inline]
2924    fn next(&mut self) -> Option<Look> {
2925        if self.set.is_empty() {
2926            return None;
2927        }
2928        // We'll never have more than u8::MAX distinct look-around assertions,
2929        // so 'bit' will always fit into a u16.
2930        let bit = u16::try_from(self.set.bits.trailing_zeros()).unwrap();
2931        let look = Look::from_repr(1 << bit)?;
2932        self.set = self.set.remove(look);
2933        Some(look)
2934    }
2935}
2936
2937/// Given a sequence of HIR values where each value corresponds to a Unicode
2938/// class (or an all-ASCII byte class), return a single Unicode class
2939/// corresponding to the union of the classes found.
2940fn class_chars(hirs: &[Hir]) -> Option<Class> {
2941    let mut cls = ClassUnicode::new(vec![]);
2942    for hir in hirs.iter() {
2943        match *hir.kind() {
2944            HirKind::Class(Class::Unicode(ref cls2)) => {
2945                cls.union(cls2);
2946            }
2947            HirKind::Class(Class::Bytes(ref cls2)) => {
2948                cls.union(&cls2.to_unicode_class()?);
2949            }
2950            _ => return None,
2951        };
2952    }
2953    Some(Class::Unicode(cls))
2954}
2955
2956/// Given a sequence of HIR values where each value corresponds to a byte class
2957/// (or an all-ASCII Unicode class), return a single byte class corresponding
2958/// to the union of the classes found.
2959fn class_bytes(hirs: &[Hir]) -> Option<Class> {
2960    let mut cls = ClassBytes::new(vec![]);
2961    for hir in hirs.iter() {
2962        match *hir.kind() {
2963            HirKind::Class(Class::Unicode(ref cls2)) => {
2964                cls.union(&cls2.to_byte_class()?);
2965            }
2966            HirKind::Class(Class::Bytes(ref cls2)) => {
2967                cls.union(cls2);
2968            }
2969            _ => return None,
2970        };
2971    }
2972    Some(Class::Bytes(cls))
2973}
2974
2975/// Given a sequence of HIR values where each value corresponds to a literal
2976/// that is a single `char`, return that sequence of `char`s. Otherwise return
2977/// None. No deduplication is done.
2978fn singleton_chars(hirs: &[Hir]) -> Option<Vec<char>> {
2979    let mut singletons = vec![];
2980    for hir in hirs.iter() {
2981        let literal = match *hir.kind() {
2982            HirKind::Literal(Literal(ref bytes)) => bytes,
2983            _ => return None,
2984        };
2985        let ch = match crate::debug::utf8_decode(literal) {
2986            None => return None,
2987            Some(Err(_)) => return None,
2988            Some(Ok(ch)) => ch,
2989        };
2990        if literal.len() != ch.len_utf8() {
2991            return None;
2992        }
2993        singletons.push(ch);
2994    }
2995    Some(singletons)
2996}
2997
2998/// Given a sequence of HIR values where each value corresponds to a literal
2999/// that is a single byte, return that sequence of bytes. Otherwise return
3000/// None. No deduplication is done.
3001fn singleton_bytes(hirs: &[Hir]) -> Option<Vec<u8>> {
3002    let mut singletons = vec![];
3003    for hir in hirs.iter() {
3004        let literal = match *hir.kind() {
3005            HirKind::Literal(Literal(ref bytes)) => bytes,
3006            _ => return None,
3007        };
3008        if literal.len() != 1 {
3009            return None;
3010        }
3011        singletons.push(literal[0]);
3012    }
3013    Some(singletons)
3014}
3015
3016/// Looks for a common prefix in the list of alternation branches given. If one
3017/// is found, then an equivalent but (hopefully) simplified Hir is returned.
3018/// Otherwise, the original given list of branches is returned unmodified.
3019///
3020/// This is not quite as good as it could be. Right now, it requires that
3021/// all branches are 'Concat' expressions. It also doesn't do well with
3022/// literals. For example, given 'foofoo|foobar', it will not refactor it to
3023/// 'foo(?:foo|bar)' because literals are flattened into their own special
3024/// concatenation. (One wonders if perhaps 'Literal' should be a single atom
3025/// instead of a string of bytes because of this. Otherwise, handling the
3026/// current representation in this routine will be pretty gnarly. Sigh.)
3027fn lift_common_prefix(hirs: Vec<Hir>) -> Result<Hir, Vec<Hir>> {
3028    if hirs.len() <= 1 {
3029        return Err(hirs);
3030    }
3031    let mut prefix = match hirs[0].kind() {
3032        HirKind::Concat(ref xs) => &**xs,
3033        _ => return Err(hirs),
3034    };
3035    if prefix.is_empty() {
3036        return Err(hirs);
3037    }
3038    for h in hirs.iter().skip(1) {
3039        let concat = match h.kind() {
3040            HirKind::Concat(ref xs) => xs,
3041            _ => return Err(hirs),
3042        };
3043        let common_len = prefix
3044            .iter()
3045            .zip(concat.iter())
3046            .take_while(|(x, y)| x == y)
3047            .count();
3048        prefix = &prefix[..common_len];
3049        if prefix.is_empty() {
3050            return Err(hirs);
3051        }
3052    }
3053    let len = prefix.len();
3054    assert_ne!(0, len);
3055    let mut prefix_concat = vec![];
3056    let mut suffix_alts = vec![];
3057    for h in hirs {
3058        let mut concat = match h.into_kind() {
3059            HirKind::Concat(xs) => xs,
3060            // We required all sub-expressions to be
3061            // concats above, so we're only here if we
3062            // have a concat.
3063            _ => unreachable!(),
3064        };
3065        suffix_alts.push(Hir::concat(concat.split_off(len)));
3066        if prefix_concat.is_empty() {
3067            prefix_concat = concat;
3068        }
3069    }
3070    let mut concat = prefix_concat;
3071    concat.push(Hir::alternation(suffix_alts));
3072    Ok(Hir::concat(concat))
3073}
3074
3075#[cfg(test)]
3076mod tests {
3077    use super::*;
3078
3079    fn uclass(ranges: &[(char, char)]) -> ClassUnicode {
3080        let ranges: Vec<ClassUnicodeRange> = ranges
3081            .iter()
3082            .map(|&(s, e)| ClassUnicodeRange::new(s, e))
3083            .collect();
3084        ClassUnicode::new(ranges)
3085    }
3086
3087    fn bclass(ranges: &[(u8, u8)]) -> ClassBytes {
3088        let ranges: Vec<ClassBytesRange> =
3089            ranges.iter().map(|&(s, e)| ClassBytesRange::new(s, e)).collect();
3090        ClassBytes::new(ranges)
3091    }
3092
3093    fn uranges(cls: &ClassUnicode) -> Vec<(char, char)> {
3094        cls.iter().map(|x| (x.start(), x.end())).collect()
3095    }
3096
3097    #[cfg(feature = "unicode-case")]
3098    fn ucasefold(cls: &ClassUnicode) -> ClassUnicode {
3099        let mut cls_ = cls.clone();
3100        cls_.case_fold_simple();
3101        cls_
3102    }
3103
3104    fn uunion(cls1: &ClassUnicode, cls2: &ClassUnicode) -> ClassUnicode {
3105        let mut cls_ = cls1.clone();
3106        cls_.union(cls2);
3107        cls_
3108    }
3109
3110    fn uintersect(cls1: &ClassUnicode, cls2: &ClassUnicode) -> ClassUnicode {
3111        let mut cls_ = cls1.clone();
3112        cls_.intersect(cls2);
3113        cls_
3114    }
3115
3116    fn udifference(cls1: &ClassUnicode, cls2: &ClassUnicode) -> ClassUnicode {
3117        let mut cls_ = cls1.clone();
3118        cls_.difference(cls2);
3119        cls_
3120    }
3121
3122    fn usymdifference(
3123        cls1: &ClassUnicode,
3124        cls2: &ClassUnicode,
3125    ) -> ClassUnicode {
3126        let mut cls_ = cls1.clone();
3127        cls_.symmetric_difference(cls2);
3128        cls_
3129    }
3130
3131    fn unegate(cls: &ClassUnicode) -> ClassUnicode {
3132        let mut cls_ = cls.clone();
3133        cls_.negate();
3134        cls_
3135    }
3136
3137    fn branges(cls: &ClassBytes) -> Vec<(u8, u8)> {
3138        cls.iter().map(|x| (x.start(), x.end())).collect()
3139    }
3140
3141    fn bcasefold(cls: &ClassBytes) -> ClassBytes {
3142        let mut cls_ = cls.clone();
3143        cls_.case_fold_simple();
3144        cls_
3145    }
3146
3147    fn bunion(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes {
3148        let mut cls_ = cls1.clone();
3149        cls_.union(cls2);
3150        cls_
3151    }
3152
3153    fn bintersect(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes {
3154        let mut cls_ = cls1.clone();
3155        cls_.intersect(cls2);
3156        cls_
3157    }
3158
3159    fn bdifference(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes {
3160        let mut cls_ = cls1.clone();
3161        cls_.difference(cls2);
3162        cls_
3163    }
3164
3165    fn bsymdifference(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes {
3166        let mut cls_ = cls1.clone();
3167        cls_.symmetric_difference(cls2);
3168        cls_
3169    }
3170
3171    fn bnegate(cls: &ClassBytes) -> ClassBytes {
3172        let mut cls_ = cls.clone();
3173        cls_.negate();
3174        cls_
3175    }
3176
3177    #[test]
3178    fn class_range_canonical_unicode() {
3179        let range = ClassUnicodeRange::new('\u{00FF}', '\0');
3180        assert_eq!('\0', range.start());
3181        assert_eq!('\u{00FF}', range.end());
3182    }
3183
3184    #[test]
3185    fn class_range_canonical_bytes() {
3186        let range = ClassBytesRange::new(b'\xFF', b'\0');
3187        assert_eq!(b'\0', range.start());
3188        assert_eq!(b'\xFF', range.end());
3189    }
3190
3191    #[test]
3192    fn class_canonicalize_unicode() {
3193        let cls = uclass(&[('a', 'c'), ('x', 'z')]);
3194        let expected = vec![('a', 'c'), ('x', 'z')];
3195        assert_eq!(expected, uranges(&cls));
3196
3197        let cls = uclass(&[('x', 'z'), ('a', 'c')]);
3198        let expected = vec![('a', 'c'), ('x', 'z')];
3199        assert_eq!(expected, uranges(&cls));
3200
3201        let cls = uclass(&[('x', 'z'), ('w', 'y')]);
3202        let expected = vec![('w', 'z')];
3203        assert_eq!(expected, uranges(&cls));
3204
3205        let cls = uclass(&[
3206            ('c', 'f'),
3207            ('a', 'g'),
3208            ('d', 'j'),
3209            ('a', 'c'),
3210            ('m', 'p'),
3211            ('l', 's'),
3212        ]);
3213        let expected = vec![('a', 'j'), ('l', 's')];
3214        assert_eq!(expected, uranges(&cls));
3215
3216        let cls = uclass(&[('x', 'z'), ('u', 'w')]);
3217        let expected = vec![('u', 'z')];
3218        assert_eq!(expected, uranges(&cls));
3219
3220        let cls = uclass(&[('\x00', '\u{10FFFF}'), ('\x00', '\u{10FFFF}')]);
3221        let expected = vec![('\x00', '\u{10FFFF}')];
3222        assert_eq!(expected, uranges(&cls));
3223
3224        let cls = uclass(&[('a', 'a'), ('b', 'b')]);
3225        let expected = vec![('a', 'b')];
3226        assert_eq!(expected, uranges(&cls));
3227    }
3228
3229    #[test]
3230    fn class_canonicalize_bytes() {
3231        let cls = bclass(&[(b'a', b'c'), (b'x', b'z')]);
3232        let expected = vec![(b'a', b'c'), (b'x', b'z')];
3233        assert_eq!(expected, branges(&cls));
3234
3235        let cls = bclass(&[(b'x', b'z'), (b'a', b'c')]);
3236        let expected = vec![(b'a', b'c'), (b'x', b'z')];
3237        assert_eq!(expected, branges(&cls));
3238
3239        let cls = bclass(&[(b'x', b'z'), (b'w', b'y')]);
3240        let expected = vec![(b'w', b'z')];
3241        assert_eq!(expected, branges(&cls));
3242
3243        let cls = bclass(&[
3244            (b'c', b'f'),
3245            (b'a', b'g'),
3246            (b'd', b'j'),
3247            (b'a', b'c'),
3248            (b'm', b'p'),
3249            (b'l', b's'),
3250        ]);
3251        let expected = vec![(b'a', b'j'), (b'l', b's')];
3252        assert_eq!(expected, branges(&cls));
3253
3254        let cls = bclass(&[(b'x', b'z'), (b'u', b'w')]);
3255        let expected = vec![(b'u', b'z')];
3256        assert_eq!(expected, branges(&cls));
3257
3258        let cls = bclass(&[(b'\x00', b'\xFF'), (b'\x00', b'\xFF')]);
3259        let expected = vec![(b'\x00', b'\xFF')];
3260        assert_eq!(expected, branges(&cls));
3261
3262        let cls = bclass(&[(b'a', b'a'), (b'b', b'b')]);
3263        let expected = vec![(b'a', b'b')];
3264        assert_eq!(expected, branges(&cls));
3265    }
3266
3267    #[test]
3268    #[cfg(feature = "unicode-case")]
3269    fn class_case_fold_unicode() {
3270        let cls = uclass(&[
3271            ('C', 'F'),
3272            ('A', 'G'),
3273            ('D', 'J'),
3274            ('A', 'C'),
3275            ('M', 'P'),
3276            ('L', 'S'),
3277            ('c', 'f'),
3278        ]);
3279        let expected = uclass(&[
3280            ('A', 'J'),
3281            ('L', 'S'),
3282            ('a', 'j'),
3283            ('l', 's'),
3284            ('\u{17F}', '\u{17F}'),
3285        ]);
3286        assert_eq!(expected, ucasefold(&cls));
3287
3288        let cls = uclass(&[('A', 'Z')]);
3289        let expected = uclass(&[
3290            ('A', 'Z'),
3291            ('a', 'z'),
3292            ('\u{17F}', '\u{17F}'),
3293            ('\u{212A}', '\u{212A}'),
3294        ]);
3295        assert_eq!(expected, ucasefold(&cls));
3296
3297        let cls = uclass(&[('a', 'z')]);
3298        let expected = uclass(&[
3299            ('A', 'Z'),
3300            ('a', 'z'),
3301            ('\u{17F}', '\u{17F}'),
3302            ('\u{212A}', '\u{212A}'),
3303        ]);
3304        assert_eq!(expected, ucasefold(&cls));
3305
3306        let cls = uclass(&[('A', 'A'), ('_', '_')]);
3307        let expected = uclass(&[('A', 'A'), ('_', '_'), ('a', 'a')]);
3308        assert_eq!(expected, ucasefold(&cls));
3309
3310        let cls = uclass(&[('A', 'A'), ('=', '=')]);
3311        let expected = uclass(&[('=', '='), ('A', 'A'), ('a', 'a')]);
3312        assert_eq!(expected, ucasefold(&cls));
3313
3314        let cls = uclass(&[('\x00', '\x10')]);
3315        assert_eq!(cls, ucasefold(&cls));
3316
3317        let cls = uclass(&[('k', 'k')]);
3318        let expected =
3319            uclass(&[('K', 'K'), ('k', 'k'), ('\u{212A}', '\u{212A}')]);
3320        assert_eq!(expected, ucasefold(&cls));
3321
3322        let cls = uclass(&[('@', '@')]);
3323        assert_eq!(cls, ucasefold(&cls));
3324    }
3325
3326    #[test]
3327    #[cfg(not(feature = "unicode-case"))]
3328    fn class_case_fold_unicode_disabled() {
3329        let mut cls = uclass(&[
3330            ('C', 'F'),
3331            ('A', 'G'),
3332            ('D', 'J'),
3333            ('A', 'C'),
3334            ('M', 'P'),
3335            ('L', 'S'),
3336            ('c', 'f'),
3337        ]);
3338        assert!(cls.try_case_fold_simple().is_err());
3339    }
3340
3341    #[test]
3342    #[should_panic]
3343    #[cfg(not(feature = "unicode-case"))]
3344    fn class_case_fold_unicode_disabled_panics() {
3345        let mut cls = uclass(&[
3346            ('C', 'F'),
3347            ('A', 'G'),
3348            ('D', 'J'),
3349            ('A', 'C'),
3350            ('M', 'P'),
3351            ('L', 'S'),
3352            ('c', 'f'),
3353        ]);
3354        cls.case_fold_simple();
3355    }
3356
3357    #[test]
3358    fn class_case_fold_bytes() {
3359        let cls = bclass(&[
3360            (b'C', b'F'),
3361            (b'A', b'G'),
3362            (b'D', b'J'),
3363            (b'A', b'C'),
3364            (b'M', b'P'),
3365            (b'L', b'S'),
3366            (b'c', b'f'),
3367        ]);
3368        let expected =
3369            bclass(&[(b'A', b'J'), (b'L', b'S'), (b'a', b'j'), (b'l', b's')]);
3370        assert_eq!(expected, bcasefold(&cls));
3371
3372        let cls = bclass(&[(b'A', b'Z')]);
3373        let expected = bclass(&[(b'A', b'Z'), (b'a', b'z')]);
3374        assert_eq!(expected, bcasefold(&cls));
3375
3376        let cls = bclass(&[(b'a', b'z')]);
3377        let expected = bclass(&[(b'A', b'Z'), (b'a', b'z')]);
3378        assert_eq!(expected, bcasefold(&cls));
3379
3380        let cls = bclass(&[(b'A', b'A'), (b'_', b'_')]);
3381        let expected = bclass(&[(b'A', b'A'), (b'_', b'_'), (b'a', b'a')]);
3382        assert_eq!(expected, bcasefold(&cls));
3383
3384        let cls = bclass(&[(b'A', b'A'), (b'=', b'=')]);
3385        let expected = bclass(&[(b'=', b'='), (b'A', b'A'), (b'a', b'a')]);
3386        assert_eq!(expected, bcasefold(&cls));
3387
3388        let cls = bclass(&[(b'\x00', b'\x10')]);
3389        assert_eq!(cls, bcasefold(&cls));
3390
3391        let cls = bclass(&[(b'k', b'k')]);
3392        let expected = bclass(&[(b'K', b'K'), (b'k', b'k')]);
3393        assert_eq!(expected, bcasefold(&cls));
3394
3395        let cls = bclass(&[(b'@', b'@')]);
3396        assert_eq!(cls, bcasefold(&cls));
3397    }
3398
3399    #[test]
3400    fn class_negate_unicode() {
3401        let cls = uclass(&[('a', 'a')]);
3402        let expected = uclass(&[('\x00', '\x60'), ('\x62', '\u{10FFFF}')]);
3403        assert_eq!(expected, unegate(&cls));
3404
3405        let cls = uclass(&[('a', 'a'), ('b', 'b')]);
3406        let expected = uclass(&[('\x00', '\x60'), ('\x63', '\u{10FFFF}')]);
3407        assert_eq!(expected, unegate(&cls));
3408
3409        let cls = uclass(&[('a', 'c'), ('x', 'z')]);
3410        let expected = uclass(&[
3411            ('\x00', '\x60'),
3412            ('\x64', '\x77'),
3413            ('\x7B', '\u{10FFFF}'),
3414        ]);
3415        assert_eq!(expected, unegate(&cls));
3416
3417        let cls = uclass(&[('\x00', 'a')]);
3418        let expected = uclass(&[('\x62', '\u{10FFFF}')]);
3419        assert_eq!(expected, unegate(&cls));
3420
3421        let cls = uclass(&[('a', '\u{10FFFF}')]);
3422        let expected = uclass(&[('\x00', '\x60')]);
3423        assert_eq!(expected, unegate(&cls));
3424
3425        let cls = uclass(&[('\x00', '\u{10FFFF}')]);
3426        let expected = uclass(&[]);
3427        assert_eq!(expected, unegate(&cls));
3428
3429        let cls = uclass(&[]);
3430        let expected = uclass(&[('\x00', '\u{10FFFF}')]);
3431        assert_eq!(expected, unegate(&cls));
3432
3433        let cls =
3434            uclass(&[('\x00', '\u{10FFFD}'), ('\u{10FFFF}', '\u{10FFFF}')]);
3435        let expected = uclass(&[('\u{10FFFE}', '\u{10FFFE}')]);
3436        assert_eq!(expected, unegate(&cls));
3437
3438        let cls = uclass(&[('\x00', '\u{D7FF}')]);
3439        let expected = uclass(&[('\u{E000}', '\u{10FFFF}')]);
3440        assert_eq!(expected, unegate(&cls));
3441
3442        let cls = uclass(&[('\x00', '\u{D7FE}')]);
3443        let expected = uclass(&[('\u{D7FF}', '\u{10FFFF}')]);
3444        assert_eq!(expected, unegate(&cls));
3445
3446        let cls = uclass(&[('\u{E000}', '\u{10FFFF}')]);
3447        let expected = uclass(&[('\x00', '\u{D7FF}')]);
3448        assert_eq!(expected, unegate(&cls));
3449
3450        let cls = uclass(&[('\u{E001}', '\u{10FFFF}')]);
3451        let expected = uclass(&[('\x00', '\u{E000}')]);
3452        assert_eq!(expected, unegate(&cls));
3453    }
3454
3455    #[test]
3456    fn class_negate_bytes() {
3457        let cls = bclass(&[(b'a', b'a')]);
3458        let expected = bclass(&[(b'\x00', b'\x60'), (b'\x62', b'\xFF')]);
3459        assert_eq!(expected, bnegate(&cls));
3460
3461        let cls = bclass(&[(b'a', b'a'), (b'b', b'b')]);
3462        let expected = bclass(&[(b'\x00', b'\x60'), (b'\x63', b'\xFF')]);
3463        assert_eq!(expected, bnegate(&cls));
3464
3465        let cls = bclass(&[(b'a', b'c'), (b'x', b'z')]);
3466        let expected = bclass(&[
3467            (b'\x00', b'\x60'),
3468            (b'\x64', b'\x77'),
3469            (b'\x7B', b'\xFF'),
3470        ]);
3471        assert_eq!(expected, bnegate(&cls));
3472
3473        let cls = bclass(&[(b'\x00', b'a')]);
3474        let expected = bclass(&[(b'\x62', b'\xFF')]);
3475        assert_eq!(expected, bnegate(&cls));
3476
3477        let cls = bclass(&[(b'a', b'\xFF')]);
3478        let expected = bclass(&[(b'\x00', b'\x60')]);
3479        assert_eq!(expected, bnegate(&cls));
3480
3481        let cls = bclass(&[(b'\x00', b'\xFF')]);
3482        let expected = bclass(&[]);
3483        assert_eq!(expected, bnegate(&cls));
3484
3485        let cls = bclass(&[]);
3486        let expected = bclass(&[(b'\x00', b'\xFF')]);
3487        assert_eq!(expected, bnegate(&cls));
3488
3489        let cls = bclass(&[(b'\x00', b'\xFD'), (b'\xFF', b'\xFF')]);
3490        let expected = bclass(&[(b'\xFE', b'\xFE')]);
3491        assert_eq!(expected, bnegate(&cls));
3492    }
3493
3494    #[test]
3495    fn class_union_unicode() {
3496        let cls1 = uclass(&[('a', 'g'), ('m', 't'), ('A', 'C')]);
3497        let cls2 = uclass(&[('a', 'z')]);
3498        let expected = uclass(&[('a', 'z'), ('A', 'C')]);
3499        assert_eq!(expected, uunion(&cls1, &cls2));
3500    }
3501
3502    #[test]
3503    fn class_union_bytes() {
3504        let cls1 = bclass(&[(b'a', b'g'), (b'm', b't'), (b'A', b'C')]);
3505        let cls2 = bclass(&[(b'a', b'z')]);
3506        let expected = bclass(&[(b'a', b'z'), (b'A', b'C')]);
3507        assert_eq!(expected, bunion(&cls1, &cls2));
3508    }
3509
3510    #[test]
3511    fn class_intersect_unicode() {
3512        let cls1 = uclass(&[]);
3513        let cls2 = uclass(&[('a', 'a')]);
3514        let expected = uclass(&[]);
3515        assert_eq!(expected, uintersect(&cls1, &cls2));
3516
3517        let cls1 = uclass(&[('a', 'a')]);
3518        let cls2 = uclass(&[('a', 'a')]);
3519        let expected = uclass(&[('a', 'a')]);
3520        assert_eq!(expected, uintersect(&cls1, &cls2));
3521
3522        let cls1 = uclass(&[('a', 'a')]);
3523        let cls2 = uclass(&[('b', 'b')]);
3524        let expected = uclass(&[]);
3525        assert_eq!(expected, uintersect(&cls1, &cls2));
3526
3527        let cls1 = uclass(&[('a', 'a')]);
3528        let cls2 = uclass(&[('a', 'c')]);
3529        let expected = uclass(&[('a', 'a')]);
3530        assert_eq!(expected, uintersect(&cls1, &cls2));
3531
3532        let cls1 = uclass(&[('a', 'b')]);
3533        let cls2 = uclass(&[('a', 'c')]);
3534        let expected = uclass(&[('a', 'b')]);
3535        assert_eq!(expected, uintersect(&cls1, &cls2));
3536
3537        let cls1 = uclass(&[('a', 'b')]);
3538        let cls2 = uclass(&[('b', 'c')]);
3539        let expected = uclass(&[('b', 'b')]);
3540        assert_eq!(expected, uintersect(&cls1, &cls2));
3541
3542        let cls1 = uclass(&[('a', 'b')]);
3543        let cls2 = uclass(&[('c', 'd')]);
3544        let expected = uclass(&[]);
3545        assert_eq!(expected, uintersect(&cls1, &cls2));
3546
3547        let cls1 = uclass(&[('b', 'c')]);
3548        let cls2 = uclass(&[('a', 'd')]);
3549        let expected = uclass(&[('b', 'c')]);
3550        assert_eq!(expected, uintersect(&cls1, &cls2));
3551
3552        let cls1 = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
3553        let cls2 = uclass(&[('a', 'h')]);
3554        let expected = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
3555        assert_eq!(expected, uintersect(&cls1, &cls2));
3556
3557        let cls1 = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
3558        let cls2 = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
3559        let expected = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
3560        assert_eq!(expected, uintersect(&cls1, &cls2));
3561
3562        let cls1 = uclass(&[('a', 'b'), ('g', 'h')]);
3563        let cls2 = uclass(&[('d', 'e'), ('k', 'l')]);
3564        let expected = uclass(&[]);
3565        assert_eq!(expected, uintersect(&cls1, &cls2));
3566
3567        let cls1 = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
3568        let cls2 = uclass(&[('h', 'h')]);
3569        let expected = uclass(&[('h', 'h')]);
3570        assert_eq!(expected, uintersect(&cls1, &cls2));
3571
3572        let cls1 = uclass(&[('a', 'b'), ('e', 'f'), ('i', 'j')]);
3573        let cls2 = uclass(&[('c', 'd'), ('g', 'h'), ('k', 'l')]);
3574        let expected = uclass(&[]);
3575        assert_eq!(expected, uintersect(&cls1, &cls2));
3576
3577        let cls1 = uclass(&[('a', 'b'), ('c', 'd'), ('e', 'f')]);
3578        let cls2 = uclass(&[('b', 'c'), ('d', 'e'), ('f', 'g')]);
3579        let expected = uclass(&[('b', 'f')]);
3580        assert_eq!(expected, uintersect(&cls1, &cls2));
3581    }
3582
3583    #[test]
3584    fn class_intersect_bytes() {
3585        let cls1 = bclass(&[]);
3586        let cls2 = bclass(&[(b'a', b'a')]);
3587        let expected = bclass(&[]);
3588        assert_eq!(expected, bintersect(&cls1, &cls2));
3589
3590        let cls1 = bclass(&[(b'a', b'a')]);
3591        let cls2 = bclass(&[(b'a', b'a')]);
3592        let expected = bclass(&[(b'a', b'a')]);
3593        assert_eq!(expected, bintersect(&cls1, &cls2));
3594
3595        let cls1 = bclass(&[(b'a', b'a')]);
3596        let cls2 = bclass(&[(b'b', b'b')]);
3597        let expected = bclass(&[]);
3598        assert_eq!(expected, bintersect(&cls1, &cls2));
3599
3600        let cls1 = bclass(&[(b'a', b'a')]);
3601        let cls2 = bclass(&[(b'a', b'c')]);
3602        let expected = bclass(&[(b'a', b'a')]);
3603        assert_eq!(expected, bintersect(&cls1, &cls2));
3604
3605        let cls1 = bclass(&[(b'a', b'b')]);
3606        let cls2 = bclass(&[(b'a', b'c')]);
3607        let expected = bclass(&[(b'a', b'b')]);
3608        assert_eq!(expected, bintersect(&cls1, &cls2));
3609
3610        let cls1 = bclass(&[(b'a', b'b')]);
3611        let cls2 = bclass(&[(b'b', b'c')]);
3612        let expected = bclass(&[(b'b', b'b')]);
3613        assert_eq!(expected, bintersect(&cls1, &cls2));
3614
3615        let cls1 = bclass(&[(b'a', b'b')]);
3616        let cls2 = bclass(&[(b'c', b'd')]);
3617        let expected = bclass(&[]);
3618        assert_eq!(expected, bintersect(&cls1, &cls2));
3619
3620        let cls1 = bclass(&[(b'b', b'c')]);
3621        let cls2 = bclass(&[(b'a', b'd')]);
3622        let expected = bclass(&[(b'b', b'c')]);
3623        assert_eq!(expected, bintersect(&cls1, &cls2));
3624
3625        let cls1 = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]);
3626        let cls2 = bclass(&[(b'a', b'h')]);
3627        let expected = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]);
3628        assert_eq!(expected, bintersect(&cls1, &cls2));
3629
3630        let cls1 = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]);
3631        let cls2 = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]);
3632        let expected = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]);
3633        assert_eq!(expected, bintersect(&cls1, &cls2));
3634
3635        let cls1 = bclass(&[(b'a', b'b'), (b'g', b'h')]);
3636        let cls2 = bclass(&[(b'd', b'e'), (b'k', b'l')]);
3637        let expected = bclass(&[]);
3638        assert_eq!(expected, bintersect(&cls1, &cls2));
3639
3640        let cls1 = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]);
3641        let cls2 = bclass(&[(b'h', b'h')]);
3642        let expected = bclass(&[(b'h', b'h')]);
3643        assert_eq!(expected, bintersect(&cls1, &cls2));
3644
3645        let cls1 = bclass(&[(b'a', b'b'), (b'e', b'f'), (b'i', b'j')]);
3646        let cls2 = bclass(&[(b'c', b'd'), (b'g', b'h'), (b'k', b'l')]);
3647        let expected = bclass(&[]);
3648        assert_eq!(expected, bintersect(&cls1, &cls2));
3649
3650        let cls1 = bclass(&[(b'a', b'b'), (b'c', b'd'), (b'e', b'f')]);
3651        let cls2 = bclass(&[(b'b', b'c'), (b'd', b'e'), (b'f', b'g')]);
3652        let expected = bclass(&[(b'b', b'f')]);
3653        assert_eq!(expected, bintersect(&cls1, &cls2));
3654    }
3655
3656    #[test]
3657    fn class_difference_unicode() {
3658        let cls1 = uclass(&[('a', 'a')]);
3659        let cls2 = uclass(&[('a', 'a')]);
3660        let expected = uclass(&[]);
3661        assert_eq!(expected, udifference(&cls1, &cls2));
3662
3663        let cls1 = uclass(&[('a', 'a')]);
3664        let cls2 = uclass(&[]);
3665        let expected = uclass(&[('a', 'a')]);
3666        assert_eq!(expected, udifference(&cls1, &cls2));
3667
3668        let cls1 = uclass(&[]);
3669        let cls2 = uclass(&[('a', 'a')]);
3670        let expected = uclass(&[]);
3671        assert_eq!(expected, udifference(&cls1, &cls2));
3672
3673        let cls1 = uclass(&[('a', 'z')]);
3674        let cls2 = uclass(&[('a', 'a')]);
3675        let expected = uclass(&[('b', 'z')]);
3676        assert_eq!(expected, udifference(&cls1, &cls2));
3677
3678        let cls1 = uclass(&[('a', 'z')]);
3679        let cls2 = uclass(&[('z', 'z')]);
3680        let expected = uclass(&[('a', 'y')]);
3681        assert_eq!(expected, udifference(&cls1, &cls2));
3682
3683        let cls1 = uclass(&[('a', 'z')]);
3684        let cls2 = uclass(&[('m', 'm')]);
3685        let expected = uclass(&[('a', 'l'), ('n', 'z')]);
3686        assert_eq!(expected, udifference(&cls1, &cls2));
3687
3688        let cls1 = uclass(&[('a', 'c'), ('g', 'i'), ('r', 't')]);
3689        let cls2 = uclass(&[('a', 'z')]);
3690        let expected = uclass(&[]);
3691        assert_eq!(expected, udifference(&cls1, &cls2));
3692
3693        let cls1 = uclass(&[('a', 'c'), ('g', 'i'), ('r', 't')]);
3694        let cls2 = uclass(&[('d', 'v')]);
3695        let expected = uclass(&[('a', 'c')]);
3696        assert_eq!(expected, udifference(&cls1, &cls2));
3697
3698        let cls1 = uclass(&[('a', 'c'), ('g', 'i'), ('r', 't')]);
3699        let cls2 = uclass(&[('b', 'g'), ('s', 'u')]);
3700        let expected = uclass(&[('a', 'a'), ('h', 'i'), ('r', 'r')]);
3701        assert_eq!(expected, udifference(&cls1, &cls2));
3702
3703        let cls1 = uclass(&[('a', 'c'), ('g', 'i'), ('r', 't')]);
3704        let cls2 = uclass(&[('b', 'd'), ('e', 'g'), ('s', 'u')]);
3705        let expected = uclass(&[('a', 'a'), ('h', 'i'), ('r', 'r')]);
3706        assert_eq!(expected, udifference(&cls1, &cls2));
3707
3708        let cls1 = uclass(&[('x', 'z')]);
3709        let cls2 = uclass(&[('a', 'c'), ('e', 'g'), ('s', 'u')]);
3710        let expected = uclass(&[('x', 'z')]);
3711        assert_eq!(expected, udifference(&cls1, &cls2));
3712
3713        let cls1 = uclass(&[('a', 'z')]);
3714        let cls2 = uclass(&[('a', 'c'), ('e', 'g'), ('s', 'u')]);
3715        let expected = uclass(&[('d', 'd'), ('h', 'r'), ('v', 'z')]);
3716        assert_eq!(expected, udifference(&cls1, &cls2));
3717    }
3718
3719    #[test]
3720    fn class_difference_bytes() {
3721        let cls1 = bclass(&[(b'a', b'a')]);
3722        let cls2 = bclass(&[(b'a', b'a')]);
3723        let expected = bclass(&[]);
3724        assert_eq!(expected, bdifference(&cls1, &cls2));
3725
3726        let cls1 = bclass(&[(b'a', b'a')]);
3727        let cls2 = bclass(&[]);
3728        let expected = bclass(&[(b'a', b'a')]);
3729        assert_eq!(expected, bdifference(&cls1, &cls2));
3730
3731        let cls1 = bclass(&[]);
3732        let cls2 = bclass(&[(b'a', b'a')]);
3733        let expected = bclass(&[]);
3734        assert_eq!(expected, bdifference(&cls1, &cls2));
3735
3736        let cls1 = bclass(&[(b'a', b'z')]);
3737        let cls2 = bclass(&[(b'a', b'a')]);
3738        let expected = bclass(&[(b'b', b'z')]);
3739        assert_eq!(expected, bdifference(&cls1, &cls2));
3740
3741        let cls1 = bclass(&[(b'a', b'z')]);
3742        let cls2 = bclass(&[(b'z', b'z')]);
3743        let expected = bclass(&[(b'a', b'y')]);
3744        assert_eq!(expected, bdifference(&cls1, &cls2));
3745
3746        let cls1 = bclass(&[(b'a', b'z')]);
3747        let cls2 = bclass(&[(b'm', b'm')]);
3748        let expected = bclass(&[(b'a', b'l'), (b'n', b'z')]);
3749        assert_eq!(expected, bdifference(&cls1, &cls2));
3750
3751        let cls1 = bclass(&[(b'a', b'c'), (b'g', b'i'), (b'r', b't')]);
3752        let cls2 = bclass(&[(b'a', b'z')]);
3753        let expected = bclass(&[]);
3754        assert_eq!(expected, bdifference(&cls1, &cls2));
3755
3756        let cls1 = bclass(&[(b'a', b'c'), (b'g', b'i'), (b'r', b't')]);
3757        let cls2 = bclass(&[(b'd', b'v')]);
3758        let expected = bclass(&[(b'a', b'c')]);
3759        assert_eq!(expected, bdifference(&cls1, &cls2));
3760
3761        let cls1 = bclass(&[(b'a', b'c'), (b'g', b'i'), (b'r', b't')]);
3762        let cls2 = bclass(&[(b'b', b'g'), (b's', b'u')]);
3763        let expected = bclass(&[(b'a', b'a'), (b'h', b'i'), (b'r', b'r')]);
3764        assert_eq!(expected, bdifference(&cls1, &cls2));
3765
3766        let cls1 = bclass(&[(b'a', b'c'), (b'g', b'i'), (b'r', b't')]);
3767        let cls2 = bclass(&[(b'b', b'd'), (b'e', b'g'), (b's', b'u')]);
3768        let expected = bclass(&[(b'a', b'a'), (b'h', b'i'), (b'r', b'r')]);
3769        assert_eq!(expected, bdifference(&cls1, &cls2));
3770
3771        let cls1 = bclass(&[(b'x', b'z')]);
3772        let cls2 = bclass(&[(b'a', b'c'), (b'e', b'g'), (b's', b'u')]);
3773        let expected = bclass(&[(b'x', b'z')]);
3774        assert_eq!(expected, bdifference(&cls1, &cls2));
3775
3776        let cls1 = bclass(&[(b'a', b'z')]);
3777        let cls2 = bclass(&[(b'a', b'c'), (b'e', b'g'), (b's', b'u')]);
3778        let expected = bclass(&[(b'd', b'd'), (b'h', b'r'), (b'v', b'z')]);
3779        assert_eq!(expected, bdifference(&cls1, &cls2));
3780    }
3781
3782    #[test]
3783    fn class_symmetric_difference_unicode() {
3784        let cls1 = uclass(&[('a', 'm')]);
3785        let cls2 = uclass(&[('g', 't')]);
3786        let expected = uclass(&[('a', 'f'), ('n', 't')]);
3787        assert_eq!(expected, usymdifference(&cls1, &cls2));
3788    }
3789
3790    #[test]
3791    fn class_symmetric_difference_bytes() {
3792        let cls1 = bclass(&[(b'a', b'm')]);
3793        let cls2 = bclass(&[(b'g', b't')]);
3794        let expected = bclass(&[(b'a', b'f'), (b'n', b't')]);
3795        assert_eq!(expected, bsymdifference(&cls1, &cls2));
3796    }
3797
3798    // We use a thread with an explicit stack size to test that our destructor
3799    // for Hir can handle arbitrarily sized expressions in constant stack
3800    // space. In case we run on a platform without threads (WASM?), we limit
3801    // this test to Windows/Unix.
3802    #[test]
3803    #[cfg(any(unix, windows))]
3804    fn no_stack_overflow_on_drop() {
3805        use std::thread;
3806
3807        let run = || {
3808            let mut expr = Hir::empty();
3809            for _ in 0..100 {
3810                expr = Hir::capture(Capture {
3811                    index: 1,
3812                    name: None,
3813                    sub: Box::new(expr),
3814                });
3815                expr = Hir::repetition(Repetition {
3816                    min: 0,
3817                    max: Some(1),
3818                    greedy: true,
3819                    sub: Box::new(expr),
3820                });
3821
3822                expr = Hir {
3823                    kind: HirKind::Concat(vec![expr]),
3824                    props: Properties::empty(),
3825                };
3826                expr = Hir {
3827                    kind: HirKind::Alternation(vec![expr]),
3828                    props: Properties::empty(),
3829                };
3830            }
3831            assert!(!matches!(*expr.kind(), HirKind::Empty));
3832        };
3833
3834        // We run our test on a thread with a small stack size so we can
3835        // force the issue more easily.
3836        //
3837        // NOTE(2023-03-21): See the corresponding test in 'crate::ast::tests'
3838        // for context on the specific stack size chosen here.
3839        thread::Builder::new()
3840            .stack_size(16 << 10)
3841            .spawn(run)
3842            .unwrap()
3843            .join()
3844            .unwrap();
3845    }
3846
3847    #[test]
3848    fn look_set_iter() {
3849        let set = LookSet::empty();
3850        assert_eq!(0, set.iter().count());
3851
3852        let set = LookSet::full();
3853        assert_eq!(18, set.iter().count());
3854
3855        let set =
3856            LookSet::empty().insert(Look::StartLF).insert(Look::WordUnicode);
3857        assert_eq!(2, set.iter().count());
3858
3859        let set = LookSet::empty().insert(Look::StartLF);
3860        assert_eq!(1, set.iter().count());
3861
3862        let set = LookSet::empty().insert(Look::WordAsciiNegate);
3863        assert_eq!(1, set.iter().count());
3864    }
3865
3866    #[test]
3867    fn look_set_debug() {
3868        let res = format!("{:?}", LookSet::empty());
3869        assert_eq!("∅", res);
3870        let res = format!("{:?}", LookSet::full());
3871        assert_eq!("Az^$rRbB𝛃𝚩<>〈〉◁▷◀▶", res);
3872    }
3873}
regex_syntax/hir/mod.rs

regex_syntax/hir/
mod.rs