regex_syntax/ast/
mod.rs

/*!
Defines an abstract syntax for regular expressions.
*/
4
5use core::cmp::Ordering;
6
7use alloc::{boxed::Box, string::String, vec, vec::Vec};
8
9pub use crate::ast::visitor::{visit, Visitor};
10
11pub mod parse;
12pub mod print;
13mod visitor;
14
/// An error that occurred while parsing a regular expression into an abstract
/// syntax tree.
///
/// Note that not all ASTs represent a valid regular expression. For example,
/// an AST is constructed without error for `\p{Quux}`, but `Quux` is not a
/// valid Unicode property name. That particular error is reported when
/// translating an AST to the high-level intermediate representation (`HIR`).
#[derive(Clone, Debug, Eq, PartialEq)]
#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
pub struct Error {
    /// The kind of error.
    kind: ErrorKind,
    /// The original pattern that the parser generated the error from. Every
    /// span in an error is a valid range into this string.
    pattern: String,
    /// The span of this error.
    span: Span,
}
33
34impl Error {
35    /// Return the type of this error.
36    pub fn kind(&self) -> &ErrorKind {
37        &self.kind
38    }
39
40    /// The original pattern string in which this error occurred.
41    ///
42    /// Every span reported by this error is reported in terms of this string.
43    pub fn pattern(&self) -> &str {
44        &self.pattern
45    }
46
47    /// Return the span at which this error occurred.
48    pub fn span(&self) -> &Span {
49        &self.span
50    }
51
52    /// Return an auxiliary span. This span exists only for some errors that
53    /// benefit from being able to point to two locations in the original
54    /// regular expression. For example, "duplicate" errors will have the
55    /// main error position set to the duplicate occurrence while its
56    /// auxiliary span will be set to the initial occurrence.
57    pub fn auxiliary_span(&self) -> Option<&Span> {
58        use self::ErrorKind::*;
59        match self.kind {
60            FlagDuplicate { ref original } => Some(original),
61            FlagRepeatedNegation { ref original, .. } => Some(original),
62            GroupNameDuplicate { ref original, .. } => Some(original),
63            _ => None,
64        }
65    }
66}
67
/// The type of an error that occurred while building an AST.
///
/// This error type is marked as `non_exhaustive`. This means that adding a
/// new variant is not considered a breaking change.
#[non_exhaustive]
#[derive(Clone, Debug, Eq, PartialEq)]
#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
pub enum ErrorKind {
    /// The capturing group limit was exceeded.
    ///
    /// Note that this represents a limit on the total number of capturing
    /// groups in a regex and not necessarily the number of nested capturing
    /// groups. That is, the nest limit can be low and it is still possible for
    /// this error to occur.
    CaptureLimitExceeded,
    /// An invalid escape sequence was found in a character class set.
    ClassEscapeInvalid,
    /// An invalid character class range was found. An invalid range is any
    /// range where the start is greater than the end.
    ClassRangeInvalid,
    /// An invalid range boundary was found in a character class. Range
    /// boundaries must be a single literal codepoint, but this error indicates
    /// that something else was found, such as a nested class.
    ClassRangeLiteral,
    /// An opening `[` was found with no corresponding closing `]`.
    ClassUnclosed,
    /// An empty decimal literal was found.
    ///
    /// Note that this error variant is no longer used. Namely, a decimal
    /// number can only appear as a repetition quantifier. When the number
    /// in a repetition quantifier is empty, then it gets its own specialized
    /// error, `RepetitionCountDecimalEmpty`.
    DecimalEmpty,
    /// An invalid decimal number was given where one was expected.
    DecimalInvalid,
    /// A bracketed hex literal was empty.
    EscapeHexEmpty,
    /// A bracketed hex literal did not correspond to a Unicode scalar value.
    EscapeHexInvalid,
    /// An invalid hexadecimal digit was found.
    EscapeHexInvalidDigit,
    /// EOF was found before an escape sequence was completed.
    EscapeUnexpectedEof,
    /// An unrecognized escape sequence.
    EscapeUnrecognized,
    /// A dangling negation was used when setting flags, e.g., `i-`.
    FlagDanglingNegation,
    /// A flag was used twice, e.g., `i-i`.
    FlagDuplicate {
        /// The position of the original flag. The error position
        /// points to the duplicate flag.
        original: Span,
    },
    /// The negation operator was used twice, e.g., `-i-s`.
    FlagRepeatedNegation {
        /// The position of the original negation operator. The error position
        /// points to the duplicate negation operator.
        original: Span,
    },
    /// Expected a flag but got EOF, e.g., `(?`.
    FlagUnexpectedEof,
    /// Unrecognized flag, e.g., `a`.
    FlagUnrecognized,
    /// A duplicate capture name was found.
    GroupNameDuplicate {
        /// The position of the initial occurrence of the capture name. The
        /// error position itself points to the duplicate occurrence.
        original: Span,
    },
    /// A capture group name is empty, e.g., `(?P<>abc)`.
    GroupNameEmpty,
    /// An invalid character was seen for a capture group name. This includes
    /// errors where the first character is a digit (even though subsequent
    /// characters are allowed to be digits).
    GroupNameInvalid,
    /// A closing `>` could not be found for a capture group name.
    GroupNameUnexpectedEof,
    /// An unclosed group, e.g., `(ab`.
    ///
    /// The span of this error corresponds to the unclosed parenthesis.
    GroupUnclosed,
    /// An unopened group, e.g., `ab)`.
    GroupUnopened,
    /// The nest limit was exceeded. The limit stored here is the limit
    /// configured in the parser.
    NestLimitExceeded(u32),
    /// The range provided in a counted repetition operator is invalid. The
    /// range is invalid if the start is greater than the end.
    RepetitionCountInvalid,
    /// An opening `{` was not followed by a valid decimal value.
    /// For example, `x{}` or `x{]}` would fail.
    RepetitionCountDecimalEmpty,
    /// An opening `{` was found with no corresponding closing `}`.
    RepetitionCountUnclosed,
    /// A repetition operator was applied to a missing sub-expression. This
    /// occurs, for example, in the regex consisting of just a `*` or even
    /// `(?i)*`. It is, however, possible to create a repetition operating on
    /// an empty sub-expression. For example, `()*` is still considered valid.
    RepetitionMissing,
    /// The special word boundary syntax, `\b{something}`, was used, but
    /// either EOF without `}` was seen, or an invalid character in the
    /// braces was seen.
    SpecialWordBoundaryUnclosed,
    /// The special word boundary syntax, `\b{something}`, was used, but
    /// `something` was not recognized as a valid word boundary kind.
    SpecialWordBoundaryUnrecognized,
    /// The syntax `\b{` was observed, but afterwards the end of the pattern
    /// was observed without being able to tell whether it was meant to be a
    /// bounded repetition on the `\b` or the beginning of a special word
    /// boundary assertion.
    SpecialWordOrRepetitionUnexpectedEof,
    /// The Unicode class is not valid. This typically occurs when a `\p` is
    /// followed by something other than a `{`.
    UnicodeClassInvalid,
    /// When octal support is disabled, this error is produced when an octal
    /// escape is used. The octal escape is assumed to be an invocation of
    /// a backreference, which is the common case.
    UnsupportedBackreference,
    /// When syntax similar to PCRE's look-around is used, this error is
    /// returned. Some example syntaxes that are rejected include, but are
    /// not necessarily limited to, `(?=re)`, `(?!re)`, `(?<=re)` and
    /// `(?<!re)`. Note that all of these syntaxes are otherwise invalid; this
    /// error is used to improve the user experience.
    UnsupportedLookAround,
}
191
// The impl body is empty, so `Error` uses the trait's default methods. It is
// only compiled when the `std` feature is enabled.
#[cfg(feature = "std")]
impl std::error::Error for Error {}
194
195impl core::fmt::Display for Error {
196    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
197        crate::error::Formatter::from(self).fmt(f)
198    }
199}
200
201impl core::fmt::Display for ErrorKind {
202    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
203        use self::ErrorKind::*;
204        match *self {
205            CaptureLimitExceeded => write!(
206                f,
207                "exceeded the maximum number of \
208                 capturing groups ({})",
209                u32::MAX
210            ),
211            ClassEscapeInvalid => {
212                write!(f, "invalid escape sequence found in character class")
213            }
214            ClassRangeInvalid => write!(
215                f,
216                "invalid character class range, \
217                 the start must be <= the end"
218            ),
219            ClassRangeLiteral => {
220                write!(f, "invalid range boundary, must be a literal")
221            }
222            ClassUnclosed => write!(f, "unclosed character class"),
223            DecimalEmpty => write!(f, "decimal literal empty"),
224            DecimalInvalid => write!(f, "decimal literal invalid"),
225            EscapeHexEmpty => write!(f, "hexadecimal literal empty"),
226            EscapeHexInvalid => {
227                write!(f, "hexadecimal literal is not a Unicode scalar value")
228            }
229            EscapeHexInvalidDigit => write!(f, "invalid hexadecimal digit"),
230            EscapeUnexpectedEof => write!(
231                f,
232                "incomplete escape sequence, \
233                 reached end of pattern prematurely"
234            ),
235            EscapeUnrecognized => write!(f, "unrecognized escape sequence"),
236            FlagDanglingNegation => {
237                write!(f, "dangling flag negation operator")
238            }
239            FlagDuplicate { .. } => write!(f, "duplicate flag"),
240            FlagRepeatedNegation { .. } => {
241                write!(f, "flag negation operator repeated")
242            }
243            FlagUnexpectedEof => {
244                write!(f, "expected flag but got end of regex")
245            }
246            FlagUnrecognized => write!(f, "unrecognized flag"),
247            GroupNameDuplicate { .. } => {
248                write!(f, "duplicate capture group name")
249            }
250            GroupNameEmpty => write!(f, "empty capture group name"),
251            GroupNameInvalid => write!(f, "invalid capture group character"),
252            GroupNameUnexpectedEof => write!(f, "unclosed capture group name"),
253            GroupUnclosed => write!(f, "unclosed group"),
254            GroupUnopened => write!(f, "unopened group"),
255            NestLimitExceeded(limit) => write!(
256                f,
257                "exceed the maximum number of \
258                 nested parentheses/brackets ({})",
259                limit
260            ),
261            RepetitionCountInvalid => write!(
262                f,
263                "invalid repetition count range, \
264                 the start must be <= the end"
265            ),
266            RepetitionCountDecimalEmpty => {
267                write!(f, "repetition quantifier expects a valid decimal")
268            }
269            RepetitionCountUnclosed => {
270                write!(f, "unclosed counted repetition")
271            }
272            RepetitionMissing => {
273                write!(f, "repetition operator missing expression")
274            }
275            SpecialWordBoundaryUnclosed => {
276                write!(
277                    f,
278                    "special word boundary assertion is either \
279                     unclosed or contains an invalid character",
280                )
281            }
282            SpecialWordBoundaryUnrecognized => {
283                write!(
284                    f,
285                    "unrecognized special word boundary assertion, \
286                     valid choices are: start, end, start-half \
287                     or end-half",
288                )
289            }
290            SpecialWordOrRepetitionUnexpectedEof => {
291                write!(
292                    f,
293                    "found either the beginning of a special word \
294                     boundary or a bounded repetition on a \\b with \
295                     an opening brace, but no closing brace",
296                )
297            }
298            UnicodeClassInvalid => {
299                write!(f, "invalid Unicode character class")
300            }
301            UnsupportedBackreference => {
302                write!(f, "backreferences are not supported")
303            }
304            UnsupportedLookAround => write!(
305                f,
306                "look-around, including look-ahead and look-behind, \
307                 is not supported"
308            ),
309        }
310    }
311}
312
/// Span represents the position information of a single AST item.
///
/// All span positions are absolute byte offsets that can be used on the
/// original regular expression that was parsed.
#[derive(Clone, Copy, Eq, PartialEq)]
#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
pub struct Span {
    /// The position at which this span starts.
    pub start: Position,
    /// The position at which this span ends.
    pub end: Position,
}
325
326impl core::fmt::Debug for Span {
327    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
328        write!(f, "Span({:?}, {:?})", self.start, self.end)
329    }
330}
331
332impl Ord for Span {
333    fn cmp(&self, other: &Span) -> Ordering {
334        (&self.start, &self.end).cmp(&(&other.start, &other.end))
335    }
336}
337
338impl PartialOrd for Span {
339    fn partial_cmp(&self, other: &Span) -> Option<Ordering> {
340        Some(self.cmp(other))
341    }
342}
343
/// A single position in a regular expression.
///
/// A position encodes one half of a span, and includes the byte offset, line
/// number and column number.
#[derive(Clone, Copy, Eq, PartialEq)]
#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
pub struct Position {
    /// The absolute offset of this position, starting at `0` from the
    /// beginning of the regular expression pattern string.
    pub offset: usize,
    /// The line number, starting at `1`.
    pub line: usize,
    /// The approximate column number, starting at `1`.
    pub column: usize,
}
359
360impl core::fmt::Debug for Position {
361    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
362        write!(
363            f,
364            "Position(o: {:?}, l: {:?}, c: {:?})",
365            self.offset, self.line, self.column
366        )
367    }
368}
369
370impl Ord for Position {
371    fn cmp(&self, other: &Position) -> Ordering {
372        self.offset.cmp(&other.offset)
373    }
374}
375
376impl PartialOrd for Position {
377    fn partial_cmp(&self, other: &Position) -> Option<Ordering> {
378        Some(self.cmp(other))
379    }
380}
381
382impl Span {
383    /// Create a new span with the given positions.
384    pub fn new(start: Position, end: Position) -> Span {
385        Span { start, end }
386    }
387
388    /// Create a new span using the given position as the start and end.
389    pub fn splat(pos: Position) -> Span {
390        Span::new(pos, pos)
391    }
392
393    /// Create a new span by replacing the starting the position with the one
394    /// given.
395    pub fn with_start(self, pos: Position) -> Span {
396        Span { start: pos, ..self }
397    }
398
399    /// Create a new span by replacing the ending the position with the one
400    /// given.
401    pub fn with_end(self, pos: Position) -> Span {
402        Span { end: pos, ..self }
403    }
404
405    /// Returns true if and only if this span occurs on a single line.
406    pub fn is_one_line(&self) -> bool {
407        self.start.line == self.end.line
408    }
409
410    /// Returns true if and only if this span is empty. That is, it points to
411    /// a single position in the concrete syntax of a regular expression.
412    pub fn is_empty(&self) -> bool {
413        self.start.offset == self.end.offset
414    }
415}
416
417impl Position {
418    /// Create a new position with the given information.
419    ///
420    /// `offset` is the absolute offset of the position, starting at `0` from
421    /// the beginning of the regular expression pattern string.
422    ///
423    /// `line` is the line number, starting at `1`.
424    ///
425    /// `column` is the approximate column number, starting at `1`.
426    pub fn new(offset: usize, line: usize, column: usize) -> Position {
427        Position { offset, line, column }
428    }
429}
430
/// An abstract syntax tree for a single regular expression along with the
/// comments found in it.
///
/// Comments are not stored in the tree itself to avoid complexity. Each
/// comment contains a span of precisely where it occurred in the original
/// regular expression.
#[derive(Clone, Debug, Eq, PartialEq)]
#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
pub struct WithComments {
    /// The actual ast.
    pub ast: Ast,
    /// All comments found in the original regular expression.
    pub comments: Vec<Comment>,
}
445
/// A comment from a regular expression with an associated span.
///
/// A regular expression can only contain comments when the `x` flag is
/// enabled.
#[derive(Clone, Debug, Eq, PartialEq)]
#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
pub struct Comment {
    /// The span of this comment, including the beginning `#` and ending `\n`.
    pub span: Span,
    /// The comment text, starting with the first character following the `#`
    /// and ending with the last character preceding the `\n`. That is, the
    /// text excludes both delimiters.
    pub comment: String,
}
459
/// An abstract syntax tree for a single regular expression.
///
/// An `Ast`'s `fmt::Display` implementation uses constant stack space and heap
/// space proportional to the size of the `Ast`.
///
/// This type defines its own destructor that uses constant stack space and
/// heap space proportional to the size of the `Ast`.
///
/// Every variant stores its payload behind a `Box`.
#[derive(Clone, Debug, Eq, PartialEq)]
#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
pub enum Ast {
    /// An empty regex that matches everything.
    Empty(Box<Span>),
    /// A set of flags, e.g., `(?is)`.
    Flags(Box<SetFlags>),
    /// A single character literal, which includes escape sequences.
    Literal(Box<Literal>),
    /// The "any character" class.
    Dot(Box<Span>),
    /// A single zero-width assertion.
    Assertion(Box<Assertion>),
    /// A single Unicode character class, e.g., `\pL` or `\p{Greek}`.
    ClassUnicode(Box<ClassUnicode>),
    /// A single perl character class, e.g., `\d` or `\W`.
    ClassPerl(Box<ClassPerl>),
    /// A single bracketed character class set, which may contain zero or more
    /// character ranges and/or zero or more nested classes. e.g.,
    /// `[a-zA-Z\pL]`.
    ClassBracketed(Box<ClassBracketed>),
    /// A repetition operator applied to an arbitrary regular expression.
    Repetition(Box<Repetition>),
    /// A grouped regular expression.
    Group(Box<Group>),
    /// An alternation of regular expressions.
    Alternation(Box<Alternation>),
    /// A concatenation of regular expressions.
    Concat(Box<Concat>),
}
497
498impl Ast {
499    /// Create an "empty" AST item.
500    pub fn empty(span: Span) -> Ast {
501        Ast::Empty(Box::new(span))
502    }
503
504    /// Create a "flags" AST item.
505    pub fn flags(e: SetFlags) -> Ast {
506        Ast::Flags(Box::new(e))
507    }
508
509    /// Create a "literal" AST item.
510    pub fn literal(e: Literal) -> Ast {
511        Ast::Literal(Box::new(e))
512    }
513
514    /// Create a "dot" AST item.
515    pub fn dot(span: Span) -> Ast {
516        Ast::Dot(Box::new(span))
517    }
518
519    /// Create a "assertion" AST item.
520    pub fn assertion(e: Assertion) -> Ast {
521        Ast::Assertion(Box::new(e))
522    }
523
524    /// Create a "Unicode class" AST item.
525    pub fn class_unicode(e: ClassUnicode) -> Ast {
526        Ast::ClassUnicode(Box::new(e))
527    }
528
529    /// Create a "Perl class" AST item.
530    pub fn class_perl(e: ClassPerl) -> Ast {
531        Ast::ClassPerl(Box::new(e))
532    }
533
534    /// Create a "bracketed class" AST item.
535    pub fn class_bracketed(e: ClassBracketed) -> Ast {
536        Ast::ClassBracketed(Box::new(e))
537    }
538
539    /// Create a "repetition" AST item.
540    pub fn repetition(e: Repetition) -> Ast {
541        Ast::Repetition(Box::new(e))
542    }
543
544    /// Create a "group" AST item.
545    pub fn group(e: Group) -> Ast {
546        Ast::Group(Box::new(e))
547    }
548
549    /// Create a "alternation" AST item.
550    pub fn alternation(e: Alternation) -> Ast {
551        Ast::Alternation(Box::new(e))
552    }
553
554    /// Create a "concat" AST item.
555    pub fn concat(e: Concat) -> Ast {
556        Ast::Concat(Box::new(e))
557    }
558
559    /// Return the span of this abstract syntax tree.
560    pub fn span(&self) -> &Span {
561        match *self {
562            Ast::Empty(ref span) => span,
563            Ast::Flags(ref x) => &x.span,
564            Ast::Literal(ref x) => &x.span,
565            Ast::Dot(ref span) => span,
566            Ast::Assertion(ref x) => &x.span,
567            Ast::ClassUnicode(ref x) => &x.span,
568            Ast::ClassPerl(ref x) => &x.span,
569            Ast::ClassBracketed(ref x) => &x.span,
570            Ast::Repetition(ref x) => &x.span,
571            Ast::Group(ref x) => &x.span,
572            Ast::Alternation(ref x) => &x.span,
573            Ast::Concat(ref x) => &x.span,
574        }
575    }
576
577    /// Return true if and only if this Ast is empty.
578    pub fn is_empty(&self) -> bool {
579        match *self {
580            Ast::Empty(_) => true,
581            _ => false,
582        }
583    }
584
585    /// Returns true if and only if this AST has any (including possibly empty)
586    /// subexpressions.
587    fn has_subexprs(&self) -> bool {
588        match *self {
589            Ast::Empty(_)
590            | Ast::Flags(_)
591            | Ast::Literal(_)
592            | Ast::Dot(_)
593            | Ast::Assertion(_)
594            | Ast::ClassUnicode(_)
595            | Ast::ClassPerl(_) => false,
596            Ast::ClassBracketed(_)
597            | Ast::Repetition(_)
598            | Ast::Group(_)
599            | Ast::Alternation(_)
600            | Ast::Concat(_) => true,
601        }
602    }
603}
604
605/// Print a display representation of this Ast.
606///
607/// This does not preserve any of the original whitespace formatting that may
608/// have originally been present in the concrete syntax from which this Ast
609/// was generated.
610///
611/// This implementation uses constant stack space and heap space proportional
612/// to the size of the `Ast`.
613impl core::fmt::Display for Ast {
614    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
615        use crate::ast::print::Printer;
616        Printer::new().print(self, f)
617    }
618}
619
/// An alternation of regular expressions.
#[derive(Clone, Debug, Eq, PartialEq)]
#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
pub struct Alternation {
    /// The span of this alternation.
    pub span: Span,
    /// The alternate regular expressions, i.e., the branches of this
    /// alternation.
    pub asts: Vec<Ast>,
}
629
630impl Alternation {
631    /// Return this alternation as an AST.
632    ///
633    /// If this alternation contains zero ASTs, then `Ast::empty` is returned.
634    /// If this alternation contains exactly 1 AST, then the corresponding AST
635    /// is returned. Otherwise, `Ast::alternation` is returned.
636    pub fn into_ast(mut self) -> Ast {
637        match self.asts.len() {
638            0 => Ast::empty(self.span),
639            1 => self.asts.pop().unwrap(),
640            _ => Ast::alternation(self),
641        }
642    }
643}
644
/// A concatenation of regular expressions.
#[derive(Clone, Debug, Eq, PartialEq)]
#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
pub struct Concat {
    /// The span of this concatenation.
    pub span: Span,
    /// The regular expressions being concatenated.
    pub asts: Vec<Ast>,
}
654
655impl Concat {
656    /// Return this concatenation as an AST.
657    ///
658    /// If this alternation contains zero ASTs, then `Ast::empty` is returned.
659    /// If this alternation contains exactly 1 AST, then the corresponding AST
660    /// is returned. Otherwise, `Ast::concat` is returned.
661    pub fn into_ast(mut self) -> Ast {
662        match self.asts.len() {
663            0 => Ast::empty(self.span),
664            1 => self.asts.pop().unwrap(),
665            _ => Ast::concat(self),
666        }
667    }
668}
669
/// A single literal expression.
///
/// A literal corresponds to a single Unicode scalar value. Literals may be
/// represented in their literal form, e.g., `a` or in their escaped form,
/// e.g., `\x61`.
#[derive(Clone, Debug, Eq, PartialEq)]
#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
pub struct Literal {
    /// The span of this literal.
    pub span: Span,
    /// The kind of this literal, i.e., how it was written in the pattern.
    pub kind: LiteralKind,
    /// The Unicode scalar value corresponding to this literal.
    pub c: char,
}
685
686impl Literal {
687    /// If this literal was written as a `\x` hex escape, then this returns
688    /// the corresponding byte value. Otherwise, this returns `None`.
689    pub fn byte(&self) -> Option<u8> {
690        match self.kind {
691            LiteralKind::HexFixed(HexLiteralKind::X) => {
692                u8::try_from(self.c).ok()
693            }
694            _ => None,
695        }
696    }
697}
698
/// The kind of a single literal expression.
///
/// The kind records how the literal was written in the pattern.
#[derive(Clone, Debug, Eq, PartialEq)]
#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
pub enum LiteralKind {
    /// The literal is written verbatim, e.g., `a` or `☃`.
    Verbatim,
    /// The literal is written as an escape because it is otherwise a special
    /// regex meta character, e.g., `\*` or `\[`.
    Meta,
    /// The literal is written as an escape despite the fact that the escape is
    /// unnecessary, e.g., `\%` or `\/`.
    Superfluous,
    /// The literal is written as an octal escape, e.g., `\141`.
    Octal,
    /// The literal is written as a hex code with a fixed number of digits
    /// depending on the type of the escape, e.g., `\x61` or `\u0061` or
    /// `\U00000061`.
    HexFixed(HexLiteralKind),
    /// The literal is written as a hex code with a bracketed number of
    /// digits. The only restriction is that the bracketed hex code must refer
    /// to a valid Unicode scalar value.
    HexBrace(HexLiteralKind),
    /// The literal is written as a specially recognized escape, e.g., `\f`
    /// or `\n`.
    Special(SpecialLiteralKind),
}
725
/// The type of a special literal.
///
/// A special literal is a special escape sequence recognized by the regex
/// parser, e.g., `\f` or `\n`. Each variant documents its spelling followed
/// by the equivalent `\x` escape in parentheses.
#[derive(Clone, Debug, Eq, PartialEq)]
#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
pub enum SpecialLiteralKind {
    /// Bell, spelled `\a` (`\x07`).
    Bell,
    /// Form feed, spelled `\f` (`\x0C`).
    FormFeed,
    /// Tab, spelled `\t` (`\x09`).
    Tab,
    /// Line feed, spelled `\n` (`\x0A`).
    LineFeed,
    /// Carriage return, spelled `\r` (`\x0D`).
    CarriageReturn,
    /// Vertical tab, spelled `\v` (`\x0B`).
    VerticalTab,
    /// Space, spelled `\ ` (`\x20`). Note that this can only appear when
    /// parsing in verbose mode.
    Space,
}
749
/// The type of a Unicode hex literal.
///
/// Note that all variants behave the same when used with brackets. They only
/// differ when used without brackets in the number of hex digits that must
/// follow (see the `digits` method).
#[derive(Clone, Debug, Eq, PartialEq)]
#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
pub enum HexLiteralKind {
    /// A `\x` prefix. When used without brackets, this form is limited to
    /// two digits.
    X,
    /// A `\u` prefix. When used without brackets, this form is limited to
    /// four digits.
    UnicodeShort,
    /// A `\U` prefix. When used without brackets, this form is limited to
    /// eight digits.
    UnicodeLong,
}
768
769impl HexLiteralKind {
770    /// The number of digits that must be used with this literal form when
771    /// used without brackets. When used with brackets, there is no
772    /// restriction on the number of digits.
773    pub fn digits(&self) -> u32 {
774        match *self {
775            HexLiteralKind::X => 2,
776            HexLiteralKind::UnicodeShort => 4,
777            HexLiteralKind::UnicodeLong => 8,
778        }
779    }
780}
781
/// A Perl character class, e.g., `\d` or `\D`.
#[derive(Clone, Debug, Eq, PartialEq)]
#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
pub struct ClassPerl {
    /// The span of this class.
    pub span: Span,
    /// The kind of Perl class.
    pub kind: ClassPerlKind,
    /// Whether the class is negated or not. e.g., `\d` is not negated but
    /// `\D` is.
    pub negated: bool,
}
794
/// The available Perl character classes.
///
/// Negation is not part of the kind; it is recorded separately by the
/// `negated` field of `ClassPerl`.
#[derive(Clone, Debug, Eq, PartialEq)]
#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
pub enum ClassPerlKind {
    /// Decimal numbers.
    Digit,
    /// Whitespace.
    Space,
    /// Word characters.
    Word,
}
806
/// An ASCII character class, e.g., `[[:alpha:]]`.
#[derive(Clone, Debug, Eq, PartialEq)]
#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
pub struct ClassAscii {
    /// The span of this class.
    pub span: Span,
    /// The kind of ASCII class.
    pub kind: ClassAsciiKind,
    /// Whether the class is negated or not. e.g., `[[:alpha:]]` is not negated
    /// but `[[:^alpha:]]` is.
    pub negated: bool,
}
819
/// The available ASCII character classes.
///
/// Each variant is documented with the equivalent bracketed character class.
/// The name of a class is the lowercase version of its variant name (see
/// `ClassAsciiKind::from_name`).
#[derive(Clone, Debug, Eq, PartialEq)]
#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
pub enum ClassAsciiKind {
    /// `[0-9A-Za-z]`
    Alnum,
    /// `[A-Za-z]`
    Alpha,
    /// `[\x00-\x7F]`
    Ascii,
    /// `[ \t]`
    Blank,
    /// `[\x00-\x1F\x7F]`
    Cntrl,
    /// `[0-9]`
    Digit,
    /// `[!-~]`
    Graph,
    /// `[a-z]`
    Lower,
    /// `[ -~]`
    Print,
    /// `[!-/:-@\[-`{-~]`
    Punct,
    /// `[\t\n\v\f\r ]`
    Space,
    /// `[A-Z]`
    Upper,
    /// `[0-9A-Za-z_]`
    Word,
    /// `[0-9A-Fa-f]`
    Xdigit,
}
853
854impl ClassAsciiKind {
855    /// Return the corresponding ClassAsciiKind variant for the given name.
856    ///
857    /// The name given should correspond to the lowercase version of the
858    /// variant name. e.g., `cntrl` is the name for `ClassAsciiKind::Cntrl`.
859    ///
860    /// If no variant with the corresponding name exists, then `None` is
861    /// returned.
862    pub fn from_name(name: &str) -> Option<ClassAsciiKind> {
863        use self::ClassAsciiKind::*;
864        match name {
865            "alnum" => Some(Alnum),
866            "alpha" => Some(Alpha),
867            "ascii" => Some(Ascii),
868            "blank" => Some(Blank),
869            "cntrl" => Some(Cntrl),
870            "digit" => Some(Digit),
871            "graph" => Some(Graph),
872            "lower" => Some(Lower),
873            "print" => Some(Print),
874            "punct" => Some(Punct),
875            "space" => Some(Space),
876            "upper" => Some(Upper),
877            "word" => Some(Word),
878            "xdigit" => Some(Xdigit),
879            _ => None,
880        }
881    }
882}
883
/// A Unicode character class.
///
/// This covers the `\p` and `\P` forms; what follows the `\p`/`\P` is
/// described by `kind`.
#[derive(Clone, Debug, Eq, PartialEq)]
#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
pub struct ClassUnicode {
    /// The span of this class.
    pub span: Span,
    /// Whether this class is negated or not.
    ///
    /// Note: be careful when using this attribute. This specifically refers
    /// to whether the class is written as `\p` or `\P`, where the latter
    /// is `negated = true`. However, it is also possible to write something
    /// like `\P{scx!=Katakana}` which is actually equivalent to
    /// `\p{scx=Katakana}` and is therefore not actually negated even though
    /// `negated = true` here. To test whether this class is truly negated
    /// or not, use the `is_negated` method.
    pub negated: bool,
    /// The kind of Unicode class.
    pub kind: ClassUnicodeKind,
}
903
904impl ClassUnicode {
905    /// Returns true if this class has been negated.
906    ///
907    /// Note that this takes the Unicode op into account, if it's present.
908    /// e.g., `is_negated` for `\P{scx!=Katakana}` will return `false`.
909    pub fn is_negated(&self) -> bool {
910        match self.kind {
911            ClassUnicodeKind::NamedValue {
912                op: ClassUnicodeOpKind::NotEqual,
913                ..
914            } => !self.negated,
915            _ => self.negated,
916        }
917    }
918}
919
/// The available forms of Unicode character classes.
///
/// Unlike most other AST types in this module, `Arbitrary` is not derived
/// for this type. It is implemented by hand when the `arbitrary` feature is
/// enabled.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum ClassUnicodeKind {
    /// A one letter abbreviated class, e.g., `\pN`.
    OneLetter(char),
    /// A binary property, general category or script. The string may be
    /// empty.
    Named(String),
    /// A property name and an associated value.
    NamedValue {
        /// The type of Unicode op used to associate `name` with `value`.
        op: ClassUnicodeOpKind,
        /// The property name (which may be empty).
        name: String,
        /// The property value (which may be empty).
        value: String,
    },
}
938
#[cfg(feature = "arbitrary")]
impl arbitrary::Arbitrary<'_> for ClassUnicodeKind {
    /// Build an arbitrary `ClassUnicodeKind`.
    ///
    /// One of the three variants is chosen first. When at least one of the
    /// Unicode table features is enabled, names and values are then drawn
    /// from the crate's property name/value tables. Otherwise, they are
    /// unconstrained arbitrary chars and strings.
    fn arbitrary(
        u: &mut arbitrary::Unstructured,
    ) -> arbitrary::Result<ClassUnicodeKind> {
        #[cfg(any(
            feature = "unicode-age",
            feature = "unicode-bool",
            feature = "unicode-gencat",
            feature = "unicode-perl",
            feature = "unicode-script",
            feature = "unicode-segment",
        ))]
        {
            use alloc::string::ToString;

            use super::unicode_tables::{
                property_names::PROPERTY_NAMES,
                property_values::PROPERTY_VALUES,
            };

            match u.choose_index(3)? {
                0 => {
                    let all = PROPERTY_VALUES
                        .iter()
                        .flat_map(|e| e.1.iter())
                        .filter(|(name, _)| name.len() == 1)
                        .count();
                    let idx = u.choose_index(all)?;
                    // The index was drawn over the one-letter names only,
                    // so the same filter must be applied when resolving it.
                    // Otherwise the index would address the unfiltered
                    // table and pick the first char of an unrelated name.
                    let value = PROPERTY_VALUES
                        .iter()
                        .flat_map(|e| e.1.iter())
                        .filter(|(name, _)| name.len() == 1)
                        .nth(idx)
                        .unwrap()
                        .0
                        .chars()
                        .next()
                        .unwrap();
                    Ok(ClassUnicodeKind::OneLetter(value))
                }
                1 => {
                    // Candidates are every property value followed by
                    // every property name.
                    let all = PROPERTY_VALUES
                        .iter()
                        .map(|e| e.1.len())
                        .sum::<usize>()
                        + PROPERTY_NAMES.len();
                    let idx = u.choose_index(all)?;
                    let name = PROPERTY_VALUES
                        .iter()
                        .flat_map(|e| e.1.iter())
                        .chain(PROPERTY_NAMES)
                        .map(|(_, e)| e)
                        .nth(idx)
                        .unwrap();
                    Ok(ClassUnicodeKind::Named(name.to_string()))
                }
                2 => {
                    let all = PROPERTY_VALUES
                        .iter()
                        .map(|e| e.1.len())
                        .sum::<usize>();
                    let idx = u.choose_index(all)?;
                    // Pair each value with the property it belongs to so
                    // that the generated name and value go together.
                    let (prop, value) = PROPERTY_VALUES
                        .iter()
                        .flat_map(|e| {
                            e.1.iter().map(|(_, value)| (e.0, value))
                        })
                        .nth(idx)
                        .unwrap();
                    Ok(ClassUnicodeKind::NamedValue {
                        op: u.arbitrary()?,
                        name: prop.to_string(),
                        value: value.to_string(),
                    })
                }
                _ => unreachable!("index chosen is impossible"),
            }
        }
        #[cfg(not(any(
            feature = "unicode-age",
            feature = "unicode-bool",
            feature = "unicode-gencat",
            feature = "unicode-perl",
            feature = "unicode-script",
            feature = "unicode-segment",
        )))]
        {
            // No tables are available, so fall back to unconstrained data.
            match u.choose_index(3)? {
                0 => Ok(ClassUnicodeKind::OneLetter(u.arbitrary()?)),
                1 => Ok(ClassUnicodeKind::Named(u.arbitrary()?)),
                2 => Ok(ClassUnicodeKind::NamedValue {
                    op: u.arbitrary()?,
                    name: u.arbitrary()?,
                    value: u.arbitrary()?,
                }),
                _ => unreachable!("index chosen is impossible"),
            }
        }
    }

    /// Report how many bytes `arbitrary` may consume.
    ///
    /// The hint mirrors the two configurations of `arbitrary`: with Unicode
    /// tables, two index choices plus an optional op; without them, one
    /// index choice plus the payload of whichever variant is built.
    fn size_hint(depth: usize) -> (usize, Option<usize>) {
        #[cfg(any(
            feature = "unicode-age",
            feature = "unicode-bool",
            feature = "unicode-gencat",
            feature = "unicode-perl",
            feature = "unicode-script",
            feature = "unicode-segment",
        ))]
        {
            arbitrary::size_hint::and_all(&[
                usize::size_hint(depth),
                usize::size_hint(depth),
                arbitrary::size_hint::or(
                    (0, Some(0)),
                    ClassUnicodeOpKind::size_hint(depth),
                ),
            ])
        }
        #[cfg(not(any(
            feature = "unicode-age",
            feature = "unicode-bool",
            feature = "unicode-gencat",
            feature = "unicode-perl",
            feature = "unicode-script",
            feature = "unicode-segment",
        )))]
        {
            arbitrary::size_hint::and(
                usize::size_hint(depth),
                arbitrary::size_hint::or_all(&[
                    char::size_hint(depth),
                    String::size_hint(depth),
                    arbitrary::size_hint::and_all(&[
                        String::size_hint(depth),
                        String::size_hint(depth),
                        ClassUnicodeOpKind::size_hint(depth),
                    ]),
                ]),
            )
        }
    }
}
1085
/// The type of op used in a Unicode character class.
///
/// The op sits between the property name and the property value of a
/// `ClassUnicodeKind::NamedValue`.
#[derive(Clone, Debug, Eq, PartialEq)]
#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
pub enum ClassUnicodeOpKind {
    /// A property set to a specific value, e.g., `\p{scx=Katakana}`.
    Equal,
    /// A property set to a specific value using a colon, e.g.,
    /// `\p{scx:Katakana}`.
    Colon,
    /// A property that isn't a particular value, e.g., `\p{scx!=Katakana}`.
    NotEqual,
}

impl ClassUnicodeOpKind {
    /// Returns true if this op asserts equality, i.e., it is `=` or `:`.
    pub fn is_equal(&self) -> bool {
        matches!(
            self,
            ClassUnicodeOpKind::Equal | ClassUnicodeOpKind::Colon
        )
    }
}
1108
/// A bracketed character class, e.g., `[a-z0-9]`.
///
/// Bracketed classes may nest: a nested class is stored boxed inside
/// `ClassSetItem::Bracketed`.
#[derive(Clone, Debug, Eq, PartialEq)]
#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
pub struct ClassBracketed {
    /// The span of this class.
    pub span: Span,
    /// Whether this class is negated or not. e.g., `[a]` is not negated but
    /// `[^a]` is.
    pub negated: bool,
    /// The type of this set. A set is either a normal union of things, e.g.,
    /// `[abc]` or a result of applying set operations, e.g., `[\pL--c]`.
    pub kind: ClassSet,
}
1122
/// A character class set.
///
/// This type corresponds to the internal structure of a bracketed character
/// class. That is, every bracketed character class is one of two types: a
/// union of items (literals, ranges, other bracketed classes) or a tree of
/// binary set operations.
///
/// Note that this type has a hand-written `Drop` impl, so its contents
/// cannot be moved out of it by destructuring.
#[derive(Clone, Debug, Eq, PartialEq)]
#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
pub enum ClassSet {
    /// An item, which can be a single literal, range, nested character class
    /// or a union of items.
    Item(ClassSetItem),
    /// A single binary operation (i.e., &&, -- or ~~).
    BinaryOp(ClassSetBinaryOp),
}
1138
1139impl ClassSet {
1140    /// Build a set from a union.
1141    pub fn union(ast: ClassSetUnion) -> ClassSet {
1142        ClassSet::Item(ClassSetItem::Union(ast))
1143    }
1144
1145    /// Return the span of this character class set.
1146    pub fn span(&self) -> &Span {
1147        match *self {
1148            ClassSet::Item(ref x) => x.span(),
1149            ClassSet::BinaryOp(ref x) => &x.span,
1150        }
1151    }
1152
1153    /// Return true if and only if this class set is empty.
1154    fn is_empty(&self) -> bool {
1155        match *self {
1156            ClassSet::Item(ClassSetItem::Empty(_)) => true,
1157            _ => false,
1158        }
1159    }
1160}
1161
/// A single component of a character class set.
///
/// The span of any item can be retrieved with the `span` method.
#[derive(Clone, Debug, Eq, PartialEq)]
#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
pub enum ClassSetItem {
    /// An empty item.
    ///
    /// Note that a bracketed character class cannot contain a single empty
    /// item. Empty items can appear when using one of the binary operators.
    /// For example, `[&&]` is the intersection of two empty classes.
    Empty(Span),
    /// A single literal.
    Literal(Literal),
    /// A range between two literals.
    Range(ClassSetRange),
    /// An ASCII character class, e.g., `[:alnum:]` or `[:punct:]`.
    Ascii(ClassAscii),
    /// A Unicode character class, e.g., `\pL` or `\p{Greek}`.
    Unicode(ClassUnicode),
    /// A perl character class, e.g., `\d` or `\W`.
    Perl(ClassPerl),
    /// A bracketed character class set, which may contain zero or more
    /// character ranges and/or zero or more nested classes. e.g.,
    /// `[a-zA-Z\pL]`.
    ///
    /// The nested class is boxed since `ClassBracketed` itself contains a
    /// `ClassSet`, which makes this type recursive.
    Bracketed(Box<ClassBracketed>),
    /// A union of items.
    Union(ClassSetUnion),
}
1189
1190impl ClassSetItem {
1191    /// Return the span of this character class set item.
1192    pub fn span(&self) -> &Span {
1193        match *self {
1194            ClassSetItem::Empty(ref span) => span,
1195            ClassSetItem::Literal(ref x) => &x.span,
1196            ClassSetItem::Range(ref x) => &x.span,
1197            ClassSetItem::Ascii(ref x) => &x.span,
1198            ClassSetItem::Perl(ref x) => &x.span,
1199            ClassSetItem::Unicode(ref x) => &x.span,
1200            ClassSetItem::Bracketed(ref x) => &x.span,
1201            ClassSetItem::Union(ref x) => &x.span,
1202        }
1203    }
1204}
1205
/// A single character class range in a set.
///
/// A range is only meaningful when `start` is not greater than `end`. This
/// can be checked with the `is_valid` method.
#[derive(Clone, Debug, Eq, PartialEq)]
#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
pub struct ClassSetRange {
    /// The span of this range.
    pub span: Span,
    /// The start of this range.
    pub start: Literal,
    /// The end of this range.
    pub end: Literal,
}
1217
1218impl ClassSetRange {
1219    /// Returns true if and only if this character class range is valid.
1220    ///
1221    /// The only case where a range is invalid is if its start is greater than
1222    /// its end.
1223    pub fn is_valid(&self) -> bool {
1224        self.start.c <= self.end.c
1225    }
1226}
1227
/// A union of items inside a character class set.
///
/// Items are normally added with the `push` method, which also keeps `span`
/// in sync with the spans of the items added.
#[derive(Clone, Debug, Eq, PartialEq)]
#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
pub struct ClassSetUnion {
    /// The span of the items in this operation. e.g., the `a-z0-9` in
    /// `[^a-z0-9]`
    pub span: Span,
    /// The sequence of items that make up this union.
    pub items: Vec<ClassSetItem>,
}
1238
1239impl ClassSetUnion {
1240    /// Push a new item in this union.
1241    ///
1242    /// The ending position of this union's span is updated to the ending
1243    /// position of the span of the item given. If the union is empty, then
1244    /// the starting position of this union is set to the starting position
1245    /// of this item.
1246    ///
1247    /// In other words, if you only use this method to add items to a union
1248    /// and you set the spans on each item correctly, then you should never
1249    /// need to adjust the span of the union directly.
1250    pub fn push(&mut self, item: ClassSetItem) {
1251        if self.items.is_empty() {
1252            self.span.start = item.span().start;
1253        }
1254        self.span.end = item.span().end;
1255        self.items.push(item);
1256    }
1257
1258    /// Return this union as a character class set item.
1259    ///
1260    /// If this union contains zero items, then an empty union is
1261    /// returned. If this concatenation contains exactly 1 item, then the
1262    /// corresponding item is returned. Otherwise, ClassSetItem::Union is
1263    /// returned.
1264    pub fn into_item(mut self) -> ClassSetItem {
1265        match self.items.len() {
1266            0 => ClassSetItem::Empty(self.span),
1267            1 => self.items.pop().unwrap(),
1268            _ => ClassSetItem::Union(self),
1269        }
1270    }
1271}
1272
/// A Unicode character class set operation.
///
/// Both operands are themselves class sets, so operations can be nested to
/// form a tree.
#[derive(Clone, Debug, Eq, PartialEq)]
#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
pub struct ClassSetBinaryOp {
    /// The span of this operation. e.g., the `a-z--h-p` in `[a-z--h-p]`.
    pub span: Span,
    /// The type of this set operation.
    pub kind: ClassSetBinaryOpKind,
    /// The left hand side of the operation.
    pub lhs: Box<ClassSet>,
    /// The right hand side of the operation.
    pub rhs: Box<ClassSet>,
}
1286
/// The type of a Unicode character class set operation.
///
/// Note that this doesn't explicitly represent union since there is no
/// explicit union operator. Concatenation inside a character class corresponds
/// to the union operation.
///
/// The operators are written `&&`, `--` and `~~` respectively.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
pub enum ClassSetBinaryOpKind {
    /// The intersection of two sets, e.g., `\pN&&[a-z]`.
    Intersection,
    /// The difference of two sets, e.g., `\pN--[0-9]`.
    Difference,
    /// The symmetric difference of two sets. The symmetric difference is the
    /// set of elements belonging to one but not both sets.
    /// e.g., `[\pL~~[:ascii:]]`.
    SymmetricDifference,
}
1304
/// A single zero-width assertion.
///
/// Which assertion this is (e.g., a line anchor or a word boundary) is
/// recorded in `kind`.
#[derive(Clone, Debug, Eq, PartialEq)]
#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
pub struct Assertion {
    /// The span of this assertion.
    pub span: Span,
    /// The assertion kind, e.g., `\b` or `^`.
    pub kind: AssertionKind,
}
1314
/// An assertion kind.
///
/// Each variant is documented with the syntax that it corresponds to. The
/// angle bracket forms are kept as distinct variants even though they are
/// aliases, so the original spelling is not lost.
#[derive(Clone, Debug, Eq, PartialEq)]
#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
pub enum AssertionKind {
    /// `^`
    StartLine,
    /// `$`
    EndLine,
    /// `\A`
    StartText,
    /// `\z`
    EndText,
    /// `\b`
    WordBoundary,
    /// `\B`
    NotWordBoundary,
    /// `\b{start}`
    WordBoundaryStart,
    /// `\b{end}`
    WordBoundaryEnd,
    /// `\<` (alias for `\b{start}`)
    WordBoundaryStartAngle,
    /// `\>` (alias for `\b{end}`)
    WordBoundaryEndAngle,
    /// `\b{start-half}`
    WordBoundaryStartHalf,
    /// `\b{end-half}`
    WordBoundaryEndHalf,
}
1344
/// A repetition operation applied to a regular expression.
///
/// The operator and the expression it applies to are stored separately, in
/// `op` and `ast` respectively.
#[derive(Clone, Debug, Eq, PartialEq)]
#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
pub struct Repetition {
    /// The span of this operation.
    pub span: Span,
    /// The actual operation.
    pub op: RepetitionOp,
    /// Whether this operation was applied greedily or not.
    pub greedy: bool,
    /// The regular expression under repetition.
    pub ast: Box<Ast>,
}
1358
/// The repetition operator itself.
///
/// This is only the operator. The expression being repeated lives in the
/// enclosing `Repetition`.
#[derive(Clone, Debug, Eq, PartialEq)]
#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
pub struct RepetitionOp {
    /// The span of this operator. This includes things like `+`, `*?` and
    /// `{m,n}`.
    pub span: Span,
    /// The type of operation.
    pub kind: RepetitionKind,
}
1369
/// The kind of a repetition operator.
///
/// Each variant is documented with the syntax that it corresponds to.
#[derive(Clone, Debug, Eq, PartialEq)]
#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
pub enum RepetitionKind {
    /// `?`
    ZeroOrOne,
    /// `*`
    ZeroOrMore,
    /// `+`
    OneOrMore,
    /// A counted repetition, i.e., one of `{m}`, `{m,}` or `{m,n}`.
    Range(RepetitionRange),
}
1383
/// A range repetition operator.
///
/// Each variant is documented with the syntax that it corresponds to.
#[derive(Clone, Debug, Eq, PartialEq)]
#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
pub enum RepetitionRange {
    /// `{m}`
    Exactly(u32),
    /// `{m,}`
    AtLeast(u32),
    /// `{m,n}`
    Bounded(u32, u32),
}

impl RepetitionRange {
    /// Returns true if and only if this repetition range is valid.
    ///
    /// Unbounded and exact ranges are always valid. A bounded range is
    /// valid when its start is not greater than its end.
    pub fn is_valid(&self) -> bool {
        match *self {
            RepetitionRange::Bounded(start, end) => start <= end,
            RepetitionRange::Exactly(_) | RepetitionRange::AtLeast(_) => true,
        }
    }
}
1408
/// A grouped regular expression.
///
/// This includes both capturing and non-capturing groups. This does **not**
/// include flag-only groups like `(?is)`, but does contain any group that
/// contains a sub-expression, e.g., `(a)`, `(?P<name>a)`, `(?:a)` and
/// `(?is:a)`.
///
/// Flag-only groups are represented by `SetFlags` instead.
#[derive(Clone, Debug, Eq, PartialEq)]
#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
pub struct Group {
    /// The span of this group.
    pub span: Span,
    /// The kind of this group.
    pub kind: GroupKind,
    /// The regular expression in this group.
    pub ast: Box<Ast>,
}
1425
1426impl Group {
1427    /// If this group is non-capturing, then this returns the (possibly empty)
1428    /// set of flags. Otherwise, `None` is returned.
1429    pub fn flags(&self) -> Option<&Flags> {
1430        match self.kind {
1431            GroupKind::NonCapturing(ref flags) => Some(flags),
1432            _ => None,
1433        }
1434    }
1435
1436    /// Returns true if and only if this group is capturing.
1437    pub fn is_capturing(&self) -> bool {
1438        match self.kind {
1439            GroupKind::CaptureIndex(_) | GroupKind::CaptureName { .. } => true,
1440            GroupKind::NonCapturing(_) => false,
1441        }
1442    }
1443
1444    /// Returns the capture index of this group, if this is a capturing group.
1445    ///
1446    /// This returns a capture index precisely when `is_capturing` is `true`.
1447    pub fn capture_index(&self) -> Option<u32> {
1448        match self.kind {
1449            GroupKind::CaptureIndex(i) => Some(i),
1450            GroupKind::CaptureName { ref name, .. } => Some(name.index),
1451            GroupKind::NonCapturing(_) => None,
1452        }
1453    }
1454}
1455
/// The kind of a group.
///
/// The first two variants are capturing groups and both carry a capture
/// index: directly for `CaptureIndex`, and inside the capture name for
/// `CaptureName`.
#[derive(Clone, Debug, Eq, PartialEq)]
#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
pub enum GroupKind {
    /// `(a)`
    CaptureIndex(u32),
    /// `(?<name>a)` or `(?P<name>a)`
    CaptureName {
        /// True if the `?P<` syntax is used and false if the `?<` syntax is used.
        starts_with_p: bool,
        /// The capture name.
        name: CaptureName,
    },
    /// `(?:a)` and `(?i:a)`
    NonCapturing(Flags),
}
1472
/// A capture name.
///
/// This corresponds to the name itself between the angle brackets in, e.g.,
/// `(?P<foo>expr)`.
///
/// `Arbitrary` is not derived for this type. It is implemented by hand when
/// the `arbitrary` feature is enabled, so that generated names are non-empty
/// and made of lowercase ASCII letters only.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct CaptureName {
    /// The span of this capture name.
    pub span: Span,
    /// The capture name.
    pub name: String,
    /// The capture index.
    pub index: u32,
}
1486
#[cfg(feature = "arbitrary")]
impl arbitrary::Arbitrary<'_> for CaptureName {
    /// Build an arbitrary capture name.
    ///
    /// The generated name is never empty and consists of the letters `a`
    /// through `z` only. If a length of zero is drawn, then a
    /// `NotEnoughData` error is returned instead.
    fn arbitrary(
        u: &mut arbitrary::Unstructured,
    ) -> arbitrary::Result<CaptureName> {
        let len = u.arbitrary_len::<char>()?;
        if len == 0 {
            return Err(arbitrary::Error::NotEnoughData);
        }
        let name = (0..len)
            .map(|_| -> arbitrary::Result<char> {
                let raw: char = u.arbitrary()?;
                // The remainder is below 26, so it always fits in a `u8`
                // and adding it to `a` stays within `a..=z`.
                let offset = u8::try_from(u32::from(raw) % 26).unwrap();
                Ok(char::from(b'a' + offset))
            })
            .collect::<arbitrary::Result<String>>()?;
        Ok(CaptureName { span: u.arbitrary()?, name, index: u.arbitrary()? })
    }

    /// Report how many bytes `arbitrary` may consume: a span, a length and
    /// a capture index.
    fn size_hint(depth: usize) -> (usize, Option<usize>) {
        arbitrary::size_hint::and_all(&[
            Span::size_hint(depth),
            usize::size_hint(depth),
            u32::size_hint(depth),
        ])
    }
}
1515
/// A group of flags that is not applied to a particular regular expression.
///
/// This is the representation of flag-only groups, e.g., `(?is)`. Groups
/// that also contain a sub-expression are represented by `Group`.
#[derive(Clone, Debug, Eq, PartialEq)]
#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
pub struct SetFlags {
    /// The span of these flags, including the grouping parentheses.
    pub span: Span,
    /// The actual sequence of flags.
    pub flags: Flags,
}
1525
/// A group of flags.
///
/// This corresponds only to the sequence of flags themselves, e.g., `is-u`.
///
/// The negation operator is stored as an item of its own. Whether a flag is
/// negated is therefore determined by its position relative to that item;
/// see the `flag_state` method.
#[derive(Clone, Debug, Eq, PartialEq)]
#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
pub struct Flags {
    /// The span of this group of flags.
    pub span: Span,
    /// A sequence of flag items. Each item is either a flag or a negation
    /// operator.
    pub items: Vec<FlagsItem>,
}
1538
1539impl Flags {
1540    /// Add the given item to this sequence of flags.
1541    ///
1542    /// If the item was added successfully, then `None` is returned. If the
1543    /// given item is a duplicate, then `Some(i)` is returned, where
1544    /// `items[i].kind == item.kind`.
1545    pub fn add_item(&mut self, item: FlagsItem) -> Option<usize> {
1546        for (i, x) in self.items.iter().enumerate() {
1547            if x.kind == item.kind {
1548                return Some(i);
1549            }
1550        }
1551        self.items.push(item);
1552        None
1553    }
1554
1555    /// Returns the state of the given flag in this set.
1556    ///
1557    /// If the given flag is in the set but is negated, then `Some(false)` is
1558    /// returned.
1559    ///
1560    /// If the given flag is in the set and is not negated, then `Some(true)`
1561    /// is returned.
1562    ///
1563    /// Otherwise, `None` is returned.
1564    pub fn flag_state(&self, flag: Flag) -> Option<bool> {
1565        let mut negated = false;
1566        for x in &self.items {
1567            match x.kind {
1568                FlagsItemKind::Negation => {
1569                    negated = true;
1570                }
1571                FlagsItemKind::Flag(ref xflag) if xflag == &flag => {
1572                    return Some(!negated);
1573                }
1574                _ => {}
1575            }
1576        }
1577        None
1578    }
1579}
1580
/// A single item in a group of flags.
///
/// An item is either a flag or the negation operator, as recorded in `kind`.
#[derive(Clone, Debug, Eq, PartialEq)]
#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
pub struct FlagsItem {
    /// The span of this item.
    pub span: Span,
    /// The kind of this item.
    pub kind: FlagsItemKind,
}
1590
/// The kind of an item in a group of flags.
///
/// e.g., the group of flags `is-u` is made of the flags `i` and `s`, a
/// negation operator and the flag `u`, in that order.
#[derive(Clone, Debug, Eq, PartialEq)]
#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
pub enum FlagsItemKind {
    /// A negation operator applied to all subsequent flags in the enclosing
    /// group.
    Negation,
    /// A single flag in a group.
    Flag(Flag),
}
1601
1602impl FlagsItemKind {
1603    /// Returns true if and only if this item is a negation operator.
1604    pub fn is_negation(&self) -> bool {
1605        match *self {
1606            FlagsItemKind::Negation => true,
1607            _ => false,
1608        }
1609    }
1610}
1611
/// A single flag.
///
/// Each variant is documented with the letter used to write the flag in a
/// pattern. Note that the letter is case sensitive: `U` and `u` are distinct
/// flags.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
pub enum Flag {
    /// `i`
    CaseInsensitive,
    /// `m`
    MultiLine,
    /// `s`
    DotMatchesNewLine,
    /// `U`
    SwapGreed,
    /// `u`
    Unicode,
    /// `R`
    CRLF,
    /// `x`
    IgnoreWhitespace,
}
1631
/// A custom `Drop` impl is used for `Ast` such that it uses constant stack
/// space but heap space proportional to the depth of the `Ast`.
///
/// Instead of letting nested sub-expressions be dropped recursively, they
/// are moved onto an explicit, heap allocated stack and dropped from there
/// one at a time.
impl Drop for Ast {
    fn drop(&mut self) {
        use core::mem;

        // Fast path: if this AST has no sub-expressions that could lead to
        // a deep recursion, return without allocating the stack below. The
        // fields are then dropped as usual.
        match *self {
            Ast::Empty(_)
            | Ast::Flags(_)
            | Ast::Literal(_)
            | Ast::Dot(_)
            | Ast::Assertion(_)
            | Ast::ClassUnicode(_)
            | Ast::ClassPerl(_)
            // Bracketed classes are recursive, they get their own Drop impl.
            | Ast::ClassBracketed(_) => return,
            Ast::Repetition(ref x) if !x.ast.has_subexprs() => return,
            Ast::Group(ref x) if !x.ast.has_subexprs() => return,
            Ast::Alternation(ref x) if x.asts.is_empty() => return,
            Ast::Concat(ref x) if x.asts.is_empty() => return,
            _ => {}
        }

        // Placeholders that are swapped in for whatever gets moved out.
        let empty_span = || Span::splat(Position::new(0, 0, 0));
        let empty_ast = || Ast::empty(empty_span());
        // Take ownership of the entire AST, leaving an empty one in `self`.
        let mut stack = vec![mem::replace(self, empty_ast())];
        while let Some(mut ast) = stack.pop() {
            // Move the children of `ast` onto the stack. `ast` itself is
            // dropped at the end of each iteration, at which point it no
            // longer owns the children that were moved out of it.
            match ast {
                Ast::Empty(_)
                | Ast::Flags(_)
                | Ast::Literal(_)
                | Ast::Dot(_)
                | Ast::Assertion(_)
                | Ast::ClassUnicode(_)
                | Ast::ClassPerl(_)
                // Bracketed classes are recursive, so they get their own Drop
                // impl.
                | Ast::ClassBracketed(_) => {}
                Ast::Repetition(ref mut x) => {
                    stack.push(mem::replace(&mut x.ast, empty_ast()));
                }
                Ast::Group(ref mut x) => {
                    stack.push(mem::replace(&mut x.ast, empty_ast()));
                }
                Ast::Alternation(ref mut x) => {
                    // Draining leaves `asts` empty, so dropping `ast` takes
                    // the `is_empty` fast path above.
                    stack.extend(x.asts.drain(..));
                }
                Ast::Concat(ref mut x) => {
                    stack.extend(x.asts.drain(..));
                }
            }
        }
    }
}
1686
/// A custom `Drop` impl is used for `ClassSet` such that it uses constant
/// stack space but heap space proportional to the depth of the `ClassSet`.
///
/// Instead of letting nested sets be dropped recursively, they are moved
/// onto an explicit, heap allocated stack and dropped from there one at a
/// time.
impl Drop for ClassSet {
    fn drop(&mut self) {
        use core::mem;

        // Fast path: if this set contains no nested sets worth unrolling,
        // return without allocating the stack below. The fields are then
        // dropped as usual.
        match *self {
            ClassSet::Item(ref item) => match *item {
                ClassSetItem::Empty(_)
                | ClassSetItem::Literal(_)
                | ClassSetItem::Range(_)
                | ClassSetItem::Ascii(_)
                | ClassSetItem::Unicode(_)
                | ClassSetItem::Perl(_) => return,
                ClassSetItem::Bracketed(ref x) => {
                    if x.kind.is_empty() {
                        return;
                    }
                }
                ClassSetItem::Union(ref x) => {
                    if x.items.is_empty() {
                        return;
                    }
                }
            },
            ClassSet::BinaryOp(ref op) => {
                // Only an operation whose operands are both empty items
                // is handled by the fast path.
                if op.lhs.is_empty() && op.rhs.is_empty() {
                    return;
                }
            }
        }

        // Placeholders that are swapped in for whatever gets moved out.
        let empty_span = || Span::splat(Position::new(0, 0, 0));
        let empty_set = || ClassSet::Item(ClassSetItem::Empty(empty_span()));
        // Take ownership of the entire set, leaving an empty one in `self`.
        let mut stack = vec![mem::replace(self, empty_set())];
        while let Some(mut set) = stack.pop() {
            // Move the nested sets of `set` onto the stack. `set` itself is
            // dropped at the end of each iteration, at which point it only
            // holds empty placeholders where nested sets used to be.
            match set {
                ClassSet::Item(ref mut item) => match *item {
                    ClassSetItem::Empty(_)
                    | ClassSetItem::Literal(_)
                    | ClassSetItem::Range(_)
                    | ClassSetItem::Ascii(_)
                    | ClassSetItem::Unicode(_)
                    | ClassSetItem::Perl(_) => {}
                    ClassSetItem::Bracketed(ref mut x) => {
                        stack.push(mem::replace(&mut x.kind, empty_set()));
                    }
                    ClassSetItem::Union(ref mut x) => {
                        // Each item is wrapped in a set so that it can be
                        // pushed onto the stack of sets.
                        stack.extend(x.items.drain(..).map(ClassSet::Item));
                    }
                },
                ClassSet::BinaryOp(ref mut op) => {
                    stack.push(mem::replace(&mut op.lhs, empty_set()));
                    stack.push(mem::replace(&mut op.rhs, empty_set()));
                }
            }
        }
    }
}
1746
#[cfg(test)]
mod tests {
    use super::*;

    // Checks that the destructor for `Ast` copes with arbitrarily sized
    // expressions in constant stack space. The check runs on a thread with an
    // explicitly chosen stack size. Since some platforms may not have threads
    // (WASM?), the test is restricted to Windows/Unix.
    #[test]
    #[cfg(any(unix, windows))]
    fn no_stack_overflow_on_drop() {
        use std::thread;

        // Wrap an empty AST in 200 capture groups, one inside the other, and
        // let the result drop at the end of the closure.
        let build_and_drop = || {
            let origin = || Span::splat(Position::new(0, 0, 0));
            let nested =
                (0..200).fold(Ast::empty(origin()), |inner, index| {
                    Ast::group(Group {
                        span: origin(),
                        kind: GroupKind::CaptureIndex(index),
                        ast: Box::new(inner),
                    })
                });
            assert!(!nested.is_empty());
        };

        // A small stack makes an overflow easier to provoke.
        //
        // NOTE(2023-03-21): It turns out that some platforms (like FreeBSD)
        // will just barf with very small stack sizes, so the size was bumped
        // up a bit to give more room to breathe. At that time it was
        // confirmed that removing the custom `Drop` impl for `Ast` still
        // makes this test fail with a stack overflow. (At the time of
        // writing, the stack had to go all the way up to 32K before the test
        // would pass even without the custom `Drop` impl. So 16K seems like
        // a safe number here.)
        //
        // See: https://github.com/rust-lang/regex/issues/967
        let worker = thread::Builder::new()
            .stack_size(16 << 10)
            .spawn(build_and_drop)
            .unwrap();
        worker.join().unwrap();
    }

    // Guards against `Ast` growing unreasonably large. The bound is not a
    // hard rule and may be raised given a good enough reason. It exists
    // because the size of `Ast` was at one point over 200 bytes on a 64-bit
    // target. Wow.
    #[test]
    fn ast_size() {
        use core::mem::size_of;

        let limit = 2 * size_of::<usize>();
        let actual = size_of::<Ast>();
        assert!(
            actual <= limit,
            "Ast size of {} bytes is bigger than suggested max {}",
            actual,
            limit
        );
    }
}
regex_syntax/ast/mod.rs

regex_syntax/ast/
mod.rs