regex_syntax/ast/
parse.rs

1/*!
2This module provides a regular expression parser.
3*/
4
5use core::{
6    borrow::Borrow,
7    cell::{Cell, RefCell},
8    mem,
9};
10
11use alloc::{
12    boxed::Box,
13    string::{String, ToString},
14    vec,
15    vec::Vec,
16};
17
18use crate::{
19    ast::{self, Ast, Position, Span},
20    either::Either,
21    is_escapeable_character, is_meta_character,
22};
23
24type Result<T> = core::result::Result<T, ast::Error>;
25
/// A primitive is an expression with no sub-expressions. This includes
/// literals, assertions and non-set character classes. This representation
/// is used as intermediate state in the parser.
///
/// This does not include ASCII character classes, since they can only appear
/// within a set character class.
#[derive(Clone, Debug, Eq, PartialEq)]
enum Primitive {
    /// A literal, carried as an `ast::Literal`.
    Literal(ast::Literal),
    /// An assertion, carried as an `ast::Assertion`.
    Assertion(ast::Assertion),
    /// A dot. Only its span is recorded.
    Dot(Span),
    /// A Perl character class, carried as an `ast::ClassPerl`.
    Perl(ast::ClassPerl),
    /// A Unicode character class, carried as an `ast::ClassUnicode`.
    Unicode(ast::ClassUnicode),
}
40
41impl Primitive {
42    /// Return the span of this primitive.
43    fn span(&self) -> &Span {
44        match *self {
45            Primitive::Literal(ref x) => &x.span,
46            Primitive::Assertion(ref x) => &x.span,
47            Primitive::Dot(ref span) => span,
48            Primitive::Perl(ref x) => &x.span,
49            Primitive::Unicode(ref x) => &x.span,
50        }
51    }
52
53    /// Convert this primitive into a proper AST.
54    fn into_ast(self) -> Ast {
55        match self {
56            Primitive::Literal(lit) => Ast::literal(lit),
57            Primitive::Assertion(assert) => Ast::assertion(assert),
58            Primitive::Dot(span) => Ast::dot(span),
59            Primitive::Perl(cls) => Ast::class_perl(cls),
60            Primitive::Unicode(cls) => Ast::class_unicode(cls),
61        }
62    }
63
64    /// Convert this primitive into an item in a character class.
65    ///
66    /// If this primitive is not a legal item (i.e., an assertion or a dot),
67    /// then return an error.
68    fn into_class_set_item<P: Borrow<Parser>>(
69        self,
70        p: &ParserI<'_, P>,
71    ) -> Result<ast::ClassSetItem> {
72        use self::Primitive::*;
73        use crate::ast::ClassSetItem;
74
75        match self {
76            Literal(lit) => Ok(ClassSetItem::Literal(lit)),
77            Perl(cls) => Ok(ClassSetItem::Perl(cls)),
78            Unicode(cls) => Ok(ClassSetItem::Unicode(cls)),
79            x => Err(p.error(*x.span(), ast::ErrorKind::ClassEscapeInvalid)),
80        }
81    }
82
83    /// Convert this primitive into a literal in a character class. In
84    /// particular, literals are the only valid items that can appear in
85    /// ranges.
86    ///
87    /// If this primitive is not a legal item (i.e., a class, assertion or a
88    /// dot), then return an error.
89    fn into_class_literal<P: Borrow<Parser>>(
90        self,
91        p: &ParserI<'_, P>,
92    ) -> Result<ast::Literal> {
93        use self::Primitive::*;
94
95        match self {
96            Literal(lit) => Ok(lit),
97            x => Err(p.error(*x.span(), ast::ErrorKind::ClassRangeLiteral)),
98        }
99    }
100}
101
/// Returns true if the given character is an ASCII hexadecimal digit, i.e.,
/// one of `0-9`, `a-f` or `A-F`.
fn is_hex(c: char) -> bool {
    c.is_ascii_hexdigit()
}
106
/// Reports whether `c` may appear in a capture group name.
///
/// When `first` is true, `c` is judged as the leading character of the name,
/// which must be an underscore or an alphabetic character. At any later
/// position, `.`, `[`, `]` and any alphanumeric character are accepted as
/// well.
fn is_capture_char(c: char, first: bool) -> bool {
    match c {
        '_' => true,
        // Punctuation that is only legal after the first character.
        '.' | '[' | ']' => !first,
        _ if first => c.is_alphabetic(),
        _ => c.is_alphanumeric(),
    }
}
118
/// A builder for a regular expression parser.
///
/// This builder permits modifying configuration options for the parser.
#[derive(Clone, Debug)]
pub struct ParserBuilder {
    /// Whether built parsers start out in verbose (`x`) mode.
    ignore_whitespace: bool,
    /// The nesting limit handed to built parsers.
    nest_limit: u32,
    /// Whether built parsers support octal syntax.
    octal: bool,
    /// Whether built parsers accept `{,n}` as an equivalent to `{0,n}`.
    empty_min_range: bool,
}
129
130impl Default for ParserBuilder {
131    fn default() -> ParserBuilder {
132        ParserBuilder::new()
133    }
134}
135
136impl ParserBuilder {
137    /// Create a new parser builder with a default configuration.
138    pub fn new() -> ParserBuilder {
139        ParserBuilder {
140            ignore_whitespace: false,
141            nest_limit: 250,
142            octal: false,
143            empty_min_range: false,
144        }
145    }
146
147    /// Build a parser from this configuration with the given pattern.
148    pub fn build(&self) -> Parser {
149        Parser {
150            pos: Cell::new(Position { offset: 0, line: 1, column: 1 }),
151            capture_index: Cell::new(0),
152            nest_limit: self.nest_limit,
153            octal: self.octal,
154            empty_min_range: self.empty_min_range,
155            initial_ignore_whitespace: self.ignore_whitespace,
156            ignore_whitespace: Cell::new(self.ignore_whitespace),
157            comments: RefCell::new(vec![]),
158            stack_group: RefCell::new(vec![]),
159            stack_class: RefCell::new(vec![]),
160            capture_names: RefCell::new(vec![]),
161            scratch: RefCell::new(String::new()),
162        }
163    }
164
165    /// Set the nesting limit for this parser.
166    ///
167    /// The nesting limit controls how deep the abstract syntax tree is allowed
168    /// to be. If the AST exceeds the given limit (e.g., with too many nested
169    /// groups), then an error is returned by the parser.
170    ///
171    /// The purpose of this limit is to act as a heuristic to prevent stack
172    /// overflow for consumers that do structural induction on an `Ast` using
173    /// explicit recursion. While this crate never does this (instead using
174    /// constant stack space and moving the call stack to the heap), other
175    /// crates may.
176    ///
177    /// This limit is not checked until the entire AST is parsed. Therefore,
178    /// if callers want to put a limit on the amount of heap space used, then
179    /// they should impose a limit on the length, in bytes, of the concrete
180    /// pattern string. In particular, this is viable since this parser
181    /// implementation will limit itself to heap space proportional to the
182    /// length of the pattern string.
183    ///
184    /// Note that a nest limit of `0` will return a nest limit error for most
185    /// patterns but not all. For example, a nest limit of `0` permits `a` but
186    /// not `ab`, since `ab` requires a concatenation, which results in a nest
187    /// depth of `1`. In general, a nest limit is not something that manifests
188    /// in an obvious way in the concrete syntax, therefore, it should not be
189    /// used in a granular way.
190    pub fn nest_limit(&mut self, limit: u32) -> &mut ParserBuilder {
191        self.nest_limit = limit;
192        self
193    }
194
195    /// Whether to support octal syntax or not.
196    ///
197    /// Octal syntax is a little-known way of uttering Unicode codepoints in
198    /// a regular expression. For example, `a`, `\x61`, `\u0061` and
199    /// `\141` are all equivalent regular expressions, where the last example
200    /// shows octal syntax.
201    ///
202    /// While supporting octal syntax isn't in and of itself a problem, it does
203    /// make good error messages harder. That is, in PCRE based regex engines,
204    /// syntax like `\0` invokes a backreference, which is explicitly
205    /// unsupported in Rust's regex engine. However, many users expect it to
206    /// be supported. Therefore, when octal support is disabled, the error
207    /// message will explicitly mention that backreferences aren't supported.
208    ///
209    /// Octal syntax is disabled by default.
210    pub fn octal(&mut self, yes: bool) -> &mut ParserBuilder {
211        self.octal = yes;
212        self
213    }
214
215    /// Enable verbose mode in the regular expression.
216    ///
217    /// When enabled, verbose mode permits insignificant whitespace in many
218    /// places in the regular expression, as well as comments. Comments are
219    /// started using `#` and continue until the end of the line.
220    ///
221    /// By default, this is disabled. It may be selectively enabled in the
222    /// regular expression by using the `x` flag regardless of this setting.
223    pub fn ignore_whitespace(&mut self, yes: bool) -> &mut ParserBuilder {
224        self.ignore_whitespace = yes;
225        self
226    }
227
228    /// Allow using `{,n}` as an equivalent to `{0,n}`.
229    ///
230    /// When enabled, the parser accepts `{,n}` as valid syntax for `{0,n}`.
231    /// Most regular expression engines don't support the `{,n}` syntax, but
232    /// some others do it, namely Python's `re` library.
233    ///
234    /// This is disabled by default.
235    pub fn empty_min_range(&mut self, yes: bool) -> &mut ParserBuilder {
236        self.empty_min_range = yes;
237        self
238    }
239}
240
/// A regular expression parser.
///
/// This parses a string representation of a regular expression into an
/// abstract syntax tree. The size of the tree is proportional to the length
/// of the regular expression pattern.
///
/// A `Parser` can be configured in more detail via a [`ParserBuilder`].
#[derive(Clone, Debug)]
pub struct Parser {
    /// The current position of the parser.
    pos: Cell<Position>,
    /// The current capture index.
    capture_index: Cell<u32>,
    /// The maximum number of open parens/brackets allowed. If the parser
    /// exceeds this number, then an error is returned.
    nest_limit: u32,
    /// Whether to support octal syntax or not. When `false`, the parser will
    /// return an error helpfully pointing out that backreferences are not
    /// supported.
    octal: bool,
    /// The initial setting for `ignore_whitespace` as provided by
    /// `ParserBuilder`. It is used when resetting the parser's state.
    initial_ignore_whitespace: bool,
    /// Whether the parser supports `{,n}` repetitions as an equivalent to
    /// `{0,n}`.
    empty_min_range: bool,
    /// Whether whitespace should be ignored. When enabled, comments are
    /// also permitted.
    ignore_whitespace: Cell<bool>,
    /// A list of comments, in order of appearance.
    comments: RefCell<Vec<ast::Comment>>,
    /// A stack of grouped sub-expressions, including alternations.
    stack_group: RefCell<Vec<GroupState>>,
    /// A stack of nested character classes. This is only non-empty when
    /// parsing a class.
    stack_class: RefCell<Vec<ClassState>>,
    /// A sorted sequence of capture names. This is used to detect duplicate
    /// capture names and report an error if one is detected.
    capture_names: RefCell<Vec<ast::CaptureName>>,
    /// A scratch buffer used in various places. Mostly this is used to
    /// accumulate relevant characters from parts of a pattern.
    scratch: RefCell<String>,
}
284
/// ParserI is the internal parser implementation.
///
/// We use this separate type so that we can carry the provided pattern string
/// along with us. In particular, a `Parser`'s internal state is not tied to
/// any one pattern, but a `ParserI` is.
///
/// This type also lets us use `ParserI<&Parser>` in production code while
/// retaining the convenience of `ParserI<Parser>` for tests, which sometimes
/// work against the internal interface of the parser.
#[derive(Clone, Debug)]
struct ParserI<'s, P> {
    /// The parser state/configuration.
    parser: P,
    /// The full regular expression provided by the user.
    pattern: &'s str,
}
301
/// GroupState represents a single stack frame while parsing nested groups
/// and alternations. Each frame records the state up to an opening parenthesis
/// or an alternation operator `|`.
#[derive(Clone, Debug)]
enum GroupState {
    /// This state is pushed whenever an opening group is found.
    Group {
        /// The concatenation immediately preceding the opening group.
        concat: ast::Concat,
        /// The group that has been opened. Its sub-AST is always empty.
        group: ast::Group,
        /// Whether this group has the `x` flag enabled or not.
        ignore_whitespace: bool,
    },
    /// This state is pushed whenever a new alternation branch is found. If
    /// an alternation branch is found and this state is at the top of the
    /// stack, then this state should be modified to include the new
    /// alternation.
    Alternation(ast::Alternation),
}
322
/// ClassState represents a single stack frame while parsing character classes.
/// Each frame records the state up to an intersection, difference, symmetric
/// difference or nested class.
///
/// Note that a parser's character class stack is only non-empty when parsing
/// a character class. In all other cases, it is empty.
#[derive(Clone, Debug)]
enum ClassState {
    /// This state is pushed whenever an opening bracket is found.
    Open {
        /// The union of class items immediately preceding this class.
        union: ast::ClassSetUnion,
        /// The class that has been opened. Typically this just corresponds
        /// to the `[`, but it can also include `[^` since `^` indicates
        /// negation of the class.
        set: ast::ClassBracketed,
    },
    /// This state is pushed when an operator is seen. When popped, the stored
    /// set becomes the left hand side of the operator.
    Op {
        /// The type of the operation, i.e., &&, -- or ~~.
        kind: ast::ClassSetBinaryOpKind,
        /// The left-hand side of the operator.
        lhs: ast::ClassSet,
    },
}
349
350impl Parser {
351    /// Create a new parser with a default configuration.
352    ///
353    /// The parser can be run with either the `parse` or `parse_with_comments`
354    /// methods. The parse methods return an abstract syntax tree.
355    ///
356    /// To set configuration options on the parser, use [`ParserBuilder`].
357    pub fn new() -> Parser {
358        ParserBuilder::new().build()
359    }
360
361    /// Parse the regular expression into an abstract syntax tree.
362    pub fn parse(&mut self, pattern: &str) -> Result<Ast> {
363        ParserI::new(self, pattern).parse()
364    }
365
366    /// Parse the regular expression and return an abstract syntax tree with
367    /// all of the comments found in the pattern.
368    pub fn parse_with_comments(
369        &mut self,
370        pattern: &str,
371    ) -> Result<ast::WithComments> {
372        ParserI::new(self, pattern).parse_with_comments()
373    }
374
375    /// Reset the internal state of a parser.
376    ///
377    /// This is called at the beginning of every parse. This prevents the
378    /// parser from running with inconsistent state (say, if a previous
379    /// invocation returned an error and the parser is reused).
380    fn reset(&self) {
381        // These settings should be in line with the construction
382        // in `ParserBuilder::build`.
383        self.pos.set(Position { offset: 0, line: 1, column: 1 });
384        self.ignore_whitespace.set(self.initial_ignore_whitespace);
385        self.comments.borrow_mut().clear();
386        self.stack_group.borrow_mut().clear();
387        self.stack_class.borrow_mut().clear();
388    }
389}
390
391impl<'s, P: Borrow<Parser>> ParserI<'s, P> {
392    /// Build an internal parser from a parser configuration and a pattern.
393    fn new(parser: P, pattern: &'s str) -> ParserI<'s, P> {
394        ParserI { parser, pattern }
395    }
396
397    /// Return a reference to the parser state.
398    fn parser(&self) -> &Parser {
399        self.parser.borrow()
400    }
401
402    /// Return a reference to the pattern being parsed.
403    fn pattern(&self) -> &str {
404        self.pattern
405    }
406
407    /// Create a new error with the given span and error type.
408    fn error(&self, span: Span, kind: ast::ErrorKind) -> ast::Error {
409        ast::Error { kind, pattern: self.pattern().to_string(), span }
410    }
411
412    /// Return the current offset of the parser.
413    ///
414    /// The offset starts at `0` from the beginning of the regular expression
415    /// pattern string.
416    fn offset(&self) -> usize {
417        self.parser().pos.get().offset
418    }
419
420    /// Return the current line number of the parser.
421    ///
422    /// The line number starts at `1`.
423    fn line(&self) -> usize {
424        self.parser().pos.get().line
425    }
426
427    /// Return the current column of the parser.
428    ///
429    /// The column number starts at `1` and is reset whenever a `\n` is seen.
430    fn column(&self) -> usize {
431        self.parser().pos.get().column
432    }
433
434    /// Return the next capturing index. Each subsequent call increments the
435    /// internal index.
436    ///
437    /// The span given should correspond to the location of the opening
438    /// parenthesis.
439    ///
440    /// If the capture limit is exceeded, then an error is returned.
441    fn next_capture_index(&self, span: Span) -> Result<u32> {
442        let current = self.parser().capture_index.get();
443        let i = current.checked_add(1).ok_or_else(|| {
444            self.error(span, ast::ErrorKind::CaptureLimitExceeded)
445        })?;
446        self.parser().capture_index.set(i);
447        Ok(i)
448    }
449
450    /// Adds the given capture name to this parser. If this capture name has
451    /// already been used, then an error is returned.
452    fn add_capture_name(&self, cap: &ast::CaptureName) -> Result<()> {
453        let mut names = self.parser().capture_names.borrow_mut();
454        match names
455            .binary_search_by_key(&cap.name.as_str(), |c| c.name.as_str())
456        {
457            Err(i) => {
458                names.insert(i, cap.clone());
459                Ok(())
460            }
461            Ok(i) => Err(self.error(
462                cap.span,
463                ast::ErrorKind::GroupNameDuplicate { original: names[i].span },
464            )),
465        }
466    }
467
468    /// Return whether the parser should ignore whitespace or not.
469    fn ignore_whitespace(&self) -> bool {
470        self.parser().ignore_whitespace.get()
471    }
472
473    /// Return the character at the current position of the parser.
474    ///
475    /// This panics if the current position does not point to a valid char.
476    fn char(&self) -> char {
477        self.char_at(self.offset())
478    }
479
480    /// Return the character at the given position.
481    ///
482    /// This panics if the given position does not point to a valid char.
483    fn char_at(&self, i: usize) -> char {
484        self.pattern()[i..]
485            .chars()
486            .next()
487            .unwrap_or_else(|| panic!("expected char at offset {}", i))
488    }
489
490    /// Bump the parser to the next Unicode scalar value.
491    ///
492    /// If the end of the input has been reached, then `false` is returned.
493    fn bump(&self) -> bool {
494        if self.is_eof() {
495            return false;
496        }
497        let Position { mut offset, mut line, mut column } = self.pos();
498        if self.char() == '\n' {
499            line = line.checked_add(1).unwrap();
500            column = 1;
501        } else {
502            column = column.checked_add(1).unwrap();
503        }
504        offset += self.char().len_utf8();
505        self.parser().pos.set(Position { offset, line, column });
506        self.pattern()[self.offset()..].chars().next().is_some()
507    }
508
509    /// If the substring starting at the current position of the parser has
510    /// the given prefix, then bump the parser to the character immediately
511    /// following the prefix and return true. Otherwise, don't bump the parser
512    /// and return false.
513    fn bump_if(&self, prefix: &str) -> bool {
514        if self.pattern()[self.offset()..].starts_with(prefix) {
515            for _ in 0..prefix.chars().count() {
516                self.bump();
517            }
518            true
519        } else {
520            false
521        }
522    }
523
524    /// Returns true if and only if the parser is positioned at a look-around
525    /// prefix. The conditions under which this returns true must always
526    /// correspond to a regular expression that would otherwise be consider
527    /// invalid.
528    ///
529    /// This should only be called immediately after parsing the opening of
530    /// a group or a set of flags.
531    fn is_lookaround_prefix(&self) -> bool {
532        self.bump_if("?=")
533            || self.bump_if("?!")
534            || self.bump_if("?<=")
535            || self.bump_if("?<!")
536    }
537
538    /// Bump the parser, and if the `x` flag is enabled, bump through any
539    /// subsequent spaces. Return true if and only if the parser is not at
540    /// EOF.
541    fn bump_and_bump_space(&self) -> bool {
542        if !self.bump() {
543            return false;
544        }
545        self.bump_space();
546        !self.is_eof()
547    }
548
549    /// If the `x` flag is enabled (i.e., whitespace insensitivity with
550    /// comments), then this will advance the parser through all whitespace
551    /// and comments to the next non-whitespace non-comment byte.
552    ///
553    /// If the `x` flag is disabled, then this is a no-op.
554    ///
555    /// This should be used selectively throughout the parser where
556    /// arbitrary whitespace is permitted when the `x` flag is enabled. For
557    /// example, `{   5  , 6}` is equivalent to `{5,6}`.
558    fn bump_space(&self) {
559        if !self.ignore_whitespace() {
560            return;
561        }
562        while !self.is_eof() {
563            if self.char().is_whitespace() {
564                self.bump();
565            } else if self.char() == '#' {
566                let start = self.pos();
567                let mut comment_text = String::new();
568                self.bump();
569                while !self.is_eof() {
570                    let c = self.char();
571                    self.bump();
572                    if c == '\n' {
573                        break;
574                    }
575                    comment_text.push(c);
576                }
577                let comment = ast::Comment {
578                    span: Span::new(start, self.pos()),
579                    comment: comment_text,
580                };
581                self.parser().comments.borrow_mut().push(comment);
582            } else {
583                break;
584            }
585        }
586    }
587
588    /// Peek at the next character in the input without advancing the parser.
589    ///
590    /// If the input has been exhausted, then this returns `None`.
591    fn peek(&self) -> Option<char> {
592        if self.is_eof() {
593            return None;
594        }
595        self.pattern()[self.offset() + self.char().len_utf8()..].chars().next()
596    }
597
598    /// Like peek, but will ignore spaces when the parser is in whitespace
599    /// insensitive mode.
600    fn peek_space(&self) -> Option<char> {
601        if !self.ignore_whitespace() {
602            return self.peek();
603        }
604        if self.is_eof() {
605            return None;
606        }
607        let mut start = self.offset() + self.char().len_utf8();
608        let mut in_comment = false;
609        for (i, c) in self.pattern()[start..].char_indices() {
610            if c.is_whitespace() {
611                continue;
612            } else if !in_comment && c == '#' {
613                in_comment = true;
614            } else if in_comment && c == '\n' {
615                in_comment = false;
616            } else {
617                start += i;
618                break;
619            }
620        }
621        self.pattern()[start..].chars().next()
622    }
623
624    /// Returns true if the next call to `bump` would return false.
625    fn is_eof(&self) -> bool {
626        self.offset() == self.pattern().len()
627    }
628
629    /// Return the current position of the parser, which includes the offset,
630    /// line and column.
631    fn pos(&self) -> Position {
632        self.parser().pos.get()
633    }
634
635    /// Create a span at the current position of the parser. Both the start
636    /// and end of the span are set.
637    fn span(&self) -> Span {
638        Span::splat(self.pos())
639    }
640
641    /// Create a span that covers the current character.
642    fn span_char(&self) -> Span {
643        let mut next = Position {
644            offset: self.offset().checked_add(self.char().len_utf8()).unwrap(),
645            line: self.line(),
646            column: self.column().checked_add(1).unwrap(),
647        };
648        if self.char() == '\n' {
649            next.line += 1;
650            next.column = 1;
651        }
652        Span::new(self.pos(), next)
653    }
654
655    /// Parse and push a single alternation on to the parser's internal stack.
656    /// If the top of the stack already has an alternation, then add to that
657    /// instead of pushing a new one.
658    ///
659    /// The concatenation given corresponds to a single alternation branch.
660    /// The concatenation returned starts the next branch and is empty.
661    ///
662    /// This assumes the parser is currently positioned at `|` and will advance
663    /// the parser to the character following `|`.
664    #[inline(never)]
665    fn push_alternate(&self, mut concat: ast::Concat) -> Result<ast::Concat> {
666        assert_eq!(self.char(), '|');
667        concat.span.end = self.pos();
668        self.push_or_add_alternation(concat);
669        self.bump();
670        Ok(ast::Concat { span: self.span(), asts: vec![] })
671    }
672
673    /// Pushes or adds the given branch of an alternation to the parser's
674    /// internal stack of state.
675    fn push_or_add_alternation(&self, concat: ast::Concat) {
676        use self::GroupState::*;
677
678        let mut stack = self.parser().stack_group.borrow_mut();
679        if let Some(&mut Alternation(ref mut alts)) = stack.last_mut() {
680            alts.asts.push(concat.into_ast());
681            return;
682        }
683        stack.push(Alternation(ast::Alternation {
684            span: Span::new(concat.span.start, self.pos()),
685            asts: vec![concat.into_ast()],
686        }));
687    }
688
689    /// Parse and push a group AST (and its parent concatenation) on to the
690    /// parser's internal stack. Return a fresh concatenation corresponding
691    /// to the group's sub-AST.
692    ///
693    /// If a set of flags was found (with no group), then the concatenation
694    /// is returned with that set of flags added.
695    ///
696    /// This assumes that the parser is currently positioned on the opening
697    /// parenthesis. It advances the parser to the character at the start
698    /// of the sub-expression (or adjoining expression).
699    ///
700    /// If there was a problem parsing the start of the group, then an error
701    /// is returned.
702    #[inline(never)]
703    fn push_group(&self, mut concat: ast::Concat) -> Result<ast::Concat> {
704        assert_eq!(self.char(), '(');
705        match self.parse_group()? {
706            Either::Left(set) => {
707                let ignore = set.flags.flag_state(ast::Flag::IgnoreWhitespace);
708                if let Some(v) = ignore {
709                    self.parser().ignore_whitespace.set(v);
710                }
711
712                concat.asts.push(Ast::flags(set));
713                Ok(concat)
714            }
715            Either::Right(group) => {
716                let old_ignore_whitespace = self.ignore_whitespace();
717                let new_ignore_whitespace = group
718                    .flags()
719                    .and_then(|f| f.flag_state(ast::Flag::IgnoreWhitespace))
720                    .unwrap_or(old_ignore_whitespace);
721                self.parser().stack_group.borrow_mut().push(
722                    GroupState::Group {
723                        concat,
724                        group,
725                        ignore_whitespace: old_ignore_whitespace,
726                    },
727                );
728                self.parser().ignore_whitespace.set(new_ignore_whitespace);
729                Ok(ast::Concat { span: self.span(), asts: vec![] })
730            }
731        }
732    }
733
734    /// Pop a group AST from the parser's internal stack and set the group's
735    /// AST to the given concatenation. Return the concatenation containing
736    /// the group.
737    ///
738    /// This assumes that the parser is currently positioned on the closing
739    /// parenthesis and advances the parser to the character following the `)`.
740    ///
741    /// If no such group could be popped, then an unopened group error is
742    /// returned.
743    #[inline(never)]
744    fn pop_group(&self, mut group_concat: ast::Concat) -> Result<ast::Concat> {
745        use self::GroupState::*;
746
747        assert_eq!(self.char(), ')');
748        let mut stack = self.parser().stack_group.borrow_mut();
749        let (mut prior_concat, mut group, ignore_whitespace, alt) = match stack
750            .pop()
751        {
752            Some(Group { concat, group, ignore_whitespace }) => {
753                (concat, group, ignore_whitespace, None)
754            }
755            Some(Alternation(alt)) => match stack.pop() {
756                Some(Group { concat, group, ignore_whitespace }) => {
757                    (concat, group, ignore_whitespace, Some(alt))
758                }
759                None | Some(Alternation(_)) => {
760                    return Err(self.error(
761                        self.span_char(),
762                        ast::ErrorKind::GroupUnopened,
763                    ));
764                }
765            },
766            None => {
767                return Err(self
768                    .error(self.span_char(), ast::ErrorKind::GroupUnopened));
769            }
770        };
771        self.parser().ignore_whitespace.set(ignore_whitespace);
772        group_concat.span.end = self.pos();
773        self.bump();
774        group.span.end = self.pos();
775        match alt {
776            Some(mut alt) => {
777                alt.span.end = group_concat.span.end;
778                alt.asts.push(group_concat.into_ast());
779                group.ast = Box::new(alt.into_ast());
780            }
781            None => {
782                group.ast = Box::new(group_concat.into_ast());
783            }
784        }
785        prior_concat.asts.push(Ast::group(group));
786        Ok(prior_concat)
787    }
788
789    /// Pop the last state from the parser's internal stack, if it exists, and
790    /// add the given concatenation to it. There either must be no state or a
791    /// single alternation item on the stack. Any other scenario produces an
792    /// error.
793    ///
794    /// This assumes that the parser has advanced to the end.
795    #[inline(never)]
796    fn pop_group_end(&self, mut concat: ast::Concat) -> Result<Ast> {
797        concat.span.end = self.pos();
798        let mut stack = self.parser().stack_group.borrow_mut();
799        let ast = match stack.pop() {
800            None => Ok(concat.into_ast()),
801            Some(GroupState::Alternation(mut alt)) => {
802                alt.span.end = self.pos();
803                alt.asts.push(concat.into_ast());
804                Ok(Ast::alternation(alt))
805            }
806            Some(GroupState::Group { group, .. }) => {
807                return Err(
808                    self.error(group.span, ast::ErrorKind::GroupUnclosed)
809                );
810            }
811        };
812        // If we try to pop again, there should be nothing.
813        match stack.pop() {
814            None => ast,
815            Some(GroupState::Alternation(_)) => {
816                // This unreachable is unfortunate. This case can't happen
817                // because the only way we can be here is if there were two
818                // `GroupState::Alternation`s adjacent in the parser's stack,
819                // which we guarantee to never happen because we never push a
820                // `GroupState::Alternation` if one is already at the top of
821                // the stack.
822                unreachable!()
823            }
824            Some(GroupState::Group { group, .. }) => {
825                Err(self.error(group.span, ast::ErrorKind::GroupUnclosed))
826            }
827        }
828    }
829
830    /// Parse the opening of a character class and push the current class
831    /// parsing context onto the parser's stack. This assumes that the parser
832    /// is positioned at an opening `[`. The given union should correspond to
833    /// the union of set items built up before seeing the `[`.
834    ///
835    /// If there was a problem parsing the opening of the class, then an error
836    /// is returned. Otherwise, a new union of set items for the class is
837    /// returned (which may be populated with either a `]` or a `-`).
838    #[inline(never)]
839    fn push_class_open(
840        &self,
841        parent_union: ast::ClassSetUnion,
842    ) -> Result<ast::ClassSetUnion> {
843        assert_eq!(self.char(), '[');
844
845        let (nested_set, nested_union) = self.parse_set_class_open()?;
846        self.parser()
847            .stack_class
848            .borrow_mut()
849            .push(ClassState::Open { union: parent_union, set: nested_set });
850        Ok(nested_union)
851    }
852
853    /// Parse the end of a character class set and pop the character class
854    /// parser stack. The union given corresponds to the last union built
855    /// before seeing the closing `]`. The union returned corresponds to the
856    /// parent character class set with the nested class added to it.
857    ///
858    /// This assumes that the parser is positioned at a `]` and will advance
859    /// the parser to the byte immediately following the `]`.
860    ///
861    /// If the stack is empty after popping, then this returns the final
862    /// "top-level" character class AST (where a "top-level" character class
863    /// is one that is not nested inside any other character class).
864    ///
865    /// If there is no corresponding opening bracket on the parser's stack,
866    /// then an error is returned.
867    #[inline(never)]
868    fn pop_class(
869        &self,
870        nested_union: ast::ClassSetUnion,
871    ) -> Result<Either<ast::ClassSetUnion, ast::ClassBracketed>> {
872        assert_eq!(self.char(), ']');
873
874        let item = ast::ClassSet::Item(nested_union.into_item());
875        let prevset = self.pop_class_op(item);
876        let mut stack = self.parser().stack_class.borrow_mut();
877        match stack.pop() {
878            None => {
879                // We can never observe an empty stack:
880                //
881                // 1) We are guaranteed to start with a non-empty stack since
882                //    the character class parser is only initiated when it sees
883                //    a `[`.
884                // 2) If we ever observe an empty stack while popping after
885                //    seeing a `]`, then we signal the character class parser
886                //    to terminate.
887                panic!("unexpected empty character class stack")
888            }
889            Some(ClassState::Op { .. }) => {
890                // This panic is unfortunate, but this case is impossible
891                // since we already popped the Op state if one exists above.
892                // Namely, every push to the class parser stack is guarded by
893                // whether an existing Op is already on the top of the stack.
894                // If it is, the existing Op is modified. That is, the stack
895                // can never have consecutive Op states.
896                panic!("unexpected ClassState::Op")
897            }
898            Some(ClassState::Open { mut union, mut set }) => {
899                self.bump();
900                set.span.end = self.pos();
901                set.kind = prevset;
902                if stack.is_empty() {
903                    Ok(Either::Right(set))
904                } else {
905                    union.push(ast::ClassSetItem::Bracketed(Box::new(set)));
906                    Ok(Either::Left(union))
907                }
908            }
909        }
910    }
911
912    /// Return an "unclosed class" error whose span points to the most
913    /// recently opened class.
914    ///
915    /// This should only be called while parsing a character class.
916    #[inline(never)]
917    fn unclosed_class_error(&self) -> ast::Error {
918        for state in self.parser().stack_class.borrow().iter().rev() {
919            if let ClassState::Open { ref set, .. } = *state {
920                return self.error(set.span, ast::ErrorKind::ClassUnclosed);
921            }
922        }
923        // We are guaranteed to have a non-empty stack with at least
924        // one open bracket, so we should never get here.
925        panic!("no open character class found")
926    }
927
928    /// Push the current set of class items on to the class parser's stack as
929    /// the left hand side of the given operator.
930    ///
931    /// A fresh set union is returned, which should be used to build the right
932    /// hand side of this operator.
933    #[inline(never)]
934    fn push_class_op(
935        &self,
936        next_kind: ast::ClassSetBinaryOpKind,
937        next_union: ast::ClassSetUnion,
938    ) -> ast::ClassSetUnion {
939        let item = ast::ClassSet::Item(next_union.into_item());
940        let new_lhs = self.pop_class_op(item);
941        self.parser()
942            .stack_class
943            .borrow_mut()
944            .push(ClassState::Op { kind: next_kind, lhs: new_lhs });
945        ast::ClassSetUnion { span: self.span(), items: vec![] }
946    }
947
948    /// Pop a character class set from the character class parser stack. If the
949    /// top of the stack is just an item (not an operation), then return the
950    /// given set unchanged. If the top of the stack is an operation, then the
951    /// given set will be used as the rhs of the operation on the top of the
952    /// stack. In that case, the binary operation is returned as a set.
953    #[inline(never)]
954    fn pop_class_op(&self, rhs: ast::ClassSet) -> ast::ClassSet {
955        let mut stack = self.parser().stack_class.borrow_mut();
956        let (kind, lhs) = match stack.pop() {
957            Some(ClassState::Op { kind, lhs }) => (kind, lhs),
958            Some(state @ ClassState::Open { .. }) => {
959                stack.push(state);
960                return rhs;
961            }
962            None => unreachable!(),
963        };
964        let span = Span::new(lhs.span().start, rhs.span().end);
965        ast::ClassSet::BinaryOp(ast::ClassSetBinaryOp {
966            span,
967            kind,
968            lhs: Box::new(lhs),
969            rhs: Box::new(rhs),
970        })
971    }
972}
973
974impl<'s, P: Borrow<Parser>> ParserI<'s, P> {
975    /// Parse the regular expression into an abstract syntax tree.
976    fn parse(&self) -> Result<Ast> {
977        self.parse_with_comments().map(|astc| astc.ast)
978    }
979
980    /// Parse the regular expression and return an abstract syntax tree with
981    /// all of the comments found in the pattern.
982    fn parse_with_comments(&self) -> Result<ast::WithComments> {
983        assert_eq!(self.offset(), 0, "parser can only be used once");
984        self.parser().reset();
985        let mut concat = ast::Concat { span: self.span(), asts: vec![] };
986        loop {
987            self.bump_space();
988            if self.is_eof() {
989                break;
990            }
991            match self.char() {
992                '(' => concat = self.push_group(concat)?,
993                ')' => concat = self.pop_group(concat)?,
994                '|' => concat = self.push_alternate(concat)?,
995                '[' => {
996                    let class = self.parse_set_class()?;
997                    concat.asts.push(Ast::class_bracketed(class));
998                }
999                '?' => {
1000                    concat = self.parse_uncounted_repetition(
1001                        concat,
1002                        ast::RepetitionKind::ZeroOrOne,
1003                    )?;
1004                }
1005                '*' => {
1006                    concat = self.parse_uncounted_repetition(
1007                        concat,
1008                        ast::RepetitionKind::ZeroOrMore,
1009                    )?;
1010                }
1011                '+' => {
1012                    concat = self.parse_uncounted_repetition(
1013                        concat,
1014                        ast::RepetitionKind::OneOrMore,
1015                    )?;
1016                }
1017                '{' => {
1018                    concat = self.parse_counted_repetition(concat)?;
1019                }
1020                _ => concat.asts.push(self.parse_primitive()?.into_ast()),
1021            }
1022        }
1023        let ast = self.pop_group_end(concat)?;
1024        NestLimiter::new(self).check(&ast)?;
1025        Ok(ast::WithComments {
1026            ast,
1027            comments: mem::replace(
1028                &mut *self.parser().comments.borrow_mut(),
1029                vec![],
1030            ),
1031        })
1032    }
1033
1034    /// Parses an uncounted repetition operation. An uncounted repetition
1035    /// operator includes ?, * and +, but does not include the {m,n} syntax.
1036    /// The given `kind` should correspond to the operator observed by the
1037    /// caller.
1038    ///
1039    /// This assumes that the parser is currently positioned at the repetition
1040    /// operator and advances the parser to the first character after the
1041    /// operator. (Note that the operator may include a single additional `?`,
1042    /// which makes the operator ungreedy.)
1043    ///
1044    /// The caller should include the concatenation that is being built. The
1045    /// concatenation returned includes the repetition operator applied to the
1046    /// last expression in the given concatenation.
1047    #[inline(never)]
1048    fn parse_uncounted_repetition(
1049        &self,
1050        mut concat: ast::Concat,
1051        kind: ast::RepetitionKind,
1052    ) -> Result<ast::Concat> {
1053        assert!(
1054            self.char() == '?' || self.char() == '*' || self.char() == '+'
1055        );
1056        let op_start = self.pos();
1057        let ast = match concat.asts.pop() {
1058            Some(ast) => ast,
1059            None => {
1060                return Err(
1061                    self.error(self.span(), ast::ErrorKind::RepetitionMissing)
1062                )
1063            }
1064        };
1065        match ast {
1066            Ast::Empty(_) | Ast::Flags(_) => {
1067                return Err(
1068                    self.error(self.span(), ast::ErrorKind::RepetitionMissing)
1069                )
1070            }
1071            _ => {}
1072        }
1073        let mut greedy = true;
1074        if self.bump() && self.char() == '?' {
1075            greedy = false;
1076            self.bump();
1077        }
1078        concat.asts.push(Ast::repetition(ast::Repetition {
1079            span: ast.span().with_end(self.pos()),
1080            op: ast::RepetitionOp {
1081                span: Span::new(op_start, self.pos()),
1082                kind,
1083            },
1084            greedy,
1085            ast: Box::new(ast),
1086        }));
1087        Ok(concat)
1088    }
1089
1090    /// Parses a counted repetition operation. A counted repetition operator
1091    /// corresponds to the {m,n} syntax, and does not include the ?, * or +
1092    /// operators.
1093    ///
1094    /// This assumes that the parser is currently positioned at the opening `{`
1095    /// and advances the parser to the first character after the operator.
1096    /// (Note that the operator may include a single additional `?`, which
1097    /// makes the operator ungreedy.)
1098    ///
1099    /// The caller should include the concatenation that is being built. The
1100    /// concatenation returned includes the repetition operator applied to the
1101    /// last expression in the given concatenation.
1102    #[inline(never)]
1103    fn parse_counted_repetition(
1104        &self,
1105        mut concat: ast::Concat,
1106    ) -> Result<ast::Concat> {
1107        assert!(self.char() == '{');
1108        let start = self.pos();
1109        let ast = match concat.asts.pop() {
1110            Some(ast) => ast,
1111            None => {
1112                return Err(
1113                    self.error(self.span(), ast::ErrorKind::RepetitionMissing)
1114                )
1115            }
1116        };
1117        match ast {
1118            Ast::Empty(_) | Ast::Flags(_) => {
1119                return Err(
1120                    self.error(self.span(), ast::ErrorKind::RepetitionMissing)
1121                )
1122            }
1123            _ => {}
1124        }
1125        if !self.bump_and_bump_space() {
1126            return Err(self.error(
1127                Span::new(start, self.pos()),
1128                ast::ErrorKind::RepetitionCountUnclosed,
1129            ));
1130        }
1131        let count_start = specialize_err(
1132            self.parse_decimal(),
1133            ast::ErrorKind::DecimalEmpty,
1134            ast::ErrorKind::RepetitionCountDecimalEmpty,
1135        );
1136        if self.is_eof() {
1137            return Err(self.error(
1138                Span::new(start, self.pos()),
1139                ast::ErrorKind::RepetitionCountUnclosed,
1140            ));
1141        }
1142        let range = if self.char() == ',' {
1143            if !self.bump_and_bump_space() {
1144                return Err(self.error(
1145                    Span::new(start, self.pos()),
1146                    ast::ErrorKind::RepetitionCountUnclosed,
1147                ));
1148            }
1149            if self.char() != '}' {
1150                let count_start = match count_start {
1151                    Ok(c) => c,
1152                    Err(err)
1153                        if err.kind
1154                            == ast::ErrorKind::RepetitionCountDecimalEmpty =>
1155                    {
1156                        if self.parser().empty_min_range {
1157                            0
1158                        } else {
1159                            return Err(err);
1160                        }
1161                    }
1162                    err => err?,
1163                };
1164                let count_end = specialize_err(
1165                    self.parse_decimal(),
1166                    ast::ErrorKind::DecimalEmpty,
1167                    ast::ErrorKind::RepetitionCountDecimalEmpty,
1168                )?;
1169                ast::RepetitionRange::Bounded(count_start, count_end)
1170            } else {
1171                ast::RepetitionRange::AtLeast(count_start?)
1172            }
1173        } else {
1174            ast::RepetitionRange::Exactly(count_start?)
1175        };
1176
1177        if self.is_eof() || self.char() != '}' {
1178            return Err(self.error(
1179                Span::new(start, self.pos()),
1180                ast::ErrorKind::RepetitionCountUnclosed,
1181            ));
1182        }
1183
1184        let mut greedy = true;
1185        if self.bump_and_bump_space() && self.char() == '?' {
1186            greedy = false;
1187            self.bump();
1188        }
1189
1190        let op_span = Span::new(start, self.pos());
1191        if !range.is_valid() {
1192            return Err(
1193                self.error(op_span, ast::ErrorKind::RepetitionCountInvalid)
1194            );
1195        }
1196        concat.asts.push(Ast::repetition(ast::Repetition {
1197            span: ast.span().with_end(self.pos()),
1198            op: ast::RepetitionOp {
1199                span: op_span,
1200                kind: ast::RepetitionKind::Range(range),
1201            },
1202            greedy,
1203            ast: Box::new(ast),
1204        }));
1205        Ok(concat)
1206    }
1207
1208    /// Parse a group (which contains a sub-expression) or a set of flags.
1209    ///
1210    /// If a group was found, then it is returned with an empty AST. If a set
1211    /// of flags is found, then that set is returned.
1212    ///
1213    /// The parser should be positioned at the opening parenthesis.
1214    ///
1215    /// This advances the parser to the character before the start of the
1216    /// sub-expression (in the case of a group) or to the closing parenthesis
1217    /// immediately following the set of flags.
1218    ///
1219    /// # Errors
1220    ///
1221    /// If flags are given and incorrectly specified, then a corresponding
1222    /// error is returned.
1223    ///
1224    /// If a capture name is given and it is incorrectly specified, then a
1225    /// corresponding error is returned.
1226    #[inline(never)]
1227    fn parse_group(&self) -> Result<Either<ast::SetFlags, ast::Group>> {
1228        assert_eq!(self.char(), '(');
1229        let open_span = self.span_char();
1230        self.bump();
1231        self.bump_space();
1232        if self.is_lookaround_prefix() {
1233            return Err(self.error(
1234                Span::new(open_span.start, self.span().end),
1235                ast::ErrorKind::UnsupportedLookAround,
1236            ));
1237        }
1238        let inner_span = self.span();
1239        let mut starts_with_p = true;
1240        if self.bump_if("?P<") || {
1241            starts_with_p = false;
1242            self.bump_if("?<")
1243        } {
1244            let capture_index = self.next_capture_index(open_span)?;
1245            let name = self.parse_capture_name(capture_index)?;
1246            Ok(Either::Right(ast::Group {
1247                span: open_span,
1248                kind: ast::GroupKind::CaptureName { starts_with_p, name },
1249                ast: Box::new(Ast::empty(self.span())),
1250            }))
1251        } else if self.bump_if("?") {
1252            if self.is_eof() {
1253                return Err(
1254                    self.error(open_span, ast::ErrorKind::GroupUnclosed)
1255                );
1256            }
1257            let flags = self.parse_flags()?;
1258            let char_end = self.char();
1259            self.bump();
1260            if char_end == ')' {
1261                // We don't allow empty flags, e.g., `(?)`. We instead
1262                // interpret it as a repetition operator missing its argument.
1263                if flags.items.is_empty() {
1264                    return Err(self.error(
1265                        inner_span,
1266                        ast::ErrorKind::RepetitionMissing,
1267                    ));
1268                }
1269                Ok(Either::Left(ast::SetFlags {
1270                    span: Span { end: self.pos(), ..open_span },
1271                    flags,
1272                }))
1273            } else {
1274                assert_eq!(char_end, ':');
1275                Ok(Either::Right(ast::Group {
1276                    span: open_span,
1277                    kind: ast::GroupKind::NonCapturing(flags),
1278                    ast: Box::new(Ast::empty(self.span())),
1279                }))
1280            }
1281        } else {
1282            let capture_index = self.next_capture_index(open_span)?;
1283            Ok(Either::Right(ast::Group {
1284                span: open_span,
1285                kind: ast::GroupKind::CaptureIndex(capture_index),
1286                ast: Box::new(Ast::empty(self.span())),
1287            }))
1288        }
1289    }
1290
1291    /// Parses a capture group name. Assumes that the parser is positioned at
1292    /// the first character in the name following the opening `<` (and may
1293    /// possibly be EOF). This advances the parser to the first character
1294    /// following the closing `>`.
1295    ///
1296    /// The caller must provide the capture index of the group for this name.
1297    #[inline(never)]
1298    fn parse_capture_name(
1299        &self,
1300        capture_index: u32,
1301    ) -> Result<ast::CaptureName> {
1302        if self.is_eof() {
1303            return Err(self
1304                .error(self.span(), ast::ErrorKind::GroupNameUnexpectedEof));
1305        }
1306        let start = self.pos();
1307        loop {
1308            if self.char() == '>' {
1309                break;
1310            }
1311            if !is_capture_char(self.char(), self.pos() == start) {
1312                return Err(self.error(
1313                    self.span_char(),
1314                    ast::ErrorKind::GroupNameInvalid,
1315                ));
1316            }
1317            if !self.bump() {
1318                break;
1319            }
1320        }
1321        let end = self.pos();
1322        if self.is_eof() {
1323            return Err(self
1324                .error(self.span(), ast::ErrorKind::GroupNameUnexpectedEof));
1325        }
1326        assert_eq!(self.char(), '>');
1327        self.bump();
1328        let name = &self.pattern()[start.offset..end.offset];
1329        if name.is_empty() {
1330            return Err(self.error(
1331                Span::new(start, start),
1332                ast::ErrorKind::GroupNameEmpty,
1333            ));
1334        }
1335        let capname = ast::CaptureName {
1336            span: Span::new(start, end),
1337            name: name.to_string(),
1338            index: capture_index,
1339        };
1340        self.add_capture_name(&capname)?;
1341        Ok(capname)
1342    }
1343
1344    /// Parse a sequence of flags starting at the current character.
1345    ///
1346    /// This advances the parser to the character immediately following the
1347    /// flags, which is guaranteed to be either `:` or `)`.
1348    ///
1349    /// # Errors
1350    ///
1351    /// If any flags are duplicated, then an error is returned.
1352    ///
1353    /// If the negation operator is used more than once, then an error is
1354    /// returned.
1355    ///
1356    /// If no flags could be found or if the negation operation is not followed
1357    /// by any flags, then an error is returned.
1358    #[inline(never)]
1359    fn parse_flags(&self) -> Result<ast::Flags> {
1360        let mut flags = ast::Flags { span: self.span(), items: vec![] };
1361        let mut last_was_negation = None;
1362        while self.char() != ':' && self.char() != ')' {
1363            if self.char() == '-' {
1364                last_was_negation = Some(self.span_char());
1365                let item = ast::FlagsItem {
1366                    span: self.span_char(),
1367                    kind: ast::FlagsItemKind::Negation,
1368                };
1369                if let Some(i) = flags.add_item(item) {
1370                    return Err(self.error(
1371                        self.span_char(),
1372                        ast::ErrorKind::FlagRepeatedNegation {
1373                            original: flags.items[i].span,
1374                        },
1375                    ));
1376                }
1377            } else {
1378                last_was_negation = None;
1379                let item = ast::FlagsItem {
1380                    span: self.span_char(),
1381                    kind: ast::FlagsItemKind::Flag(self.parse_flag()?),
1382                };
1383                if let Some(i) = flags.add_item(item) {
1384                    return Err(self.error(
1385                        self.span_char(),
1386                        ast::ErrorKind::FlagDuplicate {
1387                            original: flags.items[i].span,
1388                        },
1389                    ));
1390                }
1391            }
1392            if !self.bump() {
1393                return Err(
1394                    self.error(self.span(), ast::ErrorKind::FlagUnexpectedEof)
1395                );
1396            }
1397        }
1398        if let Some(span) = last_was_negation {
1399            return Err(self.error(span, ast::ErrorKind::FlagDanglingNegation));
1400        }
1401        flags.span.end = self.pos();
1402        Ok(flags)
1403    }
1404
1405    /// Parse the current character as a flag. Do not advance the parser.
1406    ///
1407    /// # Errors
1408    ///
1409    /// If the flag is not recognized, then an error is returned.
1410    #[inline(never)]
1411    fn parse_flag(&self) -> Result<ast::Flag> {
1412        match self.char() {
1413            'i' => Ok(ast::Flag::CaseInsensitive),
1414            'm' => Ok(ast::Flag::MultiLine),
1415            's' => Ok(ast::Flag::DotMatchesNewLine),
1416            'U' => Ok(ast::Flag::SwapGreed),
1417            'u' => Ok(ast::Flag::Unicode),
1418            'R' => Ok(ast::Flag::CRLF),
1419            'x' => Ok(ast::Flag::IgnoreWhitespace),
1420            _ => {
1421                Err(self
1422                    .error(self.span_char(), ast::ErrorKind::FlagUnrecognized))
1423            }
1424        }
1425    }
1426
1427    /// Parse a primitive AST. e.g., A literal, non-set character class or
1428    /// assertion.
1429    ///
1430    /// This assumes that the parser expects a primitive at the current
1431    /// location. i.e., All other non-primitive cases have been handled.
1432    /// For example, if the parser's position is at `|`, then `|` will be
1433    /// treated as a literal (e.g., inside a character class).
1434    ///
1435    /// This advances the parser to the first character immediately following
1436    /// the primitive.
1437    fn parse_primitive(&self) -> Result<Primitive> {
1438        match self.char() {
1439            '\\' => self.parse_escape(),
1440            '.' => {
1441                let ast = Primitive::Dot(self.span_char());
1442                self.bump();
1443                Ok(ast)
1444            }
1445            '^' => {
1446                let ast = Primitive::Assertion(ast::Assertion {
1447                    span: self.span_char(),
1448                    kind: ast::AssertionKind::StartLine,
1449                });
1450                self.bump();
1451                Ok(ast)
1452            }
1453            '$' => {
1454                let ast = Primitive::Assertion(ast::Assertion {
1455                    span: self.span_char(),
1456                    kind: ast::AssertionKind::EndLine,
1457                });
1458                self.bump();
1459                Ok(ast)
1460            }
1461            c => {
1462                let ast = Primitive::Literal(ast::Literal {
1463                    span: self.span_char(),
1464                    kind: ast::LiteralKind::Verbatim,
1465                    c,
1466                });
1467                self.bump();
1468                Ok(ast)
1469            }
1470        }
1471    }
1472
1473    /// Parse an escape sequence as a primitive AST.
1474    ///
1475    /// This assumes the parser is positioned at the start of the escape
1476    /// sequence, i.e., `\`. It advances the parser to the first position
1477    /// immediately following the escape sequence.
1478    #[inline(never)]
1479    fn parse_escape(&self) -> Result<Primitive> {
1480        assert_eq!(self.char(), '\\');
1481        let start = self.pos();
1482        if !self.bump() {
1483            return Err(self.error(
1484                Span::new(start, self.pos()),
1485                ast::ErrorKind::EscapeUnexpectedEof,
1486            ));
1487        }
1488        let c = self.char();
1489        // Put some of the more complicated routines into helpers.
1490        match c {
1491            '0'..='7' => {
1492                if !self.parser().octal {
1493                    return Err(self.error(
1494                        Span::new(start, self.span_char().end),
1495                        ast::ErrorKind::UnsupportedBackreference,
1496                    ));
1497                }
1498                let mut lit = self.parse_octal();
1499                lit.span.start = start;
1500                return Ok(Primitive::Literal(lit));
1501            }
1502            '8'..='9' if !self.parser().octal => {
1503                return Err(self.error(
1504                    Span::new(start, self.span_char().end),
1505                    ast::ErrorKind::UnsupportedBackreference,
1506                ));
1507            }
1508            'x' | 'u' | 'U' => {
1509                let mut lit = self.parse_hex()?;
1510                lit.span.start = start;
1511                return Ok(Primitive::Literal(lit));
1512            }
1513            'p' | 'P' => {
1514                let mut cls = self.parse_unicode_class()?;
1515                cls.span.start = start;
1516                return Ok(Primitive::Unicode(cls));
1517            }
1518            'd' | 's' | 'w' | 'D' | 'S' | 'W' => {
1519                let mut cls = self.parse_perl_class();
1520                cls.span.start = start;
1521                return Ok(Primitive::Perl(cls));
1522            }
1523            _ => {}
1524        }
1525
1526        // Handle all of the one letter sequences inline.
1527        self.bump();
1528        let span = Span::new(start, self.pos());
1529        if is_meta_character(c) {
1530            return Ok(Primitive::Literal(ast::Literal {
1531                span,
1532                kind: ast::LiteralKind::Meta,
1533                c,
1534            }));
1535        }
1536        if is_escapeable_character(c) {
1537            return Ok(Primitive::Literal(ast::Literal {
1538                span,
1539                kind: ast::LiteralKind::Superfluous,
1540                c,
1541            }));
1542        }
1543        let special = |kind, c| {
1544            Ok(Primitive::Literal(ast::Literal {
1545                span,
1546                kind: ast::LiteralKind::Special(kind),
1547                c,
1548            }))
1549        };
1550        match c {
1551            'a' => special(ast::SpecialLiteralKind::Bell, '\x07'),
1552            'f' => special(ast::SpecialLiteralKind::FormFeed, '\x0C'),
1553            't' => special(ast::SpecialLiteralKind::Tab, '\t'),
1554            'n' => special(ast::SpecialLiteralKind::LineFeed, '\n'),
1555            'r' => special(ast::SpecialLiteralKind::CarriageReturn, '\r'),
1556            'v' => special(ast::SpecialLiteralKind::VerticalTab, '\x0B'),
1557            'A' => Ok(Primitive::Assertion(ast::Assertion {
1558                span,
1559                kind: ast::AssertionKind::StartText,
1560            })),
1561            'z' => Ok(Primitive::Assertion(ast::Assertion {
1562                span,
1563                kind: ast::AssertionKind::EndText,
1564            })),
1565            'b' => {
1566                let mut wb = ast::Assertion {
1567                    span,
1568                    kind: ast::AssertionKind::WordBoundary,
1569                };
1570                // After a \b, we "try" to parse things like \b{start} for
1571                // special word boundary assertions.
1572                if !self.is_eof() && self.char() == '{' {
1573                    if let Some(kind) =
1574                        self.maybe_parse_special_word_boundary(start)?
1575                    {
1576                        wb.kind = kind;
1577                        wb.span.end = self.pos();
1578                    }
1579                }
1580                Ok(Primitive::Assertion(wb))
1581            }
1582            'B' => Ok(Primitive::Assertion(ast::Assertion {
1583                span,
1584                kind: ast::AssertionKind::NotWordBoundary,
1585            })),
1586            '<' => Ok(Primitive::Assertion(ast::Assertion {
1587                span,
1588                kind: ast::AssertionKind::WordBoundaryStartAngle,
1589            })),
1590            '>' => Ok(Primitive::Assertion(ast::Assertion {
1591                span,
1592                kind: ast::AssertionKind::WordBoundaryEndAngle,
1593            })),
1594            _ => Err(self.error(span, ast::ErrorKind::EscapeUnrecognized)),
1595        }
1596    }
1597
1598    /// Attempt to parse a specialty word boundary. That is, `\b{start}`,
1599    /// `\b{end}`, `\b{start-half}` or `\b{end-half}`.
1600    ///
1601    /// This is similar to `maybe_parse_ascii_class` in that, in most cases,
1602    /// if it fails it will just return `None` with no error. This is done
1603    /// because `\b{5}` is a valid expression and we want to let that be parsed
1604    /// by the existing counted repetition parsing code. (I thought about just
1605    /// invoking the counted repetition code from here, but it seemed a little
1606    /// ham-fisted.)
1607    ///
1608    /// Unlike `maybe_parse_ascii_class` though, this can return an error.
1609    /// Namely, if we definitely know it isn't a counted repetition, then we
1610    /// return an error specific to the specialty word boundaries.
1611    ///
1612    /// This assumes the parser is positioned at a `{` immediately following
1613    /// a `\b`. When `None` is returned, the parser is returned to the position
1614    /// at which it started: pointing at a `{`.
1615    ///
1616    /// The position given should correspond to the start of the `\b`.
1617    fn maybe_parse_special_word_boundary(
1618        &self,
1619        wb_start: Position,
1620    ) -> Result<Option<ast::AssertionKind>> {
1621        assert_eq!(self.char(), '{');
1622
1623        let is_valid_char = |c| match c {
1624            'A'..='Z' | 'a'..='z' | '-' => true,
1625            _ => false,
1626        };
1627        let start = self.pos();
1628        if !self.bump_and_bump_space() {
1629            return Err(self.error(
1630                Span::new(wb_start, self.pos()),
1631                ast::ErrorKind::SpecialWordOrRepetitionUnexpectedEof,
1632            ));
1633        }
1634        let start_contents = self.pos();
1635        // This is one of the critical bits: if the first non-whitespace
1636        // character isn't in [-A-Za-z] (i.e., this can't be a special word
1637        // boundary), then we bail and let the counted repetition parser deal
1638        // with this.
1639        if !is_valid_char(self.char()) {
1640            self.parser().pos.set(start);
1641            return Ok(None);
1642        }
1643
1644        // Now collect up our chars until we see a '}'.
1645        let mut scratch = self.parser().scratch.borrow_mut();
1646        scratch.clear();
1647        while !self.is_eof() && is_valid_char(self.char()) {
1648            scratch.push(self.char());
1649            self.bump_and_bump_space();
1650        }
1651        if self.is_eof() || self.char() != '}' {
1652            return Err(self.error(
1653                Span::new(start, self.pos()),
1654                ast::ErrorKind::SpecialWordBoundaryUnclosed,
1655            ));
1656        }
1657        let end = self.pos();
1658        self.bump();
1659        let kind = match scratch.as_str() {
1660            "start" => ast::AssertionKind::WordBoundaryStart,
1661            "end" => ast::AssertionKind::WordBoundaryEnd,
1662            "start-half" => ast::AssertionKind::WordBoundaryStartHalf,
1663            "end-half" => ast::AssertionKind::WordBoundaryEndHalf,
1664            _ => {
1665                return Err(self.error(
1666                    Span::new(start_contents, end),
1667                    ast::ErrorKind::SpecialWordBoundaryUnrecognized,
1668                ))
1669            }
1670        };
1671        Ok(Some(kind))
1672    }
1673
1674    /// Parse an octal representation of a Unicode codepoint up to 3 digits
1675    /// long. This expects the parser to be positioned at the first octal
1676    /// digit and advances the parser to the first character immediately
1677    /// following the octal number. This also assumes that parsing octal
1678    /// escapes is enabled.
1679    ///
1680    /// Assuming the preconditions are met, this routine can never fail.
1681    #[inline(never)]
1682    fn parse_octal(&self) -> ast::Literal {
1683        assert!(self.parser().octal);
1684        assert!('0' <= self.char() && self.char() <= '7');
1685        let start = self.pos();
1686        // Parse up to two more digits.
1687        while self.bump()
1688            && '0' <= self.char()
1689            && self.char() <= '7'
1690            && self.pos().offset - start.offset <= 2
1691        {}
1692        let end = self.pos();
1693        let octal = &self.pattern()[start.offset..end.offset];
1694        // Parsing the octal should never fail since the above guarantees a
1695        // valid number.
1696        let codepoint =
1697            u32::from_str_radix(octal, 8).expect("valid octal number");
1698        // The max value for 3 digit octal is 0777 = 511 and [0, 511] has no
1699        // invalid Unicode scalar values.
1700        let c = char::from_u32(codepoint).expect("Unicode scalar value");
1701        ast::Literal {
1702            span: Span::new(start, end),
1703            kind: ast::LiteralKind::Octal,
1704            c,
1705        }
1706    }
1707
1708    /// Parse a hex representation of a Unicode codepoint. This handles both
1709    /// hex notations, i.e., `\xFF` and `\x{FFFF}`. This expects the parser to
1710    /// be positioned at the `x`, `u` or `U` prefix. The parser is advanced to
1711    /// the first character immediately following the hexadecimal literal.
1712    #[inline(never)]
1713    fn parse_hex(&self) -> Result<ast::Literal> {
1714        assert!(
1715            self.char() == 'x' || self.char() == 'u' || self.char() == 'U'
1716        );
1717
1718        let hex_kind = match self.char() {
1719            'x' => ast::HexLiteralKind::X,
1720            'u' => ast::HexLiteralKind::UnicodeShort,
1721            _ => ast::HexLiteralKind::UnicodeLong,
1722        };
1723        if !self.bump_and_bump_space() {
1724            return Err(
1725                self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof)
1726            );
1727        }
1728        if self.char() == '{' {
1729            self.parse_hex_brace(hex_kind)
1730        } else {
1731            self.parse_hex_digits(hex_kind)
1732        }
1733    }
1734
1735    /// Parse an N-digit hex representation of a Unicode codepoint. This
1736    /// expects the parser to be positioned at the first digit and will advance
1737    /// the parser to the first character immediately following the escape
1738    /// sequence.
1739    ///
1740    /// The number of digits given must be 2 (for `\xNN`), 4 (for `\uNNNN`)
1741    /// or 8 (for `\UNNNNNNNN`).
1742    #[inline(never)]
1743    fn parse_hex_digits(
1744        &self,
1745        kind: ast::HexLiteralKind,
1746    ) -> Result<ast::Literal> {
1747        let mut scratch = self.parser().scratch.borrow_mut();
1748        scratch.clear();
1749
1750        let start = self.pos();
1751        for i in 0..kind.digits() {
1752            if i > 0 && !self.bump_and_bump_space() {
1753                return Err(self
1754                    .error(self.span(), ast::ErrorKind::EscapeUnexpectedEof));
1755            }
1756            if !is_hex(self.char()) {
1757                return Err(self.error(
1758                    self.span_char(),
1759                    ast::ErrorKind::EscapeHexInvalidDigit,
1760                ));
1761            }
1762            scratch.push(self.char());
1763        }
1764        // The final bump just moves the parser past the literal, which may
1765        // be EOF.
1766        self.bump_and_bump_space();
1767        let end = self.pos();
1768        let hex = scratch.as_str();
1769        match u32::from_str_radix(hex, 16).ok().and_then(char::from_u32) {
1770            None => Err(self.error(
1771                Span::new(start, end),
1772                ast::ErrorKind::EscapeHexInvalid,
1773            )),
1774            Some(c) => Ok(ast::Literal {
1775                span: Span::new(start, end),
1776                kind: ast::LiteralKind::HexFixed(kind),
1777                c,
1778            }),
1779        }
1780    }
1781
1782    /// Parse a hex representation of any Unicode scalar value. This expects
1783    /// the parser to be positioned at the opening brace `{` and will advance
1784    /// the parser to the first character following the closing brace `}`.
1785    #[inline(never)]
1786    fn parse_hex_brace(
1787        &self,
1788        kind: ast::HexLiteralKind,
1789    ) -> Result<ast::Literal> {
1790        let mut scratch = self.parser().scratch.borrow_mut();
1791        scratch.clear();
1792
1793        let brace_pos = self.pos();
1794        let start = self.span_char().end;
1795        while self.bump_and_bump_space() && self.char() != '}' {
1796            if !is_hex(self.char()) {
1797                return Err(self.error(
1798                    self.span_char(),
1799                    ast::ErrorKind::EscapeHexInvalidDigit,
1800                ));
1801            }
1802            scratch.push(self.char());
1803        }
1804        if self.is_eof() {
1805            return Err(self.error(
1806                Span::new(brace_pos, self.pos()),
1807                ast::ErrorKind::EscapeUnexpectedEof,
1808            ));
1809        }
1810        let end = self.pos();
1811        let hex = scratch.as_str();
1812        assert_eq!(self.char(), '}');
1813        self.bump_and_bump_space();
1814
1815        if hex.is_empty() {
1816            return Err(self.error(
1817                Span::new(brace_pos, self.pos()),
1818                ast::ErrorKind::EscapeHexEmpty,
1819            ));
1820        }
1821        match u32::from_str_radix(hex, 16).ok().and_then(char::from_u32) {
1822            None => Err(self.error(
1823                Span::new(start, end),
1824                ast::ErrorKind::EscapeHexInvalid,
1825            )),
1826            Some(c) => Ok(ast::Literal {
1827                span: Span::new(start, self.pos()),
1828                kind: ast::LiteralKind::HexBrace(kind),
1829                c,
1830            }),
1831        }
1832    }
1833
1834    /// Parse a decimal number into a u32 while trimming leading and trailing
1835    /// whitespace.
1836    ///
1837    /// This expects the parser to be positioned at the first position where
1838    /// a decimal digit could occur. This will advance the parser to the byte
1839    /// immediately following the last contiguous decimal digit.
1840    ///
1841    /// If no decimal digit could be found or if there was a problem parsing
1842    /// the complete set of digits into a u32, then an error is returned.
1843    fn parse_decimal(&self) -> Result<u32> {
1844        let mut scratch = self.parser().scratch.borrow_mut();
1845        scratch.clear();
1846
1847        while !self.is_eof() && self.char().is_whitespace() {
1848            self.bump();
1849        }
1850        let start = self.pos();
1851        while !self.is_eof() && '0' <= self.char() && self.char() <= '9' {
1852            scratch.push(self.char());
1853            self.bump_and_bump_space();
1854        }
1855        let span = Span::new(start, self.pos());
1856        while !self.is_eof() && self.char().is_whitespace() {
1857            self.bump_and_bump_space();
1858        }
1859        let digits = scratch.as_str();
1860        if digits.is_empty() {
1861            return Err(self.error(span, ast::ErrorKind::DecimalEmpty));
1862        }
1863        match u32::from_str_radix(digits, 10).ok() {
1864            Some(n) => Ok(n),
1865            None => Err(self.error(span, ast::ErrorKind::DecimalInvalid)),
1866        }
1867    }
1868
1869    /// Parse a standard character class consisting primarily of characters or
1870    /// character ranges, but can also contain nested character classes of
1871    /// any type (sans `.`).
1872    ///
1873    /// This assumes the parser is positioned at the opening `[`. If parsing
1874    /// is successful, then the parser is advanced to the position immediately
1875    /// following the closing `]`.
1876    #[inline(never)]
1877    fn parse_set_class(&self) -> Result<ast::ClassBracketed> {
1878        assert_eq!(self.char(), '[');
1879
1880        let mut union =
1881            ast::ClassSetUnion { span: self.span(), items: vec![] };
1882        loop {
1883            self.bump_space();
1884            if self.is_eof() {
1885                return Err(self.unclosed_class_error());
1886            }
1887            match self.char() {
1888                '[' => {
1889                    // If we've already parsed the opening bracket, then
1890                    // attempt to treat this as the beginning of an ASCII
1891                    // class. If ASCII class parsing fails, then the parser
1892                    // backs up to `[`.
1893                    if !self.parser().stack_class.borrow().is_empty() {
1894                        if let Some(cls) = self.maybe_parse_ascii_class() {
1895                            union.push(ast::ClassSetItem::Ascii(cls));
1896                            continue;
1897                        }
1898                    }
1899                    union = self.push_class_open(union)?;
1900                }
1901                ']' => match self.pop_class(union)? {
1902                    Either::Left(nested_union) => {
1903                        union = nested_union;
1904                    }
1905                    Either::Right(class) => return Ok(class),
1906                },
1907                '&' if self.peek() == Some('&') => {
1908                    assert!(self.bump_if("&&"));
1909                    union = self.push_class_op(
1910                        ast::ClassSetBinaryOpKind::Intersection,
1911                        union,
1912                    );
1913                }
1914                '-' if self.peek() == Some('-') => {
1915                    assert!(self.bump_if("--"));
1916                    union = self.push_class_op(
1917                        ast::ClassSetBinaryOpKind::Difference,
1918                        union,
1919                    );
1920                }
1921                '~' if self.peek() == Some('~') => {
1922                    assert!(self.bump_if("~~"));
1923                    union = self.push_class_op(
1924                        ast::ClassSetBinaryOpKind::SymmetricDifference,
1925                        union,
1926                    );
1927                }
1928                _ => {
1929                    union.push(self.parse_set_class_range()?);
1930                }
1931            }
1932        }
1933    }
1934
1935    /// Parse a single primitive item in a character class set. The item to
1936    /// be parsed can either be one of a simple literal character, a range
1937    /// between two simple literal characters or a "primitive" character
1938    /// class like \w or \p{Greek}.
1939    ///
1940    /// If an invalid escape is found, or if a character class is found where
1941    /// a simple literal is expected (e.g., in a range), then an error is
1942    /// returned.
1943    #[inline(never)]
1944    fn parse_set_class_range(&self) -> Result<ast::ClassSetItem> {
1945        let prim1 = self.parse_set_class_item()?;
1946        self.bump_space();
1947        if self.is_eof() {
1948            return Err(self.unclosed_class_error());
1949        }
1950        // If the next char isn't a `-`, then we don't have a range.
1951        // There are two exceptions. If the char after a `-` is a `]`, then
1952        // `-` is interpreted as a literal `-`. Alternatively, if the char
1953        // after a `-` is a `-`, then `--` corresponds to a "difference"
1954        // operation.
1955        if self.char() != '-'
1956            || self.peek_space() == Some(']')
1957            || self.peek_space() == Some('-')
1958        {
1959            return prim1.into_class_set_item(self);
1960        }
1961        // OK, now we're parsing a range, so bump past the `-` and parse the
1962        // second half of the range.
1963        if !self.bump_and_bump_space() {
1964            return Err(self.unclosed_class_error());
1965        }
1966        let prim2 = self.parse_set_class_item()?;
1967        let range = ast::ClassSetRange {
1968            span: Span::new(prim1.span().start, prim2.span().end),
1969            start: prim1.into_class_literal(self)?,
1970            end: prim2.into_class_literal(self)?,
1971        };
1972        if !range.is_valid() {
1973            return Err(
1974                self.error(range.span, ast::ErrorKind::ClassRangeInvalid)
1975            );
1976        }
1977        Ok(ast::ClassSetItem::Range(range))
1978    }
1979
1980    /// Parse a single item in a character class as a primitive, where the
1981    /// primitive either consists of a verbatim literal or a single escape
1982    /// sequence.
1983    ///
1984    /// This assumes the parser is positioned at the beginning of a primitive,
1985    /// and advances the parser to the first position after the primitive if
1986    /// successful.
1987    ///
1988    /// Note that it is the caller's responsibility to report an error if an
1989    /// illegal primitive was parsed.
1990    #[inline(never)]
1991    fn parse_set_class_item(&self) -> Result<Primitive> {
1992        if self.char() == '\\' {
1993            self.parse_escape()
1994        } else {
1995            let x = Primitive::Literal(ast::Literal {
1996                span: self.span_char(),
1997                kind: ast::LiteralKind::Verbatim,
1998                c: self.char(),
1999            });
2000            self.bump();
2001            Ok(x)
2002        }
2003    }
2004
2005    /// Parses the opening of a character class set. This includes the opening
2006    /// bracket along with `^` if present to indicate negation. This also
2007    /// starts parsing the opening set of unioned items if applicable, since
2008    /// there are special rules applied to certain characters in the opening
2009    /// of a character class. For example, `[^]]` is the class of all
2010    /// characters not equal to `]`. (`]` would need to be escaped in any other
2011    /// position.) Similarly for `-`.
2012    ///
2013    /// In all cases, the op inside the returned `ast::ClassBracketed` is an
2014    /// empty union. This empty union should be replaced with the actual item
2015    /// when it is popped from the parser's stack.
2016    ///
2017    /// This assumes the parser is positioned at the opening `[` and advances
2018    /// the parser to the first non-special byte of the character class.
2019    ///
2020    /// An error is returned if EOF is found.
2021    #[inline(never)]
2022    fn parse_set_class_open(
2023        &self,
2024    ) -> Result<(ast::ClassBracketed, ast::ClassSetUnion)> {
2025        assert_eq!(self.char(), '[');
2026        let start = self.pos();
2027        if !self.bump_and_bump_space() {
2028            return Err(self.error(
2029                Span::new(start, self.pos()),
2030                ast::ErrorKind::ClassUnclosed,
2031            ));
2032        }
2033
2034        let negated = if self.char() != '^' {
2035            false
2036        } else {
2037            if !self.bump_and_bump_space() {
2038                return Err(self.error(
2039                    Span::new(start, self.pos()),
2040                    ast::ErrorKind::ClassUnclosed,
2041                ));
2042            }
2043            true
2044        };
2045        // Accept any number of `-` as literal `-`.
2046        let mut union =
2047            ast::ClassSetUnion { span: self.span(), items: vec![] };
2048        while self.char() == '-' {
2049            union.push(ast::ClassSetItem::Literal(ast::Literal {
2050                span: self.span_char(),
2051                kind: ast::LiteralKind::Verbatim,
2052                c: '-',
2053            }));
2054            if !self.bump_and_bump_space() {
2055                return Err(self.error(
2056                    Span::new(start, start),
2057                    ast::ErrorKind::ClassUnclosed,
2058                ));
2059            }
2060        }
2061        // If `]` is the *first* char in a set, then interpret it as a literal
2062        // `]`. That is, an empty class is impossible to write.
2063        if union.items.is_empty() && self.char() == ']' {
2064            union.push(ast::ClassSetItem::Literal(ast::Literal {
2065                span: self.span_char(),
2066                kind: ast::LiteralKind::Verbatim,
2067                c: ']',
2068            }));
2069            if !self.bump_and_bump_space() {
2070                return Err(self.error(
2071                    Span::new(start, self.pos()),
2072                    ast::ErrorKind::ClassUnclosed,
2073                ));
2074            }
2075        }
2076        let set = ast::ClassBracketed {
2077            span: Span::new(start, self.pos()),
2078            negated,
2079            kind: ast::ClassSet::union(ast::ClassSetUnion {
2080                span: Span::new(union.span.start, union.span.start),
2081                items: vec![],
2082            }),
2083        };
2084        Ok((set, union))
2085    }
2086
2087    /// Attempt to parse an ASCII character class, e.g., `[:alnum:]`.
2088    ///
2089    /// This assumes the parser is positioned at the opening `[`.
2090    ///
2091    /// If no valid ASCII character class could be found, then this does not
2092    /// advance the parser and `None` is returned. Otherwise, the parser is
2093    /// advanced to the first byte following the closing `]` and the
2094    /// corresponding ASCII class is returned.
2095    #[inline(never)]
2096    fn maybe_parse_ascii_class(&self) -> Option<ast::ClassAscii> {
2097        // ASCII character classes are interesting from a parsing perspective
2098        // because parsing cannot fail with any interesting error. For example,
2099        // in order to use an ASCII character class, it must be enclosed in
2100        // double brackets, e.g., `[[:alnum:]]`. Alternatively, you might think
2101        // of it as "ASCII character classes have the syntax `[:NAME:]` which
2102        // can only appear within character brackets." This means that things
2103        // like `[[:lower:]A]` are legal constructs.
2104        //
2105        // However, if one types an incorrect ASCII character class, e.g.,
2106        // `[[:loower:]]`, then we treat that as a normal nested character
2107        // class containing the characters `:elorw`. One might argue that we
2108        // should return an error instead since the repeated colons give away
2109        // the intent to write an ASCII class. But what if the user typed
2110        // `[[:lower]]` instead? How can we tell that was intended to be an
2111        // ASCII class and not just a normal nested class?
2112        //
2113        // Reasonable people can probably disagree over this, but for better
2114        // or worse, we implement semantics that never fails at the expense
2115        // of better failure modes.
2116        assert_eq!(self.char(), '[');
2117        // If parsing fails, then we back up the parser to this starting point.
2118        let start = self.pos();
2119        let mut negated = false;
2120        if !self.bump() || self.char() != ':' {
2121            self.parser().pos.set(start);
2122            return None;
2123        }
2124        if !self.bump() {
2125            self.parser().pos.set(start);
2126            return None;
2127        }
2128        if self.char() == '^' {
2129            negated = true;
2130            if !self.bump() {
2131                self.parser().pos.set(start);
2132                return None;
2133            }
2134        }
2135        let name_start = self.offset();
2136        while self.char() != ':' && self.bump() {}
2137        if self.is_eof() {
2138            self.parser().pos.set(start);
2139            return None;
2140        }
2141        let name = &self.pattern()[name_start..self.offset()];
2142        if !self.bump_if(":]") {
2143            self.parser().pos.set(start);
2144            return None;
2145        }
2146        let kind = match ast::ClassAsciiKind::from_name(name) {
2147            Some(kind) => kind,
2148            None => {
2149                self.parser().pos.set(start);
2150                return None;
2151            }
2152        };
2153        Some(ast::ClassAscii {
2154            span: Span::new(start, self.pos()),
2155            kind,
2156            negated,
2157        })
2158    }
2159
2160    /// Parse a Unicode class in either the single character notation, `\pN`
2161    /// or the multi-character bracketed notation, `\p{Greek}`. This assumes
2162    /// the parser is positioned at the `p` (or `P` for negation) and will
2163    /// advance the parser to the character immediately following the class.
2164    ///
2165    /// Note that this does not check whether the class name is valid or not.
2166    #[inline(never)]
2167    fn parse_unicode_class(&self) -> Result<ast::ClassUnicode> {
2168        assert!(self.char() == 'p' || self.char() == 'P');
2169
2170        let mut scratch = self.parser().scratch.borrow_mut();
2171        scratch.clear();
2172
2173        let negated = self.char() == 'P';
2174        if !self.bump_and_bump_space() {
2175            return Err(
2176                self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof)
2177            );
2178        }
2179        let (start, kind) = if self.char() == '{' {
2180            let start = self.span_char().end;
2181            while self.bump_and_bump_space() && self.char() != '}' {
2182                scratch.push(self.char());
2183            }
2184            if self.is_eof() {
2185                return Err(self
2186                    .error(self.span(), ast::ErrorKind::EscapeUnexpectedEof));
2187            }
2188            assert_eq!(self.char(), '}');
2189            self.bump();
2190
2191            let name = scratch.as_str();
2192            if let Some(i) = name.find("!=") {
2193                (
2194                    start,
2195                    ast::ClassUnicodeKind::NamedValue {
2196                        op: ast::ClassUnicodeOpKind::NotEqual,
2197                        name: name[..i].to_string(),
2198                        value: name[i + 2..].to_string(),
2199                    },
2200                )
2201            } else if let Some(i) = name.find(':') {
2202                (
2203                    start,
2204                    ast::ClassUnicodeKind::NamedValue {
2205                        op: ast::ClassUnicodeOpKind::Colon,
2206                        name: name[..i].to_string(),
2207                        value: name[i + 1..].to_string(),
2208                    },
2209                )
2210            } else if let Some(i) = name.find('=') {
2211                (
2212                    start,
2213                    ast::ClassUnicodeKind::NamedValue {
2214                        op: ast::ClassUnicodeOpKind::Equal,
2215                        name: name[..i].to_string(),
2216                        value: name[i + 1..].to_string(),
2217                    },
2218                )
2219            } else {
2220                (start, ast::ClassUnicodeKind::Named(name.to_string()))
2221            }
2222        } else {
2223            let start = self.pos();
2224            let c = self.char();
2225            if c == '\\' {
2226                return Err(self.error(
2227                    self.span_char(),
2228                    ast::ErrorKind::UnicodeClassInvalid,
2229                ));
2230            }
2231            self.bump_and_bump_space();
2232            let kind = ast::ClassUnicodeKind::OneLetter(c);
2233            (start, kind)
2234        };
2235        Ok(ast::ClassUnicode {
2236            span: Span::new(start, self.pos()),
2237            negated,
2238            kind,
2239        })
2240    }
2241
2242    /// Parse a Perl character class, e.g., `\d` or `\W`. This assumes the
2243    /// parser is currently at a valid character class name and will be
2244    /// advanced to the character immediately following the class.
2245    #[inline(never)]
2246    fn parse_perl_class(&self) -> ast::ClassPerl {
2247        let c = self.char();
2248        let span = self.span_char();
2249        self.bump();
2250        let (negated, kind) = match c {
2251            'd' => (false, ast::ClassPerlKind::Digit),
2252            'D' => (true, ast::ClassPerlKind::Digit),
2253            's' => (false, ast::ClassPerlKind::Space),
2254            'S' => (true, ast::ClassPerlKind::Space),
2255            'w' => (false, ast::ClassPerlKind::Word),
2256            'W' => (true, ast::ClassPerlKind::Word),
2257            c => panic!("expected valid Perl class but got '{}'", c),
2258        };
2259        ast::ClassPerl { span, kind, negated }
2260    }
2261}
2262
2263/// A type that traverses a fully parsed Ast and checks whether its depth
2264/// exceeds the specified nesting limit. If it does, then an error is returned.
2265#[derive(Debug)]
2266struct NestLimiter<'p, 's, P> {
2267    /// The parser that is checking the nest limit.
2268    p: &'p ParserI<'s, P>,
2269    /// The current depth while walking an Ast.
2270    depth: u32,
2271}
2272
2273impl<'p, 's, P: Borrow<Parser>> NestLimiter<'p, 's, P> {
2274    fn new(p: &'p ParserI<'s, P>) -> NestLimiter<'p, 's, P> {
2275        NestLimiter { p, depth: 0 }
2276    }
2277
2278    #[inline(never)]
2279    fn check(self, ast: &Ast) -> Result<()> {
2280        ast::visit(ast, self)
2281    }
2282
2283    fn increment_depth(&mut self, span: &Span) -> Result<()> {
2284        let new = self.depth.checked_add(1).ok_or_else(|| {
2285            self.p.error(
2286                span.clone(),
2287                ast::ErrorKind::NestLimitExceeded(u32::MAX),
2288            )
2289        })?;
2290        let limit = self.p.parser().nest_limit;
2291        if new > limit {
2292            return Err(self.p.error(
2293                span.clone(),
2294                ast::ErrorKind::NestLimitExceeded(limit),
2295            ));
2296        }
2297        self.depth = new;
2298        Ok(())
2299    }
2300
2301    fn decrement_depth(&mut self) {
2302        // Assuming the correctness of the visitor, this should never drop
2303        // below 0.
2304        self.depth = self.depth.checked_sub(1).unwrap();
2305    }
2306}
2307
2308impl<'p, 's, P: Borrow<Parser>> ast::Visitor for NestLimiter<'p, 's, P> {
2309    type Output = ();
2310    type Err = ast::Error;
2311
2312    fn finish(self) -> Result<()> {
2313        Ok(())
2314    }
2315
2316    fn visit_pre(&mut self, ast: &Ast) -> Result<()> {
2317        let span = match *ast {
2318            Ast::Empty(_)
2319            | Ast::Flags(_)
2320            | Ast::Literal(_)
2321            | Ast::Dot(_)
2322            | Ast::Assertion(_)
2323            | Ast::ClassUnicode(_)
2324            | Ast::ClassPerl(_) => {
2325                // These are all base cases, so we don't increment depth.
2326                return Ok(());
2327            }
2328            Ast::ClassBracketed(ref x) => &x.span,
2329            Ast::Repetition(ref x) => &x.span,
2330            Ast::Group(ref x) => &x.span,
2331            Ast::Alternation(ref x) => &x.span,
2332            Ast::Concat(ref x) => &x.span,
2333        };
2334        self.increment_depth(span)
2335    }
2336
2337    fn visit_post(&mut self, ast: &Ast) -> Result<()> {
2338        match *ast {
2339            Ast::Empty(_)
2340            | Ast::Flags(_)
2341            | Ast::Literal(_)
2342            | Ast::Dot(_)
2343            | Ast::Assertion(_)
2344            | Ast::ClassUnicode(_)
2345            | Ast::ClassPerl(_) => {
2346                // These are all base cases, so we don't decrement depth.
2347                Ok(())
2348            }
2349            Ast::ClassBracketed(_)
2350            | Ast::Repetition(_)
2351            | Ast::Group(_)
2352            | Ast::Alternation(_)
2353            | Ast::Concat(_) => {
2354                self.decrement_depth();
2355                Ok(())
2356            }
2357        }
2358    }
2359
2360    fn visit_class_set_item_pre(
2361        &mut self,
2362        ast: &ast::ClassSetItem,
2363    ) -> Result<()> {
2364        let span = match *ast {
2365            ast::ClassSetItem::Empty(_)
2366            | ast::ClassSetItem::Literal(_)
2367            | ast::ClassSetItem::Range(_)
2368            | ast::ClassSetItem::Ascii(_)
2369            | ast::ClassSetItem::Unicode(_)
2370            | ast::ClassSetItem::Perl(_) => {
2371                // These are all base cases, so we don't increment depth.
2372                return Ok(());
2373            }
2374            ast::ClassSetItem::Bracketed(ref x) => &x.span,
2375            ast::ClassSetItem::Union(ref x) => &x.span,
2376        };
2377        self.increment_depth(span)
2378    }
2379
2380    fn visit_class_set_item_post(
2381        &mut self,
2382        ast: &ast::ClassSetItem,
2383    ) -> Result<()> {
2384        match *ast {
2385            ast::ClassSetItem::Empty(_)
2386            | ast::ClassSetItem::Literal(_)
2387            | ast::ClassSetItem::Range(_)
2388            | ast::ClassSetItem::Ascii(_)
2389            | ast::ClassSetItem::Unicode(_)
2390            | ast::ClassSetItem::Perl(_) => {
2391                // These are all base cases, so we don't decrement depth.
2392                Ok(())
2393            }
2394            ast::ClassSetItem::Bracketed(_) | ast::ClassSetItem::Union(_) => {
2395                self.decrement_depth();
2396                Ok(())
2397            }
2398        }
2399    }
2400
2401    fn visit_class_set_binary_op_pre(
2402        &mut self,
2403        ast: &ast::ClassSetBinaryOp,
2404    ) -> Result<()> {
2405        self.increment_depth(&ast.span)
2406    }
2407
2408    fn visit_class_set_binary_op_post(
2409        &mut self,
2410        _ast: &ast::ClassSetBinaryOp,
2411    ) -> Result<()> {
2412        self.decrement_depth();
2413        Ok(())
2414    }
2415}
2416
2417/// When the result is an error, transforms the ast::ErrorKind from the source
2418/// Result into another one. This function is used to return clearer error
2419/// messages when possible.
2420fn specialize_err<T>(
2421    result: Result<T>,
2422    from: ast::ErrorKind,
2423    to: ast::ErrorKind,
2424) -> Result<T> {
2425    if let Err(e) = result {
2426        if e.kind == from {
2427            Err(ast::Error { kind: to, pattern: e.pattern, span: e.span })
2428        } else {
2429            Err(e)
2430        }
2431    } else {
2432        result
2433    }
2434}
2435
2436#[cfg(test)]
2437mod tests {
2438    use core::ops::Range;
2439
2440    use alloc::format;
2441
2442    use super::*;
2443
2444    // Our own assert_eq, which has slightly better formatting (but honestly
2445    // still kind of crappy).
2446    macro_rules! assert_eq {
2447        ($left:expr, $right:expr) => {{
2448            match (&$left, &$right) {
2449                (left_val, right_val) => {
2450                    if !(*left_val == *right_val) {
2451                        panic!(
2452                            "assertion failed: `(left == right)`\n\n\
2453                             left:  `{:?}`\nright: `{:?}`\n\n",
2454                            left_val, right_val
2455                        )
2456                    }
2457                }
2458            }
2459        }};
2460    }
2461
2462    // We create these errors to compare with real ast::Errors in the tests.
2463    // We define equality between TestError and ast::Error to disregard the
2464    // pattern string in ast::Error, which is annoying to provide in tests.
2465    #[derive(Clone, Debug)]
2466    struct TestError {
2467        span: Span,
2468        kind: ast::ErrorKind,
2469    }
2470
2471    impl PartialEq<ast::Error> for TestError {
2472        fn eq(&self, other: &ast::Error) -> bool {
2473            self.span == other.span && self.kind == other.kind
2474        }
2475    }
2476
2477    impl PartialEq<TestError> for ast::Error {
2478        fn eq(&self, other: &TestError) -> bool {
2479            self.span == other.span && self.kind == other.kind
2480        }
2481    }
2482
2483    fn s(str: &str) -> String {
2484        str.to_string()
2485    }
2486
2487    fn parser(pattern: &str) -> ParserI<'_, Parser> {
2488        ParserI::new(Parser::new(), pattern)
2489    }
2490
2491    fn parser_octal(pattern: &str) -> ParserI<'_, Parser> {
2492        let parser = ParserBuilder::new().octal(true).build();
2493        ParserI::new(parser, pattern)
2494    }
2495
2496    fn parser_empty_min_range(pattern: &str) -> ParserI<'_, Parser> {
2497        let parser = ParserBuilder::new().empty_min_range(true).build();
2498        ParserI::new(parser, pattern)
2499    }
2500
2501    fn parser_nest_limit(
2502        pattern: &str,
2503        nest_limit: u32,
2504    ) -> ParserI<'_, Parser> {
2505        let p = ParserBuilder::new().nest_limit(nest_limit).build();
2506        ParserI::new(p, pattern)
2507    }
2508
2509    fn parser_ignore_whitespace(pattern: &str) -> ParserI<'_, Parser> {
2510        let p = ParserBuilder::new().ignore_whitespace(true).build();
2511        ParserI::new(p, pattern)
2512    }
2513
2514    /// Short alias for creating a new span.
2515    fn nspan(start: Position, end: Position) -> Span {
2516        Span::new(start, end)
2517    }
2518
2519    /// Short alias for creating a new position.
2520    fn npos(offset: usize, line: usize, column: usize) -> Position {
2521        Position::new(offset, line, column)
2522    }
2523
2524    /// Create a new span from the given offset range. This assumes a single
2525    /// line and sets the columns based on the offsets. i.e., This only works
2526    /// out of the box for ASCII, which is fine for most tests.
2527    fn span(range: Range<usize>) -> Span {
2528        let start = Position::new(range.start, 1, range.start + 1);
2529        let end = Position::new(range.end, 1, range.end + 1);
2530        Span::new(start, end)
2531    }
2532
2533    /// Create a new span for the corresponding byte range in the given string.
2534    fn span_range(subject: &str, range: Range<usize>) -> Span {
2535        let start = Position {
2536            offset: range.start,
2537            line: 1 + subject[..range.start].matches('\n').count(),
2538            column: 1 + subject[..range.start]
2539                .chars()
2540                .rev()
2541                .position(|c| c == '\n')
2542                .unwrap_or(subject[..range.start].chars().count()),
2543        };
2544        let end = Position {
2545            offset: range.end,
2546            line: 1 + subject[..range.end].matches('\n').count(),
2547            column: 1 + subject[..range.end]
2548                .chars()
2549                .rev()
2550                .position(|c| c == '\n')
2551                .unwrap_or(subject[..range.end].chars().count()),
2552        };
2553        Span::new(start, end)
2554    }
2555
2556    /// Create a verbatim literal starting at the given position.
2557    fn lit(c: char, start: usize) -> Ast {
2558        lit_with(c, span(start..start + c.len_utf8()))
2559    }
2560
2561    /// Create a meta literal starting at the given position.
2562    fn meta_lit(c: char, span: Span) -> Ast {
2563        Ast::literal(ast::Literal { span, kind: ast::LiteralKind::Meta, c })
2564    }
2565
2566    /// Create a verbatim literal with the given span.
2567    fn lit_with(c: char, span: Span) -> Ast {
2568        Ast::literal(ast::Literal {
2569            span,
2570            kind: ast::LiteralKind::Verbatim,
2571            c,
2572        })
2573    }
2574
2575    /// Create a concatenation with the given range.
2576    fn concat(range: Range<usize>, asts: Vec<Ast>) -> Ast {
2577        concat_with(span(range), asts)
2578    }
2579
2580    /// Create a concatenation with the given span.
2581    fn concat_with(span: Span, asts: Vec<Ast>) -> Ast {
2582        Ast::concat(ast::Concat { span, asts })
2583    }
2584
2585    /// Create an alternation with the given span.
2586    fn alt(range: Range<usize>, asts: Vec<Ast>) -> Ast {
2587        Ast::alternation(ast::Alternation { span: span(range), asts })
2588    }
2589
2590    /// Create a capturing group with the given span.
2591    fn group(range: Range<usize>, index: u32, ast: Ast) -> Ast {
2592        Ast::group(ast::Group {
2593            span: span(range),
2594            kind: ast::GroupKind::CaptureIndex(index),
2595            ast: Box::new(ast),
2596        })
2597    }
2598
2599    /// Create an ast::SetFlags.
2600    ///
2601    /// The given pattern should be the full pattern string. The range given
2602    /// should correspond to the byte offsets where the flag set occurs.
2603    ///
2604    /// If negated is true, then the set is interpreted as beginning with a
2605    /// negation.
2606    fn flag_set(
2607        pat: &str,
2608        range: Range<usize>,
2609        flag: ast::Flag,
2610        negated: bool,
2611    ) -> Ast {
2612        let mut items = vec![ast::FlagsItem {
2613            span: span_range(pat, (range.end - 2)..(range.end - 1)),
2614            kind: ast::FlagsItemKind::Flag(flag),
2615        }];
2616        if negated {
2617            items.insert(
2618                0,
2619                ast::FlagsItem {
2620                    span: span_range(pat, (range.start + 2)..(range.end - 2)),
2621                    kind: ast::FlagsItemKind::Negation,
2622                },
2623            );
2624        }
2625        Ast::flags(ast::SetFlags {
2626            span: span_range(pat, range.clone()),
2627            flags: ast::Flags {
2628                span: span_range(pat, (range.start + 2)..(range.end - 1)),
2629                items,
2630            },
2631        })
2632    }
2633
2634    #[test]
2635    fn parse_nest_limit() {
2636        // A nest limit of 0 still allows some types of regexes.
2637        assert_eq!(
2638            parser_nest_limit("", 0).parse(),
2639            Ok(Ast::empty(span(0..0)))
2640        );
2641        assert_eq!(parser_nest_limit("a", 0).parse(), Ok(lit('a', 0)));
2642
2643        // Test repetition operations, which require one level of nesting.
2644        assert_eq!(
2645            parser_nest_limit("a+", 0).parse().unwrap_err(),
2646            TestError {
2647                span: span(0..2),
2648                kind: ast::ErrorKind::NestLimitExceeded(0),
2649            }
2650        );
2651        assert_eq!(
2652            parser_nest_limit("a+", 1).parse(),
2653            Ok(Ast::repetition(ast::Repetition {
2654                span: span(0..2),
2655                op: ast::RepetitionOp {
2656                    span: span(1..2),
2657                    kind: ast::RepetitionKind::OneOrMore,
2658                },
2659                greedy: true,
2660                ast: Box::new(lit('a', 0)),
2661            }))
2662        );
2663        assert_eq!(
2664            parser_nest_limit("(a)+", 1).parse().unwrap_err(),
2665            TestError {
2666                span: span(0..3),
2667                kind: ast::ErrorKind::NestLimitExceeded(1),
2668            }
2669        );
2670        assert_eq!(
2671            parser_nest_limit("a+*", 1).parse().unwrap_err(),
2672            TestError {
2673                span: span(0..2),
2674                kind: ast::ErrorKind::NestLimitExceeded(1),
2675            }
2676        );
2677        assert_eq!(
2678            parser_nest_limit("a+*", 2).parse(),
2679            Ok(Ast::repetition(ast::Repetition {
2680                span: span(0..3),
2681                op: ast::RepetitionOp {
2682                    span: span(2..3),
2683                    kind: ast::RepetitionKind::ZeroOrMore,
2684                },
2685                greedy: true,
2686                ast: Box::new(Ast::repetition(ast::Repetition {
2687                    span: span(0..2),
2688                    op: ast::RepetitionOp {
2689                        span: span(1..2),
2690                        kind: ast::RepetitionKind::OneOrMore,
2691                    },
2692                    greedy: true,
2693                    ast: Box::new(lit('a', 0)),
2694                })),
2695            }))
2696        );
2697
2698        // Test concatenations. A concatenation requires one level of nesting.
2699        assert_eq!(
2700            parser_nest_limit("ab", 0).parse().unwrap_err(),
2701            TestError {
2702                span: span(0..2),
2703                kind: ast::ErrorKind::NestLimitExceeded(0),
2704            }
2705        );
2706        assert_eq!(
2707            parser_nest_limit("ab", 1).parse(),
2708            Ok(concat(0..2, vec![lit('a', 0), lit('b', 1)]))
2709        );
2710        assert_eq!(
2711            parser_nest_limit("abc", 1).parse(),
2712            Ok(concat(0..3, vec![lit('a', 0), lit('b', 1), lit('c', 2)]))
2713        );
2714
2715        // Test alternations. An alternation requires one level of nesting.
2716        assert_eq!(
2717            parser_nest_limit("a|b", 0).parse().unwrap_err(),
2718            TestError {
2719                span: span(0..3),
2720                kind: ast::ErrorKind::NestLimitExceeded(0),
2721            }
2722        );
2723        assert_eq!(
2724            parser_nest_limit("a|b", 1).parse(),
2725            Ok(alt(0..3, vec![lit('a', 0), lit('b', 2)]))
2726        );
2727        assert_eq!(
2728            parser_nest_limit("a|b|c", 1).parse(),
2729            Ok(alt(0..5, vec![lit('a', 0), lit('b', 2), lit('c', 4)]))
2730        );
2731
2732        // Test character classes. Classes form their own mini-recursive
2733        // syntax!
2734        assert_eq!(
2735            parser_nest_limit("[a]", 0).parse().unwrap_err(),
2736            TestError {
2737                span: span(0..3),
2738                kind: ast::ErrorKind::NestLimitExceeded(0),
2739            }
2740        );
2741        assert_eq!(
2742            parser_nest_limit("[a]", 1).parse(),
2743            Ok(Ast::class_bracketed(ast::ClassBracketed {
2744                span: span(0..3),
2745                negated: false,
2746                kind: ast::ClassSet::Item(ast::ClassSetItem::Literal(
2747                    ast::Literal {
2748                        span: span(1..2),
2749                        kind: ast::LiteralKind::Verbatim,
2750                        c: 'a',
2751                    }
2752                )),
2753            }))
2754        );
2755        assert_eq!(
2756            parser_nest_limit("[ab]", 1).parse().unwrap_err(),
2757            TestError {
2758                span: span(1..3),
2759                kind: ast::ErrorKind::NestLimitExceeded(1),
2760            }
2761        );
2762        assert_eq!(
2763            parser_nest_limit("[ab[cd]]", 2).parse().unwrap_err(),
2764            TestError {
2765                span: span(3..7),
2766                kind: ast::ErrorKind::NestLimitExceeded(2),
2767            }
2768        );
2769        assert_eq!(
2770            parser_nest_limit("[ab[cd]]", 3).parse().unwrap_err(),
2771            TestError {
2772                span: span(4..6),
2773                kind: ast::ErrorKind::NestLimitExceeded(3),
2774            }
2775        );
2776        assert_eq!(
2777            parser_nest_limit("[a--b]", 1).parse().unwrap_err(),
2778            TestError {
2779                span: span(1..5),
2780                kind: ast::ErrorKind::NestLimitExceeded(1),
2781            }
2782        );
2783        assert_eq!(
2784            parser_nest_limit("[a--bc]", 2).parse().unwrap_err(),
2785            TestError {
2786                span: span(4..6),
2787                kind: ast::ErrorKind::NestLimitExceeded(2),
2788            }
2789        );
2790    }
2791
2792    #[test]
2793    fn parse_comments() {
2794        let pat = "(?x)
2795# This is comment 1.
2796foo # This is comment 2.
2797  # This is comment 3.
2798bar
2799# This is comment 4.";
2800        let astc = parser(pat).parse_with_comments().unwrap();
2801        assert_eq!(
2802            astc.ast,
2803            concat_with(
2804                span_range(pat, 0..pat.len()),
2805                vec![
2806                    flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false),
2807                    lit_with('f', span_range(pat, 26..27)),
2808                    lit_with('o', span_range(pat, 27..28)),
2809                    lit_with('o', span_range(pat, 28..29)),
2810                    lit_with('b', span_range(pat, 74..75)),
2811                    lit_with('a', span_range(pat, 75..76)),
2812                    lit_with('r', span_range(pat, 76..77)),
2813                ]
2814            )
2815        );
2816        assert_eq!(
2817            astc.comments,
2818            vec![
2819                ast::Comment {
2820                    span: span_range(pat, 5..26),
2821                    comment: s(" This is comment 1."),
2822                },
2823                ast::Comment {
2824                    span: span_range(pat, 30..51),
2825                    comment: s(" This is comment 2."),
2826                },
2827                ast::Comment {
2828                    span: span_range(pat, 53..74),
2829                    comment: s(" This is comment 3."),
2830                },
2831                ast::Comment {
2832                    span: span_range(pat, 78..98),
2833                    comment: s(" This is comment 4."),
2834                },
2835            ]
2836        );
2837    }
2838
2839    #[test]
2840    fn parse_holistic() {
2841        assert_eq!(parser("]").parse(), Ok(lit(']', 0)));
2842        assert_eq!(
2843            parser(r"\\\.\+\*\?\(\)\|\[\]\{\}\^\$\#\&\-\~").parse(),
2844            Ok(concat(
2845                0..36,
2846                vec![
2847                    meta_lit('\\', span(0..2)),
2848                    meta_lit('.', span(2..4)),
2849                    meta_lit('+', span(4..6)),
2850                    meta_lit('*', span(6..8)),
2851                    meta_lit('?', span(8..10)),
2852                    meta_lit('(', span(10..12)),
2853                    meta_lit(')', span(12..14)),
2854                    meta_lit('|', span(14..16)),
2855                    meta_lit('[', span(16..18)),
2856                    meta_lit(']', span(18..20)),
2857                    meta_lit('{', span(20..22)),
2858                    meta_lit('}', span(22..24)),
2859                    meta_lit('^', span(24..26)),
2860                    meta_lit('$', span(26..28)),
2861                    meta_lit('#', span(28..30)),
2862                    meta_lit('&', span(30..32)),
2863                    meta_lit('-', span(32..34)),
2864                    meta_lit('~', span(34..36)),
2865                ]
2866            ))
2867        );
2868    }
2869
2870    #[test]
2871    fn parse_ignore_whitespace() {
2872        // Test that basic whitespace insensitivity works.
2873        let pat = "(?x)a b";
2874        assert_eq!(
2875            parser(pat).parse(),
2876            Ok(concat_with(
2877                nspan(npos(0, 1, 1), npos(7, 1, 8)),
2878                vec![
2879                    flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false),
2880                    lit_with('a', nspan(npos(4, 1, 5), npos(5, 1, 6))),
2881                    lit_with('b', nspan(npos(6, 1, 7), npos(7, 1, 8))),
2882                ]
2883            ))
2884        );
2885
2886        // Test that we can toggle whitespace insensitivity.
2887        let pat = "(?x)a b(?-x)a b";
2888        assert_eq!(
2889            parser(pat).parse(),
2890            Ok(concat_with(
2891                nspan(npos(0, 1, 1), npos(15, 1, 16)),
2892                vec![
2893                    flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false),
2894                    lit_with('a', nspan(npos(4, 1, 5), npos(5, 1, 6))),
2895                    lit_with('b', nspan(npos(6, 1, 7), npos(7, 1, 8))),
2896                    flag_set(pat, 7..12, ast::Flag::IgnoreWhitespace, true),
2897                    lit_with('a', nspan(npos(12, 1, 13), npos(13, 1, 14))),
2898                    lit_with(' ', nspan(npos(13, 1, 14), npos(14, 1, 15))),
2899                    lit_with('b', nspan(npos(14, 1, 15), npos(15, 1, 16))),
2900                ]
2901            ))
2902        );
2903
2904        // Test that nesting whitespace insensitive flags works.
2905        let pat = "a (?x:a )a ";
2906        assert_eq!(
2907            parser(pat).parse(),
2908            Ok(concat_with(
2909                span_range(pat, 0..11),
2910                vec![
2911                    lit_with('a', span_range(pat, 0..1)),
2912                    lit_with(' ', span_range(pat, 1..2)),
2913                    Ast::group(ast::Group {
2914                        span: span_range(pat, 2..9),
2915                        kind: ast::GroupKind::NonCapturing(ast::Flags {
2916                            span: span_range(pat, 4..5),
2917                            items: vec![ast::FlagsItem {
2918                                span: span_range(pat, 4..5),
2919                                kind: ast::FlagsItemKind::Flag(
2920                                    ast::Flag::IgnoreWhitespace
2921                                ),
2922                            },],
2923                        }),
2924                        ast: Box::new(lit_with('a', span_range(pat, 6..7))),
2925                    }),
2926                    lit_with('a', span_range(pat, 9..10)),
2927                    lit_with(' ', span_range(pat, 10..11)),
2928                ]
2929            ))
2930        );
2931
2932        // Test that whitespace after an opening paren is insignificant.
2933        let pat = "(?x)( ?P<foo> a )";
2934        assert_eq!(
2935            parser(pat).parse(),
2936            Ok(concat_with(
2937                span_range(pat, 0..pat.len()),
2938                vec![
2939                    flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false),
2940                    Ast::group(ast::Group {
2941                        span: span_range(pat, 4..pat.len()),
2942                        kind: ast::GroupKind::CaptureName {
2943                            starts_with_p: true,
2944                            name: ast::CaptureName {
2945                                span: span_range(pat, 9..12),
2946                                name: s("foo"),
2947                                index: 1,
2948                            }
2949                        },
2950                        ast: Box::new(lit_with('a', span_range(pat, 14..15))),
2951                    }),
2952                ]
2953            ))
2954        );
2955        let pat = "(?x)(  a )";
2956        assert_eq!(
2957            parser(pat).parse(),
2958            Ok(concat_with(
2959                span_range(pat, 0..pat.len()),
2960                vec![
2961                    flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false),
2962                    Ast::group(ast::Group {
2963                        span: span_range(pat, 4..pat.len()),
2964                        kind: ast::GroupKind::CaptureIndex(1),
2965                        ast: Box::new(lit_with('a', span_range(pat, 7..8))),
2966                    }),
2967                ]
2968            ))
2969        );
2970        let pat = "(?x)(  ?:  a )";
2971        assert_eq!(
2972            parser(pat).parse(),
2973            Ok(concat_with(
2974                span_range(pat, 0..pat.len()),
2975                vec![
2976                    flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false),
2977                    Ast::group(ast::Group {
2978                        span: span_range(pat, 4..pat.len()),
2979                        kind: ast::GroupKind::NonCapturing(ast::Flags {
2980                            span: span_range(pat, 8..8),
2981                            items: vec![],
2982                        }),
2983                        ast: Box::new(lit_with('a', span_range(pat, 11..12))),
2984                    }),
2985                ]
2986            ))
2987        );
2988        let pat = r"(?x)\x { 53 }";
2989        assert_eq!(
2990            parser(pat).parse(),
2991            Ok(concat_with(
2992                span_range(pat, 0..pat.len()),
2993                vec![
2994                    flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false),
2995                    Ast::literal(ast::Literal {
2996                        span: span(4..13),
2997                        kind: ast::LiteralKind::HexBrace(
2998                            ast::HexLiteralKind::X
2999                        ),
3000                        c: 'S',
3001                    }),
3002                ]
3003            ))
3004        );
3005
3006        // Test that whitespace after an escape is OK.
3007        let pat = r"(?x)\ ";
3008        assert_eq!(
3009            parser(pat).parse(),
3010            Ok(concat_with(
3011                span_range(pat, 0..pat.len()),
3012                vec![
3013                    flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false),
3014                    Ast::literal(ast::Literal {
3015                        span: span_range(pat, 4..6),
3016                        kind: ast::LiteralKind::Superfluous,
3017                        c: ' ',
3018                    }),
3019                ]
3020            ))
3021        );
3022    }
3023
3024    #[test]
3025    fn parse_newlines() {
3026        let pat = ".\n.";
3027        assert_eq!(
3028            parser(pat).parse(),
3029            Ok(concat_with(
3030                span_range(pat, 0..3),
3031                vec![
3032                    Ast::dot(span_range(pat, 0..1)),
3033                    lit_with('\n', span_range(pat, 1..2)),
3034                    Ast::dot(span_range(pat, 2..3)),
3035                ]
3036            ))
3037        );
3038
3039        let pat = "foobar\nbaz\nquux\n";
3040        assert_eq!(
3041            parser(pat).parse(),
3042            Ok(concat_with(
3043                span_range(pat, 0..pat.len()),
3044                vec![
3045                    lit_with('f', nspan(npos(0, 1, 1), npos(1, 1, 2))),
3046                    lit_with('o', nspan(npos(1, 1, 2), npos(2, 1, 3))),
3047                    lit_with('o', nspan(npos(2, 1, 3), npos(3, 1, 4))),
3048                    lit_with('b', nspan(npos(3, 1, 4), npos(4, 1, 5))),
3049                    lit_with('a', nspan(npos(4, 1, 5), npos(5, 1, 6))),
3050                    lit_with('r', nspan(npos(5, 1, 6), npos(6, 1, 7))),
3051                    lit_with('\n', nspan(npos(6, 1, 7), npos(7, 2, 1))),
3052                    lit_with('b', nspan(npos(7, 2, 1), npos(8, 2, 2))),
3053                    lit_with('a', nspan(npos(8, 2, 2), npos(9, 2, 3))),
3054                    lit_with('z', nspan(npos(9, 2, 3), npos(10, 2, 4))),
3055                    lit_with('\n', nspan(npos(10, 2, 4), npos(11, 3, 1))),
3056                    lit_with('q', nspan(npos(11, 3, 1), npos(12, 3, 2))),
3057                    lit_with('u', nspan(npos(12, 3, 2), npos(13, 3, 3))),
3058                    lit_with('u', nspan(npos(13, 3, 3), npos(14, 3, 4))),
3059                    lit_with('x', nspan(npos(14, 3, 4), npos(15, 3, 5))),
3060                    lit_with('\n', nspan(npos(15, 3, 5), npos(16, 4, 1))),
3061                ]
3062            ))
3063        );
3064    }
3065
3066    #[test]
3067    fn parse_uncounted_repetition() {
3068        assert_eq!(
3069            parser(r"a*").parse(),
3070            Ok(Ast::repetition(ast::Repetition {
3071                span: span(0..2),
3072                op: ast::RepetitionOp {
3073                    span: span(1..2),
3074                    kind: ast::RepetitionKind::ZeroOrMore,
3075                },
3076                greedy: true,
3077                ast: Box::new(lit('a', 0)),
3078            }))
3079        );
3080        assert_eq!(
3081            parser(r"a+").parse(),
3082            Ok(Ast::repetition(ast::Repetition {
3083                span: span(0..2),
3084                op: ast::RepetitionOp {
3085                    span: span(1..2),
3086                    kind: ast::RepetitionKind::OneOrMore,
3087                },
3088                greedy: true,
3089                ast: Box::new(lit('a', 0)),
3090            }))
3091        );
3092
3093        assert_eq!(
3094            parser(r"a?").parse(),
3095            Ok(Ast::repetition(ast::Repetition {
3096                span: span(0..2),
3097                op: ast::RepetitionOp {
3098                    span: span(1..2),
3099                    kind: ast::RepetitionKind::ZeroOrOne,
3100                },
3101                greedy: true,
3102                ast: Box::new(lit('a', 0)),
3103            }))
3104        );
3105        assert_eq!(
3106            parser(r"a??").parse(),
3107            Ok(Ast::repetition(ast::Repetition {
3108                span: span(0..3),
3109                op: ast::RepetitionOp {
3110                    span: span(1..3),
3111                    kind: ast::RepetitionKind::ZeroOrOne,
3112                },
3113                greedy: false,
3114                ast: Box::new(lit('a', 0)),
3115            }))
3116        );
3117        assert_eq!(
3118            parser(r"a?").parse(),
3119            Ok(Ast::repetition(ast::Repetition {
3120                span: span(0..2),
3121                op: ast::RepetitionOp {
3122                    span: span(1..2),
3123                    kind: ast::RepetitionKind::ZeroOrOne,
3124                },
3125                greedy: true,
3126                ast: Box::new(lit('a', 0)),
3127            }))
3128        );
3129        assert_eq!(
3130            parser(r"a?b").parse(),
3131            Ok(concat(
3132                0..3,
3133                vec![
3134                    Ast::repetition(ast::Repetition {
3135                        span: span(0..2),
3136                        op: ast::RepetitionOp {
3137                            span: span(1..2),
3138                            kind: ast::RepetitionKind::ZeroOrOne,
3139                        },
3140                        greedy: true,
3141                        ast: Box::new(lit('a', 0)),
3142                    }),
3143                    lit('b', 2),
3144                ]
3145            ))
3146        );
3147        assert_eq!(
3148            parser(r"a??b").parse(),
3149            Ok(concat(
3150                0..4,
3151                vec![
3152                    Ast::repetition(ast::Repetition {
3153                        span: span(0..3),
3154                        op: ast::RepetitionOp {
3155                            span: span(1..3),
3156                            kind: ast::RepetitionKind::ZeroOrOne,
3157                        },
3158                        greedy: false,
3159                        ast: Box::new(lit('a', 0)),
3160                    }),
3161                    lit('b', 3),
3162                ]
3163            ))
3164        );
3165        assert_eq!(
3166            parser(r"ab?").parse(),
3167            Ok(concat(
3168                0..3,
3169                vec![
3170                    lit('a', 0),
3171                    Ast::repetition(ast::Repetition {
3172                        span: span(1..3),
3173                        op: ast::RepetitionOp {
3174                            span: span(2..3),
3175                            kind: ast::RepetitionKind::ZeroOrOne,
3176                        },
3177                        greedy: true,
3178                        ast: Box::new(lit('b', 1)),
3179                    }),
3180                ]
3181            ))
3182        );
3183        assert_eq!(
3184            parser(r"(ab)?").parse(),
3185            Ok(Ast::repetition(ast::Repetition {
3186                span: span(0..5),
3187                op: ast::RepetitionOp {
3188                    span: span(4..5),
3189                    kind: ast::RepetitionKind::ZeroOrOne,
3190                },
3191                greedy: true,
3192                ast: Box::new(group(
3193                    0..4,
3194                    1,
3195                    concat(1..3, vec![lit('a', 1), lit('b', 2),])
3196                )),
3197            }))
3198        );
3199        assert_eq!(
3200            parser(r"|a?").parse(),
3201            Ok(alt(
3202                0..3,
3203                vec![
3204                    Ast::empty(span(0..0)),
3205                    Ast::repetition(ast::Repetition {
3206                        span: span(1..3),
3207                        op: ast::RepetitionOp {
3208                            span: span(2..3),
3209                            kind: ast::RepetitionKind::ZeroOrOne,
3210                        },
3211                        greedy: true,
3212                        ast: Box::new(lit('a', 1)),
3213                    }),
3214                ]
3215            ))
3216        );
3217
3218        assert_eq!(
3219            parser(r"*").parse().unwrap_err(),
3220            TestError {
3221                span: span(0..0),
3222                kind: ast::ErrorKind::RepetitionMissing,
3223            }
3224        );
3225        assert_eq!(
3226            parser(r"(?i)*").parse().unwrap_err(),
3227            TestError {
3228                span: span(4..4),
3229                kind: ast::ErrorKind::RepetitionMissing,
3230            }
3231        );
3232        assert_eq!(
3233            parser(r"(*)").parse().unwrap_err(),
3234            TestError {
3235                span: span(1..1),
3236                kind: ast::ErrorKind::RepetitionMissing,
3237            }
3238        );
3239        assert_eq!(
3240            parser(r"(?:?)").parse().unwrap_err(),
3241            TestError {
3242                span: span(3..3),
3243                kind: ast::ErrorKind::RepetitionMissing,
3244            }
3245        );
3246        assert_eq!(
3247            parser(r"+").parse().unwrap_err(),
3248            TestError {
3249                span: span(0..0),
3250                kind: ast::ErrorKind::RepetitionMissing,
3251            }
3252        );
3253        assert_eq!(
3254            parser(r"?").parse().unwrap_err(),
3255            TestError {
3256                span: span(0..0),
3257                kind: ast::ErrorKind::RepetitionMissing,
3258            }
3259        );
3260        assert_eq!(
3261            parser(r"(?)").parse().unwrap_err(),
3262            TestError {
3263                span: span(1..1),
3264                kind: ast::ErrorKind::RepetitionMissing,
3265            }
3266        );
3267        assert_eq!(
3268            parser(r"|*").parse().unwrap_err(),
3269            TestError {
3270                span: span(1..1),
3271                kind: ast::ErrorKind::RepetitionMissing,
3272            }
3273        );
3274        assert_eq!(
3275            parser(r"|+").parse().unwrap_err(),
3276            TestError {
3277                span: span(1..1),
3278                kind: ast::ErrorKind::RepetitionMissing,
3279            }
3280        );
3281        assert_eq!(
3282            parser(r"|?").parse().unwrap_err(),
3283            TestError {
3284                span: span(1..1),
3285                kind: ast::ErrorKind::RepetitionMissing,
3286            }
3287        );
3288    }
3289
3290    #[test]
3291    fn parse_counted_repetition() {
3292        assert_eq!(
3293            parser(r"a{5}").parse(),
3294            Ok(Ast::repetition(ast::Repetition {
3295                span: span(0..4),
3296                op: ast::RepetitionOp {
3297                    span: span(1..4),
3298                    kind: ast::RepetitionKind::Range(
3299                        ast::RepetitionRange::Exactly(5)
3300                    ),
3301                },
3302                greedy: true,
3303                ast: Box::new(lit('a', 0)),
3304            }))
3305        );
3306        assert_eq!(
3307            parser(r"a{5,}").parse(),
3308            Ok(Ast::repetition(ast::Repetition {
3309                span: span(0..5),
3310                op: ast::RepetitionOp {
3311                    span: span(1..5),
3312                    kind: ast::RepetitionKind::Range(
3313                        ast::RepetitionRange::AtLeast(5)
3314                    ),
3315                },
3316                greedy: true,
3317                ast: Box::new(lit('a', 0)),
3318            }))
3319        );
3320        assert_eq!(
3321            parser(r"a{5,9}").parse(),
3322            Ok(Ast::repetition(ast::Repetition {
3323                span: span(0..6),
3324                op: ast::RepetitionOp {
3325                    span: span(1..6),
3326                    kind: ast::RepetitionKind::Range(
3327                        ast::RepetitionRange::Bounded(5, 9)
3328                    ),
3329                },
3330                greedy: true,
3331                ast: Box::new(lit('a', 0)),
3332            }))
3333        );
3334        assert_eq!(
3335            parser(r"a{5}?").parse(),
3336            Ok(Ast::repetition(ast::Repetition {
3337                span: span(0..5),
3338                op: ast::RepetitionOp {
3339                    span: span(1..5),
3340                    kind: ast::RepetitionKind::Range(
3341                        ast::RepetitionRange::Exactly(5)
3342                    ),
3343                },
3344                greedy: false,
3345                ast: Box::new(lit('a', 0)),
3346            }))
3347        );
3348        assert_eq!(
3349            parser(r"ab{5}").parse(),
3350            Ok(concat(
3351                0..5,
3352                vec![
3353                    lit('a', 0),
3354                    Ast::repetition(ast::Repetition {
3355                        span: span(1..5),
3356                        op: ast::RepetitionOp {
3357                            span: span(2..5),
3358                            kind: ast::RepetitionKind::Range(
3359                                ast::RepetitionRange::Exactly(5)
3360                            ),
3361                        },
3362                        greedy: true,
3363                        ast: Box::new(lit('b', 1)),
3364                    }),
3365                ]
3366            ))
3367        );
3368        assert_eq!(
3369            parser(r"ab{5}c").parse(),
3370            Ok(concat(
3371                0..6,
3372                vec![
3373                    lit('a', 0),
3374                    Ast::repetition(ast::Repetition {
3375                        span: span(1..5),
3376                        op: ast::RepetitionOp {
3377                            span: span(2..5),
3378                            kind: ast::RepetitionKind::Range(
3379                                ast::RepetitionRange::Exactly(5)
3380                            ),
3381                        },
3382                        greedy: true,
3383                        ast: Box::new(lit('b', 1)),
3384                    }),
3385                    lit('c', 5),
3386                ]
3387            ))
3388        );
3389
3390        assert_eq!(
3391            parser(r"a{ 5 }").parse(),
3392            Ok(Ast::repetition(ast::Repetition {
3393                span: span(0..6),
3394                op: ast::RepetitionOp {
3395                    span: span(1..6),
3396                    kind: ast::RepetitionKind::Range(
3397                        ast::RepetitionRange::Exactly(5)
3398                    ),
3399                },
3400                greedy: true,
3401                ast: Box::new(lit('a', 0)),
3402            }))
3403        );
3404        assert_eq!(
3405            parser(r"a{ 5 , 9 }").parse(),
3406            Ok(Ast::repetition(ast::Repetition {
3407                span: span(0..10),
3408                op: ast::RepetitionOp {
3409                    span: span(1..10),
3410                    kind: ast::RepetitionKind::Range(
3411                        ast::RepetitionRange::Bounded(5, 9)
3412                    ),
3413                },
3414                greedy: true,
3415                ast: Box::new(lit('a', 0)),
3416            }))
3417        );
3418        assert_eq!(
3419            parser_empty_min_range(r"a{,9}").parse(),
3420            Ok(Ast::repetition(ast::Repetition {
3421                span: span(0..5),
3422                op: ast::RepetitionOp {
3423                    span: span(1..5),
3424                    kind: ast::RepetitionKind::Range(
3425                        ast::RepetitionRange::Bounded(0, 9)
3426                    ),
3427                },
3428                greedy: true,
3429                ast: Box::new(lit('a', 0)),
3430            }))
3431        );
3432        assert_eq!(
3433            parser_ignore_whitespace(r"a{5,9} ?").parse(),
3434            Ok(Ast::repetition(ast::Repetition {
3435                span: span(0..8),
3436                op: ast::RepetitionOp {
3437                    span: span(1..8),
3438                    kind: ast::RepetitionKind::Range(
3439                        ast::RepetitionRange::Bounded(5, 9)
3440                    ),
3441                },
3442                greedy: false,
3443                ast: Box::new(lit('a', 0)),
3444            }))
3445        );
3446        assert_eq!(
3447            parser(r"\b{5,9}").parse(),
3448            Ok(Ast::repetition(ast::Repetition {
3449                span: span(0..7),
3450                op: ast::RepetitionOp {
3451                    span: span(2..7),
3452                    kind: ast::RepetitionKind::Range(
3453                        ast::RepetitionRange::Bounded(5, 9)
3454                    ),
3455                },
3456                greedy: true,
3457                ast: Box::new(Ast::assertion(ast::Assertion {
3458                    span: span(0..2),
3459                    kind: ast::AssertionKind::WordBoundary,
3460                })),
3461            }))
3462        );
3463
3464        assert_eq!(
3465            parser(r"(?i){0}").parse().unwrap_err(),
3466            TestError {
3467                span: span(4..4),
3468                kind: ast::ErrorKind::RepetitionMissing,
3469            }
3470        );
3471        assert_eq!(
3472            parser(r"(?m){1,1}").parse().unwrap_err(),
3473            TestError {
3474                span: span(4..4),
3475                kind: ast::ErrorKind::RepetitionMissing,
3476            }
3477        );
3478        assert_eq!(
3479            parser(r"a{]}").parse().unwrap_err(),
3480            TestError {
3481                span: span(2..2),
3482                kind: ast::ErrorKind::RepetitionCountDecimalEmpty,
3483            }
3484        );
3485        assert_eq!(
3486            parser(r"a{1,]}").parse().unwrap_err(),
3487            TestError {
3488                span: span(4..4),
3489                kind: ast::ErrorKind::RepetitionCountDecimalEmpty,
3490            }
3491        );
3492        assert_eq!(
3493            parser(r"a{").parse().unwrap_err(),
3494            TestError {
3495                span: span(1..2),
3496                kind: ast::ErrorKind::RepetitionCountUnclosed,
3497            }
3498        );
3499        assert_eq!(
3500            parser(r"a{}").parse().unwrap_err(),
3501            TestError {
3502                span: span(2..2),
3503                kind: ast::ErrorKind::RepetitionCountDecimalEmpty,
3504            }
3505        );
3506        assert_eq!(
3507            parser(r"a{a").parse().unwrap_err(),
3508            TestError {
3509                span: span(2..2),
3510                kind: ast::ErrorKind::RepetitionCountDecimalEmpty,
3511            }
3512        );
3513        assert_eq!(
3514            parser(r"a{9999999999}").parse().unwrap_err(),
3515            TestError {
3516                span: span(2..12),
3517                kind: ast::ErrorKind::DecimalInvalid,
3518            }
3519        );
3520        assert_eq!(
3521            parser(r"a{9").parse().unwrap_err(),
3522            TestError {
3523                span: span(1..3),
3524                kind: ast::ErrorKind::RepetitionCountUnclosed,
3525            }
3526        );
3527        assert_eq!(
3528            parser(r"a{9,a").parse().unwrap_err(),
3529            TestError {
3530                span: span(4..4),
3531                kind: ast::ErrorKind::RepetitionCountDecimalEmpty,
3532            }
3533        );
3534        assert_eq!(
3535            parser(r"a{9,9999999999}").parse().unwrap_err(),
3536            TestError {
3537                span: span(4..14),
3538                kind: ast::ErrorKind::DecimalInvalid,
3539            }
3540        );
3541        assert_eq!(
3542            parser(r"a{9,").parse().unwrap_err(),
3543            TestError {
3544                span: span(1..4),
3545                kind: ast::ErrorKind::RepetitionCountUnclosed,
3546            }
3547        );
3548        assert_eq!(
3549            parser(r"a{9,11").parse().unwrap_err(),
3550            TestError {
3551                span: span(1..6),
3552                kind: ast::ErrorKind::RepetitionCountUnclosed,
3553            }
3554        );
3555        assert_eq!(
3556            parser(r"a{2,1}").parse().unwrap_err(),
3557            TestError {
3558                span: span(1..6),
3559                kind: ast::ErrorKind::RepetitionCountInvalid,
3560            }
3561        );
3562        assert_eq!(
3563            parser(r"{5}").parse().unwrap_err(),
3564            TestError {
3565                span: span(0..0),
3566                kind: ast::ErrorKind::RepetitionMissing,
3567            }
3568        );
3569        assert_eq!(
3570            parser(r"|{5}").parse().unwrap_err(),
3571            TestError {
3572                span: span(1..1),
3573                kind: ast::ErrorKind::RepetitionMissing,
3574            }
3575        );
3576    }
3577
3578    #[test]
3579    fn parse_alternate() {
3580        assert_eq!(
3581            parser(r"a|b").parse(),
3582            Ok(Ast::alternation(ast::Alternation {
3583                span: span(0..3),
3584                asts: vec![lit('a', 0), lit('b', 2)],
3585            }))
3586        );
3587        assert_eq!(
3588            parser(r"(a|b)").parse(),
3589            Ok(group(
3590                0..5,
3591                1,
3592                Ast::alternation(ast::Alternation {
3593                    span: span(1..4),
3594                    asts: vec![lit('a', 1), lit('b', 3)],
3595                })
3596            ))
3597        );
3598
3599        assert_eq!(
3600            parser(r"a|b|c").parse(),
3601            Ok(Ast::alternation(ast::Alternation {
3602                span: span(0..5),
3603                asts: vec![lit('a', 0), lit('b', 2), lit('c', 4)],
3604            }))
3605        );
3606        assert_eq!(
3607            parser(r"ax|by|cz").parse(),
3608            Ok(Ast::alternation(ast::Alternation {
3609                span: span(0..8),
3610                asts: vec![
3611                    concat(0..2, vec![lit('a', 0), lit('x', 1)]),
3612                    concat(3..5, vec![lit('b', 3), lit('y', 4)]),
3613                    concat(6..8, vec![lit('c', 6), lit('z', 7)]),
3614                ],
3615            }))
3616        );
3617        assert_eq!(
3618            parser(r"(ax|by|cz)").parse(),
3619            Ok(group(
3620                0..10,
3621                1,
3622                Ast::alternation(ast::Alternation {
3623                    span: span(1..9),
3624                    asts: vec![
3625                        concat(1..3, vec![lit('a', 1), lit('x', 2)]),
3626                        concat(4..6, vec![lit('b', 4), lit('y', 5)]),
3627                        concat(7..9, vec![lit('c', 7), lit('z', 8)]),
3628                    ],
3629                })
3630            ))
3631        );
3632        assert_eq!(
3633            parser(r"(ax|(by|(cz)))").parse(),
3634            Ok(group(
3635                0..14,
3636                1,
3637                alt(
3638                    1..13,
3639                    vec![
3640                        concat(1..3, vec![lit('a', 1), lit('x', 2)]),
3641                        group(
3642                            4..13,
3643                            2,
3644                            alt(
3645                                5..12,
3646                                vec![
3647                                    concat(
3648                                        5..7,
3649                                        vec![lit('b', 5), lit('y', 6)]
3650                                    ),
3651                                    group(
3652                                        8..12,
3653                                        3,
3654                                        concat(
3655                                            9..11,
3656                                            vec![lit('c', 9), lit('z', 10),]
3657                                        )
3658                                    ),
3659                                ]
3660                            )
3661                        ),
3662                    ]
3663                )
3664            ))
3665        );
3666
3667        assert_eq!(
3668            parser(r"|").parse(),
3669            Ok(alt(
3670                0..1,
3671                vec![Ast::empty(span(0..0)), Ast::empty(span(1..1)),]
3672            ))
3673        );
3674        assert_eq!(
3675            parser(r"||").parse(),
3676            Ok(alt(
3677                0..2,
3678                vec![
3679                    Ast::empty(span(0..0)),
3680                    Ast::empty(span(1..1)),
3681                    Ast::empty(span(2..2)),
3682                ]
3683            ))
3684        );
3685        assert_eq!(
3686            parser(r"a|").parse(),
3687            Ok(alt(0..2, vec![lit('a', 0), Ast::empty(span(2..2)),]))
3688        );
3689        assert_eq!(
3690            parser(r"|a").parse(),
3691            Ok(alt(0..2, vec![Ast::empty(span(0..0)), lit('a', 1),]))
3692        );
3693
3694        assert_eq!(
3695            parser(r"(|)").parse(),
3696            Ok(group(
3697                0..3,
3698                1,
3699                alt(
3700                    1..2,
3701                    vec![Ast::empty(span(1..1)), Ast::empty(span(2..2)),]
3702                )
3703            ))
3704        );
3705        assert_eq!(
3706            parser(r"(a|)").parse(),
3707            Ok(group(
3708                0..4,
3709                1,
3710                alt(1..3, vec![lit('a', 1), Ast::empty(span(3..3)),])
3711            ))
3712        );
3713        assert_eq!(
3714            parser(r"(|a)").parse(),
3715            Ok(group(
3716                0..4,
3717                1,
3718                alt(1..3, vec![Ast::empty(span(1..1)), lit('a', 2),])
3719            ))
3720        );
3721
3722        assert_eq!(
3723            parser(r"a|b)").parse().unwrap_err(),
3724            TestError {
3725                span: span(3..4),
3726                kind: ast::ErrorKind::GroupUnopened,
3727            }
3728        );
3729        assert_eq!(
3730            parser(r"(a|b").parse().unwrap_err(),
3731            TestError {
3732                span: span(0..1),
3733                kind: ast::ErrorKind::GroupUnclosed,
3734            }
3735        );
3736    }
3737
3738    #[test]
3739    fn parse_unsupported_lookaround() {
3740        assert_eq!(
3741            parser(r"(?=a)").parse().unwrap_err(),
3742            TestError {
3743                span: span(0..3),
3744                kind: ast::ErrorKind::UnsupportedLookAround,
3745            }
3746        );
3747        assert_eq!(
3748            parser(r"(?!a)").parse().unwrap_err(),
3749            TestError {
3750                span: span(0..3),
3751                kind: ast::ErrorKind::UnsupportedLookAround,
3752            }
3753        );
3754        assert_eq!(
3755            parser(r"(?<=a)").parse().unwrap_err(),
3756            TestError {
3757                span: span(0..4),
3758                kind: ast::ErrorKind::UnsupportedLookAround,
3759            }
3760        );
3761        assert_eq!(
3762            parser(r"(?<!a)").parse().unwrap_err(),
3763            TestError {
3764                span: span(0..4),
3765                kind: ast::ErrorKind::UnsupportedLookAround,
3766            }
3767        );
3768    }
3769
3770    #[test]
3771    fn parse_group() {
3772        assert_eq!(
3773            parser("(?i)").parse(),
3774            Ok(Ast::flags(ast::SetFlags {
3775                span: span(0..4),
3776                flags: ast::Flags {
3777                    span: span(2..3),
3778                    items: vec![ast::FlagsItem {
3779                        span: span(2..3),
3780                        kind: ast::FlagsItemKind::Flag(
3781                            ast::Flag::CaseInsensitive
3782                        ),
3783                    }],
3784                },
3785            }))
3786        );
3787        assert_eq!(
3788            parser("(?iU)").parse(),
3789            Ok(Ast::flags(ast::SetFlags {
3790                span: span(0..5),
3791                flags: ast::Flags {
3792                    span: span(2..4),
3793                    items: vec![
3794                        ast::FlagsItem {
3795                            span: span(2..3),
3796                            kind: ast::FlagsItemKind::Flag(
3797                                ast::Flag::CaseInsensitive
3798                            ),
3799                        },
3800                        ast::FlagsItem {
3801                            span: span(3..4),
3802                            kind: ast::FlagsItemKind::Flag(
3803                                ast::Flag::SwapGreed
3804                            ),
3805                        },
3806                    ],
3807                },
3808            }))
3809        );
3810        assert_eq!(
3811            parser("(?i-U)").parse(),
3812            Ok(Ast::flags(ast::SetFlags {
3813                span: span(0..6),
3814                flags: ast::Flags {
3815                    span: span(2..5),
3816                    items: vec![
3817                        ast::FlagsItem {
3818                            span: span(2..3),
3819                            kind: ast::FlagsItemKind::Flag(
3820                                ast::Flag::CaseInsensitive
3821                            ),
3822                        },
3823                        ast::FlagsItem {
3824                            span: span(3..4),
3825                            kind: ast::FlagsItemKind::Negation,
3826                        },
3827                        ast::FlagsItem {
3828                            span: span(4..5),
3829                            kind: ast::FlagsItemKind::Flag(
3830                                ast::Flag::SwapGreed
3831                            ),
3832                        },
3833                    ],
3834                },
3835            }))
3836        );
3837
3838        assert_eq!(
3839            parser("()").parse(),
3840            Ok(Ast::group(ast::Group {
3841                span: span(0..2),
3842                kind: ast::GroupKind::CaptureIndex(1),
3843                ast: Box::new(Ast::empty(span(1..1))),
3844            }))
3845        );
3846        assert_eq!(
3847            parser("(a)").parse(),
3848            Ok(Ast::group(ast::Group {
3849                span: span(0..3),
3850                kind: ast::GroupKind::CaptureIndex(1),
3851                ast: Box::new(lit('a', 1)),
3852            }))
3853        );
3854        assert_eq!(
3855            parser("(())").parse(),
3856            Ok(Ast::group(ast::Group {
3857                span: span(0..4),
3858                kind: ast::GroupKind::CaptureIndex(1),
3859                ast: Box::new(Ast::group(ast::Group {
3860                    span: span(1..3),
3861                    kind: ast::GroupKind::CaptureIndex(2),
3862                    ast: Box::new(Ast::empty(span(2..2))),
3863                })),
3864            }))
3865        );
3866
3867        assert_eq!(
3868            parser("(?:a)").parse(),
3869            Ok(Ast::group(ast::Group {
3870                span: span(0..5),
3871                kind: ast::GroupKind::NonCapturing(ast::Flags {
3872                    span: span(2..2),
3873                    items: vec![],
3874                }),
3875                ast: Box::new(lit('a', 3)),
3876            }))
3877        );
3878
3879        assert_eq!(
3880            parser("(?i:a)").parse(),
3881            Ok(Ast::group(ast::Group {
3882                span: span(0..6),
3883                kind: ast::GroupKind::NonCapturing(ast::Flags {
3884                    span: span(2..3),
3885                    items: vec![ast::FlagsItem {
3886                        span: span(2..3),
3887                        kind: ast::FlagsItemKind::Flag(
3888                            ast::Flag::CaseInsensitive
3889                        ),
3890                    },],
3891                }),
3892                ast: Box::new(lit('a', 4)),
3893            }))
3894        );
3895        assert_eq!(
3896            parser("(?i-U:a)").parse(),
3897            Ok(Ast::group(ast::Group {
3898                span: span(0..8),
3899                kind: ast::GroupKind::NonCapturing(ast::Flags {
3900                    span: span(2..5),
3901                    items: vec![
3902                        ast::FlagsItem {
3903                            span: span(2..3),
3904                            kind: ast::FlagsItemKind::Flag(
3905                                ast::Flag::CaseInsensitive
3906                            ),
3907                        },
3908                        ast::FlagsItem {
3909                            span: span(3..4),
3910                            kind: ast::FlagsItemKind::Negation,
3911                        },
3912                        ast::FlagsItem {
3913                            span: span(4..5),
3914                            kind: ast::FlagsItemKind::Flag(
3915                                ast::Flag::SwapGreed
3916                            ),
3917                        },
3918                    ],
3919                }),
3920                ast: Box::new(lit('a', 6)),
3921            }))
3922        );
3923
3924        assert_eq!(
3925            parser("(").parse().unwrap_err(),
3926            TestError {
3927                span: span(0..1),
3928                kind: ast::ErrorKind::GroupUnclosed,
3929            }
3930        );
3931        assert_eq!(
3932            parser("(?").parse().unwrap_err(),
3933            TestError {
3934                span: span(0..1),
3935                kind: ast::ErrorKind::GroupUnclosed,
3936            }
3937        );
3938        assert_eq!(
3939            parser("(?P").parse().unwrap_err(),
3940            TestError {
3941                span: span(2..3),
3942                kind: ast::ErrorKind::FlagUnrecognized,
3943            }
3944        );
3945        assert_eq!(
3946            parser("(?P<").parse().unwrap_err(),
3947            TestError {
3948                span: span(4..4),
3949                kind: ast::ErrorKind::GroupNameUnexpectedEof,
3950            }
3951        );
3952        assert_eq!(
3953            parser("(a").parse().unwrap_err(),
3954            TestError {
3955                span: span(0..1),
3956                kind: ast::ErrorKind::GroupUnclosed,
3957            }
3958        );
3959        assert_eq!(
3960            parser("(()").parse().unwrap_err(),
3961            TestError {
3962                span: span(0..1),
3963                kind: ast::ErrorKind::GroupUnclosed,
3964            }
3965        );
3966        assert_eq!(
3967            parser(")").parse().unwrap_err(),
3968            TestError {
3969                span: span(0..1),
3970                kind: ast::ErrorKind::GroupUnopened,
3971            }
3972        );
3973        assert_eq!(
3974            parser("a)").parse().unwrap_err(),
3975            TestError {
3976                span: span(1..2),
3977                kind: ast::ErrorKind::GroupUnopened,
3978            }
3979        );
3980    }
3981
3982    #[test]
3983    fn parse_capture_name() {
3984        assert_eq!(
3985            parser("(?<a>z)").parse(),
3986            Ok(Ast::group(ast::Group {
3987                span: span(0..7),
3988                kind: ast::GroupKind::CaptureName {
3989                    starts_with_p: false,
3990                    name: ast::CaptureName {
3991                        span: span(3..4),
3992                        name: s("a"),
3993                        index: 1,
3994                    }
3995                },
3996                ast: Box::new(lit('z', 5)),
3997            }))
3998        );
3999        assert_eq!(
4000            parser("(?P<a>z)").parse(),
4001            Ok(Ast::group(ast::Group {
4002                span: span(0..8),
4003                kind: ast::GroupKind::CaptureName {
4004                    starts_with_p: true,
4005                    name: ast::CaptureName {
4006                        span: span(4..5),
4007                        name: s("a"),
4008                        index: 1,
4009                    }
4010                },
4011                ast: Box::new(lit('z', 6)),
4012            }))
4013        );
4014        assert_eq!(
4015            parser("(?P<abc>z)").parse(),
4016            Ok(Ast::group(ast::Group {
4017                span: span(0..10),
4018                kind: ast::GroupKind::CaptureName {
4019                    starts_with_p: true,
4020                    name: ast::CaptureName {
4021                        span: span(4..7),
4022                        name: s("abc"),
4023                        index: 1,
4024                    }
4025                },
4026                ast: Box::new(lit('z', 8)),
4027            }))
4028        );
4029
4030        assert_eq!(
4031            parser("(?P<a_1>z)").parse(),
4032            Ok(Ast::group(ast::Group {
4033                span: span(0..10),
4034                kind: ast::GroupKind::CaptureName {
4035                    starts_with_p: true,
4036                    name: ast::CaptureName {
4037                        span: span(4..7),
4038                        name: s("a_1"),
4039                        index: 1,
4040                    }
4041                },
4042                ast: Box::new(lit('z', 8)),
4043            }))
4044        );
4045
4046        assert_eq!(
4047            parser("(?P<a.1>z)").parse(),
4048            Ok(Ast::group(ast::Group {
4049                span: span(0..10),
4050                kind: ast::GroupKind::CaptureName {
4051                    starts_with_p: true,
4052                    name: ast::CaptureName {
4053                        span: span(4..7),
4054                        name: s("a.1"),
4055                        index: 1,
4056                    }
4057                },
4058                ast: Box::new(lit('z', 8)),
4059            }))
4060        );
4061
4062        assert_eq!(
4063            parser("(?P<a[1]>z)").parse(),
4064            Ok(Ast::group(ast::Group {
4065                span: span(0..11),
4066                kind: ast::GroupKind::CaptureName {
4067                    starts_with_p: true,
4068                    name: ast::CaptureName {
4069                        span: span(4..8),
4070                        name: s("a[1]"),
4071                        index: 1,
4072                    }
4073                },
4074                ast: Box::new(lit('z', 9)),
4075            }))
4076        );
4077
4078        assert_eq!(
4079            parser("(?P<a¾>)").parse(),
4080            Ok(Ast::group(ast::Group {
4081                span: Span::new(
4082                    Position::new(0, 1, 1),
4083                    Position::new(9, 1, 9),
4084                ),
4085                kind: ast::GroupKind::CaptureName {
4086                    starts_with_p: true,
4087                    name: ast::CaptureName {
4088                        span: Span::new(
4089                            Position::new(4, 1, 5),
4090                            Position::new(7, 1, 7),
4091                        ),
4092                        name: s("a¾"),
4093                        index: 1,
4094                    }
4095                },
4096                ast: Box::new(Ast::empty(Span::new(
4097                    Position::new(8, 1, 8),
4098                    Position::new(8, 1, 8),
4099                ))),
4100            }))
4101        );
4102        assert_eq!(
4103            parser("(?P<名字>)").parse(),
4104            Ok(Ast::group(ast::Group {
4105                span: Span::new(
4106                    Position::new(0, 1, 1),
4107                    Position::new(12, 1, 9),
4108                ),
4109                kind: ast::GroupKind::CaptureName {
4110                    starts_with_p: true,
4111                    name: ast::CaptureName {
4112                        span: Span::new(
4113                            Position::new(4, 1, 5),
4114                            Position::new(10, 1, 7),
4115                        ),
4116                        name: s("名字"),
4117                        index: 1,
4118                    }
4119                },
4120                ast: Box::new(Ast::empty(Span::new(
4121                    Position::new(11, 1, 8),
4122                    Position::new(11, 1, 8),
4123                ))),
4124            }))
4125        );
4126
4127        assert_eq!(
4128            parser("(?P<").parse().unwrap_err(),
4129            TestError {
4130                span: span(4..4),
4131                kind: ast::ErrorKind::GroupNameUnexpectedEof,
4132            }
4133        );
4134        assert_eq!(
4135            parser("(?P<>z)").parse().unwrap_err(),
4136            TestError {
4137                span: span(4..4),
4138                kind: ast::ErrorKind::GroupNameEmpty,
4139            }
4140        );
4141        assert_eq!(
4142            parser("(?P<a").parse().unwrap_err(),
4143            TestError {
4144                span: span(5..5),
4145                kind: ast::ErrorKind::GroupNameUnexpectedEof,
4146            }
4147        );
4148        assert_eq!(
4149            parser("(?P<ab").parse().unwrap_err(),
4150            TestError {
4151                span: span(6..6),
4152                kind: ast::ErrorKind::GroupNameUnexpectedEof,
4153            }
4154        );
4155        assert_eq!(
4156            parser("(?P<0a").parse().unwrap_err(),
4157            TestError {
4158                span: span(4..5),
4159                kind: ast::ErrorKind::GroupNameInvalid,
4160            }
4161        );
4162        assert_eq!(
4163            parser("(?P<~").parse().unwrap_err(),
4164            TestError {
4165                span: span(4..5),
4166                kind: ast::ErrorKind::GroupNameInvalid,
4167            }
4168        );
4169        assert_eq!(
4170            parser("(?P<abc~").parse().unwrap_err(),
4171            TestError {
4172                span: span(7..8),
4173                kind: ast::ErrorKind::GroupNameInvalid,
4174            }
4175        );
4176        assert_eq!(
4177            parser("(?P<a>y)(?P<a>z)").parse().unwrap_err(),
4178            TestError {
4179                span: span(12..13),
4180                kind: ast::ErrorKind::GroupNameDuplicate {
4181                    original: span(4..5),
4182                },
4183            }
4184        );
4185        assert_eq!(
4186            parser("(?P<5>)").parse().unwrap_err(),
4187            TestError {
4188                span: span(4..5),
4189                kind: ast::ErrorKind::GroupNameInvalid,
4190            }
4191        );
4192        assert_eq!(
4193            parser("(?P<5a>)").parse().unwrap_err(),
4194            TestError {
4195                span: span(4..5),
4196                kind: ast::ErrorKind::GroupNameInvalid,
4197            }
4198        );
4199        assert_eq!(
4200            parser("(?P<¾>)").parse().unwrap_err(),
4201            TestError {
4202                span: Span::new(
4203                    Position::new(4, 1, 5),
4204                    Position::new(6, 1, 6),
4205                ),
4206                kind: ast::ErrorKind::GroupNameInvalid,
4207            }
4208        );
4209        assert_eq!(
4210            parser("(?P<¾a>)").parse().unwrap_err(),
4211            TestError {
4212                span: Span::new(
4213                    Position::new(4, 1, 5),
4214                    Position::new(6, 1, 6),
4215                ),
4216                kind: ast::ErrorKind::GroupNameInvalid,
4217            }
4218        );
4219        assert_eq!(
4220            parser("(?P<☃>)").parse().unwrap_err(),
4221            TestError {
4222                span: Span::new(
4223                    Position::new(4, 1, 5),
4224                    Position::new(7, 1, 6),
4225                ),
4226                kind: ast::ErrorKind::GroupNameInvalid,
4227            }
4228        );
4229        assert_eq!(
4230            parser("(?P<a☃>)").parse().unwrap_err(),
4231            TestError {
4232                span: Span::new(
4233                    Position::new(5, 1, 6),
4234                    Position::new(8, 1, 7),
4235                ),
4236                kind: ast::ErrorKind::GroupNameInvalid,
4237            }
4238        );
4239    }
4240
4241    #[test]
4242    fn parse_flags() {
4243        assert_eq!(
4244            parser("i:").parse_flags(),
4245            Ok(ast::Flags {
4246                span: span(0..1),
4247                items: vec![ast::FlagsItem {
4248                    span: span(0..1),
4249                    kind: ast::FlagsItemKind::Flag(ast::Flag::CaseInsensitive),
4250                }],
4251            })
4252        );
4253        assert_eq!(
4254            parser("i)").parse_flags(),
4255            Ok(ast::Flags {
4256                span: span(0..1),
4257                items: vec![ast::FlagsItem {
4258                    span: span(0..1),
4259                    kind: ast::FlagsItemKind::Flag(ast::Flag::CaseInsensitive),
4260                }],
4261            })
4262        );
4263
4264        assert_eq!(
4265            parser("isU:").parse_flags(),
4266            Ok(ast::Flags {
4267                span: span(0..3),
4268                items: vec![
4269                    ast::FlagsItem {
4270                        span: span(0..1),
4271                        kind: ast::FlagsItemKind::Flag(
4272                            ast::Flag::CaseInsensitive
4273                        ),
4274                    },
4275                    ast::FlagsItem {
4276                        span: span(1..2),
4277                        kind: ast::FlagsItemKind::Flag(
4278                            ast::Flag::DotMatchesNewLine
4279                        ),
4280                    },
4281                    ast::FlagsItem {
4282                        span: span(2..3),
4283                        kind: ast::FlagsItemKind::Flag(ast::Flag::SwapGreed),
4284                    },
4285                ],
4286            })
4287        );
4288
4289        assert_eq!(
4290            parser("-isU:").parse_flags(),
4291            Ok(ast::Flags {
4292                span: span(0..4),
4293                items: vec![
4294                    ast::FlagsItem {
4295                        span: span(0..1),
4296                        kind: ast::FlagsItemKind::Negation,
4297                    },
4298                    ast::FlagsItem {
4299                        span: span(1..2),
4300                        kind: ast::FlagsItemKind::Flag(
4301                            ast::Flag::CaseInsensitive
4302                        ),
4303                    },
4304                    ast::FlagsItem {
4305                        span: span(2..3),
4306                        kind: ast::FlagsItemKind::Flag(
4307                            ast::Flag::DotMatchesNewLine
4308                        ),
4309                    },
4310                    ast::FlagsItem {
4311                        span: span(3..4),
4312                        kind: ast::FlagsItemKind::Flag(ast::Flag::SwapGreed),
4313                    },
4314                ],
4315            })
4316        );
4317        assert_eq!(
4318            parser("i-sU:").parse_flags(),
4319            Ok(ast::Flags {
4320                span: span(0..4),
4321                items: vec![
4322                    ast::FlagsItem {
4323                        span: span(0..1),
4324                        kind: ast::FlagsItemKind::Flag(
4325                            ast::Flag::CaseInsensitive
4326                        ),
4327                    },
4328                    ast::FlagsItem {
4329                        span: span(1..2),
4330                        kind: ast::FlagsItemKind::Negation,
4331                    },
4332                    ast::FlagsItem {
4333                        span: span(2..3),
4334                        kind: ast::FlagsItemKind::Flag(
4335                            ast::Flag::DotMatchesNewLine
4336                        ),
4337                    },
4338                    ast::FlagsItem {
4339                        span: span(3..4),
4340                        kind: ast::FlagsItemKind::Flag(ast::Flag::SwapGreed),
4341                    },
4342                ],
4343            })
4344        );
4345        assert_eq!(
4346            parser("i-sR:").parse_flags(),
4347            Ok(ast::Flags {
4348                span: span(0..4),
4349                items: vec![
4350                    ast::FlagsItem {
4351                        span: span(0..1),
4352                        kind: ast::FlagsItemKind::Flag(
4353                            ast::Flag::CaseInsensitive
4354                        ),
4355                    },
4356                    ast::FlagsItem {
4357                        span: span(1..2),
4358                        kind: ast::FlagsItemKind::Negation,
4359                    },
4360                    ast::FlagsItem {
4361                        span: span(2..3),
4362                        kind: ast::FlagsItemKind::Flag(
4363                            ast::Flag::DotMatchesNewLine
4364                        ),
4365                    },
4366                    ast::FlagsItem {
4367                        span: span(3..4),
4368                        kind: ast::FlagsItemKind::Flag(ast::Flag::CRLF),
4369                    },
4370                ],
4371            })
4372        );
4373
4374        assert_eq!(
4375            parser("isU").parse_flags().unwrap_err(),
4376            TestError {
4377                span: span(3..3),
4378                kind: ast::ErrorKind::FlagUnexpectedEof,
4379            }
4380        );
4381        assert_eq!(
4382            parser("isUa:").parse_flags().unwrap_err(),
4383            TestError {
4384                span: span(3..4),
4385                kind: ast::ErrorKind::FlagUnrecognized,
4386            }
4387        );
4388        assert_eq!(
4389            parser("isUi:").parse_flags().unwrap_err(),
4390            TestError {
4391                span: span(3..4),
4392                kind: ast::ErrorKind::FlagDuplicate { original: span(0..1) },
4393            }
4394        );
4395        assert_eq!(
4396            parser("i-sU-i:").parse_flags().unwrap_err(),
4397            TestError {
4398                span: span(4..5),
4399                kind: ast::ErrorKind::FlagRepeatedNegation {
4400                    original: span(1..2),
4401                },
4402            }
4403        );
4404        assert_eq!(
4405            parser("-)").parse_flags().unwrap_err(),
4406            TestError {
4407                span: span(0..1),
4408                kind: ast::ErrorKind::FlagDanglingNegation,
4409            }
4410        );
4411        assert_eq!(
4412            parser("i-)").parse_flags().unwrap_err(),
4413            TestError {
4414                span: span(1..2),
4415                kind: ast::ErrorKind::FlagDanglingNegation,
4416            }
4417        );
4418        assert_eq!(
4419            parser("iU-)").parse_flags().unwrap_err(),
4420            TestError {
4421                span: span(2..3),
4422                kind: ast::ErrorKind::FlagDanglingNegation,
4423            }
4424        );
4425    }
4426
4427    #[test]
4428    fn parse_flag() {
4429        assert_eq!(parser("i").parse_flag(), Ok(ast::Flag::CaseInsensitive));
4430        assert_eq!(parser("m").parse_flag(), Ok(ast::Flag::MultiLine));
4431        assert_eq!(parser("s").parse_flag(), Ok(ast::Flag::DotMatchesNewLine));
4432        assert_eq!(parser("U").parse_flag(), Ok(ast::Flag::SwapGreed));
4433        assert_eq!(parser("u").parse_flag(), Ok(ast::Flag::Unicode));
4434        assert_eq!(parser("R").parse_flag(), Ok(ast::Flag::CRLF));
4435        assert_eq!(parser("x").parse_flag(), Ok(ast::Flag::IgnoreWhitespace));
4436
4437        assert_eq!(
4438            parser("a").parse_flag().unwrap_err(),
4439            TestError {
4440                span: span(0..1),
4441                kind: ast::ErrorKind::FlagUnrecognized,
4442            }
4443        );
4444        assert_eq!(
4445            parser("☃").parse_flag().unwrap_err(),
4446            TestError {
4447                span: span_range("☃", 0..3),
4448                kind: ast::ErrorKind::FlagUnrecognized,
4449            }
4450        );
4451    }
4452
4453    #[test]
4454    fn parse_primitive_non_escape() {
4455        assert_eq!(
4456            parser(r".").parse_primitive(),
4457            Ok(Primitive::Dot(span(0..1)))
4458        );
4459        assert_eq!(
4460            parser(r"^").parse_primitive(),
4461            Ok(Primitive::Assertion(ast::Assertion {
4462                span: span(0..1),
4463                kind: ast::AssertionKind::StartLine,
4464            }))
4465        );
4466        assert_eq!(
4467            parser(r"$").parse_primitive(),
4468            Ok(Primitive::Assertion(ast::Assertion {
4469                span: span(0..1),
4470                kind: ast::AssertionKind::EndLine,
4471            }))
4472        );
4473
4474        assert_eq!(
4475            parser(r"a").parse_primitive(),
4476            Ok(Primitive::Literal(ast::Literal {
4477                span: span(0..1),
4478                kind: ast::LiteralKind::Verbatim,
4479                c: 'a',
4480            }))
4481        );
4482        assert_eq!(
4483            parser(r"|").parse_primitive(),
4484            Ok(Primitive::Literal(ast::Literal {
4485                span: span(0..1),
4486                kind: ast::LiteralKind::Verbatim,
4487                c: '|',
4488            }))
4489        );
4490        assert_eq!(
4491            parser(r"☃").parse_primitive(),
4492            Ok(Primitive::Literal(ast::Literal {
4493                span: span_range("☃", 0..3),
4494                kind: ast::LiteralKind::Verbatim,
4495                c: '☃',
4496            }))
4497        );
4498    }
4499
4500    #[test]
4501    fn parse_escape() {
4502        assert_eq!(
4503            parser(r"\|").parse_primitive(),
4504            Ok(Primitive::Literal(ast::Literal {
4505                span: span(0..2),
4506                kind: ast::LiteralKind::Meta,
4507                c: '|',
4508            }))
4509        );
4510        let specials = &[
4511            (r"\a", '\x07', ast::SpecialLiteralKind::Bell),
4512            (r"\f", '\x0C', ast::SpecialLiteralKind::FormFeed),
4513            (r"\t", '\t', ast::SpecialLiteralKind::Tab),
4514            (r"\n", '\n', ast::SpecialLiteralKind::LineFeed),
4515            (r"\r", '\r', ast::SpecialLiteralKind::CarriageReturn),
4516            (r"\v", '\x0B', ast::SpecialLiteralKind::VerticalTab),
4517        ];
4518        for &(pat, c, ref kind) in specials {
4519            assert_eq!(
4520                parser(pat).parse_primitive(),
4521                Ok(Primitive::Literal(ast::Literal {
4522                    span: span(0..2),
4523                    kind: ast::LiteralKind::Special(kind.clone()),
4524                    c,
4525                }))
4526            );
4527        }
4528        assert_eq!(
4529            parser(r"\A").parse_primitive(),
4530            Ok(Primitive::Assertion(ast::Assertion {
4531                span: span(0..2),
4532                kind: ast::AssertionKind::StartText,
4533            }))
4534        );
4535        assert_eq!(
4536            parser(r"\z").parse_primitive(),
4537            Ok(Primitive::Assertion(ast::Assertion {
4538                span: span(0..2),
4539                kind: ast::AssertionKind::EndText,
4540            }))
4541        );
4542        assert_eq!(
4543            parser(r"\b").parse_primitive(),
4544            Ok(Primitive::Assertion(ast::Assertion {
4545                span: span(0..2),
4546                kind: ast::AssertionKind::WordBoundary,
4547            }))
4548        );
4549        assert_eq!(
4550            parser(r"\b{start}").parse_primitive(),
4551            Ok(Primitive::Assertion(ast::Assertion {
4552                span: span(0..9),
4553                kind: ast::AssertionKind::WordBoundaryStart,
4554            }))
4555        );
4556        assert_eq!(
4557            parser(r"\b{end}").parse_primitive(),
4558            Ok(Primitive::Assertion(ast::Assertion {
4559                span: span(0..7),
4560                kind: ast::AssertionKind::WordBoundaryEnd,
4561            }))
4562        );
4563        assert_eq!(
4564            parser(r"\b{start-half}").parse_primitive(),
4565            Ok(Primitive::Assertion(ast::Assertion {
4566                span: span(0..14),
4567                kind: ast::AssertionKind::WordBoundaryStartHalf,
4568            }))
4569        );
4570        assert_eq!(
4571            parser(r"\b{end-half}").parse_primitive(),
4572            Ok(Primitive::Assertion(ast::Assertion {
4573                span: span(0..12),
4574                kind: ast::AssertionKind::WordBoundaryEndHalf,
4575            }))
4576        );
4577        assert_eq!(
4578            parser(r"\<").parse_primitive(),
4579            Ok(Primitive::Assertion(ast::Assertion {
4580                span: span(0..2),
4581                kind: ast::AssertionKind::WordBoundaryStartAngle,
4582            }))
4583        );
4584        assert_eq!(
4585            parser(r"\>").parse_primitive(),
4586            Ok(Primitive::Assertion(ast::Assertion {
4587                span: span(0..2),
4588                kind: ast::AssertionKind::WordBoundaryEndAngle,
4589            }))
4590        );
4591        assert_eq!(
4592            parser(r"\B").parse_primitive(),
4593            Ok(Primitive::Assertion(ast::Assertion {
4594                span: span(0..2),
4595                kind: ast::AssertionKind::NotWordBoundary,
4596            }))
4597        );
4598
4599        // We also support superfluous escapes in most cases now too.
4600        for c in ['!', '@', '%', '"', '\'', '/', ' '] {
4601            let pat = format!(r"\{}", c);
4602            assert_eq!(
4603                parser(&pat).parse_primitive(),
4604                Ok(Primitive::Literal(ast::Literal {
4605                    span: span(0..2),
4606                    kind: ast::LiteralKind::Superfluous,
4607                    c,
4608                }))
4609            );
4610        }
4611
4612        // Some superfluous escapes, namely [0-9A-Za-z], are still banned. This
4613        // gives flexibility for future evolution.
4614        assert_eq!(
4615            parser(r"\e").parse_escape().unwrap_err(),
4616            TestError {
4617                span: span(0..2),
4618                kind: ast::ErrorKind::EscapeUnrecognized,
4619            }
4620        );
4621        assert_eq!(
4622            parser(r"\y").parse_escape().unwrap_err(),
4623            TestError {
4624                span: span(0..2),
4625                kind: ast::ErrorKind::EscapeUnrecognized,
4626            }
4627        );
4628
4629        // Starting a special word boundary without any non-whitespace chars
4630        // after the brace makes it ambiguous whether the user meant to write
4631        // a counted repetition (probably not?) or an actual special word
4632        // boundary assertion.
4633        assert_eq!(
4634            parser(r"\b{").parse_escape().unwrap_err(),
4635            TestError {
4636                span: span(0..3),
4637                kind: ast::ErrorKind::SpecialWordOrRepetitionUnexpectedEof,
4638            }
4639        );
4640        assert_eq!(
4641            parser_ignore_whitespace(r"\b{ ").parse_escape().unwrap_err(),
4642            TestError {
4643                span: span(0..4),
4644                kind: ast::ErrorKind::SpecialWordOrRepetitionUnexpectedEof,
4645            }
4646        );
4647        // When 'x' is not enabled, the space is seen as a non-[-A-Za-z] char,
4648        // and thus causes the parser to treat it as a counted repetition.
4649        assert_eq!(
4650            parser(r"\b{ ").parse().unwrap_err(),
4651            TestError {
4652                span: span(2..4),
4653                kind: ast::ErrorKind::RepetitionCountUnclosed,
4654            }
4655        );
4656        // In this case, we got some valid chars that makes it look like the
4657        // user is writing one of the special word boundary assertions, but
4658        // we forget to close the brace.
4659        assert_eq!(
4660            parser(r"\b{foo").parse_escape().unwrap_err(),
4661            TestError {
4662                span: span(2..6),
4663                kind: ast::ErrorKind::SpecialWordBoundaryUnclosed,
4664            }
4665        );
4666        // We get the same error as above, except it is provoked by seeing a
4667        // char that we know is invalid before seeing a closing brace.
4668        assert_eq!(
4669            parser(r"\b{foo!}").parse_escape().unwrap_err(),
4670            TestError {
4671                span: span(2..6),
4672                kind: ast::ErrorKind::SpecialWordBoundaryUnclosed,
4673            }
4674        );
4675        // And this one occurs when, syntactically, everything looks okay, but
4676        // we don't use a valid spelling of a word boundary assertion.
4677        assert_eq!(
4678            parser(r"\b{foo}").parse_escape().unwrap_err(),
4679            TestError {
4680                span: span(3..6),
4681                kind: ast::ErrorKind::SpecialWordBoundaryUnrecognized,
4682            }
4683        );
4684
4685        // An unfinished escape is illegal.
4686        assert_eq!(
4687            parser(r"\").parse_escape().unwrap_err(),
4688            TestError {
4689                span: span(0..1),
4690                kind: ast::ErrorKind::EscapeUnexpectedEof,
4691            }
4692        );
4693    }
4694
4695    #[test]
4696    fn parse_unsupported_backreference() {
4697        assert_eq!(
4698            parser(r"\0").parse_escape().unwrap_err(),
4699            TestError {
4700                span: span(0..2),
4701                kind: ast::ErrorKind::UnsupportedBackreference,
4702            }
4703        );
4704        assert_eq!(
4705            parser(r"\9").parse_escape().unwrap_err(),
4706            TestError {
4707                span: span(0..2),
4708                kind: ast::ErrorKind::UnsupportedBackreference,
4709            }
4710        );
4711    }
4712
4713    #[test]
4714    fn parse_octal() {
4715        for i in 0..511 {
4716            let pat = format!(r"\{:o}", i);
4717            assert_eq!(
4718                parser_octal(&pat).parse_escape(),
4719                Ok(Primitive::Literal(ast::Literal {
4720                    span: span(0..pat.len()),
4721                    kind: ast::LiteralKind::Octal,
4722                    c: char::from_u32(i).unwrap(),
4723                }))
4724            );
4725        }
4726        assert_eq!(
4727            parser_octal(r"\778").parse_escape(),
4728            Ok(Primitive::Literal(ast::Literal {
4729                span: span(0..3),
4730                kind: ast::LiteralKind::Octal,
4731                c: '?',
4732            }))
4733        );
4734        assert_eq!(
4735            parser_octal(r"\7777").parse_escape(),
4736            Ok(Primitive::Literal(ast::Literal {
4737                span: span(0..4),
4738                kind: ast::LiteralKind::Octal,
4739                c: '\u{01FF}',
4740            }))
4741        );
4742        assert_eq!(
4743            parser_octal(r"\778").parse(),
4744            Ok(Ast::concat(ast::Concat {
4745                span: span(0..4),
4746                asts: vec![
4747                    Ast::literal(ast::Literal {
4748                        span: span(0..3),
4749                        kind: ast::LiteralKind::Octal,
4750                        c: '?',
4751                    }),
4752                    Ast::literal(ast::Literal {
4753                        span: span(3..4),
4754                        kind: ast::LiteralKind::Verbatim,
4755                        c: '8',
4756                    }),
4757                ],
4758            }))
4759        );
4760        assert_eq!(
4761            parser_octal(r"\7777").parse(),
4762            Ok(Ast::concat(ast::Concat {
4763                span: span(0..5),
4764                asts: vec![
4765                    Ast::literal(ast::Literal {
4766                        span: span(0..4),
4767                        kind: ast::LiteralKind::Octal,
4768                        c: '\u{01FF}',
4769                    }),
4770                    Ast::literal(ast::Literal {
4771                        span: span(4..5),
4772                        kind: ast::LiteralKind::Verbatim,
4773                        c: '7',
4774                    }),
4775                ],
4776            }))
4777        );
4778
4779        assert_eq!(
4780            parser_octal(r"\8").parse_escape().unwrap_err(),
4781            TestError {
4782                span: span(0..2),
4783                kind: ast::ErrorKind::EscapeUnrecognized,
4784            }
4785        );
4786    }
4787
4788    #[test]
4789    fn parse_hex_two() {
4790        for i in 0..256 {
4791            let pat = format!(r"\x{:02x}", i);
4792            assert_eq!(
4793                parser(&pat).parse_escape(),
4794                Ok(Primitive::Literal(ast::Literal {
4795                    span: span(0..pat.len()),
4796                    kind: ast::LiteralKind::HexFixed(ast::HexLiteralKind::X),
4797                    c: char::from_u32(i).unwrap(),
4798                }))
4799            );
4800        }
4801
4802        assert_eq!(
4803            parser(r"\xF").parse_escape().unwrap_err(),
4804            TestError {
4805                span: span(3..3),
4806                kind: ast::ErrorKind::EscapeUnexpectedEof,
4807            }
4808        );
4809        assert_eq!(
4810            parser(r"\xG").parse_escape().unwrap_err(),
4811            TestError {
4812                span: span(2..3),
4813                kind: ast::ErrorKind::EscapeHexInvalidDigit,
4814            }
4815        );
4816        assert_eq!(
4817            parser(r"\xFG").parse_escape().unwrap_err(),
4818            TestError {
4819                span: span(3..4),
4820                kind: ast::ErrorKind::EscapeHexInvalidDigit,
4821            }
4822        );
4823    }
4824
4825    #[test]
4826    fn parse_hex_four() {
4827        for i in 0..65536 {
4828            let c = match char::from_u32(i) {
4829                None => continue,
4830                Some(c) => c,
4831            };
4832            let pat = format!(r"\u{:04x}", i);
4833            assert_eq!(
4834                parser(&pat).parse_escape(),
4835                Ok(Primitive::Literal(ast::Literal {
4836                    span: span(0..pat.len()),
4837                    kind: ast::LiteralKind::HexFixed(
4838                        ast::HexLiteralKind::UnicodeShort
4839                    ),
4840                    c,
4841                }))
4842            );
4843        }
4844
4845        assert_eq!(
4846            parser(r"\uF").parse_escape().unwrap_err(),
4847            TestError {
4848                span: span(3..3),
4849                kind: ast::ErrorKind::EscapeUnexpectedEof,
4850            }
4851        );
4852        assert_eq!(
4853            parser(r"\uG").parse_escape().unwrap_err(),
4854            TestError {
4855                span: span(2..3),
4856                kind: ast::ErrorKind::EscapeHexInvalidDigit,
4857            }
4858        );
4859        assert_eq!(
4860            parser(r"\uFG").parse_escape().unwrap_err(),
4861            TestError {
4862                span: span(3..4),
4863                kind: ast::ErrorKind::EscapeHexInvalidDigit,
4864            }
4865        );
4866        assert_eq!(
4867            parser(r"\uFFG").parse_escape().unwrap_err(),
4868            TestError {
4869                span: span(4..5),
4870                kind: ast::ErrorKind::EscapeHexInvalidDigit,
4871            }
4872        );
4873        assert_eq!(
4874            parser(r"\uFFFG").parse_escape().unwrap_err(),
4875            TestError {
4876                span: span(5..6),
4877                kind: ast::ErrorKind::EscapeHexInvalidDigit,
4878            }
4879        );
4880        assert_eq!(
4881            parser(r"\uD800").parse_escape().unwrap_err(),
4882            TestError {
4883                span: span(2..6),
4884                kind: ast::ErrorKind::EscapeHexInvalid,
4885            }
4886        );
4887    }
4888
4889    #[test]
4890    fn parse_hex_eight() {
4891        for i in 0..65536 {
4892            let c = match char::from_u32(i) {
4893                None => continue,
4894                Some(c) => c,
4895            };
4896            let pat = format!(r"\U{:08x}", i);
4897            assert_eq!(
4898                parser(&pat).parse_escape(),
4899                Ok(Primitive::Literal(ast::Literal {
4900                    span: span(0..pat.len()),
4901                    kind: ast::LiteralKind::HexFixed(
4902                        ast::HexLiteralKind::UnicodeLong
4903                    ),
4904                    c,
4905                }))
4906            );
4907        }
4908
4909        assert_eq!(
4910            parser(r"\UF").parse_escape().unwrap_err(),
4911            TestError {
4912                span: span(3..3),
4913                kind: ast::ErrorKind::EscapeUnexpectedEof,
4914            }
4915        );
4916        assert_eq!(
4917            parser(r"\UG").parse_escape().unwrap_err(),
4918            TestError {
4919                span: span(2..3),
4920                kind: ast::ErrorKind::EscapeHexInvalidDigit,
4921            }
4922        );
4923        assert_eq!(
4924            parser(r"\UFG").parse_escape().unwrap_err(),
4925            TestError {
4926                span: span(3..4),
4927                kind: ast::ErrorKind::EscapeHexInvalidDigit,
4928            }
4929        );
4930        assert_eq!(
4931            parser(r"\UFFG").parse_escape().unwrap_err(),
4932            TestError {
4933                span: span(4..5),
4934                kind: ast::ErrorKind::EscapeHexInvalidDigit,
4935            }
4936        );
4937        assert_eq!(
4938            parser(r"\UFFFG").parse_escape().unwrap_err(),
4939            TestError {
4940                span: span(5..6),
4941                kind: ast::ErrorKind::EscapeHexInvalidDigit,
4942            }
4943        );
4944        assert_eq!(
4945            parser(r"\UFFFFG").parse_escape().unwrap_err(),
4946            TestError {
4947                span: span(6..7),
4948                kind: ast::ErrorKind::EscapeHexInvalidDigit,
4949            }
4950        );
4951        assert_eq!(
4952            parser(r"\UFFFFFG").parse_escape().unwrap_err(),
4953            TestError {
4954                span: span(7..8),
4955                kind: ast::ErrorKind::EscapeHexInvalidDigit,
4956            }
4957        );
4958        assert_eq!(
4959            parser(r"\UFFFFFFG").parse_escape().unwrap_err(),
4960            TestError {
4961                span: span(8..9),
4962                kind: ast::ErrorKind::EscapeHexInvalidDigit,
4963            }
4964        );
4965        assert_eq!(
4966            parser(r"\UFFFFFFFG").parse_escape().unwrap_err(),
4967            TestError {
4968                span: span(9..10),
4969                kind: ast::ErrorKind::EscapeHexInvalidDigit,
4970            }
4971        );
4972    }
4973
4974    #[test]
4975    fn parse_hex_brace() {
4976        assert_eq!(
4977            parser(r"\u{26c4}").parse_escape(),
4978            Ok(Primitive::Literal(ast::Literal {
4979                span: span(0..8),
4980                kind: ast::LiteralKind::HexBrace(
4981                    ast::HexLiteralKind::UnicodeShort
4982                ),
4983                c: '⛄',
4984            }))
4985        );
4986        assert_eq!(
4987            parser(r"\U{26c4}").parse_escape(),
4988            Ok(Primitive::Literal(ast::Literal {
4989                span: span(0..8),
4990                kind: ast::LiteralKind::HexBrace(
4991                    ast::HexLiteralKind::UnicodeLong
4992                ),
4993                c: '⛄',
4994            }))
4995        );
4996        assert_eq!(
4997            parser(r"\x{26c4}").parse_escape(),
4998            Ok(Primitive::Literal(ast::Literal {
4999                span: span(0..8),
5000                kind: ast::LiteralKind::HexBrace(ast::HexLiteralKind::X),
5001                c: '⛄',
5002            }))
5003        );
5004        assert_eq!(
5005            parser(r"\x{26C4}").parse_escape(),
5006            Ok(Primitive::Literal(ast::Literal {
5007                span: span(0..8),
5008                kind: ast::LiteralKind::HexBrace(ast::HexLiteralKind::X),
5009                c: '⛄',
5010            }))
5011        );
5012        assert_eq!(
5013            parser(r"\x{10fFfF}").parse_escape(),
5014            Ok(Primitive::Literal(ast::Literal {
5015                span: span(0..10),
5016                kind: ast::LiteralKind::HexBrace(ast::HexLiteralKind::X),
5017                c: '\u{10FFFF}',
5018            }))
5019        );
5020
5021        assert_eq!(
5022            parser(r"\x").parse_escape().unwrap_err(),
5023            TestError {
5024                span: span(2..2),
5025                kind: ast::ErrorKind::EscapeUnexpectedEof,
5026            }
5027        );
5028        assert_eq!(
5029            parser(r"\x{").parse_escape().unwrap_err(),
5030            TestError {
5031                span: span(2..3),
5032                kind: ast::ErrorKind::EscapeUnexpectedEof,
5033            }
5034        );
5035        assert_eq!(
5036            parser(r"\x{FF").parse_escape().unwrap_err(),
5037            TestError {
5038                span: span(2..5),
5039                kind: ast::ErrorKind::EscapeUnexpectedEof,
5040            }
5041        );
5042        assert_eq!(
5043            parser(r"\x{}").parse_escape().unwrap_err(),
5044            TestError {
5045                span: span(2..4),
5046                kind: ast::ErrorKind::EscapeHexEmpty,
5047            }
5048        );
5049        assert_eq!(
5050            parser(r"\x{FGF}").parse_escape().unwrap_err(),
5051            TestError {
5052                span: span(4..5),
5053                kind: ast::ErrorKind::EscapeHexInvalidDigit,
5054            }
5055        );
5056        assert_eq!(
5057            parser(r"\x{FFFFFF}").parse_escape().unwrap_err(),
5058            TestError {
5059                span: span(3..9),
5060                kind: ast::ErrorKind::EscapeHexInvalid,
5061            }
5062        );
5063        assert_eq!(
5064            parser(r"\x{D800}").parse_escape().unwrap_err(),
5065            TestError {
5066                span: span(3..7),
5067                kind: ast::ErrorKind::EscapeHexInvalid,
5068            }
5069        );
5070        assert_eq!(
5071            parser(r"\x{FFFFFFFFF}").parse_escape().unwrap_err(),
5072            TestError {
5073                span: span(3..12),
5074                kind: ast::ErrorKind::EscapeHexInvalid,
5075            }
5076        );
5077    }
5078
5079    #[test]
5080    fn parse_decimal() {
5081        assert_eq!(parser("123").parse_decimal(), Ok(123));
5082        assert_eq!(parser("0").parse_decimal(), Ok(0));
5083        assert_eq!(parser("01").parse_decimal(), Ok(1));
5084
5085        assert_eq!(
5086            parser("-1").parse_decimal().unwrap_err(),
5087            TestError { span: span(0..0), kind: ast::ErrorKind::DecimalEmpty }
5088        );
5089        assert_eq!(
5090            parser("").parse_decimal().unwrap_err(),
5091            TestError { span: span(0..0), kind: ast::ErrorKind::DecimalEmpty }
5092        );
5093        assert_eq!(
5094            parser("9999999999").parse_decimal().unwrap_err(),
5095            TestError {
5096                span: span(0..10),
5097                kind: ast::ErrorKind::DecimalInvalid,
5098            }
5099        );
5100    }
5101
5102    #[test]
5103    fn parse_set_class() {
5104        fn union(span: Span, items: Vec<ast::ClassSetItem>) -> ast::ClassSet {
5105            ast::ClassSet::union(ast::ClassSetUnion { span, items })
5106        }
5107
5108        fn intersection(
5109            span: Span,
5110            lhs: ast::ClassSet,
5111            rhs: ast::ClassSet,
5112        ) -> ast::ClassSet {
5113            ast::ClassSet::BinaryOp(ast::ClassSetBinaryOp {
5114                span,
5115                kind: ast::ClassSetBinaryOpKind::Intersection,
5116                lhs: Box::new(lhs),
5117                rhs: Box::new(rhs),
5118            })
5119        }
5120
5121        fn difference(
5122            span: Span,
5123            lhs: ast::ClassSet,
5124            rhs: ast::ClassSet,
5125        ) -> ast::ClassSet {
5126            ast::ClassSet::BinaryOp(ast::ClassSetBinaryOp {
5127                span,
5128                kind: ast::ClassSetBinaryOpKind::Difference,
5129                lhs: Box::new(lhs),
5130                rhs: Box::new(rhs),
5131            })
5132        }
5133
5134        fn symdifference(
5135            span: Span,
5136            lhs: ast::ClassSet,
5137            rhs: ast::ClassSet,
5138        ) -> ast::ClassSet {
5139            ast::ClassSet::BinaryOp(ast::ClassSetBinaryOp {
5140                span,
5141                kind: ast::ClassSetBinaryOpKind::SymmetricDifference,
5142                lhs: Box::new(lhs),
5143                rhs: Box::new(rhs),
5144            })
5145        }
5146
5147        fn itemset(item: ast::ClassSetItem) -> ast::ClassSet {
5148            ast::ClassSet::Item(item)
5149        }
5150
5151        fn item_ascii(cls: ast::ClassAscii) -> ast::ClassSetItem {
5152            ast::ClassSetItem::Ascii(cls)
5153        }
5154
5155        fn item_unicode(cls: ast::ClassUnicode) -> ast::ClassSetItem {
5156            ast::ClassSetItem::Unicode(cls)
5157        }
5158
5159        fn item_perl(cls: ast::ClassPerl) -> ast::ClassSetItem {
5160            ast::ClassSetItem::Perl(cls)
5161        }
5162
5163        fn item_bracket(cls: ast::ClassBracketed) -> ast::ClassSetItem {
5164            ast::ClassSetItem::Bracketed(Box::new(cls))
5165        }
5166
5167        fn lit(span: Span, c: char) -> ast::ClassSetItem {
5168            ast::ClassSetItem::Literal(ast::Literal {
5169                span,
5170                kind: ast::LiteralKind::Verbatim,
5171                c,
5172            })
5173        }
5174
5175        fn empty(span: Span) -> ast::ClassSetItem {
5176            ast::ClassSetItem::Empty(span)
5177        }
5178
5179        fn range(span: Span, start: char, end: char) -> ast::ClassSetItem {
5180            let pos1 = Position {
5181                offset: span.start.offset + start.len_utf8(),
5182                column: span.start.column + 1,
5183                ..span.start
5184            };
5185            let pos2 = Position {
5186                offset: span.end.offset - end.len_utf8(),
5187                column: span.end.column - 1,
5188                ..span.end
5189            };
5190            ast::ClassSetItem::Range(ast::ClassSetRange {
5191                span,
5192                start: ast::Literal {
5193                    span: Span { end: pos1, ..span },
5194                    kind: ast::LiteralKind::Verbatim,
5195                    c: start,
5196                },
5197                end: ast::Literal {
5198                    span: Span { start: pos2, ..span },
5199                    kind: ast::LiteralKind::Verbatim,
5200                    c: end,
5201                },
5202            })
5203        }
5204
5205        fn alnum(span: Span, negated: bool) -> ast::ClassAscii {
5206            ast::ClassAscii { span, kind: ast::ClassAsciiKind::Alnum, negated }
5207        }
5208
5209        fn lower(span: Span, negated: bool) -> ast::ClassAscii {
5210            ast::ClassAscii { span, kind: ast::ClassAsciiKind::Lower, negated }
5211        }
5212
5213        assert_eq!(
5214            parser("[[:alnum:]]").parse(),
5215            Ok(Ast::class_bracketed(ast::ClassBracketed {
5216                span: span(0..11),
5217                negated: false,
5218                kind: itemset(item_ascii(alnum(span(1..10), false))),
5219            }))
5220        );
5221        assert_eq!(
5222            parser("[[[:alnum:]]]").parse(),
5223            Ok(Ast::class_bracketed(ast::ClassBracketed {
5224                span: span(0..13),
5225                negated: false,
5226                kind: itemset(item_bracket(ast::ClassBracketed {
5227                    span: span(1..12),
5228                    negated: false,
5229                    kind: itemset(item_ascii(alnum(span(2..11), false))),
5230                })),
5231            }))
5232        );
5233        assert_eq!(
5234            parser("[[:alnum:]&&[:lower:]]").parse(),
5235            Ok(Ast::class_bracketed(ast::ClassBracketed {
5236                span: span(0..22),
5237                negated: false,
5238                kind: intersection(
5239                    span(1..21),
5240                    itemset(item_ascii(alnum(span(1..10), false))),
5241                    itemset(item_ascii(lower(span(12..21), false))),
5242                ),
5243            }))
5244        );
5245        assert_eq!(
5246            parser("[[:alnum:]--[:lower:]]").parse(),
5247            Ok(Ast::class_bracketed(ast::ClassBracketed {
5248                span: span(0..22),
5249                negated: false,
5250                kind: difference(
5251                    span(1..21),
5252                    itemset(item_ascii(alnum(span(1..10), false))),
5253                    itemset(item_ascii(lower(span(12..21), false))),
5254                ),
5255            }))
5256        );
5257        assert_eq!(
5258            parser("[[:alnum:]~~[:lower:]]").parse(),
5259            Ok(Ast::class_bracketed(ast::ClassBracketed {
5260                span: span(0..22),
5261                negated: false,
5262                kind: symdifference(
5263                    span(1..21),
5264                    itemset(item_ascii(alnum(span(1..10), false))),
5265                    itemset(item_ascii(lower(span(12..21), false))),
5266                ),
5267            }))
5268        );
5269
5270        assert_eq!(
5271            parser("[a]").parse(),
5272            Ok(Ast::class_bracketed(ast::ClassBracketed {
5273                span: span(0..3),
5274                negated: false,
5275                kind: itemset(lit(span(1..2), 'a')),
5276            }))
5277        );
5278        assert_eq!(
5279            parser(r"[a\]]").parse(),
5280            Ok(Ast::class_bracketed(ast::ClassBracketed {
5281                span: span(0..5),
5282                negated: false,
5283                kind: union(
5284                    span(1..4),
5285                    vec![
5286                        lit(span(1..2), 'a'),
5287                        ast::ClassSetItem::Literal(ast::Literal {
5288                            span: span(2..4),
5289                            kind: ast::LiteralKind::Meta,
5290                            c: ']',
5291                        }),
5292                    ]
5293                ),
5294            }))
5295        );
5296        assert_eq!(
5297            parser(r"[a\-z]").parse(),
5298            Ok(Ast::class_bracketed(ast::ClassBracketed {
5299                span: span(0..6),
5300                negated: false,
5301                kind: union(
5302                    span(1..5),
5303                    vec![
5304                        lit(span(1..2), 'a'),
5305                        ast::ClassSetItem::Literal(ast::Literal {
5306                            span: span(2..4),
5307                            kind: ast::LiteralKind::Meta,
5308                            c: '-',
5309                        }),
5310                        lit(span(4..5), 'z'),
5311                    ]
5312                ),
5313            }))
5314        );
5315        assert_eq!(
5316            parser("[ab]").parse(),
5317            Ok(Ast::class_bracketed(ast::ClassBracketed {
5318                span: span(0..4),
5319                negated: false,
5320                kind: union(
5321                    span(1..3),
5322                    vec![lit(span(1..2), 'a'), lit(span(2..3), 'b'),]
5323                ),
5324            }))
5325        );
5326        assert_eq!(
5327            parser("[a-]").parse(),
5328            Ok(Ast::class_bracketed(ast::ClassBracketed {
5329                span: span(0..4),
5330                negated: false,
5331                kind: union(
5332                    span(1..3),
5333                    vec![lit(span(1..2), 'a'), lit(span(2..3), '-'),]
5334                ),
5335            }))
5336        );
5337        assert_eq!(
5338            parser("[-a]").parse(),
5339            Ok(Ast::class_bracketed(ast::ClassBracketed {
5340                span: span(0..4),
5341                negated: false,
5342                kind: union(
5343                    span(1..3),
5344                    vec![lit(span(1..2), '-'), lit(span(2..3), 'a'),]
5345                ),
5346            }))
5347        );
5348        assert_eq!(
5349            parser(r"[\pL]").parse(),
5350            Ok(Ast::class_bracketed(ast::ClassBracketed {
5351                span: span(0..5),
5352                negated: false,
5353                kind: itemset(item_unicode(ast::ClassUnicode {
5354                    span: span(1..4),
5355                    negated: false,
5356                    kind: ast::ClassUnicodeKind::OneLetter('L'),
5357                })),
5358            }))
5359        );
5360        assert_eq!(
5361            parser(r"[\w]").parse(),
5362            Ok(Ast::class_bracketed(ast::ClassBracketed {
5363                span: span(0..4),
5364                negated: false,
5365                kind: itemset(item_perl(ast::ClassPerl {
5366                    span: span(1..3),
5367                    kind: ast::ClassPerlKind::Word,
5368                    negated: false,
5369                })),
5370            }))
5371        );
5372        assert_eq!(
5373            parser(r"[a\wz]").parse(),
5374            Ok(Ast::class_bracketed(ast::ClassBracketed {
5375                span: span(0..6),
5376                negated: false,
5377                kind: union(
5378                    span(1..5),
5379                    vec![
5380                        lit(span(1..2), 'a'),
5381                        item_perl(ast::ClassPerl {
5382                            span: span(2..4),
5383                            kind: ast::ClassPerlKind::Word,
5384                            negated: false,
5385                        }),
5386                        lit(span(4..5), 'z'),
5387                    ]
5388                ),
5389            }))
5390        );
5391
5392        assert_eq!(
5393            parser("[a-z]").parse(),
5394            Ok(Ast::class_bracketed(ast::ClassBracketed {
5395                span: span(0..5),
5396                negated: false,
5397                kind: itemset(range(span(1..4), 'a', 'z')),
5398            }))
5399        );
5400        assert_eq!(
5401            parser("[a-cx-z]").parse(),
5402            Ok(Ast::class_bracketed(ast::ClassBracketed {
5403                span: span(0..8),
5404                negated: false,
5405                kind: union(
5406                    span(1..7),
5407                    vec![
5408                        range(span(1..4), 'a', 'c'),
5409                        range(span(4..7), 'x', 'z'),
5410                    ]
5411                ),
5412            }))
5413        );
5414        assert_eq!(
5415            parser(r"[\w&&a-cx-z]").parse(),
5416            Ok(Ast::class_bracketed(ast::ClassBracketed {
5417                span: span(0..12),
5418                negated: false,
5419                kind: intersection(
5420                    span(1..11),
5421                    itemset(item_perl(ast::ClassPerl {
5422                        span: span(1..3),
5423                        kind: ast::ClassPerlKind::Word,
5424                        negated: false,
5425                    })),
5426                    union(
5427                        span(5..11),
5428                        vec![
5429                            range(span(5..8), 'a', 'c'),
5430                            range(span(8..11), 'x', 'z'),
5431                        ]
5432                    ),
5433                ),
5434            }))
5435        );
5436        assert_eq!(
5437            parser(r"[a-cx-z&&\w]").parse(),
5438            Ok(Ast::class_bracketed(ast::ClassBracketed {
5439                span: span(0..12),
5440                negated: false,
5441                kind: intersection(
5442                    span(1..11),
5443                    union(
5444                        span(1..7),
5445                        vec![
5446                            range(span(1..4), 'a', 'c'),
5447                            range(span(4..7), 'x', 'z'),
5448                        ]
5449                    ),
5450                    itemset(item_perl(ast::ClassPerl {
5451                        span: span(9..11),
5452                        kind: ast::ClassPerlKind::Word,
5453                        negated: false,
5454                    })),
5455                ),
5456            }))
5457        );
5458        assert_eq!(
5459            parser(r"[a--b--c]").parse(),
5460            Ok(Ast::class_bracketed(ast::ClassBracketed {
5461                span: span(0..9),
5462                negated: false,
5463                kind: difference(
5464                    span(1..8),
5465                    difference(
5466                        span(1..5),
5467                        itemset(lit(span(1..2), 'a')),
5468                        itemset(lit(span(4..5), 'b')),
5469                    ),
5470                    itemset(lit(span(7..8), 'c')),
5471                ),
5472            }))
5473        );
5474        assert_eq!(
5475            parser(r"[a~~b~~c]").parse(),
5476            Ok(Ast::class_bracketed(ast::ClassBracketed {
5477                span: span(0..9),
5478                negated: false,
5479                kind: symdifference(
5480                    span(1..8),
5481                    symdifference(
5482                        span(1..5),
5483                        itemset(lit(span(1..2), 'a')),
5484                        itemset(lit(span(4..5), 'b')),
5485                    ),
5486                    itemset(lit(span(7..8), 'c')),
5487                ),
5488            }))
5489        );
5490        assert_eq!(
5491            parser(r"[\^&&^]").parse(),
5492            Ok(Ast::class_bracketed(ast::ClassBracketed {
5493                span: span(0..7),
5494                negated: false,
5495                kind: intersection(
5496                    span(1..6),
5497                    itemset(ast::ClassSetItem::Literal(ast::Literal {
5498                        span: span(1..3),
5499                        kind: ast::LiteralKind::Meta,
5500                        c: '^',
5501                    })),
5502                    itemset(lit(span(5..6), '^')),
5503                ),
5504            }))
5505        );
5506        assert_eq!(
5507            parser(r"[\&&&&]").parse(),
5508            Ok(Ast::class_bracketed(ast::ClassBracketed {
5509                span: span(0..7),
5510                negated: false,
5511                kind: intersection(
5512                    span(1..6),
5513                    itemset(ast::ClassSetItem::Literal(ast::Literal {
5514                        span: span(1..3),
5515                        kind: ast::LiteralKind::Meta,
5516                        c: '&',
5517                    })),
5518                    itemset(lit(span(5..6), '&')),
5519                ),
5520            }))
5521        );
5522        assert_eq!(
5523            parser(r"[&&&&]").parse(),
5524            Ok(Ast::class_bracketed(ast::ClassBracketed {
5525                span: span(0..6),
5526                negated: false,
5527                kind: intersection(
5528                    span(1..5),
5529                    intersection(
5530                        span(1..3),
5531                        itemset(empty(span(1..1))),
5532                        itemset(empty(span(3..3))),
5533                    ),
5534                    itemset(empty(span(5..5))),
5535                ),
5536            }))
5537        );
5538
5539        let pat = "[☃-⛄]";
5540        assert_eq!(
5541            parser(pat).parse(),
5542            Ok(Ast::class_bracketed(ast::ClassBracketed {
5543                span: span_range(pat, 0..9),
5544                negated: false,
5545                kind: itemset(ast::ClassSetItem::Range(ast::ClassSetRange {
5546                    span: span_range(pat, 1..8),
5547                    start: ast::Literal {
5548                        span: span_range(pat, 1..4),
5549                        kind: ast::LiteralKind::Verbatim,
5550                        c: '☃',
5551                    },
5552                    end: ast::Literal {
5553                        span: span_range(pat, 5..8),
5554                        kind: ast::LiteralKind::Verbatim,
5555                        c: '⛄',
5556                    },
5557                })),
5558            }))
5559        );
5560
5561        assert_eq!(
5562            parser(r"[]]").parse(),
5563            Ok(Ast::class_bracketed(ast::ClassBracketed {
5564                span: span(0..3),
5565                negated: false,
5566                kind: itemset(lit(span(1..2), ']')),
5567            }))
5568        );
5569        assert_eq!(
5570            parser(r"[]\[]").parse(),
5571            Ok(Ast::class_bracketed(ast::ClassBracketed {
5572                span: span(0..5),
5573                negated: false,
5574                kind: union(
5575                    span(1..4),
5576                    vec![
5577                        lit(span(1..2), ']'),
5578                        ast::ClassSetItem::Literal(ast::Literal {
5579                            span: span(2..4),
5580                            kind: ast::LiteralKind::Meta,
5581                            c: '[',
5582                        }),
5583                    ]
5584                ),
5585            }))
5586        );
5587        assert_eq!(
5588            parser(r"[\[]]").parse(),
5589            Ok(concat(
5590                0..5,
5591                vec![
5592                    Ast::class_bracketed(ast::ClassBracketed {
5593                        span: span(0..4),
5594                        negated: false,
5595                        kind: itemset(ast::ClassSetItem::Literal(
5596                            ast::Literal {
5597                                span: span(1..3),
5598                                kind: ast::LiteralKind::Meta,
5599                                c: '[',
5600                            }
5601                        )),
5602                    }),
5603                    Ast::literal(ast::Literal {
5604                        span: span(4..5),
5605                        kind: ast::LiteralKind::Verbatim,
5606                        c: ']',
5607                    }),
5608                ]
5609            ))
5610        );
5611
5612        assert_eq!(
5613            parser("[").parse().unwrap_err(),
5614            TestError {
5615                span: span(0..1),
5616                kind: ast::ErrorKind::ClassUnclosed,
5617            }
5618        );
5619        assert_eq!(
5620            parser("[[").parse().unwrap_err(),
5621            TestError {
5622                span: span(1..2),
5623                kind: ast::ErrorKind::ClassUnclosed,
5624            }
5625        );
5626        assert_eq!(
5627            parser("[[-]").parse().unwrap_err(),
5628            TestError {
5629                span: span(0..1),
5630                kind: ast::ErrorKind::ClassUnclosed,
5631            }
5632        );
5633        assert_eq!(
5634            parser("[[[:alnum:]").parse().unwrap_err(),
5635            TestError {
5636                span: span(1..2),
5637                kind: ast::ErrorKind::ClassUnclosed,
5638            }
5639        );
5640        assert_eq!(
5641            parser(r"[\b]").parse().unwrap_err(),
5642            TestError {
5643                span: span(1..3),
5644                kind: ast::ErrorKind::ClassEscapeInvalid,
5645            }
5646        );
5647        assert_eq!(
5648            parser(r"[\w-a]").parse().unwrap_err(),
5649            TestError {
5650                span: span(1..3),
5651                kind: ast::ErrorKind::ClassRangeLiteral,
5652            }
5653        );
5654        assert_eq!(
5655            parser(r"[a-\w]").parse().unwrap_err(),
5656            TestError {
5657                span: span(3..5),
5658                kind: ast::ErrorKind::ClassRangeLiteral,
5659            }
5660        );
5661        assert_eq!(
5662            parser(r"[z-a]").parse().unwrap_err(),
5663            TestError {
5664                span: span(1..4),
5665                kind: ast::ErrorKind::ClassRangeInvalid,
5666            }
5667        );
5668
5669        assert_eq!(
5670            parser_ignore_whitespace("[a ").parse().unwrap_err(),
5671            TestError {
5672                span: span(0..1),
5673                kind: ast::ErrorKind::ClassUnclosed,
5674            }
5675        );
5676        assert_eq!(
5677            parser_ignore_whitespace("[a- ").parse().unwrap_err(),
5678            TestError {
5679                span: span(0..1),
5680                kind: ast::ErrorKind::ClassUnclosed,
5681            }
5682        );
5683    }
5684
5685    #[test]
5686    fn parse_set_class_open() {
5687        assert_eq!(parser("[a]").parse_set_class_open(), {
5688            let set = ast::ClassBracketed {
5689                span: span(0..1),
5690                negated: false,
5691                kind: ast::ClassSet::union(ast::ClassSetUnion {
5692                    span: span(1..1),
5693                    items: vec![],
5694                }),
5695            };
5696            let union = ast::ClassSetUnion { span: span(1..1), items: vec![] };
5697            Ok((set, union))
5698        });
5699        assert_eq!(
5700            parser_ignore_whitespace("[   a]").parse_set_class_open(),
5701            {
5702                let set = ast::ClassBracketed {
5703                    span: span(0..4),
5704                    negated: false,
5705                    kind: ast::ClassSet::union(ast::ClassSetUnion {
5706                        span: span(4..4),
5707                        items: vec![],
5708                    }),
5709                };
5710                let union =
5711                    ast::ClassSetUnion { span: span(4..4), items: vec![] };
5712                Ok((set, union))
5713            }
5714        );
5715        assert_eq!(parser("[^a]").parse_set_class_open(), {
5716            let set = ast::ClassBracketed {
5717                span: span(0..2),
5718                negated: true,
5719                kind: ast::ClassSet::union(ast::ClassSetUnion {
5720                    span: span(2..2),
5721                    items: vec![],
5722                }),
5723            };
5724            let union = ast::ClassSetUnion { span: span(2..2), items: vec![] };
5725            Ok((set, union))
5726        });
5727        assert_eq!(
5728            parser_ignore_whitespace("[ ^ a]").parse_set_class_open(),
5729            {
5730                let set = ast::ClassBracketed {
5731                    span: span(0..4),
5732                    negated: true,
5733                    kind: ast::ClassSet::union(ast::ClassSetUnion {
5734                        span: span(4..4),
5735                        items: vec![],
5736                    }),
5737                };
5738                let union =
5739                    ast::ClassSetUnion { span: span(4..4), items: vec![] };
5740                Ok((set, union))
5741            }
5742        );
5743        assert_eq!(parser("[-a]").parse_set_class_open(), {
5744            let set = ast::ClassBracketed {
5745                span: span(0..2),
5746                negated: false,
5747                kind: ast::ClassSet::union(ast::ClassSetUnion {
5748                    span: span(1..1),
5749                    items: vec![],
5750                }),
5751            };
5752            let union = ast::ClassSetUnion {
5753                span: span(1..2),
5754                items: vec![ast::ClassSetItem::Literal(ast::Literal {
5755                    span: span(1..2),
5756                    kind: ast::LiteralKind::Verbatim,
5757                    c: '-',
5758                })],
5759            };
5760            Ok((set, union))
5761        });
5762        assert_eq!(
5763            parser_ignore_whitespace("[ - a]").parse_set_class_open(),
5764            {
5765                let set = ast::ClassBracketed {
5766                    span: span(0..4),
5767                    negated: false,
5768                    kind: ast::ClassSet::union(ast::ClassSetUnion {
5769                        span: span(2..2),
5770                        items: vec![],
5771                    }),
5772                };
5773                let union = ast::ClassSetUnion {
5774                    span: span(2..3),
5775                    items: vec![ast::ClassSetItem::Literal(ast::Literal {
5776                        span: span(2..3),
5777                        kind: ast::LiteralKind::Verbatim,
5778                        c: '-',
5779                    })],
5780                };
5781                Ok((set, union))
5782            }
5783        );
5784        assert_eq!(parser("[^-a]").parse_set_class_open(), {
5785            let set = ast::ClassBracketed {
5786                span: span(0..3),
5787                negated: true,
5788                kind: ast::ClassSet::union(ast::ClassSetUnion {
5789                    span: span(2..2),
5790                    items: vec![],
5791                }),
5792            };
5793            let union = ast::ClassSetUnion {
5794                span: span(2..3),
5795                items: vec![ast::ClassSetItem::Literal(ast::Literal {
5796                    span: span(2..3),
5797                    kind: ast::LiteralKind::Verbatim,
5798                    c: '-',
5799                })],
5800            };
5801            Ok((set, union))
5802        });
5803        assert_eq!(parser("[--a]").parse_set_class_open(), {
5804            let set = ast::ClassBracketed {
5805                span: span(0..3),
5806                negated: false,
5807                kind: ast::ClassSet::union(ast::ClassSetUnion {
5808                    span: span(1..1),
5809                    items: vec![],
5810                }),
5811            };
5812            let union = ast::ClassSetUnion {
5813                span: span(1..3),
5814                items: vec![
5815                    ast::ClassSetItem::Literal(ast::Literal {
5816                        span: span(1..2),
5817                        kind: ast::LiteralKind::Verbatim,
5818                        c: '-',
5819                    }),
5820                    ast::ClassSetItem::Literal(ast::Literal {
5821                        span: span(2..3),
5822                        kind: ast::LiteralKind::Verbatim,
5823                        c: '-',
5824                    }),
5825                ],
5826            };
5827            Ok((set, union))
5828        });
5829        assert_eq!(parser("[]a]").parse_set_class_open(), {
5830            let set = ast::ClassBracketed {
5831                span: span(0..2),
5832                negated: false,
5833                kind: ast::ClassSet::union(ast::ClassSetUnion {
5834                    span: span(1..1),
5835                    items: vec![],
5836                }),
5837            };
5838            let union = ast::ClassSetUnion {
5839                span: span(1..2),
5840                items: vec![ast::ClassSetItem::Literal(ast::Literal {
5841                    span: span(1..2),
5842                    kind: ast::LiteralKind::Verbatim,
5843                    c: ']',
5844                })],
5845            };
5846            Ok((set, union))
5847        });
5848        assert_eq!(
5849            parser_ignore_whitespace("[ ] a]").parse_set_class_open(),
5850            {
5851                let set = ast::ClassBracketed {
5852                    span: span(0..4),
5853                    negated: false,
5854                    kind: ast::ClassSet::union(ast::ClassSetUnion {
5855                        span: span(2..2),
5856                        items: vec![],
5857                    }),
5858                };
5859                let union = ast::ClassSetUnion {
5860                    span: span(2..3),
5861                    items: vec![ast::ClassSetItem::Literal(ast::Literal {
5862                        span: span(2..3),
5863                        kind: ast::LiteralKind::Verbatim,
5864                        c: ']',
5865                    })],
5866                };
5867                Ok((set, union))
5868            }
5869        );
5870        assert_eq!(parser("[^]a]").parse_set_class_open(), {
5871            let set = ast::ClassBracketed {
5872                span: span(0..3),
5873                negated: true,
5874                kind: ast::ClassSet::union(ast::ClassSetUnion {
5875                    span: span(2..2),
5876                    items: vec![],
5877                }),
5878            };
5879            let union = ast::ClassSetUnion {
5880                span: span(2..3),
5881                items: vec![ast::ClassSetItem::Literal(ast::Literal {
5882                    span: span(2..3),
5883                    kind: ast::LiteralKind::Verbatim,
5884                    c: ']',
5885                })],
5886            };
5887            Ok((set, union))
5888        });
5889        assert_eq!(parser("[-]a]").parse_set_class_open(), {
5890            let set = ast::ClassBracketed {
5891                span: span(0..2),
5892                negated: false,
5893                kind: ast::ClassSet::union(ast::ClassSetUnion {
5894                    span: span(1..1),
5895                    items: vec![],
5896                }),
5897            };
5898            let union = ast::ClassSetUnion {
5899                span: span(1..2),
5900                items: vec![ast::ClassSetItem::Literal(ast::Literal {
5901                    span: span(1..2),
5902                    kind: ast::LiteralKind::Verbatim,
5903                    c: '-',
5904                })],
5905            };
5906            Ok((set, union))
5907        });
5908
5909        assert_eq!(
5910            parser("[").parse_set_class_open().unwrap_err(),
5911            TestError {
5912                span: span(0..1),
5913                kind: ast::ErrorKind::ClassUnclosed,
5914            }
5915        );
5916        assert_eq!(
5917            parser_ignore_whitespace("[    ")
5918                .parse_set_class_open()
5919                .unwrap_err(),
5920            TestError {
5921                span: span(0..5),
5922                kind: ast::ErrorKind::ClassUnclosed,
5923            }
5924        );
5925        assert_eq!(
5926            parser("[^").parse_set_class_open().unwrap_err(),
5927            TestError {
5928                span: span(0..2),
5929                kind: ast::ErrorKind::ClassUnclosed,
5930            }
5931        );
5932        assert_eq!(
5933            parser("[]").parse_set_class_open().unwrap_err(),
5934            TestError {
5935                span: span(0..2),
5936                kind: ast::ErrorKind::ClassUnclosed,
5937            }
5938        );
5939        assert_eq!(
5940            parser("[-").parse_set_class_open().unwrap_err(),
5941            TestError {
5942                span: span(0..0),
5943                kind: ast::ErrorKind::ClassUnclosed,
5944            }
5945        );
5946        assert_eq!(
5947            parser("[--").parse_set_class_open().unwrap_err(),
5948            TestError {
5949                span: span(0..0),
5950                kind: ast::ErrorKind::ClassUnclosed,
5951            }
5952        );
5953
5954        // See: https://github.com/rust-lang/regex/issues/792
5955        assert_eq!(
5956            parser("(?x)[-#]").parse_with_comments().unwrap_err(),
5957            TestError {
5958                span: span(4..4),
5959                kind: ast::ErrorKind::ClassUnclosed,
5960            }
5961        );
5962    }
5963
5964    #[test]
5965    fn maybe_parse_ascii_class() {
5966        assert_eq!(
5967            parser(r"[:alnum:]").maybe_parse_ascii_class(),
5968            Some(ast::ClassAscii {
5969                span: span(0..9),
5970                kind: ast::ClassAsciiKind::Alnum,
5971                negated: false,
5972            })
5973        );
5974        assert_eq!(
5975            parser(r"[:alnum:]A").maybe_parse_ascii_class(),
5976            Some(ast::ClassAscii {
5977                span: span(0..9),
5978                kind: ast::ClassAsciiKind::Alnum,
5979                negated: false,
5980            })
5981        );
5982        assert_eq!(
5983            parser(r"[:^alnum:]").maybe_parse_ascii_class(),
5984            Some(ast::ClassAscii {
5985                span: span(0..10),
5986                kind: ast::ClassAsciiKind::Alnum,
5987                negated: true,
5988            })
5989        );
5990
5991        let p = parser(r"[:");
5992        assert_eq!(p.maybe_parse_ascii_class(), None);
5993        assert_eq!(p.offset(), 0);
5994
5995        let p = parser(r"[:^");
5996        assert_eq!(p.maybe_parse_ascii_class(), None);
5997        assert_eq!(p.offset(), 0);
5998
5999        let p = parser(r"[^:alnum:]");
6000        assert_eq!(p.maybe_parse_ascii_class(), None);
6001        assert_eq!(p.offset(), 0);
6002
6003        let p = parser(r"[:alnnum:]");
6004        assert_eq!(p.maybe_parse_ascii_class(), None);
6005        assert_eq!(p.offset(), 0);
6006
6007        let p = parser(r"[:alnum]");
6008        assert_eq!(p.maybe_parse_ascii_class(), None);
6009        assert_eq!(p.offset(), 0);
6010
6011        let p = parser(r"[:alnum:");
6012        assert_eq!(p.maybe_parse_ascii_class(), None);
6013        assert_eq!(p.offset(), 0);
6014    }
6015
6016    #[test]
6017    fn parse_unicode_class() {
6018        assert_eq!(
6019            parser(r"\pN").parse_escape(),
6020            Ok(Primitive::Unicode(ast::ClassUnicode {
6021                span: span(0..3),
6022                negated: false,
6023                kind: ast::ClassUnicodeKind::OneLetter('N'),
6024            }))
6025        );
6026        assert_eq!(
6027            parser(r"\PN").parse_escape(),
6028            Ok(Primitive::Unicode(ast::ClassUnicode {
6029                span: span(0..3),
6030                negated: true,
6031                kind: ast::ClassUnicodeKind::OneLetter('N'),
6032            }))
6033        );
6034        assert_eq!(
6035            parser(r"\p{N}").parse_escape(),
6036            Ok(Primitive::Unicode(ast::ClassUnicode {
6037                span: span(0..5),
6038                negated: false,
6039                kind: ast::ClassUnicodeKind::Named(s("N")),
6040            }))
6041        );
6042        assert_eq!(
6043            parser(r"\P{N}").parse_escape(),
6044            Ok(Primitive::Unicode(ast::ClassUnicode {
6045                span: span(0..5),
6046                negated: true,
6047                kind: ast::ClassUnicodeKind::Named(s("N")),
6048            }))
6049        );
6050        assert_eq!(
6051            parser(r"\p{Greek}").parse_escape(),
6052            Ok(Primitive::Unicode(ast::ClassUnicode {
6053                span: span(0..9),
6054                negated: false,
6055                kind: ast::ClassUnicodeKind::Named(s("Greek")),
6056            }))
6057        );
6058
6059        assert_eq!(
6060            parser(r"\p{scx:Katakana}").parse_escape(),
6061            Ok(Primitive::Unicode(ast::ClassUnicode {
6062                span: span(0..16),
6063                negated: false,
6064                kind: ast::ClassUnicodeKind::NamedValue {
6065                    op: ast::ClassUnicodeOpKind::Colon,
6066                    name: s("scx"),
6067                    value: s("Katakana"),
6068                },
6069            }))
6070        );
6071        assert_eq!(
6072            parser(r"\p{scx=Katakana}").parse_escape(),
6073            Ok(Primitive::Unicode(ast::ClassUnicode {
6074                span: span(0..16),
6075                negated: false,
6076                kind: ast::ClassUnicodeKind::NamedValue {
6077                    op: ast::ClassUnicodeOpKind::Equal,
6078                    name: s("scx"),
6079                    value: s("Katakana"),
6080                },
6081            }))
6082        );
6083        assert_eq!(
6084            parser(r"\p{scx!=Katakana}").parse_escape(),
6085            Ok(Primitive::Unicode(ast::ClassUnicode {
6086                span: span(0..17),
6087                negated: false,
6088                kind: ast::ClassUnicodeKind::NamedValue {
6089                    op: ast::ClassUnicodeOpKind::NotEqual,
6090                    name: s("scx"),
6091                    value: s("Katakana"),
6092                },
6093            }))
6094        );
6095
6096        assert_eq!(
6097            parser(r"\p{:}").parse_escape(),
6098            Ok(Primitive::Unicode(ast::ClassUnicode {
6099                span: span(0..5),
6100                negated: false,
6101                kind: ast::ClassUnicodeKind::NamedValue {
6102                    op: ast::ClassUnicodeOpKind::Colon,
6103                    name: s(""),
6104                    value: s(""),
6105                },
6106            }))
6107        );
6108        assert_eq!(
6109            parser(r"\p{=}").parse_escape(),
6110            Ok(Primitive::Unicode(ast::ClassUnicode {
6111                span: span(0..5),
6112                negated: false,
6113                kind: ast::ClassUnicodeKind::NamedValue {
6114                    op: ast::ClassUnicodeOpKind::Equal,
6115                    name: s(""),
6116                    value: s(""),
6117                },
6118            }))
6119        );
6120        assert_eq!(
6121            parser(r"\p{!=}").parse_escape(),
6122            Ok(Primitive::Unicode(ast::ClassUnicode {
6123                span: span(0..6),
6124                negated: false,
6125                kind: ast::ClassUnicodeKind::NamedValue {
6126                    op: ast::ClassUnicodeOpKind::NotEqual,
6127                    name: s(""),
6128                    value: s(""),
6129                },
6130            }))
6131        );
6132
6133        assert_eq!(
6134            parser(r"\p").parse_escape().unwrap_err(),
6135            TestError {
6136                span: span(2..2),
6137                kind: ast::ErrorKind::EscapeUnexpectedEof,
6138            }
6139        );
6140        assert_eq!(
6141            parser(r"\p{").parse_escape().unwrap_err(),
6142            TestError {
6143                span: span(3..3),
6144                kind: ast::ErrorKind::EscapeUnexpectedEof,
6145            }
6146        );
6147        assert_eq!(
6148            parser(r"\p{N").parse_escape().unwrap_err(),
6149            TestError {
6150                span: span(4..4),
6151                kind: ast::ErrorKind::EscapeUnexpectedEof,
6152            }
6153        );
6154        assert_eq!(
6155            parser(r"\p{Greek").parse_escape().unwrap_err(),
6156            TestError {
6157                span: span(8..8),
6158                kind: ast::ErrorKind::EscapeUnexpectedEof,
6159            }
6160        );
6161
6162        assert_eq!(
6163            parser(r"\pNz").parse(),
6164            Ok(Ast::concat(ast::Concat {
6165                span: span(0..4),
6166                asts: vec![
6167                    Ast::class_unicode(ast::ClassUnicode {
6168                        span: span(0..3),
6169                        negated: false,
6170                        kind: ast::ClassUnicodeKind::OneLetter('N'),
6171                    }),
6172                    Ast::literal(ast::Literal {
6173                        span: span(3..4),
6174                        kind: ast::LiteralKind::Verbatim,
6175                        c: 'z',
6176                    }),
6177                ],
6178            }))
6179        );
6180        assert_eq!(
6181            parser(r"\p{Greek}z").parse(),
6182            Ok(Ast::concat(ast::Concat {
6183                span: span(0..10),
6184                asts: vec![
6185                    Ast::class_unicode(ast::ClassUnicode {
6186                        span: span(0..9),
6187                        negated: false,
6188                        kind: ast::ClassUnicodeKind::Named(s("Greek")),
6189                    }),
6190                    Ast::literal(ast::Literal {
6191                        span: span(9..10),
6192                        kind: ast::LiteralKind::Verbatim,
6193                        c: 'z',
6194                    }),
6195                ],
6196            }))
6197        );
6198        assert_eq!(
6199            parser(r"\p\{").parse().unwrap_err(),
6200            TestError {
6201                span: span(2..3),
6202                kind: ast::ErrorKind::UnicodeClassInvalid,
6203            }
6204        );
6205        assert_eq!(
6206            parser(r"\P\{").parse().unwrap_err(),
6207            TestError {
6208                span: span(2..3),
6209                kind: ast::ErrorKind::UnicodeClassInvalid,
6210            }
6211        );
6212    }
6213
6214    #[test]
6215    fn parse_perl_class() {
6216        assert_eq!(
6217            parser(r"\d").parse_escape(),
6218            Ok(Primitive::Perl(ast::ClassPerl {
6219                span: span(0..2),
6220                kind: ast::ClassPerlKind::Digit,
6221                negated: false,
6222            }))
6223        );
6224        assert_eq!(
6225            parser(r"\D").parse_escape(),
6226            Ok(Primitive::Perl(ast::ClassPerl {
6227                span: span(0..2),
6228                kind: ast::ClassPerlKind::Digit,
6229                negated: true,
6230            }))
6231        );
6232        assert_eq!(
6233            parser(r"\s").parse_escape(),
6234            Ok(Primitive::Perl(ast::ClassPerl {
6235                span: span(0..2),
6236                kind: ast::ClassPerlKind::Space,
6237                negated: false,
6238            }))
6239        );
6240        assert_eq!(
6241            parser(r"\S").parse_escape(),
6242            Ok(Primitive::Perl(ast::ClassPerl {
6243                span: span(0..2),
6244                kind: ast::ClassPerlKind::Space,
6245                negated: true,
6246            }))
6247        );
6248        assert_eq!(
6249            parser(r"\w").parse_escape(),
6250            Ok(Primitive::Perl(ast::ClassPerl {
6251                span: span(0..2),
6252                kind: ast::ClassPerlKind::Word,
6253                negated: false,
6254            }))
6255        );
6256        assert_eq!(
6257            parser(r"\W").parse_escape(),
6258            Ok(Primitive::Perl(ast::ClassPerl {
6259                span: span(0..2),
6260                kind: ast::ClassPerlKind::Word,
6261                negated: true,
6262            }))
6263        );
6264
6265        assert_eq!(
6266            parser(r"\d").parse(),
6267            Ok(Ast::class_perl(ast::ClassPerl {
6268                span: span(0..2),
6269                kind: ast::ClassPerlKind::Digit,
6270                negated: false,
6271            }))
6272        );
6273        assert_eq!(
6274            parser(r"\dz").parse(),
6275            Ok(Ast::concat(ast::Concat {
6276                span: span(0..3),
6277                asts: vec![
6278                    Ast::class_perl(ast::ClassPerl {
6279                        span: span(0..2),
6280                        kind: ast::ClassPerlKind::Digit,
6281                        negated: false,
6282                    }),
6283                    Ast::literal(ast::Literal {
6284                        span: span(2..3),
6285                        kind: ast::LiteralKind::Verbatim,
6286                        c: 'z',
6287                    }),
6288                ],
6289            }))
6290        );
6291    }
6292
6293    // This tests a bug fix where the nest limit checker wasn't decrementing
6294    // its depth during post-traversal, which causes long regexes to trip
6295    // the default limit too aggressively.
6296    #[test]
6297    fn regression_454_nest_too_big() {
6298        let pattern = r#"
6299        2(?:
6300          [45]\d{3}|
6301          7(?:
6302            1[0-267]|
6303            2[0-289]|
6304            3[0-29]|
6305            4[01]|
6306            5[1-3]|
6307            6[013]|
6308            7[0178]|
6309            91
6310          )|
6311          8(?:
6312            0[125]|
6313            [139][1-6]|
6314            2[0157-9]|
6315            41|
6316            6[1-35]|
6317            7[1-5]|
6318            8[1-8]|
6319            90
6320          )|
6321          9(?:
6322            0[0-2]|
6323            1[0-4]|
6324            2[568]|
6325            3[3-6]|
6326            5[5-7]|
6327            6[0167]|
6328            7[15]|
6329            8[0146-9]
6330          )
6331        )\d{4}
6332        "#;
6333        assert!(parser_nest_limit(pattern, 50).parse().is_ok());
6334    }
6335
6336    // This tests that we treat a trailing `-` in a character class as a
6337    // literal `-` even when whitespace mode is enabled and there is whitespace
6338    // after the trailing `-`.
6339    #[test]
6340    fn regression_455_trailing_dash_ignore_whitespace() {
6341        assert!(parser("(?x)[ / - ]").parse().is_ok());
6342        assert!(parser("(?x)[ a - ]").parse().is_ok());
6343        assert!(parser(
6344            "(?x)[
6345            a
6346            - ]
6347        "
6348        )
6349        .parse()
6350        .is_ok());
6351        assert!(parser(
6352            "(?x)[
6353            a # wat
6354            - ]
6355        "
6356        )
6357        .parse()
6358        .is_ok());
6359
6360        assert!(parser("(?x)[ / -").parse().is_err());
6361        assert!(parser("(?x)[ / - ").parse().is_err());
6362        assert!(parser(
6363            "(?x)[
6364            / -
6365        "
6366        )
6367        .parse()
6368        .is_err());
6369        assert!(parser(
6370            "(?x)[
6371            / - # wat
6372        "
6373        )
6374        .parse()
6375        .is_err());
6376    }
6377}
regex_syntax/ast/parse.rs

regex_syntax/ast/
parse.rs