regex_syntax/hir/mod.rs
1/*!
2Defines a high-level intermediate (HIR) representation for regular expressions.
3
4The HIR is represented by the [`Hir`] type, and it principally constructed via
5[translation](translate) from an [`Ast`](crate::ast::Ast). Alternatively, users
6may use the smart constructors defined on `Hir` to build their own by hand. The
7smart constructors simultaneously simplify and "optimize" the HIR, and are also
8the same routines used by translation.
9
10Most regex engines only have an HIR like this, and usually construct it
11directly from the concrete syntax. This crate however first parses the
12concrete syntax into an `Ast`, and only then creates the HIR from the `Ast`,
13as mentioned above. It's done this way to facilitate better error reporting,
14and to have a structured representation of a regex that faithfully represents
15its concrete syntax. Namely, while an `Hir` value can be converted back to an
16equivalent regex pattern string, it is unlikely to look like the original due
17to its simplified structure.
18*/
19
20use core::{char, cmp};
21
22use alloc::{
23 boxed::Box,
24 format,
25 string::{String, ToString},
26 vec,
27 vec::Vec,
28};
29
30use crate::{
31 ast::Span,
32 hir::interval::{Interval, IntervalSet, IntervalSetIter},
33 unicode,
34};
35
36pub use crate::{
37 hir::visitor::{visit, Visitor},
38 unicode::CaseFoldError,
39};
40
41mod interval;
42pub mod literal;
43pub mod print;
44pub mod translate;
45mod visitor;
46
47/// An error that can occur while translating an `Ast` to a `Hir`.
48#[derive(Clone, Debug, Eq, PartialEq)]
49pub struct Error {
50 /// The kind of error.
51 kind: ErrorKind,
52 /// The original pattern that the translator's Ast was parsed from. Every
53 /// span in an error is a valid range into this string.
54 pattern: String,
55 /// The span of this error, derived from the Ast given to the translator.
56 span: Span,
57}
58
59impl Error {
60 /// Return the type of this error.
61 pub fn kind(&self) -> &ErrorKind {
62 &self.kind
63 }
64
65 /// The original pattern string in which this error occurred.
66 ///
67 /// Every span reported by this error is reported in terms of this string.
68 pub fn pattern(&self) -> &str {
69 &self.pattern
70 }
71
72 /// Return the span at which this error occurred.
73 pub fn span(&self) -> &Span {
74 &self.span
75 }
76}
77
78/// The type of an error that occurred while building an `Hir`.
79///
80/// This error type is marked as `non_exhaustive`. This means that adding a
81/// new variant is not considered a breaking change.
82#[non_exhaustive]
83#[derive(Clone, Debug, Eq, PartialEq)]
84pub enum ErrorKind {
85 /// This error occurs when a Unicode feature is used when Unicode
86 /// support is disabled. For example `(?-u:\pL)` would trigger this error.
87 UnicodeNotAllowed,
88 /// This error occurs when translating a pattern that could match a byte
89 /// sequence that isn't UTF-8 and `utf8` was enabled.
90 InvalidUtf8,
91 /// This error occurs when one uses a non-ASCII byte for a line terminator,
92 /// but where Unicode mode is enabled and UTF-8 mode is disabled.
93 InvalidLineTerminator,
94 /// This occurs when an unrecognized Unicode property name could not
95 /// be found.
96 UnicodePropertyNotFound,
97 /// This occurs when an unrecognized Unicode property value could not
98 /// be found.
99 UnicodePropertyValueNotFound,
100 /// This occurs when a Unicode-aware Perl character class (`\w`, `\s` or
101 /// `\d`) could not be found. This can occur when the `unicode-perl`
102 /// crate feature is not enabled.
103 UnicodePerlClassNotFound,
104 /// This occurs when the Unicode simple case mapping tables are not
105 /// available, and the regular expression required Unicode aware case
106 /// insensitivity.
107 UnicodeCaseUnavailable,
108}
109
110#[cfg(feature = "std")]
111impl std::error::Error for Error {}
112
113impl core::fmt::Display for Error {
114 fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
115 crate::error::Formatter::from(self).fmt(f)
116 }
117}
118
119impl core::fmt::Display for ErrorKind {
120 fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
121 use self::ErrorKind::*;
122
123 let msg = match *self {
124 UnicodeNotAllowed => "Unicode not allowed here",
125 InvalidUtf8 => "pattern can match invalid UTF-8",
126 InvalidLineTerminator => "invalid line terminator, must be ASCII",
127 UnicodePropertyNotFound => "Unicode property not found",
128 UnicodePropertyValueNotFound => "Unicode property value not found",
129 UnicodePerlClassNotFound => {
130 "Unicode-aware Perl class not found \
131 (make sure the unicode-perl feature is enabled)"
132 }
133 UnicodeCaseUnavailable => {
134 "Unicode-aware case insensitivity matching is not available \
135 (make sure the unicode-case feature is enabled)"
136 }
137 };
138 f.write_str(msg)
139 }
140}
141
142/// A high-level intermediate representation (HIR) for a regular expression.
143///
144/// An HIR value is a combination of a [`HirKind`] and a set of [`Properties`].
145/// An `HirKind` indicates what kind of regular expression it is (a literal,
146/// a repetition, a look-around assertion, etc.), where as a `Properties`
147/// describes various facts about the regular expression. For example, whether
148/// it matches UTF-8 or if it matches the empty string.
149///
150/// The HIR of a regular expression represents an intermediate step between
151/// its abstract syntax (a structured description of the concrete syntax) and
152/// an actual regex matcher. The purpose of HIR is to make regular expressions
153/// easier to analyze. In particular, the AST is much more complex than the
154/// HIR. For example, while an AST supports arbitrarily nested character
155/// classes, the HIR will flatten all nested classes into a single set. The HIR
156/// will also "compile away" every flag present in the concrete syntax. For
157/// example, users of HIR expressions never need to worry about case folding;
158/// it is handled automatically by the translator (e.g., by translating
159/// `(?i:A)` to `[aA]`).
160///
161/// The specific type of an HIR expression can be accessed via its `kind`
162/// or `into_kind` methods. This extra level of indirection exists for two
163/// reasons:
164///
165/// 1. Construction of an HIR expression *must* use the constructor methods on
166/// this `Hir` type instead of building the `HirKind` values directly. This
167/// permits construction to enforce invariants like "concatenations always
168/// consist of two or more sub-expressions."
169/// 2. Every HIR expression contains attributes that are defined inductively,
170/// and can be computed cheaply during the construction process. For example,
171/// one such attribute is whether the expression must match at the beginning of
172/// the haystack.
173///
174/// In particular, if you have an `HirKind` value, then there is intentionally
175/// no way to build an `Hir` value from it. You instead need to do case
176/// analysis on the `HirKind` value and build the `Hir` value using its smart
177/// constructors.
178///
179/// # UTF-8
180///
181/// If the HIR was produced by a translator with
182/// [`TranslatorBuilder::utf8`](translate::TranslatorBuilder::utf8) enabled,
183/// then the HIR is guaranteed to match UTF-8 exclusively for all non-empty
184/// matches.
185///
186/// For empty matches, those can occur at any position. It is the
187/// responsibility of the regex engine to determine whether empty matches are
188/// permitted between the code units of a single codepoint.
189///
190/// # Stack space
191///
192/// This type defines its own destructor that uses constant stack space and
193/// heap space proportional to the size of the HIR.
194///
195/// Also, an `Hir`'s `fmt::Display` implementation prints an HIR as a regular
196/// expression pattern string, and uses constant stack space and heap space
197/// proportional to the size of the `Hir`. The regex it prints is guaranteed to
198/// be _semantically_ equivalent to the original concrete syntax, but it may
199/// look very different. (And potentially not practically readable by a human.)
200///
201/// An `Hir`'s `fmt::Debug` implementation currently does not use constant
202/// stack space. The implementation will also suppress some details (such as
203/// the `Properties` inlined into every `Hir` value to make it less noisy).
204#[derive(Clone, Eq, PartialEq)]
205pub struct Hir {
206 /// The underlying HIR kind.
207 kind: HirKind,
208 /// Analysis info about this HIR, computed during construction.
209 props: Properties,
210}
211
212/// Methods for accessing the underlying `HirKind` and `Properties`.
213impl Hir {
214 /// Returns a reference to the underlying HIR kind.
215 pub fn kind(&self) -> &HirKind {
216 &self.kind
217 }
218
219 /// Consumes ownership of this HIR expression and returns its underlying
220 /// `HirKind`.
221 pub fn into_kind(mut self) -> HirKind {
222 core::mem::replace(&mut self.kind, HirKind::Empty)
223 }
224
225 /// Returns the properties computed for this `Hir`.
226 pub fn properties(&self) -> &Properties {
227 &self.props
228 }
229
230 /// Splits this HIR into its constituent parts.
231 ///
232 /// This is useful because `let Hir { kind, props } = hir;` does not work
233 /// because of `Hir`'s custom `Drop` implementation.
234 fn into_parts(mut self) -> (HirKind, Properties) {
235 (
236 core::mem::replace(&mut self.kind, HirKind::Empty),
237 core::mem::replace(&mut self.props, Properties::empty()),
238 )
239 }
240}
241
242/// Smart constructors for HIR values.
243///
244/// These constructors are called "smart" because they do inductive work or
245/// simplifications. For example, calling `Hir::repetition` with a repetition
246/// like `a{0}` will actually return a `Hir` with a `HirKind::Empty` kind
247/// since it is equivalent to an empty regex. Another example is calling
248/// `Hir::concat(vec![expr])`. Instead of getting a `HirKind::Concat`, you'll
249/// just get back the original `expr` since it's precisely equivalent.
250///
251/// Smart constructors enable maintaining invariants about the HIR data type
252/// while also simulanteously keeping the representation as simple as possible.
253impl Hir {
254 /// Returns an empty HIR expression.
255 ///
256 /// An empty HIR expression always matches, including the empty string.
257 #[inline]
258 pub fn empty() -> Hir {
259 let props = Properties::empty();
260 Hir { kind: HirKind::Empty, props }
261 }
262
263 /// Returns an HIR expression that can never match anything. That is,
264 /// the size of the set of strings in the language described by the HIR
265 /// returned is `0`.
266 ///
267 /// This is distinct from [`Hir::empty`] in that the empty string matches
268 /// the HIR returned by `Hir::empty`. That is, the set of strings in the
269 /// language describe described by `Hir::empty` is non-empty.
270 ///
271 /// Note that currently, the HIR returned uses an empty character class to
272 /// indicate that nothing can match. An equivalent expression that cannot
273 /// match is an empty alternation, but all such "fail" expressions are
274 /// normalized (via smart constructors) to empty character classes. This is
275 /// because empty character classes can be spelled in the concrete syntax
276 /// of a regex (e.g., `\P{any}` or `(?-u:[^\x00-\xFF])` or `[a&&b]`), but
277 /// empty alternations cannot.
278 #[inline]
279 pub fn fail() -> Hir {
280 let class = Class::Bytes(ClassBytes::empty());
281 let props = Properties::class(&class);
282 // We can't just call Hir::class here because it defers to Hir::fail
283 // in order to canonicalize the Hir value used to represent "cannot
284 // match."
285 Hir { kind: HirKind::Class(class), props }
286 }
287
288 /// Creates a literal HIR expression.
289 ///
290 /// This accepts anything that can be converted into a `Box<[u8]>`.
291 ///
292 /// Note that there is no mechanism for storing a `char` or a `Box<str>`
293 /// in an HIR. Everything is "just bytes." Whether a `Literal` (or
294 /// any HIR node) matches valid UTF-8 exclusively can be queried via
295 /// [`Properties::is_utf8`].
296 ///
297 /// # Example
298 ///
299 /// This example shows that concatenations of `Literal` HIR values will
300 /// automatically get flattened and combined together. So for example, even
301 /// if you concat multiple `Literal` values that are themselves not valid
302 /// UTF-8, they might add up to valid UTF-8. This also demonstrates just
303 /// how "smart" Hir's smart constructors are.
304 ///
305 /// ```
306 /// use regex_syntax::hir::{Hir, HirKind, Literal};
307 ///
308 /// let literals = vec![
309 /// Hir::literal([0xE2]),
310 /// Hir::literal([0x98]),
311 /// Hir::literal([0x83]),
312 /// ];
313 /// // Each literal, on its own, is invalid UTF-8.
314 /// assert!(literals.iter().all(|hir| !hir.properties().is_utf8()));
315 ///
316 /// let concat = Hir::concat(literals);
317 /// // But the concatenation is valid UTF-8!
318 /// assert!(concat.properties().is_utf8());
319 ///
320 /// // And also notice that the literals have been concatenated into a
321 /// // single `Literal`, to the point where there is no explicit `Concat`!
322 /// let expected = HirKind::Literal(Literal(Box::from("☃".as_bytes())));
323 /// assert_eq!(&expected, concat.kind());
324 /// ```
325 ///
326 /// # Example: building a literal from a `char`
327 ///
328 /// This example shows how to build a single `Hir` literal from a `char`
329 /// value. Since a [`Literal`] is just bytes, we just need to UTF-8
330 /// encode a `char` value:
331 ///
332 /// ```
333 /// use regex_syntax::hir::{Hir, HirKind, Literal};
334 ///
335 /// let ch = '☃';
336 /// let got = Hir::literal(ch.encode_utf8(&mut [0; 4]).as_bytes());
337 ///
338 /// let expected = HirKind::Literal(Literal(Box::from("☃".as_bytes())));
339 /// assert_eq!(&expected, got.kind());
340 /// ```
341 #[inline]
342 pub fn literal<B: Into<Box<[u8]>>>(lit: B) -> Hir {
343 let bytes = lit.into();
344 if bytes.is_empty() {
345 return Hir::empty();
346 }
347
348 let lit = Literal(bytes);
349 let props = Properties::literal(&lit);
350 Hir { kind: HirKind::Literal(lit), props }
351 }
352
353 /// Creates a class HIR expression. The class may either be defined over
354 /// ranges of Unicode codepoints or ranges of raw byte values.
355 ///
356 /// Note that an empty class is permitted. An empty class is equivalent to
357 /// `Hir::fail()`.
358 #[inline]
359 pub fn class(class: Class) -> Hir {
360 if class.is_empty() {
361 return Hir::fail();
362 } else if let Some(bytes) = class.literal() {
363 return Hir::literal(bytes);
364 }
365 let props = Properties::class(&class);
366 Hir { kind: HirKind::Class(class), props }
367 }
368
369 /// Creates a look-around assertion HIR expression.
370 #[inline]
371 pub fn look(look: Look) -> Hir {
372 let props = Properties::look(look);
373 Hir { kind: HirKind::Look(look), props }
374 }
375
376 /// Creates a repetition HIR expression.
377 #[inline]
378 pub fn repetition(mut rep: Repetition) -> Hir {
379 // If the sub-expression of a repetition can only match the empty
380 // string, then we force its maximum to be at most 1.
381 if rep.sub.properties().maximum_len() == Some(0) {
382 rep.min = cmp::min(rep.min, 1);
383 rep.max = rep.max.map(|n| cmp::min(n, 1)).or(Some(1));
384 }
385 // The regex 'a{0}' is always equivalent to the empty regex. This is
386 // true even when 'a' is an expression that never matches anything
387 // (like '\P{any}').
388 //
389 // Additionally, the regex 'a{1}' is always equivalent to 'a'.
390 if rep.min == 0 && rep.max == Some(0) {
391 return Hir::empty();
392 } else if rep.min == 1 && rep.max == Some(1) {
393 return *rep.sub;
394 }
395 let props = Properties::repetition(&rep);
396 Hir { kind: HirKind::Repetition(rep), props }
397 }
398
399 /// Creates a capture HIR expression.
400 ///
401 /// Note that there is no explicit HIR value for a non-capturing group.
402 /// Since a non-capturing group only exists to override precedence in the
403 /// concrete syntax and since an HIR already does its own grouping based on
404 /// what is parsed, there is no need to explicitly represent non-capturing
405 /// groups in the HIR.
406 #[inline]
407 pub fn capture(capture: Capture) -> Hir {
408 let props = Properties::capture(&capture);
409 Hir { kind: HirKind::Capture(capture), props }
410 }
411
412 /// Returns the concatenation of the given expressions.
413 ///
414 /// This attempts to flatten and simplify the concatenation as appropriate.
415 ///
416 /// # Example
417 ///
418 /// This shows a simple example of basic flattening of both concatenations
419 /// and literals.
420 ///
421 /// ```
422 /// use regex_syntax::hir::Hir;
423 ///
424 /// let hir = Hir::concat(vec![
425 /// Hir::concat(vec![
426 /// Hir::literal([b'a']),
427 /// Hir::literal([b'b']),
428 /// Hir::literal([b'c']),
429 /// ]),
430 /// Hir::concat(vec![
431 /// Hir::literal([b'x']),
432 /// Hir::literal([b'y']),
433 /// Hir::literal([b'z']),
434 /// ]),
435 /// ]);
436 /// let expected = Hir::literal("abcxyz".as_bytes());
437 /// assert_eq!(expected, hir);
438 /// ```
439 pub fn concat(subs: Vec<Hir>) -> Hir {
440 // We rebuild the concatenation by simplifying it. Would be nice to do
441 // it in place, but that seems a little tricky?
442 let mut new = vec![];
443 // This gobbles up any adjacent literals in a concatenation and smushes
444 // them together. Basically, when we see a literal, we add its bytes
445 // to 'prior_lit', and whenever we see anything else, we first take
446 // any bytes in 'prior_lit' and add it to the 'new' concatenation.
447 let mut prior_lit: Option<Vec<u8>> = None;
448 for sub in subs {
449 let (kind, props) = sub.into_parts();
450 match kind {
451 HirKind::Literal(Literal(bytes)) => {
452 if let Some(ref mut prior_bytes) = prior_lit {
453 prior_bytes.extend_from_slice(&bytes);
454 } else {
455 prior_lit = Some(bytes.to_vec());
456 }
457 }
458 // We also flatten concats that are direct children of another
459 // concat. We only need to do this one level deep since
460 // Hir::concat is the only way to build concatenations, and so
461 // flattening happens inductively.
462 HirKind::Concat(subs2) => {
463 for sub2 in subs2 {
464 let (kind2, props2) = sub2.into_parts();
465 match kind2 {
466 HirKind::Literal(Literal(bytes)) => {
467 if let Some(ref mut prior_bytes) = prior_lit {
468 prior_bytes.extend_from_slice(&bytes);
469 } else {
470 prior_lit = Some(bytes.to_vec());
471 }
472 }
473 kind2 => {
474 if let Some(prior_bytes) = prior_lit.take() {
475 new.push(Hir::literal(prior_bytes));
476 }
477 new.push(Hir { kind: kind2, props: props2 });
478 }
479 }
480 }
481 }
482 // We can just skip empty HIRs.
483 HirKind::Empty => {}
484 kind => {
485 if let Some(prior_bytes) = prior_lit.take() {
486 new.push(Hir::literal(prior_bytes));
487 }
488 new.push(Hir { kind, props });
489 }
490 }
491 }
492 if let Some(prior_bytes) = prior_lit.take() {
493 new.push(Hir::literal(prior_bytes));
494 }
495 if new.is_empty() {
496 return Hir::empty();
497 } else if new.len() == 1 {
498 return new.pop().unwrap();
499 }
500 let props = Properties::concat(&new);
501 Hir { kind: HirKind::Concat(new), props }
502 }
503
504 /// Returns the alternation of the given expressions.
505 ///
506 /// This flattens and simplifies the alternation as appropriate. This may
507 /// include factoring out common prefixes or even rewriting the alternation
508 /// as a character class.
509 ///
510 /// Note that an empty alternation is equivalent to `Hir::fail()`. (It
511 /// is not possible for one to write an empty alternation, or even an
512 /// alternation with a single sub-expression, in the concrete syntax of a
513 /// regex.)
514 ///
515 /// # Example
516 ///
517 /// This is a simple example showing how an alternation might get
518 /// simplified.
519 ///
520 /// ```
521 /// use regex_syntax::hir::{Hir, Class, ClassUnicode, ClassUnicodeRange};
522 ///
523 /// let hir = Hir::alternation(vec![
524 /// Hir::literal([b'a']),
525 /// Hir::literal([b'b']),
526 /// Hir::literal([b'c']),
527 /// Hir::literal([b'd']),
528 /// Hir::literal([b'e']),
529 /// Hir::literal([b'f']),
530 /// ]);
531 /// let expected = Hir::class(Class::Unicode(ClassUnicode::new([
532 /// ClassUnicodeRange::new('a', 'f'),
533 /// ])));
534 /// assert_eq!(expected, hir);
535 /// ```
536 ///
537 /// And another example showing how common prefixes might get factored
538 /// out.
539 ///
540 /// ```
541 /// use regex_syntax::hir::{Hir, Class, ClassUnicode, ClassUnicodeRange};
542 ///
543 /// let hir = Hir::alternation(vec![
544 /// Hir::concat(vec![
545 /// Hir::literal("abc".as_bytes()),
546 /// Hir::class(Class::Unicode(ClassUnicode::new([
547 /// ClassUnicodeRange::new('A', 'Z'),
548 /// ]))),
549 /// ]),
550 /// Hir::concat(vec![
551 /// Hir::literal("abc".as_bytes()),
552 /// Hir::class(Class::Unicode(ClassUnicode::new([
553 /// ClassUnicodeRange::new('a', 'z'),
554 /// ]))),
555 /// ]),
556 /// ]);
557 /// let expected = Hir::concat(vec![
558 /// Hir::literal("abc".as_bytes()),
559 /// Hir::alternation(vec![
560 /// Hir::class(Class::Unicode(ClassUnicode::new([
561 /// ClassUnicodeRange::new('A', 'Z'),
562 /// ]))),
563 /// Hir::class(Class::Unicode(ClassUnicode::new([
564 /// ClassUnicodeRange::new('a', 'z'),
565 /// ]))),
566 /// ]),
567 /// ]);
568 /// assert_eq!(expected, hir);
569 /// ```
570 ///
571 /// Note that these sorts of simplifications are not guaranteed.
572 pub fn alternation(subs: Vec<Hir>) -> Hir {
573 // We rebuild the alternation by simplifying it. We proceed similarly
574 // as the concatenation case. But in this case, there's no literal
575 // simplification happening. We're just flattening alternations.
576 let mut new = Vec::with_capacity(subs.len());
577 for sub in subs {
578 let (kind, props) = sub.into_parts();
579 match kind {
580 HirKind::Alternation(subs2) => {
581 new.extend(subs2);
582 }
583 kind => {
584 new.push(Hir { kind, props });
585 }
586 }
587 }
588 if new.is_empty() {
589 return Hir::fail();
590 } else if new.len() == 1 {
591 return new.pop().unwrap();
592 }
593 // Now that it's completely flattened, look for the special case of
594 // 'char1|char2|...|charN' and collapse that into a class. Note that
595 // we look for 'char' first and then bytes. The issue here is that if
596 // we find both non-ASCII codepoints and non-ASCII singleton bytes,
597 // then it isn't actually possible to smush them into a single class.
598 // (Because classes are either "all codepoints" or "all bytes." You
599 // can have a class that both matches non-ASCII but valid UTF-8 and
600 // invalid UTF-8.) So we look for all chars and then all bytes, and
601 // don't handle anything else.
602 if let Some(singletons) = singleton_chars(&new) {
603 let it = singletons
604 .into_iter()
605 .map(|ch| ClassUnicodeRange { start: ch, end: ch });
606 return Hir::class(Class::Unicode(ClassUnicode::new(it)));
607 }
608 if let Some(singletons) = singleton_bytes(&new) {
609 let it = singletons
610 .into_iter()
611 .map(|b| ClassBytesRange { start: b, end: b });
612 return Hir::class(Class::Bytes(ClassBytes::new(it)));
613 }
614 // Similar to singleton chars, we can also look for alternations of
615 // classes. Those can be smushed into a single class.
616 if let Some(cls) = class_chars(&new) {
617 return Hir::class(cls);
618 }
619 if let Some(cls) = class_bytes(&new) {
620 return Hir::class(cls);
621 }
622 // Factor out a common prefix if we can, which might potentially
623 // simplify the expression and unlock other optimizations downstream.
624 // It also might generally make NFA matching and DFA construction
625 // faster by reducing the scope of branching in the regex.
626 new = match lift_common_prefix(new) {
627 Ok(hir) => return hir,
628 Err(unchanged) => unchanged,
629 };
630 let props = Properties::alternation(&new);
631 Hir { kind: HirKind::Alternation(new), props }
632 }
633
634 /// Returns an HIR expression for `.`.
635 ///
636 /// * [`Dot::AnyChar`] maps to `(?su-R:.)`.
637 /// * [`Dot::AnyByte`] maps to `(?s-Ru:.)`.
638 /// * [`Dot::AnyCharExceptLF`] maps to `(?u-Rs:.)`.
639 /// * [`Dot::AnyCharExceptCRLF`] maps to `(?Ru-s:.)`.
640 /// * [`Dot::AnyByteExceptLF`] maps to `(?-Rsu:.)`.
641 /// * [`Dot::AnyByteExceptCRLF`] maps to `(?R-su:.)`.
642 ///
643 /// # Example
644 ///
645 /// Note that this is a convenience routine for constructing the correct
646 /// character class based on the value of `Dot`. There is no explicit "dot"
647 /// HIR value. It is just an abbreviation for a common character class.
648 ///
649 /// ```
650 /// use regex_syntax::hir::{Hir, Dot, Class, ClassBytes, ClassBytesRange};
651 ///
652 /// let hir = Hir::dot(Dot::AnyByte);
653 /// let expected = Hir::class(Class::Bytes(ClassBytes::new([
654 /// ClassBytesRange::new(0x00, 0xFF),
655 /// ])));
656 /// assert_eq!(expected, hir);
657 /// ```
658 #[inline]
659 pub fn dot(dot: Dot) -> Hir {
660 match dot {
661 Dot::AnyChar => Hir::class(Class::Unicode(ClassUnicode::new([
662 ClassUnicodeRange::new('\0', '\u{10FFFF}'),
663 ]))),
664 Dot::AnyByte => Hir::class(Class::Bytes(ClassBytes::new([
665 ClassBytesRange::new(b'\0', b'\xFF'),
666 ]))),
667 Dot::AnyCharExcept(ch) => {
668 let mut cls =
669 ClassUnicode::new([ClassUnicodeRange::new(ch, ch)]);
670 cls.negate();
671 Hir::class(Class::Unicode(cls))
672 }
673 Dot::AnyCharExceptLF => {
674 Hir::class(Class::Unicode(ClassUnicode::new([
675 ClassUnicodeRange::new('\0', '\x09'),
676 ClassUnicodeRange::new('\x0B', '\u{10FFFF}'),
677 ])))
678 }
679 Dot::AnyCharExceptCRLF => {
680 Hir::class(Class::Unicode(ClassUnicode::new([
681 ClassUnicodeRange::new('\0', '\x09'),
682 ClassUnicodeRange::new('\x0B', '\x0C'),
683 ClassUnicodeRange::new('\x0E', '\u{10FFFF}'),
684 ])))
685 }
686 Dot::AnyByteExcept(byte) => {
687 let mut cls =
688 ClassBytes::new([ClassBytesRange::new(byte, byte)]);
689 cls.negate();
690 Hir::class(Class::Bytes(cls))
691 }
692 Dot::AnyByteExceptLF => {
693 Hir::class(Class::Bytes(ClassBytes::new([
694 ClassBytesRange::new(b'\0', b'\x09'),
695 ClassBytesRange::new(b'\x0B', b'\xFF'),
696 ])))
697 }
698 Dot::AnyByteExceptCRLF => {
699 Hir::class(Class::Bytes(ClassBytes::new([
700 ClassBytesRange::new(b'\0', b'\x09'),
701 ClassBytesRange::new(b'\x0B', b'\x0C'),
702 ClassBytesRange::new(b'\x0E', b'\xFF'),
703 ])))
704 }
705 }
706 }
707}
708
709/// The underlying kind of an arbitrary [`Hir`] expression.
710///
711/// An `HirKind` is principally useful for doing case analysis on the type
712/// of a regular expression. If you're looking to build new `Hir` values,
713/// then you _must_ use the smart constructors defined on `Hir`, like
714/// [`Hir::repetition`], to build new `Hir` values. The API intentionally does
715/// not expose any way of building an `Hir` directly from an `HirKind`.
716#[derive(Clone, Debug, Eq, PartialEq)]
717pub enum HirKind {
718 /// The empty regular expression, which matches everything, including the
719 /// empty string.
720 Empty,
721 /// A literalstring that matches exactly these bytes.
722 Literal(Literal),
723 /// A single character class that matches any of the characters in the
724 /// class. A class can either consist of Unicode scalar values as
725 /// characters, or it can use bytes.
726 ///
727 /// A class may be empty. In which case, it matches nothing.
728 Class(Class),
729 /// A look-around assertion. A look-around match always has zero length.
730 Look(Look),
731 /// A repetition operation applied to a sub-expression.
732 Repetition(Repetition),
733 /// A capturing group, which contains a sub-expression.
734 Capture(Capture),
735 /// A concatenation of expressions.
736 ///
737 /// A concatenation matches only if each of its sub-expressions match one
738 /// after the other.
739 ///
740 /// Concatenations are guaranteed by `Hir`'s smart constructors to always
741 /// have at least two sub-expressions.
742 Concat(Vec<Hir>),
743 /// An alternation of expressions.
744 ///
745 /// An alternation matches only if at least one of its sub-expressions
746 /// match. If multiple sub-expressions match, then the leftmost is
747 /// preferred.
748 ///
749 /// Alternations are guaranteed by `Hir`'s smart constructors to always
750 /// have at least two sub-expressions.
751 Alternation(Vec<Hir>),
752}
753
754impl HirKind {
755 /// Returns a slice of this kind's sub-expressions, if any.
756 pub fn subs(&self) -> &[Hir] {
757 use core::slice::from_ref;
758
759 match *self {
760 HirKind::Empty
761 | HirKind::Literal(_)
762 | HirKind::Class(_)
763 | HirKind::Look(_) => &[],
764 HirKind::Repetition(Repetition { ref sub, .. }) => from_ref(sub),
765 HirKind::Capture(Capture { ref sub, .. }) => from_ref(sub),
766 HirKind::Concat(ref subs) => subs,
767 HirKind::Alternation(ref subs) => subs,
768 }
769 }
770}
771
772impl core::fmt::Debug for Hir {
773 fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
774 self.kind.fmt(f)
775 }
776}
777
778/// Print a display representation of this Hir.
779///
780/// The result of this is a valid regular expression pattern string.
781///
782/// This implementation uses constant stack space and heap space proportional
783/// to the size of the `Hir`.
784impl core::fmt::Display for Hir {
785 fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
786 crate::hir::print::Printer::new().print(self, f)
787 }
788}
789
790/// The high-level intermediate representation of a literal.
791///
792/// A literal corresponds to `0` or more bytes that should be matched
793/// literally. The smart constructors defined on `Hir` will automatically
794/// concatenate adjacent literals into one literal, and will even automatically
795/// replace empty literals with `Hir::empty()`.
796///
797/// Note that despite a literal being represented by a sequence of bytes, its
798/// `Debug` implementation will attempt to print it as a normal string. (That
799/// is, not a sequence of decimal numbers.)
800#[derive(Clone, Eq, PartialEq)]
801pub struct Literal(pub Box<[u8]>);
802
803impl core::fmt::Debug for Literal {
804 fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
805 crate::debug::Bytes(&self.0).fmt(f)
806 }
807}
808
809/// The high-level intermediate representation of a character class.
810///
811/// A character class corresponds to a set of characters. A character is either
812/// defined by a Unicode scalar value or a byte.
813///
814/// A character class, regardless of its character type, is represented by a
815/// sequence of non-overlapping non-adjacent ranges of characters.
816///
817/// There are no guarantees about which class variant is used. Generally
818/// speaking, the Unicode variat is used whenever a class needs to contain
819/// non-ASCII Unicode scalar values. But the Unicode variant can be used even
820/// when Unicode mode is disabled. For example, at the time of writing, the
821/// regex `(?-u:a|\xc2\xa0)` will compile down to HIR for the Unicode class
822/// `[a\u00A0]` due to optimizations.
823///
824/// Note that `Bytes` variant may be produced even when it exclusively matches
825/// valid UTF-8. This is because a `Bytes` variant represents an intention by
826/// the author of the regular expression to disable Unicode mode, which in turn
827/// impacts the semantics of case insensitive matching. For example, `(?i)k`
828/// and `(?i-u)k` will not match the same set of strings.
829#[derive(Clone, Eq, PartialEq)]
830pub enum Class {
831 /// A set of characters represented by Unicode scalar values.
832 Unicode(ClassUnicode),
833 /// A set of characters represented by arbitrary bytes (one byte per
834 /// character).
835 Bytes(ClassBytes),
836}
837
838impl Class {
839 /// Apply Unicode simple case folding to this character class, in place.
840 /// The character class will be expanded to include all simple case folded
841 /// character variants.
842 ///
843 /// If this is a byte oriented character class, then this will be limited
844 /// to the ASCII ranges `A-Z` and `a-z`.
845 ///
846 /// # Panics
847 ///
848 /// This routine panics when the case mapping data necessary for this
849 /// routine to complete is unavailable. This occurs when the `unicode-case`
850 /// feature is not enabled and the underlying class is Unicode oriented.
851 ///
852 /// Callers should prefer using `try_case_fold_simple` instead, which will
853 /// return an error instead of panicking.
854 pub fn case_fold_simple(&mut self) {
855 match *self {
856 Class::Unicode(ref mut x) => x.case_fold_simple(),
857 Class::Bytes(ref mut x) => x.case_fold_simple(),
858 }
859 }
860
861 /// Apply Unicode simple case folding to this character class, in place.
862 /// The character class will be expanded to include all simple case folded
863 /// character variants.
864 ///
865 /// If this is a byte oriented character class, then this will be limited
866 /// to the ASCII ranges `A-Z` and `a-z`.
867 ///
868 /// # Error
869 ///
870 /// This routine returns an error when the case mapping data necessary
871 /// for this routine to complete is unavailable. This occurs when the
872 /// `unicode-case` feature is not enabled and the underlying class is
873 /// Unicode oriented.
874 pub fn try_case_fold_simple(
875 &mut self,
876 ) -> core::result::Result<(), CaseFoldError> {
877 match *self {
878 Class::Unicode(ref mut x) => x.try_case_fold_simple()?,
879 Class::Bytes(ref mut x) => x.case_fold_simple(),
880 }
881 Ok(())
882 }
883
884 /// Negate this character class in place.
885 ///
886 /// After completion, this character class will contain precisely the
887 /// characters that weren't previously in the class.
888 pub fn negate(&mut self) {
889 match *self {
890 Class::Unicode(ref mut x) => x.negate(),
891 Class::Bytes(ref mut x) => x.negate(),
892 }
893 }
894
895 /// Returns true if and only if this character class will only ever match
896 /// valid UTF-8.
897 ///
898 /// A character class can match invalid UTF-8 only when the following
899 /// conditions are met:
900 ///
901 /// 1. The translator was configured to permit generating an expression
902 /// that can match invalid UTF-8. (By default, this is disabled.)
903 /// 2. Unicode mode (via the `u` flag) was disabled either in the concrete
904 /// syntax or in the parser builder. By default, Unicode mode is
905 /// enabled.
906 pub fn is_utf8(&self) -> bool {
907 match *self {
908 Class::Unicode(_) => true,
909 Class::Bytes(ref x) => x.is_ascii(),
910 }
911 }
912
913 /// Returns the length, in bytes, of the smallest string matched by this
914 /// character class.
915 ///
916 /// For non-empty byte oriented classes, this always returns `1`. For
917 /// non-empty Unicode oriented classes, this can return `1`, `2`, `3` or
918 /// `4`. For empty classes, `None` is returned. It is impossible for `0` to
919 /// be returned.
920 ///
921 /// # Example
922 ///
923 /// This example shows some examples of regexes and their corresponding
924 /// minimum length, if any.
925 ///
926 /// ```
927 /// use regex_syntax::{hir::Properties, parse};
928 ///
929 /// // The empty string has a min length of 0.
930 /// let hir = parse(r"")?;
931 /// assert_eq!(Some(0), hir.properties().minimum_len());
932 /// // As do other types of regexes that only match the empty string.
933 /// let hir = parse(r"^$\b\B")?;
934 /// assert_eq!(Some(0), hir.properties().minimum_len());
935 /// // A regex that can match the empty string but match more is still 0.
936 /// let hir = parse(r"a*")?;
937 /// assert_eq!(Some(0), hir.properties().minimum_len());
938 /// // A regex that matches nothing has no minimum defined.
939 /// let hir = parse(r"[a&&b]")?;
940 /// assert_eq!(None, hir.properties().minimum_len());
941 /// // Character classes usually have a minimum length of 1.
942 /// let hir = parse(r"\w")?;
943 /// assert_eq!(Some(1), hir.properties().minimum_len());
944 /// // But sometimes Unicode classes might be bigger!
945 /// let hir = parse(r"\p{Cyrillic}")?;
946 /// assert_eq!(Some(2), hir.properties().minimum_len());
947 ///
948 /// # Ok::<(), Box<dyn std::error::Error>>(())
949 /// ```
950 pub fn minimum_len(&self) -> Option<usize> {
951 match *self {
952 Class::Unicode(ref x) => x.minimum_len(),
953 Class::Bytes(ref x) => x.minimum_len(),
954 }
955 }
956
957 /// Returns the length, in bytes, of the longest string matched by this
958 /// character class.
959 ///
960 /// For non-empty byte oriented classes, this always returns `1`. For
961 /// non-empty Unicode oriented classes, this can return `1`, `2`, `3` or
962 /// `4`. For empty classes, `None` is returned. It is impossible for `0` to
963 /// be returned.
964 ///
965 /// # Example
966 ///
967 /// This example shows some examples of regexes and their corresponding
968 /// maximum length, if any.
969 ///
970 /// ```
971 /// use regex_syntax::{hir::Properties, parse};
972 ///
973 /// // The empty string has a max length of 0.
974 /// let hir = parse(r"")?;
975 /// assert_eq!(Some(0), hir.properties().maximum_len());
976 /// // As do other types of regexes that only match the empty string.
977 /// let hir = parse(r"^$\b\B")?;
978 /// assert_eq!(Some(0), hir.properties().maximum_len());
979 /// // A regex that matches nothing has no maximum defined.
980 /// let hir = parse(r"[a&&b]")?;
981 /// assert_eq!(None, hir.properties().maximum_len());
982 /// // Bounded repeats work as you expect.
983 /// let hir = parse(r"x{2,10}")?;
984 /// assert_eq!(Some(10), hir.properties().maximum_len());
985 /// // An unbounded repeat means there is no maximum.
986 /// let hir = parse(r"x{2,}")?;
987 /// assert_eq!(None, hir.properties().maximum_len());
988 /// // With Unicode enabled, \w can match up to 4 bytes!
989 /// let hir = parse(r"\w")?;
990 /// assert_eq!(Some(4), hir.properties().maximum_len());
991 /// // Without Unicode enabled, \w matches at most 1 byte.
992 /// let hir = parse(r"(?-u)\w")?;
993 /// assert_eq!(Some(1), hir.properties().maximum_len());
994 ///
995 /// # Ok::<(), Box<dyn std::error::Error>>(())
996 /// ```
997 pub fn maximum_len(&self) -> Option<usize> {
998 match *self {
999 Class::Unicode(ref x) => x.maximum_len(),
1000 Class::Bytes(ref x) => x.maximum_len(),
1001 }
1002 }
1003
1004 /// Returns true if and only if this character class is empty. That is,
1005 /// it has no elements.
1006 ///
1007 /// An empty character can never match anything, including an empty string.
1008 pub fn is_empty(&self) -> bool {
1009 match *self {
1010 Class::Unicode(ref x) => x.ranges().is_empty(),
1011 Class::Bytes(ref x) => x.ranges().is_empty(),
1012 }
1013 }
1014
1015 /// If this class consists of exactly one element (whether a codepoint or a
1016 /// byte), then return it as a literal byte string.
1017 ///
1018 /// If this class is empty or contains more than one element, then `None`
1019 /// is returned.
1020 pub fn literal(&self) -> Option<Vec<u8>> {
1021 match *self {
1022 Class::Unicode(ref x) => x.literal(),
1023 Class::Bytes(ref x) => x.literal(),
1024 }
1025 }
1026}
1027
1028impl core::fmt::Debug for Class {
1029 fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
1030 use crate::debug::Byte;
1031
1032 let mut fmter = f.debug_set();
1033 match *self {
1034 Class::Unicode(ref cls) => {
1035 for r in cls.ranges().iter() {
1036 fmter.entry(&(r.start..=r.end));
1037 }
1038 }
1039 Class::Bytes(ref cls) => {
1040 for r in cls.ranges().iter() {
1041 fmter.entry(&(Byte(r.start)..=Byte(r.end)));
1042 }
1043 }
1044 }
1045 fmter.finish()
1046 }
1047}
1048
1049/// A set of characters represented by Unicode scalar values.
1050#[derive(Clone, Debug, Eq, PartialEq)]
1051pub struct ClassUnicode {
1052 set: IntervalSet<ClassUnicodeRange>,
1053}
1054
1055impl ClassUnicode {
1056 /// Create a new class from a sequence of ranges.
1057 ///
1058 /// The given ranges do not need to be in any specific order, and ranges
1059 /// may overlap. Ranges will automatically be sorted into a canonical
1060 /// non-overlapping order.
1061 pub fn new<I>(ranges: I) -> ClassUnicode
1062 where
1063 I: IntoIterator<Item = ClassUnicodeRange>,
1064 {
1065 ClassUnicode { set: IntervalSet::new(ranges) }
1066 }
1067
1068 /// Create a new class with no ranges.
1069 ///
1070 /// An empty class matches nothing. That is, it is equivalent to
1071 /// [`Hir::fail`].
1072 pub fn empty() -> ClassUnicode {
1073 ClassUnicode::new(vec![])
1074 }
1075
1076 /// Add a new range to this set.
1077 pub fn push(&mut self, range: ClassUnicodeRange) {
1078 self.set.push(range);
1079 }
1080
1081 /// Return an iterator over all ranges in this class.
1082 ///
1083 /// The iterator yields ranges in ascending order.
1084 pub fn iter(&self) -> ClassUnicodeIter<'_> {
1085 ClassUnicodeIter(self.set.iter())
1086 }
1087
1088 /// Return the underlying ranges as a slice.
1089 pub fn ranges(&self) -> &[ClassUnicodeRange] {
1090 self.set.intervals()
1091 }
1092
1093 /// Expand this character class such that it contains all case folded
1094 /// characters, according to Unicode's "simple" mapping. For example, if
1095 /// this class consists of the range `a-z`, then applying case folding will
1096 /// result in the class containing both the ranges `a-z` and `A-Z`.
1097 ///
1098 /// # Panics
1099 ///
1100 /// This routine panics when the case mapping data necessary for this
1101 /// routine to complete is unavailable. This occurs when the `unicode-case`
1102 /// feature is not enabled.
1103 ///
1104 /// Callers should prefer using `try_case_fold_simple` instead, which will
1105 /// return an error instead of panicking.
1106 pub fn case_fold_simple(&mut self) {
1107 self.set
1108 .case_fold_simple()
1109 .expect("unicode-case feature must be enabled");
1110 }
1111
1112 /// Expand this character class such that it contains all case folded
1113 /// characters, according to Unicode's "simple" mapping. For example, if
1114 /// this class consists of the range `a-z`, then applying case folding will
1115 /// result in the class containing both the ranges `a-z` and `A-Z`.
1116 ///
1117 /// # Error
1118 ///
1119 /// This routine returns an error when the case mapping data necessary
1120 /// for this routine to complete is unavailable. This occurs when the
1121 /// `unicode-case` feature is not enabled.
1122 pub fn try_case_fold_simple(
1123 &mut self,
1124 ) -> core::result::Result<(), CaseFoldError> {
1125 self.set.case_fold_simple()
1126 }
1127
1128 /// Negate this character class.
1129 ///
1130 /// For all `c` where `c` is a Unicode scalar value, if `c` was in this
1131 /// set, then it will not be in this set after negation.
1132 pub fn negate(&mut self) {
1133 self.set.negate();
1134 }
1135
1136 /// Union this character class with the given character class, in place.
1137 pub fn union(&mut self, other: &ClassUnicode) {
1138 self.set.union(&other.set);
1139 }
1140
1141 /// Intersect this character class with the given character class, in
1142 /// place.
1143 pub fn intersect(&mut self, other: &ClassUnicode) {
1144 self.set.intersect(&other.set);
1145 }
1146
1147 /// Subtract the given character class from this character class, in place.
1148 pub fn difference(&mut self, other: &ClassUnicode) {
1149 self.set.difference(&other.set);
1150 }
1151
1152 /// Compute the symmetric difference of the given character classes, in
1153 /// place.
1154 ///
1155 /// This computes the symmetric difference of two character classes. This
1156 /// removes all elements in this class that are also in the given class,
1157 /// but all adds all elements from the given class that aren't in this
1158 /// class. That is, the class will contain all elements in either class,
1159 /// but will not contain any elements that are in both classes.
1160 pub fn symmetric_difference(&mut self, other: &ClassUnicode) {
1161 self.set.symmetric_difference(&other.set);
1162 }
1163
1164 /// Returns true if and only if this character class will either match
1165 /// nothing or only ASCII bytes. Stated differently, this returns false
1166 /// if and only if this class contains a non-ASCII codepoint.
1167 pub fn is_ascii(&self) -> bool {
1168 self.set.intervals().last().map_or(true, |r| r.end <= '\x7F')
1169 }
1170
1171 /// Returns the length, in bytes, of the smallest string matched by this
1172 /// character class.
1173 ///
1174 /// Returns `None` when the class is empty.
1175 pub fn minimum_len(&self) -> Option<usize> {
1176 let first = self.ranges().get(0)?;
1177 // Correct because c1 < c2 implies c1.len_utf8() < c2.len_utf8().
1178 Some(first.start.len_utf8())
1179 }
1180
1181 /// Returns the length, in bytes, of the longest string matched by this
1182 /// character class.
1183 ///
1184 /// Returns `None` when the class is empty.
1185 pub fn maximum_len(&self) -> Option<usize> {
1186 let last = self.ranges().last()?;
1187 // Correct because c1 < c2 implies c1.len_utf8() < c2.len_utf8().
1188 Some(last.end.len_utf8())
1189 }
1190
1191 /// If this class consists of exactly one codepoint, then return it as
1192 /// a literal byte string.
1193 ///
1194 /// If this class is empty or contains more than one codepoint, then `None`
1195 /// is returned.
1196 pub fn literal(&self) -> Option<Vec<u8>> {
1197 let rs = self.ranges();
1198 if rs.len() == 1 && rs[0].start == rs[0].end {
1199 Some(rs[0].start.encode_utf8(&mut [0; 4]).to_string().into_bytes())
1200 } else {
1201 None
1202 }
1203 }
1204
1205 /// If this class consists of only ASCII ranges, then return its
1206 /// corresponding and equivalent byte class.
1207 pub fn to_byte_class(&self) -> Option<ClassBytes> {
1208 if !self.is_ascii() {
1209 return None;
1210 }
1211 Some(ClassBytes::new(self.ranges().iter().map(|r| {
1212 // Since we are guaranteed that our codepoint range is ASCII, the
1213 // 'u8::try_from' calls below are guaranteed to be correct.
1214 ClassBytesRange {
1215 start: u8::try_from(r.start).unwrap(),
1216 end: u8::try_from(r.end).unwrap(),
1217 }
1218 })))
1219 }
1220}
1221
1222/// An iterator over all ranges in a Unicode character class.
1223///
1224/// The lifetime `'a` refers to the lifetime of the underlying class.
1225#[derive(Debug)]
1226pub struct ClassUnicodeIter<'a>(IntervalSetIter<'a, ClassUnicodeRange>);
1227
1228impl<'a> Iterator for ClassUnicodeIter<'a> {
1229 type Item = &'a ClassUnicodeRange;
1230
1231 fn next(&mut self) -> Option<&'a ClassUnicodeRange> {
1232 self.0.next()
1233 }
1234}
1235
1236/// A single range of characters represented by Unicode scalar values.
1237///
1238/// The range is closed. That is, the start and end of the range are included
1239/// in the range.
1240#[derive(Clone, Copy, Default, Eq, PartialEq, PartialOrd, Ord)]
1241pub struct ClassUnicodeRange {
1242 start: char,
1243 end: char,
1244}
1245
1246impl core::fmt::Debug for ClassUnicodeRange {
1247 fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
1248 let start = if !self.start.is_whitespace() && !self.start.is_control()
1249 {
1250 self.start.to_string()
1251 } else {
1252 format!("0x{:X}", u32::from(self.start))
1253 };
1254 let end = if !self.end.is_whitespace() && !self.end.is_control() {
1255 self.end.to_string()
1256 } else {
1257 format!("0x{:X}", u32::from(self.end))
1258 };
1259 f.debug_struct("ClassUnicodeRange")
1260 .field("start", &start)
1261 .field("end", &end)
1262 .finish()
1263 }
1264}
1265
1266impl Interval for ClassUnicodeRange {
1267 type Bound = char;
1268
1269 #[inline]
1270 fn lower(&self) -> char {
1271 self.start
1272 }
1273 #[inline]
1274 fn upper(&self) -> char {
1275 self.end
1276 }
1277 #[inline]
1278 fn set_lower(&mut self, bound: char) {
1279 self.start = bound;
1280 }
1281 #[inline]
1282 fn set_upper(&mut self, bound: char) {
1283 self.end = bound;
1284 }
1285
1286 /// Apply simple case folding to this Unicode scalar value range.
1287 ///
1288 /// Additional ranges are appended to the given vector. Canonical ordering
1289 /// is *not* maintained in the given vector.
1290 fn case_fold_simple(
1291 &self,
1292 ranges: &mut Vec<ClassUnicodeRange>,
1293 ) -> Result<(), unicode::CaseFoldError> {
1294 let mut folder = unicode::SimpleCaseFolder::new()?;
1295 if !folder.overlaps(self.start, self.end) {
1296 return Ok(());
1297 }
1298 let (start, end) = (u32::from(self.start), u32::from(self.end));
1299 for cp in (start..=end).filter_map(char::from_u32) {
1300 for &cp_folded in folder.mapping(cp) {
1301 ranges.push(ClassUnicodeRange::new(cp_folded, cp_folded));
1302 }
1303 }
1304 Ok(())
1305 }
1306}
1307
1308impl ClassUnicodeRange {
1309 /// Create a new Unicode scalar value range for a character class.
1310 ///
1311 /// The returned range is always in a canonical form. That is, the range
1312 /// returned always satisfies the invariant that `start <= end`.
1313 pub fn new(start: char, end: char) -> ClassUnicodeRange {
1314 ClassUnicodeRange::create(start, end)
1315 }
1316
1317 /// Return the start of this range.
1318 ///
1319 /// The start of a range is always less than or equal to the end of the
1320 /// range.
1321 pub fn start(&self) -> char {
1322 self.start
1323 }
1324
1325 /// Return the end of this range.
1326 ///
1327 /// The end of a range is always greater than or equal to the start of the
1328 /// range.
1329 pub fn end(&self) -> char {
1330 self.end
1331 }
1332
1333 /// Returns the number of codepoints in this range.
1334 pub fn len(&self) -> usize {
1335 let diff = 1 + u32::from(self.end) - u32::from(self.start);
1336 // This is likely to panic in 16-bit targets since a usize can only fit
1337 // 2^16. It's not clear what to do here, other than to return an error
1338 // when building a Unicode class that contains a range whose length
1339 // overflows usize. (Which, to be honest, is probably quite common on
1340 // 16-bit targets. For example, this would imply that '.' and '\p{any}'
1341 // would be impossible to build.)
1342 usize::try_from(diff).expect("char class len fits in usize")
1343 }
1344}
1345
1346/// A set of characters represented by arbitrary bytes.
1347///
1348/// Each byte corresponds to one character.
1349#[derive(Clone, Debug, Eq, PartialEq)]
1350pub struct ClassBytes {
1351 set: IntervalSet<ClassBytesRange>,
1352}
1353
1354impl ClassBytes {
1355 /// Create a new class from a sequence of ranges.
1356 ///
1357 /// The given ranges do not need to be in any specific order, and ranges
1358 /// may overlap. Ranges will automatically be sorted into a canonical
1359 /// non-overlapping order.
1360 pub fn new<I>(ranges: I) -> ClassBytes
1361 where
1362 I: IntoIterator<Item = ClassBytesRange>,
1363 {
1364 ClassBytes { set: IntervalSet::new(ranges) }
1365 }
1366
1367 /// Create a new class with no ranges.
1368 ///
1369 /// An empty class matches nothing. That is, it is equivalent to
1370 /// [`Hir::fail`].
1371 pub fn empty() -> ClassBytes {
1372 ClassBytes::new(vec![])
1373 }
1374
1375 /// Add a new range to this set.
1376 pub fn push(&mut self, range: ClassBytesRange) {
1377 self.set.push(range);
1378 }
1379
1380 /// Return an iterator over all ranges in this class.
1381 ///
1382 /// The iterator yields ranges in ascending order.
1383 pub fn iter(&self) -> ClassBytesIter<'_> {
1384 ClassBytesIter(self.set.iter())
1385 }
1386
1387 /// Return the underlying ranges as a slice.
1388 pub fn ranges(&self) -> &[ClassBytesRange] {
1389 self.set.intervals()
1390 }
1391
1392 /// Expand this character class such that it contains all case folded
1393 /// characters. For example, if this class consists of the range `a-z`,
1394 /// then applying case folding will result in the class containing both the
1395 /// ranges `a-z` and `A-Z`.
1396 ///
1397 /// Note that this only applies ASCII case folding, which is limited to the
1398 /// characters `a-z` and `A-Z`.
1399 pub fn case_fold_simple(&mut self) {
1400 self.set.case_fold_simple().expect("ASCII case folding never fails");
1401 }
1402
1403 /// Negate this byte class.
1404 ///
1405 /// For all `b` where `b` is a any byte, if `b` was in this set, then it
1406 /// will not be in this set after negation.
1407 pub fn negate(&mut self) {
1408 self.set.negate();
1409 }
1410
1411 /// Union this byte class with the given byte class, in place.
1412 pub fn union(&mut self, other: &ClassBytes) {
1413 self.set.union(&other.set);
1414 }
1415
1416 /// Intersect this byte class with the given byte class, in place.
1417 pub fn intersect(&mut self, other: &ClassBytes) {
1418 self.set.intersect(&other.set);
1419 }
1420
1421 /// Subtract the given byte class from this byte class, in place.
1422 pub fn difference(&mut self, other: &ClassBytes) {
1423 self.set.difference(&other.set);
1424 }
1425
1426 /// Compute the symmetric difference of the given byte classes, in place.
1427 ///
1428 /// This computes the symmetric difference of two byte classes. This
1429 /// removes all elements in this class that are also in the given class,
1430 /// but all adds all elements from the given class that aren't in this
1431 /// class. That is, the class will contain all elements in either class,
1432 /// but will not contain any elements that are in both classes.
1433 pub fn symmetric_difference(&mut self, other: &ClassBytes) {
1434 self.set.symmetric_difference(&other.set);
1435 }
1436
1437 /// Returns true if and only if this character class will either match
1438 /// nothing or only ASCII bytes. Stated differently, this returns false
1439 /// if and only if this class contains a non-ASCII byte.
1440 pub fn is_ascii(&self) -> bool {
1441 self.set.intervals().last().map_or(true, |r| r.end <= 0x7F)
1442 }
1443
1444 /// Returns the length, in bytes, of the smallest string matched by this
1445 /// character class.
1446 ///
1447 /// Returns `None` when the class is empty.
1448 pub fn minimum_len(&self) -> Option<usize> {
1449 if self.ranges().is_empty() {
1450 None
1451 } else {
1452 Some(1)
1453 }
1454 }
1455
1456 /// Returns the length, in bytes, of the longest string matched by this
1457 /// character class.
1458 ///
1459 /// Returns `None` when the class is empty.
1460 pub fn maximum_len(&self) -> Option<usize> {
1461 if self.ranges().is_empty() {
1462 None
1463 } else {
1464 Some(1)
1465 }
1466 }
1467
1468 /// If this class consists of exactly one byte, then return it as
1469 /// a literal byte string.
1470 ///
1471 /// If this class is empty or contains more than one byte, then `None`
1472 /// is returned.
1473 pub fn literal(&self) -> Option<Vec<u8>> {
1474 let rs = self.ranges();
1475 if rs.len() == 1 && rs[0].start == rs[0].end {
1476 Some(vec![rs[0].start])
1477 } else {
1478 None
1479 }
1480 }
1481
1482 /// If this class consists of only ASCII ranges, then return its
1483 /// corresponding and equivalent Unicode class.
1484 pub fn to_unicode_class(&self) -> Option<ClassUnicode> {
1485 if !self.is_ascii() {
1486 return None;
1487 }
1488 Some(ClassUnicode::new(self.ranges().iter().map(|r| {
1489 // Since we are guaranteed that our byte range is ASCII, the
1490 // 'char::from' calls below are correct and will not erroneously
1491 // convert a raw byte value into its corresponding codepoint.
1492 ClassUnicodeRange {
1493 start: char::from(r.start),
1494 end: char::from(r.end),
1495 }
1496 })))
1497 }
1498}
1499
1500/// An iterator over all ranges in a byte character class.
1501///
1502/// The lifetime `'a` refers to the lifetime of the underlying class.
1503#[derive(Debug)]
1504pub struct ClassBytesIter<'a>(IntervalSetIter<'a, ClassBytesRange>);
1505
1506impl<'a> Iterator for ClassBytesIter<'a> {
1507 type Item = &'a ClassBytesRange;
1508
1509 fn next(&mut self) -> Option<&'a ClassBytesRange> {
1510 self.0.next()
1511 }
1512}
1513
1514/// A single range of characters represented by arbitrary bytes.
1515///
1516/// The range is closed. That is, the start and end of the range are included
1517/// in the range.
1518#[derive(Clone, Copy, Default, Eq, PartialEq, PartialOrd, Ord)]
1519pub struct ClassBytesRange {
1520 start: u8,
1521 end: u8,
1522}
1523
1524impl Interval for ClassBytesRange {
1525 type Bound = u8;
1526
1527 #[inline]
1528 fn lower(&self) -> u8 {
1529 self.start
1530 }
1531 #[inline]
1532 fn upper(&self) -> u8 {
1533 self.end
1534 }
1535 #[inline]
1536 fn set_lower(&mut self, bound: u8) {
1537 self.start = bound;
1538 }
1539 #[inline]
1540 fn set_upper(&mut self, bound: u8) {
1541 self.end = bound;
1542 }
1543
1544 /// Apply simple case folding to this byte range. Only ASCII case mappings
1545 /// (for a-z) are applied.
1546 ///
1547 /// Additional ranges are appended to the given vector. Canonical ordering
1548 /// is *not* maintained in the given vector.
1549 fn case_fold_simple(
1550 &self,
1551 ranges: &mut Vec<ClassBytesRange>,
1552 ) -> Result<(), unicode::CaseFoldError> {
1553 if !ClassBytesRange::new(b'a', b'z').is_intersection_empty(self) {
1554 let lower = cmp::max(self.start, b'a');
1555 let upper = cmp::min(self.end, b'z');
1556 ranges.push(ClassBytesRange::new(lower - 32, upper - 32));
1557 }
1558 if !ClassBytesRange::new(b'A', b'Z').is_intersection_empty(self) {
1559 let lower = cmp::max(self.start, b'A');
1560 let upper = cmp::min(self.end, b'Z');
1561 ranges.push(ClassBytesRange::new(lower + 32, upper + 32));
1562 }
1563 Ok(())
1564 }
1565}
1566
1567impl ClassBytesRange {
1568 /// Create a new byte range for a character class.
1569 ///
1570 /// The returned range is always in a canonical form. That is, the range
1571 /// returned always satisfies the invariant that `start <= end`.
1572 pub fn new(start: u8, end: u8) -> ClassBytesRange {
1573 ClassBytesRange::create(start, end)
1574 }
1575
1576 /// Return the start of this range.
1577 ///
1578 /// The start of a range is always less than or equal to the end of the
1579 /// range.
1580 pub fn start(&self) -> u8 {
1581 self.start
1582 }
1583
1584 /// Return the end of this range.
1585 ///
1586 /// The end of a range is always greater than or equal to the start of the
1587 /// range.
1588 pub fn end(&self) -> u8 {
1589 self.end
1590 }
1591
1592 /// Returns the number of bytes in this range.
1593 pub fn len(&self) -> usize {
1594 usize::from(self.end.checked_sub(self.start).unwrap())
1595 .checked_add(1)
1596 .unwrap()
1597 }
1598}
1599
1600impl core::fmt::Debug for ClassBytesRange {
1601 fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
1602 f.debug_struct("ClassBytesRange")
1603 .field("start", &crate::debug::Byte(self.start))
1604 .field("end", &crate::debug::Byte(self.end))
1605 .finish()
1606 }
1607}
1608
1609/// The high-level intermediate representation for a look-around assertion.
1610///
1611/// An assertion match is always zero-length. Also called an "empty match."
1612#[derive(Clone, Copy, Debug, Eq, PartialEq)]
1613pub enum Look {
1614 /// Match the beginning of text. Specifically, this matches at the starting
1615 /// position of the input.
1616 Start = 1 << 0,
1617 /// Match the end of text. Specifically, this matches at the ending
1618 /// position of the input.
1619 End = 1 << 1,
1620 /// Match the beginning of a line or the beginning of text. Specifically,
1621 /// this matches at the starting position of the input, or at the position
1622 /// immediately following a `\n` character.
1623 StartLF = 1 << 2,
1624 /// Match the end of a line or the end of text. Specifically, this matches
1625 /// at the end position of the input, or at the position immediately
1626 /// preceding a `\n` character.
1627 EndLF = 1 << 3,
1628 /// Match the beginning of a line or the beginning of text. Specifically,
1629 /// this matches at the starting position of the input, or at the position
1630 /// immediately following either a `\r` or `\n` character, but never after
1631 /// a `\r` when a `\n` follows.
1632 StartCRLF = 1 << 4,
1633 /// Match the end of a line or the end of text. Specifically, this matches
1634 /// at the end position of the input, or at the position immediately
1635 /// preceding a `\r` or `\n` character, but never before a `\n` when a `\r`
1636 /// precedes it.
1637 EndCRLF = 1 << 5,
1638 /// Match an ASCII-only word boundary. That is, this matches a position
1639 /// where the left adjacent character and right adjacent character
1640 /// correspond to a word and non-word or a non-word and word character.
1641 WordAscii = 1 << 6,
1642 /// Match an ASCII-only negation of a word boundary.
1643 WordAsciiNegate = 1 << 7,
1644 /// Match a Unicode-aware word boundary. That is, this matches a position
1645 /// where the left adjacent character and right adjacent character
1646 /// correspond to a word and non-word or a non-word and word character.
1647 WordUnicode = 1 << 8,
1648 /// Match a Unicode-aware negation of a word boundary.
1649 WordUnicodeNegate = 1 << 9,
1650 /// Match the start of an ASCII-only word boundary. That is, this matches a
1651 /// position at either the beginning of the haystack or where the previous
1652 /// character is not a word character and the following character is a word
1653 /// character.
1654 WordStartAscii = 1 << 10,
1655 /// Match the end of an ASCII-only word boundary. That is, this matches
1656 /// a position at either the end of the haystack or where the previous
1657 /// character is a word character and the following character is not a word
1658 /// character.
1659 WordEndAscii = 1 << 11,
1660 /// Match the start of a Unicode word boundary. That is, this matches a
1661 /// position at either the beginning of the haystack or where the previous
1662 /// character is not a word character and the following character is a word
1663 /// character.
1664 WordStartUnicode = 1 << 12,
1665 /// Match the end of a Unicode word boundary. That is, this matches a
1666 /// position at either the end of the haystack or where the previous
1667 /// character is a word character and the following character is not a word
1668 /// character.
1669 WordEndUnicode = 1 << 13,
1670 /// Match the start half of an ASCII-only word boundary. That is, this
1671 /// matches a position at either the beginning of the haystack or where the
1672 /// previous character is not a word character.
1673 WordStartHalfAscii = 1 << 14,
1674 /// Match the end half of an ASCII-only word boundary. That is, this
1675 /// matches a position at either the end of the haystack or where the
1676 /// following character is not a word character.
1677 WordEndHalfAscii = 1 << 15,
1678 /// Match the start half of a Unicode word boundary. That is, this matches
1679 /// a position at either the beginning of the haystack or where the
1680 /// previous character is not a word character.
1681 WordStartHalfUnicode = 1 << 16,
1682 /// Match the end half of a Unicode word boundary. That is, this matches
1683 /// a position at either the end of the haystack or where the following
1684 /// character is not a word character.
1685 WordEndHalfUnicode = 1 << 17,
1686}
1687
1688impl Look {
1689 /// Flip the look-around assertion to its equivalent for reverse searches.
1690 /// For example, `StartLF` gets translated to `EndLF`.
1691 ///
1692 /// Some assertions, such as `WordUnicode`, remain the same since they
1693 /// match the same positions regardless of the direction of the search.
1694 #[inline]
1695 pub const fn reversed(self) -> Look {
1696 match self {
1697 Look::Start => Look::End,
1698 Look::End => Look::Start,
1699 Look::StartLF => Look::EndLF,
1700 Look::EndLF => Look::StartLF,
1701 Look::StartCRLF => Look::EndCRLF,
1702 Look::EndCRLF => Look::StartCRLF,
1703 Look::WordAscii => Look::WordAscii,
1704 Look::WordAsciiNegate => Look::WordAsciiNegate,
1705 Look::WordUnicode => Look::WordUnicode,
1706 Look::WordUnicodeNegate => Look::WordUnicodeNegate,
1707 Look::WordStartAscii => Look::WordEndAscii,
1708 Look::WordEndAscii => Look::WordStartAscii,
1709 Look::WordStartUnicode => Look::WordEndUnicode,
1710 Look::WordEndUnicode => Look::WordStartUnicode,
1711 Look::WordStartHalfAscii => Look::WordEndHalfAscii,
1712 Look::WordEndHalfAscii => Look::WordStartHalfAscii,
1713 Look::WordStartHalfUnicode => Look::WordEndHalfUnicode,
1714 Look::WordEndHalfUnicode => Look::WordStartHalfUnicode,
1715 }
1716 }
1717
1718 /// Return the underlying representation of this look-around enumeration
1719 /// as an integer. Giving the return value to the [`Look::from_repr`]
1720 /// constructor is guaranteed to return the same look-around variant that
1721 /// one started with within a semver compatible release of this crate.
1722 #[inline]
1723 pub const fn as_repr(self) -> u32 {
1724 // AFAIK, 'as' is the only way to zero-cost convert an int enum to an
1725 // actual int.
1726 self as u32
1727 }
1728
1729 /// Given the underlying representation of a `Look` value, return the
1730 /// corresponding `Look` value if the representation is valid. Otherwise
1731 /// `None` is returned.
1732 #[inline]
1733 pub const fn from_repr(repr: u32) -> Option<Look> {
1734 match repr {
1735 0b00_0000_0000_0000_0001 => Some(Look::Start),
1736 0b00_0000_0000_0000_0010 => Some(Look::End),
1737 0b00_0000_0000_0000_0100 => Some(Look::StartLF),
1738 0b00_0000_0000_0000_1000 => Some(Look::EndLF),
1739 0b00_0000_0000_0001_0000 => Some(Look::StartCRLF),
1740 0b00_0000_0000_0010_0000 => Some(Look::EndCRLF),
1741 0b00_0000_0000_0100_0000 => Some(Look::WordAscii),
1742 0b00_0000_0000_1000_0000 => Some(Look::WordAsciiNegate),
1743 0b00_0000_0001_0000_0000 => Some(Look::WordUnicode),
1744 0b00_0000_0010_0000_0000 => Some(Look::WordUnicodeNegate),
1745 0b00_0000_0100_0000_0000 => Some(Look::WordStartAscii),
1746 0b00_0000_1000_0000_0000 => Some(Look::WordEndAscii),
1747 0b00_0001_0000_0000_0000 => Some(Look::WordStartUnicode),
1748 0b00_0010_0000_0000_0000 => Some(Look::WordEndUnicode),
1749 0b00_0100_0000_0000_0000 => Some(Look::WordStartHalfAscii),
1750 0b00_1000_0000_0000_0000 => Some(Look::WordEndHalfAscii),
1751 0b01_0000_0000_0000_0000 => Some(Look::WordStartHalfUnicode),
1752 0b10_0000_0000_0000_0000 => Some(Look::WordEndHalfUnicode),
1753 _ => None,
1754 }
1755 }
1756
1757 /// Returns a convenient single codepoint representation of this
1758 /// look-around assertion. Each assertion is guaranteed to be represented
1759 /// by a distinct character.
1760 ///
1761 /// This is useful for succinctly representing a look-around assertion in
1762 /// human friendly but succinct output intended for a programmer working on
1763 /// regex internals.
1764 #[inline]
1765 pub const fn as_char(self) -> char {
1766 match self {
1767 Look::Start => 'A',
1768 Look::End => 'z',
1769 Look::StartLF => '^',
1770 Look::EndLF => '$',
1771 Look::StartCRLF => 'r',
1772 Look::EndCRLF => 'R',
1773 Look::WordAscii => 'b',
1774 Look::WordAsciiNegate => 'B',
1775 Look::WordUnicode => '𝛃',
1776 Look::WordUnicodeNegate => '𝚩',
1777 Look::WordStartAscii => '<',
1778 Look::WordEndAscii => '>',
1779 Look::WordStartUnicode => '〈',
1780 Look::WordEndUnicode => '〉',
1781 Look::WordStartHalfAscii => '◁',
1782 Look::WordEndHalfAscii => '▷',
1783 Look::WordStartHalfUnicode => '◀',
1784 Look::WordEndHalfUnicode => '▶',
1785 }
1786 }
1787}
1788
1789/// The high-level intermediate representation for a capturing group.
1790///
1791/// A capturing group always has an index and a child expression. It may
1792/// also have a name associated with it (e.g., `(?P<foo>\w)`), but it's not
1793/// necessary.
1794///
1795/// Note that there is no explicit representation of a non-capturing group
1796/// in a `Hir`. Instead, non-capturing grouping is handled automatically by
1797/// the recursive structure of the `Hir` itself.
1798#[derive(Clone, Debug, Eq, PartialEq)]
1799pub struct Capture {
1800 /// The capture index of the capture.
1801 pub index: u32,
1802 /// The name of the capture, if it exists.
1803 pub name: Option<Box<str>>,
1804 /// The expression inside the capturing group, which may be empty.
1805 pub sub: Box<Hir>,
1806}
1807
1808/// The high-level intermediate representation of a repetition operator.
1809///
1810/// A repetition operator permits the repetition of an arbitrary
1811/// sub-expression.
1812#[derive(Clone, Debug, Eq, PartialEq)]
1813pub struct Repetition {
1814 /// The minimum range of the repetition.
1815 ///
1816 /// Note that special cases like `?`, `+` and `*` all get translated into
1817 /// the ranges `{0,1}`, `{1,}` and `{0,}`, respectively.
1818 ///
1819 /// When `min` is zero, this expression can match the empty string
1820 /// regardless of what its sub-expression is.
1821 pub min: u32,
1822 /// The maximum range of the repetition.
1823 ///
1824 /// Note that when `max` is `None`, `min` acts as a lower bound but where
1825 /// there is no upper bound. For something like `x{5}` where the min and
1826 /// max are equivalent, `min` will be set to `5` and `max` will be set to
1827 /// `Some(5)`.
1828 pub max: Option<u32>,
1829 /// Whether this repetition operator is greedy or not. A greedy operator
1830 /// will match as much as it can. A non-greedy operator will match as
1831 /// little as it can.
1832 ///
1833 /// Typically, operators are greedy by default and are only non-greedy when
1834 /// a `?` suffix is used, e.g., `(expr)*` is greedy while `(expr)*?` is
1835 /// not. However, this can be inverted via the `U` "ungreedy" flag.
1836 pub greedy: bool,
1837 /// The expression being repeated.
1838 pub sub: Box<Hir>,
1839}
1840
1841impl Repetition {
1842 /// Returns a new repetition with the same `min`, `max` and `greedy`
1843 /// values, but with its sub-expression replaced with the one given.
1844 pub fn with(&self, sub: Hir) -> Repetition {
1845 Repetition {
1846 min: self.min,
1847 max: self.max,
1848 greedy: self.greedy,
1849 sub: Box::new(sub),
1850 }
1851 }
1852}
1853
1854/// A type describing the different flavors of `.`.
1855///
1856/// This type is meant to be used with [`Hir::dot`], which is a convenience
1857/// routine for building HIR values derived from the `.` regex.
1858#[non_exhaustive]
1859#[derive(Clone, Copy, Debug, Eq, PartialEq)]
1860pub enum Dot {
1861 /// Matches the UTF-8 encoding of any Unicode scalar value.
1862 ///
1863 /// This is equivalent to `(?su:.)` and also `\p{any}`.
1864 AnyChar,
1865 /// Matches any byte value.
1866 ///
1867 /// This is equivalent to `(?s-u:.)` and also `(?-u:[\x00-\xFF])`.
1868 AnyByte,
1869 /// Matches the UTF-8 encoding of any Unicode scalar value except for the
1870 /// `char` given.
1871 ///
1872 /// This is equivalent to using `(?u-s:.)` with the line terminator set
1873 /// to a particular ASCII byte. (Because of peculiarities in the regex
1874 /// engines, a line terminator must be a single byte. It follows that when
1875 /// UTF-8 mode is enabled, this single byte must also be a Unicode scalar
1876 /// value. That is, ti must be ASCII.)
1877 ///
1878 /// (This and `AnyCharExceptLF` both exist because of legacy reasons.
1879 /// `AnyCharExceptLF` will be dropped in the next breaking change release.)
1880 AnyCharExcept(char),
1881 /// Matches the UTF-8 encoding of any Unicode scalar value except for `\n`.
1882 ///
1883 /// This is equivalent to `(?u-s:.)` and also `[\p{any}--\n]`.
1884 AnyCharExceptLF,
1885 /// Matches the UTF-8 encoding of any Unicode scalar value except for `\r`
1886 /// and `\n`.
1887 ///
1888 /// This is equivalent to `(?uR-s:.)` and also `[\p{any}--\r\n]`.
1889 AnyCharExceptCRLF,
1890 /// Matches any byte value except for the `u8` given.
1891 ///
1892 /// This is equivalent to using `(?-us:.)` with the line terminator set
1893 /// to a particular ASCII byte. (Because of peculiarities in the regex
1894 /// engines, a line terminator must be a single byte. It follows that when
1895 /// UTF-8 mode is enabled, this single byte must also be a Unicode scalar
1896 /// value. That is, ti must be ASCII.)
1897 ///
1898 /// (This and `AnyByteExceptLF` both exist because of legacy reasons.
1899 /// `AnyByteExceptLF` will be dropped in the next breaking change release.)
1900 AnyByteExcept(u8),
1901 /// Matches any byte value except for `\n`.
1902 ///
1903 /// This is equivalent to `(?-su:.)` and also `(?-u:[[\x00-\xFF]--\n])`.
1904 AnyByteExceptLF,
1905 /// Matches any byte value except for `\r` and `\n`.
1906 ///
1907 /// This is equivalent to `(?R-su:.)` and also `(?-u:[[\x00-\xFF]--\r\n])`.
1908 AnyByteExceptCRLF,
1909}
1910
1911/// A custom `Drop` impl is used for `HirKind` such that it uses constant stack
1912/// space but heap space proportional to the depth of the total `Hir`.
1913impl Drop for Hir {
1914 fn drop(&mut self) {
1915 use core::mem;
1916
1917 match *self.kind() {
1918 HirKind::Empty
1919 | HirKind::Literal(_)
1920 | HirKind::Class(_)
1921 | HirKind::Look(_) => return,
1922 HirKind::Capture(ref x) if x.sub.kind.subs().is_empty() => return,
1923 HirKind::Repetition(ref x) if x.sub.kind.subs().is_empty() => {
1924 return
1925 }
1926 HirKind::Concat(ref x) if x.is_empty() => return,
1927 HirKind::Alternation(ref x) if x.is_empty() => return,
1928 _ => {}
1929 }
1930
1931 let mut stack = vec![mem::replace(self, Hir::empty())];
1932 while let Some(mut expr) = stack.pop() {
1933 match expr.kind {
1934 HirKind::Empty
1935 | HirKind::Literal(_)
1936 | HirKind::Class(_)
1937 | HirKind::Look(_) => {}
1938 HirKind::Capture(ref mut x) => {
1939 stack.push(mem::replace(&mut x.sub, Hir::empty()));
1940 }
1941 HirKind::Repetition(ref mut x) => {
1942 stack.push(mem::replace(&mut x.sub, Hir::empty()));
1943 }
1944 HirKind::Concat(ref mut x) => {
1945 stack.extend(x.drain(..));
1946 }
1947 HirKind::Alternation(ref mut x) => {
1948 stack.extend(x.drain(..));
1949 }
1950 }
1951 }
1952 }
1953}
1954
1955/// A type that collects various properties of an HIR value.
1956///
1957/// Properties are always scalar values and represent meta data that is
1958/// computed inductively on an HIR value. Properties are defined for all
1959/// HIR values.
1960///
1961/// All methods on a `Properties` value take constant time and are meant to
1962/// be cheap to call.
1963#[derive(Clone, Debug, Eq, PartialEq)]
1964pub struct Properties(Box<PropertiesI>);
1965
1966/// The property definition. It is split out so that we can box it, and
1967/// there by make `Properties` use less stack size. This is kind-of important
1968/// because every HIR value has a `Properties` attached to it.
1969///
1970/// This does have the unfortunate consequence that creating any HIR value
1971/// always leads to at least one alloc for properties, but this is generally
1972/// true anyway (for pretty much all HirKinds except for look-arounds).
1973#[derive(Clone, Debug, Eq, PartialEq)]
1974struct PropertiesI {
1975 minimum_len: Option<usize>,
1976 maximum_len: Option<usize>,
1977 look_set: LookSet,
1978 look_set_prefix: LookSet,
1979 look_set_suffix: LookSet,
1980 look_set_prefix_any: LookSet,
1981 look_set_suffix_any: LookSet,
1982 utf8: bool,
1983 explicit_captures_len: usize,
1984 static_explicit_captures_len: Option<usize>,
1985 literal: bool,
1986 alternation_literal: bool,
1987}
1988
1989impl Properties {
1990 /// Returns the length (in bytes) of the smallest string matched by this
1991 /// HIR.
1992 ///
1993 /// A return value of `0` is possible and occurs when the HIR can match an
1994 /// empty string.
1995 ///
1996 /// `None` is returned when there is no minimum length. This occurs in
1997 /// precisely the cases where the HIR matches nothing. i.e., The language
1998 /// the regex matches is empty. An example of such a regex is `\P{any}`.
1999 #[inline]
2000 pub fn minimum_len(&self) -> Option<usize> {
2001 self.0.minimum_len
2002 }
2003
2004 /// Returns the length (in bytes) of the longest string matched by this
2005 /// HIR.
2006 ///
2007 /// A return value of `0` is possible and occurs when nothing longer than
2008 /// the empty string is in the language described by this HIR.
2009 ///
2010 /// `None` is returned when there is no longest matching string. This
2011 /// occurs when the HIR matches nothing or when there is no upper bound on
2012 /// the length of matching strings. Example of such regexes are `\P{any}`
2013 /// (matches nothing) and `a+` (has no upper bound).
2014 #[inline]
2015 pub fn maximum_len(&self) -> Option<usize> {
2016 self.0.maximum_len
2017 }
2018
2019 /// Returns a set of all look-around assertions that appear at least once
2020 /// in this HIR value.
2021 #[inline]
2022 pub fn look_set(&self) -> LookSet {
2023 self.0.look_set
2024 }
2025
2026 /// Returns a set of all look-around assertions that appear as a prefix for
2027 /// this HIR value. That is, the set returned corresponds to the set of
2028 /// assertions that must be passed before matching any bytes in a haystack.
2029 ///
2030 /// For example, `hir.look_set_prefix().contains(Look::Start)` returns true
2031 /// if and only if the HIR is fully anchored at the start.
2032 #[inline]
2033 pub fn look_set_prefix(&self) -> LookSet {
2034 self.0.look_set_prefix
2035 }
2036
2037 /// Returns a set of all look-around assertions that appear as a _possible_
2038 /// prefix for this HIR value. That is, the set returned corresponds to the
2039 /// set of assertions that _may_ be passed before matching any bytes in a
2040 /// haystack.
2041 ///
2042 /// For example, `hir.look_set_prefix_any().contains(Look::Start)` returns
2043 /// true if and only if it's possible for the regex to match through a
2044 /// anchored assertion before consuming any input.
2045 #[inline]
2046 pub fn look_set_prefix_any(&self) -> LookSet {
2047 self.0.look_set_prefix_any
2048 }
2049
2050 /// Returns a set of all look-around assertions that appear as a suffix for
2051 /// this HIR value. That is, the set returned corresponds to the set of
2052 /// assertions that must be passed in order to be considered a match after
2053 /// all other consuming HIR expressions.
2054 ///
2055 /// For example, `hir.look_set_suffix().contains(Look::End)` returns true
2056 /// if and only if the HIR is fully anchored at the end.
2057 #[inline]
2058 pub fn look_set_suffix(&self) -> LookSet {
2059 self.0.look_set_suffix
2060 }
2061
2062 /// Returns a set of all look-around assertions that appear as a _possible_
2063 /// suffix for this HIR value. That is, the set returned corresponds to the
2064 /// set of assertions that _may_ be passed before matching any bytes in a
2065 /// haystack.
2066 ///
2067 /// For example, `hir.look_set_suffix_any().contains(Look::End)` returns
2068 /// true if and only if it's possible for the regex to match through a
2069 /// anchored assertion at the end of a match without consuming any input.
2070 #[inline]
2071 pub fn look_set_suffix_any(&self) -> LookSet {
2072 self.0.look_set_suffix_any
2073 }
2074
2075 /// Return true if and only if the corresponding HIR will always match
2076 /// valid UTF-8.
2077 ///
2078 /// When this returns false, then it is possible for this HIR expression to
2079 /// match invalid UTF-8, including by matching between the code units of
2080 /// a single UTF-8 encoded codepoint.
2081 ///
2082 /// Note that this returns true even when the corresponding HIR can match
2083 /// the empty string. Since an empty string can technically appear between
2084 /// UTF-8 code units, it is possible for a match to be reported that splits
2085 /// a codepoint which could in turn be considered matching invalid UTF-8.
2086 /// However, it is generally assumed that such empty matches are handled
2087 /// specially by the search routine if it is absolutely required that
2088 /// matches not split a codepoint.
2089 ///
2090 /// # Example
2091 ///
2092 /// This code example shows the UTF-8 property of a variety of patterns.
2093 ///
2094 /// ```
2095 /// use regex_syntax::{ParserBuilder, parse};
2096 ///
2097 /// // Examples of 'is_utf8() == true'.
2098 /// assert!(parse(r"a")?.properties().is_utf8());
2099 /// assert!(parse(r"[^a]")?.properties().is_utf8());
2100 /// assert!(parse(r".")?.properties().is_utf8());
2101 /// assert!(parse(r"\W")?.properties().is_utf8());
2102 /// assert!(parse(r"\b")?.properties().is_utf8());
2103 /// assert!(parse(r"\B")?.properties().is_utf8());
2104 /// assert!(parse(r"(?-u)\b")?.properties().is_utf8());
2105 /// assert!(parse(r"(?-u)\B")?.properties().is_utf8());
2106 /// // Unicode mode is enabled by default, and in
2107 /// // that mode, all \x hex escapes are treated as
2108 /// // codepoints. So this actually matches the UTF-8
2109 /// // encoding of U+00FF.
2110 /// assert!(parse(r"\xFF")?.properties().is_utf8());
2111 ///
2112 /// // Now we show examples of 'is_utf8() == false'.
2113 /// // The only way to do this is to force the parser
2114 /// // to permit invalid UTF-8, otherwise all of these
2115 /// // would fail to parse!
2116 /// let parse = |pattern| {
2117 /// ParserBuilder::new().utf8(false).build().parse(pattern)
2118 /// };
2119 /// assert!(!parse(r"(?-u)[^a]")?.properties().is_utf8());
2120 /// assert!(!parse(r"(?-u).")?.properties().is_utf8());
2121 /// assert!(!parse(r"(?-u)\W")?.properties().is_utf8());
2122 /// // Conversely to the equivalent example above,
2123 /// // when Unicode mode is disabled, \x hex escapes
2124 /// // are treated as their raw byte values.
2125 /// assert!(!parse(r"(?-u)\xFF")?.properties().is_utf8());
2126 /// // Note that just because we disabled UTF-8 in the
2127 /// // parser doesn't mean we still can't use Unicode.
2128 /// // It is enabled by default, so \xFF is still
2129 /// // equivalent to matching the UTF-8 encoding of
2130 /// // U+00FF by default.
2131 /// assert!(parse(r"\xFF")?.properties().is_utf8());
2132 /// // Even though we use raw bytes that individually
2133 /// // are not valid UTF-8, when combined together, the
2134 /// // overall expression *does* match valid UTF-8!
2135 /// assert!(parse(r"(?-u)\xE2\x98\x83")?.properties().is_utf8());
2136 ///
2137 /// # Ok::<(), Box<dyn std::error::Error>>(())
2138 /// ```
2139 #[inline]
2140 pub fn is_utf8(&self) -> bool {
2141 self.0.utf8
2142 }
2143
2144 /// Returns the total number of explicit capturing groups in the
2145 /// corresponding HIR.
2146 ///
2147 /// Note that this does not include the implicit capturing group
2148 /// corresponding to the entire match that is typically included by regex
2149 /// engines.
2150 ///
2151 /// # Example
2152 ///
2153 /// This method will return `0` for `a` and `1` for `(a)`:
2154 ///
2155 /// ```
2156 /// use regex_syntax::parse;
2157 ///
2158 /// assert_eq!(0, parse("a")?.properties().explicit_captures_len());
2159 /// assert_eq!(1, parse("(a)")?.properties().explicit_captures_len());
2160 ///
2161 /// # Ok::<(), Box<dyn std::error::Error>>(())
2162 /// ```
2163 #[inline]
2164 pub fn explicit_captures_len(&self) -> usize {
2165 self.0.explicit_captures_len
2166 }
2167
2168 /// Returns the total number of explicit capturing groups that appear in
2169 /// every possible match.
2170 ///
2171 /// If the number of capture groups can vary depending on the match, then
2172 /// this returns `None`. That is, a value is only returned when the number
2173 /// of matching groups is invariant or "static."
2174 ///
2175 /// Note that this does not include the implicit capturing group
2176 /// corresponding to the entire match.
2177 ///
2178 /// # Example
2179 ///
2180 /// This shows a few cases where a static number of capture groups is
2181 /// available and a few cases where it is not.
2182 ///
2183 /// ```
2184 /// use regex_syntax::parse;
2185 ///
2186 /// let len = |pattern| {
2187 /// parse(pattern).map(|h| {
2188 /// h.properties().static_explicit_captures_len()
2189 /// })
2190 /// };
2191 ///
2192 /// assert_eq!(Some(0), len("a")?);
2193 /// assert_eq!(Some(1), len("(a)")?);
2194 /// assert_eq!(Some(1), len("(a)|(b)")?);
2195 /// assert_eq!(Some(2), len("(a)(b)|(c)(d)")?);
2196 /// assert_eq!(None, len("(a)|b")?);
2197 /// assert_eq!(None, len("a|(b)")?);
2198 /// assert_eq!(None, len("(b)*")?);
2199 /// assert_eq!(Some(1), len("(b)+")?);
2200 ///
2201 /// # Ok::<(), Box<dyn std::error::Error>>(())
2202 /// ```
2203 #[inline]
2204 pub fn static_explicit_captures_len(&self) -> Option<usize> {
2205 self.0.static_explicit_captures_len
2206 }
2207
2208 /// Return true if and only if this HIR is a simple literal. This is
2209 /// only true when this HIR expression is either itself a `Literal` or a
2210 /// concatenation of only `Literal`s.
2211 ///
2212 /// For example, `f` and `foo` are literals, but `f+`, `(foo)`, `foo()` and
2213 /// the empty string are not (even though they contain sub-expressions that
2214 /// are literals).
2215 #[inline]
2216 pub fn is_literal(&self) -> bool {
2217 self.0.literal
2218 }
2219
2220 /// Return true if and only if this HIR is either a simple literal or an
2221 /// alternation of simple literals. This is only
2222 /// true when this HIR expression is either itself a `Literal` or a
2223 /// concatenation of only `Literal`s or an alternation of only `Literal`s.
2224 ///
2225 /// For example, `f`, `foo`, `a|b|c`, and `foo|bar|baz` are alternation
2226 /// literals, but `f+`, `(foo)`, `foo()`, and the empty pattern are not
2227 /// (even though that contain sub-expressions that are literals).
2228 #[inline]
2229 pub fn is_alternation_literal(&self) -> bool {
2230 self.0.alternation_literal
2231 }
2232
2233 /// Returns the total amount of heap memory usage, in bytes, used by this
2234 /// `Properties` value.
2235 #[inline]
2236 pub fn memory_usage(&self) -> usize {
2237 core::mem::size_of::<PropertiesI>()
2238 }
2239
2240 /// Returns a new set of properties that corresponds to the union of the
2241 /// iterator of properties given.
2242 ///
2243 /// This is useful when one has multiple `Hir` expressions and wants
2244 /// to combine them into a single alternation without constructing the
2245 /// corresponding `Hir`. This routine provides a way of combining the
2246 /// properties of each `Hir` expression into one set of properties
2247 /// representing the union of those expressions.
2248 ///
2249 /// # Example: union with HIRs that never match
2250 ///
2251 /// This example shows that unioning properties together with one that
2252 /// represents a regex that never matches will "poison" certain attributes,
2253 /// like the minimum and maximum lengths.
2254 ///
2255 /// ```
2256 /// use regex_syntax::{hir::Properties, parse};
2257 ///
2258 /// let hir1 = parse("ab?c?")?;
2259 /// assert_eq!(Some(1), hir1.properties().minimum_len());
2260 /// assert_eq!(Some(3), hir1.properties().maximum_len());
2261 ///
2262 /// let hir2 = parse(r"[a&&b]")?;
2263 /// assert_eq!(None, hir2.properties().minimum_len());
2264 /// assert_eq!(None, hir2.properties().maximum_len());
2265 ///
2266 /// let hir3 = parse(r"wxy?z?")?;
2267 /// assert_eq!(Some(2), hir3.properties().minimum_len());
2268 /// assert_eq!(Some(4), hir3.properties().maximum_len());
2269 ///
2270 /// let unioned = Properties::union([
2271 /// hir1.properties(),
2272 /// hir2.properties(),
2273 /// hir3.properties(),
2274 /// ]);
2275 /// assert_eq!(None, unioned.minimum_len());
2276 /// assert_eq!(None, unioned.maximum_len());
2277 ///
2278 /// # Ok::<(), Box<dyn std::error::Error>>(())
2279 /// ```
2280 ///
2281 /// The maximum length can also be "poisoned" by a pattern that has no
2282 /// upper bound on the length of a match. The minimum length remains
2283 /// unaffected:
2284 ///
2285 /// ```
2286 /// use regex_syntax::{hir::Properties, parse};
2287 ///
2288 /// let hir1 = parse("ab?c?")?;
2289 /// assert_eq!(Some(1), hir1.properties().minimum_len());
2290 /// assert_eq!(Some(3), hir1.properties().maximum_len());
2291 ///
2292 /// let hir2 = parse(r"a+")?;
2293 /// assert_eq!(Some(1), hir2.properties().minimum_len());
2294 /// assert_eq!(None, hir2.properties().maximum_len());
2295 ///
2296 /// let hir3 = parse(r"wxy?z?")?;
2297 /// assert_eq!(Some(2), hir3.properties().minimum_len());
2298 /// assert_eq!(Some(4), hir3.properties().maximum_len());
2299 ///
2300 /// let unioned = Properties::union([
2301 /// hir1.properties(),
2302 /// hir2.properties(),
2303 /// hir3.properties(),
2304 /// ]);
2305 /// assert_eq!(Some(1), unioned.minimum_len());
2306 /// assert_eq!(None, unioned.maximum_len());
2307 ///
2308 /// # Ok::<(), Box<dyn std::error::Error>>(())
2309 /// ```
2310 pub fn union<I, P>(props: I) -> Properties
2311 where
2312 I: IntoIterator<Item = P>,
2313 P: core::borrow::Borrow<Properties>,
2314 {
2315 let mut it = props.into_iter().peekable();
2316 // While empty alternations aren't possible, we still behave as if they
2317 // are. When we have an empty alternate, then clearly the look-around
2318 // prefix and suffix is empty. Otherwise, it is the intersection of all
2319 // prefixes and suffixes (respectively) of the branches.
2320 let fix = if it.peek().is_none() {
2321 LookSet::empty()
2322 } else {
2323 LookSet::full()
2324 };
2325 // And also, an empty alternate means we have 0 static capture groups,
2326 // but we otherwise start with the number corresponding to the first
2327 // alternate. If any subsequent alternate has a different number of
2328 // static capture groups, then we overall have a variation and not a
2329 // static number of groups.
2330 let static_explicit_captures_len =
2331 it.peek().and_then(|p| p.borrow().static_explicit_captures_len());
2332 // The base case is an empty alternation, which matches nothing.
2333 // Note though that empty alternations aren't possible, because the
2334 // Hir::alternation smart constructor rewrites those as empty character
2335 // classes.
2336 let mut props = PropertiesI {
2337 minimum_len: None,
2338 maximum_len: None,
2339 look_set: LookSet::empty(),
2340 look_set_prefix: fix,
2341 look_set_suffix: fix,
2342 look_set_prefix_any: LookSet::empty(),
2343 look_set_suffix_any: LookSet::empty(),
2344 utf8: true,
2345 explicit_captures_len: 0,
2346 static_explicit_captures_len,
2347 literal: false,
2348 alternation_literal: true,
2349 };
2350 let (mut min_poisoned, mut max_poisoned) = (false, false);
2351 // Handle properties that need to visit every child hir.
2352 for prop in it {
2353 let p = prop.borrow();
2354 props.look_set.set_union(p.look_set());
2355 props.look_set_prefix.set_intersect(p.look_set_prefix());
2356 props.look_set_suffix.set_intersect(p.look_set_suffix());
2357 props.look_set_prefix_any.set_union(p.look_set_prefix_any());
2358 props.look_set_suffix_any.set_union(p.look_set_suffix_any());
2359 props.utf8 = props.utf8 && p.is_utf8();
2360 props.explicit_captures_len = props
2361 .explicit_captures_len
2362 .saturating_add(p.explicit_captures_len());
2363 if props.static_explicit_captures_len
2364 != p.static_explicit_captures_len()
2365 {
2366 props.static_explicit_captures_len = None;
2367 }
2368 props.alternation_literal =
2369 props.alternation_literal && p.is_literal();
2370 if !min_poisoned {
2371 if let Some(xmin) = p.minimum_len() {
2372 if props.minimum_len.map_or(true, |pmin| xmin < pmin) {
2373 props.minimum_len = Some(xmin);
2374 }
2375 } else {
2376 props.minimum_len = None;
2377 min_poisoned = true;
2378 }
2379 }
2380 if !max_poisoned {
2381 if let Some(xmax) = p.maximum_len() {
2382 if props.maximum_len.map_or(true, |pmax| xmax > pmax) {
2383 props.maximum_len = Some(xmax);
2384 }
2385 } else {
2386 props.maximum_len = None;
2387 max_poisoned = true;
2388 }
2389 }
2390 }
2391 Properties(Box::new(props))
2392 }
2393}
2394
2395impl Properties {
2396 /// Create a new set of HIR properties for an empty regex.
2397 fn empty() -> Properties {
2398 let inner = PropertiesI {
2399 minimum_len: Some(0),
2400 maximum_len: Some(0),
2401 look_set: LookSet::empty(),
2402 look_set_prefix: LookSet::empty(),
2403 look_set_suffix: LookSet::empty(),
2404 look_set_prefix_any: LookSet::empty(),
2405 look_set_suffix_any: LookSet::empty(),
2406 // It is debatable whether an empty regex always matches at valid
2407 // UTF-8 boundaries. Strictly speaking, at a byte oriented view,
2408 // it is clearly false. There are, for example, many empty strings
2409 // between the bytes encoding a '☃'.
2410 //
2411 // However, when Unicode mode is enabled, the fundamental atom
2412 // of matching is really a codepoint. And in that scenario, an
2413 // empty regex is defined to only match at valid UTF-8 boundaries
2414 // and to never split a codepoint. It just so happens that this
2415 // enforcement is somewhat tricky to do for regexes that match
2416 // the empty string inside regex engines themselves. It usually
2417 // requires some layer above the regex engine to filter out such
2418 // matches.
2419 //
2420 // In any case, 'true' is really the only coherent option. If it
2421 // were false, for example, then 'a*' would also need to be false
2422 // since it too can match the empty string.
2423 utf8: true,
2424 explicit_captures_len: 0,
2425 static_explicit_captures_len: Some(0),
2426 literal: false,
2427 alternation_literal: false,
2428 };
2429 Properties(Box::new(inner))
2430 }
2431
2432 /// Create a new set of HIR properties for a literal regex.
2433 fn literal(lit: &Literal) -> Properties {
2434 let inner = PropertiesI {
2435 minimum_len: Some(lit.0.len()),
2436 maximum_len: Some(lit.0.len()),
2437 look_set: LookSet::empty(),
2438 look_set_prefix: LookSet::empty(),
2439 look_set_suffix: LookSet::empty(),
2440 look_set_prefix_any: LookSet::empty(),
2441 look_set_suffix_any: LookSet::empty(),
2442 utf8: core::str::from_utf8(&lit.0).is_ok(),
2443 explicit_captures_len: 0,
2444 static_explicit_captures_len: Some(0),
2445 literal: true,
2446 alternation_literal: true,
2447 };
2448 Properties(Box::new(inner))
2449 }
2450
2451 /// Create a new set of HIR properties for a character class.
2452 fn class(class: &Class) -> Properties {
2453 let inner = PropertiesI {
2454 minimum_len: class.minimum_len(),
2455 maximum_len: class.maximum_len(),
2456 look_set: LookSet::empty(),
2457 look_set_prefix: LookSet::empty(),
2458 look_set_suffix: LookSet::empty(),
2459 look_set_prefix_any: LookSet::empty(),
2460 look_set_suffix_any: LookSet::empty(),
2461 utf8: class.is_utf8(),
2462 explicit_captures_len: 0,
2463 static_explicit_captures_len: Some(0),
2464 literal: false,
2465 alternation_literal: false,
2466 };
2467 Properties(Box::new(inner))
2468 }
2469
2470 /// Create a new set of HIR properties for a look-around assertion.
2471 fn look(look: Look) -> Properties {
2472 let inner = PropertiesI {
2473 minimum_len: Some(0),
2474 maximum_len: Some(0),
2475 look_set: LookSet::singleton(look),
2476 look_set_prefix: LookSet::singleton(look),
2477 look_set_suffix: LookSet::singleton(look),
2478 look_set_prefix_any: LookSet::singleton(look),
2479 look_set_suffix_any: LookSet::singleton(look),
2480 // This requires a little explanation. Basically, we don't consider
2481 // matching an empty string to be equivalent to matching invalid
2482 // UTF-8, even though technically matching every empty string will
2483 // split the UTF-8 encoding of a single codepoint when treating a
2484 // UTF-8 encoded string as a sequence of bytes. Our defense here is
2485 // that in such a case, a codepoint should logically be treated as
2486 // the fundamental atom for matching, and thus the only valid match
2487 // points are between codepoints and not bytes.
2488 //
2489 // More practically, this is true here because it's also true
2490 // for 'Hir::empty()', otherwise something like 'a*' would be
2491 // considered to match invalid UTF-8. That in turn makes this
2492 // property borderline useless.
2493 utf8: true,
2494 explicit_captures_len: 0,
2495 static_explicit_captures_len: Some(0),
2496 literal: false,
2497 alternation_literal: false,
2498 };
2499 Properties(Box::new(inner))
2500 }
2501
2502 /// Create a new set of HIR properties for a repetition.
2503 fn repetition(rep: &Repetition) -> Properties {
2504 let p = rep.sub.properties();
2505 let minimum_len = p.minimum_len().map(|child_min| {
2506 let rep_min = usize::try_from(rep.min).unwrap_or(usize::MAX);
2507 child_min.saturating_mul(rep_min)
2508 });
2509 let maximum_len = rep.max.and_then(|rep_max| {
2510 let rep_max = usize::try_from(rep_max).ok()?;
2511 let child_max = p.maximum_len()?;
2512 child_max.checked_mul(rep_max)
2513 });
2514
2515 let mut inner = PropertiesI {
2516 minimum_len,
2517 maximum_len,
2518 look_set: p.look_set(),
2519 look_set_prefix: LookSet::empty(),
2520 look_set_suffix: LookSet::empty(),
2521 look_set_prefix_any: p.look_set_prefix_any(),
2522 look_set_suffix_any: p.look_set_suffix_any(),
2523 utf8: p.is_utf8(),
2524 explicit_captures_len: p.explicit_captures_len(),
2525 static_explicit_captures_len: p.static_explicit_captures_len(),
2526 literal: false,
2527 alternation_literal: false,
2528 };
2529 // If the repetition operator can match the empty string, then its
2530 // lookset prefix and suffixes themselves remain empty since they are
2531 // no longer required to match.
2532 if rep.min > 0 {
2533 inner.look_set_prefix = p.look_set_prefix();
2534 inner.look_set_suffix = p.look_set_suffix();
2535 }
2536 // If the static captures len of the sub-expression is not known or
2537 // is greater than zero, then it automatically propagates to the
2538 // repetition, regardless of the repetition. Otherwise, it might
2539 // change, but only when the repetition can match 0 times.
2540 if rep.min == 0
2541 && inner.static_explicit_captures_len.map_or(false, |len| len > 0)
2542 {
2543 // If we require a match 0 times, then our captures len is
2544 // guaranteed to be zero. Otherwise, if we *can* match the empty
2545 // string, then it's impossible to know how many captures will be
2546 // in the resulting match.
2547 if rep.max == Some(0) {
2548 inner.static_explicit_captures_len = Some(0);
2549 } else {
2550 inner.static_explicit_captures_len = None;
2551 }
2552 }
2553 Properties(Box::new(inner))
2554 }
2555
2556 /// Create a new set of HIR properties for a capture.
2557 fn capture(capture: &Capture) -> Properties {
2558 let p = capture.sub.properties();
2559 Properties(Box::new(PropertiesI {
2560 explicit_captures_len: p.explicit_captures_len().saturating_add(1),
2561 static_explicit_captures_len: p
2562 .static_explicit_captures_len()
2563 .map(|len| len.saturating_add(1)),
2564 literal: false,
2565 alternation_literal: false,
2566 ..*p.0.clone()
2567 }))
2568 }
2569
2570 /// Create a new set of HIR properties for a concatenation.
2571 fn concat(concat: &[Hir]) -> Properties {
2572 // The base case is an empty concatenation, which matches the empty
2573 // string. Note though that empty concatenations aren't possible,
2574 // because the Hir::concat smart constructor rewrites those as
2575 // Hir::empty.
2576 let mut props = PropertiesI {
2577 minimum_len: Some(0),
2578 maximum_len: Some(0),
2579 look_set: LookSet::empty(),
2580 look_set_prefix: LookSet::empty(),
2581 look_set_suffix: LookSet::empty(),
2582 look_set_prefix_any: LookSet::empty(),
2583 look_set_suffix_any: LookSet::empty(),
2584 utf8: true,
2585 explicit_captures_len: 0,
2586 static_explicit_captures_len: Some(0),
2587 literal: true,
2588 alternation_literal: true,
2589 };
2590 // Handle properties that need to visit every child hir.
2591 for x in concat.iter() {
2592 let p = x.properties();
2593 props.look_set.set_union(p.look_set());
2594 props.utf8 = props.utf8 && p.is_utf8();
2595 props.explicit_captures_len = props
2596 .explicit_captures_len
2597 .saturating_add(p.explicit_captures_len());
2598 props.static_explicit_captures_len = p
2599 .static_explicit_captures_len()
2600 .and_then(|len1| {
2601 Some((len1, props.static_explicit_captures_len?))
2602 })
2603 .and_then(|(len1, len2)| Some(len1.saturating_add(len2)));
2604 props.literal = props.literal && p.is_literal();
2605 props.alternation_literal =
2606 props.alternation_literal && p.is_alternation_literal();
2607 if let Some(minimum_len) = props.minimum_len {
2608 match p.minimum_len() {
2609 None => props.minimum_len = None,
2610 Some(len) => {
2611 // We use saturating arithmetic here because the
2612 // minimum is just a lower bound. We can't go any
2613 // higher than what our number types permit.
2614 props.minimum_len =
2615 Some(minimum_len.saturating_add(len));
2616 }
2617 }
2618 }
2619 if let Some(maximum_len) = props.maximum_len {
2620 match p.maximum_len() {
2621 None => props.maximum_len = None,
2622 Some(len) => {
2623 props.maximum_len = maximum_len.checked_add(len)
2624 }
2625 }
2626 }
2627 }
2628 // Handle the prefix properties, which only requires visiting
2629 // child exprs until one matches more than the empty string.
2630 let mut it = concat.iter();
2631 while let Some(x) = it.next() {
2632 props.look_set_prefix.set_union(x.properties().look_set_prefix());
2633 props
2634 .look_set_prefix_any
2635 .set_union(x.properties().look_set_prefix_any());
2636 if x.properties().maximum_len().map_or(true, |x| x > 0) {
2637 break;
2638 }
2639 }
2640 // Same thing for the suffix properties, but in reverse.
2641 let mut it = concat.iter().rev();
2642 while let Some(x) = it.next() {
2643 props.look_set_suffix.set_union(x.properties().look_set_suffix());
2644 props
2645 .look_set_suffix_any
2646 .set_union(x.properties().look_set_suffix_any());
2647 if x.properties().maximum_len().map_or(true, |x| x > 0) {
2648 break;
2649 }
2650 }
2651 Properties(Box::new(props))
2652 }
2653
2654 /// Create a new set of HIR properties for a concatenation.
2655 fn alternation(alts: &[Hir]) -> Properties {
2656 Properties::union(alts.iter().map(|hir| hir.properties()))
2657 }
2658}
2659
2660/// A set of look-around assertions.
2661///
2662/// This is useful for efficiently tracking look-around assertions. For
2663/// example, an [`Hir`] provides properties that return `LookSet`s.
2664#[derive(Clone, Copy, Default, Eq, PartialEq)]
2665pub struct LookSet {
2666 /// The underlying representation this set is exposed to make it possible
2667 /// to store it somewhere efficiently. The representation is that
2668 /// of a bitset, where each assertion occupies bit `i` where `i =
2669 /// Look::as_repr()`.
2670 ///
2671 /// Note that users of this internal representation must permit the full
2672 /// range of `u16` values to be represented. For example, even if the
2673 /// current implementation only makes use of the 10 least significant bits,
2674 /// it may use more bits in a future semver compatible release.
2675 pub bits: u32,
2676}
2677
2678impl LookSet {
2679 /// Create an empty set of look-around assertions.
2680 #[inline]
2681 pub fn empty() -> LookSet {
2682 LookSet { bits: 0 }
2683 }
2684
2685 /// Create a full set of look-around assertions.
2686 ///
2687 /// This set contains all possible look-around assertions.
2688 #[inline]
2689 pub fn full() -> LookSet {
2690 LookSet { bits: !0 }
2691 }
2692
2693 /// Create a look-around set containing the look-around assertion given.
2694 ///
2695 /// This is a convenience routine for creating an empty set and inserting
2696 /// one look-around assertions.
2697 #[inline]
2698 pub fn singleton(look: Look) -> LookSet {
2699 LookSet::empty().insert(look)
2700 }
2701
2702 /// Returns the total number of look-around assertions in this set.
2703 #[inline]
2704 pub fn len(self) -> usize {
2705 // OK because max value always fits in a u8, which in turn always
2706 // fits in a usize, regardless of target.
2707 usize::try_from(self.bits.count_ones()).unwrap()
2708 }
2709
2710 /// Returns true if and only if this set is empty.
2711 #[inline]
2712 pub fn is_empty(self) -> bool {
2713 self.len() == 0
2714 }
2715
2716 /// Returns true if and only if the given look-around assertion is in this
2717 /// set.
2718 #[inline]
2719 pub fn contains(self, look: Look) -> bool {
2720 self.bits & look.as_repr() != 0
2721 }
2722
2723 /// Returns true if and only if this set contains any anchor assertions.
2724 /// This includes both "start/end of haystack" and "start/end of line."
2725 #[inline]
2726 pub fn contains_anchor(&self) -> bool {
2727 self.contains_anchor_haystack() || self.contains_anchor_line()
2728 }
2729
2730 /// Returns true if and only if this set contains any "start/end of
2731 /// haystack" anchors. This doesn't include "start/end of line" anchors.
2732 #[inline]
2733 pub fn contains_anchor_haystack(&self) -> bool {
2734 self.contains(Look::Start) || self.contains(Look::End)
2735 }
2736
2737 /// Returns true if and only if this set contains any "start/end of line"
2738 /// anchors. This doesn't include "start/end of haystack" anchors. This
2739 /// includes both `\n` line anchors and CRLF (`\r\n`) aware line anchors.
2740 #[inline]
2741 pub fn contains_anchor_line(&self) -> bool {
2742 self.contains(Look::StartLF)
2743 || self.contains(Look::EndLF)
2744 || self.contains(Look::StartCRLF)
2745 || self.contains(Look::EndCRLF)
2746 }
2747
2748 /// Returns true if and only if this set contains any "start/end of line"
2749 /// anchors that only treat `\n` as line terminators. This does not include
2750 /// haystack anchors or CRLF aware line anchors.
2751 #[inline]
2752 pub fn contains_anchor_lf(&self) -> bool {
2753 self.contains(Look::StartLF) || self.contains(Look::EndLF)
2754 }
2755
2756 /// Returns true if and only if this set contains any "start/end of line"
2757 /// anchors that are CRLF-aware. This doesn't include "start/end of
2758 /// haystack" or "start/end of line-feed" anchors.
2759 #[inline]
2760 pub fn contains_anchor_crlf(&self) -> bool {
2761 self.contains(Look::StartCRLF) || self.contains(Look::EndCRLF)
2762 }
2763
2764 /// Returns true if and only if this set contains any word boundary or
2765 /// negated word boundary assertions. This include both Unicode and ASCII
2766 /// word boundaries.
2767 #[inline]
2768 pub fn contains_word(self) -> bool {
2769 self.contains_word_unicode() || self.contains_word_ascii()
2770 }
2771
2772 /// Returns true if and only if this set contains any Unicode word boundary
2773 /// or negated Unicode word boundary assertions.
2774 #[inline]
2775 pub fn contains_word_unicode(self) -> bool {
2776 self.contains(Look::WordUnicode)
2777 || self.contains(Look::WordUnicodeNegate)
2778 || self.contains(Look::WordStartUnicode)
2779 || self.contains(Look::WordEndUnicode)
2780 || self.contains(Look::WordStartHalfUnicode)
2781 || self.contains(Look::WordEndHalfUnicode)
2782 }
2783
2784 /// Returns true if and only if this set contains any ASCII word boundary
2785 /// or negated ASCII word boundary assertions.
2786 #[inline]
2787 pub fn contains_word_ascii(self) -> bool {
2788 self.contains(Look::WordAscii)
2789 || self.contains(Look::WordAsciiNegate)
2790 || self.contains(Look::WordStartAscii)
2791 || self.contains(Look::WordEndAscii)
2792 || self.contains(Look::WordStartHalfAscii)
2793 || self.contains(Look::WordEndHalfAscii)
2794 }
2795
2796 /// Returns an iterator over all of the look-around assertions in this set.
2797 #[inline]
2798 pub fn iter(self) -> LookSetIter {
2799 LookSetIter { set: self }
2800 }
2801
2802 /// Return a new set that is equivalent to the original, but with the given
2803 /// assertion added to it. If the assertion is already in the set, then the
2804 /// returned set is equivalent to the original.
2805 #[inline]
2806 pub fn insert(self, look: Look) -> LookSet {
2807 LookSet { bits: self.bits | look.as_repr() }
2808 }
2809
2810 /// Updates this set in place with the result of inserting the given
2811 /// assertion into this set.
2812 #[inline]
2813 pub fn set_insert(&mut self, look: Look) {
2814 *self = self.insert(look);
2815 }
2816
2817 /// Return a new set that is equivalent to the original, but with the given
2818 /// assertion removed from it. If the assertion is not in the set, then the
2819 /// returned set is equivalent to the original.
2820 #[inline]
2821 pub fn remove(self, look: Look) -> LookSet {
2822 LookSet { bits: self.bits & !look.as_repr() }
2823 }
2824
2825 /// Updates this set in place with the result of removing the given
2826 /// assertion from this set.
2827 #[inline]
2828 pub fn set_remove(&mut self, look: Look) {
2829 *self = self.remove(look);
2830 }
2831
2832 /// Returns a new set that is the result of subtracting the given set from
2833 /// this set.
2834 #[inline]
2835 pub fn subtract(self, other: LookSet) -> LookSet {
2836 LookSet { bits: self.bits & !other.bits }
2837 }
2838
2839 /// Updates this set in place with the result of subtracting the given set
2840 /// from this set.
2841 #[inline]
2842 pub fn set_subtract(&mut self, other: LookSet) {
2843 *self = self.subtract(other);
2844 }
2845
2846 /// Returns a new set that is the union of this and the one given.
2847 #[inline]
2848 pub fn union(self, other: LookSet) -> LookSet {
2849 LookSet { bits: self.bits | other.bits }
2850 }
2851
2852 /// Updates this set in place with the result of unioning it with the one
2853 /// given.
2854 #[inline]
2855 pub fn set_union(&mut self, other: LookSet) {
2856 *self = self.union(other);
2857 }
2858
2859 /// Returns a new set that is the intersection of this and the one given.
2860 #[inline]
2861 pub fn intersect(self, other: LookSet) -> LookSet {
2862 LookSet { bits: self.bits & other.bits }
2863 }
2864
2865 /// Updates this set in place with the result of intersecting it with the
2866 /// one given.
2867 #[inline]
2868 pub fn set_intersect(&mut self, other: LookSet) {
2869 *self = self.intersect(other);
2870 }
2871
2872 /// Return a `LookSet` from the slice given as a native endian 32-bit
2873 /// integer.
2874 ///
2875 /// # Panics
2876 ///
2877 /// This panics if `slice.len() < 4`.
2878 #[inline]
2879 pub fn read_repr(slice: &[u8]) -> LookSet {
2880 let bits = u32::from_ne_bytes(slice[..4].try_into().unwrap());
2881 LookSet { bits }
2882 }
2883
2884 /// Write a `LookSet` as a native endian 32-bit integer to the beginning
2885 /// of the slice given.
2886 ///
2887 /// # Panics
2888 ///
2889 /// This panics if `slice.len() < 4`.
2890 #[inline]
2891 pub fn write_repr(self, slice: &mut [u8]) {
2892 let raw = self.bits.to_ne_bytes();
2893 slice[0] = raw[0];
2894 slice[1] = raw[1];
2895 slice[2] = raw[2];
2896 slice[3] = raw[3];
2897 }
2898}
2899
2900impl core::fmt::Debug for LookSet {
2901 fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
2902 if self.is_empty() {
2903 return write!(f, "∅");
2904 }
2905 for look in self.iter() {
2906 write!(f, "{}", look.as_char())?;
2907 }
2908 Ok(())
2909 }
2910}
2911
2912/// An iterator over all look-around assertions in a [`LookSet`].
2913///
2914/// This iterator is created by [`LookSet::iter`].
2915#[derive(Clone, Debug)]
2916pub struct LookSetIter {
2917 set: LookSet,
2918}
2919
2920impl Iterator for LookSetIter {
2921 type Item = Look;
2922
2923 #[inline]
2924 fn next(&mut self) -> Option<Look> {
2925 if self.set.is_empty() {
2926 return None;
2927 }
2928 // We'll never have more than u8::MAX distinct look-around assertions,
2929 // so 'bit' will always fit into a u16.
2930 let bit = u16::try_from(self.set.bits.trailing_zeros()).unwrap();
2931 let look = Look::from_repr(1 << bit)?;
2932 self.set = self.set.remove(look);
2933 Some(look)
2934 }
2935}
2936
2937/// Given a sequence of HIR values where each value corresponds to a Unicode
2938/// class (or an all-ASCII byte class), return a single Unicode class
2939/// corresponding to the union of the classes found.
2940fn class_chars(hirs: &[Hir]) -> Option<Class> {
2941 let mut cls = ClassUnicode::new(vec![]);
2942 for hir in hirs.iter() {
2943 match *hir.kind() {
2944 HirKind::Class(Class::Unicode(ref cls2)) => {
2945 cls.union(cls2);
2946 }
2947 HirKind::Class(Class::Bytes(ref cls2)) => {
2948 cls.union(&cls2.to_unicode_class()?);
2949 }
2950 _ => return None,
2951 };
2952 }
2953 Some(Class::Unicode(cls))
2954}
2955
2956/// Given a sequence of HIR values where each value corresponds to a byte class
2957/// (or an all-ASCII Unicode class), return a single byte class corresponding
2958/// to the union of the classes found.
2959fn class_bytes(hirs: &[Hir]) -> Option<Class> {
2960 let mut cls = ClassBytes::new(vec![]);
2961 for hir in hirs.iter() {
2962 match *hir.kind() {
2963 HirKind::Class(Class::Unicode(ref cls2)) => {
2964 cls.union(&cls2.to_byte_class()?);
2965 }
2966 HirKind::Class(Class::Bytes(ref cls2)) => {
2967 cls.union(cls2);
2968 }
2969 _ => return None,
2970 };
2971 }
2972 Some(Class::Bytes(cls))
2973}
2974
2975/// Given a sequence of HIR values where each value corresponds to a literal
2976/// that is a single `char`, return that sequence of `char`s. Otherwise return
2977/// None. No deduplication is done.
2978fn singleton_chars(hirs: &[Hir]) -> Option<Vec<char>> {
2979 let mut singletons = vec![];
2980 for hir in hirs.iter() {
2981 let literal = match *hir.kind() {
2982 HirKind::Literal(Literal(ref bytes)) => bytes,
2983 _ => return None,
2984 };
2985 let ch = match crate::debug::utf8_decode(literal) {
2986 None => return None,
2987 Some(Err(_)) => return None,
2988 Some(Ok(ch)) => ch,
2989 };
2990 if literal.len() != ch.len_utf8() {
2991 return None;
2992 }
2993 singletons.push(ch);
2994 }
2995 Some(singletons)
2996}
2997
2998/// Given a sequence of HIR values where each value corresponds to a literal
2999/// that is a single byte, return that sequence of bytes. Otherwise return
3000/// None. No deduplication is done.
3001fn singleton_bytes(hirs: &[Hir]) -> Option<Vec<u8>> {
3002 let mut singletons = vec![];
3003 for hir in hirs.iter() {
3004 let literal = match *hir.kind() {
3005 HirKind::Literal(Literal(ref bytes)) => bytes,
3006 _ => return None,
3007 };
3008 if literal.len() != 1 {
3009 return None;
3010 }
3011 singletons.push(literal[0]);
3012 }
3013 Some(singletons)
3014}
3015
3016/// Looks for a common prefix in the list of alternation branches given. If one
3017/// is found, then an equivalent but (hopefully) simplified Hir is returned.
3018/// Otherwise, the original given list of branches is returned unmodified.
3019///
3020/// This is not quite as good as it could be. Right now, it requires that
3021/// all branches are 'Concat' expressions. It also doesn't do well with
3022/// literals. For example, given 'foofoo|foobar', it will not refactor it to
3023/// 'foo(?:foo|bar)' because literals are flattened into their own special
3024/// concatenation. (One wonders if perhaps 'Literal' should be a single atom
3025/// instead of a string of bytes because of this. Otherwise, handling the
3026/// current representation in this routine will be pretty gnarly. Sigh.)
3027fn lift_common_prefix(hirs: Vec<Hir>) -> Result<Hir, Vec<Hir>> {
3028 if hirs.len() <= 1 {
3029 return Err(hirs);
3030 }
3031 let mut prefix = match hirs[0].kind() {
3032 HirKind::Concat(ref xs) => &**xs,
3033 _ => return Err(hirs),
3034 };
3035 if prefix.is_empty() {
3036 return Err(hirs);
3037 }
3038 for h in hirs.iter().skip(1) {
3039 let concat = match h.kind() {
3040 HirKind::Concat(ref xs) => xs,
3041 _ => return Err(hirs),
3042 };
3043 let common_len = prefix
3044 .iter()
3045 .zip(concat.iter())
3046 .take_while(|(x, y)| x == y)
3047 .count();
3048 prefix = &prefix[..common_len];
3049 if prefix.is_empty() {
3050 return Err(hirs);
3051 }
3052 }
3053 let len = prefix.len();
3054 assert_ne!(0, len);
3055 let mut prefix_concat = vec![];
3056 let mut suffix_alts = vec![];
3057 for h in hirs {
3058 let mut concat = match h.into_kind() {
3059 HirKind::Concat(xs) => xs,
3060 // We required all sub-expressions to be
3061 // concats above, so we're only here if we
3062 // have a concat.
3063 _ => unreachable!(),
3064 };
3065 suffix_alts.push(Hir::concat(concat.split_off(len)));
3066 if prefix_concat.is_empty() {
3067 prefix_concat = concat;
3068 }
3069 }
3070 let mut concat = prefix_concat;
3071 concat.push(Hir::alternation(suffix_alts));
3072 Ok(Hir::concat(concat))
3073}
3074
3075#[cfg(test)]
3076mod tests {
3077 use super::*;
3078
3079 fn uclass(ranges: &[(char, char)]) -> ClassUnicode {
3080 let ranges: Vec<ClassUnicodeRange> = ranges
3081 .iter()
3082 .map(|&(s, e)| ClassUnicodeRange::new(s, e))
3083 .collect();
3084 ClassUnicode::new(ranges)
3085 }
3086
3087 fn bclass(ranges: &[(u8, u8)]) -> ClassBytes {
3088 let ranges: Vec<ClassBytesRange> =
3089 ranges.iter().map(|&(s, e)| ClassBytesRange::new(s, e)).collect();
3090 ClassBytes::new(ranges)
3091 }
3092
3093 fn uranges(cls: &ClassUnicode) -> Vec<(char, char)> {
3094 cls.iter().map(|x| (x.start(), x.end())).collect()
3095 }
3096
3097 #[cfg(feature = "unicode-case")]
3098 fn ucasefold(cls: &ClassUnicode) -> ClassUnicode {
3099 let mut cls_ = cls.clone();
3100 cls_.case_fold_simple();
3101 cls_
3102 }
3103
3104 fn uunion(cls1: &ClassUnicode, cls2: &ClassUnicode) -> ClassUnicode {
3105 let mut cls_ = cls1.clone();
3106 cls_.union(cls2);
3107 cls_
3108 }
3109
3110 fn uintersect(cls1: &ClassUnicode, cls2: &ClassUnicode) -> ClassUnicode {
3111 let mut cls_ = cls1.clone();
3112 cls_.intersect(cls2);
3113 cls_
3114 }
3115
3116 fn udifference(cls1: &ClassUnicode, cls2: &ClassUnicode) -> ClassUnicode {
3117 let mut cls_ = cls1.clone();
3118 cls_.difference(cls2);
3119 cls_
3120 }
3121
3122 fn usymdifference(
3123 cls1: &ClassUnicode,
3124 cls2: &ClassUnicode,
3125 ) -> ClassUnicode {
3126 let mut cls_ = cls1.clone();
3127 cls_.symmetric_difference(cls2);
3128 cls_
3129 }
3130
3131 fn unegate(cls: &ClassUnicode) -> ClassUnicode {
3132 let mut cls_ = cls.clone();
3133 cls_.negate();
3134 cls_
3135 }
3136
3137 fn branges(cls: &ClassBytes) -> Vec<(u8, u8)> {
3138 cls.iter().map(|x| (x.start(), x.end())).collect()
3139 }
3140
3141 fn bcasefold(cls: &ClassBytes) -> ClassBytes {
3142 let mut cls_ = cls.clone();
3143 cls_.case_fold_simple();
3144 cls_
3145 }
3146
3147 fn bunion(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes {
3148 let mut cls_ = cls1.clone();
3149 cls_.union(cls2);
3150 cls_
3151 }
3152
3153 fn bintersect(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes {
3154 let mut cls_ = cls1.clone();
3155 cls_.intersect(cls2);
3156 cls_
3157 }
3158
3159 fn bdifference(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes {
3160 let mut cls_ = cls1.clone();
3161 cls_.difference(cls2);
3162 cls_
3163 }
3164
3165 fn bsymdifference(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes {
3166 let mut cls_ = cls1.clone();
3167 cls_.symmetric_difference(cls2);
3168 cls_
3169 }
3170
3171 fn bnegate(cls: &ClassBytes) -> ClassBytes {
3172 let mut cls_ = cls.clone();
3173 cls_.negate();
3174 cls_
3175 }
3176
3177 #[test]
3178 fn class_range_canonical_unicode() {
3179 let range = ClassUnicodeRange::new('\u{00FF}', '\0');
3180 assert_eq!('\0', range.start());
3181 assert_eq!('\u{00FF}', range.end());
3182 }
3183
3184 #[test]
3185 fn class_range_canonical_bytes() {
3186 let range = ClassBytesRange::new(b'\xFF', b'\0');
3187 assert_eq!(b'\0', range.start());
3188 assert_eq!(b'\xFF', range.end());
3189 }
3190
3191 #[test]
3192 fn class_canonicalize_unicode() {
3193 let cls = uclass(&[('a', 'c'), ('x', 'z')]);
3194 let expected = vec![('a', 'c'), ('x', 'z')];
3195 assert_eq!(expected, uranges(&cls));
3196
3197 let cls = uclass(&[('x', 'z'), ('a', 'c')]);
3198 let expected = vec![('a', 'c'), ('x', 'z')];
3199 assert_eq!(expected, uranges(&cls));
3200
3201 let cls = uclass(&[('x', 'z'), ('w', 'y')]);
3202 let expected = vec![('w', 'z')];
3203 assert_eq!(expected, uranges(&cls));
3204
3205 let cls = uclass(&[
3206 ('c', 'f'),
3207 ('a', 'g'),
3208 ('d', 'j'),
3209 ('a', 'c'),
3210 ('m', 'p'),
3211 ('l', 's'),
3212 ]);
3213 let expected = vec![('a', 'j'), ('l', 's')];
3214 assert_eq!(expected, uranges(&cls));
3215
3216 let cls = uclass(&[('x', 'z'), ('u', 'w')]);
3217 let expected = vec![('u', 'z')];
3218 assert_eq!(expected, uranges(&cls));
3219
3220 let cls = uclass(&[('\x00', '\u{10FFFF}'), ('\x00', '\u{10FFFF}')]);
3221 let expected = vec![('\x00', '\u{10FFFF}')];
3222 assert_eq!(expected, uranges(&cls));
3223
3224 let cls = uclass(&[('a', 'a'), ('b', 'b')]);
3225 let expected = vec![('a', 'b')];
3226 assert_eq!(expected, uranges(&cls));
3227 }
3228
3229 #[test]
3230 fn class_canonicalize_bytes() {
3231 let cls = bclass(&[(b'a', b'c'), (b'x', b'z')]);
3232 let expected = vec![(b'a', b'c'), (b'x', b'z')];
3233 assert_eq!(expected, branges(&cls));
3234
3235 let cls = bclass(&[(b'x', b'z'), (b'a', b'c')]);
3236 let expected = vec![(b'a', b'c'), (b'x', b'z')];
3237 assert_eq!(expected, branges(&cls));
3238
3239 let cls = bclass(&[(b'x', b'z'), (b'w', b'y')]);
3240 let expected = vec![(b'w', b'z')];
3241 assert_eq!(expected, branges(&cls));
3242
3243 let cls = bclass(&[
3244 (b'c', b'f'),
3245 (b'a', b'g'),
3246 (b'd', b'j'),
3247 (b'a', b'c'),
3248 (b'm', b'p'),
3249 (b'l', b's'),
3250 ]);
3251 let expected = vec![(b'a', b'j'), (b'l', b's')];
3252 assert_eq!(expected, branges(&cls));
3253
3254 let cls = bclass(&[(b'x', b'z'), (b'u', b'w')]);
3255 let expected = vec![(b'u', b'z')];
3256 assert_eq!(expected, branges(&cls));
3257
3258 let cls = bclass(&[(b'\x00', b'\xFF'), (b'\x00', b'\xFF')]);
3259 let expected = vec![(b'\x00', b'\xFF')];
3260 assert_eq!(expected, branges(&cls));
3261
3262 let cls = bclass(&[(b'a', b'a'), (b'b', b'b')]);
3263 let expected = vec![(b'a', b'b')];
3264 assert_eq!(expected, branges(&cls));
3265 }
3266
3267 #[test]
3268 #[cfg(feature = "unicode-case")]
3269 fn class_case_fold_unicode() {
3270 let cls = uclass(&[
3271 ('C', 'F'),
3272 ('A', 'G'),
3273 ('D', 'J'),
3274 ('A', 'C'),
3275 ('M', 'P'),
3276 ('L', 'S'),
3277 ('c', 'f'),
3278 ]);
3279 let expected = uclass(&[
3280 ('A', 'J'),
3281 ('L', 'S'),
3282 ('a', 'j'),
3283 ('l', 's'),
3284 ('\u{17F}', '\u{17F}'),
3285 ]);
3286 assert_eq!(expected, ucasefold(&cls));
3287
3288 let cls = uclass(&[('A', 'Z')]);
3289 let expected = uclass(&[
3290 ('A', 'Z'),
3291 ('a', 'z'),
3292 ('\u{17F}', '\u{17F}'),
3293 ('\u{212A}', '\u{212A}'),
3294 ]);
3295 assert_eq!(expected, ucasefold(&cls));
3296
3297 let cls = uclass(&[('a', 'z')]);
3298 let expected = uclass(&[
3299 ('A', 'Z'),
3300 ('a', 'z'),
3301 ('\u{17F}', '\u{17F}'),
3302 ('\u{212A}', '\u{212A}'),
3303 ]);
3304 assert_eq!(expected, ucasefold(&cls));
3305
3306 let cls = uclass(&[('A', 'A'), ('_', '_')]);
3307 let expected = uclass(&[('A', 'A'), ('_', '_'), ('a', 'a')]);
3308 assert_eq!(expected, ucasefold(&cls));
3309
3310 let cls = uclass(&[('A', 'A'), ('=', '=')]);
3311 let expected = uclass(&[('=', '='), ('A', 'A'), ('a', 'a')]);
3312 assert_eq!(expected, ucasefold(&cls));
3313
3314 let cls = uclass(&[('\x00', '\x10')]);
3315 assert_eq!(cls, ucasefold(&cls));
3316
3317 let cls = uclass(&[('k', 'k')]);
3318 let expected =
3319 uclass(&[('K', 'K'), ('k', 'k'), ('\u{212A}', '\u{212A}')]);
3320 assert_eq!(expected, ucasefold(&cls));
3321
3322 let cls = uclass(&[('@', '@')]);
3323 assert_eq!(cls, ucasefold(&cls));
3324 }
3325
3326 #[test]
3327 #[cfg(not(feature = "unicode-case"))]
3328 fn class_case_fold_unicode_disabled() {
3329 let mut cls = uclass(&[
3330 ('C', 'F'),
3331 ('A', 'G'),
3332 ('D', 'J'),
3333 ('A', 'C'),
3334 ('M', 'P'),
3335 ('L', 'S'),
3336 ('c', 'f'),
3337 ]);
3338 assert!(cls.try_case_fold_simple().is_err());
3339 }
3340
3341 #[test]
3342 #[should_panic]
3343 #[cfg(not(feature = "unicode-case"))]
3344 fn class_case_fold_unicode_disabled_panics() {
3345 let mut cls = uclass(&[
3346 ('C', 'F'),
3347 ('A', 'G'),
3348 ('D', 'J'),
3349 ('A', 'C'),
3350 ('M', 'P'),
3351 ('L', 'S'),
3352 ('c', 'f'),
3353 ]);
3354 cls.case_fold_simple();
3355 }
3356
3357 #[test]
3358 fn class_case_fold_bytes() {
3359 let cls = bclass(&[
3360 (b'C', b'F'),
3361 (b'A', b'G'),
3362 (b'D', b'J'),
3363 (b'A', b'C'),
3364 (b'M', b'P'),
3365 (b'L', b'S'),
3366 (b'c', b'f'),
3367 ]);
3368 let expected =
3369 bclass(&[(b'A', b'J'), (b'L', b'S'), (b'a', b'j'), (b'l', b's')]);
3370 assert_eq!(expected, bcasefold(&cls));
3371
3372 let cls = bclass(&[(b'A', b'Z')]);
3373 let expected = bclass(&[(b'A', b'Z'), (b'a', b'z')]);
3374 assert_eq!(expected, bcasefold(&cls));
3375
3376 let cls = bclass(&[(b'a', b'z')]);
3377 let expected = bclass(&[(b'A', b'Z'), (b'a', b'z')]);
3378 assert_eq!(expected, bcasefold(&cls));
3379
3380 let cls = bclass(&[(b'A', b'A'), (b'_', b'_')]);
3381 let expected = bclass(&[(b'A', b'A'), (b'_', b'_'), (b'a', b'a')]);
3382 assert_eq!(expected, bcasefold(&cls));
3383
3384 let cls = bclass(&[(b'A', b'A'), (b'=', b'=')]);
3385 let expected = bclass(&[(b'=', b'='), (b'A', b'A'), (b'a', b'a')]);
3386 assert_eq!(expected, bcasefold(&cls));
3387
3388 let cls = bclass(&[(b'\x00', b'\x10')]);
3389 assert_eq!(cls, bcasefold(&cls));
3390
3391 let cls = bclass(&[(b'k', b'k')]);
3392 let expected = bclass(&[(b'K', b'K'), (b'k', b'k')]);
3393 assert_eq!(expected, bcasefold(&cls));
3394
3395 let cls = bclass(&[(b'@', b'@')]);
3396 assert_eq!(cls, bcasefold(&cls));
3397 }
3398
3399 #[test]
3400 fn class_negate_unicode() {
3401 let cls = uclass(&[('a', 'a')]);
3402 let expected = uclass(&[('\x00', '\x60'), ('\x62', '\u{10FFFF}')]);
3403 assert_eq!(expected, unegate(&cls));
3404
3405 let cls = uclass(&[('a', 'a'), ('b', 'b')]);
3406 let expected = uclass(&[('\x00', '\x60'), ('\x63', '\u{10FFFF}')]);
3407 assert_eq!(expected, unegate(&cls));
3408
3409 let cls = uclass(&[('a', 'c'), ('x', 'z')]);
3410 let expected = uclass(&[
3411 ('\x00', '\x60'),
3412 ('\x64', '\x77'),
3413 ('\x7B', '\u{10FFFF}'),
3414 ]);
3415 assert_eq!(expected, unegate(&cls));
3416
3417 let cls = uclass(&[('\x00', 'a')]);
3418 let expected = uclass(&[('\x62', '\u{10FFFF}')]);
3419 assert_eq!(expected, unegate(&cls));
3420
3421 let cls = uclass(&[('a', '\u{10FFFF}')]);
3422 let expected = uclass(&[('\x00', '\x60')]);
3423 assert_eq!(expected, unegate(&cls));
3424
3425 let cls = uclass(&[('\x00', '\u{10FFFF}')]);
3426 let expected = uclass(&[]);
3427 assert_eq!(expected, unegate(&cls));
3428
3429 let cls = uclass(&[]);
3430 let expected = uclass(&[('\x00', '\u{10FFFF}')]);
3431 assert_eq!(expected, unegate(&cls));
3432
3433 let cls =
3434 uclass(&[('\x00', '\u{10FFFD}'), ('\u{10FFFF}', '\u{10FFFF}')]);
3435 let expected = uclass(&[('\u{10FFFE}', '\u{10FFFE}')]);
3436 assert_eq!(expected, unegate(&cls));
3437
3438 let cls = uclass(&[('\x00', '\u{D7FF}')]);
3439 let expected = uclass(&[('\u{E000}', '\u{10FFFF}')]);
3440 assert_eq!(expected, unegate(&cls));
3441
3442 let cls = uclass(&[('\x00', '\u{D7FE}')]);
3443 let expected = uclass(&[('\u{D7FF}', '\u{10FFFF}')]);
3444 assert_eq!(expected, unegate(&cls));
3445
3446 let cls = uclass(&[('\u{E000}', '\u{10FFFF}')]);
3447 let expected = uclass(&[('\x00', '\u{D7FF}')]);
3448 assert_eq!(expected, unegate(&cls));
3449
3450 let cls = uclass(&[('\u{E001}', '\u{10FFFF}')]);
3451 let expected = uclass(&[('\x00', '\u{E000}')]);
3452 assert_eq!(expected, unegate(&cls));
3453 }
3454
3455 #[test]
3456 fn class_negate_bytes() {
3457 let cls = bclass(&[(b'a', b'a')]);
3458 let expected = bclass(&[(b'\x00', b'\x60'), (b'\x62', b'\xFF')]);
3459 assert_eq!(expected, bnegate(&cls));
3460
3461 let cls = bclass(&[(b'a', b'a'), (b'b', b'b')]);
3462 let expected = bclass(&[(b'\x00', b'\x60'), (b'\x63', b'\xFF')]);
3463 assert_eq!(expected, bnegate(&cls));
3464
3465 let cls = bclass(&[(b'a', b'c'), (b'x', b'z')]);
3466 let expected = bclass(&[
3467 (b'\x00', b'\x60'),
3468 (b'\x64', b'\x77'),
3469 (b'\x7B', b'\xFF'),
3470 ]);
3471 assert_eq!(expected, bnegate(&cls));
3472
3473 let cls = bclass(&[(b'\x00', b'a')]);
3474 let expected = bclass(&[(b'\x62', b'\xFF')]);
3475 assert_eq!(expected, bnegate(&cls));
3476
3477 let cls = bclass(&[(b'a', b'\xFF')]);
3478 let expected = bclass(&[(b'\x00', b'\x60')]);
3479 assert_eq!(expected, bnegate(&cls));
3480
3481 let cls = bclass(&[(b'\x00', b'\xFF')]);
3482 let expected = bclass(&[]);
3483 assert_eq!(expected, bnegate(&cls));
3484
3485 let cls = bclass(&[]);
3486 let expected = bclass(&[(b'\x00', b'\xFF')]);
3487 assert_eq!(expected, bnegate(&cls));
3488
3489 let cls = bclass(&[(b'\x00', b'\xFD'), (b'\xFF', b'\xFF')]);
3490 let expected = bclass(&[(b'\xFE', b'\xFE')]);
3491 assert_eq!(expected, bnegate(&cls));
3492 }
3493
3494 #[test]
3495 fn class_union_unicode() {
3496 let cls1 = uclass(&[('a', 'g'), ('m', 't'), ('A', 'C')]);
3497 let cls2 = uclass(&[('a', 'z')]);
3498 let expected = uclass(&[('a', 'z'), ('A', 'C')]);
3499 assert_eq!(expected, uunion(&cls1, &cls2));
3500 }
3501
3502 #[test]
3503 fn class_union_bytes() {
3504 let cls1 = bclass(&[(b'a', b'g'), (b'm', b't'), (b'A', b'C')]);
3505 let cls2 = bclass(&[(b'a', b'z')]);
3506 let expected = bclass(&[(b'a', b'z'), (b'A', b'C')]);
3507 assert_eq!(expected, bunion(&cls1, &cls2));
3508 }
3509
3510 #[test]
3511 fn class_intersect_unicode() {
3512 let cls1 = uclass(&[]);
3513 let cls2 = uclass(&[('a', 'a')]);
3514 let expected = uclass(&[]);
3515 assert_eq!(expected, uintersect(&cls1, &cls2));
3516
3517 let cls1 = uclass(&[('a', 'a')]);
3518 let cls2 = uclass(&[('a', 'a')]);
3519 let expected = uclass(&[('a', 'a')]);
3520 assert_eq!(expected, uintersect(&cls1, &cls2));
3521
3522 let cls1 = uclass(&[('a', 'a')]);
3523 let cls2 = uclass(&[('b', 'b')]);
3524 let expected = uclass(&[]);
3525 assert_eq!(expected, uintersect(&cls1, &cls2));
3526
3527 let cls1 = uclass(&[('a', 'a')]);
3528 let cls2 = uclass(&[('a', 'c')]);
3529 let expected = uclass(&[('a', 'a')]);
3530 assert_eq!(expected, uintersect(&cls1, &cls2));
3531
3532 let cls1 = uclass(&[('a', 'b')]);
3533 let cls2 = uclass(&[('a', 'c')]);
3534 let expected = uclass(&[('a', 'b')]);
3535 assert_eq!(expected, uintersect(&cls1, &cls2));
3536
3537 let cls1 = uclass(&[('a', 'b')]);
3538 let cls2 = uclass(&[('b', 'c')]);
3539 let expected = uclass(&[('b', 'b')]);
3540 assert_eq!(expected, uintersect(&cls1, &cls2));
3541
3542 let cls1 = uclass(&[('a', 'b')]);
3543 let cls2 = uclass(&[('c', 'd')]);
3544 let expected = uclass(&[]);
3545 assert_eq!(expected, uintersect(&cls1, &cls2));
3546
3547 let cls1 = uclass(&[('b', 'c')]);
3548 let cls2 = uclass(&[('a', 'd')]);
3549 let expected = uclass(&[('b', 'c')]);
3550 assert_eq!(expected, uintersect(&cls1, &cls2));
3551
3552 let cls1 = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
3553 let cls2 = uclass(&[('a', 'h')]);
3554 let expected = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
3555 assert_eq!(expected, uintersect(&cls1, &cls2));
3556
3557 let cls1 = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
3558 let cls2 = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
3559 let expected = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
3560 assert_eq!(expected, uintersect(&cls1, &cls2));
3561
3562 let cls1 = uclass(&[('a', 'b'), ('g', 'h')]);
3563 let cls2 = uclass(&[('d', 'e'), ('k', 'l')]);
3564 let expected = uclass(&[]);
3565 assert_eq!(expected, uintersect(&cls1, &cls2));
3566
3567 let cls1 = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
3568 let cls2 = uclass(&[('h', 'h')]);
3569 let expected = uclass(&[('h', 'h')]);
3570 assert_eq!(expected, uintersect(&cls1, &cls2));
3571
3572 let cls1 = uclass(&[('a', 'b'), ('e', 'f'), ('i', 'j')]);
3573 let cls2 = uclass(&[('c', 'd'), ('g', 'h'), ('k', 'l')]);
3574 let expected = uclass(&[]);
3575 assert_eq!(expected, uintersect(&cls1, &cls2));
3576
3577 let cls1 = uclass(&[('a', 'b'), ('c', 'd'), ('e', 'f')]);
3578 let cls2 = uclass(&[('b', 'c'), ('d', 'e'), ('f', 'g')]);
3579 let expected = uclass(&[('b', 'f')]);
3580 assert_eq!(expected, uintersect(&cls1, &cls2));
3581 }
3582
3583 #[test]
3584 fn class_intersect_bytes() {
3585 let cls1 = bclass(&[]);
3586 let cls2 = bclass(&[(b'a', b'a')]);
3587 let expected = bclass(&[]);
3588 assert_eq!(expected, bintersect(&cls1, &cls2));
3589
3590 let cls1 = bclass(&[(b'a', b'a')]);
3591 let cls2 = bclass(&[(b'a', b'a')]);
3592 let expected = bclass(&[(b'a', b'a')]);
3593 assert_eq!(expected, bintersect(&cls1, &cls2));
3594
3595 let cls1 = bclass(&[(b'a', b'a')]);
3596 let cls2 = bclass(&[(b'b', b'b')]);
3597 let expected = bclass(&[]);
3598 assert_eq!(expected, bintersect(&cls1, &cls2));
3599
3600 let cls1 = bclass(&[(b'a', b'a')]);
3601 let cls2 = bclass(&[(b'a', b'c')]);
3602 let expected = bclass(&[(b'a', b'a')]);
3603 assert_eq!(expected, bintersect(&cls1, &cls2));
3604
3605 let cls1 = bclass(&[(b'a', b'b')]);
3606 let cls2 = bclass(&[(b'a', b'c')]);
3607 let expected = bclass(&[(b'a', b'b')]);
3608 assert_eq!(expected, bintersect(&cls1, &cls2));
3609
3610 let cls1 = bclass(&[(b'a', b'b')]);
3611 let cls2 = bclass(&[(b'b', b'c')]);
3612 let expected = bclass(&[(b'b', b'b')]);
3613 assert_eq!(expected, bintersect(&cls1, &cls2));
3614
3615 let cls1 = bclass(&[(b'a', b'b')]);
3616 let cls2 = bclass(&[(b'c', b'd')]);
3617 let expected = bclass(&[]);
3618 assert_eq!(expected, bintersect(&cls1, &cls2));
3619
3620 let cls1 = bclass(&[(b'b', b'c')]);
3621 let cls2 = bclass(&[(b'a', b'd')]);
3622 let expected = bclass(&[(b'b', b'c')]);
3623 assert_eq!(expected, bintersect(&cls1, &cls2));
3624
3625 let cls1 = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]);
3626 let cls2 = bclass(&[(b'a', b'h')]);
3627 let expected = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]);
3628 assert_eq!(expected, bintersect(&cls1, &cls2));
3629
3630 let cls1 = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]);
3631 let cls2 = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]);
3632 let expected = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]);
3633 assert_eq!(expected, bintersect(&cls1, &cls2));
3634
3635 let cls1 = bclass(&[(b'a', b'b'), (b'g', b'h')]);
3636 let cls2 = bclass(&[(b'd', b'e'), (b'k', b'l')]);
3637 let expected = bclass(&[]);
3638 assert_eq!(expected, bintersect(&cls1, &cls2));
3639
3640 let cls1 = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]);
3641 let cls2 = bclass(&[(b'h', b'h')]);
3642 let expected = bclass(&[(b'h', b'h')]);
3643 assert_eq!(expected, bintersect(&cls1, &cls2));
3644
3645 let cls1 = bclass(&[(b'a', b'b'), (b'e', b'f'), (b'i', b'j')]);
3646 let cls2 = bclass(&[(b'c', b'd'), (b'g', b'h'), (b'k', b'l')]);
3647 let expected = bclass(&[]);
3648 assert_eq!(expected, bintersect(&cls1, &cls2));
3649
3650 let cls1 = bclass(&[(b'a', b'b'), (b'c', b'd'), (b'e', b'f')]);
3651 let cls2 = bclass(&[(b'b', b'c'), (b'd', b'e'), (b'f', b'g')]);
3652 let expected = bclass(&[(b'b', b'f')]);
3653 assert_eq!(expected, bintersect(&cls1, &cls2));
3654 }
3655
3656 #[test]
3657 fn class_difference_unicode() {
3658 let cls1 = uclass(&[('a', 'a')]);
3659 let cls2 = uclass(&[('a', 'a')]);
3660 let expected = uclass(&[]);
3661 assert_eq!(expected, udifference(&cls1, &cls2));
3662
3663 let cls1 = uclass(&[('a', 'a')]);
3664 let cls2 = uclass(&[]);
3665 let expected = uclass(&[('a', 'a')]);
3666 assert_eq!(expected, udifference(&cls1, &cls2));
3667
3668 let cls1 = uclass(&[]);
3669 let cls2 = uclass(&[('a', 'a')]);
3670 let expected = uclass(&[]);
3671 assert_eq!(expected, udifference(&cls1, &cls2));
3672
3673 let cls1 = uclass(&[('a', 'z')]);
3674 let cls2 = uclass(&[('a', 'a')]);
3675 let expected = uclass(&[('b', 'z')]);
3676 assert_eq!(expected, udifference(&cls1, &cls2));
3677
3678 let cls1 = uclass(&[('a', 'z')]);
3679 let cls2 = uclass(&[('z', 'z')]);
3680 let expected = uclass(&[('a', 'y')]);
3681 assert_eq!(expected, udifference(&cls1, &cls2));
3682
3683 let cls1 = uclass(&[('a', 'z')]);
3684 let cls2 = uclass(&[('m', 'm')]);
3685 let expected = uclass(&[('a', 'l'), ('n', 'z')]);
3686 assert_eq!(expected, udifference(&cls1, &cls2));
3687
3688 let cls1 = uclass(&[('a', 'c'), ('g', 'i'), ('r', 't')]);
3689 let cls2 = uclass(&[('a', 'z')]);
3690 let expected = uclass(&[]);
3691 assert_eq!(expected, udifference(&cls1, &cls2));
3692
3693 let cls1 = uclass(&[('a', 'c'), ('g', 'i'), ('r', 't')]);
3694 let cls2 = uclass(&[('d', 'v')]);
3695 let expected = uclass(&[('a', 'c')]);
3696 assert_eq!(expected, udifference(&cls1, &cls2));
3697
3698 let cls1 = uclass(&[('a', 'c'), ('g', 'i'), ('r', 't')]);
3699 let cls2 = uclass(&[('b', 'g'), ('s', 'u')]);
3700 let expected = uclass(&[('a', 'a'), ('h', 'i'), ('r', 'r')]);
3701 assert_eq!(expected, udifference(&cls1, &cls2));
3702
3703 let cls1 = uclass(&[('a', 'c'), ('g', 'i'), ('r', 't')]);
3704 let cls2 = uclass(&[('b', 'd'), ('e', 'g'), ('s', 'u')]);
3705 let expected = uclass(&[('a', 'a'), ('h', 'i'), ('r', 'r')]);
3706 assert_eq!(expected, udifference(&cls1, &cls2));
3707
3708 let cls1 = uclass(&[('x', 'z')]);
3709 let cls2 = uclass(&[('a', 'c'), ('e', 'g'), ('s', 'u')]);
3710 let expected = uclass(&[('x', 'z')]);
3711 assert_eq!(expected, udifference(&cls1, &cls2));
3712
3713 let cls1 = uclass(&[('a', 'z')]);
3714 let cls2 = uclass(&[('a', 'c'), ('e', 'g'), ('s', 'u')]);
3715 let expected = uclass(&[('d', 'd'), ('h', 'r'), ('v', 'z')]);
3716 assert_eq!(expected, udifference(&cls1, &cls2));
3717 }
3718
3719 #[test]
3720 fn class_difference_bytes() {
3721 let cls1 = bclass(&[(b'a', b'a')]);
3722 let cls2 = bclass(&[(b'a', b'a')]);
3723 let expected = bclass(&[]);
3724 assert_eq!(expected, bdifference(&cls1, &cls2));
3725
3726 let cls1 = bclass(&[(b'a', b'a')]);
3727 let cls2 = bclass(&[]);
3728 let expected = bclass(&[(b'a', b'a')]);
3729 assert_eq!(expected, bdifference(&cls1, &cls2));
3730
3731 let cls1 = bclass(&[]);
3732 let cls2 = bclass(&[(b'a', b'a')]);
3733 let expected = bclass(&[]);
3734 assert_eq!(expected, bdifference(&cls1, &cls2));
3735
3736 let cls1 = bclass(&[(b'a', b'z')]);
3737 let cls2 = bclass(&[(b'a', b'a')]);
3738 let expected = bclass(&[(b'b', b'z')]);
3739 assert_eq!(expected, bdifference(&cls1, &cls2));
3740
3741 let cls1 = bclass(&[(b'a', b'z')]);
3742 let cls2 = bclass(&[(b'z', b'z')]);
3743 let expected = bclass(&[(b'a', b'y')]);
3744 assert_eq!(expected, bdifference(&cls1, &cls2));
3745
3746 let cls1 = bclass(&[(b'a', b'z')]);
3747 let cls2 = bclass(&[(b'm', b'm')]);
3748 let expected = bclass(&[(b'a', b'l'), (b'n', b'z')]);
3749 assert_eq!(expected, bdifference(&cls1, &cls2));
3750
3751 let cls1 = bclass(&[(b'a', b'c'), (b'g', b'i'), (b'r', b't')]);
3752 let cls2 = bclass(&[(b'a', b'z')]);
3753 let expected = bclass(&[]);
3754 assert_eq!(expected, bdifference(&cls1, &cls2));
3755
3756 let cls1 = bclass(&[(b'a', b'c'), (b'g', b'i'), (b'r', b't')]);
3757 let cls2 = bclass(&[(b'd', b'v')]);
3758 let expected = bclass(&[(b'a', b'c')]);
3759 assert_eq!(expected, bdifference(&cls1, &cls2));
3760
3761 let cls1 = bclass(&[(b'a', b'c'), (b'g', b'i'), (b'r', b't')]);
3762 let cls2 = bclass(&[(b'b', b'g'), (b's', b'u')]);
3763 let expected = bclass(&[(b'a', b'a'), (b'h', b'i'), (b'r', b'r')]);
3764 assert_eq!(expected, bdifference(&cls1, &cls2));
3765
3766 let cls1 = bclass(&[(b'a', b'c'), (b'g', b'i'), (b'r', b't')]);
3767 let cls2 = bclass(&[(b'b', b'd'), (b'e', b'g'), (b's', b'u')]);
3768 let expected = bclass(&[(b'a', b'a'), (b'h', b'i'), (b'r', b'r')]);
3769 assert_eq!(expected, bdifference(&cls1, &cls2));
3770
3771 let cls1 = bclass(&[(b'x', b'z')]);
3772 let cls2 = bclass(&[(b'a', b'c'), (b'e', b'g'), (b's', b'u')]);
3773 let expected = bclass(&[(b'x', b'z')]);
3774 assert_eq!(expected, bdifference(&cls1, &cls2));
3775
3776 let cls1 = bclass(&[(b'a', b'z')]);
3777 let cls2 = bclass(&[(b'a', b'c'), (b'e', b'g'), (b's', b'u')]);
3778 let expected = bclass(&[(b'd', b'd'), (b'h', b'r'), (b'v', b'z')]);
3779 assert_eq!(expected, bdifference(&cls1, &cls2));
3780 }
3781
3782 #[test]
3783 fn class_symmetric_difference_unicode() {
3784 let cls1 = uclass(&[('a', 'm')]);
3785 let cls2 = uclass(&[('g', 't')]);
3786 let expected = uclass(&[('a', 'f'), ('n', 't')]);
3787 assert_eq!(expected, usymdifference(&cls1, &cls2));
3788 }
3789
3790 #[test]
3791 fn class_symmetric_difference_bytes() {
3792 let cls1 = bclass(&[(b'a', b'm')]);
3793 let cls2 = bclass(&[(b'g', b't')]);
3794 let expected = bclass(&[(b'a', b'f'), (b'n', b't')]);
3795 assert_eq!(expected, bsymdifference(&cls1, &cls2));
3796 }
3797
3798 // We use a thread with an explicit stack size to test that our destructor
3799 // for Hir can handle arbitrarily sized expressions in constant stack
3800 // space. In case we run on a platform without threads (WASM?), we limit
3801 // this test to Windows/Unix.
3802 #[test]
3803 #[cfg(any(unix, windows))]
3804 fn no_stack_overflow_on_drop() {
3805 use std::thread;
3806
3807 let run = || {
3808 let mut expr = Hir::empty();
3809 for _ in 0..100 {
3810 expr = Hir::capture(Capture {
3811 index: 1,
3812 name: None,
3813 sub: Box::new(expr),
3814 });
3815 expr = Hir::repetition(Repetition {
3816 min: 0,
3817 max: Some(1),
3818 greedy: true,
3819 sub: Box::new(expr),
3820 });
3821
3822 expr = Hir {
3823 kind: HirKind::Concat(vec![expr]),
3824 props: Properties::empty(),
3825 };
3826 expr = Hir {
3827 kind: HirKind::Alternation(vec![expr]),
3828 props: Properties::empty(),
3829 };
3830 }
3831 assert!(!matches!(*expr.kind(), HirKind::Empty));
3832 };
3833
3834 // We run our test on a thread with a small stack size so we can
3835 // force the issue more easily.
3836 //
3837 // NOTE(2023-03-21): See the corresponding test in 'crate::ast::tests'
3838 // for context on the specific stack size chosen here.
3839 thread::Builder::new()
3840 .stack_size(16 << 10)
3841 .spawn(run)
3842 .unwrap()
3843 .join()
3844 .unwrap();
3845 }
3846
3847 #[test]
3848 fn look_set_iter() {
3849 let set = LookSet::empty();
3850 assert_eq!(0, set.iter().count());
3851
3852 let set = LookSet::full();
3853 assert_eq!(18, set.iter().count());
3854
3855 let set =
3856 LookSet::empty().insert(Look::StartLF).insert(Look::WordUnicode);
3857 assert_eq!(2, set.iter().count());
3858
3859 let set = LookSet::empty().insert(Look::StartLF);
3860 assert_eq!(1, set.iter().count());
3861
3862 let set = LookSet::empty().insert(Look::WordAsciiNegate);
3863 assert_eq!(1, set.iter().count());
3864 }
3865
3866 #[test]
3867 fn look_set_debug() {
3868 let res = format!("{:?}", LookSet::empty());
3869 assert_eq!("∅", res);
3870 let res = format!("{:?}", LookSet::full());
3871 assert_eq!("Az^$rRbB𝛃𝚩<>〈〉◁▷◀▶", res);
3872 }
3873}