regex_syntax/
unicode.rs

1use alloc::{
2    string::{String, ToString},
3    vec::Vec,
4};
5
6use crate::hir;
7
/// A borrowed table of inclusive codepoint ranges.
///
/// Every `(char, char)` pair is one inclusive range. The tables come from
/// generated files, which is why the slice carries a static lifetime.
type Range = &'static [(char, char)];
11
/// The ways in which a Unicode class lookup in this module can fail.
///
/// The `Error` trait is deliberately not implemented: this type isn't
/// exported, and every value is converted into some other public error.
#[derive(Debug)]
pub enum Error {
    /// The requested property could not be found.
    PropertyNotFound,
    /// The requested property value could not be found.
    PropertyValueNotFound,
    /// The data backing a Perl character class is unavailable.
    // Not used when unicode-perl is enabled.
    #[allow(dead_code)]
    PerlClassNotFound,
}
24
/// The error reported when Unicode-aware simple case folding cannot be done.
///
/// Case folding needs the Unicode case mapping tables, which are only
/// compiled in when the `unicode-case` feature is enabled. (That feature is
/// on by default, so this error only shows up when it has been disabled.)
#[derive(Debug)]
pub struct CaseFoldError(());

#[cfg(feature = "std")]
impl std::error::Error for CaseFoldError {}

impl core::fmt::Display for CaseFoldError {
    /// Writes a fixed, human readable explanation of the failure.
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        // The message has no interpolated values, so it is written directly.
        f.write_str(
            "Unicode-aware case folding is not available \
             (probably because the unicode-case feature is not enabled)",
        )
    }
}
45
/// The error reported when the Unicode-aware `\w` class cannot be used.
///
/// The Unicode definition of the Perl class `\w` needs data tables that are
/// only compiled in when the `unicode-perl` feature is enabled. (That feature
/// is on by default, so this error only shows up when it has been disabled.)
#[derive(Debug)]
pub struct UnicodeWordError(());

#[cfg(feature = "std")]
impl std::error::Error for UnicodeWordError {}

impl core::fmt::Display for UnicodeWordError {
    /// Writes a fixed, human readable explanation of the failure.
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        // The message has no interpolated values, so it is written directly.
        f.write_str(
            "Unicode-aware \\w class is not available \
             (probably because the unicode-perl feature is not enabled)",
        )
    }
}
66
67/// A state oriented traverser of the simple case folding table.
68///
69/// A case folder can be constructed via `SimpleCaseFolder::new()`, which will
70/// return an error if the underlying case folding table is unavailable.
71///
72/// After construction, it is expected that callers will use
73/// `SimpleCaseFolder::mapping` by calling it with codepoints in strictly
74/// increasing order. For example, calling it on `b` and then on `a` is illegal
75/// and will result in a panic.
76///
77/// The main idea of this type is that it tries hard to make mapping lookups
78/// fast by exploiting the structure of the underlying table, and the ordering
79/// assumption enables this.
80#[derive(Debug)]
81pub struct SimpleCaseFolder {
82    /// The simple case fold table. It's a sorted association list, where the
83    /// keys are Unicode scalar values and the values are the corresponding
84    /// equivalence class (not including the key) of the "simple" case folded
85    /// Unicode scalar values.
86    table: &'static [(char, &'static [char])],
87    /// The last codepoint that was used for a lookup.
88    last: Option<char>,
89    /// The index to the entry in `table` corresponding to the smallest key `k`
90    /// such that `k > k0`, where `k0` is the most recent key lookup. Note that
91    /// in particular, `k0` may not be in the table!
92    next: usize,
93}
94
95impl SimpleCaseFolder {
96    /// Create a new simple case folder, returning an error if the underlying
97    /// case folding table is unavailable.
98    pub fn new() -> Result<SimpleCaseFolder, CaseFoldError> {
99        #[cfg(not(feature = "unicode-case"))]
100        {
101            Err(CaseFoldError(()))
102        }
103        #[cfg(feature = "unicode-case")]
104        {
105            Ok(SimpleCaseFolder {
106                table: crate::unicode_tables::case_folding_simple::CASE_FOLDING_SIMPLE,
107                last: None,
108                next: 0,
109            })
110        }
111    }
112
113    /// Return the equivalence class of case folded codepoints for the given
114    /// codepoint. The equivalence class returned never includes the codepoint
115    /// given. If the given codepoint has no case folded codepoints (i.e.,
116    /// no entry in the underlying case folding table), then this returns an
117    /// empty slice.
118    ///
119    /// # Panics
120    ///
121    /// This panics when called with a `c` that is less than or equal to the
122    /// previous call. In other words, callers need to use this method with
123    /// strictly increasing values of `c`.
124    pub fn mapping(&mut self, c: char) -> &'static [char] {
125        if let Some(last) = self.last {
126            assert!(
127                last < c,
128                "got codepoint U+{:X} which occurs before \
129                 last codepoint U+{:X}",
130                u32::from(c),
131                u32::from(last),
132            );
133        }
134        self.last = Some(c);
135        if self.next >= self.table.len() {
136            return &[];
137        }
138        let (k, v) = self.table[self.next];
139        if k == c {
140            self.next += 1;
141            return v;
142        }
143        match self.get(c) {
144            Err(i) => {
145                self.next = i;
146                &[]
147            }
148            Ok(i) => {
149                // Since we require lookups to proceed
150                // in order, anything we find should be
151                // after whatever we thought might be
152                // next. Otherwise, the caller is either
153                // going out of order or we would have
154                // found our next key at 'self.next'.
155                assert!(i > self.next);
156                self.next = i + 1;
157                self.table[i].1
158            }
159        }
160    }
161
162    /// Returns true if and only if the given range overlaps with any region
163    /// of the underlying case folding table. That is, when true, there exists
164    /// at least one codepoint in the inclusive range `[start, end]` that has
165    /// a non-trivial equivalence class of case folded codepoints. Conversely,
166    /// when this returns false, all codepoints in the range `[start, end]`
167    /// correspond to the trivial equivalence class of case folded codepoints,
168    /// i.e., itself.
169    ///
170    /// This is useful to call before iterating over the codepoints in the
171    /// range and looking up the mapping for each. If you know none of the
172    /// mappings will return anything, then you might be able to skip doing it
173    /// altogether.
174    ///
175    /// # Panics
176    ///
177    /// This panics when `end < start`.
178    pub fn overlaps(&self, start: char, end: char) -> bool {
179        use core::cmp::Ordering;
180
181        assert!(start <= end);
182        self.table
183            .binary_search_by(|&(c, _)| {
184                if start <= c && c <= end {
185                    Ordering::Equal
186                } else if c > end {
187                    Ordering::Greater
188                } else {
189                    Ordering::Less
190                }
191            })
192            .is_ok()
193    }
194
195    /// Returns the index at which `c` occurs in the simple case fold table. If
196    /// `c` does not occur, then this returns an `i` such that `table[i-1].0 <
197    /// c` and `table[i].0 > c`.
198    fn get(&self, c: char) -> Result<usize, usize> {
199        self.table.binary_search_by_key(&c, |&(c1, _)| c1)
200    }
201}
202
/// Describes which Unicode-defined character class to look up.
///
/// A class can be requested either by a property name alone or by a property
/// name paired with a property value. The name-only form generally refers to
/// Binary properties (see UTS#44, Table 8). As a special exception (see
/// UTS#18, Section 1.2), general categories (an enumeration) and scripts (a
/// catalog) may also be named directly, as if each of their possible values
/// were a binary property.
///
/// Property names and values are always normalized and canonicalized, so
/// `GC == gc == GeneralCategory == general_category`.
///
/// The lifetime `'a` is the shorter of the lifetimes of the property name
/// and the property value.
#[derive(Debug)]
pub enum ClassQuery<'a> {
    /// Look up a Unicode binary property whose name is a single letter.
    OneLetter(char),
    /// Look up a Unicode binary property by name.
    ///
    /// By special exception (see UTS#18, Section 1.2), general category
    /// values and script values are accepted here too, as if they were
    /// binary properties.
    Binary(&'a str),
    /// Look up every codepoint for which the property named by
    /// `property_name` has the value named by `property_value`.
    ByValue {
        /// The name of the property.
        property_name: &'a str,
        /// The value of the property.
        property_value: &'a str,
    },
}
236
237impl<'a> ClassQuery<'a> {
238    fn canonicalize(&self) -> Result<CanonicalClassQuery, Error> {
239        match *self {
240            ClassQuery::OneLetter(c) => self.canonical_binary(&c.to_string()),
241            ClassQuery::Binary(name) => self.canonical_binary(name),
242            ClassQuery::ByValue { property_name, property_value } => {
243                let property_name = symbolic_name_normalize(property_name);
244                let property_value = symbolic_name_normalize(property_value);
245
246                let canon_name = match canonical_prop(&property_name)? {
247                    None => return Err(Error::PropertyNotFound),
248                    Some(canon_name) => canon_name,
249                };
250                Ok(match canon_name {
251                    "General_Category" => {
252                        let canon = match canonical_gencat(&property_value)? {
253                            None => return Err(Error::PropertyValueNotFound),
254                            Some(canon) => canon,
255                        };
256                        CanonicalClassQuery::GeneralCategory(canon)
257                    }
258                    "Script" => {
259                        let canon = match canonical_script(&property_value)? {
260                            None => return Err(Error::PropertyValueNotFound),
261                            Some(canon) => canon,
262                        };
263                        CanonicalClassQuery::Script(canon)
264                    }
265                    _ => {
266                        let vals = match property_values(canon_name)? {
267                            None => return Err(Error::PropertyValueNotFound),
268                            Some(vals) => vals,
269                        };
270                        let canon_val =
271                            match canonical_value(vals, &property_value) {
272                                None => {
273                                    return Err(Error::PropertyValueNotFound)
274                                }
275                                Some(canon_val) => canon_val,
276                            };
277                        CanonicalClassQuery::ByValue {
278                            property_name: canon_name,
279                            property_value: canon_val,
280                        }
281                    }
282                })
283            }
284        }
285    }
286
287    fn canonical_binary(
288        &self,
289        name: &str,
290    ) -> Result<CanonicalClassQuery, Error> {
291        let norm = symbolic_name_normalize(name);
292
293        // This is a special case where 'cf' refers to the 'Format' general
294        // category, but where the 'cf' abbreviation is also an abbreviation
295        // for the 'Case_Folding' property. But we want to treat it as
296        // a general category. (Currently, we don't even support the
297        // 'Case_Folding' property. But if we do in the future, users will be
298        // required to spell it out.)
299        //
300        // Also 'sc' refers to the 'Currency_Symbol' general category, but is
301        // also the abbreviation for the 'Script' property. So we avoid calling
302        // 'canonical_prop' for it too, which would erroneously normalize it
303        // to 'Script'.
304        //
305        // Another case: 'lc' is an abbreviation for the 'Cased_Letter'
306        // general category, but is also an abbreviation for the 'Lowercase_Mapping'
307        // property. We don't currently support the latter, so as with 'cf'
308        // above, we treat 'lc' as 'Cased_Letter'.
309        if norm != "cf" && norm != "sc" && norm != "lc" {
310            if let Some(canon) = canonical_prop(&norm)? {
311                return Ok(CanonicalClassQuery::Binary(canon));
312            }
313        }
314        if let Some(canon) = canonical_gencat(&norm)? {
315            return Ok(CanonicalClassQuery::GeneralCategory(canon));
316        }
317        if let Some(canon) = canonical_script(&norm)? {
318            return Ok(CanonicalClassQuery::Script(canon));
319        }
320        Err(Error::PropertyNotFound)
321    }
322}
323
/// A `ClassQuery` after canonicalization of its parameters.
///
/// Unlike `ClassQuery`, this keeps binary properties apart from the
/// flattened general categories and scripts.
#[derive(Debug, Eq, PartialEq)]
enum CanonicalClassQuery {
    /// A binary property, identified by its canonical name.
    Binary(&'static str),
    /// A general category, identified by its canonical name.
    GeneralCategory(&'static str),
    /// A script, identified by its canonical name.
    Script(&'static str),
    /// Any other pairing of a property with one of its values, where both
    /// have been canonicalized.
    ///
    /// By construction, the property name here is never General_Category
    /// or Script. Those two are covered by the eponymous variants.
    ByValue {
        /// The canonical name of the property.
        property_name: &'static str,
        /// The canonical name of the property value.
        property_value: &'static str,
    },
}
348
349/// Looks up a Unicode class given a query. If one doesn't exist, then
350/// `None` is returned.
351pub fn class(query: ClassQuery<'_>) -> Result<hir::ClassUnicode, Error> {
352    use self::CanonicalClassQuery::*;
353
354    match query.canonicalize()? {
355        Binary(name) => bool_property(name),
356        GeneralCategory(name) => gencat(name),
357        Script(name) => script(name),
358        ByValue { property_name: "Age", property_value } => {
359            let mut class = hir::ClassUnicode::empty();
360            for set in ages(property_value)? {
361                class.union(&hir_class(set));
362            }
363            Ok(class)
364        }
365        ByValue { property_name: "Script_Extensions", property_value } => {
366            script_extension(property_value)
367        }
368        ByValue {
369            property_name: "Grapheme_Cluster_Break",
370            property_value,
371        } => gcb(property_value),
372        ByValue { property_name: "Sentence_Break", property_value } => {
373            sb(property_value)
374        }
375        ByValue { property_name: "Word_Break", property_value } => {
376            wb(property_value)
377        }
378        _ => {
379            // What else should we support?
380            Err(Error::PropertyNotFound)
381        }
382    }
383}
384
385/// Returns a Unicode aware class for \w.
386///
387/// This returns an error if the data is not available for \w.
388pub fn perl_word() -> Result<hir::ClassUnicode, Error> {
389    #[cfg(not(feature = "unicode-perl"))]
390    fn imp() -> Result<hir::ClassUnicode, Error> {
391        Err(Error::PerlClassNotFound)
392    }
393
394    #[cfg(feature = "unicode-perl")]
395    fn imp() -> Result<hir::ClassUnicode, Error> {
396        use crate::unicode_tables::perl_word::PERL_WORD;
397        Ok(hir_class(PERL_WORD))
398    }
399
400    imp()
401}
402
403/// Returns a Unicode aware class for \s.
404///
405/// This returns an error if the data is not available for \s.
406pub fn perl_space() -> Result<hir::ClassUnicode, Error> {
407    #[cfg(not(any(feature = "unicode-perl", feature = "unicode-bool")))]
408    fn imp() -> Result<hir::ClassUnicode, Error> {
409        Err(Error::PerlClassNotFound)
410    }
411
412    #[cfg(all(feature = "unicode-perl", not(feature = "unicode-bool")))]
413    fn imp() -> Result<hir::ClassUnicode, Error> {
414        use crate::unicode_tables::perl_space::WHITE_SPACE;
415        Ok(hir_class(WHITE_SPACE))
416    }
417
418    #[cfg(feature = "unicode-bool")]
419    fn imp() -> Result<hir::ClassUnicode, Error> {
420        use crate::unicode_tables::property_bool::WHITE_SPACE;
421        Ok(hir_class(WHITE_SPACE))
422    }
423
424    imp()
425}
426
427/// Returns a Unicode aware class for \d.
428///
429/// This returns an error if the data is not available for \d.
430pub fn perl_digit() -> Result<hir::ClassUnicode, Error> {
431    #[cfg(not(any(feature = "unicode-perl", feature = "unicode-gencat")))]
432    fn imp() -> Result<hir::ClassUnicode, Error> {
433        Err(Error::PerlClassNotFound)
434    }
435
436    #[cfg(all(feature = "unicode-perl", not(feature = "unicode-gencat")))]
437    fn imp() -> Result<hir::ClassUnicode, Error> {
438        use crate::unicode_tables::perl_decimal::DECIMAL_NUMBER;
439        Ok(hir_class(DECIMAL_NUMBER))
440    }
441
442    #[cfg(feature = "unicode-gencat")]
443    fn imp() -> Result<hir::ClassUnicode, Error> {
444        use crate::unicode_tables::general_category::DECIMAL_NUMBER;
445        Ok(hir_class(DECIMAL_NUMBER))
446    }
447
448    imp()
449}
450
451/// Build a Unicode HIR class from a sequence of Unicode scalar value ranges.
452pub fn hir_class(ranges: &[(char, char)]) -> hir::ClassUnicode {
453    let hir_ranges: Vec<hir::ClassUnicodeRange> = ranges
454        .iter()
455        .map(|&(s, e)| hir::ClassUnicodeRange::new(s, e))
456        .collect();
457    hir::ClassUnicode::new(hir_ranges)
458}
459
460/// Returns true only if the given codepoint is in the `\w` character class.
461///
462/// If the `unicode-perl` feature is not enabled, then this returns an error.
463pub fn is_word_character(c: char) -> Result<bool, UnicodeWordError> {
464    #[cfg(not(feature = "unicode-perl"))]
465    fn imp(_: char) -> Result<bool, UnicodeWordError> {
466        Err(UnicodeWordError(()))
467    }
468
469    #[cfg(feature = "unicode-perl")]
470    fn imp(c: char) -> Result<bool, UnicodeWordError> {
471        use crate::{is_word_byte, unicode_tables::perl_word::PERL_WORD};
472
473        if u8::try_from(c).map_or(false, is_word_byte) {
474            return Ok(true);
475        }
476        Ok(PERL_WORD
477            .binary_search_by(|&(start, end)| {
478                use core::cmp::Ordering;
479
480                if start <= c && c <= end {
481                    Ordering::Equal
482                } else if start > c {
483                    Ordering::Greater
484                } else {
485                    Ordering::Less
486                }
487            })
488            .is_ok())
489    }
490
491    imp(c)
492}
493
/// The table of values that one particular property can take.
///
/// Each entry is a pair: the normalized spelling of a property value
/// followed by the canonical spelling of that same value.
type PropertyValues = &'static [(&'static str, &'static str)];
500
501fn canonical_gencat(
502    normalized_value: &str,
503) -> Result<Option<&'static str>, Error> {
504    Ok(match normalized_value {
505        "any" => Some("Any"),
506        "assigned" => Some("Assigned"),
507        "ascii" => Some("ASCII"),
508        _ => {
509            let gencats = property_values("General_Category")?.unwrap();
510            canonical_value(gencats, normalized_value)
511        }
512    })
513}
514
515fn canonical_script(
516    normalized_value: &str,
517) -> Result<Option<&'static str>, Error> {
518    let scripts = property_values("Script")?.unwrap();
519    Ok(canonical_value(scripts, normalized_value))
520}
521
522/// Find the canonical property name for the given normalized property name.
523///
524/// If no such property exists, then `None` is returned.
525///
526/// The normalized property name must have been normalized according to
527/// UAX44 LM3, which can be done using `symbolic_name_normalize`.
528///
529/// If the property names data is not available, then an error is returned.
530fn canonical_prop(
531    normalized_name: &str,
532) -> Result<Option<&'static str>, Error> {
533    #[cfg(not(any(
534        feature = "unicode-age",
535        feature = "unicode-bool",
536        feature = "unicode-gencat",
537        feature = "unicode-perl",
538        feature = "unicode-script",
539        feature = "unicode-segment",
540    )))]
541    fn imp(_: &str) -> Result<Option<&'static str>, Error> {
542        Err(Error::PropertyNotFound)
543    }
544
545    #[cfg(any(
546        feature = "unicode-age",
547        feature = "unicode-bool",
548        feature = "unicode-gencat",
549        feature = "unicode-perl",
550        feature = "unicode-script",
551        feature = "unicode-segment",
552    ))]
553    fn imp(name: &str) -> Result<Option<&'static str>, Error> {
554        use crate::unicode_tables::property_names::PROPERTY_NAMES;
555
556        Ok(PROPERTY_NAMES
557            .binary_search_by_key(&name, |&(n, _)| n)
558            .ok()
559            .map(|i| PROPERTY_NAMES[i].1))
560    }
561
562    imp(normalized_name)
563}
564
565/// Find the canonical property value for the given normalized property
566/// value.
567///
568/// The given property values should correspond to the values for the property
569/// under question, which can be found using `property_values`.
570///
571/// If no such property value exists, then `None` is returned.
572///
573/// The normalized property value must have been normalized according to
574/// UAX44 LM3, which can be done using `symbolic_name_normalize`.
575fn canonical_value(
576    vals: PropertyValues,
577    normalized_value: &str,
578) -> Option<&'static str> {
579    vals.binary_search_by_key(&normalized_value, |&(n, _)| n)
580        .ok()
581        .map(|i| vals[i].1)
582}
583
584/// Return the table of property values for the given property name.
585///
586/// If the property values data is not available, then an error is returned.
587fn property_values(
588    canonical_property_name: &'static str,
589) -> Result<Option<PropertyValues>, Error> {
590    #[cfg(not(any(
591        feature = "unicode-age",
592        feature = "unicode-bool",
593        feature = "unicode-gencat",
594        feature = "unicode-perl",
595        feature = "unicode-script",
596        feature = "unicode-segment",
597    )))]
598    fn imp(_: &'static str) -> Result<Option<PropertyValues>, Error> {
599        Err(Error::PropertyValueNotFound)
600    }
601
602    #[cfg(any(
603        feature = "unicode-age",
604        feature = "unicode-bool",
605        feature = "unicode-gencat",
606        feature = "unicode-perl",
607        feature = "unicode-script",
608        feature = "unicode-segment",
609    ))]
610    fn imp(name: &'static str) -> Result<Option<PropertyValues>, Error> {
611        use crate::unicode_tables::property_values::PROPERTY_VALUES;
612
613        Ok(PROPERTY_VALUES
614            .binary_search_by_key(&name, |&(n, _)| n)
615            .ok()
616            .map(|i| PROPERTY_VALUES[i].1))
617    }
618
619    imp(canonical_property_name)
620}
621
622// This is only used in some cases, but small enough to just let it be dead
623// instead of figuring out (and maintaining) the right set of features.
624#[allow(dead_code)]
625fn property_set(
626    name_map: &'static [(&'static str, Range)],
627    canonical: &'static str,
628) -> Option<Range> {
629    name_map
630        .binary_search_by_key(&canonical, |x| x.0)
631        .ok()
632        .map(|i| name_map[i].1)
633}
634
635/// Returns an iterator over Unicode Age sets. Each item corresponds to a set
636/// of codepoints that were added in a particular revision of Unicode. The
637/// iterator yields items in chronological order.
638///
639/// If the given age value isn't valid or if the data isn't available, then an
640/// error is returned instead.
641fn ages(canonical_age: &str) -> Result<impl Iterator<Item = Range>, Error> {
642    #[cfg(not(feature = "unicode-age"))]
643    fn imp(_: &str) -> Result<impl Iterator<Item = Range>, Error> {
644        use core::option::IntoIter;
645        Err::<IntoIter<Range>, _>(Error::PropertyNotFound)
646    }
647
648    #[cfg(feature = "unicode-age")]
649    fn imp(canonical_age: &str) -> Result<impl Iterator<Item = Range>, Error> {
650        use crate::unicode_tables::age;
651
652        const AGES: &[(&str, Range)] = &[
653            ("V1_1", age::V1_1),
654            ("V2_0", age::V2_0),
655            ("V2_1", age::V2_1),
656            ("V3_0", age::V3_0),
657            ("V3_1", age::V3_1),
658            ("V3_2", age::V3_2),
659            ("V4_0", age::V4_0),
660            ("V4_1", age::V4_1),
661            ("V5_0", age::V5_0),
662            ("V5_1", age::V5_1),
663            ("V5_2", age::V5_2),
664            ("V6_0", age::V6_0),
665            ("V6_1", age::V6_1),
666            ("V6_2", age::V6_2),
667            ("V6_3", age::V6_3),
668            ("V7_0", age::V7_0),
669            ("V8_0", age::V8_0),
670            ("V9_0", age::V9_0),
671            ("V10_0", age::V10_0),
672            ("V11_0", age::V11_0),
673            ("V12_0", age::V12_0),
674            ("V12_1", age::V12_1),
675            ("V13_0", age::V13_0),
676            ("V14_0", age::V14_0),
677            ("V15_0", age::V15_0),
678            ("V15_1", age::V15_1),
679            ("V16_0", age::V16_0),
680        ];
681        assert_eq!(AGES.len(), age::BY_NAME.len(), "ages are out of sync");
682
683        let pos = AGES.iter().position(|&(age, _)| canonical_age == age);
684        match pos {
685            None => Err(Error::PropertyValueNotFound),
686            Some(i) => Ok(AGES[..=i].iter().map(|&(_, classes)| classes)),
687        }
688    }
689
690    imp(canonical_age)
691}
692
693/// Returns the Unicode HIR class corresponding to the given general category.
694///
695/// Name canonicalization is assumed to be performed by the caller.
696///
697/// If the given general category could not be found, or if the general
698/// category data is not available, then an error is returned.
699fn gencat(canonical_name: &'static str) -> Result<hir::ClassUnicode, Error> {
700    #[cfg(not(feature = "unicode-gencat"))]
701    fn imp(_: &'static str) -> Result<hir::ClassUnicode, Error> {
702        Err(Error::PropertyNotFound)
703    }
704
705    #[cfg(feature = "unicode-gencat")]
706    fn imp(name: &'static str) -> Result<hir::ClassUnicode, Error> {
707        use crate::unicode_tables::general_category::BY_NAME;
708        match name {
709            "ASCII" => Ok(hir_class(&[('\0', '\x7F')])),
710            "Any" => Ok(hir_class(&[('\0', '\u{10FFFF}')])),
711            "Assigned" => {
712                let mut cls = gencat("Unassigned")?;
713                cls.negate();
714                Ok(cls)
715            }
716            name => property_set(BY_NAME, name)
717                .map(hir_class)
718                .ok_or(Error::PropertyValueNotFound),
719        }
720    }
721
722    match canonical_name {
723        "Decimal_Number" => perl_digit(),
724        name => imp(name),
725    }
726}
727
728/// Returns the Unicode HIR class corresponding to the given script.
729///
730/// Name canonicalization is assumed to be performed by the caller.
731///
732/// If the given script could not be found, or if the script data is not
733/// available, then an error is returned.
734fn script(canonical_name: &'static str) -> Result<hir::ClassUnicode, Error> {
735    #[cfg(not(feature = "unicode-script"))]
736    fn imp(_: &'static str) -> Result<hir::ClassUnicode, Error> {
737        Err(Error::PropertyNotFound)
738    }
739
740    #[cfg(feature = "unicode-script")]
741    fn imp(name: &'static str) -> Result<hir::ClassUnicode, Error> {
742        use crate::unicode_tables::script::BY_NAME;
743        property_set(BY_NAME, name)
744            .map(hir_class)
745            .ok_or(Error::PropertyValueNotFound)
746    }
747
748    imp(canonical_name)
749}
750
751/// Returns the Unicode HIR class corresponding to the given script extension.
752///
753/// Name canonicalization is assumed to be performed by the caller.
754///
755/// If the given script extension could not be found, or if the script data is
756/// not available, then an error is returned.
757fn script_extension(
758    canonical_name: &'static str,
759) -> Result<hir::ClassUnicode, Error> {
760    #[cfg(not(feature = "unicode-script"))]
761    fn imp(_: &'static str) -> Result<hir::ClassUnicode, Error> {
762        Err(Error::PropertyNotFound)
763    }
764
765    #[cfg(feature = "unicode-script")]
766    fn imp(name: &'static str) -> Result<hir::ClassUnicode, Error> {
767        use crate::unicode_tables::script_extension::BY_NAME;
768        property_set(BY_NAME, name)
769            .map(hir_class)
770            .ok_or(Error::PropertyValueNotFound)
771    }
772
773    imp(canonical_name)
774}
775
776/// Returns the Unicode HIR class corresponding to the given Unicode boolean
777/// property.
778///
779/// Name canonicalization is assumed to be performed by the caller.
780///
781/// If the given boolean property could not be found, or if the boolean
782/// property data is not available, then an error is returned.
783fn bool_property(
784    canonical_name: &'static str,
785) -> Result<hir::ClassUnicode, Error> {
786    #[cfg(not(feature = "unicode-bool"))]
787    fn imp(_: &'static str) -> Result<hir::ClassUnicode, Error> {
788        Err(Error::PropertyNotFound)
789    }
790
791    #[cfg(feature = "unicode-bool")]
792    fn imp(name: &'static str) -> Result<hir::ClassUnicode, Error> {
793        use crate::unicode_tables::property_bool::BY_NAME;
794        property_set(BY_NAME, name)
795            .map(hir_class)
796            .ok_or(Error::PropertyNotFound)
797    }
798
799    match canonical_name {
800        "Decimal_Number" => perl_digit(),
801        "White_Space" => perl_space(),
802        name => imp(name),
803    }
804}
805
806/// Returns the Unicode HIR class corresponding to the given grapheme cluster
807/// break property.
808///
809/// Name canonicalization is assumed to be performed by the caller.
810///
811/// If the given property could not be found, or if the corresponding data is
812/// not available, then an error is returned.
813fn gcb(canonical_name: &'static str) -> Result<hir::ClassUnicode, Error> {
814    #[cfg(not(feature = "unicode-segment"))]
815    fn imp(_: &'static str) -> Result<hir::ClassUnicode, Error> {
816        Err(Error::PropertyNotFound)
817    }
818
819    #[cfg(feature = "unicode-segment")]
820    fn imp(name: &'static str) -> Result<hir::ClassUnicode, Error> {
821        use crate::unicode_tables::grapheme_cluster_break::BY_NAME;
822        property_set(BY_NAME, name)
823            .map(hir_class)
824            .ok_or(Error::PropertyValueNotFound)
825    }
826
827    imp(canonical_name)
828}
829
830/// Returns the Unicode HIR class corresponding to the given word break
831/// property.
832///
833/// Name canonicalization is assumed to be performed by the caller.
834///
835/// If the given property could not be found, or if the corresponding data is
836/// not available, then an error is returned.
837fn wb(canonical_name: &'static str) -> Result<hir::ClassUnicode, Error> {
838    #[cfg(not(feature = "unicode-segment"))]
839    fn imp(_: &'static str) -> Result<hir::ClassUnicode, Error> {
840        Err(Error::PropertyNotFound)
841    }
842
843    #[cfg(feature = "unicode-segment")]
844    fn imp(name: &'static str) -> Result<hir::ClassUnicode, Error> {
845        use crate::unicode_tables::word_break::BY_NAME;
846        property_set(BY_NAME, name)
847            .map(hir_class)
848            .ok_or(Error::PropertyValueNotFound)
849    }
850
851    imp(canonical_name)
852}
853
854/// Returns the Unicode HIR class corresponding to the given sentence
855/// break property.
856///
857/// Name canonicalization is assumed to be performed by the caller.
858///
859/// If the given property could not be found, or if the corresponding data is
860/// not available, then an error is returned.
861fn sb(canonical_name: &'static str) -> Result<hir::ClassUnicode, Error> {
862    #[cfg(not(feature = "unicode-segment"))]
863    fn imp(_: &'static str) -> Result<hir::ClassUnicode, Error> {
864        Err(Error::PropertyNotFound)
865    }
866
867    #[cfg(feature = "unicode-segment")]
868    fn imp(name: &'static str) -> Result<hir::ClassUnicode, Error> {
869        use crate::unicode_tables::sentence_break::BY_NAME;
870        property_set(BY_NAME, name)
871            .map(hir_class)
872            .ok_or(Error::PropertyValueNotFound)
873    }
874
875    imp(canonical_name)
876}
877
878/// Like symbolic_name_normalize_bytes, but operates on a string.
879fn symbolic_name_normalize(x: &str) -> String {
880    let mut tmp = x.as_bytes().to_vec();
881    let len = symbolic_name_normalize_bytes(&mut tmp).len();
882    tmp.truncate(len);
883    // This should always succeed because `symbolic_name_normalize_bytes`
884    // guarantees that `&tmp[..len]` is always valid UTF-8.
885    //
886    // N.B. We could avoid the additional UTF-8 check here, but it's unlikely
887    // to be worth skipping the additional safety check. A benchmark must
888    // justify it first.
889    String::from_utf8(tmp).unwrap()
890}
891
/// Normalizes a symbolic name in place, following UAX44-LM3.
///
/// "Symbolic names" are typically property names and property value aliases.
/// This routine should not be applied to property string values.
///
/// The normalized bytes are written to the front of `slice` and the returned
/// sub-slice covers exactly those bytes. Only ASCII bytes are ever kept, so
/// the returned slice is valid UTF-8 no matter what `slice` contained.
///
/// See: https://unicode.org/reports/tr44/#UAX44-LM3
fn symbolic_name_normalize_bytes(slice: &mut [u8]) -> &mut [u8] {
    // No part of the standard was found that pins down the structure of
    // property names/aliases (unlike character names), so they are assumed
    // to be ASCII only and every non-ASCII byte is dropped.

    // A leading "is" (in any ASCII case) is ignored.
    let has_is_prefix =
        slice.len() >= 2 && slice[..2].eq_ignore_ascii_case(b"is");
    let first = if has_is_prefix { 2 } else { 0 };

    let mut written = 0;
    for read in first..slice.len() {
        // VALIDITY ARGUMENT: only ASCII bytes ever reach the write below,
        // which is what makes the resulting slice valid UTF-8.
        let kept = match slice[read] {
            // Separators are skipped entirely.
            b' ' | b'_' | b'-' => continue,
            // Uppercase ASCII letters are folded to lowercase.
            upper @ b'A'..=b'Z' => upper.to_ascii_lowercase(),
            // Any other ASCII byte is kept unchanged.
            other if other.is_ascii() => other,
            // Non-ASCII bytes are dropped.
            _ => continue,
        };
        slice[written] = kept;
        written += 1;
    }

    // Special case: ISO_Comment is abbreviated 'isc'. Stripping the 'is'
    // prefix would reduce it to 'c', which is an alias for the 'Other'
    // general category, so the full 'isc' is restored here.
    //
    // Indexing up to 3 is in bounds: the prefix accounts for two bytes and
    // `written == 1` means at least one more byte followed it.
    if has_is_prefix && written == 1 && slice[0] == b'c' {
        slice[..3].copy_from_slice(b"isc");
        written = 3;
    }
    &mut slice[..written]
}
946
#[cfg(test)]
mod tests {
    use super::*;

    /// Returns the characters that `SimpleCaseFolder::mapping` yields for
    /// `c`. Panics if the case folder cannot be constructed.
    #[cfg(feature = "unicode-case")]
    fn simple_fold_ok(c: char) -> impl Iterator<Item = char> {
        SimpleCaseFolder::new().unwrap().mapping(c).iter().copied()
    }

    /// Returns what `SimpleCaseFolder::overlaps` reports for the inclusive
    /// range `start..=end`. Panics if the case folder cannot be constructed.
    #[cfg(feature = "unicode-case")]
    fn contains_case_map(start: char, end: char) -> bool {
        SimpleCaseFolder::new().unwrap().overlaps(start, end)
    }

    #[test]
    #[cfg(feature = "unicode-case")]
    fn simple_fold_k() {
        // U+212A KELVIN SIGN looks identical to ASCII 'K' in most fonts and
        // is replaced by it under compatibility normalization. It is spelled
        // as an escape so the three cases below stay distinguishable.
        const KELVIN: char = '\u{212A}';

        let xs: Vec<char> = simple_fold_ok('k').collect();
        assert_eq!(xs, alloc::vec!['K', KELVIN]);

        let xs: Vec<char> = simple_fold_ok('K').collect();
        assert_eq!(xs, alloc::vec!['k', KELVIN]);

        let xs: Vec<char> = simple_fold_ok(KELVIN).collect();
        assert_eq!(xs, alloc::vec!['K', 'k']);
    }

    #[test]
    #[cfg(feature = "unicode-case")]
    fn simple_fold_a() {
        let xs: Vec<char> = simple_fold_ok('a').collect();
        assert_eq!(xs, alloc::vec!['A']);

        let xs: Vec<char> = simple_fold_ok('A').collect();
        assert_eq!(xs, alloc::vec!['a']);
    }

    // With `unicode-case` disabled, constructing the folder must fail.
    #[test]
    #[cfg(not(feature = "unicode-case"))]
    fn simple_fold_disabled() {
        assert!(SimpleCaseFolder::new().is_err());
    }

    #[test]
    #[cfg(feature = "unicode-case")]
    fn range_contains() {
        // Ranges that touch at least one character with a case mapping.
        assert!(contains_case_map('A', 'A'));
        assert!(contains_case_map('Z', 'Z'));
        assert!(contains_case_map('A', 'Z'));
        assert!(contains_case_map('@', 'A'));
        assert!(contains_case_map('Z', '['));
        assert!(contains_case_map('☃', 'Ⰰ'));

        // Ranges made up only of characters without a case mapping.
        assert!(!contains_case_map('[', '['));
        assert!(!contains_case_map('[', '`'));

        assert!(!contains_case_map('☃', '☃'));
    }

    // Regression test: the one-letter query `C` must canonicalize to the
    // `Other` general category.
    #[test]
    #[cfg(feature = "unicode-gencat")]
    fn regression_466() {
        use super::{CanonicalClassQuery, ClassQuery};

        let q = ClassQuery::OneLetter('C');
        assert_eq!(
            q.canonicalize().unwrap(),
            CanonicalClassQuery::GeneralCategory("Other")
        );
    }

    #[test]
    fn sym_normalize() {
        let sym_norm = symbolic_name_normalize;

        assert_eq!(sym_norm("Line_Break"), "linebreak");
        assert_eq!(sym_norm("Line-break"), "linebreak");
        assert_eq!(sym_norm("linebreak"), "linebreak");
        assert_eq!(sym_norm("BA"), "ba");
        assert_eq!(sym_norm("ba"), "ba");
        assert_eq!(sym_norm("Greek"), "greek");
        assert_eq!(sym_norm("isGreek"), "greek");
        assert_eq!(sym_norm("IS_Greek"), "greek");
        // The 'isc' abbreviation keeps its 'is' prefix.
        assert_eq!(sym_norm("isc"), "isc");
        assert_eq!(sym_norm("is c"), "isc");
        assert_eq!(sym_norm("is_c"), "isc");
    }

    // A non-ASCII byte in the input is dropped from the normalized output.
    #[test]
    fn valid_utf8_symbolic() {
        let mut x = b"abc\xFFxyz".to_vec();
        let y = symbolic_name_normalize_bytes(&mut x);
        assert_eq!(y, b"abcxyz");
    }
}
regex_syntax/unicode.rs

regex_syntax/
unicode.rs