use alloc::{
    string::{String, ToString},
    vec::Vec,
};

use crate::hir;
/// A sequence of inclusive codepoint ranges from a generated file (hence the
/// static lifetime).
///
/// Each element is a `(start, end)` pair.
type Range = &'static [(char, char)];
/// An error that occurs when dealing with Unicode.
///
/// We don't impl the Error trait here because these always get converted
/// into other public errors. (This error type isn't exported.)
#[derive(Debug)]
pub enum Error {
    /// The requested property could not be found, or the data needed to
    /// resolve it is not available.
    PropertyNotFound,
    /// The requested property value could not be found, or the data needed
    /// to resolve it is not available.
    PropertyValueNotFound,
    /// The data for a Perl character class is not available.
    // Not used when unicode-perl is enabled.
    #[allow(dead_code)]
    PerlClassNotFound,
}
/// An error that occurs when Unicode-aware simple case folding fails.
///
/// This error can occur when the case mapping tables necessary for Unicode
/// aware case folding are unavailable. This only occurs when the
/// `unicode-case` feature is disabled. (The feature is enabled by default.)
#[derive(Debug)]
pub struct CaseFoldError(());

#[cfg(feature = "std")]
impl std::error::Error for CaseFoldError {}

impl core::fmt::Display for CaseFoldError {
    /// Writes a fixed, human readable explanation of the failure.
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        write!(
            f,
            "Unicode-aware case folding is not available \
             (probably because the unicode-case feature is not enabled)"
        )
    }
}
/// An error that occurs when the Unicode-aware `\w` class is unavailable.
///
/// This error can occur when the data tables necessary for the Unicode aware
/// Perl character class `\w` are unavailable. This only occurs when the
/// `unicode-perl` feature is disabled. (The feature is enabled by default.)
#[derive(Debug)]
pub struct UnicodeWordError(());

#[cfg(feature = "std")]
impl std::error::Error for UnicodeWordError {}

impl core::fmt::Display for UnicodeWordError {
    /// Writes a fixed, human readable explanation of the failure.
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        write!(
            f,
            "Unicode-aware \\w class is not available \
             (probably because the unicode-perl feature is not enabled)"
        )
    }
}
/// A state oriented traverser of the simple case folding table.
///
/// A case folder can be constructed via `SimpleCaseFolder::new()`, which will
/// return an error if the underlying case folding table is unavailable.
///
/// After construction, it is expected that callers will use
/// `SimpleCaseFolder::mapping` by calling it with codepoints in strictly
/// increasing order. For example, calling it on `b` and then on `a` is illegal
/// and will result in a panic.
///
/// The main idea of this type is that it tries hard to make mapping lookups
/// fast by exploiting the structure of the underlying table, and the ordering
/// assumption enables this.
#[derive(Debug)]
pub struct SimpleCaseFolder {
    /// The simple case fold table. It's a sorted association list, where the
    /// keys are Unicode scalar values and the values are the corresponding
    /// equivalence class (not including the key) of the "simple" case folded
    /// Unicode scalar values.
    table: &'static [(char, &'static [char])],
    /// The last codepoint that was used for a lookup.
    last: Option<char>,
    /// The index to the entry in `table` corresponding to the smallest key `k`
    /// such that `k > k0`, where `k0` is the most recent key lookup. Note that
    /// in particular, `k0` may not be in the table!
    next: usize,
}
9495impl SimpleCaseFolder {
96/// Create a new simple case folder, returning an error if the underlying
97 /// case folding table is unavailable.
98pub fn new() -> Result<SimpleCaseFolder, CaseFoldError> {
99#[cfg(not(feature = "unicode-case"))]
100{
101Err(CaseFoldError(()))
102 }
103#[cfg(feature = "unicode-case")]
104{
105Ok(SimpleCaseFolder {
106 table: crate::unicode_tables::case_folding_simple::CASE_FOLDING_SIMPLE,
107 last: None,
108 next: 0,
109 })
110 }
111 }
112113/// Return the equivalence class of case folded codepoints for the given
114 /// codepoint. The equivalence class returned never includes the codepoint
115 /// given. If the given codepoint has no case folded codepoints (i.e.,
116 /// no entry in the underlying case folding table), then this returns an
117 /// empty slice.
118 ///
119 /// # Panics
120 ///
121 /// This panics when called with a `c` that is less than or equal to the
122 /// previous call. In other words, callers need to use this method with
123 /// strictly increasing values of `c`.
124pub fn mapping(&mut self, c: char) -> &'static [char] {
125if let Some(last) = self.last {
126if !(last < c) {
{
::core::panicking::panic_fmt(format_args!("got codepoint U+{0:X} which occurs before last codepoint U+{1:X}",
u32::from(c), u32::from(last)));
}
};assert!(
127 last < c,
128"got codepoint U+{:X} which occurs before \
129 last codepoint U+{:X}",
130 u32::from(c),
131 u32::from(last),
132 );
133 }
134self.last = Some(c);
135if self.next >= self.table.len() {
136return &[];
137 }
138let (k, v) = self.table[self.next];
139if k == c {
140self.next += 1;
141return v;
142 }
143match self.get(c) {
144Err(i) => {
145self.next = i;
146&[]
147 }
148Ok(i) => {
149// Since we require lookups to proceed
150 // in order, anything we find should be
151 // after whatever we thought might be
152 // next. Otherwise, the caller is either
153 // going out of order or we would have
154 // found our next key at 'self.next'.
155if !(i > self.next) {
::core::panicking::panic("assertion failed: i > self.next")
};assert!(i > self.next);
156self.next = i + 1;
157self.table[i].1
158}
159 }
160 }
161162/// Returns true if and only if the given range overlaps with any region
163 /// of the underlying case folding table. That is, when true, there exists
164 /// at least one codepoint in the inclusive range `[start, end]` that has
165 /// a non-trivial equivalence class of case folded codepoints. Conversely,
166 /// when this returns false, all codepoints in the range `[start, end]`
167 /// correspond to the trivial equivalence class of case folded codepoints,
168 /// i.e., itself.
169 ///
170 /// This is useful to call before iterating over the codepoints in the
171 /// range and looking up the mapping for each. If you know none of the
172 /// mappings will return anything, then you might be able to skip doing it
173 /// altogether.
174 ///
175 /// # Panics
176 ///
177 /// This panics when `end < start`.
178pub fn overlaps(&self, start: char, end: char) -> bool {
179use core::cmp::Ordering;
180181if !(start <= end) {
::core::panicking::panic("assertion failed: start <= end")
};assert!(start <= end);
182self.table
183 .binary_search_by(|&(c, _)| {
184if start <= c && c <= end {
185 Ordering::Equal186 } else if c > end {
187 Ordering::Greater188 } else {
189 Ordering::Less190 }
191 })
192 .is_ok()
193 }
194195/// Returns the index at which `c` occurs in the simple case fold table. If
196 /// `c` does not occur, then this returns an `i` such that `table[i-1].0 <
197 /// c` and `table[i].0 > c`.
198fn get(&self, c: char) -> Result<usize, usize> {
199self.table.binary_search_by_key(&c, |&(c1, _)| c1)
200 }
201}
/// A query for finding a character class defined by Unicode. This supports
/// either use of a property name directly, or lookup by property value. The
/// former generally refers to Binary properties (see UTS#44, Table 8), but
/// as a special exception (see UTS#18, Section 1.2) both general categories
/// (an enumeration) and scripts (a catalog) are supported as if each of their
/// possible values were a binary property.
///
/// In all circumstances, property names and values are normalized and
/// canonicalized. That is, `GC == gc == GeneralCategory == general_category`.
///
/// The lifetime `'a` refers to the shorter of the lifetimes of property name
/// and property value.
#[derive(Debug)]
pub enum ClassQuery<'a> {
    /// Return a class corresponding to a Unicode binary property, named by
    /// a single letter.
    OneLetter(char),
    /// Return a class corresponding to a Unicode binary property.
    ///
    /// Note that, by special exception (see UTS#18, Section 1.2), both
    /// general category values and script values are permitted here as if
    /// they were a binary property.
    Binary(&'a str),
    /// Return a class corresponding to all codepoints whose property
    /// (identified by `property_name`) corresponds to the given value
    /// (identified by `property_value`).
    ByValue {
        /// A property name.
        property_name: &'a str,
        /// A property value.
        property_value: &'a str,
    },
}
236237impl<'a> ClassQuery<'a> {
238fn canonicalize(&self) -> Result<CanonicalClassQuery, Error> {
239match *self {
240 ClassQuery::OneLetter(c) => self.canonical_binary(&c.to_string()),
241 ClassQuery::Binary(name) => self.canonical_binary(name),
242 ClassQuery::ByValue { property_name, property_value } => {
243let property_name = symbolic_name_normalize(property_name);
244let property_value = symbolic_name_normalize(property_value);
245246let canon_name = match canonical_prop(&property_name)? {
247None => return Err(Error::PropertyNotFound),
248Some(canon_name) => canon_name,
249 };
250Ok(match canon_name {
251"General_Category" => {
252let canon = match canonical_gencat(&property_value)? {
253None => return Err(Error::PropertyValueNotFound),
254Some(canon) => canon,
255 };
256 CanonicalClassQuery::GeneralCategory(canon)
257 }
258"Script" => {
259let canon = match canonical_script(&property_value)? {
260None => return Err(Error::PropertyValueNotFound),
261Some(canon) => canon,
262 };
263 CanonicalClassQuery::Script(canon)
264 }
265_ => {
266let vals = match property_values(canon_name)? {
267None => return Err(Error::PropertyValueNotFound),
268Some(vals) => vals,
269 };
270let canon_val =
271match canonical_value(vals, &property_value) {
272None => {
273return Err(Error::PropertyValueNotFound)
274 }
275Some(canon_val) => canon_val,
276 };
277 CanonicalClassQuery::ByValue {
278 property_name: canon_name,
279 property_value: canon_val,
280 }
281 }
282 })
283 }
284 }
285 }
286287fn canonical_binary(
288&self,
289 name: &str,
290 ) -> Result<CanonicalClassQuery, Error> {
291let norm = symbolic_name_normalize(name);
292293// This is a special case where 'cf' refers to the 'Format' general
294 // category, but where the 'cf' abbreviation is also an abbreviation
295 // for the 'Case_Folding' property. But we want to treat it as
296 // a general category. (Currently, we don't even support the
297 // 'Case_Folding' property. But if we do in the future, users will be
298 // required to spell it out.)
299 //
300 // Also 'sc' refers to the 'Currency_Symbol' general category, but is
301 // also the abbreviation for the 'Script' property. So we avoid calling
302 // 'canonical_prop' for it too, which would erroneously normalize it
303 // to 'Script'.
304 //
305 // Another case: 'lc' is an abbreviation for the 'Cased_Letter'
306 // general category, but is also an abbreviation for the 'Lowercase_Mapping'
307 // property. We don't currently support the latter, so as with 'cf'
308 // above, we treat 'lc' as 'Cased_Letter'.
309if norm != "cf" && norm != "sc" && norm != "lc" {
310if let Some(canon) = canonical_prop(&norm)? {
311return Ok(CanonicalClassQuery::Binary(canon));
312 }
313 }
314if let Some(canon) = canonical_gencat(&norm)? {
315return Ok(CanonicalClassQuery::GeneralCategory(canon));
316 }
317if let Some(canon) = canonical_script(&norm)? {
318return Ok(CanonicalClassQuery::Script(canon));
319 }
320Err(Error::PropertyNotFound)
321 }
322}
/// Like ClassQuery, but its parameters have been canonicalized. This also
/// differentiates binary properties from flattened general categories and
/// scripts.
#[derive(Debug, Eq, PartialEq)]
enum CanonicalClassQuery {
    /// The canonical binary property name.
    Binary(&'static str),
    /// The canonical general category name.
    GeneralCategory(&'static str),
    /// The canonical script name.
    Script(&'static str),
    /// An arbitrary association between property and value, both of which
    /// have been canonicalized.
    ///
    /// Note that by construction, the property name of ByValue will never
    /// be General_Category or Script. Those two cases are subsumed by the
    /// eponymous variants.
    ByValue {
        /// The canonical property name.
        property_name: &'static str,
        /// The canonical property value.
        property_value: &'static str,
    },
}
348349/// Looks up a Unicode class given a query. If one doesn't exist, then
350/// `None` is returned.
351pub fn class(query: ClassQuery<'_>) -> Result<hir::ClassUnicode, Error> {
352use self::CanonicalClassQuery::*;
353354match query.canonicalize()? {
355Binary(name) => bool_property(name),
356GeneralCategory(name) => gencat(name),
357Script(name) => script(name),
358ByValue { property_name: "Age", property_value } => {
359let mut class = hir::ClassUnicode::empty();
360for set in ages(property_value)? {
361 class.union(&hir_class(set));
362 }
363Ok(class)
364 }
365ByValue { property_name: "Script_Extensions", property_value } => {
366script_extension(property_value)
367 }
368ByValue {
369 property_name: "Grapheme_Cluster_Break",
370 property_value,
371 } => gcb(property_value),
372ByValue { property_name: "Sentence_Break", property_value } => {
373sb(property_value)
374 }
375ByValue { property_name: "Word_Break", property_value } => {
376wb(property_value)
377 }
378_ => {
379// What else should we support?
380Err(Error::PropertyNotFound)
381 }
382 }
383}
384385/// Returns a Unicode aware class for \w.
386///
387/// This returns an error if the data is not available for \w.
388pub fn perl_word() -> Result<hir::ClassUnicode, Error> {
389#[cfg(not(feature = "unicode-perl"))]
390fn imp() -> Result<hir::ClassUnicode, Error> {
391Err(Error::PerlClassNotFound)
392 }
393394#[cfg(feature = "unicode-perl")]
395fn imp() -> Result<hir::ClassUnicode, Error> {
396use crate::unicode_tables::perl_word::PERL_WORD;
397Ok(hir_class(PERL_WORD))
398 }
399400imp()
401}
402403/// Returns a Unicode aware class for \s.
404///
405/// This returns an error if the data is not available for \s.
406pub fn perl_space() -> Result<hir::ClassUnicode, Error> {
407#[cfg(not(any(feature = "unicode-perl", feature = "unicode-bool")))]
408fn imp() -> Result<hir::ClassUnicode, Error> {
409Err(Error::PerlClassNotFound)
410 }
411412#[cfg(all(feature = "unicode-perl", not(feature = "unicode-bool")))]
413fn imp() -> Result<hir::ClassUnicode, Error> {
414use crate::unicode_tables::perl_space::WHITE_SPACE;
415Ok(hir_class(WHITE_SPACE))
416 }
417418#[cfg(feature = "unicode-bool")]
419fn imp() -> Result<hir::ClassUnicode, Error> {
420use crate::unicode_tables::property_bool::WHITE_SPACE;
421Ok(hir_class(WHITE_SPACE))
422 }
423424imp()
425}
426427/// Returns a Unicode aware class for \d.
428///
429/// This returns an error if the data is not available for \d.
430pub fn perl_digit() -> Result<hir::ClassUnicode, Error> {
431#[cfg(not(any(feature = "unicode-perl", feature = "unicode-gencat")))]
432fn imp() -> Result<hir::ClassUnicode, Error> {
433Err(Error::PerlClassNotFound)
434 }
435436#[cfg(all(feature = "unicode-perl", not(feature = "unicode-gencat")))]
437fn imp() -> Result<hir::ClassUnicode, Error> {
438use crate::unicode_tables::perl_decimal::DECIMAL_NUMBER;
439Ok(hir_class(DECIMAL_NUMBER))
440 }
441442#[cfg(feature = "unicode-gencat")]
443fn imp() -> Result<hir::ClassUnicode, Error> {
444use crate::unicode_tables::general_category::DECIMAL_NUMBER;
445Ok(hir_class(DECIMAL_NUMBER))
446 }
447448imp()
449}
450451/// Build a Unicode HIR class from a sequence of Unicode scalar value ranges.
452pub fn hir_class(ranges: &[(char, char)]) -> hir::ClassUnicode {
453let hir_ranges: Vec<hir::ClassUnicodeRange> = ranges454 .iter()
455 .map(|&(s, e)| hir::ClassUnicodeRange::new(s, e))
456 .collect();
457 hir::ClassUnicode::new(hir_ranges)
458}
459460/// Returns true only if the given codepoint is in the `\w` character class.
461///
462/// If the `unicode-perl` feature is not enabled, then this returns an error.
463pub fn is_word_character(c: char) -> Result<bool, UnicodeWordError> {
464#[cfg(not(feature = "unicode-perl"))]
465fn imp(_: char) -> Result<bool, UnicodeWordError> {
466Err(UnicodeWordError(()))
467 }
468469#[cfg(feature = "unicode-perl")]
470fn imp(c: char) -> Result<bool, UnicodeWordError> {
471use crate::{is_word_byte, unicode_tables::perl_word::PERL_WORD};
472473if u8::try_from(c).map_or(false, is_word_byte) {
474return Ok(true);
475 }
476Ok(PERL_WORD477 .binary_search_by(|&(start, end)| {
478use core::cmp::Ordering;
479480if start <= c && c <= end {
481 Ordering::Equal482 } else if start > c {
483 Ordering::Greater484 } else {
485 Ordering::Less486 }
487 })
488 .is_ok())
489 }
490491imp(c)
492}
/// A mapping of property values for a specific property.
///
/// The first element of each tuple is a normalized property value while the
/// second element of each tuple is the corresponding canonical property
/// value.
type PropertyValues = &'static [(&'static str, &'static str)];
500501fn canonical_gencat(
502 normalized_value: &str,
503) -> Result<Option<&'static str>, Error> {
504Ok(match normalized_value {
505"any" => Some("Any"),
506"assigned" => Some("Assigned"),
507"ascii" => Some("ASCII"),
508_ => {
509let gencats = property_values("General_Category")?.unwrap();
510canonical_value(gencats, normalized_value)
511 }
512 })
513}
514515fn canonical_script(
516 normalized_value: &str,
517) -> Result<Option<&'static str>, Error> {
518let scripts = property_values("Script")?.unwrap();
519Ok(canonical_value(scripts, normalized_value))
520}
521522/// Find the canonical property name for the given normalized property name.
523///
524/// If no such property exists, then `None` is returned.
525///
526/// The normalized property name must have been normalized according to
527/// UAX44 LM3, which can be done using `symbolic_name_normalize`.
528///
529/// If the property names data is not available, then an error is returned.
530fn canonical_prop(
531 normalized_name: &str,
532) -> Result<Option<&'static str>, Error> {
533#[cfg(not(any(
534 feature = "unicode-age",
535 feature = "unicode-bool",
536 feature = "unicode-gencat",
537 feature = "unicode-perl",
538 feature = "unicode-script",
539 feature = "unicode-segment",
540 )))]
541fn imp(_: &str) -> Result<Option<&'static str>, Error> {
542Err(Error::PropertyNotFound)
543 }
544545#[cfg(any(
546 feature = "unicode-age",
547 feature = "unicode-bool",
548 feature = "unicode-gencat",
549 feature = "unicode-perl",
550 feature = "unicode-script",
551 feature = "unicode-segment",
552 ))]
553fn imp(name: &str) -> Result<Option<&'static str>, Error> {
554use crate::unicode_tables::property_names::PROPERTY_NAMES;
555556Ok(PROPERTY_NAMES557 .binary_search_by_key(&name, |&(n, _)| n)
558 .ok()
559 .map(|i| PROPERTY_NAMES[i].1))
560 }
561562imp(normalized_name)
563}
564565/// Find the canonical property value for the given normalized property
566/// value.
567///
568/// The given property values should correspond to the values for the property
569/// under question, which can be found using `property_values`.
570///
571/// If no such property value exists, then `None` is returned.
572///
573/// The normalized property value must have been normalized according to
574/// UAX44 LM3, which can be done using `symbolic_name_normalize`.
575fn canonical_value(
576 vals: PropertyValues,
577 normalized_value: &str,
578) -> Option<&'static str> {
579vals.binary_search_by_key(&normalized_value, |&(n, _)| n)
580 .ok()
581 .map(|i| vals[i].1)
582}
583584/// Return the table of property values for the given property name.
585///
586/// If the property values data is not available, then an error is returned.
587fn property_values(
588 canonical_property_name: &'static str,
589) -> Result<Option<PropertyValues>, Error> {
590#[cfg(not(any(
591 feature = "unicode-age",
592 feature = "unicode-bool",
593 feature = "unicode-gencat",
594 feature = "unicode-perl",
595 feature = "unicode-script",
596 feature = "unicode-segment",
597 )))]
598fn imp(_: &'static str) -> Result<Option<PropertyValues>, Error> {
599Err(Error::PropertyValueNotFound)
600 }
601602#[cfg(any(
603 feature = "unicode-age",
604 feature = "unicode-bool",
605 feature = "unicode-gencat",
606 feature = "unicode-perl",
607 feature = "unicode-script",
608 feature = "unicode-segment",
609 ))]
610fn imp(name: &'static str) -> Result<Option<PropertyValues>, Error> {
611use crate::unicode_tables::property_values::PROPERTY_VALUES;
612613Ok(PROPERTY_VALUES614 .binary_search_by_key(&name, |&(n, _)| n)
615 .ok()
616 .map(|i| PROPERTY_VALUES[i].1))
617 }
618619imp(canonical_property_name)
620}
621622// This is only used in some cases, but small enough to just let it be dead
623// instead of figuring out (and maintaining) the right set of features.
624#[allow(dead_code)]
625fn property_set(
626 name_map: &'static [(&'static str, Range)],
627 canonical: &'static str,
628) -> Option<Range> {
629name_map630 .binary_search_by_key(&canonical, |x| x.0)
631 .ok()
632 .map(|i| name_map[i].1)
633}
634635/// Returns an iterator over Unicode Age sets. Each item corresponds to a set
636/// of codepoints that were added in a particular revision of Unicode. The
637/// iterator yields items in chronological order.
638///
639/// If the given age value isn't valid or if the data isn't available, then an
640/// error is returned instead.
641fn ages(canonical_age: &str) -> Result<impl Iterator<Item = Range>, Error> {
642#[cfg(not(feature = "unicode-age"))]
643fn imp(_: &str) -> Result<impl Iterator<Item = Range>, Error> {
644use core::option::IntoIter;
645 Err::<IntoIter<Range>, _>(Error::PropertyNotFound)
646 }
647648#[cfg(feature = "unicode-age")]
649fn imp(canonical_age: &str) -> Result<impl Iterator<Item = Range>, Error> {
650use crate::unicode_tables::age;
651652const AGES: &[(&str, Range)] = &[
653 ("V1_1", age::V1_1),
654 ("V2_0", age::V2_0),
655 ("V2_1", age::V2_1),
656 ("V3_0", age::V3_0),
657 ("V3_1", age::V3_1),
658 ("V3_2", age::V3_2),
659 ("V4_0", age::V4_0),
660 ("V4_1", age::V4_1),
661 ("V5_0", age::V5_0),
662 ("V5_1", age::V5_1),
663 ("V5_2", age::V5_2),
664 ("V6_0", age::V6_0),
665 ("V6_1", age::V6_1),
666 ("V6_2", age::V6_2),
667 ("V6_3", age::V6_3),
668 ("V7_0", age::V7_0),
669 ("V8_0", age::V8_0),
670 ("V9_0", age::V9_0),
671 ("V10_0", age::V10_0),
672 ("V11_0", age::V11_0),
673 ("V12_0", age::V12_0),
674 ("V12_1", age::V12_1),
675 ("V13_0", age::V13_0),
676 ("V14_0", age::V14_0),
677 ("V15_0", age::V15_0),
678 ("V15_1", age::V15_1),
679 ("V16_0", age::V16_0),
680 ];
681match (&AGES.len(), &age::BY_NAME.len()) {
(left_val, right_val) => {
if !(*left_val == *right_val) {
let kind = ::core::panicking::AssertKind::Eq;
::core::panicking::assert_failed(kind, &*left_val, &*right_val,
::core::option::Option::Some(format_args!("ages are out of sync")));
}
}
};assert_eq!(AGES.len(), age::BY_NAME.len(), "ages are out of sync");
682683let pos = AGES.iter().position(|&(age, _)| canonical_age == age);
684match pos {
685None => Err(Error::PropertyValueNotFound),
686Some(i) => Ok(AGES[..=i].iter().map(|&(_, classes)| classes)),
687 }
688 }
689690imp(canonical_age)
691}
692693/// Returns the Unicode HIR class corresponding to the given general category.
694///
695/// Name canonicalization is assumed to be performed by the caller.
696///
697/// If the given general category could not be found, or if the general
698/// category data is not available, then an error is returned.
699fn gencat(canonical_name: &'static str) -> Result<hir::ClassUnicode, Error> {
700#[cfg(not(feature = "unicode-gencat"))]
701fn imp(_: &'static str) -> Result<hir::ClassUnicode, Error> {
702Err(Error::PropertyNotFound)
703 }
704705#[cfg(feature = "unicode-gencat")]
706fn imp(name: &'static str) -> Result<hir::ClassUnicode, Error> {
707use crate::unicode_tables::general_category::BY_NAME;
708match name {
709"ASCII" => Ok(hir_class(&[('\0', '\x7F')])),
710"Any" => Ok(hir_class(&[('\0', '\u{10FFFF}')])),
711"Assigned" => {
712let mut cls = gencat("Unassigned")?;
713cls.negate();
714Ok(cls)
715 }
716 name => property_set(BY_NAME, name)
717 .map(hir_class)
718 .ok_or(Error::PropertyValueNotFound),
719 }
720 }
721722match canonical_name {
723"Decimal_Number" => perl_digit(),
724 name => imp(name),
725 }
726}
727728/// Returns the Unicode HIR class corresponding to the given script.
729///
730/// Name canonicalization is assumed to be performed by the caller.
731///
732/// If the given script could not be found, or if the script data is not
733/// available, then an error is returned.
734fn script(canonical_name: &'static str) -> Result<hir::ClassUnicode, Error> {
735#[cfg(not(feature = "unicode-script"))]
736fn imp(_: &'static str) -> Result<hir::ClassUnicode, Error> {
737Err(Error::PropertyNotFound)
738 }
739740#[cfg(feature = "unicode-script")]
741fn imp(name: &'static str) -> Result<hir::ClassUnicode, Error> {
742use crate::unicode_tables::script::BY_NAME;
743property_set(BY_NAME, name)
744 .map(hir_class)
745 .ok_or(Error::PropertyValueNotFound)
746 }
747748imp(canonical_name)
749}
750751/// Returns the Unicode HIR class corresponding to the given script extension.
752///
753/// Name canonicalization is assumed to be performed by the caller.
754///
755/// If the given script extension could not be found, or if the script data is
756/// not available, then an error is returned.
757fn script_extension(
758 canonical_name: &'static str,
759) -> Result<hir::ClassUnicode, Error> {
760#[cfg(not(feature = "unicode-script"))]
761fn imp(_: &'static str) -> Result<hir::ClassUnicode, Error> {
762Err(Error::PropertyNotFound)
763 }
764765#[cfg(feature = "unicode-script")]
766fn imp(name: &'static str) -> Result<hir::ClassUnicode, Error> {
767use crate::unicode_tables::script_extension::BY_NAME;
768property_set(BY_NAME, name)
769 .map(hir_class)
770 .ok_or(Error::PropertyValueNotFound)
771 }
772773imp(canonical_name)
774}
775776/// Returns the Unicode HIR class corresponding to the given Unicode boolean
777/// property.
778///
779/// Name canonicalization is assumed to be performed by the caller.
780///
781/// If the given boolean property could not be found, or if the boolean
782/// property data is not available, then an error is returned.
783fn bool_property(
784 canonical_name: &'static str,
785) -> Result<hir::ClassUnicode, Error> {
786#[cfg(not(feature = "unicode-bool"))]
787fn imp(_: &'static str) -> Result<hir::ClassUnicode, Error> {
788Err(Error::PropertyNotFound)
789 }
790791#[cfg(feature = "unicode-bool")]
792fn imp(name: &'static str) -> Result<hir::ClassUnicode, Error> {
793use crate::unicode_tables::property_bool::BY_NAME;
794property_set(BY_NAME, name)
795 .map(hir_class)
796 .ok_or(Error::PropertyNotFound)
797 }
798799match canonical_name {
800"Decimal_Number" => perl_digit(),
801"White_Space" => perl_space(),
802 name => imp(name),
803 }
804}
805806/// Returns the Unicode HIR class corresponding to the given grapheme cluster
807/// break property.
808///
809/// Name canonicalization is assumed to be performed by the caller.
810///
811/// If the given property could not be found, or if the corresponding data is
812/// not available, then an error is returned.
813fn gcb(canonical_name: &'static str) -> Result<hir::ClassUnicode, Error> {
814#[cfg(not(feature = "unicode-segment"))]
815fn imp(_: &'static str) -> Result<hir::ClassUnicode, Error> {
816Err(Error::PropertyNotFound)
817 }
818819#[cfg(feature = "unicode-segment")]
820fn imp(name: &'static str) -> Result<hir::ClassUnicode, Error> {
821use crate::unicode_tables::grapheme_cluster_break::BY_NAME;
822property_set(BY_NAME, name)
823 .map(hir_class)
824 .ok_or(Error::PropertyValueNotFound)
825 }
826827imp(canonical_name)
828}
829830/// Returns the Unicode HIR class corresponding to the given word break
831/// property.
832///
833/// Name canonicalization is assumed to be performed by the caller.
834///
835/// If the given property could not be found, or if the corresponding data is
836/// not available, then an error is returned.
837fn wb(canonical_name: &'static str) -> Result<hir::ClassUnicode, Error> {
838#[cfg(not(feature = "unicode-segment"))]
839fn imp(_: &'static str) -> Result<hir::ClassUnicode, Error> {
840Err(Error::PropertyNotFound)
841 }
842843#[cfg(feature = "unicode-segment")]
844fn imp(name: &'static str) -> Result<hir::ClassUnicode, Error> {
845use crate::unicode_tables::word_break::BY_NAME;
846property_set(BY_NAME, name)
847 .map(hir_class)
848 .ok_or(Error::PropertyValueNotFound)
849 }
850851imp(canonical_name)
852}
853854/// Returns the Unicode HIR class corresponding to the given sentence
855/// break property.
856///
857/// Name canonicalization is assumed to be performed by the caller.
858///
859/// If the given property could not be found, or if the corresponding data is
860/// not available, then an error is returned.
861fn sb(canonical_name: &'static str) -> Result<hir::ClassUnicode, Error> {
862#[cfg(not(feature = "unicode-segment"))]
863fn imp(_: &'static str) -> Result<hir::ClassUnicode, Error> {
864Err(Error::PropertyNotFound)
865 }
866867#[cfg(feature = "unicode-segment")]
868fn imp(name: &'static str) -> Result<hir::ClassUnicode, Error> {
869use crate::unicode_tables::sentence_break::BY_NAME;
870property_set(BY_NAME, name)
871 .map(hir_class)
872 .ok_or(Error::PropertyValueNotFound)
873 }
874875imp(canonical_name)
876}
877878/// Like symbolic_name_normalize_bytes, but operates on a string.
879fn symbolic_name_normalize(x: &str) -> String {
880let mut tmp = x.as_bytes().to_vec();
881let len = symbolic_name_normalize_bytes(&mut tmp).len();
882tmp.truncate(len);
883// This should always succeed because `symbolic_name_normalize_bytes`
884 // guarantees that `&tmp[..len]` is always valid UTF-8.
885 //
886 // N.B. We could avoid the additional UTF-8 check here, but it's unlikely
887 // to be worth skipping the additional safety check. A benchmark must
888 // justify it first.
889String::from_utf8(tmp).unwrap()
890}
/// Normalizes a symbolic name in place following UAX44-LM3 and returns the
/// normalized prefix of `slice`.
///
/// Symbolic names are things like property names and property value
/// aliases. This must not be applied to property string values.
///
/// Normalization removes a leading "is" (in any ASCII case), removes spaces,
/// underscores and hyphens, lowercases ASCII letters and discards every
/// non-ASCII byte. Bytes of `slice` beyond the returned prefix are left in an
/// unspecified state.
///
/// The returned slice is valid UTF-8 for every possible input.
///
/// See: https://unicode.org/reports/tr44/#UAX44-LM3
fn symbolic_name_normalize_bytes(slice: &mut [u8]) -> &mut [u8] {
    // No part of the standard was found that pins down the structure of
    // property names/aliases (unlike character names), so this assumes they
    // are ASCII only and throws away anything that is not.

    // A leading "is" is ignored, whatever its case.
    let has_is_prefix =
        slice.len() >= 2 && slice[..2].eq_ignore_ascii_case(b"is");
    let first_read = if has_is_prefix { 2 } else { 0 };

    // Compact the kept bytes toward the front. The write cursor never gets
    // ahead of the read cursor, so unread input is never overwritten.
    let mut write = 0;
    for read in first_read..slice.len() {
        match slice[read] {
            // Separators carry no meaning in a symbolic name.
            b' ' | b'_' | b'-' => {}
            // VALIDITY ARGUMENT: only ASCII bytes are ever written, which
            // is what makes the returned prefix valid UTF-8.
            byte if byte.is_ascii() => {
                slice[write] = byte.to_ascii_lowercase();
                write += 1;
            }
            // Non-ASCII bytes are dropped.
            _ => {}
        }
    }

    // Special case: 'isc' is the abbreviation of ISO_Comment. Stripping the
    // "is" prefix would reduce it to 'c', which is an alias for the 'Other'
    // general category, so the full abbreviation is restored here. At least
    // three bytes are available because the prefix plus one kept byte were
    // read from the slice.
    if has_is_prefix && write == 1 && slice[0] == b'c' {
        slice[..3].copy_from_slice(b"isc");
        write = 3;
    }
    &mut slice[..write]
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Returns the characters that `SimpleCaseFolder::mapping` reports for
    /// `c`, panicking if the case folder cannot be constructed.
    #[cfg(feature = "unicode-case")]
    fn simple_fold_ok(c: char) -> impl Iterator<Item = char> {
        SimpleCaseFolder::new().unwrap().mapping(c).iter().copied()
    }

    /// Returns what `SimpleCaseFolder::overlaps` reports for the inclusive
    /// range `start..=end`, panicking if the case folder cannot be
    /// constructed.
    #[cfg(feature = "unicode-case")]
    fn contains_case_map(start: char, end: char) -> bool {
        SimpleCaseFolder::new().unwrap().overlaps(start, end)
    }

    #[test]
    #[cfg(feature = "unicode-case")]
    fn simple_fold_k() {
        // U+212A KELVIN SIGN renders identically to ASCII 'K', so it is
        // spelled as an escape to keep the three cases distinguishable.
        let xs: Vec<char> = simple_fold_ok('k').collect();
        assert_eq!(xs, alloc::vec!['K', '\u{212A}']);

        let xs: Vec<char> = simple_fold_ok('K').collect();
        assert_eq!(xs, alloc::vec!['k', '\u{212A}']);

        let xs: Vec<char> = simple_fold_ok('\u{212A}').collect();
        assert_eq!(xs, alloc::vec!['K', 'k']);
    }

    #[test]
    #[cfg(feature = "unicode-case")]
    fn simple_fold_a() {
        let xs: Vec<char> = simple_fold_ok('a').collect();
        assert_eq!(xs, alloc::vec!['A']);

        let xs: Vec<char> = simple_fold_ok('A').collect();
        assert_eq!(xs, alloc::vec!['a']);
    }

    #[test]
    #[cfg(not(feature = "unicode-case"))]
    fn simple_fold_disabled() {
        assert!(SimpleCaseFolder::new().is_err());
    }

    #[test]
    #[cfg(feature = "unicode-case")]
    fn range_contains() {
        // Ranges that include at least one cased character.
        assert!(contains_case_map('A', 'A'));
        assert!(contains_case_map('Z', 'Z'));
        assert!(contains_case_map('A', 'Z'));
        assert!(contains_case_map('@', 'A'));
        assert!(contains_case_map('Z', '['));
        assert!(contains_case_map('☃', 'Ⰰ'));

        // Ranges made up solely of characters without a case mapping.
        assert!(!contains_case_map('[', '['));
        assert!(!contains_case_map('[', '`'));

        assert!(!contains_case_map('☃', '☃'));
    }

    #[test]
    #[cfg(feature = "unicode-gencat")]
    fn regression_466() {
        use super::{CanonicalClassQuery, ClassQuery};

        // The one-letter class `C` must resolve to the `Other` general
        // category.
        let q = ClassQuery::OneLetter('C');
        assert_eq!(
            q.canonicalize().unwrap(),
            CanonicalClassQuery::GeneralCategory("Other")
        );
    }

    #[test]
    fn sym_normalize() {
        let sym_norm = symbolic_name_normalize;

        assert_eq!(sym_norm("Line_Break"), "linebreak");
        assert_eq!(sym_norm("Line-break"), "linebreak");
        assert_eq!(sym_norm("linebreak"), "linebreak");
        assert_eq!(sym_norm("BA"), "ba");
        assert_eq!(sym_norm("ba"), "ba");
        assert_eq!(sym_norm("Greek"), "greek");
        assert_eq!(sym_norm("isGreek"), "greek");
        assert_eq!(sym_norm("IS_Greek"), "greek");
        // 'isc' must survive the removal of the "is" prefix.
        assert_eq!(sym_norm("isc"), "isc");
        assert_eq!(sym_norm("is c"), "isc");
        assert_eq!(sym_norm("is_c"), "isc");
    }

    #[test]
    fn valid_utf8_symbolic() {
        // A byte that is not valid UTF-8 must be dropped from the output.
        let mut x = b"abc\xFFxyz".to_vec();
        let y = symbolic_name_normalize_bytes(&mut x);
        assert_eq!(y, b"abcxyz");
    }
}