Skip to main content

icu_normalizer/
uts46.rs

1// This file is part of ICU4X. For terms of use, please see the file
2// called LICENSE at the top level of the ICU4X source tree
3// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
//! Bundles the part of UTS 46 that makes sense to implement as a
//! normalization.
//!
//! This is meant to be used as a building block of a UTS 46
//! implementation, such as the `idna` crate.
10
11use crate::ComposingNormalizer;
12use crate::ComposingNormalizerBorrowed;
13use crate::NormalizerNfcV1;
14use crate::NormalizerNfdTablesV1;
15use crate::NormalizerNfkdTablesV1;
16use crate::NormalizerUts46DataV1;
17use icu_provider::DataError;
18use icu_provider::DataProvider;
19
// Implementation note: Despite merely wrapping a `ComposingNormalizer`,
// having a `Uts46Mapper` serves two purposes:
//
// 1. Denying public access to parts of the `ComposingNormalizer` API
//    that don't work when the data contains markers for ignorables.
// 2. Providing a place where additional iterator pre-processing or
//    post-processing can take place if needed in the future. (When
//    writing this, it looked like such processing was needed but
//    now isn't needed after all.)
29
30/// A borrowed version of a mapper that knows how to performs the
31/// subsets of UTS 46 processing documented on the methods.
32#[derive(#[automatically_derived]
impl<'a> ::core::fmt::Debug for Uts46MapperBorrowed<'a> {
    #[inline]
    fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result {
        ::core::fmt::Formatter::debug_struct_field1_finish(f,
            "Uts46MapperBorrowed", "normalizer", &&self.normalizer)
    }
}Debug)]
33pub struct Uts46MapperBorrowed<'a> {
34    normalizer: ComposingNormalizerBorrowed<'a>,
35}
36
#[cfg(feature = "compiled_data")]
impl Default for Uts46MapperBorrowed<'static> {
    /// Equivalent to [`Uts46MapperBorrowed::new`]: builds the mapper from
    /// compiled data.
    fn default() -> Self {
        Uts46MapperBorrowed::new()
    }
}
43
44impl Uts46MapperBorrowed<'static> {
45    /// Cheaply converts a [`Uts46MapperBorrowed<'static>`] into a [`Uts46Mapper`].
46    ///
47    /// Note: Due to branching and indirection, using [`Uts46Mapper`] might inhibit some
48    /// compile-time optimizations that are possible with [`Uts46MapperBorrowed`].
49    pub const fn static_to_owned(self) -> Uts46Mapper {
50        Uts46Mapper {
51            normalizer: self.normalizer.static_to_owned(),
52        }
53    }
54
55    /// Construct with compiled data.
56    #[cfg(feature = "compiled_data")]
57    pub const fn new() -> Self {
58        Uts46MapperBorrowed {
59            normalizer: ComposingNormalizerBorrowed::new_uts46(),
60        }
61    }
62}
63
64impl Uts46MapperBorrowed<'_> {
65    /// Returns `true` iff the canonical combining class of `c` is 9 (Virama).
66    ///
67    /// This method uses the UTS 46 data and does not add a dependency on NFD
68    /// data like `CanonicalCombiningClassMapBorrowed` does.
69    #[inline]
70    pub fn is_virama(&self, c: char) -> bool {
71        let trie_val = self
72            .normalizer
73            .decomposing_normalizer
74            .decompositions
75            .trie
76            .get(c);
77        if crate::trie_value_has_ccc(trie_val) {
78            (trie_val as u8) == 9
79        } else {
80            false
81        }
82    }
83
84    /// Returns an iterator adaptor that turns an `Iterator` over `char`
85    /// into an iterator yielding a `char` sequence that gets the following
86    /// operations from the "Map" and "Normalize" steps of the "Processing"
87    /// section of UTS 46 lazily applied to it:
88    ///
89    /// 1. The `ignored` characters are ignored.
90    /// 2. The `mapped` characters are mapped.
91    /// 3. The `disallowed` characters are replaced with U+FFFD,
92    ///    which itself is a disallowed character.
93    /// 4. The `deviation` characters are treated as `mapped` or `valid`
94    ///    as appropriate.
95    /// 5. The `disallowed_STD3_valid` characters are treated as allowed.
96    /// 6. The `disallowed_STD3_mapped` characters are treated as
97    ///    `mapped`.
98    /// 7. The result is normalized to NFC.
99    ///
100    /// Notably:
101    ///
102    /// * The STD3 or WHATWG ASCII deny list should be implemented as a
103    ///   post-processing step.
104    /// * Transitional processing is not performed. Transitional mapping
105    ///   would be a pre-processing step, but transitional processing is
106    ///   deprecated, and none of Firefox, Safari, or Chrome use it.
107    pub fn map_normalize<'delegate, I: Iterator<Item = char> + 'delegate>(
108        &'delegate self,
109        iter: I,
110    ) -> impl Iterator<Item = char> + 'delegate {
111        self.normalizer
112            .normalize_iter_private(iter, crate::IgnorableBehavior::Ignored)
113    }
114
115    /// Returns an iterator adaptor that turns an `Iterator` over `char`
116    /// into an iterator yielding a `char` sequence that gets the following
117    /// operations from the NFC check and statucs steps of the "Validity
118    /// Criteria" section of UTS 46 lazily applied to it:
119    ///
120    /// 1. The `ignored` characters are treated as `disallowed`.
121    /// 2. The `mapped` characters are mapped.
122    /// 3. The `disallowed` characters are replaced with U+FFFD,
123    ///    which itself is a disallowed character.
124    /// 4. The `deviation` characters are treated as `mapped` or `valid`
125    ///    as appropriate.
126    /// 5. The `disallowed_STD3_valid` characters are treated as allowed.
127    /// 6. The `disallowed_STD3_mapped` characters are treated as
128    ///    `mapped`.
129    /// 7. The result is normalized to NFC.
130    ///
131    /// Notably:
132    ///
133    /// * The STD3 or WHATWG ASCII deny list should be implemented as a
134    ///   post-processing step.
135    /// * Transitional processing is not performed. Transitional mapping
136    ///   would be a pre-processing step, but transitional processing is
137    ///   deprecated, and none of Firefox, Safari, or Chrome use it.
138    /// * The output needs to be compared with input to see if anything
139    ///   changed. This check catches failures to adhere to the normalization
140    ///   and status requirements. In particular, this comparison results
141    ///   in _mapped_ characters resulting in error like "Validity Criteria"
142    ///   requires.
143    pub fn normalize_validate<'delegate, I: Iterator<Item = char> + 'delegate>(
144        &'delegate self,
145        iter: I,
146    ) -> impl Iterator<Item = char> + 'delegate {
147        self.normalizer
148            .normalize_iter_private(iter, crate::IgnorableBehavior::ReplacementCharacter)
149    }
150}
151
152/// A mapper that knows how to performs the subsets of UTS 46 processing
153/// documented on the methods.
154#[derive(#[automatically_derived]
impl ::core::fmt::Debug for Uts46Mapper {
    #[inline]
    fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result {
        ::core::fmt::Formatter::debug_struct_field1_finish(f, "Uts46Mapper",
            "normalizer", &&self.normalizer)
    }
}Debug)]
155pub struct Uts46Mapper {
156    normalizer: ComposingNormalizer,
157}
158
#[cfg(feature = "compiled_data")]
impl Default for Uts46Mapper {
    /// Builds the compiled-data borrowed mapper and converts it into the
    /// owned form.
    fn default() -> Self {
        let borrowed = Self::new();
        borrowed.static_to_owned()
    }
}
165
166impl Uts46Mapper {
167    /// Constructs a borrowed version of this type for more efficient querying.
168    pub fn as_borrowed(&self) -> Uts46MapperBorrowed<'_> {
169        Uts46MapperBorrowed {
170            normalizer: self.normalizer.as_borrowed(),
171        }
172    }
173
174    /// Construct with compiled data.
175    #[cfg(feature = "compiled_data")]
176    #[expect(clippy::new_ret_no_self)]
177    pub const fn new() -> Uts46MapperBorrowed<'static> {
178        Uts46MapperBorrowed::new()
179    }
180
181    /// Construct with provider.
182    #[doc = "A version of [`Self::new`] that uses custom data provided by a [`DataProvider`].\n\n[\u{1f4da} Help choosing a constructor](icu_provider::constructors)\n\n<div class=\"stab unstable\">\u{26a0}\u{fe0f} The bounds on <tt>provider</tt> may change over time, including in SemVer minor releases.</div>"icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new)]
183    pub fn try_new<D>(provider: &D) -> Result<Self, DataError>
184    where
185        D: DataProvider<NormalizerUts46DataV1>
186            + DataProvider<NormalizerNfdTablesV1>
187            + DataProvider<NormalizerNfkdTablesV1>
188            // UTS 46 tables merged into NormalizerNfkdTablesV1
189            + DataProvider<NormalizerNfcV1>
190            + ?Sized,
191    {
192        let normalizer = ComposingNormalizer::try_new_uts46_unstable(provider)?;
193
194        Ok(Uts46Mapper { normalizer })
195    }
196}