icu_normalizer/uts46.rs
1// This file is part of ICU4X. For terms of use, please see the file
2// called LICENSE at the top level of the ICU4X source tree
3// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
//! Bundles the part of UTS 46 that makes sense to implement as a
//! normalization.
//!
//! This is meant to be used as a building block of a UTS 46
//! implementation, such as the `idna` crate.
10
11use crate::ComposingNormalizer;
12use crate::ComposingNormalizerBorrowed;
13use crate::NormalizerNfcV1;
14use crate::NormalizerNfdTablesV1;
15use crate::NormalizerNfkdTablesV1;
16use crate::NormalizerUts46DataV1;
17use icu_provider::DataError;
18use icu_provider::DataProvider;
19
20// Implementation note: Despite merely wrapping a `ComposingNormalizer`,
21// having a `Uts46Mapper` serves two purposes:
22//
23// 1. Denying public access to parts of the `ComposingNormalizer` API
24// that don't work when the data contains markers for ignorables.
25// 2. Providing a place where additional iterator pre-processing or
26// post-processing can take place if needed in the future. (When
27// writing this, it looked like such processing was needed but
28// now isn't needed after all.)
29
30/// A borrowed version of a mapper that knows how to performs the
31/// subsets of UTS 46 processing documented on the methods.
32#[derive(Debug)]
33pub struct Uts46MapperBorrowed<'a> {
34 normalizer: ComposingNormalizerBorrowed<'a>,
35}
36
#[cfg(feature = "compiled_data")]
impl Default for Uts46MapperBorrowed<'static> {
    /// Builds the mapper from compiled data; same as [`Uts46MapperBorrowed::new`].
    fn default() -> Self {
        Uts46MapperBorrowed::new()
    }
}
43
44impl Uts46MapperBorrowed<'static> {
45 /// Cheaply converts a [`Uts46MapperBorrowed<'static>`] into a [`Uts46Mapper`].
46 ///
47 /// Note: Due to branching and indirection, using [`Uts46Mapper`] might inhibit some
48 /// compile-time optimizations that are possible with [`Uts46MapperBorrowed`].
49 pub const fn static_to_owned(self) -> Uts46Mapper {
50 Uts46Mapper {
51 normalizer: self.normalizer.static_to_owned(),
52 }
53 }
54
55 /// Construct with compiled data.
56 #[cfg(feature = "compiled_data")]
57 pub const fn new() -> Self {
58 Uts46MapperBorrowed {
59 normalizer: ComposingNormalizerBorrowed::new_uts46(),
60 }
61 }
62}
63
64impl Uts46MapperBorrowed<'_> {
65 /// Returns `true` iff the canonical combining class of `c` is 9 (Virama).
66 ///
67 /// This method uses the UTS 46 data and does not add a dependency on NFD
68 /// data like `CanonicalCombiningClassMapBorrowed` does.
69 #[inline]
70 pub fn is_virama(&self, c: char) -> bool {
71 let trie_val = self
72 .normalizer
73 .decomposing_normalizer
74 .decompositions
75 .trie
76 .get(c);
77 if crate::trie_value_has_ccc(trie_val) {
78 (trie_val as u8) == 9
79 } else {
80 false
81 }
82 }
83
84 /// Returns an iterator adaptor that turns an `Iterator` over `char`
85 /// into an iterator yielding a `char` sequence that gets the following
86 /// operations from the "Map" and "Normalize" steps of the "Processing"
87 /// section of UTS 46 lazily applied to it:
88 ///
89 /// 1. The `ignored` characters are ignored.
90 /// 2. The `mapped` characters are mapped.
91 /// 3. The `disallowed` characters are replaced with U+FFFD,
92 /// which itself is a disallowed character.
93 /// 4. The `deviation` characters are treated as `mapped` or `valid`
94 /// as appropriate.
95 /// 5. The `disallowed_STD3_valid` characters are treated as allowed.
96 /// 6. The `disallowed_STD3_mapped` characters are treated as
97 /// `mapped`.
98 /// 7. The result is normalized to NFC.
99 ///
100 /// Notably:
101 ///
102 /// * The STD3 or WHATWG ASCII deny list should be implemented as a
103 /// post-processing step.
104 /// * Transitional processing is not performed. Transitional mapping
105 /// would be a pre-processing step, but transitional processing is
106 /// deprecated, and none of Firefox, Safari, or Chrome use it.
107 pub fn map_normalize<'delegate, I: Iterator<Item = char> + 'delegate>(
108 &'delegate self,
109 iter: I,
110 ) -> impl Iterator<Item = char> + 'delegate {
111 self.normalizer
112 .normalize_iter_private(iter, crate::IgnorableBehavior::Ignored)
113 }
114
115 /// Returns an iterator adaptor that turns an `Iterator` over `char`
116 /// into an iterator yielding a `char` sequence that gets the following
117 /// operations from the NFC check and statucs steps of the "Validity
118 /// Criteria" section of UTS 46 lazily applied to it:
119 ///
120 /// 1. The `ignored` characters are treated as `disallowed`.
121 /// 2. The `mapped` characters are mapped.
122 /// 3. The `disallowed` characters are replaced with U+FFFD,
123 /// which itself is a disallowed character.
124 /// 4. The `deviation` characters are treated as `mapped` or `valid`
125 /// as appropriate.
126 /// 5. The `disallowed_STD3_valid` characters are treated as allowed.
127 /// 6. The `disallowed_STD3_mapped` characters are treated as
128 /// `mapped`.
129 /// 7. The result is normalized to NFC.
130 ///
131 /// Notably:
132 ///
133 /// * The STD3 or WHATWG ASCII deny list should be implemented as a
134 /// post-processing step.
135 /// * Transitional processing is not performed. Transitional mapping
136 /// would be a pre-processing step, but transitional processing is
137 /// deprecated, and none of Firefox, Safari, or Chrome use it.
138 /// * The output needs to be compared with input to see if anything
139 /// changed. This check catches failures to adhere to the normalization
140 /// and status requirements. In particular, this comparison results
141 /// in _mapped_ characters resulting in error like "Validity Criteria"
142 /// requires.
143 pub fn normalize_validate<'delegate, I: Iterator<Item = char> + 'delegate>(
144 &'delegate self,
145 iter: I,
146 ) -> impl Iterator<Item = char> + 'delegate {
147 self.normalizer
148 .normalize_iter_private(iter, crate::IgnorableBehavior::ReplacementCharacter)
149 }
150}
151
152/// A mapper that knows how to performs the subsets of UTS 46 processing
153/// documented on the methods.
154#[derive(Debug)]
155pub struct Uts46Mapper {
156 normalizer: ComposingNormalizer,
157}
158
#[cfg(feature = "compiled_data")]
impl Default for Uts46Mapper {
    /// Builds the compiled-data borrowed mapper and converts it into its
    /// owned counterpart.
    fn default() -> Self {
        Uts46MapperBorrowed::new().static_to_owned()
    }
}
165
166impl Uts46Mapper {
167 /// Constructs a borrowed version of this type for more efficient querying.
168 pub fn as_borrowed(&self) -> Uts46MapperBorrowed<'_> {
169 Uts46MapperBorrowed {
170 normalizer: self.normalizer.as_borrowed(),
171 }
172 }
173
174 /// Construct with compiled data.
175 #[cfg(feature = "compiled_data")]
176 #[expect(clippy::new_ret_no_self)]
177 pub const fn new() -> Uts46MapperBorrowed<'static> {
178 Uts46MapperBorrowed::new()
179 }
180
181 /// Construct with provider.
182 #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new)]
183 pub fn try_new<D>(provider: &D) -> Result<Self, DataError>
184 where
185 D: DataProvider<NormalizerUts46DataV1>
186 + DataProvider<NormalizerNfdTablesV1>
187 + DataProvider<NormalizerNfkdTablesV1>
188 // UTS 46 tables merged into NormalizerNfkdTablesV1
189 + DataProvider<NormalizerNfcV1>
190 + ?Sized,
191 {
192 let normalizer = ComposingNormalizer::try_new_uts46_unstable(provider)?;
193
194 Ok(Uts46Mapper { normalizer })
195 }
196}