// icu_normalizer/properties.rs (path note: a bare path line does not parse as Rust)
1// This file is part of ICU4X. For terms of use, please see the file
2// called LICENSE at the top level of the ICU4X source tree
3// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
5//! Access to the Unicode properties or property-based operations that
6//! are required for NFC and NFD.
7//!
8//! Applications should generally use the full normalizers that are
9//! provided at the top level of this crate. However, the APIs in this
10//! module are provided for callers such as HarfBuzz that specifically
11//! want access to the raw canonical composition operation e.g. for use in a
12//! glyph-availability-guided custom normalizer.
13
14use crate::char_from_u16;
15use crate::error::NormalizerError;
16use crate::in_inclusive_range;
17use crate::provider::CanonicalCompositionsV1Marker;
18use crate::provider::CanonicalDecompositionDataV1Marker;
19use crate::provider::CanonicalDecompositionTablesV1Marker;
20use crate::provider::NonRecursiveDecompositionSupplementV1Marker;
21use crate::trie_value_has_ccc;
22use crate::trie_value_indicates_special_non_starter_decomposition;
23use crate::BACKWARD_COMBINING_STARTER_MARKER;
24use crate::FDFA_MARKER;
25use crate::HANGUL_L_BASE;
26use crate::HANGUL_N_COUNT;
27use crate::HANGUL_S_BASE;
28use crate::HANGUL_S_COUNT;
29use crate::HANGUL_T_BASE;
30use crate::HANGUL_T_COUNT;
31use crate::HANGUL_V_BASE;
32use crate::NON_ROUND_TRIP_MARKER;
33use crate::SPECIAL_NON_STARTER_DECOMPOSITION_MARKER_U16;
// want access to the underlying properties e.g. for use in a
// glyph-availability-guided custom normalizer.
// NOTE(review): demoted from `///` to `//`: doc comments attached to a `use`
// item are unused (`unused_doc_comments` lint) and this text duplicates the
// doc comments on the public items below.
36use icu_properties::CanonicalCombiningClass;
37use icu_provider::prelude::*;
38
39/// The raw canonical composition operation.
40///
41/// Callers should generally use `ComposingNormalizer` instead of this API.
42/// However, this API is provided for callers such as HarfBuzz that specifically
43/// want access to the raw canonical composition operation e.g. for use in a
44/// glyph-availability-guided custom normalizer.
45#[derive(Debug)]
46pub struct CanonicalComposition {
47 canonical_compositions: DataPayload<CanonicalCompositionsV1Marker>,
48}
49
50#[cfg(feature = "compiled_data")]
51impl Default for CanonicalComposition {
52 fn default() -> Self {
53 Self::new()
54 }
55}
56
57impl CanonicalComposition {
58 /// Performs canonical composition (including Hangul) on a pair of
59 /// characters or returns `None` if these characters don't compose.
60 /// Composition exclusions are taken into account.
61 ///
62 /// # Examples
63 ///
64 /// ```
65 /// let comp = icu::normalizer::properties::CanonicalComposition::new();
66 ///
67 /// assert_eq!(comp.compose('a', 'b'), None); // Just two non-composing starters
68 /// assert_eq!(comp.compose('a', '\u{0308}'), Some('ä'));
69 /// assert_eq!(comp.compose('ẹ', '\u{0302}'), Some('ệ'));
70 /// assert_eq!(comp.compose('𝅗', '𝅥'), None); // Composition exclusion
71 /// assert_eq!(comp.compose('ে', 'া'), Some('ো')); // Second is starter
72 /// assert_eq!(comp.compose('ᄀ', 'ᅡ'), Some('가')); // Hangul LV
73 /// assert_eq!(comp.compose('가', 'ᆨ'), Some('각')); // Hangul LVT
74 /// ```
75 #[inline(always)]
76 pub fn compose(&self, starter: char, second: char) -> Option<char> {
77 crate::compose(
78 self.canonical_compositions
79 .get()
80 .canonical_compositions
81 .iter(),
82 starter,
83 second,
84 )
85 }
86
87 /// Constructs a new `CanonicalComposition` using compiled data.
88 ///
89 /// ✨ *Enabled with the `compiled_data` Cargo feature.*
90 ///
91 /// [📚 Help choosing a constructor](icu_provider::constructors)
92 #[cfg(feature = "compiled_data")]
93 pub const fn new() -> Self {
94 Self {
95 canonical_compositions: DataPayload::from_static_ref(
96 crate::provider::Baked::SINGLETON_NORMALIZER_COMP_V1,
97 ),
98 }
99 }
100
101 icu_provider::gen_any_buffer_data_constructors!(locale: skip, options: skip, error: NormalizerError,
102 #[cfg(skip)]
103 functions: [
104 new,
105 try_new_with_any_provider,
106 try_new_with_buffer_provider,
107 try_new_unstable,
108 Self,
109 ]
110 );
111
112 #[doc = icu_provider::gen_any_buffer_unstable_docs!(UNSTABLE, Self::new)]
113 pub fn try_new_unstable<D>(provider: &D) -> Result<Self, NormalizerError>
114 where
115 D: DataProvider<CanonicalCompositionsV1Marker> + ?Sized,
116 {
117 let canonical_compositions: DataPayload<CanonicalCompositionsV1Marker> =
118 provider.load(Default::default())?.take_payload()?;
119 Ok(CanonicalComposition {
120 canonical_compositions,
121 })
122 }
123}
124
125/// The outcome of non-recursive canonical decomposition of a character.
126#[allow(clippy::exhaustive_enums)]
127#[derive(Debug, PartialEq, Eq)]
128pub enum Decomposed {
129 /// The character is its own canonical decomposition.
130 Default,
131 /// The character decomposes to a single different character.
132 Singleton(char),
133 /// The character decomposes to two characters.
134 Expansion(char, char),
135}
136
137/// The raw (non-recursive) canonical decomposition operation.
138///
139/// Callers should generally use `DecomposingNormalizer` instead of this API.
140/// However, this API is provided for callers such as HarfBuzz that specifically
141/// want access to non-recursive canonical decomposition e.g. for use in a
142/// glyph-availability-guided custom normalizer.
143#[derive(Debug)]
144pub struct CanonicalDecomposition {
145 decompositions: DataPayload<CanonicalDecompositionDataV1Marker>,
146 tables: DataPayload<CanonicalDecompositionTablesV1Marker>,
147 non_recursive: DataPayload<NonRecursiveDecompositionSupplementV1Marker>,
148}
149
150#[cfg(feature = "compiled_data")]
151impl Default for CanonicalDecomposition {
152 fn default() -> Self {
153 Self::new()
154 }
155}
156
157impl CanonicalDecomposition {
158 /// Performs non-recursive canonical decomposition (including for Hangul).
159 ///
160 /// ```
161 /// use icu::normalizer::properties::Decomposed;
162 /// let decomp = icu::normalizer::properties::CanonicalDecomposition::new();
163 ///
164 /// assert_eq!(decomp.decompose('e'), Decomposed::Default);
165 /// assert_eq!(
166 /// decomp.decompose('ệ'),
167 /// Decomposed::Expansion('ẹ', '\u{0302}')
168 /// );
169 /// assert_eq!(decomp.decompose('각'), Decomposed::Expansion('가', 'ᆨ'));
170 /// assert_eq!(decomp.decompose('\u{212B}'), Decomposed::Singleton('Å')); // ANGSTROM SIGN
171 /// assert_eq!(decomp.decompose('\u{2126}'), Decomposed::Singleton('Ω')); // OHM SIGN
172 /// assert_eq!(decomp.decompose('\u{1F71}'), Decomposed::Singleton('ά')); // oxia
173 /// ```
174 #[inline]
175 pub fn decompose(&self, c: char) -> Decomposed {
176 let lvt = u32::from(c).wrapping_sub(HANGUL_S_BASE);
177 if lvt >= HANGUL_S_COUNT {
178 return self.decompose_non_hangul(c);
179 }
180 let t = lvt % HANGUL_T_COUNT;
181 if t == 0 {
182 let l = lvt / HANGUL_N_COUNT;
183 let v = (lvt % HANGUL_N_COUNT) / HANGUL_T_COUNT;
184 // Safe because values known to be in range
185 return Decomposed::Expansion(
186 unsafe { char::from_u32_unchecked(HANGUL_L_BASE + l) },
187 unsafe { char::from_u32_unchecked(HANGUL_V_BASE + v) },
188 );
189 }
190 let lv = lvt - t;
191 // Safe because values known to be in range
192 Decomposed::Expansion(
193 unsafe { char::from_u32_unchecked(HANGUL_S_BASE + lv) },
194 unsafe { char::from_u32_unchecked(HANGUL_T_BASE + t) },
195 )
196 }
197
198 /// Performs non-recursive canonical decomposition except Hangul syllables
199 /// are reported as `Decomposed::Default`.
200 #[inline(always)]
201 fn decompose_non_hangul(&self, c: char) -> Decomposed {
202 let decomposition = self.decompositions.get().trie.get(c);
203 if decomposition <= BACKWARD_COMBINING_STARTER_MARKER {
204 return Decomposed::Default;
205 }
206 // The loop is only broken out of as goto forward
207 #[allow(clippy::never_loop)]
208 loop {
209 let trail_or_complex = (decomposition >> 16) as u16;
210 let lead = decomposition as u16;
211 if lead > NON_ROUND_TRIP_MARKER && trail_or_complex != 0 {
212 // Decomposition into two BMP characters: starter and non-starter
213 if in_inclusive_range(c, '\u{1F71}', '\u{1FFB}') {
214 // Look in the other trie due to oxia singleton
215 // mappings to corresponding character with tonos.
216 break;
217 }
218 return Decomposed::Expansion(char_from_u16(lead), char_from_u16(trail_or_complex));
219 }
220 if lead > NON_ROUND_TRIP_MARKER {
221 // Decomposition into one BMP character or non-starter
222 debug_assert_ne!(
223 lead, FDFA_MARKER,
224 "How come we got the U+FDFA NFKD marker here?"
225 );
226 if lead == SPECIAL_NON_STARTER_DECOMPOSITION_MARKER_U16 {
227 // Non-starter
228 if !in_inclusive_range(c, '\u{0340}', '\u{0F81}') {
229 return Decomposed::Default;
230 }
231 return match c {
232 '\u{0340}' => {
233 // COMBINING GRAVE TONE MARK
234 Decomposed::Singleton('\u{0300}')
235 }
236 '\u{0341}' => {
237 // COMBINING ACUTE TONE MARK
238 Decomposed::Singleton('\u{0301}')
239 }
240 '\u{0343}' => {
241 // COMBINING GREEK KORONIS
242 Decomposed::Singleton('\u{0313}')
243 }
244 '\u{0344}' => {
245 // COMBINING GREEK DIALYTIKA TONOS
246 Decomposed::Expansion('\u{0308}', '\u{0301}')
247 }
248 '\u{0F73}' => {
249 // TIBETAN VOWEL SIGN II
250 Decomposed::Expansion('\u{0F71}', '\u{0F72}')
251 }
252 '\u{0F75}' => {
253 // TIBETAN VOWEL SIGN UU
254 Decomposed::Expansion('\u{0F71}', '\u{0F74}')
255 }
256 '\u{0F81}' => {
257 // TIBETAN VOWEL SIGN REVERSED II
258 Decomposed::Expansion('\u{0F71}', '\u{0F80}')
259 }
260 _ => Decomposed::Default,
261 };
262 }
263 return Decomposed::Singleton(char_from_u16(lead));
264 }
265 // The recursive decomposition of ANGSTROM SIGN is in the complex
266 // decomposition structure to avoid a branch in `potential_passthrough`
267 // for the BMP case.
268 if c == '\u{212B}' {
269 // ANGSTROM SIGN
270 return Decomposed::Singleton('\u{00C5}');
271 }
272 // Complex decomposition
273 // Format for 16-bit value:
274 // 15..13: length minus two for 16-bit case and length minus one for
275 // the 32-bit case. Length 8 needs to fit in three bits in
276 // the 16-bit case, and this way the value is future-proofed
277 // up to 9 in the 16-bit case. Zero is unused and length one
278 // in the 16-bit case goes directly into the trie.
279 // 12: 1 if all trailing characters are guaranteed non-starters,
280 // 0 if no guarantees about non-starterness.
281 // Note: The bit choice is this way around to allow for
282 // dynamically falling back to not having this but instead
283 // having one more bit for length by merely choosing
284 // different masks.
285 // 11..0: Start offset in storage. The offset is to the logical
286 // sequence of scalars16, scalars32, supplementary_scalars16,
287 // supplementary_scalars32.
288 let offset = usize::from(trail_or_complex & 0xFFF);
289 let tables = self.tables.get();
290 if offset < tables.scalars16.len() {
291 if usize::from(trail_or_complex >> 13) != 0 {
292 // i.e. logical len isn't 2
293 break;
294 }
295 if let Some(first) = tables.scalars16.get(offset) {
296 if let Some(second) = tables.scalars16.get(offset + 1) {
297 // Two BMP starters
298 return Decomposed::Expansion(char_from_u16(first), char_from_u16(second));
299 }
300 }
301 // GIGO case
302 debug_assert!(false);
303 return Decomposed::Default;
304 }
305 let len = usize::from(trail_or_complex >> 13) + 1;
306 if len > 2 {
307 break;
308 }
309 let offset24 = offset - tables.scalars16.len();
310 if let Some(first_c) = tables.scalars24.get(offset24) {
311 if len == 1 {
312 if c != first_c {
313 return Decomposed::Singleton(first_c);
314 } else {
315 // Singleton representation used to avoid
316 // NFC passthrough of characters that combine
317 // with starters that can occur as the first
318 // character of an expansion decomposition.
319 // See section 5 of
320 // https://www.unicode.org/L2/L2024/24009-utc178-properties-recs.pdf
321 return Decomposed::Default;
322 }
323 }
324 if let Some(second_c) = tables.scalars24.get(offset24 + 1) {
325 return Decomposed::Expansion(first_c, second_c);
326 }
327 }
328 // GIGO case
329 debug_assert!(false);
330 return Decomposed::Default;
331 }
332 let non_recursive = self.non_recursive.get();
333 let non_recursive_decomposition = non_recursive.trie.get(c);
334 if non_recursive_decomposition == 0 {
335 // GIGO case
336 debug_assert!(false);
337 return Decomposed::Default;
338 }
339 let trail_or_complex = (non_recursive_decomposition >> 16) as u16;
340 let lead = non_recursive_decomposition as u16;
341 if lead != 0 && trail_or_complex != 0 {
342 // Decomposition into two BMP characters
343 return Decomposed::Expansion(char_from_u16(lead), char_from_u16(trail_or_complex));
344 }
345 if lead != 0 {
346 // Decomposition into one BMP character
347 return Decomposed::Singleton(char_from_u16(lead));
348 }
349 // Decomposition into two non-BMP characters
350 // Low is offset into a table plus one to keep it non-zero.
351 let offset = usize::from(trail_or_complex - 1);
352 if let Some(first) = non_recursive.scalars24.get(offset) {
353 if let Some(second) = non_recursive.scalars24.get(offset + 1) {
354 return Decomposed::Expansion(first, second);
355 }
356 }
357 // GIGO case
358 debug_assert!(false);
359 Decomposed::Default
360 }
361
362 /// Construct from compiled data.
363 ///
364 /// ✨ *Enabled with the `compiled_data` Cargo feature.*
365 ///
366 /// [📚 Help choosing a constructor](icu_provider::constructors)
367 #[cfg(feature = "compiled_data")]
368 pub const fn new() -> Self {
369 const _: () = assert!(
370 crate::provider::Baked::SINGLETON_NORMALIZER_NFDEX_V1
371 .scalars16
372 .const_len()
373 + crate::provider::Baked::SINGLETON_NORMALIZER_NFDEX_V1
374 .scalars24
375 .const_len()
376 <= 0xFFF,
377 "NormalizerError::FutureExtension"
378 );
379
380 Self {
381 decompositions: DataPayload::from_static_ref(
382 crate::provider::Baked::SINGLETON_NORMALIZER_NFD_V1,
383 ),
384 tables: DataPayload::from_static_ref(
385 crate::provider::Baked::SINGLETON_NORMALIZER_NFDEX_V1,
386 ),
387 non_recursive: DataPayload::from_static_ref(
388 crate::provider::Baked::SINGLETON_NORMALIZER_DECOMP_V1,
389 ),
390 }
391 }
392
393 icu_provider::gen_any_buffer_data_constructors!(locale: skip, options: skip, error: NormalizerError,
394 #[cfg(skip)]
395 functions: [
396 new,
397 try_new_with_any_provider,
398 try_new_with_buffer_provider,
399 try_new_unstable,
400 Self,
401 ]
402 );
403
404 #[doc = icu_provider::gen_any_buffer_unstable_docs!(UNSTABLE, Self::new)]
405 pub fn try_new_unstable<D>(provider: &D) -> Result<Self, NormalizerError>
406 where
407 D: DataProvider<CanonicalDecompositionDataV1Marker>
408 + DataProvider<CanonicalDecompositionTablesV1Marker>
409 + DataProvider<NonRecursiveDecompositionSupplementV1Marker>
410 + ?Sized,
411 {
412 let decompositions: DataPayload<CanonicalDecompositionDataV1Marker> =
413 provider.load(Default::default())?.take_payload()?;
414 let tables: DataPayload<CanonicalDecompositionTablesV1Marker> =
415 provider.load(Default::default())?.take_payload()?;
416
417 if tables.get().scalars16.len() + tables.get().scalars24.len() > 0xFFF {
418 // The data is from a future where there exists a normalization flavor whose
419 // complex decompositions take more than 0xFFF but fewer than 0x1FFF code points
420 // of space. If a good use case from such a decomposition flavor arises, we can
421 // dynamically change the bit masks so that the length mask becomes 0x1FFF instead
422 // of 0xFFF and the all-non-starters mask becomes 0 instead of 0x1000. However,
423 // since for now the masks are hard-coded, error out.
424 return Err(NormalizerError::FutureExtension);
425 }
426
427 let non_recursive: DataPayload<NonRecursiveDecompositionSupplementV1Marker> =
428 provider.load(Default::default())?.take_payload()?;
429
430 Ok(CanonicalDecomposition {
431 decompositions,
432 tables,
433 non_recursive,
434 })
435 }
436}
437
438/// Lookup of the Canonical_Combining_Class Unicode property.
439///
440/// # Example
441///
442/// ```
443/// use icu::properties::CanonicalCombiningClass;
444/// use icu::normalizer::properties::CanonicalCombiningClassMap;
445///
446/// let map = CanonicalCombiningClassMap::new();
447/// assert_eq!(map.get('a'), CanonicalCombiningClass::NotReordered); // U+0061: LATIN SMALL LETTER A
448/// assert_eq!(map.get32(0x0301), CanonicalCombiningClass::Above); // U+0301: COMBINING ACUTE ACCENT
449/// ```
450#[derive(Debug)]
451pub struct CanonicalCombiningClassMap {
452 /// The data trie
453 decompositions: DataPayload<CanonicalDecompositionDataV1Marker>,
454}
455
456#[cfg(feature = "compiled_data")]
457impl Default for CanonicalCombiningClassMap {
458 fn default() -> Self {
459 Self::new()
460 }
461}
462
463impl CanonicalCombiningClassMap {
464 /// Look up the canonical combining class for a scalar value
465 #[inline(always)]
466 pub fn get(&self, c: char) -> CanonicalCombiningClass {
467 self.get32(u32::from(c))
468 }
469
470 /// Look up the canonical combining class for a scalar value
471 /// represented as `u32`. If the argument is outside the scalar
472 /// value range, `CanonicalCombiningClass::NotReordered` is returned.
473 pub fn get32(&self, c: u32) -> CanonicalCombiningClass {
474 let trie_value = self.decompositions.get().trie.get32(c);
475 if trie_value_has_ccc(trie_value) {
476 CanonicalCombiningClass(trie_value as u8)
477 } else if trie_value_indicates_special_non_starter_decomposition(trie_value) {
478 match c {
479 0x0340 | 0x0341 | 0x0343 | 0x0344 => CanonicalCombiningClass::Above,
480 _ => CanonicalCombiningClass::NotReordered,
481 }
482 } else {
483 CanonicalCombiningClass::NotReordered
484 }
485 }
486
487 /// Construct from compiled data.
488 ///
489 /// ✨ *Enabled with the `compiled_data` Cargo feature.*
490 ///
491 /// [📚 Help choosing a constructor](icu_provider::constructors)
492 #[cfg(feature = "compiled_data")]
493 pub const fn new() -> Self {
494 CanonicalCombiningClassMap {
495 decompositions: DataPayload::from_static_ref(
496 crate::provider::Baked::SINGLETON_NORMALIZER_NFD_V1,
497 ),
498 }
499 }
500
501 icu_provider::gen_any_buffer_data_constructors!(locale: skip, options: skip, error: NormalizerError,
502 #[cfg(skip)]
503 functions: [
504 new,
505 try_new_with_any_provider,
506 try_new_with_buffer_provider,
507 try_new_unstable,
508 Self,
509 ]);
510
511 #[doc = icu_provider::gen_any_buffer_unstable_docs!(UNSTABLE, Self::new)]
512 pub fn try_new_unstable<D>(provider: &D) -> Result<Self, NormalizerError>
513 where
514 D: DataProvider<CanonicalDecompositionDataV1Marker> + ?Sized,
515 {
516 let decompositions: DataPayload<CanonicalDecompositionDataV1Marker> =
517 provider.load(Default::default())?.take_payload()?;
518 Ok(CanonicalCombiningClassMap { decompositions })
519 }
520}