icu_normalizer/properties.rs

// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).

//! Access to the Unicode properties or property-based operations that
//! are required for NFC and NFD.
//!
//! Applications should generally use the full normalizers that are
//! provided at the top level of this crate. However, the APIs in this
//! module are provided for callers such as HarfBuzz that specifically
//! want access to the raw canonical composition operation e.g. for use in a
//! glyph-availability-guided custom normalizer.
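//!
//! For example, a minimal usage sketch of the raw composition operation
//! (this requires the `compiled_data` Cargo feature):
//!
//! ```
//! let comp = icu::normalizer::properties::CanonicalComposition::new();
//! assert_eq!(comp.compose('a', '\u{0308}'), Some('ä'));
//! ```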

use crate::char_from_u16;
use crate::error::NormalizerError;
use crate::in_inclusive_range;
use crate::provider::CanonicalCompositionsV1Marker;
use crate::provider::CanonicalDecompositionDataV1Marker;
use crate::provider::CanonicalDecompositionTablesV1Marker;
use crate::provider::NonRecursiveDecompositionSupplementV1Marker;
use crate::trie_value_has_ccc;
use crate::trie_value_indicates_special_non_starter_decomposition;
use crate::BACKWARD_COMBINING_STARTER_MARKER;
use crate::FDFA_MARKER;
use crate::HANGUL_L_BASE;
use crate::HANGUL_N_COUNT;
use crate::HANGUL_S_BASE;
use crate::HANGUL_S_COUNT;
use crate::HANGUL_T_BASE;
use crate::HANGUL_T_COUNT;
use crate::HANGUL_V_BASE;
use crate::NON_ROUND_TRIP_MARKER;
use crate::SPECIAL_NON_STARTER_DECOMPOSITION_MARKER_U16;
use icu_properties::CanonicalCombiningClass;
use icu_provider::prelude::*;

/// The raw canonical composition operation.
///
/// Callers should generally use `ComposingNormalizer` instead of this API.
/// However, this API is provided for callers such as HarfBuzz that specifically
/// want access to the raw canonical composition operation e.g. for use in a
/// glyph-availability-guided custom normalizer.
#[derive(Debug)]
pub struct CanonicalComposition {
    canonical_compositions: DataPayload<CanonicalCompositionsV1Marker>,
}

#[cfg(feature = "compiled_data")]
impl Default for CanonicalComposition {
    fn default() -> Self {
        Self::new()
    }
}

impl CanonicalComposition {
    /// Performs canonical composition (including Hangul) on a pair of
    /// characters or returns `None` if these characters don't compose.
    /// Composition exclusions are taken into account.
    ///
    /// # Examples
    ///
    /// ```
    /// let comp = icu::normalizer::properties::CanonicalComposition::new();
    ///
    /// assert_eq!(comp.compose('a', 'b'), None); // Just two non-composing starters
    /// assert_eq!(comp.compose('a', '\u{0308}'), Some('ä'));
    /// assert_eq!(comp.compose('ẹ', '\u{0302}'), Some('ệ'));
    /// assert_eq!(comp.compose('𝅗', '𝅥'), None); // Composition exclusion
    /// assert_eq!(comp.compose('ে', 'া'), Some('ো')); // Second is starter
    /// assert_eq!(comp.compose('ᄀ', 'ᅡ'), Some('가')); // Hangul LV
    /// assert_eq!(comp.compose('가', 'ᆨ'), Some('각')); // Hangul LVT
    /// ```
    #[inline(always)]
    pub fn compose(&self, starter: char, second: char) -> Option<char> {
        crate::compose(
            self.canonical_compositions
                .get()
                .canonical_compositions
                .iter(),
            starter,
            second,
        )
    }

    /// Constructs a new `CanonicalComposition` using compiled data.
    ///
    /// ✨ *Enabled with the `compiled_data` Cargo feature.*
    ///
    /// [📚 Help choosing a constructor](icu_provider::constructors)
    #[cfg(feature = "compiled_data")]
    pub const fn new() -> Self {
        Self {
            canonical_compositions: DataPayload::from_static_ref(
                crate::provider::Baked::SINGLETON_NORMALIZER_COMP_V1,
            ),
        }
    }

    icu_provider::gen_any_buffer_data_constructors!(locale: skip, options: skip, error: NormalizerError,
        #[cfg(skip)]
        functions: [
            new,
            try_new_with_any_provider,
            try_new_with_buffer_provider,
            try_new_unstable,
            Self,
        ]
    );

    #[doc = icu_provider::gen_any_buffer_unstable_docs!(UNSTABLE, Self::new)]
    pub fn try_new_unstable<D>(provider: &D) -> Result<Self, NormalizerError>
    where
        D: DataProvider<CanonicalCompositionsV1Marker> + ?Sized,
    {
        let canonical_compositions: DataPayload<CanonicalCompositionsV1Marker> =
            provider.load(Default::default())?.take_payload()?;
        Ok(CanonicalComposition {
            canonical_compositions,
        })
    }
}

/// The outcome of non-recursive canonical decomposition of a character.
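///
/// For example, a usage sketch (assuming the `compiled_data` Cargo feature and
/// using the `CanonicalDecomposition` type defined later in this module):
///
/// ```
/// use icu::normalizer::properties::{CanonicalDecomposition, Decomposed};
///
/// let decomp = CanonicalDecomposition::new();
/// match decomp.decompose('ệ') {
///     Decomposed::Default => { /* the character is its own decomposition */ }
///     Decomposed::Singleton(c) => println!("decomposes to {c}"),
///     Decomposed::Expansion(first, second) => println!("decomposes to {first}{second}"),
/// }
/// ```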
#[allow(clippy::exhaustive_enums)]
#[derive(Debug, PartialEq, Eq)]
pub enum Decomposed {
    /// The character is its own canonical decomposition.
    Default,
    /// The character decomposes to a single different character.
    Singleton(char),
    /// The character decomposes to two characters.
    Expansion(char, char),
}

/// The raw (non-recursive) canonical decomposition operation.
///
/// Callers should generally use `DecomposingNormalizer` instead of this API.
/// However, this API is provided for callers such as HarfBuzz that specifically
/// want access to non-recursive canonical decomposition e.g. for use in a
/// glyph-availability-guided custom normalizer.
#[derive(Debug)]
pub struct CanonicalDecomposition {
    decompositions: DataPayload<CanonicalDecompositionDataV1Marker>,
    tables: DataPayload<CanonicalDecompositionTablesV1Marker>,
    non_recursive: DataPayload<NonRecursiveDecompositionSupplementV1Marker>,
}

#[cfg(feature = "compiled_data")]
impl Default for CanonicalDecomposition {
    fn default() -> Self {
        Self::new()
    }
}

impl CanonicalDecomposition {
    /// Performs non-recursive canonical decomposition (including for Hangul).
    ///
    /// # Examples
    ///
    /// ```
    /// use icu::normalizer::properties::Decomposed;
    /// let decomp = icu::normalizer::properties::CanonicalDecomposition::new();
    ///
    /// assert_eq!(decomp.decompose('e'), Decomposed::Default);
    /// assert_eq!(
    ///     decomp.decompose('ệ'),
    ///     Decomposed::Expansion('ẹ', '\u{0302}')
    /// );
    /// assert_eq!(decomp.decompose('각'), Decomposed::Expansion('가', 'ᆨ'));
    /// assert_eq!(decomp.decompose('\u{212B}'), Decomposed::Singleton('Å')); // ANGSTROM SIGN
    /// assert_eq!(decomp.decompose('\u{2126}'), Decomposed::Singleton('Ω')); // OHM SIGN
    /// assert_eq!(decomp.decompose('\u{1F71}'), Decomposed::Singleton('ά')); // oxia
    /// ```
    #[inline]
    pub fn decompose(&self, c: char) -> Decomposed {
        let lvt = u32::from(c).wrapping_sub(HANGUL_S_BASE);
        if lvt >= HANGUL_S_COUNT {
            return self.decompose_non_hangul(c);
        }
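        // At this point `c` is a precomposed Hangul syllable and `lvt` is its
        // syllable index. Per the Unicode Standard's algorithmic Hangul
        // decomposition, the index is (l * VCount + v) * TCount + t (with
        // HANGUL_N_COUNT = VCount * TCount), so the jamo indices are recovered
        // by division and remainder below.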
        let t = lvt % HANGUL_T_COUNT;
        if t == 0 {
            let l = lvt / HANGUL_N_COUNT;
            let v = (lvt % HANGUL_N_COUNT) / HANGUL_T_COUNT;
            // Safe because values known to be in range
            return Decomposed::Expansion(
                unsafe { char::from_u32_unchecked(HANGUL_L_BASE + l) },
                unsafe { char::from_u32_unchecked(HANGUL_V_BASE + v) },
            );
        }
        let lv = lvt - t;
        // Safe because values known to be in range
        Decomposed::Expansion(
            unsafe { char::from_u32_unchecked(HANGUL_S_BASE + lv) },
            unsafe { char::from_u32_unchecked(HANGUL_T_BASE + t) },
        )
    }

    /// Performs non-recursive canonical decomposition except Hangul syllables
    /// are reported as `Decomposed::Default`.
    #[inline(always)]
    fn decompose_non_hangul(&self, c: char) -> Decomposed {
        let decomposition = self.decompositions.get().trie.get(c);
        if decomposition <= BACKWARD_COMBINING_STARTER_MARKER {
            return Decomposed::Default;
        }
        // The loop is only broken out of as goto forward
        #[allow(clippy::never_loop)]
        loop {
            let trail_or_complex = (decomposition >> 16) as u16;
            let lead = decomposition as u16;
            if lead > NON_ROUND_TRIP_MARKER && trail_or_complex != 0 {
                // Decomposition into two BMP characters: starter and non-starter
                if in_inclusive_range(c, '\u{1F71}', '\u{1FFB}') {
                    // Look in the other trie due to oxia singleton
                    // mappings to corresponding character with tonos.
                    break;
                }
                return Decomposed::Expansion(char_from_u16(lead), char_from_u16(trail_or_complex));
            }
            if lead > NON_ROUND_TRIP_MARKER {
                // Decomposition into one BMP character or non-starter
                debug_assert_ne!(
                    lead, FDFA_MARKER,
                    "How come we got the U+FDFA NFKD marker here?"
                );
                if lead == SPECIAL_NON_STARTER_DECOMPOSITION_MARKER_U16 {
                    // Non-starter
                    if !in_inclusive_range(c, '\u{0340}', '\u{0F81}') {
                        return Decomposed::Default;
                    }
                    return match c {
                        '\u{0340}' => {
                            // COMBINING GRAVE TONE MARK
                            Decomposed::Singleton('\u{0300}')
                        }
                        '\u{0341}' => {
                            // COMBINING ACUTE TONE MARK
                            Decomposed::Singleton('\u{0301}')
                        }
                        '\u{0343}' => {
                            // COMBINING GREEK KORONIS
                            Decomposed::Singleton('\u{0313}')
                        }
                        '\u{0344}' => {
                            // COMBINING GREEK DIALYTIKA TONOS
                            Decomposed::Expansion('\u{0308}', '\u{0301}')
                        }
                        '\u{0F73}' => {
                            // TIBETAN VOWEL SIGN II
                            Decomposed::Expansion('\u{0F71}', '\u{0F72}')
                        }
                        '\u{0F75}' => {
                            // TIBETAN VOWEL SIGN UU
                            Decomposed::Expansion('\u{0F71}', '\u{0F74}')
                        }
                        '\u{0F81}' => {
                            // TIBETAN VOWEL SIGN REVERSED II
                            Decomposed::Expansion('\u{0F71}', '\u{0F80}')
                        }
                        _ => Decomposed::Default,
                    };
                }
                return Decomposed::Singleton(char_from_u16(lead));
            }
            // The recursive decomposition of ANGSTROM SIGN is in the complex
            // decomposition structure to avoid a branch in `potential_passthrough`
            // for the BMP case.
            if c == '\u{212B}' {
                // ANGSTROM SIGN
                return Decomposed::Singleton('\u{00C5}');
            }
            // Complex decomposition
            // Format for 16-bit value:
            // 15..13: length minus two for 16-bit case and length minus one for
            //         the 32-bit case. Length 8 needs to fit in three bits in
            //         the 16-bit case, and this way the value is future-proofed
            //         up to 9 in the 16-bit case. Zero is unused and length one
            //         in the 16-bit case goes directly into the trie.
            //     12: 1 if all trailing characters are guaranteed non-starters,
            //         0 if no guarantees about non-starterness.
            //         Note: The bit choice is this way around to allow for
            //         dynamically falling back to not having this but instead
            //         having one more bit for length by merely choosing
            //         different masks.
            //  11..0: Start offset in storage. The offset is to the logical
            //         sequence of scalars16, scalars32, supplementary_scalars16,
            //         supplementary_scalars32.
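            // For example (a hypothetical value for illustration):
            // 0b001_1_000000001010 means offset 10 into the logical sequence,
            // all trailing characters guaranteed non-starters, and, if the
            // offset falls in the 16-bit area, a length of 0b001 + 2 = 3.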
            let offset = usize::from(trail_or_complex & 0xFFF);
            let tables = self.tables.get();
            if offset < tables.scalars16.len() {
                if usize::from(trail_or_complex >> 13) != 0 {
                    // i.e. logical len isn't 2
                    break;
                }
                if let Some(first) = tables.scalars16.get(offset) {
                    if let Some(second) = tables.scalars16.get(offset + 1) {
                        // Two BMP starters
                        return Decomposed::Expansion(char_from_u16(first), char_from_u16(second));
                    }
                }
                // GIGO case
                debug_assert!(false);
                return Decomposed::Default;
            }
            let len = usize::from(trail_or_complex >> 13) + 1;
            if len > 2 {
                break;
            }
            let offset24 = offset - tables.scalars16.len();
            if let Some(first_c) = tables.scalars24.get(offset24) {
                if len == 1 {
                    if c != first_c {
                        return Decomposed::Singleton(first_c);
                    } else {
                        // Singleton representation used to avoid
                        // NFC passthrough of characters that combine
                        // with starters that can occur as the first
                        // character of an expansion decomposition.
                        // See section 5 of
                        // https://www.unicode.org/L2/L2024/24009-utc178-properties-recs.pdf
                        return Decomposed::Default;
                    }
                }
                if let Some(second_c) = tables.scalars24.get(offset24 + 1) {
                    return Decomposed::Expansion(first_c, second_c);
                }
            }
            // GIGO case
            debug_assert!(false);
            return Decomposed::Default;
        }
        let non_recursive = self.non_recursive.get();
        let non_recursive_decomposition = non_recursive.trie.get(c);
        if non_recursive_decomposition == 0 {
            // GIGO case
            debug_assert!(false);
            return Decomposed::Default;
        }
        let trail_or_complex = (non_recursive_decomposition >> 16) as u16;
        let lead = non_recursive_decomposition as u16;
        if lead != 0 && trail_or_complex != 0 {
            // Decomposition into two BMP characters
            return Decomposed::Expansion(char_from_u16(lead), char_from_u16(trail_or_complex));
        }
        if lead != 0 {
            // Decomposition into one BMP character
            return Decomposed::Singleton(char_from_u16(lead));
        }
        // Decomposition into two non-BMP characters
        // The high half is the offset into a table plus one to keep it non-zero.
        let offset = usize::from(trail_or_complex - 1);
        if let Some(first) = non_recursive.scalars24.get(offset) {
            if let Some(second) = non_recursive.scalars24.get(offset + 1) {
                return Decomposed::Expansion(first, second);
            }
        }
        // GIGO case
        debug_assert!(false);
        Decomposed::Default
    }

    /// Construct from compiled data.
    ///
    /// ✨ *Enabled with the `compiled_data` Cargo feature.*
    ///
    /// [📚 Help choosing a constructor](icu_provider::constructors)
    #[cfg(feature = "compiled_data")]
    pub const fn new() -> Self {
        const _: () = assert!(
            crate::provider::Baked::SINGLETON_NORMALIZER_NFDEX_V1
                .scalars16
                .const_len()
                + crate::provider::Baked::SINGLETON_NORMALIZER_NFDEX_V1
                    .scalars24
                    .const_len()
                <= 0xFFF,
            "NormalizerError::FutureExtension"
        );

        Self {
            decompositions: DataPayload::from_static_ref(
                crate::provider::Baked::SINGLETON_NORMALIZER_NFD_V1,
            ),
            tables: DataPayload::from_static_ref(
                crate::provider::Baked::SINGLETON_NORMALIZER_NFDEX_V1,
            ),
            non_recursive: DataPayload::from_static_ref(
                crate::provider::Baked::SINGLETON_NORMALIZER_DECOMP_V1,
            ),
        }
    }

    icu_provider::gen_any_buffer_data_constructors!(locale: skip, options: skip, error: NormalizerError,
        #[cfg(skip)]
        functions: [
            new,
            try_new_with_any_provider,
            try_new_with_buffer_provider,
            try_new_unstable,
            Self,
        ]
    );

    #[doc = icu_provider::gen_any_buffer_unstable_docs!(UNSTABLE, Self::new)]
    pub fn try_new_unstable<D>(provider: &D) -> Result<Self, NormalizerError>
    where
        D: DataProvider<CanonicalDecompositionDataV1Marker>
            + DataProvider<CanonicalDecompositionTablesV1Marker>
            + DataProvider<NonRecursiveDecompositionSupplementV1Marker>
            + ?Sized,
    {
        let decompositions: DataPayload<CanonicalDecompositionDataV1Marker> =
            provider.load(Default::default())?.take_payload()?;
        let tables: DataPayload<CanonicalDecompositionTablesV1Marker> =
            provider.load(Default::default())?.take_payload()?;

        if tables.get().scalars16.len() + tables.get().scalars24.len() > 0xFFF {
            // The data is from a future where there exists a normalization flavor whose
            // complex decompositions take more than 0xFFF but fewer than 0x1FFF code points
            // of space. If a good use case from such a decomposition flavor arises, we can
            // dynamically change the bit masks so that the length mask becomes 0x1FFF instead
            // of 0xFFF and the all-non-starters mask becomes 0 instead of 0x1000. However,
            // since for now the masks are hard-coded, error out.
            return Err(NormalizerError::FutureExtension);
        }

        let non_recursive: DataPayload<NonRecursiveDecompositionSupplementV1Marker> =
            provider.load(Default::default())?.take_payload()?;

        Ok(CanonicalDecomposition {
            decompositions,
            tables,
            non_recursive,
        })
    }
}

/// Lookup of the Canonical_Combining_Class Unicode property.
///
/// # Example
///
/// ```
/// use icu::properties::CanonicalCombiningClass;
/// use icu::normalizer::properties::CanonicalCombiningClassMap;
///
/// let map = CanonicalCombiningClassMap::new();
/// assert_eq!(map.get('a'), CanonicalCombiningClass::NotReordered); // U+0061: LATIN SMALL LETTER A
/// assert_eq!(map.get32(0x0301), CanonicalCombiningClass::Above); // U+0301: COMBINING ACUTE ACCENT
/// ```
#[derive(Debug)]
pub struct CanonicalCombiningClassMap {
    /// The data trie
    decompositions: DataPayload<CanonicalDecompositionDataV1Marker>,
}

#[cfg(feature = "compiled_data")]
impl Default for CanonicalCombiningClassMap {
    fn default() -> Self {
        Self::new()
    }
}

impl CanonicalCombiningClassMap {
    /// Look up the canonical combining class for a scalar value
    #[inline(always)]
    pub fn get(&self, c: char) -> CanonicalCombiningClass {
        self.get32(u32::from(c))
    }

    /// Look up the canonical combining class for a scalar value
    /// represented as `u32`. If the argument is outside the scalar
    /// value range, `CanonicalCombiningClass::NotReordered` is returned.
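    ///
    /// For example, a usage sketch (assuming the `compiled_data` Cargo feature):
    ///
    /// ```
    /// use icu::normalizer::properties::CanonicalCombiningClassMap;
    /// use icu::properties::CanonicalCombiningClass;
    ///
    /// let map = CanonicalCombiningClassMap::new();
    /// assert_eq!(map.get32(0x0301), CanonicalCombiningClass::Above);
    /// // Outside the scalar value range:
    /// assert_eq!(map.get32(0x110000), CanonicalCombiningClass::NotReordered);
    /// ```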
    pub fn get32(&self, c: u32) -> CanonicalCombiningClass {
        let trie_value = self.decompositions.get().trie.get32(c);
        if trie_value_has_ccc(trie_value) {
            CanonicalCombiningClass(trie_value as u8)
        } else if trie_value_indicates_special_non_starter_decomposition(trie_value) {
            match c {
                0x0340 | 0x0341 | 0x0343 | 0x0344 => CanonicalCombiningClass::Above,
                _ => CanonicalCombiningClass::NotReordered,
            }
        } else {
            CanonicalCombiningClass::NotReordered
        }
    }

    /// Construct from compiled data.
    ///
    /// ✨ *Enabled with the `compiled_data` Cargo feature.*
    ///
    /// [📚 Help choosing a constructor](icu_provider::constructors)
    #[cfg(feature = "compiled_data")]
    pub const fn new() -> Self {
        CanonicalCombiningClassMap {
            decompositions: DataPayload::from_static_ref(
                crate::provider::Baked::SINGLETON_NORMALIZER_NFD_V1,
            ),
        }
    }

    icu_provider::gen_any_buffer_data_constructors!(locale: skip, options: skip, error: NormalizerError,
        #[cfg(skip)]
        functions: [
            new,
            try_new_with_any_provider,
            try_new_with_buffer_provider,
            try_new_unstable,
            Self,
        ]
    );

    #[doc = icu_provider::gen_any_buffer_unstable_docs!(UNSTABLE, Self::new)]
    pub fn try_new_unstable<D>(provider: &D) -> Result<Self, NormalizerError>
    where
        D: DataProvider<CanonicalDecompositionDataV1Marker> + ?Sized,
    {
        let decompositions: DataPayload<CanonicalDecompositionDataV1Marker> =
            provider.load(Default::default())?.take_payload()?;
        Ok(CanonicalCombiningClassMap { decompositions })
    }
}