icu_normalizer/
lib.rs

1// This file is part of ICU4X. For terms of use, please see the file
2// called LICENSE at the top level of the ICU4X source tree
3// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
5// https://github.com/unicode-org/icu4x/blob/main/documents/process/boilerplate.md#library-annotations
6#![cfg_attr(not(any(test, feature = "std")), no_std)]
7#![cfg_attr(
8    not(test),
9    deny(
10        clippy::indexing_slicing,
11        clippy::unwrap_used,
12        clippy::expect_used,
13        clippy::panic,
14        clippy::exhaustive_structs,
15        clippy::exhaustive_enums,
16        missing_debug_implementations,
17    )
18)]
19#![warn(missing_docs)]
20
21//! Normalizing text into Unicode Normalization Forms.
22//!
23//! This module is published as its own crate ([`icu_normalizer`](https://docs.rs/icu_normalizer/latest/icu_normalizer/))
24//! and as part of the [`icu`](https://docs.rs/icu/latest/icu/) crate. See the latter for more details on the ICU4X project.
25//!
26//! # Implementation notes
27//!
28//! The normalizer operates on a lazy iterator over Unicode scalar values (Rust `char`) internally
29//! and iterating over guaranteed-valid UTF-8, potentially-invalid UTF-8, and potentially-invalid
30//! UTF-16 is a step that doesn’t leak into the normalizer internals. Ill-formed byte sequences are
31//! treated as U+FFFD.
32//!
33//! The normalizer data layout is not based on the ICU4C design at all. Instead, the normalization
34//! data layout is a clean-slate design optimized for the concept of fusing the NFD decomposition
35//! into the collator. That is, the decomposing normalizer is a by-product of the collator-motivated
36//! data layout.
37//!
38//! Notably, the decomposition data structure is optimized for a starter decomposing to itself,
39//! which is the most common case, and for a starter decomposing to a starter and a non-starter
40//! on the Basic Multilingual Plane. Notably, in this case, the collator makes use of the
41//! knowledge that the second character of such a decomposition is a non-starter. Therefore,
//! decomposition into two starters is handled by a generic fallback path that looks up the
43//! decomposition from an array by offset and length instead of baking a BMP starter pair directly
44//! into a trie value.
45//!
46//! The decompositions into non-starters are hard-coded. At present in Unicode, these appear
47//! to be special cases falling into three categories:
48//!
49//! 1. Deprecated combining marks.
//! 2. Particular Tibetan vowel signs.
51//! 3. NFKD only: half-width kana voicing marks.
52//!
53//! Hopefully Unicode never adds more decompositions into non-starters (other than a character
54//! decomposing to itself), but if it does, a code update is needed instead of a mere data update.
55//!
56//! The composing normalizer builds on the decomposing normalizer by performing the canonical
57//! composition post-processing per spec. As an optimization, though, the composing normalizer
58//! attempts to pass through already-normalized text consisting of starters that never combine
59//! backwards and that map to themselves if followed by a character whose decomposition starts
60//! with a starter that never combines backwards.
61//!
62//! As a difference with ICU4C, the composing normalizer has only the simplest possible
63//! passthrough (only one inversion list lookup per character in the best case) and the full
64//! decompose-then-canonically-compose behavior, whereas ICU4C has other paths between these
65//! extremes. The ICU4X collator doesn't make use of the FCD concept at all in order to avoid
66//! doing the work of checking whether the FCD condition holds.
67
68extern crate alloc;
69
70mod error;
71pub mod properties;
72pub mod provider;
73pub mod uts46;
74
75pub use crate::error::NormalizerError;
76
77#[doc(no_inline)]
78pub use NormalizerError as Error;
79
80use crate::provider::CanonicalDecompositionDataV1Marker;
81use crate::provider::CompatibilityDecompositionSupplementV1Marker;
82use crate::provider::DecompositionDataV1;
83use crate::provider::Uts46DecompositionSupplementV1Marker;
84use alloc::string::String;
85use alloc::vec::Vec;
86use core::char::REPLACEMENT_CHARACTER;
87use core::str::from_utf8_unchecked;
88use icu_collections::char16trie::Char16Trie;
89use icu_collections::char16trie::Char16TrieIterator;
90use icu_collections::char16trie::TrieResult;
91use icu_collections::codepointtrie::CodePointTrie;
92use icu_properties::CanonicalCombiningClass;
93use icu_provider::prelude::*;
94use provider::CanonicalCompositionsV1Marker;
95use provider::CanonicalDecompositionTablesV1Marker;
96use provider::CompatibilityDecompositionTablesV1Marker;
97use provider::DecompositionSupplementV1;
98use provider::DecompositionTablesV1;
99use smallvec::SmallVec;
100use utf16_iter::Utf16CharsEx;
101use utf8_iter::Utf8CharsEx;
102use write16::Write16;
103use zerofrom::ZeroFrom;
104use zerovec::{zeroslice, ZeroSlice};
105
/// Holds the supplementary decomposition data payload for either the
/// compatibility (NFKD) or the UTS 46 variant so that both can be
/// accessed uniformly via `get()`.
#[derive(Debug)]
enum SupplementPayloadHolder {
    /// Supplement for compatibility (NFKD/NFKC) decomposition.
    Compatibility(DataPayload<CompatibilityDecompositionSupplementV1Marker>),
    /// Supplement for UTS 46 decomposition.
    Uts46(DataPayload<Uts46DecompositionSupplementV1Marker>),
}
111
112impl SupplementPayloadHolder {
113    fn get(&self) -> &DecompositionSupplementV1 {
114        match self {
115            SupplementPayloadHolder::Compatibility(d) => d.get(),
116            SupplementPayloadHolder::Uts46(d) => d.get(),
117        }
118    }
119}
120
/// Treatment of the ignorable marker (0xFFFFFFFF) in data.
///
/// See `IGNORABLE_MARKER`; the marker only occurs in supplementary
/// (UTS 46) data.
#[derive(Debug, PartialEq, Eq)]
enum IgnorableBehavior {
    /// 0xFFFFFFFF in data is not supported.
    /// (Encountering it debug-asserts.)
    Unsupported,
    /// Ignorables are ignored.
    Ignored,
    /// Ignorables are treated as singleton decompositions
    /// to the REPLACEMENT CHARACTER.
    ReplacementCharacter,
}
132
/// Number of iterations allowed on the fast path before flushing.
/// Since a typical UTF-16 iteration advances over a 2-byte BMP
/// character, this means two memory pages.
/// Intel Core i7-4770 had the best results between 2 and 4 pages
/// when testing powers of two. Apple M1 didn't seem to care
/// about 1, 2, 4, or 8 pages.
///
/// Curiously, the `str` case does not appear to benefit from
/// similar flushing, though the tested monomorphization never
/// passes an error through from `Write`.
const UTF16_FAST_PATH_FLUSH_THRESHOLD: usize = 4096;

/// Marker for UTS 46 ignorables (supplementary trie only).
/// Interpreted according to `IgnorableBehavior`.
const IGNORABLE_MARKER: u32 = 0xFFFFFFFF;

/// Marker for starters that decompose to themselves but may
/// combine backwards under canonical composition.
/// (Main trie only; not used in the supplementary trie.)
const BACKWARD_COMBINING_STARTER_MARKER: u32 = 1;

/// Magic marker trie value for characters whose decomposition
/// starts with a non-starter. The actual decomposition is
/// hard-coded.
const SPECIAL_NON_STARTER_DECOMPOSITION_MARKER: u32 = 2;

/// `u16` version of the previous marker value.
const SPECIAL_NON_STARTER_DECOMPOSITION_MARKER_U16: u16 = 2;

/// Marker (in the low half of a trie value) that a complex
/// decomposition isn't round-trippable under re-composition.
const NON_ROUND_TRIP_MARKER: u16 = 1;
164
/// Checks if a trie value carries a (non-zero) canonical
/// combining class.
///
/// Values of the shape `0xD8__` (high 24 bits equal to 0xD8) carry
/// the class in their low byte.
fn trie_value_has_ccc(trie_value: u32) -> bool {
    (trie_value & !0xFF) == 0xD800
}
170
/// Checks if the trie signifies a special non-starter decomposition.
/// (The decompositions themselves are hard-coded in code rather than
/// stored in data; see the module-level documentation.)
fn trie_value_indicates_special_non_starter_decomposition(trie_value: u32) -> bool {
    trie_value == SPECIAL_NON_STARTER_DECOMPOSITION_MARKER
}
175
/// Checks if a trie value signifies a character whose decomposition
/// starts with a non-starter: either the value carries a non-zero
/// canonical combining class, or it is the hard-coded special
/// non-starter-decomposition marker.
fn decomposition_starts_with_non_starter(trie_value: u32) -> bool {
    trie_value_has_ccc(trie_value)
        || trie_value_indicates_special_non_starter_decomposition(trie_value)
}
182
/// Extracts a canonical combining class (possibly zero) from a trie value.
///
/// Values that don't carry a combining class (see `trie_value_has_ccc`)
/// are reported as `NotReordered` (class 0).
///
/// # Panics
///
/// The trie value must not be one that signifies a special non-starter
/// decomposition. (Debug-only)
fn ccc_from_trie_value(trie_value: u32) -> CanonicalCombiningClass {
    if trie_value_has_ccc(trie_value) {
        // The class lives in the low byte of the 0xD8__ encoding.
        CanonicalCombiningClass(trie_value as u8)
    } else {
        debug_assert_ne!(trie_value, SPECIAL_NON_STARTER_DECOMPOSITION_MARKER);
        CanonicalCombiningClass::NotReordered
    }
}
197
/// The tail (everything after the first character, U+0635) of the NFKD
/// form of U+FDFA (ARABIC LIGATURE SALLALLAHOU ALAYHE WASALLAM)
/// as 16-bit units.
static FDFA_NFKD: [u16; 17] = [
    0x644, 0x649, 0x20, 0x627, 0x644, 0x644, 0x647, 0x20, 0x639, 0x644, 0x64A, 0x647, 0x20, 0x648,
    0x633, 0x644, 0x645,
];

/// Marker value for U+FDFA in NFKD data (the decomposition itself is
/// hard-coded above because it is too long for the normal encoding).
const FDFA_MARKER: u16 = 3;
207
// These constants originate from page 143 of Unicode 14.0
// (conjoining jamo / Hangul syllable composition arithmetic).
/// Syllable base
const HANGUL_S_BASE: u32 = 0xAC00;
/// Lead jamo base
const HANGUL_L_BASE: u32 = 0x1100;
/// Vowel jamo base
const HANGUL_V_BASE: u32 = 0x1161;
/// Trail jamo base (deliberately off by one to account for the absence of a trail)
const HANGUL_T_BASE: u32 = 0x11A7;
/// Lead jamo count
const HANGUL_L_COUNT: u32 = 19;
/// Vowel jamo count
const HANGUL_V_COUNT: u32 = 21;
/// Trail jamo count (deliberately off by one to account for the absence of a trail)
const HANGUL_T_COUNT: u32 = 28;
/// Vowel jamo count times trail jamo count (21 × 28)
const HANGUL_N_COUNT: u32 = 588;
/// Syllable count (19 × 588)
const HANGUL_S_COUNT: u32 = 11172;

/// One past the conjoining jamo block
const HANGUL_JAMO_LIMIT: u32 = 0x1200;
230
/// If `opt` is `Some`, unwrap it. If `None`, panic if debug assertions
/// are enabled and return `default` if debug assertions are not enabled.
///
/// Use this only if the only reason why `opt` could be `None` is bogus
/// data from the provider.
#[inline(always)]
fn unwrap_or_gigo<T>(opt: Option<T>, default: T) -> T {
    opt.unwrap_or_else(|| {
        // GIGO case: the data broke an invariant, so substitute a
        // harmless default (and fail loudly in debug builds).
        debug_assert!(false);
        default
    })
}
246
/// Convert a `u32` _obtained from data provider data_ to `char`,
/// substituting U+FFFD (with a debug assertion) for values that are
/// not valid Unicode scalar values.
#[inline(always)]
fn char_from_u32(u: u32) -> char {
    match core::char::from_u32(u) {
        Some(c) => c,
        None => {
            // GIGO case: surrogate or out-of-range value in data.
            debug_assert!(false);
            REPLACEMENT_CHARACTER
        }
    }
}
252
253/// Convert a `u16` _obtained from data provider data_ to `char`.
254#[inline(always)]
255fn char_from_u16(u: u16) -> char {
256    char_from_u32(u32::from(u))
257}
258
/// Empty `u16` decomposition table, used when no supplementary tables are present.
const EMPTY_U16: &ZeroSlice<u16> = zeroslice![];

/// Empty `char` decomposition table, used when no supplementary tables are present.
const EMPTY_CHAR: &ZeroSlice<char> = zeroslice![];
262
/// Checks whether `c` lies in `start..=end` with a single comparison:
/// when `c < start`, the wrapping subtraction wraps to a huge value
/// that necessarily exceeds the span.
#[inline(always)]
fn in_inclusive_range(c: char, start: char, end: char) -> bool {
    let (scalar, lo, hi) = (u32::from(c), u32::from(start), u32::from(end));
    scalar.wrapping_sub(lo) <= hi - lo
}
267
/// `u32` analogue of `in_inclusive_range`: single-comparison
/// membership test for `start..=end`.
#[inline(always)]
fn in_inclusive_range32(u: u32, start: u32, end: u32) -> bool {
    let span = end - start;
    u.wrapping_sub(start) <= span
}
272
/// `u16` analogue of `in_inclusive_range`: single-comparison
/// membership test for `start..=end`.
#[inline(always)]
fn in_inclusive_range16(u: u16, start: u16, end: u16) -> bool {
    let span = end - start;
    u.wrapping_sub(start) <= span
}
277
/// Performs canonical composition (including Hangul) on a pair of
/// characters or returns `None` if these characters don't compose.
/// Composition exclusions are taken into account.
#[inline]
fn compose(iter: Char16TrieIterator, starter: char, second: char) -> Option<char> {
    // Fast bail-out: if `second` is not in the conjoining jamo block at
    // U+1161..U+11FF (either below the vowel base or at/above the block
    // limit), it cannot be the second character of a Hangul composition,
    // so defer to the trie-based lookup.
    let v = u32::from(second).wrapping_sub(HANGUL_V_BASE);
    if v >= HANGUL_JAMO_LIMIT - HANGUL_V_BASE {
        return compose_non_hangul(iter, starter, second);
    }
    // `second` is a vowel jamo: a lead jamo + vowel jamo pair composes
    // into an LV syllable.
    if v < HANGUL_V_COUNT {
        let l = u32::from(starter).wrapping_sub(HANGUL_L_BASE);
        if l < HANGUL_L_COUNT {
            let lv = l * HANGUL_N_COUNT + v * HANGUL_T_COUNT;
            // Safe, because the inputs are known to be in range.
            return Some(unsafe { char::from_u32_unchecked(HANGUL_S_BASE + lv) });
        }
        return None;
    }
    // `second` is a trail jamo (U+11A8..=U+11C2): an LV syllable plus a
    // trail composes into an LVT syllable. `lv % HANGUL_T_COUNT == 0`
    // verifies that the starter is an LV (not already LVT) syllable.
    if in_inclusive_range(second, '\u{11A8}', '\u{11C2}') {
        let lv = u32::from(starter).wrapping_sub(HANGUL_S_BASE);
        if lv < HANGUL_S_COUNT && lv % HANGUL_T_COUNT == 0 {
            let lvt = lv + (u32::from(second) - HANGUL_T_BASE);
            // Safe, because the inputs are known to be in range.
            return Some(unsafe { char::from_u32_unchecked(HANGUL_S_BASE + lvt) });
        }
    }
    None
}
306
/// Performs (non-Hangul) canonical composition on a pair of characters
/// or returns `None` if these characters don't compose. Composition
/// exclusions are taken into account.
fn compose_non_hangul(mut iter: Char16TrieIterator, starter: char, second: char) -> Option<char> {
    // To make the trie smaller, the pairs are stored second character first.
    // This method is used in ways where it's already known whether `second`
    // is or isn't a starter; we could potentially split the trie into two
    // tries depending on whether `second` is a starter.
    match iter.next(second) {
        // `second` participates in no composition at all.
        TrieResult::NoMatch => None,
        // `second` is a valid first trie unit; try to complete the pair.
        TrieResult::NoValue => match iter.next(starter) {
            TrieResult::NoMatch => None,
            TrieResult::FinalValue(i) => {
                if let Some(c) = char::from_u32(i as u32) {
                    Some(c)
                } else {
                    // GIGO case: composition result isn't a scalar value.
                    debug_assert!(false);
                    None
                }
            }
            TrieResult::NoValue | TrieResult::Intermediate(_) => {
                // GIGO case: pairs must terminate after two units.
                debug_assert!(false);
                None
            }
        },
        // GIGO case: a value after a single unit would be malformed,
        // since compositions are always keyed on two characters.
        TrieResult::FinalValue(_) | TrieResult::Intermediate(_) => {
            // GIGO case
            debug_assert!(false);
            None
        }
    }
}
341
/// Struct for holding together a character and the value
/// looked up for it from the NFD trie in a more explicit
/// way than an anonymous pair.
/// Also holds a flag about the supplementary-trie provenance.
#[derive(Debug, PartialEq, Eq)]
struct CharacterAndTrieValue {
    /// The character itself.
    character: char,
    /// The trie value looked up for `character`.
    trie_val: u32,
    /// Whether `trie_val` came from the supplementary trie.
    from_supplement: bool,
}
352
impl CharacterAndTrieValue {
    /// Wraps a character and a value from the main trie.
    #[inline(always)]
    pub fn new(c: char, trie_value: u32) -> Self {
        CharacterAndTrieValue {
            character: c,
            trie_val: trie_value,
            from_supplement: false,
        }
    }
    /// Wraps a character and a value from the supplementary trie.
    #[inline(always)]
    pub fn new_from_supplement(c: char, trie_value: u32) -> Self {
        CharacterAndTrieValue {
            character: c,
            trie_val: trie_value,
            from_supplement: true,
        }
    }
    /// Whether this character is a starter that decomposes to itself.
    /// Trie values 0 and `BACKWARD_COMBINING_STARTER_MARKER` both mean
    /// "decomposes to self", but Hangul syllables also carry 0 despite
    /// decomposing, so they are excluded by the range check.
    #[inline(always)]
    pub fn starter_and_decomposes_to_self(&self) -> bool {
        if self.trie_val > BACKWARD_COMBINING_STARTER_MARKER {
            return false;
        }
        // Hangul syllables get 0 as their trie value
        u32::from(self.character).wrapping_sub(HANGUL_S_BASE) >= HANGUL_S_COUNT
    }
    /// Whether this character can combine backwards under canonical
    /// composition: its decomposition starts with a non-starter, it is
    /// explicitly marked as a backward-combining starter, or its trie
    /// value falls in the vowel/trail jamo range 0x1161..=0x11C2
    /// (NOTE(review): presumably values that stand for jamo combining
    /// backwards in Hangul composition — confirm against data layout).
    #[inline(always)]
    pub fn can_combine_backwards(&self) -> bool {
        decomposition_starts_with_non_starter(self.trie_val)
            || self.trie_val == BACKWARD_COMBINING_STARTER_MARKER
            || in_inclusive_range32(self.trie_val, 0x1161, 0x11C2)
    }
    /// Whether this character may pass through the composing fast path;
    /// backward-combining starters are allowed to pass through.
    #[inline(always)]
    pub fn potential_passthrough(&self) -> bool {
        self.potential_passthrough_impl(BACKWARD_COMBINING_STARTER_MARKER)
    }
    /// Like `potential_passthrough`, but backward-combining starters
    /// are not allowed to pass through.
    #[inline(always)]
    pub fn potential_passthrough_and_cannot_combine_backwards(&self) -> bool {
        self.potential_passthrough_impl(0)
    }
    /// Shared implementation: `bound` is the highest trie value still
    /// considered a trivial passthrough.
    #[inline(always)]
    fn potential_passthrough_impl(&self, bound: u32) -> bool {
        // This method looks badly branchy, but most characters
        // take the first return.
        if self.trie_val <= bound {
            return true;
        }
        if self.from_supplement {
            return false;
        }
        let trail_or_complex = (self.trie_val >> 16) as u16;
        if trail_or_complex == 0 {
            return false;
        }
        let lead = self.trie_val as u16;
        if lead == 0 {
            return true;
        }
        if lead == NON_ROUND_TRIP_MARKER {
            return false;
        }
        // The second character of the decomposition is an Indic nukta
        // (low byte 0x3C or 0xBC within U+0900..=U+0BFF).
        if (trail_or_complex & 0x7F) == 0x3C
            && in_inclusive_range16(trail_or_complex, 0x0900, 0x0BFF)
        {
            // Nukta
            return false;
        }
        if in_inclusive_range(self.character, '\u{FB1D}', '\u{FB4E}') {
            // Hebrew presentation forms
            return false;
        }
        if in_inclusive_range(self.character, '\u{1F71}', '\u{1FFB}') {
            // Polytonic Greek with oxia
            return false;
        }
        // To avoid more branchiness, 4 characters that decompose to
        // a BMP starter followed by a BMP non-starter are excluded
        // from being encoded directly into the trie value and are
        // handled as complex decompositions instead. These are:
        // U+0F76 TIBETAN VOWEL SIGN VOCALIC R
        // U+0F78 TIBETAN VOWEL SIGN VOCALIC L
        // U+212B ANGSTROM SIGN
        // U+2ADC FORKING
        true
    }
}
438
/// Pack a `char` and a `CanonicalCombiningClass` in
/// 32 bits (the former in the lower 24 bits and the
/// latter in the high 8 bits). The latter can be
/// initialized to 0xFF upon creation, in which case
/// it can be actually set later by calling
/// `set_ccc_from_trie_if_not_already_set`. This is
/// a micro optimization to avoid the Canonical
/// Combining Class trie lookup when there is only
/// one combining character in a sequence. This type
/// is intentionally non-`Copy` to get compiler help
/// in making sure that the class is set on the
/// instance on which it is intended to be set
/// and not on a temporary copy.
///
/// Note that 0xFF won't be assigned to an actual
/// canonical combining class per definition D104
/// in The Unicode Standard.
//
// NOTE: The Pernosco debugger has special knowledge
// of this struct. Please do not change the bit layout
// or the crate-module-qualified name of this struct
// without coordination.
#[derive(Debug)]
struct CharacterAndClass(u32);
463
impl CharacterAndClass {
    /// Packs `c` together with a known canonical combining class.
    pub fn new(c: char, ccc: CanonicalCombiningClass) -> Self {
        CharacterAndClass(u32::from(c) | (u32::from(ccc.0) << 24))
    }
    /// Packs `c` with the 0xFF placeholder class, to be resolved later
    /// via `set_ccc_from_trie_if_not_already_set`.
    pub fn new_with_placeholder(c: char) -> Self {
        CharacterAndClass(u32::from(c) | ((0xFF) << 24))
    }
    /// Packs a character with the class extracted from its trie value.
    pub fn new_with_trie_value(c_tv: CharacterAndTrieValue) -> Self {
        Self::new(c_tv.character, ccc_from_trie_value(c_tv.trie_val))
    }
    /// Packs `c` with class 0 (a starter).
    pub fn new_starter(c: char) -> Self {
        CharacterAndClass(u32::from(c))
    }
    /// Extracts the character from the low 24 bits.
    pub fn character(&self) -> char {
        // Safe, because the low 24 bits came from a `char`
        // originally.
        unsafe { char::from_u32_unchecked(self.0 & 0xFFFFFF) }
    }
    /// Extracts the canonical combining class (or the 0xFF placeholder)
    /// from the high 8 bits.
    pub fn ccc(&self) -> CanonicalCombiningClass {
        CanonicalCombiningClass((self.0 >> 24) as u8)
    }
    /// Unpacks both halves at once.
    pub fn character_and_ccc(&self) -> (char, CanonicalCombiningClass) {
        (self.character(), self.ccc())
    }
    /// If the high byte is the 0xFF placeholder, replaces it with the
    /// real canonical combining class looked up from `trie`.
    pub fn set_ccc_from_trie_if_not_already_set(&mut self, trie: &CodePointTrie<u32>) {
        if self.0 >> 24 != 0xFF {
            return;
        }
        let scalar = self.0 & 0xFFFFFF;
        self.0 = ((ccc_from_trie_value(trie.get32_u32(scalar)).0 as u32) << 24) | scalar;
    }
}
496
// This function exists as a borrow check helper.
#[inline(always)]
fn sort_slice_by_ccc(slice: &mut [CharacterAndClass], trie: &CodePointTrie<u32>) {
    // We don't look up the canonical combining class for starters
    // or for single combining characters between starters. When
    // there's more than one combining character between starters,
    // we look up the canonical combining class for each character
    // exactly once.
    if slice.len() < 2 {
        return;
    }
    // Resolve any 0xFF placeholder classes, then canonically reorder.
    // `sort_by_key` is a stable sort, which is required: characters
    // with equal combining classes must keep their relative order.
    slice
        .iter_mut()
        .for_each(|cc| cc.set_ccc_from_trie_if_not_already_set(trie));
    slice.sort_by_key(|cc| cc.ccc());
}
513
/// An iterator adaptor that turns an `Iterator` over `char` into
/// a lazily-decomposed `char` sequence.
#[derive(Debug)]
pub struct Decomposition<'data, I>
where
    I: Iterator<Item = char>,
{
    delegate: I,
    buffer: SmallVec<[CharacterAndClass; 17]>, // Enough to hold NFKD for U+FDFA
    /// The index of the next item to be read from `buffer`.
    /// The purpose of this index is to avoid having to move
    /// the rest upon every read.
    buffer_pos: usize,
    // At the start of `next()` if not `None`, this is a pending unnormalized
    // starter. When `Decomposition` appears alone, this is never a non-starter.
    // However, when `Decomposition` appears inside a `Composition`, this
    // may become a non-starter before `decomposing_next()` is called.
    pending: Option<CharacterAndTrieValue>, // None at end of stream
    /// The main (NFD) decomposition trie.
    trie: &'data CodePointTrie<'data, u32>,
    /// Supplementary (NFKD/UTS 46) trie consulted before `trie`, if any.
    supplementary_trie: Option<&'data CodePointTrie<'data, u32>>,
    scalars16: &'data ZeroSlice<u16>,
    scalars24: &'data ZeroSlice<char>,
    supplementary_scalars16: &'data ZeroSlice<u16>,
    supplementary_scalars24: &'data ZeroSlice<char>,
    /// Whether U+FF9E/U+FF9F map to combining U+3099/U+309A
    /// (set from the supplement; see `attach_supplementary_trie_value`).
    half_width_voicing_marks_become_non_starters: bool,
    /// The lowest character for which either of the following does
    /// not hold:
    /// 1. Decomposes to self.
    /// 2. Decomposition starts with a non-starter
    decomposition_passthrough_bound: u32, // never above 0xC0
    ignorable_behavior: IgnorableBehavior, // Arguably should be a type parameter
}
546
547impl<'data, I> Decomposition<'data, I>
548where
549    I: Iterator<Item = char>,
550{
    /// Constructs a decomposing iterator adapter from a delegate
    /// iterator and references to the necessary data, without
    /// supplementary data.
    ///
    /// Use `DecomposingNormalizer::normalize_iter()` instead unless
    /// there's a good reason to use this constructor directly.
    ///
    /// Public but hidden in order to be able to use this from the
    /// collator.
    #[doc(hidden)]
    pub fn new(
        delegate: I,
        decompositions: &'data DecompositionDataV1,
        tables: &'data DecompositionTablesV1,
    ) -> Self {
        // 0xC0 is the NFD passthrough bound: characters below U+00C0
        // all decompose to themselves. UTS 46 ignorables don't occur
        // in the main (non-supplementary) data.
        Self::new_with_supplements(
            delegate,
            decompositions,
            None,
            tables,
            None,
            0xC0,
            IgnorableBehavior::Unsupported,
        )
    }
576
577    /// Constructs a decomposing iterator adapter from a delegate
578    /// iterator and references to the necessary data, including
579    /// supplementary data.
580    ///
581    /// Use `DecomposingNormalizer::normalize_iter()` instead unless
582    /// there's a good reason to use this constructor directly.
583    fn new_with_supplements(
584        delegate: I,
585        decompositions: &'data DecompositionDataV1,
586        supplementary_decompositions: Option<&'data DecompositionSupplementV1>,
587        tables: &'data DecompositionTablesV1,
588        supplementary_tables: Option<&'data DecompositionTablesV1>,
589        decomposition_passthrough_bound: u8,
590        ignorable_behavior: IgnorableBehavior,
591    ) -> Self {
592        let half_width_voicing_marks_become_non_starters =
593            if let Some(supplementary) = supplementary_decompositions {
594                supplementary.half_width_voicing_marks_become_non_starters()
595            } else {
596                false
597            };
598        let mut ret = Decomposition::<I> {
599            delegate,
600            buffer: SmallVec::new(), // Normalized
601            buffer_pos: 0,
602            // Initialize with a placeholder starter in case
603            // the real stream starts with a non-starter.
604            pending: Some(CharacterAndTrieValue::new('\u{FFFF}', 0)),
605            trie: &decompositions.trie,
606            supplementary_trie: supplementary_decompositions.map(|s| &s.trie),
607            scalars16: &tables.scalars16,
608            scalars24: &tables.scalars24,
609            supplementary_scalars16: if let Some(supplementary) = supplementary_tables {
610                &supplementary.scalars16
611            } else {
612                EMPTY_U16
613            },
614            supplementary_scalars24: if let Some(supplementary) = supplementary_tables {
615                &supplementary.scalars24
616            } else {
617                EMPTY_CHAR
618            },
619            half_width_voicing_marks_become_non_starters,
620            decomposition_passthrough_bound: u32::from(decomposition_passthrough_bound),
621            ignorable_behavior,
622        };
623        let _ = ret.next(); // Remove the U+FFFF placeholder
624        ret
625    }
626
    /// Reads a complex 16-bit decomposition from `slice16` starting at
    /// `offset`, pushes everything after the first character onto
    /// `self.buffer`, and returns the first character (the starter)
    /// together with the index within the pushed tail at which the
    /// trailing combining characters begin.
    ///
    /// `low` is the low half of the trie value: its top three bits
    /// encode `length - 2` and bit 12 signals that every character
    /// after the starter is a non-starter.
    fn push_decomposition16(
        &mut self,
        low: u16,
        offset: usize,
        slice16: &ZeroSlice<u16>,
    ) -> (char, usize) {
        let len = usize::from(low >> 13) + 2;
        let (starter, tail) = slice16
            .get_subslice(offset..offset + len)
            .and_then(|slice| slice.split_first())
            .map_or_else(
                || {
                    // GIGO case: offset/length fell outside the table.
                    debug_assert!(false);
                    (REPLACEMENT_CHARACTER, EMPTY_U16)
                },
                |(first, trail)| (char_from_u16(first), trail),
            );
        if low & 0x1000 != 0 {
            // All the rest are combining
            self.buffer.extend(
                tail.iter()
                    .map(|u| CharacterAndClass::new_with_placeholder(char_from_u16(u))),
            );
            (starter, 0)
        } else {
            // The tail may contain further starters; track where the
            // final run of combining characters begins.
            let mut i = 0;
            let mut combining_start = 0;
            for u in tail.iter() {
                let ch = char_from_u16(u);
                let trie_value = self.trie.get(ch);
                self.buffer.push(CharacterAndClass::new_with_trie_value(
                    CharacterAndTrieValue::new(ch, trie_value),
                ));
                i += 1;
                // Half-width kana and iota subscript don't occur in the tails
                // of these multicharacter decompositions.
                if !decomposition_starts_with_non_starter(trie_value) {
                    combining_start = i;
                }
            }
            (starter, combining_start)
        }
    }
671
    /// Supplementary-plane counterpart of `push_decomposition16`:
    /// reads a decomposition of `char`s from `slice32` at `offset`,
    /// pushes the tail onto `self.buffer`, and returns the starter
    /// together with the index within the pushed tail at which the
    /// trailing combining characters begin.
    ///
    /// `low` is the low half of the trie value: its top three bits
    /// encode `length - 1` and bit 12 signals that every character
    /// after the starter is a non-starter.
    fn push_decomposition32(
        &mut self,
        low: u16,
        offset: usize,
        slice32: &ZeroSlice<char>,
    ) -> (char, usize) {
        let len = usize::from(low >> 13) + 1;
        let (starter, tail) = slice32
            .get_subslice(offset..offset + len)
            .and_then(|slice| slice.split_first())
            .unwrap_or_else(|| {
                // GIGO case: offset/length fell outside the table.
                debug_assert!(false);
                (REPLACEMENT_CHARACTER, EMPTY_CHAR)
            });
        if low & 0x1000 != 0 {
            // All the rest are combining
            self.buffer
                .extend(tail.iter().map(CharacterAndClass::new_with_placeholder));
            (starter, 0)
        } else {
            // The tail may contain further starters; track where the
            // final run of combining characters begins.
            let mut i = 0;
            let mut combining_start = 0;
            for ch in tail.iter() {
                let trie_value = self.trie.get(ch);
                self.buffer.push(CharacterAndClass::new_with_trie_value(
                    CharacterAndTrieValue::new(ch, trie_value),
                ));
                i += 1;
                // Half-width kana and iota subscript don't occur in the tails
                // of these multicharacter decompositions.
                if !decomposition_starts_with_non_starter(trie_value) {
                    combining_start = i;
                }
            }
            (starter, combining_start)
        }
    }
710
711    #[inline(always)]
712    fn attach_trie_value(&self, c: char) -> CharacterAndTrieValue {
713        if let Some(supplementary) = self.supplementary_trie {
714            if let Some(value) = self.attach_supplementary_trie_value(c, supplementary) {
715                return value;
716            }
717        }
718
719        CharacterAndTrieValue::new(c, self.trie.get(c))
720    }
721
    /// Looks up `c` in the supplementary trie, returning `None` when
    /// the supplement has nothing to say about `c` (trie value 0) and
    /// the main trie should be consulted instead.
    ///
    /// U+FF9E/U+FF9F (half-width kana voicing marks) are special-cased:
    /// when the supplement requests it, they become the combining
    /// U+3099/U+309A with the KanaVoicing combining class packed into
    /// the returned trie value.
    #[inline(never)]
    fn attach_supplementary_trie_value(
        &self,
        c: char,
        supplementary: &CodePointTrie<u32>,
    ) -> Option<CharacterAndTrieValue> {
        // `voicing_mark <= 1` means `c` is U+FF9E or U+FF9F.
        let voicing_mark = u32::from(c).wrapping_sub(0xFF9E);
        if voicing_mark <= 1 && self.half_width_voicing_marks_become_non_starters {
            return Some(CharacterAndTrieValue::new(
                if voicing_mark == 0 {
                    '\u{3099}'
                } else {
                    '\u{309A}'
                },
                // 0xD800 | ccc is the "carries a combining class" encoding.
                0xD800 | u32::from(CanonicalCombiningClass::KanaVoicing.0),
            ));
        }
        let trie_value = supplementary.get32(u32::from(c));
        if trie_value != 0 {
            return Some(CharacterAndTrieValue::new_from_supplement(c, trie_value));
        }
        None
    }
745
    /// Pulls the next character from the delegate iterator and attaches
    /// its trie value, applying `self.ignorable_behavior` to UTS 46
    /// ignorables. Returns `None` when the delegate is exhausted.
    /// Must not be called while `self.pending` is occupied.
    fn delegate_next_no_pending(&mut self) -> Option<CharacterAndTrieValue> {
        debug_assert!(self.pending.is_none());
        loop {
            let c = self.delegate.next()?;

            // TODO(#2384): Measure if this check is actually an optimization even in the
            // non-supplementary case of if this should go inside the supplementary
            // `if` below.
            if u32::from(c) < self.decomposition_passthrough_bound {
                return Some(CharacterAndTrieValue::new(c, 0));
            }

            if let Some(supplementary) = self.supplementary_trie {
                if let Some(value) = self.attach_supplementary_trie_value(c, supplementary) {
                    if value.trie_val == IGNORABLE_MARKER {
                        match self.ignorable_behavior {
                            IgnorableBehavior::Unsupported => {
                                // Data and configuration disagree; GIGO.
                                // (Falls through to returning the marker value.)
                                debug_assert!(false);
                            }
                            IgnorableBehavior::ReplacementCharacter => {
                                return Some(CharacterAndTrieValue::new(
                                    c,
                                    u32::from(REPLACEMENT_CHARACTER),
                                ));
                            }
                            IgnorableBehavior::Ignored => {
                                // Else ignore this character by reading the next one from the delegate.
                                continue;
                            }
                        }
                    }
                    return Some(value);
                }
            }
            let trie_val = self.trie.get(c);
            // Ignorables only occur in supplementary data.
            debug_assert_ne!(trie_val, IGNORABLE_MARKER);
            return Some(CharacterAndTrieValue::new(c, trie_val));
        }
    }
785
786    fn delegate_next(&mut self) -> Option<CharacterAndTrieValue> {
787        if let Some(pending) = self.pending.take() {
788            // Only happens as part of `Composition` and as part of
789            // the contiguous-buffer methods of `DecomposingNormalizer`.
790            // I.e. does not happen as part of standalone iterator
791            // usage of `Decomposition`.
792            Some(pending)
793        } else {
794            self.delegate_next_no_pending()
795        }
796    }
797
    /// Performs the canonical (or compatibility, depending on data)
    /// decomposition of `c_and_trie_val`: returns the first character of the
    /// decomposition and pushes the remaining characters onto `self.buffer`.
    /// Afterwards, gathers the run of following combining characters from the
    /// delegate and canonically sorts `self.buffer[combining_start..]` by
    /// combining class.
    ///
    /// `combining_start` is the buffer index at which the reorderable part
    /// begins; characters pushed before that index (e.g. trailing starters of
    /// a Hangul or complex decomposition) are exempt from reordering.
    fn decomposing_next(&mut self, c_and_trie_val: CharacterAndTrieValue) -> char {
        let (starter, combining_start) = {
            let c = c_and_trie_val.character;
            let hangul_offset = u32::from(c).wrapping_sub(HANGUL_S_BASE); // SIndex in the spec
            if hangul_offset >= HANGUL_S_COUNT {
                // Not a Hangul syllable: decode the packed trie value.
                let decomposition = c_and_trie_val.trie_val;
                if decomposition <= BACKWARD_COMBINING_STARTER_MARKER {
                    // The character is its own decomposition
                    (c, 0)
                } else {
                    // The 32-bit trie value packs a lead half and a
                    // trail-or-complex half; their combination selects one of
                    // the decomposition encodings below.
                    let trail_or_complex = (decomposition >> 16) as u16;
                    let lead = decomposition as u16;
                    if lead > NON_ROUND_TRIP_MARKER && trail_or_complex != 0 {
                        // Decomposition into two BMP characters: starter and non-starter
                        let starter = char_from_u16(lead);
                        let combining = char_from_u16(trail_or_complex);
                        self.buffer
                            .push(CharacterAndClass::new_with_placeholder(combining));
                        (starter, 0)
                    } else if lead > NON_ROUND_TRIP_MARKER {
                        if lead != FDFA_MARKER {
                            debug_assert_ne!(
                                lead, SPECIAL_NON_STARTER_DECOMPOSITION_MARKER_U16,
                                "Should not reach this point with non-starter marker"
                            );
                            // Decomposition into one BMP character
                            let starter = char_from_u16(lead);
                            (starter, 0)
                        } else {
                            // Special case for the NFKD form of U+FDFA.
                            self.buffer.extend(FDFA_NFKD.map(|u| {
                                // Safe, because `FDFA_NFKD` is known not to contain
                                // surrogates.
                                CharacterAndClass::new_starter(unsafe {
                                    core::char::from_u32_unchecked(u32::from(u))
                                })
                            }));
                            // U+0635 is the first character of the expansion;
                            // the 17 characters just pushed are exempt from
                            // reordering (hence `combining_start == 17`).
                            ('\u{0635}', 17)
                        }
                    } else {
                        // Complex decomposition
                        // Format for 16-bit value:
                        // 15..13: length minus two for 16-bit case and length minus one for
                        //         the 32-bit case. Length 8 needs to fit in three bits in
                        //         the 16-bit case, and this way the value is future-proofed
                        //         up to 9 in the 16-bit case. Zero is unused and length one
                        //         in the 16-bit case goes directly into the trie.
                        //     12: 1 if all trailing characters are guaranteed non-starters,
                        //         0 if no guarantees about non-starterness.
                        //         Note: The bit choice is this way around to allow for
                        //         dynamically falling back to not having this but instead
                        //         having one more bit for length by merely choosing
                        //         different masks.
                        //  11..0: Start offset in storage. The offset is to the logical
                        //         sequence of scalars16, scalars32, supplementary_scalars16,
                        //         supplementary_scalars32.
                        let offset = usize::from(trail_or_complex & 0xFFF);
                        // Dispatch the logical offset into whichever of the
                        // four concatenated storage slices it falls in.
                        if offset < self.scalars16.len() {
                            self.push_decomposition16(trail_or_complex, offset, self.scalars16)
                        } else if offset < self.scalars16.len() + self.scalars24.len() {
                            self.push_decomposition32(
                                trail_or_complex,
                                offset - self.scalars16.len(),
                                self.scalars24,
                            )
                        } else if offset
                            < self.scalars16.len()
                                + self.scalars24.len()
                                + self.supplementary_scalars16.len()
                        {
                            self.push_decomposition16(
                                trail_or_complex,
                                offset - (self.scalars16.len() + self.scalars24.len()),
                                self.supplementary_scalars16,
                            )
                        } else {
                            self.push_decomposition32(
                                trail_or_complex,
                                offset
                                    - (self.scalars16.len()
                                        + self.scalars24.len()
                                        + self.supplementary_scalars16.len()),
                                self.supplementary_scalars24,
                            )
                        }
                    }
                }
            } else {
                // Hangul syllable
                // The math here comes from page 144 of Unicode 14.0
                let l = hangul_offset / HANGUL_N_COUNT;
                let v = (hangul_offset % HANGUL_N_COUNT) / HANGUL_T_COUNT;
                let t = hangul_offset % HANGUL_T_COUNT;

                // The unsafe blocks here are OK, because the values stay
                // within the Hangul jamo block and, therefore, the scalar
                // value range by construction.
                self.buffer.push(CharacterAndClass::new_starter(unsafe {
                    core::char::from_u32_unchecked(HANGUL_V_BASE + v)
                }));
                let first = unsafe { core::char::from_u32_unchecked(HANGUL_L_BASE + l) };
                // LV syllables decompose to two jamo, LVT syllables to three
                // (t == 0 means there is no trailing consonant).
                if t != 0 {
                    self.buffer.push(CharacterAndClass::new_starter(unsafe {
                        core::char::from_u32_unchecked(HANGUL_T_BASE + t)
                    }));
                    (first, 2)
                } else {
                    (first, 1)
                }
            }
        };
        // Either we're inside `Composition` or `self.pending.is_none()`.

        self.gather_and_sort_combining(combining_start);
        starter
    }
914
    /// Reads ahead from the delegate, pushing the run of characters whose
    /// trie values indicate non-zero combining class (or a special
    /// non-starter decomposition) onto `self.buffer`, then sorts
    /// `self.buffer[combining_start..]` into canonical order by combining
    /// class. The first character that ends the run is stashed in
    /// `self.pending` for the next read.
    fn gather_and_sort_combining(&mut self, combining_start: usize) {
        // Not a `for` loop to avoid holding a mutable reference to `self` across
        // the loop body.
        while let Some(ch_and_trie_val) = self.delegate_next() {
            if trie_value_has_ccc(ch_and_trie_val.trie_val) {
                // Plain non-starter: buffer it with its combining class.
                self.buffer
                    .push(CharacterAndClass::new_with_trie_value(ch_and_trie_val));
            } else if trie_value_indicates_special_non_starter_decomposition(
                ch_and_trie_val.trie_val,
            ) {
                // The Tibetan special cases are starters that decompose into non-starters.
                // These decompositions are hard-coded here (with their
                // combining classes) instead of going through the data.
                let mapped = match ch_and_trie_val.character {
                    '\u{0340}' => {
                        // COMBINING GRAVE TONE MARK
                        CharacterAndClass::new('\u{0300}', CanonicalCombiningClass::Above)
                    }
                    '\u{0341}' => {
                        // COMBINING ACUTE TONE MARK
                        CharacterAndClass::new('\u{0301}', CanonicalCombiningClass::Above)
                    }
                    '\u{0343}' => {
                        // COMBINING GREEK KORONIS
                        CharacterAndClass::new('\u{0313}', CanonicalCombiningClass::Above)
                    }
                    '\u{0344}' => {
                        // COMBINING GREEK DIALYTIKA TONOS
                        // Two-character decomposition: push the first half
                        // here, `mapped` carries the second half.
                        self.buffer.push(CharacterAndClass::new(
                            '\u{0308}',
                            CanonicalCombiningClass::Above,
                        ));
                        CharacterAndClass::new('\u{0301}', CanonicalCombiningClass::Above)
                    }
                    '\u{0F73}' => {
                        // TIBETAN VOWEL SIGN II
                        self.buffer.push(CharacterAndClass::new(
                            '\u{0F71}',
                            CanonicalCombiningClass::CCC129,
                        ));
                        CharacterAndClass::new('\u{0F72}', CanonicalCombiningClass::CCC130)
                    }
                    '\u{0F75}' => {
                        // TIBETAN VOWEL SIGN UU
                        self.buffer.push(CharacterAndClass::new(
                            '\u{0F71}',
                            CanonicalCombiningClass::CCC129,
                        ));
                        CharacterAndClass::new('\u{0F74}', CanonicalCombiningClass::CCC132)
                    }
                    '\u{0F81}' => {
                        // TIBETAN VOWEL SIGN REVERSED II
                        self.buffer.push(CharacterAndClass::new(
                            '\u{0F71}',
                            CanonicalCombiningClass::CCC129,
                        ));
                        CharacterAndClass::new('\u{0F80}', CanonicalCombiningClass::CCC130)
                    }
                    _ => {
                        // GIGO case: the trie claimed a special decomposition
                        // for a character we don't special-case. Garbage in,
                        // U+FFFD out.
                        debug_assert!(false);
                        CharacterAndClass::new_with_placeholder(REPLACEMENT_CHARACTER)
                    }
                };
                self.buffer.push(mapped);
            } else {
                // A starter (or otherwise run-ending character): stash it for
                // the next read and end the combining run.
                self.pending = Some(ch_and_trie_val);
                break;
            }
        }
        // Slicing succeeds by construction; we've always ensured that `combining_start`
        // is in permissible range.
        #[allow(clippy::indexing_slicing)]
        sort_slice_by_ccc(&mut self.buffer[combining_start..], self.trie);
    }
988}
989
990impl<'data, I> Iterator for Decomposition<'data, I>
991where
992    I: Iterator<Item = char>,
993{
994    type Item = char;
995
996    fn next(&mut self) -> Option<char> {
997        if let Some(ret) = self.buffer.get(self.buffer_pos).map(|c| c.character()) {
998            self.buffer_pos += 1;
999            if self.buffer_pos == self.buffer.len() {
1000                self.buffer.clear();
1001                self.buffer_pos = 0;
1002            }
1003            return Some(ret);
1004        }
1005        debug_assert_eq!(self.buffer_pos, 0);
1006        let c_and_trie_val = self.pending.take()?;
1007        Some(self.decomposing_next(c_and_trie_val))
1008    }
1009}
1010
/// An iterator adaptor that turns an `Iterator` over `char` into
/// a lazily-decomposed and then canonically composed `char` sequence.
#[derive(Debug)]
pub struct Composition<'data, I>
where
    I: Iterator<Item = char>,
{
    /// The decomposing part of the normalizer that operates before
    /// the canonical composition is performed on its output.
    decomposition: Decomposition<'data, I>,
    /// Non-Hangul canonical composition data.
    canonical_compositions: Char16Trie<'data>,
    /// To make `next()` yield in cases where there's a non-composing
    /// starter in the decomposition buffer, we put it here to let it
    /// wait for the next `next()` call (or a jump forward within the
    /// `next()` call).
    unprocessed_starter: Option<char>,
    /// The lowest character for which any one of the following does
    /// not hold:
    /// 1. Roundtrips via decomposition and recomposition.
    /// 2. Decomposition starts with a non-starter
    /// 3. Is not a backward-combining starter
    ///
    /// Characters below this bound can be passed through unchanged.
    composition_passthrough_bound: u32,
}
1035
impl<'data, I> Composition<'data, I>
where
    I: Iterator<Item = char>,
{
    /// Wraps a `Decomposition` together with canonical composition data.
    ///
    /// `composition_passthrough_bound` is widened to `u32` here so that
    /// comparisons against `u32::from(char)` need no conversion later.
    fn new(
        decomposition: Decomposition<'data, I>,
        canonical_compositions: Char16Trie<'data>,
        composition_passthrough_bound: u16,
    ) -> Self {
        Self {
            decomposition,
            canonical_compositions,
            unprocessed_starter: None,
            composition_passthrough_bound: u32::from(composition_passthrough_bound),
        }
    }

    /// Performs canonical composition (including Hangul) on a pair of
    /// characters or returns `None` if these characters don't compose.
    /// Composition exclusions are taken into account.
    #[inline(always)]
    pub fn compose(&self, starter: char, second: char) -> Option<char> {
        compose(self.canonical_compositions.iter(), starter, second)
    }

    /// Performs (non-Hangul) canonical composition on a pair of characters
    /// or returns `None` if these characters don't compose. Composition
    /// exclusions are taken into account.
    #[inline(always)]
    fn compose_non_hangul(&self, starter: char, second: char) -> Option<char> {
        compose_non_hangul(self.canonical_compositions.iter(), starter, second)
    }
}
1069
impl<'data, I> Iterator for Composition<'data, I>
where
    I: Iterator<Item = char>,
{
    type Item = char;

    /// Yields the next canonically composed character.
    ///
    /// Rough structure: first drain any leftover non-starters from the
    /// decomposition buffer (or pick up a stashed `unprocessed_starter`),
    /// attempting a passthrough fast track for characters that cannot take
    /// part in composition; then run the full algorithm, composing the
    /// current starter with buffered combining characters (including
    /// discontiguous matches) and, when the buffer empties, with an
    /// upcoming backward-combining starter.
    #[inline]
    fn next(&mut self) -> Option<char> {
        let mut undecomposed_starter = CharacterAndTrieValue::new('\u{0}', 0); // The compiler can't figure out that this gets overwritten before use.
        if self.unprocessed_starter.is_none() {
            // The loop is only broken out of as goto forward
            #[allow(clippy::never_loop)]
            loop {
                if let Some((character, ccc)) = self
                    .decomposition
                    .buffer
                    .get(self.decomposition.buffer_pos)
                    .map(|c| c.character_and_ccc())
                {
                    self.decomposition.buffer_pos += 1;
                    if self.decomposition.buffer_pos == self.decomposition.buffer.len() {
                        self.decomposition.buffer.clear();
                        self.decomposition.buffer_pos = 0;
                    }
                    if ccc == CanonicalCombiningClass::NotReordered {
                        // Previous decomposition contains a starter. This must
                        // now become the `unprocessed_starter` for it to have
                        // a chance to compose with the upcoming characters.
                        //
                        // E.g. parenthesized Hangul in NFKC comes through here,
                        // but suitable composition exclusion could exercise this
                        // in NFC.
                        self.unprocessed_starter = Some(character);
                        break; // We already have a starter, so skip taking one from `pending`.
                    }
                    // Buffered non-starter: yield it as-is.
                    return Some(character);
                }
                debug_assert_eq!(self.decomposition.buffer_pos, 0);
                undecomposed_starter = self.decomposition.pending.take()?;
                if u32::from(undecomposed_starter.character) < self.composition_passthrough_bound
                    || undecomposed_starter.potential_passthrough()
                {
                    // TODO(#2385): In the NFC case (moot for NFKC and UTS46), if the upcoming
                    // character is not below `decomposition_passthrough_bound` but is
                    // below `composition_passthrough_bound`, we read from the trie
                    // unnecessarily.
                    if let Some(upcoming) = self.decomposition.delegate_next_no_pending() {
                        let cannot_combine_backwards = u32::from(upcoming.character)
                            < self.composition_passthrough_bound
                            || !upcoming.can_combine_backwards();
                        self.decomposition.pending = Some(upcoming);
                        if cannot_combine_backwards {
                            // Fast-track succeeded!
                            return Some(undecomposed_starter.character);
                        }
                    } else {
                        // End of stream
                        return Some(undecomposed_starter.character);
                    }
                }
                break; // Not actually looping
            }
        }
        let mut starter = '\u{0}'; // The compiler can't figure out this gets overwritten before use.

        // The point of having this boolean is to have only one call site to
        // `self.decomposition.decomposing_next`, which is hopefully beneficial for
        // code size under inlining.
        let mut attempt_composition = false;
        loop {
            if let Some(unprocessed) = self.unprocessed_starter.take() {
                // A starter stashed by a previous `next()` (or the buffer
                // drain above) becomes the current starter directly.
                debug_assert_eq!(undecomposed_starter, CharacterAndTrieValue::new('\u{0}', 0));
                debug_assert_eq!(starter, '\u{0}');
                starter = unprocessed;
            } else {
                debug_assert_eq!(self.decomposition.buffer_pos, 0);
                let next_starter = self.decomposition.decomposing_next(undecomposed_starter);
                if !attempt_composition {
                    starter = next_starter;
                } else if let Some(composed) = self.compose(starter, next_starter) {
                    starter = composed;
                } else {
                    // This is our yield point. We'll pick this up above in the
                    // next call to `next()`.
                    self.unprocessed_starter = Some(next_starter);
                    return Some(starter);
                }
            }
            // We first loop by index to avoid moving the contents of `buffer`, but
            // if there's a discontiguous match, we'll start modifying `buffer` instead.
            loop {
                let (character, ccc) = if let Some((character, ccc)) = self
                    .decomposition
                    .buffer
                    .get(self.decomposition.buffer_pos)
                    .map(|c| c.character_and_ccc())
                {
                    (character, ccc)
                } else {
                    self.decomposition.buffer.clear();
                    self.decomposition.buffer_pos = 0;
                    break;
                };
                if let Some(composed) = self.compose(starter, character) {
                    // Contiguous match: absorb the combining character into
                    // the starter and keep scanning.
                    starter = composed;
                    self.decomposition.buffer_pos += 1;
                    continue;
                }
                let mut most_recent_skipped_ccc = ccc;
                {
                    // Discard the already-absorbed prefix of the buffer.
                    let _ = self
                        .decomposition
                        .buffer
                        .drain(0..self.decomposition.buffer_pos);
                }
                self.decomposition.buffer_pos = 0;
                if most_recent_skipped_ccc == CanonicalCombiningClass::NotReordered {
                    // We failed to compose a starter. Discontiguous match not allowed.
                    // We leave the starter in `buffer` for `next()` to find.
                    return Some(starter);
                }
                let mut i = 1; // We have skipped one non-starter.
                while let Some((character, ccc)) = self
                    .decomposition
                    .buffer
                    .get(i)
                    .map(|c| c.character_and_ccc())
                {
                    if ccc == CanonicalCombiningClass::NotReordered {
                        // Discontiguous match not allowed.
                        return Some(starter);
                    }
                    debug_assert!(ccc >= most_recent_skipped_ccc);
                    // A discontiguous match is only allowed when the
                    // candidate isn't "blocked" by a preceding character of
                    // the same combining class.
                    if ccc != most_recent_skipped_ccc {
                        // Using the non-Hangul version as a micro-optimization, since
                        // we already rejected the case where `second` is a starter
                        // above, and conjoining jamo are starters.
                        if let Some(composed) = self.compose_non_hangul(starter, character) {
                            self.decomposition.buffer.remove(i);
                            starter = composed;
                            continue;
                        }
                    }
                    most_recent_skipped_ccc = ccc;
                    i += 1;
                }
                break;
            }

            debug_assert_eq!(self.decomposition.buffer_pos, 0);

            if !self.decomposition.buffer.is_empty() {
                // Leftover non-starters mean no composition with an upcoming
                // character is possible; yield what we have.
                return Some(starter);
            }
            // Now we need to check if composition with an upcoming starter is possible.
            #[allow(clippy::unwrap_used)]
            if self.decomposition.pending.is_some() {
                // We know that `pending_starter` decomposes to start with a starter.
                // Otherwise, it would have been moved to `self.decomposition.buffer`
                // by `self.decomposing_next()`. We do this set lookup here in order
                // to get an opportunity to go back to the fast track.
                // Note that this check has to happen _after_ checking that `pending`
                // holds a character, because this flag isn't defined to be meaningful
                // when `pending` isn't holding a character.
                let pending = self.decomposition.pending.as_ref().unwrap();
                if u32::from(pending.character) < self.composition_passthrough_bound
                    || !pending.can_combine_backwards()
                {
                    // Won't combine backwards anyway.
                    return Some(starter);
                }
                // Consume what we peeked. `unwrap` OK, because we checked `is_some()`
                // above.
                undecomposed_starter = self.decomposition.pending.take().unwrap();
                // The following line is OK, because we're about to loop back
                // to `self.decomposition.decomposing_next(c);`, which will
                // restore the between-`next()`-calls invariant of `pending`
                // before this function returns.
                attempt_composition = true;
                continue;
            }
            // End of input
            return Some(starter);
        }
    }
}
1256
1257macro_rules! composing_normalize_to {
1258    ($(#[$meta:meta])*,
1259     $normalize_to:ident,
1260     $write:path,
1261     $slice:ty,
1262     $prolog:block,
1263     $always_valid_utf:literal,
1264     $as_slice:ident,
1265     $fast:block,
1266     $text:ident,
1267     $sink:ident,
1268     $composition:ident,
1269     $composition_passthrough_bound:ident,
1270     $undecomposed_starter:ident,
1271     $pending_slice:ident,
1272     $len_utf:ident,
1273    ) => {
1274        $(#[$meta])*
1275        pub fn $normalize_to<W: $write + ?Sized>(
1276            &self,
1277            $text: $slice,
1278            $sink: &mut W,
1279        ) -> core::fmt::Result {
1280            $prolog
1281            let mut $composition = self.normalize_iter($text.chars());
1282            debug_assert_eq!($composition.decomposition.ignorable_behavior, IgnorableBehavior::Unsupported);
1283            for cc in $composition.decomposition.buffer.drain(..) {
1284                $sink.write_char(cc.character())?;
1285            }
1286
1287            // Try to get the compiler to hoist the bound to a register.
1288            let $composition_passthrough_bound = $composition.composition_passthrough_bound;
1289            'outer: loop {
1290                debug_assert_eq!($composition.decomposition.buffer_pos, 0);
1291                let mut $undecomposed_starter =
1292                    if let Some(pending) = $composition.decomposition.pending.take() {
1293                        pending
1294                    } else {
1295                        return Ok(());
1296                    };
1297                // Allowing indexed slicing, because a failure would be a code bug and
1298                // not a data issue.
1299                #[allow(clippy::indexing_slicing)]
1300                if u32::from($undecomposed_starter.character) < $composition_passthrough_bound ||
1301                    $undecomposed_starter.potential_passthrough()
1302                {
1303                    // We don't know if a `REPLACEMENT_CHARACTER` occurred in the slice or
1304                    // was returned in response to an error by the iterator. Assume the
1305                    // latter for correctness even though it pessimizes the former.
1306                    if $always_valid_utf || $undecomposed_starter.character != REPLACEMENT_CHARACTER {
1307                        let $pending_slice = &$text[$text.len() - $composition.decomposition.delegate.$as_slice().len() - $undecomposed_starter.character.$len_utf()..];
1308                        // The `$fast` block must either:
1309                        // 1. Return due to reaching EOF
1310                        // 2. Leave a starter with its trie value in `$undecomposed_starter`
1311                        //    and, if there is still more input, leave the next character
1312                        //    and its trie value in `$composition.decomposition.pending`.
1313                        $fast
1314                    }
1315                }
1316                // Fast track above, full algorithm below
1317                let mut starter = $composition
1318                    .decomposition
1319                    .decomposing_next($undecomposed_starter);
1320                'bufferloop: loop {
1321                    // We first loop by index to avoid moving the contents of `buffer`, but
1322                    // if there's a discontiguous match, we'll start modifying `buffer` instead.
1323                    loop {
1324                        let (character, ccc) = if let Some((character, ccc)) = $composition
1325                            .decomposition
1326                            .buffer
1327                            .get($composition.decomposition.buffer_pos)
1328                            .map(|c| c.character_and_ccc())
1329                        {
1330                            (character, ccc)
1331                        } else {
1332                            $composition.decomposition.buffer.clear();
1333                            $composition.decomposition.buffer_pos = 0;
1334                            break;
1335                        };
1336                        if let Some(composed) = $composition.compose(starter, character) {
1337                            starter = composed;
1338                            $composition.decomposition.buffer_pos += 1;
1339                            continue;
1340                        }
1341                        let mut most_recent_skipped_ccc = ccc;
1342                        if most_recent_skipped_ccc == CanonicalCombiningClass::NotReordered {
1343                            // We failed to compose a starter. Discontiguous match not allowed.
1344                            // Write the current `starter` we've been composing, make the unmatched
1345                            // starter in the buffer the new `starter` (we know it's been decomposed)
1346                            // and process the rest of the buffer with that as the starter.
1347                            $sink.write_char(starter)?;
1348                            starter = character;
1349                            $composition.decomposition.buffer_pos += 1;
1350                            continue 'bufferloop;
1351                        } else {
1352                            {
1353                                let _ = $composition
1354                                    .decomposition
1355                                    .buffer
1356                                    .drain(0..$composition.decomposition.buffer_pos);
1357                            }
1358                            $composition.decomposition.buffer_pos = 0;
1359                        }
1360                        let mut i = 1; // We have skipped one non-starter.
1361                        while let Some((character, ccc)) = $composition
1362                            .decomposition
1363                            .buffer
1364                            .get(i)
1365                            .map(|c| c.character_and_ccc())
1366                        {
1367                            if ccc == CanonicalCombiningClass::NotReordered {
1368                                // Discontiguous match not allowed.
1369                                $sink.write_char(starter)?;
1370                                for cc in $composition.decomposition.buffer.drain(..i) {
1371                                    $sink.write_char(cc.character())?;
1372                                }
1373                                starter = character;
1374                                {
1375                                    let removed = $composition.decomposition.buffer.remove(0);
1376                                    debug_assert_eq!(starter, removed.character());
1377                                }
1378                                debug_assert_eq!($composition.decomposition.buffer_pos, 0);
1379                                continue 'bufferloop;
1380                            }
1381                            debug_assert!(ccc >= most_recent_skipped_ccc);
1382                            if ccc != most_recent_skipped_ccc {
1383                                // Using the non-Hangul version as a micro-optimization, since
1384                                // we already rejected the case where `second` is a starter
1385                                // above, and conjoining jamo are starters.
1386                                if let Some(composed) =
1387                                    $composition.compose_non_hangul(starter, character)
1388                                {
1389                                    $composition.decomposition.buffer.remove(i);
1390                                    starter = composed;
1391                                    continue;
1392                                }
1393                            }
1394                            most_recent_skipped_ccc = ccc;
1395                            i += 1;
1396                        }
1397                        break;
1398                    }
1399                    debug_assert_eq!($composition.decomposition.buffer_pos, 0);
1400
1401                    if !$composition.decomposition.buffer.is_empty() {
1402                        $sink.write_char(starter)?;
1403                        for cc in $composition.decomposition.buffer.drain(..) {
1404                            $sink.write_char(cc.character())?;
1405                        }
1406                        // We had non-empty buffer, so can't compose with upcoming.
1407                        continue 'outer;
1408                    }
1409                    // Now we need to check if composition with an upcoming starter is possible.
1410                    if $composition.decomposition.pending.is_some() {
1411                        // We know that `pending_starter` decomposes to start with a starter.
1412                        // Otherwise, it would have been moved to `composition.decomposition.buffer`
1413                        // by `composition.decomposing_next()`. We do this set lookup here in order
1414                        // to get an opportunity to go back to the fast track.
1415                        // Note that this check has to happen _after_ checking that `pending`
1416                        // holds a character, because this flag isn't defined to be meaningful
1417                        // when `pending` isn't holding a character.
1418                        let pending = $composition.decomposition.pending.as_ref().unwrap();
1419                        if u32::from(pending.character) < $composition.composition_passthrough_bound
1420                            || !pending.can_combine_backwards()
1421                        {
1422                            // Won't combine backwards anyway.
1423                            $sink.write_char(starter)?;
1424                            continue 'outer;
1425                        }
1426                        let pending_starter = $composition.decomposition.pending.take().unwrap();
1427                        let decomposed = $composition.decomposition.decomposing_next(pending_starter);
1428                        if let Some(composed) = $composition.compose(starter, decomposed) {
1429                            starter = composed;
1430                        } else {
1431                            $sink.write_char(starter)?;
1432                            starter = decomposed;
1433                        }
1434                        continue 'bufferloop;
1435                    }
1436                    // End of input
1437                    $sink.write_char(starter)?;
1438                    return Ok(());
1439                } // 'bufferloop
1440            }
1441        }
1442    };
1443}
1444
/// Generates the skeleton of a decomposing `normalize_*_to` method.
///
/// The encoding-specific passthrough fast path is supplied by the caller as
/// the `$fast` block; the shared slow path (draining the reorder buffer,
/// pulling the next pending character, and decomposing starters) lives here.
macro_rules! decomposing_normalize_to {
    ($(#[$meta:meta])*, // doc attributes for the generated method
     $normalize_to:ident, // name of the generated method
     $write:path, // sink trait (`core::fmt::Write` or `write16::Write16`)
     $slice:ty, // input slice type (`&str`, `&[u8]`, or `&[u16]`)
     $prolog:block, // statements run once before iteration starts
     $as_slice:ident, // accessor for the delegate's unconsumed input
     $fast:block, // encoding-specific passthrough fast path
     $text:ident, // identifier bound to the input parameter
     $sink:ident, // identifier bound to the sink parameter
     $decomposition:ident, // identifier bound to the decomposing iterator
     $decomposition_passthrough_bound:ident, // identifier for the hoisted bound
     $undecomposed_starter:ident, // identifier for the pending starter
     $pending_slice:ident, // identifier for the not-yet-written input tail
     $outer:lifetime, // loop labels use lifetime tokens
    ) => {
        $(#[$meta])*
        pub fn $normalize_to<W: $write + ?Sized>(
            &self,
            $text: $slice,
            $sink: &mut W,
        ) -> core::fmt::Result {
            $prolog

            let mut $decomposition = self.normalize_iter($text.chars());
            debug_assert_eq!($decomposition.ignorable_behavior, IgnorableBehavior::Unsupported);

            // Try to get the compiler to hoist the bound to a register.
            let $decomposition_passthrough_bound = $decomposition.decomposition_passthrough_bound;
            $outer: loop {
                // Flush already-decomposed-and-sorted combining characters.
                for cc in $decomposition.buffer.drain(..) {
                    $sink.write_char(cc.character())?;
                }
                debug_assert_eq!($decomposition.buffer_pos, 0);
                let mut $undecomposed_starter = if let Some(pending) = $decomposition.pending.take() {
                    pending
                } else {
                    // Nothing pending: input exhausted.
                    return Ok(());
                };
                // Allowing indexed slicing, because a failure would be a code bug and
                // not a data issue.
                #[allow(clippy::indexing_slicing)]
                if $undecomposed_starter.starter_and_decomposes_to_self() {
                    // Don't bother including `undecomposed_starter` in a contiguous buffer
                    // write: Just write it right away:
                    $sink.write_char($undecomposed_starter.character)?;

                    let $pending_slice = $decomposition.delegate.$as_slice();
                    $fast
                }
                // Slow path: run the actual decomposition for this starter.
                let starter = $decomposition.decomposing_next($undecomposed_starter);
                $sink.write_char(starter)?;
            }
        }
    };
}
1501
/// Generates the public convenience wrappers (`normalize*`, `is_normalized*`)
/// for a normalizer type that provides `normalize_to`, `normalize_utf8_to`,
/// and `normalize_utf16_to`.
macro_rules! normalizer_methods {
    () => {
        /// Normalize a string slice into a `String`.
        pub fn normalize(&self, text: &str) -> String {
            // The output is rarely shorter than the input, so reserving the
            // input length up front avoids most reallocations.
            let mut ret = String::with_capacity(text.len());
            let _ = self.normalize_to(text, &mut ret);
            ret
        }

        /// Check whether a string slice is normalized.
        pub fn is_normalized(&self, text: &str) -> bool {
            // The sink compares the would-be output against the input; a sink
            // error is treated as "not normalized".
            let mut sink = IsNormalizedSinkStr::new(text);
            if self.normalize_to(text, &mut sink).is_err() {
                return false;
            }
            sink.finished()
        }

        /// Normalize a slice of potentially-invalid UTF-16 into a `Vec`.
        ///
        /// Unpaired surrogates are mapped to the REPLACEMENT CHARACTER
        /// before normalizing.
        pub fn normalize_utf16(&self, text: &[u16]) -> Vec<u16> {
            // Preallocate to the input length; the output is usually close
            // to it in size.
            let mut ret = Vec::with_capacity(text.len());
            let _ = self.normalize_utf16_to(text, &mut ret);
            ret
        }

        /// Checks whether a slice of potentially-invalid UTF-16 is normalized.
        ///
        /// Unpaired surrogates are treated as the REPLACEMENT CHARACTER.
        pub fn is_normalized_utf16(&self, text: &[u16]) -> bool {
            let mut sink = IsNormalizedSinkUtf16::new(text);
            if self.normalize_utf16_to(text, &mut sink).is_err() {
                return false;
            }
            sink.finished()
        }

        /// Normalize a slice of potentially-invalid UTF-8 into a `String`.
        ///
        /// Ill-formed byte sequences are mapped to the REPLACEMENT CHARACTER
        /// according to the WHATWG Encoding Standard.
        pub fn normalize_utf8(&self, text: &[u8]) -> String {
            let mut ret = String::with_capacity(text.len());
            let _ = self.normalize_utf8_to(text, &mut ret);
            ret
        }

        /// Check if a slice of potentially-invalid UTF-8 is normalized.
        ///
        /// Ill-formed byte sequences are mapped to the REPLACEMENT CHARACTER
        /// according to the WHATWG Encoding Standard before checking.
        pub fn is_normalized_utf8(&self, text: &[u8]) -> bool {
            let mut sink = IsNormalizedSinkUtf8::new(text);
            if self.normalize_utf8_to(text, &mut sink).is_err() {
                return false;
            }
            sink.finished()
        }
    };
}
1566
/// A normalizer for performing decomposing normalization.
#[derive(Debug)]
pub struct DecomposingNormalizer {
    // Canonical (NFD) decomposition data; always present.
    decompositions: DataPayload<CanonicalDecompositionDataV1Marker>,
    // Extra NFKD or UTS 46 decomposition data; `None` for plain NFD.
    supplementary_decompositions: Option<SupplementPayloadHolder>,
    // Side tables holding decompositions too complex to encode in the trie value.
    tables: DataPayload<CanonicalDecompositionTablesV1Marker>,
    // Side tables for the supplementary (compatibility/UTS 46) decompositions.
    supplementary_tables: Option<DataPayload<CompatibilityDecompositionTablesV1Marker>>,
    decomposition_passthrough_bound: u8, // never above 0xC0
    composition_passthrough_bound: u16,  // never above 0x0300
}
1577
1578impl DecomposingNormalizer {
1579    /// NFD constructor using compiled data.
1580    ///
1581    /// ✨ *Enabled with the `compiled_data` Cargo feature.*
1582    ///
1583    /// [📚 Help choosing a constructor](icu_provider::constructors)
1584    #[cfg(feature = "compiled_data")]
1585    pub const fn new_nfd() -> Self {
1586        const _: () = assert!(
1587            crate::provider::Baked::SINGLETON_NORMALIZER_NFDEX_V1
1588                .scalars16
1589                .const_len()
1590                + crate::provider::Baked::SINGLETON_NORMALIZER_NFDEX_V1
1591                    .scalars24
1592                    .const_len()
1593                <= 0xFFF,
1594            "NormalizerError::FutureExtension"
1595        );
1596
1597        DecomposingNormalizer {
1598            decompositions: DataPayload::from_static_ref(
1599                crate::provider::Baked::SINGLETON_NORMALIZER_NFD_V1,
1600            ),
1601            supplementary_decompositions: None,
1602            tables: DataPayload::from_static_ref(
1603                crate::provider::Baked::SINGLETON_NORMALIZER_NFDEX_V1,
1604            ),
1605            supplementary_tables: None,
1606            decomposition_passthrough_bound: 0xC0,
1607            composition_passthrough_bound: 0x0300,
1608        }
1609    }
1610
    // Generates the `any`/`buffer` provider constructor variants listed below,
    // delegating to `try_new_nfd_unstable`.
    icu_provider::gen_any_buffer_data_constructors!(
        locale: skip,
        options: skip,
        error: NormalizerError,
        #[cfg(skip)]
        functions: [
            new_nfd,
            try_new_nfd_with_any_provider,
            try_new_nfd_with_buffer_provider,
            try_new_nfd_unstable,
            Self,
        ]
    );
1624
1625    #[doc = icu_provider::gen_any_buffer_unstable_docs!(UNSTABLE, Self::new_nfd)]
1626    pub fn try_new_nfd_unstable<D>(provider: &D) -> Result<Self, NormalizerError>
1627    where
1628        D: DataProvider<CanonicalDecompositionDataV1Marker>
1629            + DataProvider<CanonicalDecompositionTablesV1Marker>
1630            + ?Sized,
1631    {
1632        let decompositions: DataPayload<CanonicalDecompositionDataV1Marker> =
1633            provider.load(Default::default())?.take_payload()?;
1634        let tables: DataPayload<CanonicalDecompositionTablesV1Marker> =
1635            provider.load(Default::default())?.take_payload()?;
1636
1637        if tables.get().scalars16.len() + tables.get().scalars24.len() > 0xFFF {
1638            // The data is from a future where there exists a normalization flavor whose
1639            // complex decompositions take more than 0xFFF but fewer than 0x1FFF code points
1640            // of space. If a good use case from such a decomposition flavor arises, we can
1641            // dynamically change the bit masks so that the length mask becomes 0x1FFF instead
1642            // of 0xFFF and the all-non-starters mask becomes 0 instead of 0x1000. However,
1643            // since for now the masks are hard-coded, error out.
1644            return Err(NormalizerError::FutureExtension);
1645        }
1646
1647        Ok(DecomposingNormalizer {
1648            decompositions,
1649            supplementary_decompositions: None,
1650            tables,
1651            supplementary_tables: None,
1652            decomposition_passthrough_bound: 0xC0,
1653            composition_passthrough_bound: 0x0300,
1654        })
1655    }
1656
1657    /// NFKD constructor using compiled data.
1658    ///
1659    /// ✨ *Enabled with the `compiled_data` Cargo feature.*
1660    ///
1661    /// [📚 Help choosing a constructor](icu_provider::constructors)
1662    #[cfg(feature = "compiled_data")]
1663    pub const fn new_nfkd() -> Self {
1664        const _: () = assert!(
1665            crate::provider::Baked::SINGLETON_NORMALIZER_NFDEX_V1
1666                .scalars16
1667                .const_len()
1668                + crate::provider::Baked::SINGLETON_NORMALIZER_NFDEX_V1
1669                    .scalars24
1670                    .const_len()
1671                + crate::provider::Baked::SINGLETON_NORMALIZER_NFKDEX_V1
1672                    .scalars16
1673                    .const_len()
1674                + crate::provider::Baked::SINGLETON_NORMALIZER_NFKDEX_V1
1675                    .scalars24
1676                    .const_len()
1677                <= 0xFFF,
1678            "NormalizerError::FutureExtension"
1679        );
1680
1681        const _: () = assert!(
1682            crate::provider::Baked::SINGLETON_NORMALIZER_NFKD_V1.passthrough_cap <= 0x0300,
1683            "NormalizerError::ValidationError"
1684        );
1685
1686        let decomposition_capped =
1687            if crate::provider::Baked::SINGLETON_NORMALIZER_NFKD_V1.passthrough_cap < 0xC0 {
1688                crate::provider::Baked::SINGLETON_NORMALIZER_NFKD_V1.passthrough_cap
1689            } else {
1690                0xC0
1691            };
1692        let composition_capped =
1693            if crate::provider::Baked::SINGLETON_NORMALIZER_NFKD_V1.passthrough_cap < 0x0300 {
1694                crate::provider::Baked::SINGLETON_NORMALIZER_NFKD_V1.passthrough_cap
1695            } else {
1696                0x0300
1697            };
1698
1699        DecomposingNormalizer {
1700            decompositions: DataPayload::from_static_ref(
1701                crate::provider::Baked::SINGLETON_NORMALIZER_NFD_V1,
1702            ),
1703            supplementary_decompositions: Some(SupplementPayloadHolder::Compatibility(
1704                DataPayload::from_static_ref(crate::provider::Baked::SINGLETON_NORMALIZER_NFKD_V1),
1705            )),
1706            tables: DataPayload::from_static_ref(
1707                crate::provider::Baked::SINGLETON_NORMALIZER_NFDEX_V1,
1708            ),
1709            supplementary_tables: Some(DataPayload::from_static_ref(
1710                crate::provider::Baked::SINGLETON_NORMALIZER_NFKDEX_V1,
1711            )),
1712            decomposition_passthrough_bound: decomposition_capped as u8,
1713            composition_passthrough_bound: composition_capped,
1714        }
1715    }
1716
    // Generates the `any`/`buffer` provider constructor variants listed below,
    // delegating to `try_new_nfkd_unstable`.
    icu_provider::gen_any_buffer_data_constructors!(
        locale: skip,
        options: skip,
        error: NormalizerError,
        #[cfg(skip)]
        functions: [
            new_nfkd,
            try_new_nfkd_with_any_provider,
            try_new_nfkd_with_buffer_provider,
            try_new_nfkd_unstable,
            Self,
        ]
    );
1730
1731    #[doc = icu_provider::gen_any_buffer_unstable_docs!(UNSTABLE, Self::new_nfkd)]
1732    pub fn try_new_nfkd_unstable<D>(provider: &D) -> Result<Self, NormalizerError>
1733    where
1734        D: DataProvider<CanonicalDecompositionDataV1Marker>
1735            + DataProvider<CompatibilityDecompositionSupplementV1Marker>
1736            + DataProvider<CanonicalDecompositionTablesV1Marker>
1737            + DataProvider<CompatibilityDecompositionTablesV1Marker>
1738            + ?Sized,
1739    {
1740        let decompositions: DataPayload<CanonicalDecompositionDataV1Marker> =
1741            provider.load(Default::default())?.take_payload()?;
1742        let supplementary_decompositions: DataPayload<
1743            CompatibilityDecompositionSupplementV1Marker,
1744        > = provider.load(Default::default())?.take_payload()?;
1745        let tables: DataPayload<CanonicalDecompositionTablesV1Marker> =
1746            provider.load(Default::default())?.take_payload()?;
1747        let supplementary_tables: DataPayload<CompatibilityDecompositionTablesV1Marker> =
1748            provider.load(Default::default())?.take_payload()?;
1749
1750        if tables.get().scalars16.len()
1751            + tables.get().scalars24.len()
1752            + supplementary_tables.get().scalars16.len()
1753            + supplementary_tables.get().scalars24.len()
1754            > 0xFFF
1755        {
1756            // The data is from a future where there exists a normalization flavor whose
1757            // complex decompositions take more than 0xFFF but fewer than 0x1FFF code points
1758            // of space. If a good use case from such a decomposition flavor arises, we can
1759            // dynamically change the bit masks so that the length mask becomes 0x1FFF instead
1760            // of 0xFFF and the all-non-starters mask becomes 0 instead of 0x1000. However,
1761            // since for now the masks are hard-coded, error out.
1762            return Err(NormalizerError::FutureExtension);
1763        }
1764
1765        let cap = supplementary_decompositions.get().passthrough_cap;
1766        if cap > 0x0300 {
1767            return Err(NormalizerError::ValidationError);
1768        }
1769        let decomposition_capped = cap.min(0xC0);
1770        let composition_capped = cap.min(0x0300);
1771
1772        Ok(DecomposingNormalizer {
1773            decompositions,
1774            supplementary_decompositions: Some(SupplementPayloadHolder::Compatibility(
1775                supplementary_decompositions,
1776            )),
1777            tables,
1778            supplementary_tables: Some(supplementary_tables),
1779            decomposition_passthrough_bound: decomposition_capped as u8,
1780            composition_passthrough_bound: composition_capped,
1781        })
1782    }
1783
1784    #[doc(hidden)]
1785    #[cfg(feature = "compiled_data")]
1786    pub(crate) const fn new_uts46_decomposed() -> Self {
1787        const _: () = assert!(
1788            crate::provider::Baked::SINGLETON_NORMALIZER_NFDEX_V1
1789                .scalars16
1790                .const_len()
1791                + crate::provider::Baked::SINGLETON_NORMALIZER_NFDEX_V1
1792                    .scalars24
1793                    .const_len()
1794                + crate::provider::Baked::SINGLETON_NORMALIZER_NFKDEX_V1
1795                    .scalars16
1796                    .const_len()
1797                + crate::provider::Baked::SINGLETON_NORMALIZER_NFKDEX_V1
1798                    .scalars24
1799                    .const_len()
1800                <= 0xFFF,
1801            "NormalizerError::FutureExtension"
1802        );
1803
1804        const _: () = assert!(
1805            crate::provider::Baked::SINGLETON_NORMALIZER_UTS46D_V1.passthrough_cap <= 0x0300,
1806            "NormalizerError::ValidationError"
1807        );
1808
1809        let decomposition_capped =
1810            if crate::provider::Baked::SINGLETON_NORMALIZER_UTS46D_V1.passthrough_cap < 0xC0 {
1811                crate::provider::Baked::SINGLETON_NORMALIZER_UTS46D_V1.passthrough_cap
1812            } else {
1813                0xC0
1814            };
1815        let composition_capped =
1816            if crate::provider::Baked::SINGLETON_NORMALIZER_UTS46D_V1.passthrough_cap < 0x0300 {
1817                crate::provider::Baked::SINGLETON_NORMALIZER_UTS46D_V1.passthrough_cap
1818            } else {
1819                0x0300
1820            };
1821
1822        DecomposingNormalizer {
1823            decompositions: DataPayload::from_static_ref(
1824                crate::provider::Baked::SINGLETON_NORMALIZER_NFD_V1,
1825            ),
1826            supplementary_decompositions: Some(SupplementPayloadHolder::Uts46(
1827                DataPayload::from_static_ref(
1828                    crate::provider::Baked::SINGLETON_NORMALIZER_UTS46D_V1,
1829                ),
1830            )),
1831            tables: DataPayload::from_static_ref(
1832                crate::provider::Baked::SINGLETON_NORMALIZER_NFDEX_V1,
1833            ),
1834            supplementary_tables: Some(DataPayload::from_static_ref(
1835                crate::provider::Baked::SINGLETON_NORMALIZER_NFKDEX_V1,
1836            )),
1837            decomposition_passthrough_bound: decomposition_capped as u8,
1838            composition_passthrough_bound: composition_capped,
1839        }
1840    }
1841
1842    /// UTS 46 decomposed constructor (testing only)
1843    ///
1844    /// This is a special building block normalization for IDNA. It is the decomposed counterpart of
1845    /// ICU4C's UTS 46 normalization with two exceptions: characters that UTS 46 disallows and
1846    /// ICU4C maps to U+FFFD and characters that UTS 46 maps to the empty string normalize as in
1847    /// NFD in this normalization. In both cases, the previous UTS 46 processing before using
1848    /// normalization is expected to deal with these characters. Making the disallowed characters
1849    /// behave like this is beneficial to data size, and this normalizer implementation cannot
1850    /// deal with a character normalizing to the empty string, which doesn't happen in NFD or
1851    /// NFKD as of Unicode 14.
1852    ///
1853    /// Warning: In this normalization, U+0345 COMBINING GREEK YPOGEGRAMMENI exhibits a behavior
1854    /// that no character in Unicode exhibits in NFD, NFKD, NFC, or NFKC: Case folding turns
1855    /// U+0345 from a reordered character into a non-reordered character before reordering happens.
1856    /// Therefore, the output of this normalization may differ for different inputs that are
1857    /// canonically equivalent with each other if they differ by how U+0345 is ordered relative
1858    /// to other reorderable characters.
1859    ///
1860    /// Public for testing only.
1861    #[doc(hidden)]
1862    pub(crate) fn try_new_uts46_decomposed_unstable<D>(
1863        provider: &D,
1864    ) -> Result<Self, NormalizerError>
1865    where
1866        D: DataProvider<CanonicalDecompositionDataV1Marker>
1867            + DataProvider<Uts46DecompositionSupplementV1Marker>
1868            + DataProvider<CanonicalDecompositionTablesV1Marker>
1869            + DataProvider<CompatibilityDecompositionTablesV1Marker>
1870            // UTS 46 tables merged into CompatibilityDecompositionTablesV1Marker
1871            + ?Sized,
1872    {
1873        let decompositions: DataPayload<CanonicalDecompositionDataV1Marker> =
1874            provider.load(Default::default())?.take_payload()?;
1875        let supplementary_decompositions: DataPayload<Uts46DecompositionSupplementV1Marker> =
1876            provider.load(Default::default())?.take_payload()?;
1877        let tables: DataPayload<CanonicalDecompositionTablesV1Marker> =
1878            provider.load(Default::default())?.take_payload()?;
1879        let supplementary_tables: DataPayload<CompatibilityDecompositionTablesV1Marker> =
1880            provider.load(Default::default())?.take_payload()?;
1881
1882        if tables.get().scalars16.len()
1883            + tables.get().scalars24.len()
1884            + supplementary_tables.get().scalars16.len()
1885            + supplementary_tables.get().scalars24.len()
1886            > 0xFFF
1887        {
1888            // The data is from a future where there exists a normalization flavor whose
1889            // complex decompositions take more than 0xFFF but fewer than 0x1FFF code points
1890            // of space. If a good use case from such a decomposition flavor arises, we can
1891            // dynamically change the bit masks so that the length mask becomes 0x1FFF instead
1892            // of 0xFFF and the all-non-starters mask becomes 0 instead of 0x1000. However,
1893            // since for now the masks are hard-coded, error out.
1894            return Err(NormalizerError::FutureExtension);
1895        }
1896
1897        let cap = supplementary_decompositions.get().passthrough_cap;
1898        if cap > 0x0300 {
1899            return Err(NormalizerError::ValidationError);
1900        }
1901        let decomposition_capped = cap.min(0xC0);
1902        let composition_capped = cap.min(0x0300);
1903
1904        Ok(DecomposingNormalizer {
1905            decompositions,
1906            supplementary_decompositions: Some(SupplementPayloadHolder::Uts46(
1907                supplementary_decompositions,
1908            )),
1909            tables,
1910            supplementary_tables: Some(supplementary_tables),
1911            decomposition_passthrough_bound: decomposition_capped as u8,
1912            composition_passthrough_bound: composition_capped,
1913        })
1914    }
1915
1916    /// Wraps a delegate iterator into a decomposing iterator
1917    /// adapter by using the data already held by this normalizer.
1918    pub fn normalize_iter<I: Iterator<Item = char>>(&self, iter: I) -> Decomposition<I> {
1919        Decomposition::new_with_supplements(
1920            iter,
1921            self.decompositions.get(),
1922            self.supplementary_decompositions.as_ref().map(|s| s.get()),
1923            self.tables.get(),
1924            self.supplementary_tables.as_ref().map(|s| s.get()),
1925            self.decomposition_passthrough_bound,
1926            IgnorableBehavior::Unsupported,
1927        )
1928    }
1929
    // Shared `normalize`/`is_normalized` convenience wrappers for
    // `&str`, UTF-8, and UTF-16 inputs.
    normalizer_methods!();
1931
    decomposing_normalize_to!(
        /// Normalize a string slice into a `Write` sink.
        ,
        normalize_to,
        core::fmt::Write,
        &str,
        {
        },
        as_str,
        {
            // Express the scalar passthrough bound as a UTF-8 byte bound.
            // When the bound is the maximum 0xC0, bytes below 0xC3 cover all
            // of ASCII plus two-byte sequences with lead 0xC2, i.e. exactly
            // the scalars below U+00C0. Otherwise, clamp to ASCII (0x80) so
            // only single-byte characters pass through byte-wise.
            let decomposition_passthrough_byte_bound = if decomposition_passthrough_bound == 0xC0 {
                0xC3u8
            } else {
                decomposition_passthrough_bound.min(0x80) as u8
            };
            // The attribute belongs on an inner statement, but Rust doesn't allow it there.
            #[allow(clippy::unwrap_used)]
            'fast: loop {
                let mut code_unit_iter = decomposition.delegate.as_str().as_bytes().iter();
                // Innermost loop: copy-free scan over raw bytes as long as
                // every byte is below the passthrough byte bound.
                'fastest: loop {
                    if let Some(&upcoming_byte) = code_unit_iter.next() {
                        if upcoming_byte < decomposition_passthrough_byte_bound {
                            // Fast-track succeeded!
                            continue 'fastest;
                        }
                        // The offending byte is the first byte of a character:
                        // a lead at or above the bound fails before any of its
                        // continuation bytes is reached. Back up one byte and
                        // resume char-by-char iteration from that boundary.
                        decomposition.delegate = pending_slice[pending_slice.len() - code_unit_iter.as_slice().len() - 1..].chars();
                        break 'fastest;
                    }
                    // End of stream
                    sink.write_str(pending_slice)?;
                    return Ok(());
                }

                // `unwrap()` OK, because the slice is valid UTF-8 and we know there
                // is an upcoming byte.
                let upcoming = decomposition.delegate.next().unwrap();
                let upcoming_with_trie_value = decomposition.attach_trie_value(upcoming);
                if upcoming_with_trie_value.starter_and_decomposes_to_self() {
                    // A `&str` can't contain ill-formed sequences, so unlike
                    // the UTF-8 variant there is no U+FFFD check needed here.
                    continue 'fast;
                }
                // Flush everything up to (but excluding) the character that
                // fell off the fast path.
                let consumed_so_far_slice = &pending_slice[..pending_slice.len()
                    - decomposition.delegate.as_str().len()
                    - upcoming.len_utf8()];
                sink.write_str(consumed_so_far_slice)?;

                // Now let's figure out if we got a starter or a non-starter.
                if decomposition_starts_with_non_starter(
                    upcoming_with_trie_value.trie_val,
                ) {
                    // Let this trie value to be reprocessed in case it is
                    // one of the rare decomposing ones.
                    decomposition.pending = Some(upcoming_with_trie_value);
                    decomposition.gather_and_sort_combining(0);
                    continue 'outer;
                }
                undecomposed_starter = upcoming_with_trie_value;
                debug_assert!(decomposition.pending.is_none());
                break 'fast;
            }
        },
        text,
        sink,
        decomposition,
        decomposition_passthrough_bound,
        undecomposed_starter,
        pending_slice,
        'outer,
    );
2000
    decomposing_normalize_to!(
        /// Normalize a slice of potentially-invalid UTF-8 into a `Write` sink.
        ///
        /// Ill-formed byte sequences are mapped to the REPLACEMENT CHARACTER
        /// according to the WHATWG Encoding Standard.
        ,
        normalize_utf8_to,
        core::fmt::Write,
        &[u8],
        {
        },
        as_slice,
        {
            // Unlike the `&str` case, the input may be ill-formed, so the
            // byte-wise fast path is limited to ASCII (< 0x80).
            let decomposition_passthrough_byte_bound = decomposition_passthrough_bound.min(0x80) as u8;
            // The attribute belongs on an inner statement, but Rust doesn't allow it there.
            #[allow(clippy::unwrap_used)]
            'fast: loop {
                let mut code_unit_iter = decomposition.delegate.as_slice().iter();
                // Innermost loop: copy-free scan as long as bytes stay ASCII.
                'fastest: loop {
                    if let Some(&upcoming_byte) = code_unit_iter.next() {
                        if upcoming_byte < decomposition_passthrough_byte_bound {
                            // Fast-track succeeded!
                            continue 'fastest;
                        }
                        break 'fastest;
                    }
                    // End of stream
                    // SAFETY: every byte of `pending_slice` was below the
                    // bound (at most 0x80), i.e. ASCII, so this is valid UTF-8.
                    sink.write_str(unsafe { from_utf8_unchecked(pending_slice) })?;
                    return Ok(());
                }
                // Back up one byte so the offending (possibly ill-formed)
                // sequence is decoded as a whole by the char iterator.
                decomposition.delegate = pending_slice[pending_slice.len() - code_unit_iter.as_slice().len() - 1..].chars();

                // `unwrap()` OK, because the slice is valid UTF-8 and we know there
                // is an upcoming byte.
                let upcoming = decomposition.delegate.next().unwrap();
                let upcoming_with_trie_value = decomposition.attach_trie_value(upcoming);
                if upcoming_with_trie_value.starter_and_decomposes_to_self() {
                    if upcoming != REPLACEMENT_CHARACTER {
                        continue 'fast;
                    }
                    // We might have an error, so fall out of the fast path.

                    // Since the U+FFFD might signify an error, we can't
                    // assume `upcoming.len_utf8()` for the backoff length.
                    let mut consumed_so_far = pending_slice[..pending_slice.len() - decomposition.delegate.as_slice().len()].chars();
                    let back = consumed_so_far.next_back();
                    debug_assert_eq!(back, Some(REPLACEMENT_CHARACTER));
                    let consumed_so_far_slice = consumed_so_far.as_slice();
                    // SAFETY: everything before the just-removed trailing
                    // U+FFFD was consumed by the fast path as valid,
                    // non-replacement characters (ill-formed sequences decode
                    // to U+FFFD and exit the fast path), so the remaining
                    // prefix is valid UTF-8.
                    sink.write_str(unsafe{from_utf8_unchecked(consumed_so_far_slice)})?;

                    // We could call `gather_and_sort_combining` here and
                    // `continue 'outer`, but this should be better for code
                    // size.
                    undecomposed_starter = upcoming_with_trie_value;
                    debug_assert!(decomposition.pending.is_none());
                    break 'fast;
                }
                // SAFETY: `upcoming` is not U+FFFD here (U+FFFD is a starter
                // that decomposes to self, handled above), so it was decoded
                // from exactly `len_utf8()` well-formed bytes, and everything
                // before it was consumed as valid characters; the prefix is
                // therefore valid UTF-8.
                let consumed_so_far_slice = &pending_slice[..pending_slice.len()
                    - decomposition.delegate.as_slice().len()
                    - upcoming.len_utf8()];
                sink.write_str(unsafe{from_utf8_unchecked(consumed_so_far_slice)})?;

                // Now let's figure out if we got a starter or a non-starter.
                if decomposition_starts_with_non_starter(
                    upcoming_with_trie_value.trie_val,
                ) {
                    // Let this trie value to be reprocessed in case it is
                    // one of the rare decomposing ones.
                    decomposition.pending = Some(upcoming_with_trie_value);
                    decomposition.gather_and_sort_combining(0);
                    continue 'outer;
                }
                undecomposed_starter = upcoming_with_trie_value;
                debug_assert!(decomposition.pending.is_none());
                break 'fast;
            }
        },
        text,
        sink,
        decomposition,
        decomposition_passthrough_bound,
        undecomposed_starter,
        pending_slice,
        'outer,
    );
2086
    decomposing_normalize_to!(
        /// Normalize a slice of potentially-invalid UTF-16 into a `Write16` sink.
        ///
        /// Unpaired surrogates are mapped to the REPLACEMENT CHARACTER
        /// before normalizing.
        ,
        normalize_utf16_to,
        write16::Write16,
        &[u16],
        {
            sink.size_hint(text.len())?;
        },
        as_slice,
        {
            // Fast path: scan raw UTF-16 code units, bypassing the char-decoding
            // delegate until a non-passthrough unit (or a flush point) shows up.
            let mut code_unit_iter = decomposition.delegate.as_slice().iter();
            // The purpose of the counter is to flush once in a while. If we flush
            // too much, there is too much flushing overhead. If we flush too rarely,
            // the flush starts reading from too far behind compared to the hot
            // recently-read memory.
            let mut counter = UTF16_FAST_PATH_FLUSH_THRESHOLD;
            'fast: loop {
                counter -= 1;
                if let Some(&upcoming_code_unit) = code_unit_iter.next() {
                    let mut upcoming32 = u32::from(upcoming_code_unit);
                    if upcoming32 < decomposition_passthrough_bound && counter != 0 {
                        continue 'fast;
                    }
                    // The loop is only broken out of as goto forward
                    #[allow(clippy::never_loop)]
                    'surrogateloop: loop {
                        let surrogate_base = upcoming32.wrapping_sub(0xD800);
                        if surrogate_base > (0xDFFF - 0xD800) {
                            // Not surrogate
                            break 'surrogateloop;
                        }
                        if surrogate_base <= (0xDBFF - 0xD800) {
                            let iter_backup = code_unit_iter.clone();
                            if let Some(&low) = code_unit_iter.next() {
                                if in_inclusive_range16(low, 0xDC00, 0xDFFF) {
                                    // Combine the high and low surrogates into a
                                    // supplementary-plane scalar value.
                                    upcoming32 = (upcoming32 << 10) + u32::from(low)
                                        - (((0xD800u32 << 10) - 0x10000u32) + 0xDC00u32);
                                    break 'surrogateloop;
                                } else {
                                    // Next unit is not a low surrogate; rewind so
                                    // it gets processed on its own.
                                    code_unit_iter = iter_backup;
                                }
                            }
                        }
                        // unpaired surrogate
                        let slice_to_write = &pending_slice
                            [..pending_slice.len() - code_unit_iter.as_slice().len() - 1];
                        sink.write_slice(slice_to_write)?;
                        undecomposed_starter =
                            CharacterAndTrieValue::new(REPLACEMENT_CHARACTER, 0);
                        debug_assert!(decomposition.pending.is_none());
                        // We could instead call `gather_and_sort_combining` and `continue 'outer`, but
                        // assuming this is better for code size.
                        break 'fast;
                    }
                    // Not unpaired surrogate
                    // SAFETY: `upcoming32` is either a BMP non-surrogate code unit
                    // or a value combined from a valid surrogate pair above, so it
                    // is a valid Unicode scalar value.
                    let upcoming = unsafe { char::from_u32_unchecked(upcoming32) };
                    let upcoming_with_trie_value =
                        decomposition.attach_trie_value(upcoming);
                    if upcoming_with_trie_value.starter_and_decomposes_to_self() && counter != 0 {
                        continue 'fast;
                    }
                    let consumed_so_far_slice = &pending_slice[..pending_slice.len()
                        - code_unit_iter.as_slice().len()
                        - upcoming.len_utf16()];
                    sink.write_slice(consumed_so_far_slice)?;

                    // Now let's figure out if we got a starter or a non-starter.
                    if decomposition_starts_with_non_starter(
                        upcoming_with_trie_value.trie_val,
                    ) {
                        // Sync with main iterator
                        decomposition.delegate = code_unit_iter.as_slice().chars();
                        // Let this trie value be reprocessed in case it is
                        // one of the rare decomposing ones.
                        decomposition.pending = Some(upcoming_with_trie_value);
                        decomposition.gather_and_sort_combining(0);
                        continue 'outer;
                    }
                    undecomposed_starter = upcoming_with_trie_value;
                    debug_assert!(decomposition.pending.is_none());
                    break 'fast;
                }
                // End of stream
                sink.write_slice(pending_slice)?;
                return Ok(());
            }
            // Sync the main iterator
            decomposition.delegate = code_unit_iter.as_slice().chars();
        },
        text,
        sink,
        decomposition,
        decomposition_passthrough_bound,
        undecomposed_starter,
        pending_slice,
        'outer,
    );
2188}
2189
/// A normalizer for performing composing normalization.
#[derive(Debug)]
pub struct ComposingNormalizer {
    // The decomposition pass; composition is layered on top of its output.
    decomposing_normalizer: DecomposingNormalizer,
    // Data for canonically composing character pairs after decomposition.
    canonical_compositions: DataPayload<CanonicalCompositionsV1Marker>,
}
2196
2197impl ComposingNormalizer {
2198    /// NFC constructor using compiled data.
2199    ///
2200    /// ✨ *Enabled with the `compiled_data` Cargo feature.*
2201    ///
2202    /// [📚 Help choosing a constructor](icu_provider::constructors)
2203    #[cfg(feature = "compiled_data")]
2204    pub const fn new_nfc() -> Self {
2205        ComposingNormalizer {
2206            decomposing_normalizer: DecomposingNormalizer::new_nfd(),
2207            canonical_compositions: DataPayload::from_static_ref(
2208                crate::provider::Baked::SINGLETON_NORMALIZER_COMP_V1,
2209            ),
2210        }
2211    }
2212
    // Generates the non-unstable NFC constructor variants
    // (`try_new_nfc_with_any_provider`, `try_new_nfc_with_buffer_provider`)
    // that wrap `try_new_nfc_unstable` below.
    icu_provider::gen_any_buffer_data_constructors!(
        locale: skip,
        options: skip,
        error: NormalizerError,
        #[cfg(skip)]
        functions: [
            new_nfc,
            try_new_nfc_with_any_provider,
            try_new_nfc_with_buffer_provider,
            try_new_nfc_unstable,
            Self,
        ]
    );
2226
2227    #[doc = icu_provider::gen_any_buffer_unstable_docs!(UNSTABLE, Self::new_nfc)]
2228    pub fn try_new_nfc_unstable<D>(provider: &D) -> Result<Self, NormalizerError>
2229    where
2230        D: DataProvider<CanonicalDecompositionDataV1Marker>
2231            + DataProvider<CanonicalDecompositionTablesV1Marker>
2232            + DataProvider<CanonicalCompositionsV1Marker>
2233            + ?Sized,
2234    {
2235        let decomposing_normalizer = DecomposingNormalizer::try_new_nfd_unstable(provider)?;
2236
2237        let canonical_compositions: DataPayload<CanonicalCompositionsV1Marker> =
2238            provider.load(Default::default())?.take_payload()?;
2239
2240        Ok(ComposingNormalizer {
2241            decomposing_normalizer,
2242            canonical_compositions,
2243        })
2244    }
2245
2246    /// NFKC constructor using compiled data.
2247    ///
2248    /// ✨ *Enabled with the `compiled_data` Cargo feature.*
2249    ///
2250    /// [📚 Help choosing a constructor](icu_provider::constructors)
2251    #[cfg(feature = "compiled_data")]
2252    pub const fn new_nfkc() -> Self {
2253        ComposingNormalizer {
2254            decomposing_normalizer: DecomposingNormalizer::new_nfkd(),
2255            canonical_compositions: DataPayload::from_static_ref(
2256                crate::provider::Baked::SINGLETON_NORMALIZER_COMP_V1,
2257            ),
2258        }
2259    }
2260
    // Generates the non-unstable NFKC constructor variants
    // (`try_new_nfkc_with_any_provider`, `try_new_nfkc_with_buffer_provider`)
    // that wrap `try_new_nfkc_unstable` below.
    icu_provider::gen_any_buffer_data_constructors!(
        locale: skip,
        options: skip,
        error: NormalizerError,
        #[cfg(skip)]
        functions: [
            new_nfkc,
            try_new_nfkc_with_any_provider,
            try_new_nfkc_with_buffer_provider,
            try_new_nfkc_unstable,
            Self,
        ]
    );
2274
2275    #[doc = icu_provider::gen_any_buffer_unstable_docs!(UNSTABLE, Self::new_nfkc)]
2276    pub fn try_new_nfkc_unstable<D>(provider: &D) -> Result<Self, NormalizerError>
2277    where
2278        D: DataProvider<CanonicalDecompositionDataV1Marker>
2279            + DataProvider<CompatibilityDecompositionSupplementV1Marker>
2280            + DataProvider<CanonicalDecompositionTablesV1Marker>
2281            + DataProvider<CompatibilityDecompositionTablesV1Marker>
2282            + DataProvider<CanonicalCompositionsV1Marker>
2283            + ?Sized,
2284    {
2285        let decomposing_normalizer = DecomposingNormalizer::try_new_nfkd_unstable(provider)?;
2286
2287        let canonical_compositions: DataPayload<CanonicalCompositionsV1Marker> =
2288            provider.load(Default::default())?.take_payload()?;
2289
2290        Ok(ComposingNormalizer {
2291            decomposing_normalizer,
2292            canonical_compositions,
2293        })
2294    }
2295
2296    /// This is a special building block normalization for IDNA that implements parts of the Map
2297    /// step and the following Normalize step.
2298    ///
2299    /// Warning: In this normalization, U+0345 COMBINING GREEK YPOGEGRAMMENI exhibits a behavior
2300    /// that no character in Unicode exhibits in NFD, NFKD, NFC, or NFKC: Case folding turns
2301    /// U+0345 from a reordered character into a non-reordered character before reordering happens.
2302    /// Therefore, the output of this normalization may differ for different inputs that are
2303    /// canonically equivalents with each other if they differ by how U+0345 is ordered relative
2304    /// to other reorderable characters.
2305    #[cfg(feature = "compiled_data")]
2306    pub(crate) const fn new_uts46() -> Self {
2307        ComposingNormalizer {
2308            decomposing_normalizer: DecomposingNormalizer::new_uts46_decomposed(),
2309            canonical_compositions: DataPayload::from_static_ref(
2310                crate::provider::Baked::SINGLETON_NORMALIZER_COMP_V1,
2311            ),
2312        }
2313    }
2314
2315    #[doc = icu_provider::gen_any_buffer_unstable_docs!(UNSTABLE, Self::new_uts46)]
2316    pub(crate) fn try_new_uts46_unstable<D>(provider: &D) -> Result<Self, NormalizerError>
2317    where
2318        D: DataProvider<CanonicalDecompositionDataV1Marker>
2319            + DataProvider<Uts46DecompositionSupplementV1Marker>
2320            + DataProvider<CanonicalDecompositionTablesV1Marker>
2321            + DataProvider<CompatibilityDecompositionTablesV1Marker>
2322            // UTS 46 tables merged into CompatibilityDecompositionTablesV1Marker
2323            + DataProvider<CanonicalCompositionsV1Marker>
2324            + ?Sized,
2325    {
2326        let decomposing_normalizer =
2327            DecomposingNormalizer::try_new_uts46_decomposed_unstable(provider)?;
2328
2329        let canonical_compositions: DataPayload<CanonicalCompositionsV1Marker> =
2330            provider.load(Default::default())?.take_payload()?;
2331
2332        Ok(ComposingNormalizer {
2333            decomposing_normalizer,
2334            canonical_compositions,
2335        })
2336    }
2337
2338    /// Wraps a delegate iterator into a composing iterator
2339    /// adapter by using the data already held by this normalizer.
2340    pub fn normalize_iter<I: Iterator<Item = char>>(&self, iter: I) -> Composition<I> {
2341        self.normalize_iter_private(iter, IgnorableBehavior::Unsupported)
2342    }
2343
2344    fn normalize_iter_private<I: Iterator<Item = char>>(
2345        &self,
2346        iter: I,
2347        ignorable_behavior: IgnorableBehavior,
2348    ) -> Composition<I> {
2349        Composition::new(
2350            Decomposition::new_with_supplements(
2351                iter,
2352                self.decomposing_normalizer.decompositions.get(),
2353                self.decomposing_normalizer
2354                    .supplementary_decompositions
2355                    .as_ref()
2356                    .map(|s| s.get()),
2357                self.decomposing_normalizer.tables.get(),
2358                self.decomposing_normalizer
2359                    .supplementary_tables
2360                    .as_ref()
2361                    .map(|s| s.get()),
2362                self.decomposing_normalizer.decomposition_passthrough_bound,
2363                ignorable_behavior,
2364            ),
2365            ZeroFrom::zero_from(&self.canonical_compositions.get().canonical_compositions),
2366            self.decomposing_normalizer.composition_passthrough_bound,
2367        )
2368    }
2369
2370    normalizer_methods!();
2371
    composing_normalize_to!(
        /// Normalize a string slice into a `Write` sink.
        ,
        normalize_to,
        core::fmt::Write,
        &str,
        {},
        true,
        as_str,
        {
            // Let's hope LICM hoists this outside `'outer`.
            let composition_passthrough_byte_bound = if composition_passthrough_bound == 0x300 {
                // U+0300 encodes as 0xCC 0x80 in UTF-8, so any byte below 0xCC
                // (ASCII, continuation byte, or lead byte of a character below
                // U+0300) can be fast-tracked.
                0xCCu8
            } else {
                // We can make this fancy if a normalization other than NFC where looking at
                // non-ASCII lead bytes is worthwhile is ever introduced.
                composition_passthrough_bound.min(0x80) as u8
            };
            // This is basically an `Option` discriminant for `undecomposed_starter`,
            // but making it a boolean so that writes in the tightest loop are as
            // simple as possible (and potentially as peel-hoistable as possible).
            // Furthermore, this reduces `unwrap()` later.
            let mut undecomposed_starter_valid = true;
            // Annotation belongs really on inner statements, but Rust doesn't
            // allow it there.
            #[allow(clippy::unwrap_used)]
            'fast: loop {
                let mut code_unit_iter = composition.decomposition.delegate.as_str().as_bytes().iter();
                // Innermost loop: scan raw bytes until one at or above the
                // byte bound shows up.
                'fastest: loop {
                    if let Some(&upcoming_byte) = code_unit_iter.next() {
                        if upcoming_byte < composition_passthrough_byte_bound {
                            // Fast-track succeeded!
                            undecomposed_starter_valid = false;
                            continue 'fastest;
                        }
                        composition.decomposition.delegate = pending_slice[pending_slice.len() - code_unit_iter.as_slice().len() - 1..].chars();
                        break 'fastest;
                    }
                    // End of stream
                    sink.write_str(pending_slice)?;
                    return Ok(());
                }
                // `unwrap()` OK, because the slice is valid UTF-8 and we know there
                // is an upcoming byte.
                let upcoming = composition.decomposition.delegate.next().unwrap();
                let upcoming_with_trie_value = composition.decomposition.attach_trie_value(upcoming);
                if upcoming_with_trie_value.potential_passthrough_and_cannot_combine_backwards() {
                    // Can't combine backwards, hence a plain (non-backwards-combining)
                    // starter albeit past `composition_passthrough_bound`

                    // Fast-track succeeded!
                    undecomposed_starter = upcoming_with_trie_value;
                    undecomposed_starter_valid = true;
                    continue 'fast;
                }
                // We need to fall off the fast path.
                composition.decomposition.pending = Some(upcoming_with_trie_value);
                let consumed_so_far_slice = if undecomposed_starter_valid {
                    &pending_slice[..pending_slice.len() - composition.decomposition.delegate.as_str().len() - upcoming.len_utf8() - undecomposed_starter.character.len_utf8()]
                } else {
                    // slicing and unwrap OK, because we've just evidently read enough previously.
                    let mut consumed_so_far = pending_slice[..pending_slice.len() - composition.decomposition.delegate.as_str().len() - upcoming.len_utf8()].chars();
                    // `unwrap` OK, because we've previously managed to read the previous character
                    undecomposed_starter = composition.decomposition.attach_trie_value(consumed_so_far.next_back().unwrap());
                    undecomposed_starter_valid = true;
                    consumed_so_far.as_str()
                };
                sink.write_str(consumed_so_far_slice)?;
                break 'fast;
            }
            debug_assert!(undecomposed_starter_valid);
        },
        text,
        sink,
        composition,
        composition_passthrough_bound,
        undecomposed_starter,
        pending_slice,
        len_utf8,
    );
2452
    composing_normalize_to!(
        /// Normalize a slice of potentially-invalid UTF-8 into a `Write` sink.
        ///
        /// Ill-formed byte sequences are mapped to the REPLACEMENT CHARACTER
        /// according to the WHATWG Encoding Standard.
        ,
        normalize_utf8_to,
        core::fmt::Write,
        &[u8],
        {},
        false,
        as_slice,
        {
            // This is basically an `Option` discriminant for `undecomposed_starter`,
            // but making it a boolean so that writes in the tightest loop are as
            // simple as possible (and potentially as peel-hoistable as possible).
            // Furthermore, this reduces `unwrap()` later.
            let mut undecomposed_starter_valid = true;
            'fast: loop {
                if let Some(upcoming) = composition.decomposition.delegate.next() {
                    if u32::from(upcoming) < composition_passthrough_bound {
                        // Fast-track succeeded!
                        undecomposed_starter_valid = false;
                        continue 'fast;
                    }
                    // TODO(#2006): Annotate as unlikely
                    if upcoming == REPLACEMENT_CHARACTER {
                        // Can't tell if this is an error or a literal U+FFFD in
                        // the input. Assuming the former to be sure.

                        // Since the U+FFFD might signify an error, we can't
                        // assume `upcoming.len_utf8()` for the backoff length.
                        let mut consumed_so_far = pending_slice[..pending_slice.len() - composition.decomposition.delegate.as_slice().len()].chars();
                        let back = consumed_so_far.next_back();
                        debug_assert_eq!(back, Some(REPLACEMENT_CHARACTER));
                        let consumed_so_far_slice = consumed_so_far.as_slice();
                        sink.write_str(unsafe{ from_utf8_unchecked(consumed_so_far_slice)})?;
                        // Carry the U+FFFD forward as the pending starter.
                        undecomposed_starter = CharacterAndTrieValue::new(REPLACEMENT_CHARACTER, 0);
                        undecomposed_starter_valid = true;
                        composition.decomposition.pending = None;
                        break 'fast;
                    }
                    let upcoming_with_trie_value = composition.decomposition.attach_trie_value(upcoming);
                    if upcoming_with_trie_value.potential_passthrough_and_cannot_combine_backwards() {
                        // Can't combine backwards, hence a plain (non-backwards-combining)
                        // starter albeit past `composition_passthrough_bound`

                        // Fast-track succeeded!
                        undecomposed_starter = upcoming_with_trie_value;
                        undecomposed_starter_valid = true;
                        continue 'fast;
                    }
                    // We need to fall off the fast path.
                    composition.decomposition.pending = Some(upcoming_with_trie_value);
                    // Annotation belongs really on inner statement, but Rust doesn't
                    // allow it there.
                    #[allow(clippy::unwrap_used)]
                    let consumed_so_far_slice = if undecomposed_starter_valid {
                        &pending_slice[..pending_slice.len() - composition.decomposition.delegate.as_slice().len() - upcoming.len_utf8() - undecomposed_starter.character.len_utf8()]
                    } else {
                        // slicing and unwrap OK, because we've just evidently read enough previously.
                        let mut consumed_so_far = pending_slice[..pending_slice.len() - composition.decomposition.delegate.as_slice().len() - upcoming.len_utf8()].chars();
                        // `unwrap` OK, because we've previously managed to read the previous character
                        undecomposed_starter = composition.decomposition.attach_trie_value(consumed_so_far.next_back().unwrap());
                        undecomposed_starter_valid = true;
                        consumed_so_far.as_slice()
                    };
                    sink.write_str(unsafe { from_utf8_unchecked(consumed_so_far_slice)})?;
                    break 'fast;
                }
                // End of stream
                sink.write_str(unsafe {from_utf8_unchecked(pending_slice) })?;
                return Ok(());
            }
            debug_assert!(undecomposed_starter_valid);
        },
        text,
        sink,
        composition,
        composition_passthrough_bound,
        undecomposed_starter,
        pending_slice,
        len_utf8,
    );
2537
    composing_normalize_to!(
        /// Normalize a slice of potentially-invalid UTF-16 into a `Write16` sink.
        ///
        /// Unpaired surrogates are mapped to the REPLACEMENT CHARACTER
        /// before normalizing.
        ,
        normalize_utf16_to,
        write16::Write16,
        &[u16],
        {
            sink.size_hint(text.len())?;
        },
        false,
        as_slice,
        {
            let mut code_unit_iter = composition.decomposition.delegate.as_slice().iter();
            let mut upcoming32;
            // This is basically an `Option` discriminant for `undecomposed_starter`,
            // but making it a boolean so that writes to it are as
            // simple as possible.
            // Furthermore, this removes the need for `unwrap()` later.
            let mut undecomposed_starter_valid;
            // The purpose of the counter is to flush once in a while. If we flush
            // too much, there is too much flushing overhead. If we flush too rarely,
            // the flush starts reading from too far behind compared to the hot
            // recently-read memory.
            let mut counter = UTF16_FAST_PATH_FLUSH_THRESHOLD;
            // The purpose of this trickiness is to avoid writing to
            // `undecomposed_starter_valid` from the tightest loop. Writing to it
            // from there destroys performance.
            let mut counter_reference = counter - 1;
            'fast: loop {
                counter -= 1;
                if let Some(&upcoming_code_unit) = code_unit_iter.next() {
                    upcoming32 = u32::from(upcoming_code_unit); // may be surrogate
                    if upcoming32 < composition_passthrough_bound && counter != 0 {
                        // No need for surrogate or U+FFFD check, because
                        // `composition_passthrough_bound` cannot be higher than
                        // U+0300.
                        // Fast-track succeeded!
                        continue 'fast;
                    }
                    // if `counter` equals `counter_reference`, the `continue 'fast`
                    // line above has not executed and `undecomposed_starter` is still
                    // valid.
                    undecomposed_starter_valid = counter == counter_reference;
                    // The loop is only broken out of as goto forward
                    #[allow(clippy::never_loop)]
                    'surrogateloop: loop {
                        let surrogate_base = upcoming32.wrapping_sub(0xD800);
                        if surrogate_base > (0xDFFF - 0xD800) {
                            // Not surrogate
                            break 'surrogateloop;
                        }
                        if surrogate_base <= (0xDBFF - 0xD800) {
                            let iter_backup = code_unit_iter.clone();
                            if let Some(&low) = code_unit_iter.next() {
                                if in_inclusive_range16(low, 0xDC00, 0xDFFF) {
                                    // Combine the high and low surrogates into a
                                    // supplementary-plane scalar value.
                                    upcoming32 = (upcoming32 << 10) + u32::from(low)
                                        - (((0xD800u32 << 10) - 0x10000u32) + 0xDC00u32);
                                    break 'surrogateloop;
                                } else {
                                    // Next unit is not a low surrogate; rewind so
                                    // it gets processed on its own.
                                    code_unit_iter = iter_backup;
                                }
                            }
                        }
                        // unpaired surrogate
                        let slice_to_write = &pending_slice[..pending_slice.len() - code_unit_iter.as_slice().len() - 1];
                        sink.write_slice(slice_to_write)?;
                        undecomposed_starter = CharacterAndTrieValue::new(REPLACEMENT_CHARACTER, 0);
                        undecomposed_starter_valid = true;
                        composition.decomposition.pending = None;
                        break 'fast;
                    }
                    // Not unpaired surrogate
                    // SAFETY: `upcoming32` is either a BMP non-surrogate code unit
                    // or a value combined from a valid surrogate pair above, so it
                    // is a valid Unicode scalar value.
                    let upcoming = unsafe { char::from_u32_unchecked(upcoming32) };
                    let upcoming_with_trie_value = composition.decomposition.attach_trie_value(upcoming);
                    if upcoming_with_trie_value.potential_passthrough_and_cannot_combine_backwards() && counter != 0 {
                        // Can't combine backwards, hence a plain (non-backwards-combining)
                        // starter albeit past `composition_passthrough_bound`

                        // Fast-track succeeded!
                        undecomposed_starter = upcoming_with_trie_value;
                        // Cause `undecomposed_starter_valid` to be set to true.
                        // This regresses English performance on Haswell by 11%
                        // compared to commenting out this assignment to
                        // `counter_reference`.
                        counter_reference = counter - 1;
                        continue 'fast;
                    }
                    // We need to fall off the fast path.
                    composition.decomposition.pending = Some(upcoming_with_trie_value);
                    // Annotation belongs really on inner statement, but Rust doesn't
                    // allow it there.
                    #[allow(clippy::unwrap_used)]
                    let consumed_so_far_slice = if undecomposed_starter_valid {
                        &pending_slice[..pending_slice.len() - code_unit_iter.as_slice().len() - upcoming.len_utf16() - undecomposed_starter.character.len_utf16()]
                    } else {
                        // slicing and unwrap OK, because we've just evidently read enough previously.
                        let mut consumed_so_far = pending_slice[..pending_slice.len() - code_unit_iter.as_slice().len() - upcoming.len_utf16()].chars();
                        // `unwrap` OK, because we've previously managed to read the previous character
                        undecomposed_starter = composition.decomposition.attach_trie_value(consumed_so_far.next_back().unwrap());
                        undecomposed_starter_valid = true;
                        consumed_so_far.as_slice()
                    };
                    sink.write_slice(consumed_so_far_slice)?;
                    break 'fast;
                }
                // End of stream
                sink.write_slice(pending_slice)?;
                return Ok(());
            }
            debug_assert!(undecomposed_starter_valid);
            // Sync the main iterator
            composition.decomposition.delegate = code_unit_iter.as_slice().chars();
        },
        text,
        sink,
        composition,
        composition_passthrough_bound,
        undecomposed_starter,
        pending_slice,
        len_utf16,
    );
2662}
2663
/// Sink that, instead of storing UTF-16 output, checks it against an
/// expected slice; `expect` holds the not-yet-matched tail.
struct IsNormalizedSinkUtf16<'a> {
    expect: &'a [u16],
}

impl<'a> IsNormalizedSinkUtf16<'a> {
    /// Creates a sink that expects exactly `slice` as the output.
    pub fn new(slice: &'a [u16]) -> Self {
        Self { expect: slice }
    }
    /// True once the entire expected slice has been matched.
    pub fn finished(&self) -> bool {
        self.expect.is_empty()
    }
}
2676
2677impl<'a> Write16 for IsNormalizedSinkUtf16<'a> {
2678    fn write_slice(&mut self, s: &[u16]) -> core::fmt::Result {
2679        // We know that if we get a slice, it's a pass-through,
2680        // so we can compare addresses. Indexing is OK, because
2681        // an indexing failure would be a code bug rather than
2682        // an input or data issue.
2683        #[allow(clippy::indexing_slicing)]
2684        if s.as_ptr() == self.expect.as_ptr() {
2685            self.expect = &self.expect[s.len()..];
2686            Ok(())
2687        } else {
2688            Err(core::fmt::Error {})
2689        }
2690    }
2691
2692    fn write_char(&mut self, c: char) -> core::fmt::Result {
2693        let mut iter = self.expect.chars();
2694        if iter.next() == Some(c) {
2695            self.expect = iter.as_slice();
2696            Ok(())
2697        } else {
2698            Err(core::fmt::Error {})
2699        }
2700    }
2701}
2702
2703struct IsNormalizedSinkUtf8<'a> {
2704    expect: &'a [u8],
2705}
2706
2707impl<'a> IsNormalizedSinkUtf8<'a> {
2708    pub fn new(slice: &'a [u8]) -> Self {
2709        IsNormalizedSinkUtf8 { expect: slice }
2710    }
2711    pub fn finished(&self) -> bool {
2712        self.expect.is_empty()
2713    }
2714}
2715
2716impl<'a> core::fmt::Write for IsNormalizedSinkUtf8<'a> {
2717    fn write_str(&mut self, s: &str) -> core::fmt::Result {
2718        // We know that if we get a slice, it's a pass-through,
2719        // so we can compare addresses. Indexing is OK, because
2720        // an indexing failure would be a code bug rather than
2721        // an input or data issue.
2722        #[allow(clippy::indexing_slicing)]
2723        if s.as_ptr() == self.expect.as_ptr() {
2724            self.expect = &self.expect[s.len()..];
2725            Ok(())
2726        } else {
2727            Err(core::fmt::Error {})
2728        }
2729    }
2730
2731    fn write_char(&mut self, c: char) -> core::fmt::Result {
2732        let mut iter = self.expect.chars();
2733        if iter.next() == Some(c) {
2734            self.expect = iter.as_slice();
2735            Ok(())
2736        } else {
2737            Err(core::fmt::Error {})
2738        }
2739    }
2740}
2741
/// Sink used to check whether `&str` input is already normalized:
/// instead of accumulating output, it matches everything written to it
/// against the original input and fails on the first divergence.
///
/// `expect` is the not-yet-matched tail of the original input.
struct IsNormalizedSinkStr<'a> {
    expect: &'a str,
}

impl<'a> IsNormalizedSinkStr<'a> {
    /// Wraps the input slice the normalizer output will be compared against.
    pub fn new(slice: &'a str) -> Self {
        Self { expect: slice }
    }
    /// True once the entire expected input has been matched.
    pub fn finished(&self) -> bool {
        self.expect.is_empty()
    }
}

impl<'a> core::fmt::Write for IsNormalizedSinkStr<'a> {
    fn write_str(&mut self, s: &str) -> core::fmt::Result {
        // Slices reaching the sink are always pass-throughs of the
        // original input, so pointer identity is sufficient to decide
        // whether `s` is the head of what we still expect.
        if s.as_ptr() != self.expect.as_ptr() {
            return Err(core::fmt::Error {});
        }
        // Indexing is OK: an indexing failure here would be a code bug
        // rather than an input or data issue.
        #[allow(clippy::indexing_slicing)]
        {
            self.expect = &self.expect[s.len()..];
        }
        Ok(())
    }

    fn write_char(&mut self, c: char) -> core::fmt::Result {
        let mut remaining = self.expect.chars();
        match remaining.next() {
            // The written character must equal the next expected one.
            Some(expected) if expected == c => {
                self.expect = remaining.as_str();
                Ok(())
            }
            _ => Err(core::fmt::Error {}),
        }
    }
}