// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).

// https://github.com/unicode-org/icu4x/blob/main/documents/process/boilerplate.md#library-annotations
#![cfg_attr(not(any(test, doc)), no_std)]
#![cfg_attr(
    not(test),
    deny(
        clippy::indexing_slicing,
        clippy::unwrap_used,
        clippy::expect_used,
        clippy::panic,
    )
)]
#![warn(missing_docs)]

//! Normalizing text into Unicode Normalization Forms.
//!
//! This module is published as its own crate ([`icu_normalizer`](https://docs.rs/icu_normalizer/latest/icu_normalizer/))
//! and as part of the [`icu`](https://docs.rs/icu/latest/icu/) crate. See the latter for more details on the ICU4X project.
//!
//! # Functionality
//!
//! The top level of the crate provides normalization of input into the four normalization forms defined in [UAX #15: Unicode
//! Normalization Forms](https://www.unicode.org/reports/tr15/): NFC, NFD, NFKC, and NFKD.
//!
//! Three kinds of contiguous inputs are supported: known-well-formed UTF-8 (`&str`), potentially-not-well-formed UTF-8,
//! and potentially-not-well-formed UTF-16. Additionally, an iterator over `char` can be wrapped in a normalizing iterator.
//!
//! The `uts46` module provides the combination of mapping and normalization operations for [UTS #46: Unicode IDNA
//! Compatibility Processing](https://www.unicode.org/reports/tr46/). This functionality is not meant to be used by
//! applications directly. Instead, it is meant as a building block for a full implementation of UTS #46, such as the
//! [`idna`](https://docs.rs/idna/latest/idna/) crate.
//!
//! The `properties` module provides the non-recursive canonical decomposition operation on a per `char` basis and
//! the canonical composition operation given two `char`s. It also provides access to the Canonical Combining Class
//! property. These operations are primarily meant for [HarfBuzz](https://harfbuzz.github.io/); the types
//! [`CanonicalComposition`](properties::CanonicalComposition), [`CanonicalDecomposition`](properties::CanonicalDecomposition),
//! and [`CanonicalCombiningClassMap`](properties::CanonicalCombiningClassMap) implement the [`harfbuzz_traits`] traits if
//! the `harfbuzz_traits` Cargo feature is enabled.
//!
//! Notably, this normalizer does _not_ provide the normalization “quick check” that can result in “maybe” in
//! addition to “yes” and “no”. The normalization checks provided by this crate always give a definitive
//! non-“maybe” answer.
//!
//! # Examples
//!
//! ```
//! let nfc = icu_normalizer::ComposingNormalizerBorrowed::new_nfc();
//! assert_eq!(nfc.normalize("a\u{0308}"), "ä");
//! assert!(nfc.is_normalized("ä"));
//!
//! let nfd = icu_normalizer::DecomposingNormalizerBorrowed::new_nfd();
//! assert_eq!(nfd.normalize("ä"), "a\u{0308}");
//! assert!(!nfd.is_normalized("ä"));
//! ```

extern crate alloc;
// TODO: The plan is to replace
// `#[cfg(not(icu4x_unstable_fast_trie_only))]`
// with
// `#[cfg(feature = "serde")]`
// and
// `#[cfg(icu4x_unstable_fast_trie_only)]`
// with
// `#[cfg(not(feature = "serde"))]`
//
// Before doing so:
// * The type of the UTS 46 trie needs to be
//   disentangled from the type of the NFD/NFKD tries.
//   This will involve a more generic iterator hidden
//   inside the public iterator types.
// * datagen needs to emit fast-mode tries for the
//   NFD and NFKD tries.
// * The markers and possibly the data struct type
//   for NFD and NFKD need to be revised per policy.
7980#[cfg(not(icu4x_unstable_fast_trie_only))]
81type Trie<'trie> = CodePointTrie<'trie, u32>;
8283#[cfg(icu4x_unstable_fast_trie_only)]
84type Trie<'trie> = FastCodePointTrie<'trie, u32>;
// We don't depend on icu_properties to minimize deps, but we want to be able
// to ensure we're using the right CCC values
macro_rules! ccc {
    ($name:ident, $num:expr) => {
        const {
            // When the `icu_properties` feature is enabled, verify at compile
            // time that our hard-coded value matches the canonical one.
            #[cfg(feature = "icu_properties")]
            if icu_properties::props::CanonicalCombiningClass::$name.to_icu4c_value() != $num {
                panic!("icu_normalizer has incorrect ccc values")
            }
            CanonicalCombiningClass::from_icu4c_value($num)
        }
    };
}
99100#[cfg(feature = "harfbuzz_traits")]
101mod harfbuzz;
102pub mod properties;
103pub mod provider;
104pub mod uts46;
use crate::provider::CanonicalCompositions;
use crate::provider::DecompositionData;
use crate::provider::NormalizerNfdDataV1;
use crate::provider::NormalizerNfkdDataV1;
use crate::provider::NormalizerUts46DataV1;
use alloc::borrow::Cow;
use alloc::string::String;
use core::char::REPLACEMENT_CHARACTER;
use icu_collections::char16trie::Char16Trie;
use icu_collections::char16trie::Char16TrieIterator;
use icu_collections::char16trie::TrieResult;
#[cfg(not(icu4x_unstable_fast_trie_only))]
use icu_collections::codepointtrie::CodePointTrie;
#[cfg(icu4x_unstable_fast_trie_only)]
use icu_collections::codepointtrie::FastCodePointTrie;
#[cfg(icu4x_unstable_fast_trie_only)]
use icu_collections::codepointtrie::TypedCodePointTrie;
#[cfg(feature = "icu_properties")]
use icu_properties::props::CanonicalCombiningClass;
use icu_provider::prelude::*;
use provider::DecompositionTables;
use provider::NormalizerNfcV1;
use provider::NormalizerNfdTablesV1;
use provider::NormalizerNfkdTablesV1;
use smallvec::SmallVec;
#[cfg(feature = "utf16_iter")]
use utf16_iter::Utf16CharsEx;
#[cfg(feature = "utf8_iter")]
use utf8_iter::Utf8CharsEx;
use zerovec::{zeroslice, ZeroSlice};
// The optimizations in the area where `likely` is used
// are extremely brittle. `likely` is useful in the typed-trie
// case on the UTF-16 fast path, but in order not to disturb
// the untyped-trie case on the UTF-16 fast path, make the
// annotations no-ops in the untyped-trie case.

// `cold_path` and `likely` come from
// https://github.com/rust-lang/hashbrown/commit/64bd7db1d1b148594edfde112cdb6d6260e2cfc3 .
// See https://github.com/rust-lang/hashbrown/commit/64bd7db1d1b148594edfde112cdb6d6260e2cfc3#commitcomment-164768806
// for permission to relicense under Unicode-3.0.
/// Marker for the unlikely branch; `#[cold]` steers codegen layout.
#[cfg(all(icu4x_unstable_fast_trie_only, feature = "utf16_iter"))]
#[inline(always)]
#[cold]
fn cold_path() {}
/// Branch-prediction hint: tells codegen the `false` case is cold.
#[cfg(all(icu4x_unstable_fast_trie_only, feature = "utf16_iter"))]
#[inline(always)]
pub(crate) fn likely(b: bool) -> bool {
    if b {
        true
    } else {
        cold_path();
        false
    }
}
// End import from https://github.com/rust-lang/hashbrown/commit/64bd7db1d1b148594edfde112cdb6d6260e2cfc3 .
/// No-op for the untyped-trie case.
// NOTE(review): the cfg is `not(icu4x_unstable_fast_trie_only)`, i.e. the
// untyped-trie configuration, matching the "no-ops in the untyped-trie case"
// comment above.
#[cfg(all(not(icu4x_unstable_fast_trie_only), feature = "utf16_iter"))]
#[inline(always)]
fn likely(b: bool) -> bool {
    b
}
172173// This type exists as a shim for `icu_properties` `CanonicalCombiningClass` when the crate is disabled
174// It should not be exposed to users.
175#[cfg(not(feature = "icu_properties"))]
176#[derive(#[automatically_derived]
impl ::core::marker::Copy for CanonicalCombiningClass { }Copy, #[automatically_derived]
impl ::core::clone::Clone for CanonicalCombiningClass {
#[inline]
fn clone(&self) -> CanonicalCombiningClass {
let _: ::core::clone::AssertParamIsClone<u8>;
*self
}
}Clone, #[automatically_derived]
impl ::core::cmp::Eq for CanonicalCombiningClass {
#[inline]
#[doc(hidden)]
#[coverage(off)]
fn assert_fields_are_eq(&self) {
let _: ::core::cmp::AssertParamIsEq<u8>;
}
}Eq, #[automatically_derived]
impl ::core::cmp::PartialEq for CanonicalCombiningClass {
#[inline]
fn eq(&self, other: &CanonicalCombiningClass) -> bool {
self.0 == other.0
}
}PartialEq, #[automatically_derived]
impl ::core::cmp::PartialOrd for CanonicalCombiningClass {
#[inline]
fn partial_cmp(&self, other: &CanonicalCombiningClass)
-> ::core::option::Option<::core::cmp::Ordering> {
::core::cmp::PartialOrd::partial_cmp(&self.0, &other.0)
}
}PartialOrd, #[automatically_derived]
impl ::core::cmp::Ord for CanonicalCombiningClass {
#[inline]
fn cmp(&self, other: &CanonicalCombiningClass) -> ::core::cmp::Ordering {
::core::cmp::Ord::cmp(&self.0, &other.0)
}
}Ord)]
177struct CanonicalCombiningClass(pub(crate) u8);
178179#[cfg(not(feature = "icu_properties"))]
180impl CanonicalCombiningClass {
181const fn from_icu4c_value(v: u8) -> Self {
182Self(v)
183 }
184const fn to_icu4c_value(self) -> u8 {
185self.0
186}
187}
188189const CCC_NOT_REORDERED: CanonicalCombiningClass = const { CanonicalCombiningClass::from_icu4c_value(0) }ccc!(NotReordered, 0);
190const CCC_ABOVE: CanonicalCombiningClass = const { CanonicalCombiningClass::from_icu4c_value(230) }ccc!(Above, 230);
/// Treatment of the ignorable marker (0xFFFFFFFF) in data.
#[derive(Debug, PartialEq, Eq)]
enum IgnorableBehavior {
    /// 0xFFFFFFFF in data is not supported.
    Unsupported,
    /// Ignorables are ignored.
    Ignored,
    /// Ignorables are treated as singleton decompositions
    /// to the REPLACEMENT CHARACTER.
    ReplacementCharacter,
}
/// Marker for UTS 46 ignorables.
///
/// See trie-value-format.md
const IGNORABLE_MARKER: u32 = 0xFFFFFFFF;

/// Marker that the decomposition does not round trip via NFC.
///
/// See trie-value-format.md
const NON_ROUND_TRIP_MARKER: u32 = 1 << 30;

/// Marker that the first character of the decomposition
/// can combine backwards.
///
/// See trie-value-format.md
const BACKWARD_COMBINING_MARKER: u32 = 1 << 31;

/// Mask for the bits that have to be zero for this to be a BMP
/// singleton decomposition, or value baked into the surrogate
/// range.
///
/// See trie-value-format.md
const HIGH_ZEROS_MASK: u32 = 0x3FFF0000;

/// Mask for the bits that have to be zero for this to be a complex
/// decomposition.
///
/// See trie-value-format.md
const LOW_ZEROS_MASK: u32 = 0xFFE0;
/// Checks if a trie value carries a (non-zero) canonical
/// combining class.
///
/// See trie-value-format.md
#[inline]
fn trie_value_has_ccc(trie_value: u32) -> bool {
    // Non-starters are encoded with 0xD800 in bits 9..=15 (the CCC itself
    // lives in the low 8 bits); the top two marker bits are ignored.
    (trie_value & 0x3FFFFE00) == 0xD800
}
/// Checks if the trie signifies a special non-starter decomposition.
///
/// See trie-value-format.md
fn trie_value_indicates_special_non_starter_decomposition(trie_value: u32) -> bool {
    // 0xD900 in bits 8..=15 marks the special non-starter decompositions;
    // the top two marker bits are ignored.
    (trie_value & 0x3FFFFF00) == 0xD900
}
248249/// Checks if a trie value signifies a character whose decomposition
250/// starts with a non-starter.
251///
252/// See trie-value-format.md
253fn decomposition_starts_with_non_starter(trie_value: u32) -> bool {
254trie_value_has_ccc(trie_value)
255}
256257/// Extracts a canonical combining class (possibly zero) from a trie value.
258///
259/// See trie-value-format.md
260fn ccc_from_trie_value(trie_value: u32) -> CanonicalCombiningClass {
261if trie_value_has_ccc(trie_value) {
262CanonicalCombiningClass::from_icu4c_value(trie_valueas u8)
263 } else {
264CCC_NOT_REORDERED265 }
266}
/// The tail (everything after the first character) of the NFKD form U+FDFA
/// as 16-bit units.
static FDFA_NFKD: [u16; 17] = [
    0x644, 0x649, 0x20, 0x627, 0x644, 0x644, 0x647, 0x20, 0x639, 0x644, 0x64A, 0x647, 0x20, 0x648,
    0x633, 0x644, 0x645,
];

/// Marker value for U+FDFA in NFKD. (Unified with Hangul syllable marker,
/// but they differ by `NON_ROUND_TRIP_MARKER`.)
///
/// See trie-value-format.md
const FDFA_MARKER: u16 = 1;
// These constants originate from page 143 of Unicode 14.0
/// Syllable base
const HANGUL_S_BASE: u32 = 0xAC00;
/// Lead jamo base
const HANGUL_L_BASE: u32 = 0x1100;
/// Vowel jamo base
const HANGUL_V_BASE: u32 = 0x1161;
/// Trail jamo base (deliberately off by one to account for the absence of a trail)
const HANGUL_T_BASE: u32 = 0x11A7;
/// Lead jamo count
const HANGUL_L_COUNT: u32 = 19;
/// Vowel jamo count
const HANGUL_V_COUNT: u32 = 21;
/// Trail jamo count (deliberately off by one to account for the absence of a trail)
const HANGUL_T_COUNT: u32 = 28;
/// Vowel jamo count times trail jamo count
const HANGUL_N_COUNT: u32 = 588;
/// Syllable count
const HANGUL_S_COUNT: u32 = 11172;

/// One past the conjoining jamo block
const HANGUL_JAMO_LIMIT: u32 = 0x1200;
/// If `opt` is `Some`, unwrap it. If `None`, panic if debug assertions
/// are enabled and return `default` if debug assertions are not enabled.
///
/// Use this only if the only reason why `opt` could be `None` is bogus
/// data from the provider.
#[inline(always)]
fn unwrap_or_gigo<T>(opt: Option<T>, default: T) -> T {
    if let Some(val) = opt {
        val
    } else {
        // GIGO case
        debug_assert!(false);
        default
    }
}
319320/// Convert a `u32` _obtained from data provider data_ to `char`.
321#[inline(always)]
322fn char_from_u32(u: u32) -> char {
323unwrap_or_gigo(core::char::from_u32(u), REPLACEMENT_CHARACTER)
324}
325326/// Convert a `u16` _obtained from data provider data_ to `char`.
327#[inline(always)]
328fn char_from_u16(u: u16) -> char {
329char_from_u32(u32::from(u))
330}
331332const EMPTY_U16: &ZeroSlice<u16> = ::zerovec::ZeroSlice::new_empty()zeroslice![];
333334const EMPTY_CHAR: &ZeroSlice<char> = ::zerovec::ZeroSlice::new_empty()zeroslice![];
/// Returns `true` iff `start <= c <= end`, using a single wrapping-subtract
/// comparison instead of two compares.
#[inline(always)]
fn in_inclusive_range(c: char, start: char, end: char) -> bool {
    u32::from(c).wrapping_sub(u32::from(start)) <= (u32::from(end) - u32::from(start))
}
/// Returns `true` iff `start <= u <= end`, using a single wrapping-subtract
/// comparison instead of two compares.
#[inline(always)]
#[cfg(feature = "utf16_iter")]
fn in_inclusive_range16(u: u16, start: u16, end: u16) -> bool {
    u.wrapping_sub(start) <= (end - start)
}
346347/// Performs canonical composition (including Hangul) on a pair of
348/// characters or returns `None` if these characters don't compose.
349/// Composition exclusions are taken into account.
350#[inline]
351fn compose(iter: Char16TrieIterator, starter: char, second: char) -> Option<char> {
352let v = u32::from(second).wrapping_sub(HANGUL_V_BASE);
353if v >= HANGUL_JAMO_LIMIT - HANGUL_V_BASE {
354return compose_non_hangul(iter, starter, second);
355 }
356if v < HANGUL_V_COUNT {
357let l = u32::from(starter).wrapping_sub(HANGUL_L_BASE);
358if l < HANGUL_L_COUNT {
359let lv = l * HANGUL_N_COUNT + v * HANGUL_T_COUNT;
360// Safe, because the inputs are known to be in range.
361return Some(unsafe { char::from_u32_unchecked(HANGUL_S_BASE + lv) });
362 }
363return None;
364 }
365if in_inclusive_range(second, '\u{11A8}', '\u{11C2}') {
366let lv = u32::from(starter).wrapping_sub(HANGUL_S_BASE);
367if lv < HANGUL_S_COUNT && lv % HANGUL_T_COUNT == 0 {
368let lvt = lv + (u32::from(second) - HANGUL_T_BASE);
369// Safe, because the inputs are known to be in range.
370return Some(unsafe { char::from_u32_unchecked(HANGUL_S_BASE + lvt) });
371 }
372 }
373None374}
375376/// Performs (non-Hangul) canonical composition on a pair of characters
377/// or returns `None` if these characters don't compose. Composition
378/// exclusions are taken into account.
379fn compose_non_hangul(mut iter: Char16TrieIterator, starter: char, second: char) -> Option<char> {
380// To make the trie smaller, the pairs are stored second character first.
381 // Given how this method is used in ways where it's known that `second`
382 // is or isn't a starter. We could potentially split the trie into two
383 // tries depending on whether `second` is a starter.
384match iter.next(second) {
385 TrieResult::NoMatch => None,
386 TrieResult::NoValue => match iter.next(starter) {
387 TrieResult::NoMatch => None,
388 TrieResult::FinalValue(i) => {
389if let Some(c) = char::from_u32(ias u32) {
390Some(c)
391 } else {
392// GIGO case
393if true {
if !false { ::core::panicking::panic("assertion failed: false") };
};debug_assert!(false);
394None395 }
396 }
397 TrieResult::NoValue | TrieResult::Intermediate(_) => {
398// GIGO case
399if true {
if !false { ::core::panicking::panic("assertion failed: false") };
};debug_assert!(false);
400None401 }
402 },
403 TrieResult::FinalValue(_) | TrieResult::Intermediate(_) => {
404// GIGO case
405if true {
if !false { ::core::panicking::panic("assertion failed: false") };
};debug_assert!(false);
406None407 }
408 }
409}
410411/// See trie-value-format.md
412#[inline(always)]
413fn starter_and_decomposes_to_self_impl(trie_val: u32) -> bool {
414// The REPLACEMENT CHARACTER has `NON_ROUND_TRIP_MARKER` set,
415 // and this function needs to ignore that.
416(trie_val & !(BACKWARD_COMBINING_MARKER | NON_ROUND_TRIP_MARKER)) == 0
417}
418419/// See trie-value-format.md
420#[inline(always)]
421fn potential_passthrough_and_cannot_combine_backwards_impl(trie_val: u32) -> bool {
422 (trie_val & (NON_ROUND_TRIP_MARKER | BACKWARD_COMBINING_MARKER)) == 0
423}
/// Struct for holding together a character and the value
/// looked up for it from the NFD trie in a more explicit
/// way than an anonymous pair.
/// Also holds a flag about the supplementary-trie provenance.
#[derive(Debug, PartialEq, Eq)]
struct CharacterAndTrieValue {
    character: char,
    /// See trie-value-format.md
    trie_val: u32,
}
435436impl CharacterAndTrieValue {
437#[inline(always)]
438pub fn new(c: char, trie_value: u32) -> Self {
439CharacterAndTrieValue {
440 character: c,
441 trie_val: trie_value,
442 }
443 }
444445#[inline(always)]
446pub fn starter_and_decomposes_to_self(&self) -> bool {
447starter_and_decomposes_to_self_impl(self.trie_val)
448 }
449450/// See trie-value-format.md
451#[inline(always)]
452 #[cfg(feature = "utf8_iter")]
453pub fn starter_and_decomposes_to_self_except_replacement(&self) -> bool {
454// This intentionally leaves `NON_ROUND_TRIP_MARKER` in the value
455 // to be compared with zero. U+FFFD has that flag set despite really
456 // being being round-tripping in order to make UTF-8 errors
457 // ineligible for passthrough.
458(self.trie_val & !BACKWARD_COMBINING_MARKER) == 0
459}
460461/// See trie-value-format.md
462#[inline(always)]
463pub fn can_combine_backwards(&self) -> bool {
464 (self.trie_val & BACKWARD_COMBINING_MARKER) != 0
465}
466/// See trie-value-format.md
467#[inline(always)]
468pub fn potential_passthrough(&self) -> bool {
469 (self.trie_val & NON_ROUND_TRIP_MARKER) == 0
470}
471/// See trie-value-format.md
472#[inline(always)]
473pub fn potential_passthrough_and_cannot_combine_backwards(&self) -> bool {
474potential_passthrough_and_cannot_combine_backwards_impl(self.trie_val)
475 }
476}
/// Pack a `char` and a `CanonicalCombiningClass` in
/// 32 bits (the former in the lower 24 bits and the
/// latter in the high 8 bits). The latter can be
/// initialized to 0xFF upon creation, in which case
/// it can be actually set later by calling
/// `set_ccc_from_trie_if_not_already_set`. This is
/// a micro optimization to avoid the Canonical
/// Combining Class trie lookup when there is only
/// one combining character in a sequence. This type
/// is intentionally non-`Copy` to get compiler help
/// in making sure that the class is set on the
/// instance on which it is intended to be set
/// and not on a temporary copy.
///
/// Note that 0xFF won't be assigned to an actual
/// canonical combining class per definition D104
/// in The Unicode Standard.
//
// NOTE: The Pernosco debugger has special knowledge
// of this struct. Please do not change the bit layout
// or the crate-module-qualified name of this struct
// without coordination.
#[derive(Debug)]
struct CharacterAndClass(u32);
502503impl CharacterAndClass {
504pub fn new(c: char, ccc: CanonicalCombiningClass) -> Self {
505CharacterAndClass(u32::from(c) | (u32::from(ccc.to_icu4c_value()) << 24))
506 }
507pub fn new_with_placeholder(c: char) -> Self {
508CharacterAndClass(u32::from(c) | ((0xFF) << 24))
509 }
510pub fn new_with_trie_value(c_tv: CharacterAndTrieValue) -> Self {
511Self::new(c_tv.character, ccc_from_trie_value(c_tv.trie_val))
512 }
513pub fn new_starter(c: char) -> Self {
514CharacterAndClass(u32::from(c))
515 }
516/// This method must exist for Pernosco to apply its special rendering.
517 /// Also, this must not be dead code!
518pub fn character(&self) -> char {
519// Safe, because the low 24 bits came from a `char`
520 // originally.
521unsafe { char::from_u32_unchecked(self.0 & 0xFFFFFF) }
522 }
523/// This method must exist for Pernosco to apply its special rendering.
524pub fn ccc(&self) -> CanonicalCombiningClass {
525CanonicalCombiningClass::from_icu4c_value((self.0 >> 24) as u8)
526 }
527528pub fn character_and_ccc(&self) -> (char, CanonicalCombiningClass) {
529 (self.character(), self.ccc())
530 }
531pub fn set_ccc_from_trie_if_not_already_set(&mut self, trie: &Trie) {
532if self.0 >> 24 != 0xFF {
533return;
534 }
535let scalar = self.0 & 0xFFFFFF;
536self.0 =
537 ((ccc_from_trie_value(trie.get32_u32(scalar)).to_icu4c_value() as u32) << 24) | scalar;
538 }
539}
540541// This function exists as a borrow check helper.
542#[inline(always)]
543fn sort_slice_by_ccc(slice: &mut [CharacterAndClass], trie: &Trie) {
544// We don't look up the canonical combining class for starters
545 // of for single combining characters between starters. When
546 // there's more than one combining character between starters,
547 // we look up the canonical combining class for each character
548 // exactly once.
549if slice.len() < 2 {
550return;
551 }
552slice553 .iter_mut()
554 .for_each(|cc| cc.set_ccc_from_trie_if_not_already_set(trie));
555slice.sort_by_key(|cc| cc.ccc());
556}
557558/// An iterator adaptor that turns an `Iterator` over `char` into
559/// a lazily-decomposed `char` sequence.
560#[derive(#[automatically_derived]
impl<'data, I: ::core::fmt::Debug> ::core::fmt::Debug for
Decomposition<'data, I> where I: Iterator<Item = char> {
#[inline]
fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result {
let names: &'static _ =
&["delegate", "buffer", "buffer_pos", "pending", "trie",
"scalars16", "scalars24", "supplementary_scalars16",
"supplementary_scalars24",
"decomposition_passthrough_bound", "ignorable_behavior"];
let values: &[&dyn ::core::fmt::Debug] =
&[&self.delegate, &self.buffer, &self.buffer_pos, &self.pending,
&self.trie, &self.scalars16, &self.scalars24,
&self.supplementary_scalars16,
&self.supplementary_scalars24,
&self.decomposition_passthrough_bound,
&&self.ignorable_behavior];
::core::fmt::Formatter::debug_struct_fields_finish(f, "Decomposition",
names, values)
}
}Debug)]
561pub struct Decomposition<'data, I>
562where
563I: Iterator<Item = char>,
564{
565 delegate: I,
566 buffer: SmallVec<[CharacterAndClass; 17]>, // Enough to hold NFKD for U+FDFA
567/// The index of the next item to be read from `buffer`.
568 /// The purpose if this index is to avoid having to move
569 /// the rest upon every read.
570buffer_pos: usize,
571// At the start of `next()` if not `None`, this is a pending unnormalized
572 // starter. When `Decomposition` appears alone, this is never a non-starter.
573 // However, when `Decomposition` appears inside a `Composition`, this
574 // may become a non-starter before `decomposing_next()` is called.
575pending: Option<CharacterAndTrieValue>, // None at end of stream
576 // See trie-value-format.md
577trie: &'data Trie<'data>,
578 scalars16: &'data ZeroSlice<u16>,
579 scalars24: &'data ZeroSlice<char>,
580 supplementary_scalars16: &'data ZeroSlice<u16>,
581 supplementary_scalars24: &'data ZeroSlice<char>,
582/// The lowest character for which either of the following does
583 /// not hold:
584 /// 1. Decomposes to self.
585 /// 2. Decomposition starts with a non-starter
586decomposition_passthrough_bound: u32, // never above 0xC0
587ignorable_behavior: IgnorableBehavior, // Arguably should be a type parameter
588}
589590impl<'data, I> Decomposition<'data, I>
591where
592I: Iterator<Item = char>,
593{
594/// Constructs a decomposing iterator adapter from a delegate
595 /// iterator and references to the necessary data, without
596 /// supplementary data.
597 ///
598 /// Use `DecomposingNormalizer::normalize_iter()` instead unless
599 /// there's a good reason to use this constructor directly.
600 ///
601 /// Public but hidden in order to be able to use this from the
602 /// collator.
603#[doc(hidden)] // used in collator
604pub fn new(
605 delegate: I,
606 decompositions: &'data DecompositionData,
607 tables: &'data DecompositionTables,
608 ) -> Self {
609Self::new_with_supplements(
610delegate,
611decompositions,
612tables,
613None,
6140xC0,
615 IgnorableBehavior::Unsupported,
616 )
617 }
618619/// Constructs a decomposing iterator adapter from a delegate
620 /// iterator and references to the necessary data, including
621 /// supplementary data.
622 ///
623 /// Use `DecomposingNormalizer::normalize_iter()` instead unless
624 /// there's a good reason to use this constructor directly.
625fn new_with_supplements(
626 delegate: I,
627 decompositions: &'data DecompositionData,
628 tables: &'data DecompositionTables,
629 supplementary_tables: Option<&'data DecompositionTables>,
630 decomposition_passthrough_bound: u8,
631 ignorable_behavior: IgnorableBehavior,
632 ) -> Self {
633let mut ret = Decomposition::<I> {
634delegate,
635 buffer: SmallVec::new(), // Normalized
636buffer_pos: 0,
637// Initialize with a placeholder starter in case
638 // the real stream starts with a non-starter.
639pending: Some(CharacterAndTrieValue::new('\u{FFFF}', 0)),
640#[allow(clippy::useless_conversion, clippy::expect_used)] // Expectation always succeeds when untyped tries are in use
641trie: <&Trie>::try_from(&decompositions.trie).expect("Unexpected trie type in data"),
642 scalars16: &tables.scalars16,
643 scalars24: &tables.scalars24,
644 supplementary_scalars16: if let Some(supplementary) = supplementary_tables {
645&supplementary.scalars16
646 } else {
647EMPTY_U16648 },
649 supplementary_scalars24: if let Some(supplementary) = supplementary_tables {
650&supplementary.scalars24
651 } else {
652EMPTY_CHAR653 },
654 decomposition_passthrough_bound: u32::from(decomposition_passthrough_bound),
655ignorable_behavior,
656 };
657let _ = ret.next(); // Remove the U+FFFF placeholder
658ret659 }
660661fn push_decomposition16(
662&mut self,
663 offset: usize,
664 len: usize,
665 only_non_starters_in_trail: bool,
666 slice16: &ZeroSlice<u16>,
667 ) -> (char, usize) {
668let (starter, tail) = slice16669 .get_subslice(offset..offset + len)
670 .and_then(|slice| slice.split_first())
671 .map_or_else(
672 || {
673// GIGO case
674if true {
if !false { ::core::panicking::panic("assertion failed: false") };
};debug_assert!(false);
675 (REPLACEMENT_CHARACTER, EMPTY_U16)
676 },
677 |(first, trail)| (char_from_u16(first), trail),
678 );
679if only_non_starters_in_trail {
680// All the rest are combining
681self.buffer.extend(
682tail.iter()
683 .map(|u| CharacterAndClass::new_with_placeholder(char_from_u16(u))),
684 );
685 (starter, 0)
686 } else {
687let mut i = 0;
688let mut combining_start = 0;
689for u in tail.iter() {
690let ch = char_from_u16(u);
691let trie_value = self.trie.get(ch);
692self.buffer.push(CharacterAndClass::new_with_trie_value(
693 CharacterAndTrieValue::new(ch, trie_value),
694 ));
695 i += 1;
696// Half-width kana and iota subscript don't occur in the tails
697 // of these multicharacter decompositions.
698if !decomposition_starts_with_non_starter(trie_value) {
699 combining_start = i;
700 }
701 }
702 (starter, combining_start)
703 }
704 }
705706fn push_decomposition32(
707&mut self,
708 offset: usize,
709 len: usize,
710 only_non_starters_in_trail: bool,
711 slice32: &ZeroSlice<char>,
712 ) -> (char, usize) {
713let (starter, tail) = slice32714 .get_subslice(offset..offset + len)
715 .and_then(|slice| slice.split_first())
716 .unwrap_or_else(|| {
717// GIGO case
718if true {
if !false { ::core::panicking::panic("assertion failed: false") };
};debug_assert!(false);
719 (REPLACEMENT_CHARACTER, EMPTY_CHAR)
720 });
721if only_non_starters_in_trail {
722// All the rest are combining
723self.buffer
724 .extend(tail.iter().map(CharacterAndClass::new_with_placeholder));
725 (starter, 0)
726 } else {
727let mut i = 0;
728let mut combining_start = 0;
729for ch in tail.iter() {
730let trie_value = self.trie.get(ch);
731self.buffer.push(CharacterAndClass::new_with_trie_value(
732 CharacterAndTrieValue::new(ch, trie_value),
733 ));
734 i += 1;
735// Half-width kana and iota subscript don't occur in the tails
736 // of these multicharacter decompositions.
737if !decomposition_starts_with_non_starter(trie_value) {
738 combining_start = i;
739 }
740 }
741 (starter, combining_start)
742 }
743 }
744745#[inline(always)]
746fn attach_trie_value(&self, c: char) -> CharacterAndTrieValue {
747CharacterAndTrieValue::new(c, self.trie.get(c))
748 }
749750fn delegate_next_no_pending(&mut self) -> Option<CharacterAndTrieValue> {
751if true {
if !self.pending.is_none() {
::core::panicking::panic("assertion failed: self.pending.is_none()")
};
};debug_assert!(self.pending.is_none());
752loop {
753let c = self.delegate.next()?;
754755// TODO(#2384): Measure if this check is actually an optimization.
756if u32::from(c) < self.decomposition_passthrough_bound {
757return Some(CharacterAndTrieValue::new(c, 0));
758 }
759760let trie_val = self.trie.get(c);
761// TODO: Can we do something better about the cost of this branch in the
762 // non-UTS 46 case?
763if trie_val == IGNORABLE_MARKER {
764match self.ignorable_behavior {
765 IgnorableBehavior::Unsupported => {
766if true {
if !false { ::core::panicking::panic("assertion failed: false") };
};debug_assert!(false);
767 }
768 IgnorableBehavior::ReplacementCharacter => {
769return Some(CharacterAndTrieValue::new(
770c,
771u32::from(REPLACEMENT_CHARACTER) | NON_ROUND_TRIP_MARKER,
772 ));
773 }
774 IgnorableBehavior::Ignored => {
775// Else ignore this character by reading the next one from the delegate.
776continue;
777 }
778 }
779 }
780return Some(CharacterAndTrieValue::new(c, trie_val));
781 }
782 }
783784fn delegate_next(&mut self) -> Option<CharacterAndTrieValue> {
785if let Some(pending) = self.pending.take() {
786// Only happens as part of `Composition` and as part of
787 // the contiguous-buffer methods of `DecomposingNormalizer`.
788 // I.e. does not happen as part of standalone iterator
789 // usage of `Decomposition`.
790Some(pending)
791 } else {
792self.delegate_next_no_pending()
793 }
794 }
795796fn decomposing_next(&mut self, c_and_trie_val: CharacterAndTrieValue) -> char {
797let (starter, combining_start) = {
798let c = c_and_trie_val.character;
799// See trie-value-format.md
800let decomposition = c_and_trie_val.trie_val;
801// The REPLACEMENT CHARACTER has `NON_ROUND_TRIP_MARKER` set,
802 // and that flag needs to be ignored here.
803if (decomposition & !(BACKWARD_COMBINING_MARKER | NON_ROUND_TRIP_MARKER)) == 0 {
804// The character is its own decomposition
805(c, 0)
806 } else {
807let high_zeros = (decomposition & HIGH_ZEROS_MASK) == 0;
808let low_zeros = (decomposition & LOW_ZEROS_MASK) == 0;
809if !high_zeros && !low_zeros {
810// Decomposition into two BMP characters: starter and non-starter
811let starter = char_from_u32(decomposition & 0x7FFF);
812let combining = char_from_u32((decomposition >> 15) & 0x7FFF);
813self.buffer
814 .push(CharacterAndClass::new_with_placeholder(combining));
815 (starter, 0)
816 } else if high_zeros {
817// Do the check by looking at `c` instead of looking at a marker
818 // in `singleton` below, because if we looked at the trie value,
819 // we'd still have to check that `c` is in the Hangul syllable
820 // range in order for the subsequent interpretations as `char`
821 // to be safe.
822 // Alternatively, `FDFA_MARKER` and the Hangul marker could
823 // be unified. That would add a branch for Hangul and remove
824 // a branch from singleton decompositions. It seems more
825 // important to favor Hangul syllables than singleton
826 // decompositions.
827 // Note that it would be valid to hoist this Hangul check
828 // one or even two steps earlier in this check hierarchy.
829 // Right now, it's assumed the kind of decompositions into
830 // BMP starter and non-starter, which occur in many languages,
831 // should be checked before Hangul syllables, which are about
832 // one language specifically. Hopefully, we get some
833 // instruction-level parallelism out of the disjointness of
834 // operations on `c` and `decomposition`.
835let hangul_offset = u32::from(c).wrapping_sub(HANGUL_S_BASE); // SIndex in the spec
836if hangul_offset < HANGUL_S_COUNT {
837if true {
match (&decomposition, &1) {
(left_val, right_val) => {
if !(*left_val == *right_val) {
let kind = ::core::panicking::AssertKind::Eq;
::core::panicking::assert_failed(kind, &*left_val,
&*right_val, ::core::option::Option::None);
}
}
};
};debug_assert_eq!(decomposition, 1);
838// Hangul syllable
839 // The math here comes from page 144 of Unicode 14.0
840let l = hangul_offset / HANGUL_N_COUNT;
841let v = (hangul_offset % HANGUL_N_COUNT) / HANGUL_T_COUNT;
842let t = hangul_offset % HANGUL_T_COUNT;
843844// The unsafe blocks here are OK, because the values stay
845 // within the Hangul jamo block and, therefore, the scalar
846 // value range by construction.
847self.buffer.push(CharacterAndClass::new_starter(unsafe {
848 core::char::from_u32_unchecked(HANGUL_V_BASE + v)
849 }));
850let first = unsafe { core::char::from_u32_unchecked(HANGUL_L_BASE + l) };
851if t != 0 {
852self.buffer.push(CharacterAndClass::new_starter(unsafe {
853 core::char::from_u32_unchecked(HANGUL_T_BASE + t)
854 }));
855 (first, 2)
856 } else {
857 (first, 1)
858 }
859 } else {
860let singleton = decompositionas u16;
861if singleton != FDFA_MARKER {
862// Decomposition into one BMP character
863let starter = char_from_u16(singleton);
864 (starter, 0)
865 } else {
866// Special case for the NFKD form of U+FDFA.
867self.buffer.extend(FDFA_NFKD.map(|u| {
868// SAFETY: `FDFA_NFKD` is known not to contain
869 // surrogates.
870CharacterAndClass::new_starter(unsafe {
871 core::char::from_u32_unchecked(u32::from(u))
872 })
873 }));
874 ('\u{0635}', 17)
875 }
876 }
877 } else {
878if true {
if !low_zeros { ::core::panicking::panic("assertion failed: low_zeros") };
};debug_assert!(low_zeros);
879// Only 12 of 14 bits used as of Unicode 16.
880let offset = (((decomposition & !(0b11 << 30)) >> 16) as usize) - 1;
881// Only 3 of 4 bits used as of Unicode 16.
882let len_bits = decomposition & 0b1111;
883let only_non_starters_in_trail = (decomposition & 0b10000) != 0;
884if offset < self.scalars16.len() {
885self.push_decomposition16(
886offset,
887 (len_bits + 2) as usize,
888only_non_starters_in_trail,
889self.scalars16,
890 )
891 } else if offset < self.scalars16.len() + self.scalars24.len() {
892self.push_decomposition32(
893offset - self.scalars16.len(),
894 (len_bits + 1) as usize,
895only_non_starters_in_trail,
896self.scalars24,
897 )
898 } else if offset899 < self.scalars16.len()
900 + self.scalars24.len()
901 + self.supplementary_scalars16.len()
902 {
903self.push_decomposition16(
904offset - (self.scalars16.len() + self.scalars24.len()),
905 (len_bits + 2) as usize,
906only_non_starters_in_trail,
907self.supplementary_scalars16,
908 )
909 } else {
910self.push_decomposition32(
911offset912 - (self.scalars16.len()
913 + self.scalars24.len()
914 + self.supplementary_scalars16.len()),
915 (len_bits + 1) as usize,
916only_non_starters_in_trail,
917self.supplementary_scalars24,
918 )
919 }
920 }
921 }
922 };
923// Either we're inside `Composition` or `self.pending.is_none()`.
924925self.gather_and_sort_combining(combining_start);
926starter927 }
928929fn gather_and_sort_combining(&mut self, combining_start: usize) {
930// Not a `for` loop to avoid holding a mutable reference to `self` across
931 // the loop body.
932while let Some(ch_and_trie_val) = self.delegate_next() {
933if !trie_value_has_ccc(ch_and_trie_val.trie_val) {
934self.pending = Some(ch_and_trie_val);
935break;
936 } else if !trie_value_indicates_special_non_starter_decomposition(
937 ch_and_trie_val.trie_val,
938 ) {
939self.buffer
940 .push(CharacterAndClass::new_with_trie_value(ch_and_trie_val));
941 } else {
942// The Tibetan special cases are starters that decompose into non-starters.
943let mapped = match ch_and_trie_val.character {
944'\u{0340}' => {
945// COMBINING GRAVE TONE MARK
946CharacterAndClass::new('\u{0300}', CCC_ABOVE)
947 }
948'\u{0341}' => {
949// COMBINING ACUTE TONE MARK
950CharacterAndClass::new('\u{0301}', CCC_ABOVE)
951 }
952'\u{0343}' => {
953// COMBINING GREEK KORONIS
954CharacterAndClass::new('\u{0313}', CCC_ABOVE)
955 }
956'\u{0344}' => {
957// COMBINING GREEK DIALYTIKA TONOS
958self.buffer
959 .push(CharacterAndClass::new('\u{0308}', CCC_ABOVE));
960 CharacterAndClass::new('\u{0301}', CCC_ABOVE)
961 }
962'\u{0F73}' => {
963// TIBETAN VOWEL SIGN II
964self.buffer
965 .push(CharacterAndClass::new('\u{0F71}', const { CanonicalCombiningClass::from_icu4c_value(129) }ccc!(CCC129, 129)));
966 CharacterAndClass::new('\u{0F72}', const { CanonicalCombiningClass::from_icu4c_value(130) }ccc!(CCC130, 130))
967 }
968'\u{0F75}' => {
969// TIBETAN VOWEL SIGN UU
970self.buffer
971 .push(CharacterAndClass::new('\u{0F71}', const { CanonicalCombiningClass::from_icu4c_value(129) }ccc!(CCC129, 129)));
972 CharacterAndClass::new('\u{0F74}', const { CanonicalCombiningClass::from_icu4c_value(132) }ccc!(CCC132, 132))
973 }
974'\u{0F81}' => {
975// TIBETAN VOWEL SIGN REVERSED II
976self.buffer
977 .push(CharacterAndClass::new('\u{0F71}', const { CanonicalCombiningClass::from_icu4c_value(129) }ccc!(CCC129, 129)));
978 CharacterAndClass::new('\u{0F80}', const { CanonicalCombiningClass::from_icu4c_value(130) }ccc!(CCC130, 130))
979 }
980'\u{FF9E}' => {
981// HALFWIDTH KATAKANA VOICED SOUND MARK
982 CharacterAndClass::new('\u{3099}', const { CanonicalCombiningClass::from_icu4c_value(8) }ccc!(KanaVoicing, 8))
983 }
984'\u{FF9F}' => {
985// HALFWIDTH KATAKANA VOICED SOUND MARK
986 CharacterAndClass::new('\u{309A}', const { CanonicalCombiningClass::from_icu4c_value(8) }ccc!(KanaVoicing, 8))
987 }
988_ => {
989// GIGO case
990if true {
if !false { ::core::panicking::panic("assertion failed: false") };
};debug_assert!(false);
991 CharacterAndClass::new_with_placeholder(REPLACEMENT_CHARACTER)
992 }
993 };
994self.buffer.push(mapped);
995 }
996 }
997// Slicing succeeds by construction; we've always ensured that `combining_start`
998 // is in permissible range.
999#[expect(clippy::indexing_slicing)]
1000sort_slice_by_ccc(&mut self.buffer[combining_start..], self.trie);
1001 }
1002}
10031004impl<I> Iteratorfor Decomposition<'_, I>
1005where
1006I: Iterator<Item = char>,
1007{
1008type Item = char;
10091010fn next(&mut self) -> Option<char> {
1011if let Some(ret) = self.buffer.get(self.buffer_pos).map(|c| c.character()) {
1012self.buffer_pos += 1;
1013if self.buffer_pos == self.buffer.len() {
1014self.buffer.clear();
1015self.buffer_pos = 0;
1016 }
1017return Some(ret);
1018 }
1019if true {
match (&self.buffer_pos, &0) {
(left_val, right_val) => {
if !(*left_val == *right_val) {
let kind = ::core::panicking::AssertKind::Eq;
::core::panicking::assert_failed(kind, &*left_val,
&*right_val, ::core::option::Option::None);
}
}
};
};debug_assert_eq!(self.buffer_pos, 0);
1020let c_and_trie_val = self.pending.take()?;
1021Some(self.decomposing_next(c_and_trie_val))
1022 }
1023}
10241025/// An iterator adaptor that turns an `Iterator` over `char` into
1026/// a lazily-decomposed and then canonically composed `char` sequence.
1027#[derive(#[automatically_derived]
impl<'data, I: ::core::fmt::Debug> ::core::fmt::Debug for
Composition<'data, I> where I: Iterator<Item = char> {
#[inline]
fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result {
::core::fmt::Formatter::debug_struct_field4_finish(f, "Composition",
"decomposition", &self.decomposition, "canonical_compositions",
&self.canonical_compositions, "unprocessed_starter",
&self.unprocessed_starter, "composition_passthrough_bound",
&&self.composition_passthrough_bound)
}
}Debug)]
1028pub struct Composition<'data, I>
1029where
1030I: Iterator<Item = char>,
1031{
1032/// The decomposing part of the normalizer than operates before
1033 /// the canonical composition is performed on its output.
1034decomposition: Decomposition<'data, I>,
1035/// Non-Hangul canonical composition data.
1036canonical_compositions: Char16Trie<'data>,
1037/// To make `next()` yield in cases where there's a non-composing
1038 /// starter in the decomposition buffer, we put it here to let it
1039 /// wait for the next `next()` call (or a jump forward within the
1040 /// `next()` call).
1041unprocessed_starter: Option<char>,
1042/// The lowest character for which any one of the following does
1043 /// not hold:
1044 /// 1. Roundtrips via decomposition and recomposition.
1045 /// 2. Decomposition starts with a non-starter
1046 /// 3. Is not a backward-combining starter
1047composition_passthrough_bound: u32,
1048}
10491050impl<'data, I> Composition<'data, I>
1051where
1052I: Iterator<Item = char>,
1053{
1054fn new(
1055 decomposition: Decomposition<'data, I>,
1056 canonical_compositions: Char16Trie<'data>,
1057 composition_passthrough_bound: u16,
1058 ) -> Self {
1059Self {
1060decomposition,
1061canonical_compositions,
1062 unprocessed_starter: None,
1063 composition_passthrough_bound: u32::from(composition_passthrough_bound),
1064 }
1065 }
10661067/// Performs canonical composition (including Hangul) on a pair of
1068 /// characters or returns `None` if these characters don't compose.
1069 /// Composition exclusions are taken into account.
1070#[inline(always)]
1071pub fn compose(&self, starter: char, second: char) -> Option<char> {
1072compose(self.canonical_compositions.iter(), starter, second)
1073 }
10741075/// Performs (non-Hangul) canonical composition on a pair of characters
1076 /// or returns `None` if these characters don't compose. Composition
1077 /// exclusions are taken into account.
1078#[inline(always)]
1079fn compose_non_hangul(&self, starter: char, second: char) -> Option<char> {
1080compose_non_hangul(self.canonical_compositions.iter(), starter, second)
1081 }
1082}
10831084impl<I> Iteratorfor Composition<'_, I>
1085where
1086I: Iterator<Item = char>,
1087{
1088type Item = char;
10891090#[inline]
1091fn next(&mut self) -> Option<char> {
1092let mut undecomposed_starter = CharacterAndTrieValue::new('\u{0}', 0); // The compiler can't figure out that this gets overwritten before use.
1093if self.unprocessed_starter.is_none() {
1094// The loop is only broken out of as goto forward
1095#[expect(clippy::never_loop)]
1096loop {
1097if let Some((character, ccc)) = self1098 .decomposition
1099 .buffer
1100 .get(self.decomposition.buffer_pos)
1101 .map(|c| c.character_and_ccc())
1102 {
1103self.decomposition.buffer_pos += 1;
1104if self.decomposition.buffer_pos == self.decomposition.buffer.len() {
1105self.decomposition.buffer.clear();
1106self.decomposition.buffer_pos = 0;
1107 }
1108if ccc == CCC_NOT_REORDERED {
1109// Previous decomposition contains a starter. This must
1110 // now become the `unprocessed_starter` for it to have
1111 // a chance to compose with the upcoming characters.
1112 //
1113 // E.g. parenthesized Hangul in NFKC comes through here,
1114 // but suitable composition exclusion could exercise this
1115 // in NFC.
1116self.unprocessed_starter = Some(character);
1117break; // We already have a starter, so skip taking one from `pending`.
1118}
1119return Some(character);
1120 }
1121if true {
match (&self.decomposition.buffer_pos, &0) {
(left_val, right_val) => {
if !(*left_val == *right_val) {
let kind = ::core::panicking::AssertKind::Eq;
::core::panicking::assert_failed(kind, &*left_val,
&*right_val, ::core::option::Option::None);
}
}
};
};debug_assert_eq!(self.decomposition.buffer_pos, 0);
1122undecomposed_starter = self.decomposition.pending.take()?;
1123if u32::from(undecomposed_starter.character) < self.composition_passthrough_bound
1124 || undecomposed_starter.potential_passthrough()
1125 {
1126// TODO(#2385): In the NFC case (moot for NFKC and UTS46), if the upcoming
1127 // character is not below `decomposition_passthrough_bound` but is
1128 // below `composition_passthrough_bound`, we read from the trie
1129 // unnecessarily.
1130if let Some(upcoming) = self.decomposition.delegate_next_no_pending() {
1131let cannot_combine_backwards = u32::from(upcoming.character)
1132 < self.composition_passthrough_bound
1133 || !upcoming.can_combine_backwards();
1134self.decomposition.pending = Some(upcoming);
1135if cannot_combine_backwards {
1136// Fast-track succeeded!
1137return Some(undecomposed_starter.character);
1138 }
1139 } else {
1140// End of stream
1141return Some(undecomposed_starter.character);
1142 }
1143 }
1144break; // Not actually looping
1145}
1146 }
1147let mut starter = '\u{0}'; // The compiler can't figure out this gets overwritten before use.
11481149 // The point of having this boolean is to have only one call site to
1150 // `self.decomposition.decomposing_next`, which is hopefully beneficial for
1151 // code size under inlining.
1152let mut attempt_composition = false;
1153loop {
1154if let Some(unprocessed) = self.unprocessed_starter.take() {
1155if true {
match (&undecomposed_starter, &CharacterAndTrieValue::new('\u{0}', 0)) {
(left_val, right_val) => {
if !(*left_val == *right_val) {
let kind = ::core::panicking::AssertKind::Eq;
::core::panicking::assert_failed(kind, &*left_val,
&*right_val, ::core::option::Option::None);
}
}
};
};debug_assert_eq!(undecomposed_starter, CharacterAndTrieValue::new('\u{0}', 0));
1156if true {
match (&starter, &'\u{0}') {
(left_val, right_val) => {
if !(*left_val == *right_val) {
let kind = ::core::panicking::AssertKind::Eq;
::core::panicking::assert_failed(kind, &*left_val,
&*right_val, ::core::option::Option::None);
}
}
};
};debug_assert_eq!(starter, '\u{0}');
1157starter = unprocessed;
1158 } else {
1159if true {
match (&self.decomposition.buffer_pos, &0) {
(left_val, right_val) => {
if !(*left_val == *right_val) {
let kind = ::core::panicking::AssertKind::Eq;
::core::panicking::assert_failed(kind, &*left_val,
&*right_val, ::core::option::Option::None);
}
}
};
};debug_assert_eq!(self.decomposition.buffer_pos, 0);
1160let next_starter = self.decomposition.decomposing_next(undecomposed_starter);
1161if !attempt_composition {
1162starter = next_starter;
1163 } else if let Some(composed) = self.compose(starter, next_starter) {
1164starter = composed;
1165 } else {
1166// This is our yield point. We'll pick this up above in the
1167 // next call to `next()`.
1168self.unprocessed_starter = Some(next_starter);
1169return Some(starter);
1170 }
1171 }
1172// We first loop by index to avoid moving the contents of `buffer`, but
1173 // if there's a discontiguous match, we'll start modifying `buffer` instead.
1174loop {
1175let (character, ccc) = if let Some((character, ccc)) = self1176 .decomposition
1177 .buffer
1178 .get(self.decomposition.buffer_pos)
1179 .map(|c| c.character_and_ccc())
1180 {
1181 (character, ccc)
1182 } else {
1183self.decomposition.buffer.clear();
1184self.decomposition.buffer_pos = 0;
1185break;
1186 };
1187if let Some(composed) = self.compose(starter, character) {
1188starter = composed;
1189self.decomposition.buffer_pos += 1;
1190continue;
1191 }
1192let mut most_recent_skipped_ccc = ccc;
1193 {
1194let _ = self1195 .decomposition
1196 .buffer
1197 .drain(0..self.decomposition.buffer_pos);
1198 }
1199self.decomposition.buffer_pos = 0;
1200if most_recent_skipped_ccc == CCC_NOT_REORDERED {
1201// We failed to compose a starter. Discontiguous match not allowed.
1202 // We leave the starter in `buffer` for `next()` to find.
1203return Some(starter);
1204 }
1205let mut i = 1; // We have skipped one non-starter.
1206while let Some((character, ccc)) = self
1207.decomposition
1208 .buffer
1209 .get(i)
1210 .map(|c| c.character_and_ccc())
1211 {
1212if ccc == CCC_NOT_REORDERED {
1213// Discontiguous match not allowed.
1214return Some(starter);
1215 }
1216if true {
if !(ccc >= most_recent_skipped_ccc) {
::core::panicking::panic("assertion failed: ccc >= most_recent_skipped_ccc")
};
};debug_assert!(ccc >= most_recent_skipped_ccc);
1217if ccc != most_recent_skipped_ccc {
1218// Using the non-Hangul version as a micro-optimization, since
1219 // we already rejected the case where `second` is a starter
1220 // above, and conjoining jamo are starters.
1221if let Some(composed) = self.compose_non_hangul(starter, character) {
1222self.decomposition.buffer.remove(i);
1223 starter = composed;
1224continue;
1225 }
1226 }
1227 most_recent_skipped_ccc = ccc;
1228 i += 1;
1229 }
1230break;
1231 }
12321233if true {
match (&self.decomposition.buffer_pos, &0) {
(left_val, right_val) => {
if !(*left_val == *right_val) {
let kind = ::core::panicking::AssertKind::Eq;
::core::panicking::assert_failed(kind, &*left_val,
&*right_val, ::core::option::Option::None);
}
}
};
};debug_assert_eq!(self.decomposition.buffer_pos, 0);
12341235if !self.decomposition.buffer.is_empty() {
1236return Some(starter);
1237 }
1238// Now we need to check if composition with an upcoming starter is possible.
1239if let Some(pending) = self.decomposition.pending.take() {
1240// We know that `pending_starter` decomposes to start with a starter.
1241 // Otherwise, it would have been moved to `self.decomposition.buffer`
1242 // by `self.decomposing_next()`. We do this set lookup here in order
1243 // to get an opportunity to go back to the fast track.
1244 // Note that this check has to happen _after_ checking that `pending`
1245 // holds a character, because this flag isn't defined to be meaningful
1246 // when `pending` isn't holding a character.
1247if u32::from(pending.character) < self.composition_passthrough_bound
1248 || !pending.can_combine_backwards()
1249 {
1250// Won't combine backwards anyway.
1251self.decomposition.pending = Some(pending);
1252return Some(starter);
1253 }
1254// Consume what we peeked.
1255undecomposed_starter = pending;
1256// The following line is OK, because we're about to loop back
1257 // to `self.decomposition.decomposing_next(c);`, which will
1258 // restore the between-`next()`-calls invariant of `pending`
1259 // before this function returns.
1260attempt_composition = true;
1261continue;
1262 }
1263// End of input
1264return Some(starter);
1265 }
1266 }
1267}
/// Generates a composing normalization method (NFC-family) writing to a
/// `core::fmt::Write`-like sink, parameterized over the input slice type and
/// a caller-supplied fast-track block.
macro_rules! composing_normalize_to {
    ($(#[$meta:meta])*,
     $normalize_to:ident,
     $write:path,
     $slice:ty,
     $prolog:block,
     $always_valid_utf:literal,
     $as_slice:ident,
     $fast:block,
     $text:ident,
     $sink:ident,
     $composition:ident,
     $composition_passthrough_bound:ident,
     $undecomposed_starter:ident,
     $pending_slice:ident,
     $len_utf:ident,
    ) => {
        $(#[$meta])*
        pub fn $normalize_to<W: $write + ?Sized>(
            &self,
            $text: $slice,
            $sink: &mut W,
        ) -> core::fmt::Result {
            $prolog
            let mut $composition = self.normalize_iter($text.chars());
            debug_assert_eq!($composition.decomposition.ignorable_behavior, IgnorableBehavior::Unsupported);
            for cc in $composition.decomposition.buffer.drain(..) {
                $sink.write_char(cc.character())?;
            }

            // Try to get the compiler to hoist the bound to a register.
            let $composition_passthrough_bound = $composition.composition_passthrough_bound;
            'outer: loop {
                debug_assert_eq!($composition.decomposition.buffer_pos, 0);
                let mut $undecomposed_starter =
                    if let Some(pending) = $composition.decomposition.pending.take() {
                        pending
                    } else {
                        return Ok(());
                    };
                if u32::from($undecomposed_starter.character) < $composition_passthrough_bound ||
                    $undecomposed_starter.potential_passthrough()
                {
                    // We don't know if a `REPLACEMENT_CHARACTER` occurred in the slice or
                    // was returned in response to an error by the iterator. Assume the
                    // latter for correctness even though it pessimizes the former.
                    if $always_valid_utf || $undecomposed_starter.character != REPLACEMENT_CHARACTER {
                        let $pending_slice = &$text[$text.len() - $composition.decomposition.delegate.$as_slice().len() - $undecomposed_starter.character.$len_utf()..];
                        // The `$fast` block must either:
                        // 1. Return due to reaching EOF
                        // 2. Leave a starter with its trie value in `$undecomposed_starter`
                        //    and, if there is still more input, leave the next character
                        //    and its trie value in `$composition.decomposition.pending`.
                        $fast
                    }
                }
                // Fast track above, full algorithm below
                let mut starter = $composition
                    .decomposition
                    .decomposing_next($undecomposed_starter);
                'bufferloop: loop {
                    // We first loop by index to avoid moving the contents of `buffer`, but
                    // if there's a discontiguous match, we'll start modifying `buffer` instead.
                    loop {
                        let (character, ccc) = if let Some((character, ccc)) = $composition
                            .decomposition
                            .buffer
                            .get($composition.decomposition.buffer_pos)
                            .map(|c| c.character_and_ccc())
                        {
                            (character, ccc)
                        } else {
                            $composition.decomposition.buffer.clear();
                            $composition.decomposition.buffer_pos = 0;
                            break;
                        };
                        if let Some(composed) = $composition.compose(starter, character) {
                            starter = composed;
                            $composition.decomposition.buffer_pos += 1;
                            continue;
                        }
                        let mut most_recent_skipped_ccc = ccc;
                        if most_recent_skipped_ccc == CCC_NOT_REORDERED {
                            // We failed to compose a starter. Discontiguous match not allowed.
                            // Write the current `starter` we've been composing, make the unmatched
                            // starter in the buffer the new `starter` (we know it's been decomposed)
                            // and process the rest of the buffer with that as the starter.
                            $sink.write_char(starter)?;
                            starter = character;
                            $composition.decomposition.buffer_pos += 1;
                            continue 'bufferloop;
                        } else {
                            {
                                let _ = $composition
                                    .decomposition
                                    .buffer
                                    .drain(0..$composition.decomposition.buffer_pos);
                            }
                            $composition.decomposition.buffer_pos = 0;
                        }
                        let mut i = 1; // We have skipped one non-starter.
                        while let Some((character, ccc)) = $composition
                            .decomposition
                            .buffer
                            .get(i)
                            .map(|c| c.character_and_ccc())
                        {
                            if ccc == CCC_NOT_REORDERED {
                                // Discontiguous match not allowed.
                                $sink.write_char(starter)?;
                                for cc in $composition.decomposition.buffer.drain(..i) {
                                    $sink.write_char(cc.character())?;
                                }
                                starter = character;
                                {
                                    let removed = $composition.decomposition.buffer.remove(0);
                                    debug_assert_eq!(starter, removed.character());
                                }
                                debug_assert_eq!($composition.decomposition.buffer_pos, 0);
                                continue 'bufferloop;
                            }
                            debug_assert!(ccc >= most_recent_skipped_ccc);
                            if ccc != most_recent_skipped_ccc {
                                // Using the non-Hangul version as a micro-optimization, since
                                // we already rejected the case where `second` is a starter
                                // above, and conjoining jamo are starters.
                                if let Some(composed) =
                                    $composition.compose_non_hangul(starter, character)
                                {
                                    $composition.decomposition.buffer.remove(i);
                                    starter = composed;
                                    continue;
                                }
                            }
                            most_recent_skipped_ccc = ccc;
                            i += 1;
                        }
                        break;
                    }
                    debug_assert_eq!($composition.decomposition.buffer_pos, 0);

                    if !$composition.decomposition.buffer.is_empty() {
                        $sink.write_char(starter)?;
                        for cc in $composition.decomposition.buffer.drain(..) {
                            $sink.write_char(cc.character())?;
                        }
                        // We had non-empty buffer, so can't compose with upcoming.
                        continue 'outer;
                    }
                    // Now we need to check if composition with an upcoming starter is possible.
                    if $composition.decomposition.pending.is_some() {
                        // We know that `pending_starter` decomposes to start with a starter.
                        // Otherwise, it would have been moved to `composition.decomposition.buffer`
                        // by `composition.decomposing_next()`. We do this set lookup here in order
                        // to get an opportunity to go back to the fast track.
                        // Note that this check has to happen _after_ checking that `pending`
                        // holds a character, because this flag isn't defined to be meaningful
                        // when `pending` isn't holding a character.
                        let pending = $composition.decomposition.pending.as_ref().unwrap();
                        if u32::from(pending.character) < $composition.composition_passthrough_bound
                            || !pending.can_combine_backwards()
                        {
                            // Won't combine backwards anyway.
                            $sink.write_char(starter)?;
                            continue 'outer;
                        }
                        let pending_starter = $composition.decomposition.pending.take().unwrap();
                        let decomposed = $composition.decomposition.decomposing_next(pending_starter);
                        if let Some(composed) = $composition.compose(starter, decomposed) {
                            starter = composed;
                        } else {
                            $sink.write_char(starter)?;
                            starter = decomposed;
                        }
                        continue 'bufferloop;
                    }
                    // End of input
                    $sink.write_char(starter)?;
                    return Ok(());
                } // 'bufferloop
            }
        }
    };
}
/// Generates a decomposing normalization method (NFD-family) writing to a
/// `core::fmt::Write`-like sink, parameterized over the input slice type and
/// a caller-supplied fast-track block.
macro_rules! decomposing_normalize_to {
    ($(#[$meta:meta])*,
     $normalize_to:ident,
     $write:path,
     $slice:ty,
     $prolog:block,
     $as_slice:ident,
     $fast:block,
     $text:ident,
     $sink:ident,
     $decomposition:ident,
     $decomposition_passthrough_bound:ident,
     $undecomposed_starter:ident,
     $pending_slice:ident,
     $outer:lifetime, // loop labels use lifetime tokens
    ) => {
        $(#[$meta])*
        pub fn $normalize_to<W: $write + ?Sized>(
            &self,
            $text: $slice,
            $sink: &mut W,
        ) -> core::fmt::Result {
            $prolog

            let mut $decomposition = self.normalize_iter($text.chars());
            debug_assert_eq!($decomposition.ignorable_behavior, IgnorableBehavior::Unsupported);

            // Try to get the compiler to hoist the bound to a register.
            let $decomposition_passthrough_bound = $decomposition.decomposition_passthrough_bound;
            $outer: loop {
                for cc in $decomposition.buffer.drain(..) {
                    $sink.write_char(cc.character())?;
                }
                debug_assert_eq!($decomposition.buffer_pos, 0);
                let mut $undecomposed_starter = if let Some(pending) = $decomposition.pending.take() {
                    pending
                } else {
                    return Ok(());
                };
                if $undecomposed_starter.starter_and_decomposes_to_self() {
                    // Don't bother including `undecomposed_starter` in a contiguous buffer
                    // write: Just write it right away:
                    $sink.write_char($undecomposed_starter.character)?;

                    let $pending_slice = $decomposition.delegate.$as_slice();
                    $fast
                }
                let starter = $decomposition.decomposing_next($undecomposed_starter);
                $sink.write_char(starter)?;
            }
        }
    };
}
/// Generates the shared public surface (`normalize`, `split_normalized`,
/// `is_normalized`, and their UTF-8/UTF-16 variants) for a normalizer type
/// that provides `normalize_to` / `normalize_utf8_to` / `normalize_utf16_to`.
macro_rules! normalizer_methods {
    () => {
        /// Normalize a string slice into a `Cow<'a, str>`.
        pub fn normalize<'a>(&self, text: &'a str) -> Cow<'a, str> {
            let (head, tail) = self.split_normalized(text);
            if tail.is_empty() {
                return Cow::Borrowed(head);
            }
            let mut ret = String::new();
            ret.reserve(text.len());
            ret.push_str(head);
            let _ = self.normalize_to(tail, &mut ret);
            Cow::Owned(ret)
        }

        /// Split a string slice into maximum normalized prefix and unnormalized suffix
        /// such that the concatenation of the prefix and the normalization of the suffix
        /// is the normalization of the whole input.
        pub fn split_normalized<'a>(&self, text: &'a str) -> (&'a str, &'a str) {
            let up_to = self.is_normalized_up_to(text);
            text.split_at_checked(up_to).unwrap_or_else(|| {
                // Internal bug, not even GIGO, never supposed to happen
                debug_assert!(false);
                ("", text)
            })
        }

        /// Return the index a string slice is normalized up to.
        fn is_normalized_up_to(&self, text: &str) -> usize {
            let mut sink = IsNormalizedSinkStr::new(text);
            let _ = self.normalize_to(text, &mut sink);
            text.len() - sink.remaining_len()
        }

        /// Check whether a string slice is normalized.
        pub fn is_normalized(&self, text: &str) -> bool {
            self.is_normalized_up_to(text) == text.len()
        }

        /// Normalize a slice of potentially-invalid UTF-16 into a `Cow<'a, [u16]>`.
        ///
        /// Unpaired surrogates are mapped to the REPLACEMENT CHARACTER
        /// before normalizing.
        ///
        /// ✨ *Enabled with the `utf16_iter` Cargo feature.*
        #[cfg(feature = "utf16_iter")]
        pub fn normalize_utf16<'a>(&self, text: &'a [u16]) -> Cow<'a, [u16]> {
            let (head, tail) = self.split_normalized_utf16(text);
            if tail.is_empty() {
                return Cow::Borrowed(head);
            }
            let mut ret = alloc::vec::Vec::with_capacity(text.len());
            ret.extend_from_slice(head);
            let _ = self.normalize_utf16_to(tail, &mut ret);
            Cow::Owned(ret)
        }

        /// Split a slice of potentially-invalid UTF-16 into maximum normalized (and valid)
        /// prefix and unnormalized suffix such that the concatenation of the prefix and the
        /// normalization of the suffix is the normalization of the whole input.
        ///
        /// ✨ *Enabled with the `utf16_iter` Cargo feature.*
        #[cfg(feature = "utf16_iter")]
        pub fn split_normalized_utf16<'a>(&self, text: &'a [u16]) -> (&'a [u16], &'a [u16]) {
            let up_to = self.is_normalized_utf16_up_to(text);
            text.split_at_checked(up_to).unwrap_or_else(|| {
                // Internal bug, not even GIGO, never supposed to happen
                debug_assert!(false);
                (&[], text)
            })
        }

        /// Return the index a slice of potentially-invalid UTF-16 is normalized up to.
        ///
        /// ✨ *Enabled with the `utf16_iter` Cargo feature.*
        #[cfg(feature = "utf16_iter")]
        fn is_normalized_utf16_up_to(&self, text: &[u16]) -> usize {
            let mut sink = IsNormalizedSinkUtf16::new(text);
            let _ = self.normalize_utf16_to(text, &mut sink);
            text.len() - sink.remaining_len()
        }

        /// Checks whether a slice of potentially-invalid UTF-16 is normalized.
        ///
        /// Unpaired surrogates are treated as the REPLACEMENT CHARACTER.
        ///
        /// ✨ *Enabled with the `utf16_iter` Cargo feature.*
        #[cfg(feature = "utf16_iter")]
        pub fn is_normalized_utf16(&self, text: &[u16]) -> bool {
            self.is_normalized_utf16_up_to(text) == text.len()
        }

        /// Normalize a slice of potentially-invalid UTF-8 into a `Cow<'a, str>`.
        ///
        /// Ill-formed byte sequences are mapped to the REPLACEMENT CHARACTER
        /// according to the WHATWG Encoding Standard.
        ///
        /// ✨ *Enabled with the `utf8_iter` Cargo feature.*
        #[cfg(feature = "utf8_iter")]
        pub fn normalize_utf8<'a>(&self, text: &'a [u8]) -> Cow<'a, str> {
            let (head, tail) = self.split_normalized_utf8(text);
            if tail.is_empty() {
                return Cow::Borrowed(head);
            }
            let mut ret = String::new();
            ret.reserve(text.len());
            ret.push_str(head);
            let _ = self.normalize_utf8_to(tail, &mut ret);
            Cow::Owned(ret)
        }

        /// Split a slice of potentially-invalid UTF-8 into maximum normalized (and valid)
        /// prefix and unnormalized suffix such that the concatenation of the prefix and the
        /// normalization of the suffix is the normalization of the whole input.
        ///
        /// ✨ *Enabled with the `utf8_iter` Cargo feature.*
        #[cfg(feature = "utf8_iter")]
        pub fn split_normalized_utf8<'a>(&self, text: &'a [u8]) -> (&'a str, &'a [u8]) {
            let up_to = self.is_normalized_utf8_up_to(text);
            let (head, tail) = text.split_at_checked(up_to).unwrap_or_else(|| {
                // Internal bug, not even GIGO, never supposed to happen
                debug_assert!(false);
                (&[], text)
            });
            // SAFETY: The normalization check also checks for
            // UTF-8 well-formedness.
            (unsafe { core::str::from_utf8_unchecked(head) }, tail)
        }

        /// Return the index a slice of potentially-invalid UTF-8 is normalized up to
        ///
        /// ✨ *Enabled with the `utf8_iter` Cargo feature.*
        #[cfg(feature = "utf8_iter")]
        fn is_normalized_utf8_up_to(&self, text: &[u8]) -> usize {
            let mut sink = IsNormalizedSinkUtf8::new(text);
            let _ = self.normalize_utf8_to(text, &mut sink);
            text.len() - sink.remaining_len()
        }

        /// Check if a slice of potentially-invalid UTF-8 is normalized.
        ///
        /// Ill-formed byte sequences are mapped to the REPLACEMENT CHARACTER
        /// according to the WHATWG Encoding Standard before checking.
        ///
        /// ✨ *Enabled with the `utf8_iter` Cargo feature.*
        #[cfg(feature = "utf8_iter")]
        pub fn is_normalized_utf8(&self, text: &[u8]) -> bool {
            self.is_normalized_utf8_up_to(text) == text.len()
        }
    };
}
16591660/// Borrowed version of a normalizer for performing decomposing normalization.
1661#[derive(#[automatically_derived]
impl<'a> ::core::fmt::Debug for DecomposingNormalizerBorrowed<'a> {
#[inline]
fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result {
::core::fmt::Formatter::debug_struct_field5_finish(f,
"DecomposingNormalizerBorrowed", "decompositions",
&self.decompositions, "tables", &self.tables,
"supplementary_tables", &self.supplementary_tables,
"decomposition_passthrough_bound",
&self.decomposition_passthrough_bound,
"composition_passthrough_bound",
&&self.composition_passthrough_bound)
}
}Debug)]
1662pub struct DecomposingNormalizerBorrowed<'a> {
1663 decompositions: &'a DecompositionData<'a>,
1664 tables: &'a DecompositionTables<'a>,
1665 supplementary_tables: Option<&'a DecompositionTables<'a>>,
1666 decomposition_passthrough_bound: u8, // never above 0xC0
1667composition_passthrough_bound: u16, // never above 0x0300
1668}
16691670impl DecomposingNormalizerBorrowed<'static> {
1671/// Cheaply converts a [`DecomposingNormalizerBorrowed<'static>`] into a [`DecomposingNormalizer`].
1672 ///
1673 /// Note: Due to branching and indirection, using [`DecomposingNormalizer`] might inhibit some
1674 /// compile-time optimizations that are possible with [`DecomposingNormalizerBorrowed`].
1675pub const fn static_to_owned(self) -> DecomposingNormalizer {
1676DecomposingNormalizer {
1677 decompositions: DataPayload::from_static_ref(self.decompositions),
1678 tables: DataPayload::from_static_ref(self.tables),
1679 supplementary_tables: if let Some(s) = self.supplementary_tables {
1680// `map` not available in const context
1681Some(DataPayload::from_static_ref(s))
1682 } else {
1683None1684 },
1685 decomposition_passthrough_bound: self.decomposition_passthrough_bound,
1686 composition_passthrough_bound: self.composition_passthrough_bound,
1687 }
1688 }
16891690/// NFD constructor using compiled data.
1691 ///
1692 /// ✨ *Enabled with the `compiled_data` Cargo feature.*
1693 ///
1694 /// [📚 Help choosing a constructor](icu_provider::constructors)
1695#[cfg(feature = "compiled_data")]
1696pub const fn new_nfd() -> Self {
1697const _: () = if !(provider::Baked::SINGLETON_NORMALIZER_NFD_TABLES_V1.scalars16.const_len()
+
provider::Baked::SINGLETON_NORMALIZER_NFD_TABLES_V1.scalars24.const_len()
<= 0xFFF) {
{ ::core::panicking::panic_fmt(format_args!("future extension")); }
}assert!(
1698 provider::Baked::SINGLETON_NORMALIZER_NFD_TABLES_V1
1699 .scalars16
1700 .const_len()
1701 + provider::Baked::SINGLETON_NORMALIZER_NFD_TABLES_V1
1702 .scalars24
1703 .const_len()
1704 <= 0xFFF,
1705"future extension"
1706);
17071708DecomposingNormalizerBorrowed {
1709 decompositions: provider::Baked::SINGLETON_NORMALIZER_NFD_DATA_V1,
1710 tables: provider::Baked::SINGLETON_NORMALIZER_NFD_TABLES_V1,
1711 supplementary_tables: None,
1712 decomposition_passthrough_bound: 0xC0,
1713 composition_passthrough_bound: 0x0300,
1714 }
1715 }
17161717/// NFKD constructor using compiled data.
1718 ///
1719 /// ✨ *Enabled with the `compiled_data` Cargo feature.*
1720 ///
1721 /// [📚 Help choosing a constructor](icu_provider::constructors)
1722#[cfg(feature = "compiled_data")]
1723pub const fn new_nfkd() -> Self {
1724const _: () = if !(provider::Baked::SINGLETON_NORMALIZER_NFD_TABLES_V1.scalars16.const_len()
+
provider::Baked::SINGLETON_NORMALIZER_NFD_TABLES_V1.scalars24.const_len()
+
provider::Baked::SINGLETON_NORMALIZER_NFKD_TABLES_V1.scalars16.const_len()
+
provider::Baked::SINGLETON_NORMALIZER_NFKD_TABLES_V1.scalars24.const_len()
<= 0xFFF) {
{ ::core::panicking::panic_fmt(format_args!("future extension")); }
}assert!(
1725 provider::Baked::SINGLETON_NORMALIZER_NFD_TABLES_V1
1726 .scalars16
1727 .const_len()
1728 + provider::Baked::SINGLETON_NORMALIZER_NFD_TABLES_V1
1729 .scalars24
1730 .const_len()
1731 + provider::Baked::SINGLETON_NORMALIZER_NFKD_TABLES_V1
1732 .scalars16
1733 .const_len()
1734 + provider::Baked::SINGLETON_NORMALIZER_NFKD_TABLES_V1
1735 .scalars24
1736 .const_len()
1737 <= 0xFFF,
1738"future extension"
1739);
17401741const _: () = if !(provider::Baked::SINGLETON_NORMALIZER_NFKD_DATA_V1.passthrough_cap <=
0x0300) {
{ ::core::panicking::panic_fmt(format_args!("invalid")); }
}assert!(
1742 provider::Baked::SINGLETON_NORMALIZER_NFKD_DATA_V1.passthrough_cap <= 0x0300,
1743"invalid"
1744);
17451746let decomposition_capped =
1747if provider::Baked::SINGLETON_NORMALIZER_NFKD_DATA_V1.passthrough_cap < 0xC0 {
1748 provider::Baked::SINGLETON_NORMALIZER_NFKD_DATA_V1.passthrough_cap
1749 } else {
17500xC0
1751};
1752let composition_capped =
1753if provider::Baked::SINGLETON_NORMALIZER_NFKD_DATA_V1.passthrough_cap < 0x0300 {
1754 provider::Baked::SINGLETON_NORMALIZER_NFKD_DATA_V1.passthrough_cap
1755 } else {
17560x0300
1757};
17581759DecomposingNormalizerBorrowed {
1760 decompositions: provider::Baked::SINGLETON_NORMALIZER_NFKD_DATA_V1,
1761 tables: provider::Baked::SINGLETON_NORMALIZER_NFD_TABLES_V1,
1762 supplementary_tables: Some(provider::Baked::SINGLETON_NORMALIZER_NFKD_TABLES_V1),
1763 decomposition_passthrough_bound: decomposition_cappedas u8,
1764 composition_passthrough_bound: composition_capped,
1765 }
1766 }
17671768#[cfg(feature = "compiled_data")]
1769pub(crate) const fn new_uts46_decomposed() -> Self {
1770const _: () = if !(provider::Baked::SINGLETON_NORMALIZER_NFD_TABLES_V1.scalars16.const_len()
+
provider::Baked::SINGLETON_NORMALIZER_NFD_TABLES_V1.scalars24.const_len()
+
provider::Baked::SINGLETON_NORMALIZER_NFKD_TABLES_V1.scalars16.const_len()
+
provider::Baked::SINGLETON_NORMALIZER_NFKD_TABLES_V1.scalars24.const_len()
<= 0xFFF) {
{ ::core::panicking::panic_fmt(format_args!("future extension")); }
}assert!(
1771 provider::Baked::SINGLETON_NORMALIZER_NFD_TABLES_V1
1772 .scalars16
1773 .const_len()
1774 + provider::Baked::SINGLETON_NORMALIZER_NFD_TABLES_V1
1775 .scalars24
1776 .const_len()
1777 + provider::Baked::SINGLETON_NORMALIZER_NFKD_TABLES_V1
1778 .scalars16
1779 .const_len()
1780 + provider::Baked::SINGLETON_NORMALIZER_NFKD_TABLES_V1
1781 .scalars24
1782 .const_len()
1783 <= 0xFFF,
1784"future extension"
1785);
17861787const _: () = if !(provider::Baked::SINGLETON_NORMALIZER_UTS46_DATA_V1.passthrough_cap <=
0x0300) {
{ ::core::panicking::panic_fmt(format_args!("invalid")); }
}assert!(
1788 provider::Baked::SINGLETON_NORMALIZER_UTS46_DATA_V1.passthrough_cap <= 0x0300,
1789"invalid"
1790);
17911792let decomposition_capped =
1793if provider::Baked::SINGLETON_NORMALIZER_UTS46_DATA_V1.passthrough_cap < 0xC0 {
1794 provider::Baked::SINGLETON_NORMALIZER_UTS46_DATA_V1.passthrough_cap
1795 } else {
17960xC0
1797};
1798let composition_capped =
1799if provider::Baked::SINGLETON_NORMALIZER_UTS46_DATA_V1.passthrough_cap < 0x0300 {
1800 provider::Baked::SINGLETON_NORMALIZER_UTS46_DATA_V1.passthrough_cap
1801 } else {
18020x0300
1803};
18041805DecomposingNormalizerBorrowed {
1806 decompositions: provider::Baked::SINGLETON_NORMALIZER_UTS46_DATA_V1,
1807 tables: provider::Baked::SINGLETON_NORMALIZER_NFD_TABLES_V1,
1808 supplementary_tables: Some(provider::Baked::SINGLETON_NORMALIZER_NFKD_TABLES_V1),
1809 decomposition_passthrough_bound: decomposition_cappedas u8,
1810 composition_passthrough_bound: composition_capped,
1811 }
1812 }
1813}
18141815impl<'data> DecomposingNormalizerBorrowed<'data> {
1816/// NFD constructor using already-loaded data.
1817 ///
1818 /// This constructor is intended for use by collations.
1819 ///
1820 /// [📚 Help choosing a constructor](icu_provider::constructors)
1821#[doc(hidden)]
1822pub fn new_with_data(
1823 decompositions: &'data DecompositionData<'data>,
1824 tables: &'data DecompositionTables<'data>,
1825 ) -> Self {
1826Self {
1827decompositions,
1828tables,
1829 supplementary_tables: None,
1830 decomposition_passthrough_bound: 0xC0,
1831 composition_passthrough_bound: 0x0300,
1832 }
1833 }
    /// Wraps a delegate iterator into a decomposing iterator
    /// adapter by using the data already held by this normalizer.
    pub fn normalize_iter<I: Iterator<Item = char>>(&self, iter: I) -> Decomposition<'data, I> {
        Decomposition::new_with_supplements(
            iter,
            self.decompositions,
            self.tables,
            self.supplementary_tables,
            self.decomposition_passthrough_bound,
            // NOTE(review): default-ignorable handling is disabled for this
            // public adapter; other `IgnorableBehavior` variants are
            // presumably for internal (UTS 46) use — confirm against
            // `Decomposition::new_with_supplements`.
            IgnorableBehavior::Unsupported,
        )
    }
18471848self
&'a str
text
Cow<'a, str>
let (head, tail) = self.split_normalized(text);
if tail.is_empty() { return Cow::Borrowed(head); }
let mut ret = String::new();
ret.reserve(text.len());
ret.push_str(head);
let _ = self.normalize_to(tail, &mut ret);
Cow::Owned(ret);
&Self
self
&'a str
text
(&'a str, &'a str)
let up_to = self.is_normalized_up_to(text);
text.split_at_checked(up_to).unwrap_or_else(||
{
if true {
if !false {
::core::panicking::panic("assertion failed: false")
};
};
("", text)
});
&Self
self
&str
text
usize
let mut sink = IsNormalizedSinkStr::new(text);
let _ = self.normalize_to(text, &mut sink);
text.len() - sink.remaining_len();
&Self
self
&str
text
bool
self.is_normalized_up_to(text) == text.len();normalizer_methods!();
18491850self
text
&mut W
sink
core::fmt::Result
{}
let mut decomposition = self.normalize_iter(text.chars());
if true {
match (&decomposition.ignorable_behavior, &IgnorableBehavior::Unsupported)
{
(left_val, right_val) => {
if !(*left_val == *right_val) {
let kind = ::core::panicking::AssertKind::Eq;
::core::panicking::assert_failed(kind, &*left_val,
&*right_val, ::core::option::Option::None);
}
}
};
};
let decomposition_passthrough_bound =
decomposition.decomposition_passthrough_bound;
'outer: loop {
for cc in decomposition.buffer.drain(..) {
sink.write_char(cc.character())?;
}
if true {
match (&decomposition.buffer_pos, &0) {
(left_val, right_val) => {
if !(*left_val == *right_val) {
let kind = ::core::panicking::AssertKind::Eq;
::core::panicking::assert_failed(kind, &*left_val,
&*right_val, ::core::option::Option::None);
}
}
};
};
let mut undecomposed_starter =
if let Some(pending) = decomposition.pending.take() {
pending
} else { return Ok(()); };
if undecomposed_starter.starter_and_decomposes_to_self() {
sink.write_char(undecomposed_starter.character)?;
let pending_slice = decomposition.delegate.as_str();
{
let decomposition_passthrough_byte_bound =
if decomposition_passthrough_bound == 0xC0 {
0xC3u8
} else { decomposition_passthrough_bound.min(0x80) as u8 };
#[expect(clippy::unwrap_used)]
'fast: loop {
let mut code_unit_iter =
decomposition.delegate.as_str().as_bytes().iter();
'fastest: loop {
if let Some(&upcoming_byte) = code_unit_iter.next() {
if upcoming_byte < decomposition_passthrough_byte_bound {
continue 'fastest;
}
decomposition.delegate =
pending_slice[pending_slice.len() -
code_unit_iter.as_slice().len() - 1..].chars();
break 'fastest;
}
sink.write_str(pending_slice)?;
return Ok(());
}
let upcoming = decomposition.delegate.next().unwrap();
let upcoming_with_trie_value =
decomposition.attach_trie_value(upcoming);
if upcoming_with_trie_value.starter_and_decomposes_to_self() {
continue 'fast;
}
let consumed_so_far_slice =
&pending_slice[..pending_slice.len() -
decomposition.delegate.as_str().len() -
upcoming.len_utf8()];
sink.write_str(consumed_so_far_slice)?;
if decomposition_starts_with_non_starter(upcoming_with_trie_value.trie_val)
{
decomposition.pending = Some(upcoming_with_trie_value);
decomposition.gather_and_sort_combining(0);
continue 'outer;
}
undecomposed_starter = upcoming_with_trie_value;
if true {
if !decomposition.pending.is_none() {
::core::panicking::panic("assertion failed: decomposition.pending.is_none()")
};
};
break 'fast;
}
}
}
let starter = decomposition.decomposing_next(undecomposed_starter);
sink.write_char(starter)?;
}decomposing_normalize_to!(
1851/// Normalize a string slice into a `Write` sink.
1852,
1853 normalize_to,
1854core::fmt::Write,
1855&str,
1856 {
1857 },
1858 as_str,
1859 {
1860let decomposition_passthrough_byte_bound = if decomposition_passthrough_bound == 0xC0 {
18610xC3u8
1862} else {
1863 decomposition_passthrough_bound.min(0x80) as u8
1864 };
1865// The attribute belongs on an inner statement, but Rust doesn't allow it there.
1866#[expect(clippy::unwrap_used)]
1867'fast: loop {
1868let mut code_unit_iter = decomposition.delegate.as_str().as_bytes().iter();
1869'fastest: loop {
1870if let Some(&upcoming_byte) = code_unit_iter.next() {
1871if upcoming_byte < decomposition_passthrough_byte_bound {
1872// Fast-track succeeded!
1873continue 'fastest;
1874 }
1875// This deliberately isn't panic-free, since the code pattern
1876 // that was OK for the composing counterpart regressed
1877 // English and French performance if done here, too.
1878decomposition.delegate = pending_slice[pending_slice.len() - code_unit_iter.as_slice().len() - 1..].chars();
1879break 'fastest;
1880 }
1881// End of stream
1882sink.write_str(pending_slice)?;
1883return Ok(());
1884 }
18851886// `unwrap()` OK, because the slice is valid UTF-8 and we know there
1887 // is an upcoming byte.
1888let upcoming = decomposition.delegate.next().unwrap();
1889let upcoming_with_trie_value = decomposition.attach_trie_value(upcoming);
1890if upcoming_with_trie_value.starter_and_decomposes_to_self() {
1891continue 'fast;
1892 }
1893let consumed_so_far_slice = &pending_slice[..pending_slice.len()
1894 - decomposition.delegate.as_str().len()
1895 - upcoming.len_utf8()];
1896 sink.write_str(consumed_so_far_slice)?;
18971898// Now let's figure out if we got a starter or a non-starter.
1899if decomposition_starts_with_non_starter(
1900 upcoming_with_trie_value.trie_val,
1901 ) {
1902// Let this trie value to be reprocessed in case it is
1903 // one of the rare decomposing ones.
1904decomposition.pending = Some(upcoming_with_trie_value);
1905 decomposition.gather_and_sort_combining(0);
1906continue 'outer;
1907 }
1908 undecomposed_starter = upcoming_with_trie_value;
1909debug_assert!(decomposition.pending.is_none());
1910break 'fast;
1911 }
1912 },
1913 text,
1914 sink,
1915 decomposition,
1916 decomposition_passthrough_bound,
1917 undecomposed_starter,
1918 pending_slice,
1919'outer,
1920 );
    decomposing_normalize_to!(
        /// Normalize a slice of potentially-invalid UTF-8 into a `Write` sink.
        ///
        /// Ill-formed byte sequences are mapped to the REPLACEMENT CHARACTER
        /// according to the WHATWG Encoding Standard.
        ///
        /// ✨ *Enabled with the `utf8_iter` Cargo feature.*
        #[cfg(feature = "utf8_iter")]
        ,
        normalize_utf8_to,
        core::fmt::Write,
        &[u8],
        {
        },
        as_slice,
        {
            // Cap at 0x80 so the byte-wise fast path only passes through ASCII bytes.
            let decomposition_passthrough_byte_bound = decomposition_passthrough_bound.min(0x80) as u8;
            'fast: loop {
                let mut code_unit_iter = decomposition.delegate.as_slice().iter();
                'fastest: loop {
                    if let Some(&upcoming_byte) = code_unit_iter.next() {
                        if upcoming_byte < decomposition_passthrough_byte_bound {
                            // Fast-track succeeded!
                            continue 'fastest;
                        }
                        break 'fastest;
                    }
                    // End of stream
                    // NOTE(review): `from_utf8_unchecked` relies on everything
                    // previously consumed into `pending_slice` having decoded
                    // as valid UTF-8 — TODO confirm this invariant.
                    sink.write_str(unsafe { core::str::from_utf8_unchecked(pending_slice) })?;
                    return Ok(());
                }
                #[expect(clippy::indexing_slicing)]
                {decomposition.delegate = pending_slice[pending_slice.len() - code_unit_iter.as_slice().len() - 1..].chars();}

                // `unwrap()` OK, because the slice is valid UTF-8 and we know there
                // is an upcoming byte.
                #[expect(clippy::unwrap_used)]
                let upcoming = decomposition.delegate.next().unwrap();
                let upcoming_with_trie_value = decomposition.attach_trie_value(upcoming);
                if upcoming_with_trie_value.starter_and_decomposes_to_self_except_replacement() {
                    // Note: The trie value of the REPLACEMENT CHARACTER is
                    // intentionally formatted to fail the
                    // `starter_and_decomposes_to_self` test even though it
                    // really is a starter that decomposes to self. This
                    // Allows moving the branch on REPLACEMENT CHARACTER
                    // below this `continue`.
                    continue 'fast;
                }

                // TODO: Annotate as unlikely.
                if upcoming == REPLACEMENT_CHARACTER {
                    // We might have an error, so fall out of the fast path.

                    // Since the U+FFFD might signify an error, we can't
                    // assume `upcoming.len_utf8()` for the backoff length.
                    #[expect(clippy::indexing_slicing)]
                    let mut consumed_so_far = pending_slice[..pending_slice.len() - decomposition.delegate.as_slice().len()].chars();
                    let back = consumed_so_far.next_back();
                    debug_assert_eq!(back, Some(REPLACEMENT_CHARACTER));
                    let consumed_so_far_slice = consumed_so_far.as_slice();
                    sink.write_str(unsafe { core::str::from_utf8_unchecked(consumed_so_far_slice) } )?;

                    // We could call `gather_and_sort_combining` here and
                    // `continue 'outer`, but this should be better for code
                    // size.
                    undecomposed_starter = upcoming_with_trie_value;
                    debug_assert!(decomposition.pending.is_none());
                    break 'fast;
                }

                #[expect(clippy::indexing_slicing)]
                let consumed_so_far_slice = &pending_slice[..pending_slice.len()
                    - decomposition.delegate.as_slice().len()
                    - upcoming.len_utf8()];
                sink.write_str(unsafe { core::str::from_utf8_unchecked(consumed_so_far_slice) } )?;

                // Now let's figure out if we got a starter or a non-starter.
                if decomposition_starts_with_non_starter(
                    upcoming_with_trie_value.trie_val,
                ) {
                    // Let this trie value to be reprocessed in case it is
                    // one of the rare decomposing ones.
                    decomposition.pending = Some(upcoming_with_trie_value);
                    decomposition.gather_and_sort_combining(0);
                    continue 'outer;
                }
                undecomposed_starter = upcoming_with_trie_value;
                debug_assert!(decomposition.pending.is_none());
                break 'fast;
            }
        },
        text,
        sink,
        decomposition,
        decomposition_passthrough_bound,
        undecomposed_starter,
        pending_slice,
        'outer,
    );
    decomposing_normalize_to!(
        /// Normalize a slice of potentially-invalid UTF-16 into a `Write16` sink.
        ///
        /// Unpaired surrogates are mapped to the REPLACEMENT CHARACTER
        /// before normalizing.
        ///
        /// ✨ *Enabled with the `utf16_iter` Cargo feature.*
        #[cfg(feature = "utf16_iter")]
        ,
        normalize_utf16_to,
        write16::Write16,
        &[u16],
        {
            sink.size_hint(text.len())?;
        },
        as_slice,
        {
            // This loop is only broken out of as goto forward and only as release-build recovery from
            // detecting an internal bug without panic. (In debug builds, internal bugs panic instead.)
            #[expect(clippy::never_loop)]
            'fastwrap: loop {
                // Commented out `code_unit_iter` and used `ptr` and `end` to
                // work around https://github.com/rust-lang/rust/issues/144684 .
                //
                // let mut code_unit_iter = decomposition.delegate.as_slice().iter();
                let delegate_as_slice = decomposition.delegate.as_slice();
                let mut ptr: *const u16 = delegate_as_slice.as_ptr();
                // SAFETY: materializing a pointer immediately past the end of an
                // allocation is OK.
                let end: *const u16 = unsafe { ptr.add(delegate_as_slice.len()) };
                'fast: loop {
                    // if let Some(&upcoming_code_unit) = code_unit_iter.next() {
                    if ptr != end {
                        // SAFETY: We just checked that `ptr` has not reached `end`.
                        // `ptr` always advances by one, and we always have a check
                        // per advancement.
                        let upcoming_code_unit = unsafe { *ptr };
                        // SAFETY: Since `ptr` hadn't reached `end`, yet, advancing
                        // by one points to the same allocation or to immediately
                        // after, which is OK.
                        ptr = unsafe { ptr.add(1) };

                        let mut upcoming32 = u32::from(upcoming_code_unit);
                        // The performance of what logically is supposed to be this
                        // branch is _incredibly_ brittle and what LLVM ends up doing
                        // that affects the performance of what's logically about this
                        // decision can swing to double/halve the throughput for Basic
                        // Latin in ways that are completely unintuitive. Basically _any_
                        // change to _any_ code that participates in how LLVM sees the
                        // code around here can make the perf fall over. In seems that
                        // manually annotating this branch as likely has worse effects
                        // on non-Basic-Latin input that the case where LLVM just happens to
                        // do the right thing.
                        //
                        // What happens with this branch may depend on what sink type
                        // this code is monomorphized over.
                        //
                        // What a terrible sink of developer time!
                        if upcoming32 < decomposition_passthrough_bound {
                            continue 'fast;
                        }
                        // We might be doing a trie lookup by surrogate. Surrogates get
                        // a decomposition to U+FFFD.
                        let mut trie_value = decomposition.trie.get16(upcoming_code_unit);
                        if starter_and_decomposes_to_self_impl(trie_value) {
                            continue 'fast;
                        }
                        // We might now be looking at a surrogate.
                        // The loop is only broken out of as goto forward
                        #[expect(clippy::never_loop)]
                        'surrogateloop: loop {
                            // LLVM's optimizations are incredibly brittle for the code _above_,
                            // and using `likely` _below_ without using it _above_ helps!
                            // What a massive sink of developer time!
                            // Seriously, the effect of these annotations is massively
                            // unintuitive. Measure everything!
                            // Notably, the `if likely(...)` formulation optimizes differently
                            // than just putting `cold_path()` on the `else` path!
                            let surrogate_base = upcoming32.wrapping_sub(0xD800);
                            if likely(surrogate_base > (0xDFFF - 0xD800)) {
                                // Not surrogate
                                break 'surrogateloop;
                            }
                            if likely(surrogate_base <= (0xDBFF - 0xD800)) {
                                // High (leading) surrogate: try to pair with the next unit.
                                // let iter_backup = code_unit_iter.clone();
                                // if let Some(&low) = code_unit_iter.next() {
                                if ptr != end {
                                    // SAFETY: We just checked that `ptr` has not reached `end`.
                                    // `ptr` always advances by one, and we always have a check
                                    // per advancement.
                                    let low = unsafe { *ptr };
                                    if likely(in_inclusive_range16(low, 0xDC00, 0xDFFF)) {
                                        // SAFETY: Since `ptr` hadn't reached `end`, yet, advancing
                                        // by one points to the same allocation or to immediately
                                        // after, which is OK.
                                        ptr = unsafe { ptr.add(1) };

                                        upcoming32 = (upcoming32 << 10) + u32::from(low)
                                            - (((0xD800u32 << 10) - 0x10000u32) + 0xDC00u32);
                                        // Successfully-paired surrogate. Read from the trie again.
                                        trie_value = {
                                            // Semantically, this bit of conditional compilation makes no sense.
                                            // The purpose is to keep LLVM seeing the untyped trie case the way
                                            // it did before so as not to regress the performance of the untyped
                                            // case due to unintuitive optimizer effects. If you care about the
                                            // perf of the untyped trie case and have better ideas, please try
                                            // something better.
                                            #[cfg(not(icu4x_unstable_fast_trie_only))]
                                            {decomposition.trie.get32(upcoming32)}
                                            #[cfg(icu4x_unstable_fast_trie_only)]
                                            {decomposition.trie.get32_supplementary(upcoming32)}
                                        };
                                        if likely(starter_and_decomposes_to_self_impl(trie_value)) {
                                            continue 'fast;
                                        }
                                        break 'surrogateloop;
                                    // } else {
                                    //     code_unit_iter = iter_backup;
                                    }
                                }
                            }
                            // unpaired surrogate
                            upcoming32 = 0xFFFD; // Safe value for `char::from_u32_unchecked` and matches later potential error check.
                            // trie_value already holds a decomposition to U+FFFD.
                            break 'surrogateloop;
                        }

                        // SAFETY: `upcoming32` is a valid scalar value here: either a
                        // non-surrogate BMP value, a value computed from a paired
                        // surrogate, or the U+FFFD substituted just above.
                        let upcoming = unsafe { char::from_u32_unchecked(upcoming32) };
                        let upcoming_with_trie_value = CharacterAndTrieValue::new(upcoming, trie_value);

                        let Some(consumed_so_far_slice) = pending_slice.get(..pending_slice.len() -
                            // code_unit_iter.as_slice().len()
                            // SAFETY: `ptr` and `end` have been derived from the same allocation
                            // and `ptr` is never greater than `end`.
                            unsafe { end.offset_from(ptr) as usize }
                            - upcoming.len_utf16()) else {
                            // If we ever come here, it's a bug, but let's avoid panic code paths in release builds.
                            debug_assert!(false);
                            // Throw away the results of the fast path.
                            break 'fastwrap;
                        };
                        sink.write_slice(consumed_so_far_slice)?;

                        if decomposition_starts_with_non_starter(
                            upcoming_with_trie_value.trie_val,
                        ) {
                            // Sync with main iterator
                            // decomposition.delegate = code_unit_iter.as_slice().chars();
                            // SAFETY: `ptr` and `end` have been derived from the same allocation
                            // and `ptr` is never greater than `end`.
                            decomposition.delegate = unsafe { core::slice::from_raw_parts(ptr, end.offset_from(ptr) as usize) }.chars();
                            // Let this trie value to be reprocessed in case it is
                            // one of the rare decomposing ones.
                            decomposition.pending = Some(upcoming_with_trie_value);
                            decomposition.gather_and_sort_combining(0);
                            continue 'outer;
                        }
                        undecomposed_starter = upcoming_with_trie_value;
                        debug_assert!(decomposition.pending.is_none());
                        break 'fast;
                    }
                    // End of stream
                    sink.write_slice(pending_slice)?;
                    return Ok(());
                }
                // Sync the main iterator
                // decomposition.delegate = code_unit_iter.as_slice().chars();
                // SAFETY: `ptr` and `end` have been derived from the same allocation
                // and `ptr` is never greater than `end`.
                decomposition.delegate = unsafe { core::slice::from_raw_parts(ptr, end.offset_from(ptr) as usize) }.chars();
                break 'fastwrap;
            }
        },
        text,
        sink,
        decomposition,
        decomposition_passthrough_bound,
        undecomposed_starter,
        pending_slice,
        'outer,
    );
2204}
22052206/// A normalizer for performing decomposing normalization.
2207#[derive(#[automatically_derived]
impl ::core::fmt::Debug for DecomposingNormalizer {
#[inline]
fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result {
::core::fmt::Formatter::debug_struct_field5_finish(f,
"DecomposingNormalizer", "decompositions", &self.decompositions,
"tables", &self.tables, "supplementary_tables",
&self.supplementary_tables, "decomposition_passthrough_bound",
&self.decomposition_passthrough_bound,
"composition_passthrough_bound",
&&self.composition_passthrough_bound)
}
}Debug)]
2208pub struct DecomposingNormalizer {
2209 decompositions: DataPayload<NormalizerNfdDataV1>,
2210 tables: DataPayload<NormalizerNfdTablesV1>,
2211 supplementary_tables: Option<DataPayload<NormalizerNfkdTablesV1>>,
2212 decomposition_passthrough_bound: u8, // never above 0xC0
2213composition_passthrough_bound: u16, // never above 0x0300
2214}
22152216impl DecomposingNormalizer {
2217/// Constructs a borrowed version of this type for more efficient querying.
2218pub fn as_borrowed(&self) -> DecomposingNormalizerBorrowed<'_> {
2219DecomposingNormalizerBorrowed {
2220 decompositions: self.decompositions.get(),
2221 tables: self.tables.get(),
2222 supplementary_tables: self.supplementary_tables.as_ref().map(|s| s.get()),
2223 decomposition_passthrough_bound: self.decomposition_passthrough_bound,
2224 composition_passthrough_bound: self.composition_passthrough_bound,
2225 }
2226 }
    /// NFD constructor using compiled data.
    ///
    /// ✨ *Enabled with the `compiled_data` Cargo feature.*
    ///
    /// [📚 Help choosing a constructor](icu_provider::constructors)
    #[cfg(feature = "compiled_data")]
    pub const fn new_nfd() -> DecomposingNormalizerBorrowed<'static> {
        // Compiled data is 'static, so the borrowed type is returned directly;
        // no owned payloads need to be created here.
        DecomposingNormalizerBorrowed::new_nfd()
    }
    // Generates the buffer-provider constructor variants for NFD
    // (`try_new_nfd_with_buffer_provider`); `new_nfd` is written out
    // manually above, hence `skip`.
    icu_provider::gen_buffer_data_constructors!(
        () -> error: DataError,
        functions: [
            new_nfd: skip,
            try_new_nfd_with_buffer_provider,
            try_new_nfd_unstable,
            Self,
        ]
    );
22472248#[doc = "A version of [`Self::new_nfd`] that uses custom data provided by a [`DataProvider`].\n\n[\u{1f4da} Help choosing a constructor](icu_provider::constructors)\n\n<div class=\"stab unstable\">\u{26a0}\u{fe0f} The bounds on <tt>provider</tt> may change over time, including in SemVer minor releases.</div>"icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new_nfd)]
2249pub fn try_new_nfd_unstable<D>(provider: &D) -> Result<Self, DataError>
2250where
2251D: DataProvider<NormalizerNfdDataV1> + DataProvider<NormalizerNfdTablesV1> + ?Sized,
2252 {
2253let decompositions: DataPayload<NormalizerNfdDataV1> =
2254provider.load(Default::default())?.payload;
2255let tables: DataPayload<NormalizerNfdTablesV1> = provider.load(Default::default())?.payload;
22562257if tables.get().scalars16.len() + tables.get().scalars24.len() > 0xFFF {
2258// The data is from a future where there exists a normalization flavor whose
2259 // complex decompositions take more than 0xFFF but fewer than 0x1FFF code points
2260 // of space. If a good use case from such a decomposition flavor arises, we can
2261 // dynamically change the bit masks so that the length mask becomes 0x1FFF instead
2262 // of 0xFFF and the all-non-starters mask becomes 0 instead of 0x1000. However,
2263 // since for now the masks are hard-coded, error out.
2264return Err(
2265DataError::custom("future extension").with_marker(NormalizerNfdTablesV1::INFO)
2266 );
2267 }
22682269let cap = decompositions.get().passthrough_cap;
2270if cap > 0x0300 {
2271return Err(DataError::custom("invalid").with_marker(NormalizerNfdDataV1::INFO));
2272 }
2273let decomposition_capped = cap.min(0xC0);
2274let composition_capped = cap.min(0x0300);
22752276Ok(DecomposingNormalizer {
2277decompositions,
2278tables,
2279 supplementary_tables: None,
2280 decomposition_passthrough_bound: decomposition_cappedas u8,
2281 composition_passthrough_bound: composition_capped,
2282 })
2283 }
    // Generates the buffer-provider constructor variants for NFKD
    // (`try_new_nfkd_with_buffer_provider`); `new_nfkd` is written out
    // manually below, hence `skip`.
    icu_provider::gen_buffer_data_constructors!(
        () -> error: DataError,
        functions: [
            new_nfkd: skip,
            try_new_nfkd_with_buffer_provider,
            try_new_nfkd_unstable,
            Self,
        ]
    );
    /// NFKD constructor using compiled data.
    ///
    /// ✨ *Enabled with the `compiled_data` Cargo feature.*
    ///
    /// [📚 Help choosing a constructor](icu_provider::constructors)
    #[cfg(feature = "compiled_data")]
    pub const fn new_nfkd() -> DecomposingNormalizerBorrowed<'static> {
        // Compiled data is 'static, so the borrowed type is returned directly;
        // no owned payloads need to be created here.
        DecomposingNormalizerBorrowed::new_nfkd()
    }
23042305#[doc = "A version of [`Self::new_nfkd`] that uses custom data provided by a [`DataProvider`].\n\n[\u{1f4da} Help choosing a constructor](icu_provider::constructors)\n\n<div class=\"stab unstable\">\u{26a0}\u{fe0f} The bounds on <tt>provider</tt> may change over time, including in SemVer minor releases.</div>"icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new_nfkd)]
2306pub fn try_new_nfkd_unstable<D>(provider: &D) -> Result<Self, DataError>
2307where
2308D: DataProvider<NormalizerNfkdDataV1>
2309 + DataProvider<NormalizerNfdTablesV1>
2310 + DataProvider<NormalizerNfkdTablesV1>
2311 + ?Sized,
2312 {
2313let decompositions: DataPayload<NormalizerNfkdDataV1> =
2314provider.load(Default::default())?.payload;
2315let tables: DataPayload<NormalizerNfdTablesV1> = provider.load(Default::default())?.payload;
2316let supplementary_tables: DataPayload<NormalizerNfkdTablesV1> =
2317provider.load(Default::default())?.payload;
23182319if tables.get().scalars16.len()
2320 + tables.get().scalars24.len()
2321 + supplementary_tables.get().scalars16.len()
2322 + supplementary_tables.get().scalars24.len()
2323 > 0xFFF
2324{
2325// The data is from a future where there exists a normalization flavor whose
2326 // complex decompositions take more than 0xFFF but fewer than 0x1FFF code points
2327 // of space. If a good use case from such a decomposition flavor arises, we can
2328 // dynamically change the bit masks so that the length mask becomes 0x1FFF instead
2329 // of 0xFFF and the all-non-starters mask becomes 0 instead of 0x1000. However,
2330 // since for now the masks are hard-coded, error out.
2331return Err(
2332DataError::custom("future extension").with_marker(NormalizerNfdTablesV1::INFO)
2333 );
2334 }
23352336let cap = decompositions.get().passthrough_cap;
2337if cap > 0x0300 {
2338return Err(DataError::custom("invalid").with_marker(NormalizerNfkdDataV1::INFO));
2339 }
2340let decomposition_capped = cap.min(0xC0);
2341let composition_capped = cap.min(0x0300);
23422343Ok(DecomposingNormalizer {
2344 decompositions: decompositions.cast(),
2345tables,
2346 supplementary_tables: Some(supplementary_tables),
2347 decomposition_passthrough_bound: decomposition_cappedas u8,
2348 composition_passthrough_bound: composition_capped,
2349 })
2350 }
23512352/// UTS 46 decomposed constructor (testing only)
2353 ///
2354 /// This is a special building block normalization for IDNA. It is the decomposed counterpart of
2355 /// ICU4C's UTS 46 normalization with two exceptions: characters that UTS 46 disallows and
2356 /// ICU4C maps to U+FFFD and characters that UTS 46 maps to the empty string normalize as in
2357 /// NFD in this normalization. In both cases, the previous UTS 46 processing before using
2358 /// normalization is expected to deal with these characters. Making the disallowed characters
2359 /// behave like this is beneficial to data size, and this normalizer implementation cannot
2360 /// deal with a character normalizing to the empty string, which doesn't happen in NFD or
2361 /// NFKD as of Unicode 14.
2362 ///
2363 /// Warning: In this normalization, U+0345 COMBINING GREEK YPOGEGRAMMENI exhibits a behavior
2364 /// that no character in Unicode exhibits in NFD, NFKD, NFC, or NFKC: Case folding turns
2365 /// U+0345 from a reordered character into a non-reordered character before reordering happens.
2366 /// Therefore, the output of this normalization may differ for different inputs that are
2367 /// canonically equivalent with each other if they differ by how U+0345 is ordered relative
2368 /// to other reorderable characters.
2369pub(crate) fn try_new_uts46_decomposed_unstable<D>(provider: &D) -> Result<Self, DataError>
2370where
2371D: DataProvider<NormalizerUts46DataV1>
2372 + DataProvider<NormalizerNfdTablesV1>
2373 + DataProvider<NormalizerNfkdTablesV1>
2374// UTS 46 tables merged into CompatibilityDecompositionTablesV1
2375+ ?Sized,
2376 {
2377let decompositions: DataPayload<NormalizerUts46DataV1> =
2378provider.load(Default::default())?.payload;
2379let tables: DataPayload<NormalizerNfdTablesV1> = provider.load(Default::default())?.payload;
2380let supplementary_tables: DataPayload<NormalizerNfkdTablesV1> =
2381provider.load(Default::default())?.payload;
23822383if tables.get().scalars16.len()
2384 + tables.get().scalars24.len()
2385 + supplementary_tables.get().scalars16.len()
2386 + supplementary_tables.get().scalars24.len()
2387 > 0xFFF
2388{
2389// The data is from a future where there exists a normalization flavor whose
2390 // complex decompositions take more than 0xFFF but fewer than 0x1FFF code points
2391 // of space. If a good use case from such a decomposition flavor arises, we can
2392 // dynamically change the bit masks so that the length mask becomes 0x1FFF instead
2393 // of 0xFFF and the all-non-starters mask becomes 0 instead of 0x1000. However,
2394 // since for now the masks are hard-coded, error out.
2395return Err(
2396DataError::custom("future extension").with_marker(NormalizerNfdTablesV1::INFO)
2397 );
2398 }
23992400let cap = decompositions.get().passthrough_cap;
2401if cap > 0x0300 {
2402return Err(DataError::custom("invalid").with_marker(NormalizerUts46DataV1::INFO));
2403 }
2404let decomposition_capped = cap.min(0xC0);
2405let composition_capped = cap.min(0x0300);
24062407Ok(DecomposingNormalizer {
2408 decompositions: decompositions.cast(),
2409tables,
2410 supplementary_tables: Some(supplementary_tables),
2411 decomposition_passthrough_bound: decomposition_cappedas u8,
2412 composition_passthrough_bound: composition_capped,
2413 })
2414 }
2415}
24162417/// Borrowed version of a normalizer for performing composing normalization.
2418#[derive(#[automatically_derived]
impl<'a> ::core::fmt::Debug for ComposingNormalizerBorrowed<'a> {
#[inline]
fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result {
::core::fmt::Formatter::debug_struct_field2_finish(f,
"ComposingNormalizerBorrowed", "decomposing_normalizer",
&self.decomposing_normalizer, "canonical_compositions",
&&self.canonical_compositions)
}
}Debug)]
2419pub struct ComposingNormalizerBorrowed<'a> {
2420 decomposing_normalizer: DecomposingNormalizerBorrowed<'a>,
2421 canonical_compositions: &'a CanonicalCompositions<'a>,
2422}
impl ComposingNormalizerBorrowed<'static> {
    /// Cheaply converts a [`ComposingNormalizerBorrowed<'static>`] into a [`ComposingNormalizer`].
    ///
    /// Note: Due to branching and indirection, using [`ComposingNormalizer`] might inhibit some
    /// compile-time optimizations that are possible with [`ComposingNormalizerBorrowed`].
    pub const fn static_to_owned(self) -> ComposingNormalizer {
        ComposingNormalizer {
            decomposing_normalizer: self.decomposing_normalizer.static_to_owned(),
            // Wraps the `'static` reference in a payload without copying the data.
            canonical_compositions: DataPayload::from_static_ref(self.canonical_compositions),
        }
    }

    /// NFC constructor using compiled data.
    ///
    /// ✨ *Enabled with the `compiled_data` Cargo feature.*
    ///
    /// [📚 Help choosing a constructor](icu_provider::constructors)
    #[cfg(feature = "compiled_data")]
    pub const fn new_nfc() -> Self {
        ComposingNormalizerBorrowed {
            decomposing_normalizer: DecomposingNormalizerBorrowed::new_nfd(),
            canonical_compositions: provider::Baked::SINGLETON_NORMALIZER_NFC_V1,
        }
    }

    /// NFKC constructor using compiled data.
    ///
    /// ✨ *Enabled with the `compiled_data` Cargo feature.*
    ///
    /// [📚 Help choosing a constructor](icu_provider::constructors)
    #[cfg(feature = "compiled_data")]
    pub const fn new_nfkc() -> Self {
        ComposingNormalizerBorrowed {
            // NFKC = compatibility decomposition followed by canonical
            // composition, so only the decomposing half differs from NFC;
            // the composition data singleton is shared with NFC.
            decomposing_normalizer: DecomposingNormalizerBorrowed::new_nfkd(),
            canonical_compositions: provider::Baked::SINGLETON_NORMALIZER_NFC_V1,
        }
    }

    /// This is a special building block normalization for IDNA that implements parts of the Map
    /// step and the following Normalize step.
    ///
    /// Warning: In this normalization, U+0345 COMBINING GREEK YPOGEGRAMMENI exhibits a behavior
    /// that no character in Unicode exhibits in NFD, NFKD, NFC, or NFKC: Case folding turns
    /// U+0345 from a reordered character into a non-reordered character before reordering happens.
    /// Therefore, the output of this normalization may differ for different inputs that are
    /// canonically equivalents with each other if they differ by how U+0345 is ordered relative
    /// to other reorderable characters.
    #[cfg(feature = "compiled_data")]
    pub(crate) const fn new_uts46() -> Self {
        ComposingNormalizerBorrowed {
            decomposing_normalizer: DecomposingNormalizerBorrowed::new_uts46_decomposed(),
            // The composition side is the same canonical composition data as
            // NFC; only the (UTS 46 mapped) decomposition side differs.
            canonical_compositions: provider::Baked::SINGLETON_NORMALIZER_NFC_V1,
        }
    }
}
24792480impl<'data> ComposingNormalizerBorrowed<'data> {
2481/// Wraps a delegate iterator into a composing iterator
2482 /// adapter by using the data already held by this normalizer.
2483pub fn normalize_iter<I: Iterator<Item = char>>(&self, iter: I) -> Composition<'data, I> {
2484self.normalize_iter_private(iter, IgnorableBehavior::Unsupported)
2485 }
24862487fn normalize_iter_private<I: Iterator<Item = char>>(
2488&self,
2489 iter: I,
2490 ignorable_behavior: IgnorableBehavior,
2491 ) -> Composition<'data, I> {
2492Composition::new(
2493Decomposition::new_with_supplements(
2494iter,
2495self.decomposing_normalizer.decompositions,
2496self.decomposing_normalizer.tables,
2497self.decomposing_normalizer.supplementary_tables,
2498self.decomposing_normalizer.decomposition_passthrough_bound,
2499ignorable_behavior,
2500 ),
2501self.canonical_compositions.canonical_compositions.clone(),
2502self.decomposing_normalizer.composition_passthrough_bound,
2503 )
2504 }
25052506self
&'a str
text
Cow<'a, str>
let (head, tail) = self.split_normalized(text);
if tail.is_empty() { return Cow::Borrowed(head); }
let mut ret = String::new();
ret.reserve(text.len());
ret.push_str(head);
let _ = self.normalize_to(tail, &mut ret);
Cow::Owned(ret);
&Self
self
&'a str
text
(&'a str, &'a str)
let up_to = self.is_normalized_up_to(text);
text.split_at_checked(up_to).unwrap_or_else(||
{
if true {
if !false {
::core::panicking::panic("assertion failed: false")
};
};
("", text)
});
&Self
self
&str
text
usize
let mut sink = IsNormalizedSinkStr::new(text);
let _ = self.normalize_to(text, &mut sink);
text.len() - sink.remaining_len();
&Self
self
&str
text
bool
self.is_normalized_up_to(text) == text.len();normalizer_methods!();
25072508self
text
&mut W
sink
core::fmt::Result
{}
let mut composition = self.normalize_iter(text.chars());
if true {
match (&composition.decomposition.ignorable_behavior,
&IgnorableBehavior::Unsupported) {
(left_val, right_val) => {
if !(*left_val == *right_val) {
let kind = ::core::panicking::AssertKind::Eq;
::core::panicking::assert_failed(kind, &*left_val,
&*right_val, ::core::option::Option::None);
}
}
};
};
for cc in composition.decomposition.buffer.drain(..) {
sink.write_char(cc.character())?;
}
let composition_passthrough_bound = composition.composition_passthrough_bound;
'outer: loop {
if true {
match (&composition.decomposition.buffer_pos, &0) {
(left_val, right_val) => {
if !(*left_val == *right_val) {
let kind = ::core::panicking::AssertKind::Eq;
::core::panicking::assert_failed(kind, &*left_val,
&*right_val, ::core::option::Option::None);
}
}
};
};
let mut undecomposed_starter =
if let Some(pending) = composition.decomposition.pending.take() {
pending
} else { return Ok(()); };
if u32::from(undecomposed_starter.character) <
composition_passthrough_bound ||
undecomposed_starter.potential_passthrough() {
if true || undecomposed_starter.character != REPLACEMENT_CHARACTER {
let pending_slice =
&text[text.len() -
composition.decomposition.delegate.as_str().len() -
undecomposed_starter.character.len_utf8()..];
{
let composition_passthrough_byte_bound =
if composition_passthrough_bound == 0x300 {
0xCCu8
} else { composition_passthrough_bound.min(0x80) as u8 };
#[expect(clippy::unwrap_used)]
'fast: loop {
let mut code_unit_iter =
composition.decomposition.delegate.as_str().as_bytes().iter();
'fastest: loop {
if let Some(&upcoming_byte) = code_unit_iter.next() {
if upcoming_byte < composition_passthrough_byte_bound {
continue 'fastest;
}
let Some(remaining_slice) =
pending_slice.get(pending_slice.len() -
code_unit_iter.as_slice().len() -
1..) else {
if true {
if !false {
::core::panicking::panic("assertion failed: false")
};
};
break 'fastest;
};
composition.decomposition.delegate =
remaining_slice.chars();
break 'fastest;
}
sink.write_str(pending_slice)?;
return Ok(());
}
let upcoming =
composition.decomposition.delegate.next().unwrap();
let upcoming_with_trie_value =
composition.decomposition.attach_trie_value(upcoming);
if upcoming_with_trie_value.potential_passthrough_and_cannot_combine_backwards()
{
continue 'fast;
}
composition.decomposition.pending =
Some(upcoming_with_trie_value);
let mut consumed_so_far =
pending_slice[..pending_slice.len() -
composition.decomposition.delegate.as_str().len() -
upcoming.len_utf8()].chars();
undecomposed_starter =
composition.decomposition.attach_trie_value(consumed_so_far.next_back().unwrap());
let consumed_so_far_slice = consumed_so_far.as_str();
sink.write_str(consumed_so_far_slice)?;
break 'fast;
}
}
}
}
let mut starter =
composition.decomposition.decomposing_next(undecomposed_starter);
'bufferloop: loop {
loop {
let (character, ccc) =
if let Some((character, ccc)) =
composition.decomposition.buffer.get(composition.decomposition.buffer_pos).map(|c|
c.character_and_ccc()) {
(character, ccc)
} else {
composition.decomposition.buffer.clear();
composition.decomposition.buffer_pos = 0;
break;
};
if let Some(composed) = composition.compose(starter, character) {
starter = composed;
composition.decomposition.buffer_pos += 1;
continue;
}
let mut most_recent_skipped_ccc = ccc;
if most_recent_skipped_ccc == CCC_NOT_REORDERED {
sink.write_char(starter)?;
starter = character;
composition.decomposition.buffer_pos += 1;
continue 'bufferloop;
} else {
{
let _ =
composition.decomposition.buffer.drain(0..composition.decomposition.buffer_pos);
}
composition.decomposition.buffer_pos = 0;
}
let mut i = 1;
while let Some((character, ccc)) =
composition.decomposition.buffer.get(i).map(|c|
c.character_and_ccc()) {
if ccc == CCC_NOT_REORDERED {
sink.write_char(starter)?;
for cc in composition.decomposition.buffer.drain(..i) {
sink.write_char(cc.character())?;
}
starter = character;
{
let removed = composition.decomposition.buffer.remove(0);
if true {
match (&starter, &removed.character()) {
(left_val, right_val) => {
if !(*left_val == *right_val) {
let kind = ::core::panicking::AssertKind::Eq;
::core::panicking::assert_failed(kind, &*left_val,
&*right_val, ::core::option::Option::None);
}
}
};
};
}
if true {
match (&composition.decomposition.buffer_pos, &0) {
(left_val, right_val) => {
if !(*left_val == *right_val) {
let kind = ::core::panicking::AssertKind::Eq;
::core::panicking::assert_failed(kind, &*left_val,
&*right_val, ::core::option::Option::None);
}
}
};
};
continue 'bufferloop;
}
if true {
if !(ccc >= most_recent_skipped_ccc) {
::core::panicking::panic("assertion failed: ccc >= most_recent_skipped_ccc")
};
};
if ccc != most_recent_skipped_ccc {
if let Some(composed) =
composition.compose_non_hangul(starter, character) {
composition.decomposition.buffer.remove(i);
starter = composed;
continue;
}
}
most_recent_skipped_ccc = ccc;
i += 1;
}
break;
}
if true {
match (&composition.decomposition.buffer_pos, &0) {
(left_val, right_val) => {
if !(*left_val == *right_val) {
let kind = ::core::panicking::AssertKind::Eq;
::core::panicking::assert_failed(kind, &*left_val,
&*right_val, ::core::option::Option::None);
}
}
};
};
if !composition.decomposition.buffer.is_empty() {
sink.write_char(starter)?;
for cc in composition.decomposition.buffer.drain(..) {
sink.write_char(cc.character())?;
}
continue 'outer;
}
if composition.decomposition.pending.is_some() {
let pending = composition.decomposition.pending.as_ref().unwrap();
if u32::from(pending.character) <
composition.composition_passthrough_bound ||
!pending.can_combine_backwards() {
sink.write_char(starter)?;
continue 'outer;
}
let pending_starter =
composition.decomposition.pending.take().unwrap();
let decomposed =
composition.decomposition.decomposing_next(pending_starter);
if let Some(composed) = composition.compose(starter, decomposed) {
starter = composed;
} else { sink.write_char(starter)?; starter = decomposed; }
continue 'bufferloop;
}
sink.write_char(starter)?;
return Ok(());
}
}composing_normalize_to!(
2509/// Normalize a string slice into a `Write` sink.
2510,
2511 normalize_to,
2512core::fmt::Write,
2513&str,
2514 {},
2515true,
2516 as_str,
2517 {
2518// Let's hope LICM hoists this outside `'outer`.
2519let composition_passthrough_byte_bound = if composition_passthrough_bound == 0x300 {
25200xCCu8
2521} else {
2522// We can make this fancy if a normalization other than NFC where looking at
2523 // non-ASCII lead bytes is worthwhile is ever introduced.
2524composition_passthrough_bound.min(0x80) as u8
2525 };
2526// Attributes have to be on blocks, so hoisting all the way here.
2527#[expect(clippy::unwrap_used)]
2528'fast: loop {
2529let mut code_unit_iter = composition.decomposition.delegate.as_str().as_bytes().iter();
2530'fastest: loop {
2531if let Some(&upcoming_byte) = code_unit_iter.next() {
2532if upcoming_byte < composition_passthrough_byte_bound {
2533// Fast-track succeeded!
2534continue 'fastest;
2535 }
2536let Some(remaining_slice) = pending_slice.get(pending_slice.len() - code_unit_iter.as_slice().len() - 1..) else {
2537// If we ever come here, it's an internal bug. Let's avoid panic code paths in release builds.
2538debug_assert!(false);
2539// Throw away the fastest-path result in case of an internal bug.
2540break 'fastest;
2541 };
2542 composition.decomposition.delegate = remaining_slice.chars();
2543break 'fastest;
2544 }
2545// End of stream
2546sink.write_str(pending_slice)?;
2547return Ok(());
2548 }
2549// `unwrap()` OK, because the slice is valid UTF-8 and we know there
2550 // is an upcoming byte.
2551let upcoming = composition.decomposition.delegate.next().unwrap();
2552let upcoming_with_trie_value = composition.decomposition.attach_trie_value(upcoming);
2553if upcoming_with_trie_value.potential_passthrough_and_cannot_combine_backwards() {
2554// Can't combine backwards, hence a plain (non-backwards-combining)
2555 // starter albeit past `composition_passthrough_bound`
25562557 // Fast-track succeeded!
2558continue 'fast;
2559 }
2560// We need to fall off the fast path.
2561composition.decomposition.pending = Some(upcoming_with_trie_value);
25622563// slicing and unwrap OK, because we've just evidently read enough previously.
2564let mut consumed_so_far = pending_slice[..pending_slice.len() - composition.decomposition.delegate.as_str().len() - upcoming.len_utf8()].chars();
2565// `unwrap` OK, because we've previously manage to read the previous character
2566undecomposed_starter = composition.decomposition.attach_trie_value(consumed_so_far.next_back().unwrap());
2567let consumed_so_far_slice = consumed_so_far.as_str();
2568 sink.write_str(consumed_so_far_slice)?;
2569break 'fast;
2570 }
2571 },
2572 text,
2573 sink,
2574 composition,
2575 composition_passthrough_bound,
2576 undecomposed_starter,
2577 pending_slice,
2578 len_utf8,
2579 );
25802581composing_normalize_to!(
2582/// Normalize a slice of potentially-invalid UTF-8 into a `Write` sink.
2583 ///
2584 /// Ill-formed byte sequences are mapped to the REPLACEMENT CHARACTER
2585 /// according to the WHATWG Encoding Standard.
2586 ///
2587 /// ✨ *Enabled with the `utf8_iter` Cargo feature.*
2588#[cfg(feature = "utf8_iter")]
2589,
2590 normalize_utf8_to,
2591 core::fmt::Write,
2592&[u8],
2593 {},
2594false,
2595 as_slice,
2596 {
2597'fast: loop {
2598if let Some(upcoming) = composition.decomposition.delegate.next() {
2599if u32::from(upcoming) < composition_passthrough_bound {
2600// Fast-track succeeded!
2601continue 'fast;
2602 }
2603// TODO: Be statically aware of fast/small trie.
2604let upcoming_with_trie_value = composition.decomposition.attach_trie_value(upcoming);
2605if upcoming_with_trie_value.potential_passthrough_and_cannot_combine_backwards() {
2606// Note: The trie value of the REPLACEMENT CHARACTER is
2607 // intentionally formatted to fail the
2608 // `potential_passthrough_and_cannot_combine_backwards`
2609 // test even though it really is a starter that decomposes
2610 // to self and cannot combine backwards. This
2611 // Allows moving the branch on REPLACEMENT CHARACTER
2612 // below this `continue`.
2613continue 'fast;
2614 }
2615// We need to fall off the fast path.
26162617 // TODO(#2006): Annotate as unlikely
2618if upcoming == REPLACEMENT_CHARACTER {
2619// Can't tell if this is an error or a literal U+FFFD in
2620 // the input. Assuming the former to be sure.
26212622 // Since the U+FFFD might signify an error, we can't
2623 // assume `upcoming.len_utf8()` for the backoff length.
2624#[expect(clippy::indexing_slicing)]
2625let mut consumed_so_far = pending_slice[..pending_slice.len() - composition.decomposition.delegate.as_slice().len()].chars();
2626let back = consumed_so_far.next_back();
2627debug_assert_eq!(back, Some(REPLACEMENT_CHARACTER));
2628let consumed_so_far_slice = consumed_so_far.as_slice();
2629 sink.write_str(unsafe { core::str::from_utf8_unchecked(consumed_so_far_slice) })?;
2630 undecomposed_starter = CharacterAndTrieValue::new(REPLACEMENT_CHARACTER, 0);
2631 composition.decomposition.pending = None;
2632break 'fast;
2633 }
26342635 composition.decomposition.pending = Some(upcoming_with_trie_value);
2636// slicing and unwrap OK, because we've just evidently read enough previously.
2637 // `unwrap` OK, because we've previously manage to read the previous character
2638#[expect(clippy::indexing_slicing)]
2639let mut consumed_so_far = pending_slice[..pending_slice.len() - composition.decomposition.delegate.as_slice().len() - upcoming.len_utf8()].chars();
2640#[expect(clippy::unwrap_used)]
2641{
2642// TODO: If the previous character was below the passthrough bound,
2643 // we really need to read from the trie. Otherwise, we could maintain
2644 // the most-recent trie value. Need to measure what's more expensive:
2645 // Remembering the trie value on each iteration or re-reading the
2646 // last one after the fast-track run.
2647undecomposed_starter = composition.decomposition.attach_trie_value(consumed_so_far.next_back().unwrap());
2648 }
2649let consumed_so_far_slice = consumed_so_far.as_slice();
2650 sink.write_str(unsafe { core::str::from_utf8_unchecked(consumed_so_far_slice)})?;
2651break 'fast;
2652 }
2653// End of stream
2654sink.write_str(unsafe { core::str::from_utf8_unchecked(pending_slice) })?;
2655return Ok(());
2656 }
2657 },
2658 text,
2659 sink,
2660 composition,
2661 composition_passthrough_bound,
2662 undecomposed_starter,
2663 pending_slice,
2664 len_utf8,
2665 );
26662667composing_normalize_to!(
2668/// Normalize a slice of potentially-invalid UTF-16 into a `Write16` sink.
2669 ///
2670 /// Unpaired surrogates are mapped to the REPLACEMENT CHARACTER
2671 /// before normalizing.
2672 ///
2673 /// ✨ *Enabled with the `utf16_iter` Cargo feature.*
2674#[cfg(feature = "utf16_iter")]
2675,
2676 normalize_utf16_to,
2677 write16::Write16,
2678&[u16],
2679 {
2680 sink.size_hint(text.len())?;
2681 },
2682false,
2683 as_slice,
2684 {
2685// This loop is only broken out of as goto forward and only as release-build recovery from
2686 // detecting an internal bug without panic. (In debug builds, internal bugs panic instead.)
2687#[expect(clippy::never_loop)]
2688'fastwrap: loop {
2689// Commented out `code_unit_iter` and used `ptr` and `end` to
2690 // work around https://github.com/rust-lang/rust/issues/144684 .
2691 //
2692 // let mut code_unit_iter = composition.decomposition.delegate.as_slice().iter();
2693let delegate_as_slice = composition.decomposition.delegate.as_slice();
2694let mut ptr: *const u16 = delegate_as_slice.as_ptr();
2695// SAFETY: materializing a pointer immediately past the end of an
2696 // allocation is OK.
2697let end: *const u16 = unsafe { ptr.add(delegate_as_slice.len()) };
26982699'fast: loop {
2700// if let Some(&upcoming_code_unit) = code_unit_iter.next() {
2701if ptr != end {
2702// SAFETY: We just checked that `ptr` has not reached `end`.
2703 // `ptr` always advances by one, and we always have a check
2704 // per advancement.
2705let upcoming_code_unit = unsafe { *ptr };
2706// SAFETY: Since `ptr` hadn't reached `end`, yet, advancing
2707 // by one points to the same allocation or to immediately
2708 // after, which is OK.
2709ptr = unsafe { ptr.add(1) };
27102711let mut upcoming32 = u32::from(upcoming_code_unit); // may be surrogate
2712 // The performance of what logically is supposed to be this
2713 // branch is somewhat brittle and what LLVM ends up doing
2714 // that affects the performance of what's logically about this
2715 // decision can swing to double/halve the throughput for Basic
2716 // Latin in ways that are completely unintuitive. Basically _any_
2717 // change to _any_ code that participates in how LLVM sees the
2718 // code around here can make the perf fall over. In seems that
2719 // manually annotating this branch as likely has worse effects
2720 // on non-Basic-Latin input that the case where LLVM just happens to
2721 // do the right thing.
2722 //
2723 // What happens with this branch may depend on what sink type
2724 // this code is monomorphized over.
2725 //
2726 // What a terrible sink of developer time!
2727if upcoming32 < composition_passthrough_bound {
2728// No need for surrogate or U+FFFD check, because
2729 // `composition_passthrough_bound` cannot be higher than
2730 // U+0300.
2731 // Fast-track succeeded!
2732continue 'fast;
2733 }
2734// We might be doing a trie lookup by surrogate. Surrogates get
2735 // a decomposition to U+FFFD.
2736let mut trie_value = composition.decomposition.trie.get16(upcoming_code_unit);
2737if potential_passthrough_and_cannot_combine_backwards_impl(trie_value) {
2738// Can't combine backwards, hence a plain (non-backwards-combining)
2739 // starter albeit past `composition_passthrough_bound`
27402741 // Fast-track succeeded!
2742continue 'fast;
2743 }
27442745// We might now be looking at a surrogate.
2746 // The loop is only broken out of as goto forward
2747#[expect(clippy::never_loop)]
2748'surrogateloop: loop {
2749// The `likely` annotations _below_ exist to make the code _above_
2750 // go faster!
2751let surrogate_base = upcoming32.wrapping_sub(0xD800);
2752if likely(surrogate_base > (0xDFFF - 0xD800)) {
2753// Not surrogate
2754break 'surrogateloop;
2755 }
2756if likely(surrogate_base <= (0xDBFF - 0xD800)) {
2757// let iter_backup = code_unit_iter.clone();
2758 // if let Some(&low) = code_unit_iter.next() {
2759if ptr != end {
2760// SAFETY: We just checked that `ptr` has not reached `end`.
2761 // `ptr` always advances by one, and we always have a check
2762 // per advancement.
2763let low = unsafe { *ptr };
2764if likely(in_inclusive_range16(low, 0xDC00, 0xDFFF)) {
2765// SAFETY: Since `ptr` hadn't reached `end`, yet, advancing
2766 // by one points to the same allocation or to immediately
2767 // after, which is OK.
2768ptr = unsafe { ptr.add(1) };
27692770 upcoming32 = (upcoming32 << 10) + u32::from(low)
2771 - (((0xD800u32 << 10) - 0x10000u32) + 0xDC00u32);
2772// Successfully-paired surrogate. Read from the trie again.
2773trie_value = {
2774// Semantically, this bit of conditional compilation makes no sense.
2775 // The purpose is to keep LLVM seeing the untyped trie case the way
2776 // it did before so as not to regress the performance of the untyped
2777 // case due to unintuitive optimizer effects. If you care about the
2778 // perf of the untyped trie case and have better ideas, please try
2779 // something better.
2780#[cfg(not(icu4x_unstable_fast_trie_only))]
2781{composition.decomposition.trie.get32(upcoming32)}
2782#[cfg(icu4x_unstable_fast_trie_only)]
2783{composition.decomposition.trie.get32_supplementary(upcoming32)}
2784 };
2785if likely(potential_passthrough_and_cannot_combine_backwards_impl(trie_value)) {
2786// Fast-track succeeded!
2787continue 'fast;
2788 }
2789break 'surrogateloop;
2790// } else {
2791 // code_unit_iter = iter_backup;
2792}
2793 }
2794 }
2795// unpaired surrogate
2796upcoming32 = 0xFFFD; // Safe value for `char::from_u32_unchecked` and matches later potential error check.
2797 // trie_value already holds a decomposition to U+FFFD.
2798debug_assert_eq!(trie_value, NON_ROUND_TRIP_MARKER | BACKWARD_COMBINING_MARKER | 0xFFFD);
2799break 'surrogateloop;
2800 }
28012802// SAFETY: upcoming32 can no longer be a surrogate.
2803let upcoming = unsafe { char::from_u32_unchecked(upcoming32) };
2804let upcoming_with_trie_value = CharacterAndTrieValue::new(upcoming, trie_value);
2805// We need to fall off the fast path.
2806composition.decomposition.pending = Some(upcoming_with_trie_value);
2807let Some(consumed_so_far_slice) = pending_slice.get(..pending_slice.len() -
2808// code_unit_iter.as_slice().len()
2809 // SAFETY: `ptr` and `end` have been derived from the same allocation
2810 // and `ptr` is never greater than `end`.
2811unsafe { end.offset_from(ptr) as usize }
2812 - upcoming.len_utf16()) else {
2813// If we ever come here, it's a bug, but let's avoid panic code paths in release builds.
2814debug_assert!(false);
2815// Throw away the results of the fast path.
2816break 'fastwrap;
2817 };
2818let mut consumed_so_far = consumed_so_far_slice.chars();
2819let Some(c_from_back) = consumed_so_far.next_back() else {
2820// If we ever come here, it's a bug, but let's avoid panic code paths in release builds.
2821debug_assert!(false);
2822// Throw away the results of the fast path.
2823break 'fastwrap;
2824 };
2825// TODO: If the previous character was below the passthrough bound,
2826 // we really need to read from the trie. Otherwise, we could maintain
2827 // the most-recent trie value. Need to measure what's more expensive:
2828 // Remembering the trie value on each iteration or re-reading the
2829 // last one after the fast-track run.
2830undecomposed_starter = composition.decomposition.attach_trie_value(c_from_back);
2831 sink.write_slice(consumed_so_far.as_slice())?;
2832break 'fast;
2833 }
2834// End of stream
2835sink.write_slice(pending_slice)?;
2836return Ok(());
2837 }
2838// Sync the main iterator
2839 // composition.decomposition.delegate = code_unit_iter.as_slice().chars();
2840 // SAFETY: `ptr` and `end` have been derive from the same allocation
2841 // and `ptr` is never greater than `end`.
2842composition.decomposition.delegate = unsafe { core::slice::from_raw_parts(ptr, end.offset_from(ptr) as usize) }.chars();
2843break 'fastwrap;
2844 }
2845 },
2846 text,
2847 sink,
2848 composition,
2849 composition_passthrough_bound,
2850 undecomposed_starter,
2851 pending_slice,
2852 len_utf16,
2853 );
2854}
28552856/// A normalizer for performing composing normalization.
2857#[derive(#[automatically_derived]
impl ::core::fmt::Debug for ComposingNormalizer {
#[inline]
fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result {
::core::fmt::Formatter::debug_struct_field2_finish(f,
"ComposingNormalizer", "decomposing_normalizer",
&self.decomposing_normalizer, "canonical_compositions",
&&self.canonical_compositions)
}
}Debug)]
2858pub struct ComposingNormalizer {
2859 decomposing_normalizer: DecomposingNormalizer,
2860 canonical_compositions: DataPayload<NormalizerNfcV1>,
2861}
28622863impl ComposingNormalizer {
2864/// Constructs a borrowed version of this type for more efficient querying.
2865pub fn as_borrowed(&self) -> ComposingNormalizerBorrowed<'_> {
2866ComposingNormalizerBorrowed {
2867 decomposing_normalizer: self.decomposing_normalizer.as_borrowed(),
2868 canonical_compositions: self.canonical_compositions.get(),
2869 }
2870 }
28712872/// NFC constructor using compiled data.
2873 ///
2874 /// ✨ *Enabled with the `compiled_data` Cargo feature.*
2875 ///
2876 /// [📚 Help choosing a constructor](icu_provider::constructors)
2877#[cfg(feature = "compiled_data")]
2878pub const fn new_nfc() -> ComposingNormalizerBorrowed<'static> {
2879ComposingNormalizerBorrowed::new_nfc()
2880 }
28812882icu_provider::gen_buffer_data_constructors!(
2883 () -> error: DataError,
2884 functions: [
2885 new_nfc: skip,
2886 try_new_nfc_with_buffer_provider,
2887 try_new_nfc_unstable,
2888Self,
2889 ]
2890 );
28912892#[doc = "A version of [`Self::new_nfc`] that uses custom data provided by a [`DataProvider`].\n\n[\u{1f4da} Help choosing a constructor](icu_provider::constructors)\n\n<div class=\"stab unstable\">\u{26a0}\u{fe0f} The bounds on <tt>provider</tt> may change over time, including in SemVer minor releases.</div>"icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new_nfc)]
2893pub fn try_new_nfc_unstable<D>(provider: &D) -> Result<Self, DataError>
2894where
2895D: DataProvider<NormalizerNfdDataV1>
2896 + DataProvider<NormalizerNfdTablesV1>
2897 + DataProvider<NormalizerNfcV1>
2898 + ?Sized,
2899 {
2900let decomposing_normalizer = DecomposingNormalizer::try_new_nfd_unstable(provider)?;
29012902let canonical_compositions: DataPayload<NormalizerNfcV1> =
2903provider.load(Default::default())?.payload;
29042905Ok(ComposingNormalizer {
2906decomposing_normalizer,
2907canonical_compositions,
2908 })
2909 }
29102911/// NFKC constructor using compiled data.
2912 ///
2913 /// ✨ *Enabled with the `compiled_data` Cargo feature.*
2914 ///
2915 /// [📚 Help choosing a constructor](icu_provider::constructors)
2916#[cfg(feature = "compiled_data")]
2917pub const fn new_nfkc() -> ComposingNormalizerBorrowed<'static> {
2918ComposingNormalizerBorrowed::new_nfkc()
2919 }
29202921icu_provider::gen_buffer_data_constructors!(
2922 () -> error: DataError,
2923 functions: [
2924 new_nfkc: skip,
2925 try_new_nfkc_with_buffer_provider,
2926 try_new_nfkc_unstable,
2927Self,
2928 ]
2929 );
29302931#[doc = "A version of [`Self::new_nfkc`] that uses custom data provided by a [`DataProvider`].\n\n[\u{1f4da} Help choosing a constructor](icu_provider::constructors)\n\n<div class=\"stab unstable\">\u{26a0}\u{fe0f} The bounds on <tt>provider</tt> may change over time, including in SemVer minor releases.</div>"icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new_nfkc)]
2932pub fn try_new_nfkc_unstable<D>(provider: &D) -> Result<Self, DataError>
2933where
2934D: DataProvider<NormalizerNfkdDataV1>
2935 + DataProvider<NormalizerNfdTablesV1>
2936 + DataProvider<NormalizerNfkdTablesV1>
2937 + DataProvider<NormalizerNfcV1>
2938 + ?Sized,
2939 {
2940let decomposing_normalizer = DecomposingNormalizer::try_new_nfkd_unstable(provider)?;
29412942let canonical_compositions: DataPayload<NormalizerNfcV1> =
2943provider.load(Default::default())?.payload;
29442945Ok(ComposingNormalizer {
2946decomposing_normalizer,
2947canonical_compositions,
2948 })
2949 }
29502951#[doc = "A version of [`Self::new_uts46`] that uses custom data provided by a [`DataProvider`].\n\n[\u{1f4da} Help choosing a constructor](icu_provider::constructors)\n\n<div class=\"stab unstable\">\u{26a0}\u{fe0f} The bounds on <tt>provider</tt> may change over time, including in SemVer minor releases.</div>"icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new_uts46)]
2952pub(crate) fn try_new_uts46_unstable<D>(provider: &D) -> Result<Self, DataError>
2953where
2954D: DataProvider<NormalizerUts46DataV1>
2955 + DataProvider<NormalizerNfdTablesV1>
2956 + DataProvider<NormalizerNfkdTablesV1>
2957// UTS 46 tables merged into CompatibilityDecompositionTablesV1
2958+ DataProvider<NormalizerNfcV1>
2959 + ?Sized,
2960 {
2961let decomposing_normalizer =
2962DecomposingNormalizer::try_new_uts46_decomposed_unstable(provider)?;
29632964let canonical_compositions: DataPayload<NormalizerNfcV1> =
2965provider.load(Default::default())?.payload;
29662967Ok(ComposingNormalizer {
2968decomposing_normalizer,
2969canonical_compositions,
2970 })
2971 }
2972}
/// A write sink that, rather than storing output, verifies it against an
/// expected UTF-16 slice: each write succeeds only while the produced
/// output matches the front of `expect` (see the `Write16` impl below).
/// Presumably used to answer "is this input already normalized?" without
/// allocating — the name suggests so; confirm against callers.
#[cfg(feature = "utf16_iter")]
struct IsNormalizedSinkUtf16<'a> {
    // The not-yet-matched tail of the expected output.
    expect: &'a [u16],
}
#[cfg(feature = "utf16_iter")]
impl<'a> IsNormalizedSinkUtf16<'a> {
    /// Creates a sink whose writes are checked against `slice`.
    pub fn new(slice: &'a [u16]) -> Self {
        Self { expect: slice }
    }
    /// Returns how many expected code units remain unmatched.
    pub fn remaining_len(&self) -> usize {
        let Self { expect } = self;
        expect.len()
    }
}
#[cfg(feature = "utf16_iter")]
impl write16::Write16 for IsNormalizedSinkUtf16<'_> {
    fn write_slice(&mut self, s: &[u16]) -> core::fmt::Result {
        // A slice write only ever happens as a pass-through of the input,
        // so address identity is enough to know it lines up with the
        // expectation. The slicing below can only fail on a code bug,
        // not on bad input or data, hence the lint expectation.
        if !core::ptr::eq(s.as_ptr(), self.expect.as_ptr()) {
            return Err(core::fmt::Error {});
        }
        #[expect(clippy::indexing_slicing)]
        {
            self.expect = &self.expect[s.len()..];
        }
        Ok(())
    }

    fn write_char(&mut self, c: char) -> core::fmt::Result {
        let mut remaining = self.expect.chars();
        match remaining.next() {
            Some(expected) if expected == c => {
                self.expect = remaining.as_slice();
                Ok(())
            }
            _ => Err(core::fmt::Error {}),
        }
    }
}
/// A write sink that, rather than storing output, verifies it against an
/// expected potentially-ill-formed UTF-8 slice: each write succeeds only
/// while the produced output matches the front of `expect` (see the
/// `core::fmt::Write` impl below). Presumably used to answer "is this
/// input already normalized?" without allocating — confirm against callers.
#[cfg(feature = "utf8_iter")]
struct IsNormalizedSinkUtf8<'a> {
    // The not-yet-matched tail of the expected output.
    expect: &'a [u8],
}
#[cfg(feature = "utf8_iter")]
impl<'a> IsNormalizedSinkUtf8<'a> {
    /// Creates a sink whose writes are checked against `slice`.
    pub fn new(slice: &'a [u8]) -> Self {
        Self { expect: slice }
    }
    /// Returns how many expected bytes remain unmatched.
    pub fn remaining_len(&self) -> usize {
        let Self { expect } = self;
        expect.len()
    }
}
#[cfg(feature = "utf8_iter")]
impl core::fmt::Write for IsNormalizedSinkUtf8<'_> {
    fn write_str(&mut self, s: &str) -> core::fmt::Result {
        // A string write only ever happens as a pass-through of the input,
        // so address identity is enough to know it lines up with the
        // expectation. The slicing below can only fail on a code bug,
        // not on bad input or data, hence the lint expectation.
        if !core::ptr::eq(s.as_ptr(), self.expect.as_ptr()) {
            return Err(core::fmt::Error {});
        }
        #[expect(clippy::indexing_slicing)]
        {
            self.expect = &self.expect[s.len()..];
        }
        Ok(())
    }

    fn write_char(&mut self, c: char) -> core::fmt::Result {
        let mut remaining = self.expect.chars();
        match remaining.next() {
            Some(expected) if expected == c => {
                self.expect = remaining.as_slice();
                Ok(())
            }
            _ => Err(core::fmt::Error {}),
        }
    }
}
/// A write sink that, rather than storing output, verifies it against an
/// expected `&str`: each write succeeds only while the produced output
/// matches the front of `expect` (see the `core::fmt::Write` impl below).
/// Presumably used to answer "is this input already normalized?" without
/// allocating — confirm against callers.
struct IsNormalizedSinkStr<'a> {
    // The not-yet-matched tail of the expected output.
    expect: &'a str,
}
30613062impl<'a> IsNormalizedSinkStr<'a> {
3063pub fn new(slice: &'a str) -> Self {
3064IsNormalizedSinkStr { expect: slice }
3065 }
3066pub fn remaining_len(&self) -> usize {
3067self.expect.len()
3068 }
3069}
30703071impl core::fmt::Writefor IsNormalizedSinkStr<'_> {
3072fn write_str(&mut self, s: &str) -> core::fmt::Result {
3073// We know that if we get a slice, it's a pass-through,
3074 // so we can compare addresses. Indexing is OK, because
3075 // an indexing failure would be a code bug rather than
3076 // an input or data issue.
3077if core::ptr::eq(s.as_ptr(), self.expect.as_ptr()) {
3078self.expect = &self.expect[s.len()..];
3079Ok(())
3080 } else {
3081Err(core::fmt::Error {})
3082 }
3083 }
30843085fn write_char(&mut self, c: char) -> core::fmt::Result {
3086let mut iter = self.expect.chars();
3087if iter.next() == Some(c) {
3088self.expect = iter.as_str();
3089Ok(())
3090 } else {
3091Err(core::fmt::Error {})
3092 }
3093 }
3094}