1// This file is part of ICU4X. For terms of use, please see the file
2// called LICENSE at the top level of the ICU4X source tree
3// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
45// https://github.com/unicode-org/icu4x/blob/main/documents/process/boilerplate.md#library-annotations
6#![cfg_attr(not(any(test, doc)), no_std)]
7#![cfg_attr(
8 not(test),
9 deny(
10 clippy::indexing_slicing,
11 clippy::unwrap_used,
12 clippy::expect_used,
13 clippy::panic,
14 clippy::exhaustive_structs,
15 clippy::exhaustive_enums,
16 clippy::trivially_copy_pass_by_ref,
17 missing_debug_implementations,
18 )
19)]
20#![warn(missing_docs)]
2122//! Normalizing text into Unicode Normalization Forms.
23//!
24//! This module is published as its own crate ([`icu_normalizer`](https://docs.rs/icu_normalizer/latest/icu_normalizer/))
25//! and as part of the [`icu`](https://docs.rs/icu/latest/icu/) crate. See the latter for more details on the ICU4X project.
26//!
27//! # Functionality
28//!
29//! The top level of the crate provides normalization of input into the four normalization forms defined in [UAX #15: Unicode
30//! Normalization Forms](https://www.unicode.org/reports/tr15/): NFC, NFD, NFKC, and NFKD.
31//!
32//! Three kinds of contiguous inputs are supported: known-well-formed UTF-8 (`&str`), potentially-not-well-formed UTF-8,
33//! and potentially-not-well-formed UTF-16. Additionally, an iterator over `char` can be wrapped in a normalizing iterator.
34//!
35//! The `uts46` module provides the combination of mapping and normalization operations for [UTS #46: Unicode IDNA
36//! Compatibility Processing](https://www.unicode.org/reports/tr46/). This functionality is not meant to be used by
37//! applications directly. Instead, it is meant as a building block for a full implementation of UTS #46, such as the
38//! [`idna`](https://docs.rs/idna/latest/idna/) crate.
39//!
40//! The `properties` module provides the non-recursive canonical decomposition operation on a per `char` basis and
//! the canonical composition operation given two `char`s. It also provides access to the Canonical Combining Class
42//! property. These operations are primarily meant for [HarfBuzz](https://harfbuzz.github.io/) via the
43//! [`icu_harfbuzz`](https://docs.rs/icu_harfbuzz/latest/icu_harfbuzz/) crate.
44//!
45//! Notably, this normalizer does _not_ provide the normalization “quick check” that can result in “maybe” in
46//! addition to “yes” and “no”. The normalization checks provided by this crate always give a definitive
47//! non-“maybe” answer.
48//!
49//! # Examples
50//!
51//! ```
52//! let nfc = icu_normalizer::ComposingNormalizerBorrowed::new_nfc();
53//! assert_eq!(nfc.normalize("a\u{0308}"), "ä");
54//! assert!(nfc.is_normalized("ä"));
55//!
56//! let nfd = icu_normalizer::DecomposingNormalizerBorrowed::new_nfd();
57//! assert_eq!(nfd.normalize("ä"), "a\u{0308}");
58//! assert!(!nfd.is_normalized("ä"));
59//! ```
6061extern crate alloc;
6263// TODO: The plan is to replace
64// `#[cfg(not(icu4x_unstable_fast_trie_only))]`
65// with
66// `#[cfg(feature = "serde")]`
67// and
68// `#[cfg(icu4x_unstable_fast_trie_only)]`
69// with
70// `#[cfg(not(feature = "serde"))]`
71//
72// Before doing so:
73// * The type of the UTS 46 trie needs to be
74// disentangled from the type of the NFD/NFKD tries.
75// This will involve a more generic iterator hidden
76// inside the public iterator types.
77// * datagen needs to emit fast-mode tries for the
78// NFD and NFKD tries.
79// * The markers and possibly the data struct type
80// for NFD and NFKD need to be revised per policy.
8182#[cfg(not(icu4x_unstable_fast_trie_only))]
83type Trie<'trie> = CodePointTrie<'trie, u32>;
8485#[cfg(icu4x_unstable_fast_trie_only)]
86type Trie<'trie> = FastCodePointTrie<'trie, u32>;
// We don't depend on icu_properties to minimize deps, but we want to be able
// to ensure we're using the right CCC values.
//
// Expands to a `CanonicalCombiningClass` with the given numeric value and,
// when the `icu_properties` feature is enabled, const-asserts that the value
// matches the correspondingly named icu_properties constant.
macro_rules! ccc {
    ($name:ident, $num:expr) => {
        const {
            #[cfg(feature = "icu_properties")]
            if icu_properties::props::CanonicalCombiningClass::$name.to_icu4c_value() != $num {
                panic!("icu_normalizer has incorrect ccc values")
            }
            CanonicalCombiningClass::from_icu4c_value($num)
        }
    };
}
101102pub mod properties;
103pub mod provider;
104pub mod uts46;
105106use crate::provider::CanonicalCompositions;
107use crate::provider::DecompositionData;
108use crate::provider::NormalizerNfdDataV1;
109use crate::provider::NormalizerNfkdDataV1;
110use crate::provider::NormalizerUts46DataV1;
111use alloc::borrow::Cow;
112use alloc::string::String;
113use core::char::REPLACEMENT_CHARACTER;
114use icu_collections::char16trie::Char16Trie;
115use icu_collections::char16trie::Char16TrieIterator;
116use icu_collections::char16trie::TrieResult;
117#[cfg(not(icu4x_unstable_fast_trie_only))]
118use icu_collections::codepointtrie::CodePointTrie;
119#[cfg(icu4x_unstable_fast_trie_only)]
120use icu_collections::codepointtrie::FastCodePointTrie;
121#[cfg(icu4x_unstable_fast_trie_only)]
122use icu_collections::codepointtrie::TypedCodePointTrie;
123#[cfg(feature = "icu_properties")]
124use icu_properties::props::CanonicalCombiningClass;
125use icu_provider::prelude::*;
126use provider::DecompositionTables;
127use provider::NormalizerNfcV1;
128use provider::NormalizerNfdTablesV1;
129use provider::NormalizerNfkdTablesV1;
130use smallvec::SmallVec;
131#[cfg(feature = "utf16_iter")]
132use utf16_iter::Utf16CharsEx;
133#[cfg(feature = "utf8_iter")]
134use utf8_iter::Utf8CharsEx;
135use zerovec::{zeroslice, ZeroSlice};
136137// The optimizations in the area where `likely` is used
138// are extremely brittle. `likely` is useful in the typed-trie
139// case on the UTF-16 fast path, but in order not to disturb
140// the untyped-trie case on the UTF-16 fast path, make the
141// annotations no-ops in the untyped-trie case.
142143// `cold_path` and `likely` come from
144// https://github.com/rust-lang/hashbrown/commit/64bd7db1d1b148594edfde112cdb6d6260e2cfc3 .
145// See https://github.com/rust-lang/hashbrown/commit/64bd7db1d1b148594edfde112cdb6d6260e2cfc3#commitcomment-164768806
146// for permission to relicense under Unicode-3.0.
/// Marker for the unlikely branch: calls compile to nothing, but the
/// `#[cold]` attribute steers the optimizer's branch layout in `likely`.
#[cfg(all(icu4x_unstable_fast_trie_only, feature = "utf16_iter"))]
#[inline(always)]
#[cold]
fn cold_path() {}
/// Hints to the optimizer that `b` is expected to be `true` by routing
/// the `false` case through the `#[cold]` `cold_path` function.
#[cfg(all(icu4x_unstable_fast_trie_only, feature = "utf16_iter"))]
#[inline(always)]
pub(crate) fn likely(b: bool) -> bool {
    if b {
        true
    } else {
        cold_path();
        false
    }
}
163164// End import from https://github.com/rust-lang/hashbrown/commit/64bd7db1d1b148594edfde112cdb6d6260e2cfc3 .
/// No-op for typed trie case.
#[cfg(all(not(icu4x_unstable_fast_trie_only), feature = "utf16_iter"))]
#[inline(always)]
fn likely(b: bool) -> bool {
    b
}
172173/// This type exists as a shim for icu_properties CanonicalCombiningClass when the crate is disabled
174/// It should not be exposed to users.
175#[cfg(not(feature = "icu_properties"))]
176#[derive(#[automatically_derived]
impl ::core::marker::Copy for CanonicalCombiningClass { }Copy, #[automatically_derived]
impl ::core::clone::Clone for CanonicalCombiningClass {
#[inline]
fn clone(&self) -> CanonicalCombiningClass {
let _: ::core::clone::AssertParamIsClone<u8>;
*self
}
}Clone, #[automatically_derived]
impl ::core::cmp::Eq for CanonicalCombiningClass {
#[inline]
#[doc(hidden)]
#[coverage(off)]
fn assert_receiver_is_total_eq(&self) -> () {
let _: ::core::cmp::AssertParamIsEq<u8>;
}
}Eq, #[automatically_derived]
impl ::core::cmp::PartialEq for CanonicalCombiningClass {
#[inline]
fn eq(&self, other: &CanonicalCombiningClass) -> bool {
self.0 == other.0
}
}PartialEq, #[automatically_derived]
impl ::core::cmp::PartialOrd for CanonicalCombiningClass {
#[inline]
fn partial_cmp(&self, other: &CanonicalCombiningClass)
-> ::core::option::Option<::core::cmp::Ordering> {
::core::cmp::PartialOrd::partial_cmp(&self.0, &other.0)
}
}PartialOrd, #[automatically_derived]
impl ::core::cmp::Ord for CanonicalCombiningClass {
#[inline]
fn cmp(&self, other: &CanonicalCombiningClass) -> ::core::cmp::Ordering {
::core::cmp::Ord::cmp(&self.0, &other.0)
}
}Ord)]
177struct CanonicalCombiningClass(pub(crate) u8);
178179#[cfg(not(feature = "icu_properties"))]
180impl CanonicalCombiningClass {
181const fn from_icu4c_value(v: u8) -> Self {
182Self(v)
183 }
184const fn to_icu4c_value(self) -> u8 {
185self.0
186}
187}
188189const CCC_NOT_REORDERED: CanonicalCombiningClass = const { CanonicalCombiningClass::from_icu4c_value(0) }ccc!(NotReordered, 0);
190const CCC_ABOVE: CanonicalCombiningClass = const { CanonicalCombiningClass::from_icu4c_value(230) }ccc!(Above, 230);
/// Treatment of the ignorable marker (0xFFFFFFFF) in data.
#[derive(Debug, PartialEq, Eq)]
enum IgnorableBehavior {
    /// 0xFFFFFFFF in data is not supported.
    Unsupported,
    /// Ignorables are ignored.
    Ignored,
    /// Ignorables are treated as singleton decompositions
    /// to the REPLACEMENT CHARACTER.
    ReplacementCharacter,
}
/// Marker for UTS 46 ignorables.
///
/// See trie-value-format.md
const IGNORABLE_MARKER: u32 = 0xFFFFFFFF;

/// Marker that the decomposition does not round trip via NFC.
///
/// See trie-value-format.md
const NON_ROUND_TRIP_MARKER: u32 = 1 << 30;

/// Marker that the first character of the decomposition
/// can combine backwards.
///
/// See trie-value-format.md
const BACKWARD_COMBINING_MARKER: u32 = 1 << 31;

/// Mask for the bits that have to be zero for this to be a BMP
/// singleton decomposition, or value baked into the surrogate
/// range.
///
/// See trie-value-format.md
const HIGH_ZEROS_MASK: u32 = 0x3FFF0000;

/// Mask for the bits that have to be zero for this to be a complex
/// decomposition.
///
/// See trie-value-format.md
const LOW_ZEROS_MASK: u32 = 0xFFE0;
/// Checks if a trie value carries a (non-zero) canonical
/// combining class.
///
/// See trie-value-format.md
fn trie_value_has_ccc(trie_value: u32) -> bool {
    (trie_value & 0x3FFFFE00) == 0xD800
}
/// Checks if the trie signifies a special non-starter decomposition.
///
/// See trie-value-format.md
fn trie_value_indicates_special_non_starter_decomposition(trie_value: u32) -> bool {
    (trie_value & 0x3FFFFF00) == 0xD900
}
247248/// Checks if a trie value signifies a character whose decomposition
249/// starts with a non-starter.
250///
251/// See trie-value-format.md
252fn decomposition_starts_with_non_starter(trie_value: u32) -> bool {
253trie_value_has_ccc(trie_value)
254}
255256/// Extracts a canonical combining class (possibly zero) from a trie value.
257///
258/// See trie-value-format.md
259fn ccc_from_trie_value(trie_value: u32) -> CanonicalCombiningClass {
260if trie_value_has_ccc(trie_value) {
261CanonicalCombiningClass::from_icu4c_value(trie_valueas u8)
262 } else {
263CCC_NOT_REORDERED264 }
265}
/// The tail (everything after the first character) of the NFKD form U+FDFA
/// as 16-bit units.
static FDFA_NFKD: [u16; 17] = [
    0x644, 0x649, 0x20, 0x627, 0x644, 0x644, 0x647, 0x20, 0x639, 0x644, 0x64A, 0x647, 0x20, 0x648,
    0x633, 0x644, 0x645,
];
/// Marker value for U+FDFA in NFKD. (Unified with Hangul syllable marker,
/// but they differ by `NON_ROUND_TRIP_MARKER`.)
///
/// See trie-value-format.md
const FDFA_MARKER: u16 = 1;
// These constants originate from page 143 of Unicode 14.0
/// Syllable base
const HANGUL_S_BASE: u32 = 0xAC00;
/// Lead jamo base
const HANGUL_L_BASE: u32 = 0x1100;
/// Vowel jamo base
const HANGUL_V_BASE: u32 = 0x1161;
/// Trail jamo base (deliberately off by one to account for the absence of a trail)
const HANGUL_T_BASE: u32 = 0x11A7;
/// Lead jamo count
const HANGUL_L_COUNT: u32 = 19;
/// Vowel jamo count
const HANGUL_V_COUNT: u32 = 21;
/// Trail jamo count (deliberately off by one to account for the absence of a trail)
const HANGUL_T_COUNT: u32 = 28;
/// Vowel jamo count times trail jamo count
const HANGUL_N_COUNT: u32 = 588;
/// Syllable count
const HANGUL_S_COUNT: u32 = 11172;

/// One past the conjoining jamo block
const HANGUL_JAMO_LIMIT: u32 = 0x1200;
/// If `opt` is `Some`, unwrap it. If `None`, panic if debug assertions
/// are enabled and return `default` if debug assertions are not enabled.
///
/// Use this only if the only reason why `opt` could be `None` is bogus
/// data from the provider.
#[inline(always)]
fn unwrap_or_gigo<T>(opt: Option<T>, default: T) -> T {
    if let Some(val) = opt {
        val
    } else {
        // GIGO case
        debug_assert!(false);
        default
    }
}
318319/// Convert a `u32` _obtained from data provider data_ to `char`.
320#[inline(always)]
321fn char_from_u32(u: u32) -> char {
322unwrap_or_gigo(core::char::from_u32(u), REPLACEMENT_CHARACTER)
323}
324325/// Convert a `u16` _obtained from data provider data_ to `char`.
326#[inline(always)]
327fn char_from_u16(u: u16) -> char {
328char_from_u32(u32::from(u))
329}
330331const EMPTY_U16: &ZeroSlice<u16> = ::zerovec::ZeroSlice::new_empty()zeroslice![];
332333const EMPTY_CHAR: &ZeroSlice<char> = ::zerovec::ZeroSlice::new_empty()zeroslice![];
/// Returns `true` iff `start <= c <= end`, using a single wrapping
/// subtraction and comparison instead of two comparisons.
#[inline(always)]
fn in_inclusive_range(c: char, start: char, end: char) -> bool {
    u32::from(c).wrapping_sub(u32::from(start)) <= (u32::from(end) - u32::from(start))
}
/// Returns `true` iff `start <= u <= end` for 16-bit code units,
/// using the same wrapping-subtraction trick as `in_inclusive_range`.
#[inline(always)]
#[cfg(feature = "utf16_iter")]
fn in_inclusive_range16(u: u16, start: u16, end: u16) -> bool {
    u.wrapping_sub(start) <= (end - start)
}
345346/// Performs canonical composition (including Hangul) on a pair of
347/// characters or returns `None` if these characters don't compose.
348/// Composition exclusions are taken into account.
349#[inline]
350fn compose(iter: Char16TrieIterator, starter: char, second: char) -> Option<char> {
351let v = u32::from(second).wrapping_sub(HANGUL_V_BASE);
352if v >= HANGUL_JAMO_LIMIT - HANGUL_V_BASE {
353return compose_non_hangul(iter, starter, second);
354 }
355if v < HANGUL_V_COUNT {
356let l = u32::from(starter).wrapping_sub(HANGUL_L_BASE);
357if l < HANGUL_L_COUNT {
358let lv = l * HANGUL_N_COUNT + v * HANGUL_T_COUNT;
359// Safe, because the inputs are known to be in range.
360return Some(unsafe { char::from_u32_unchecked(HANGUL_S_BASE + lv) });
361 }
362return None;
363 }
364if in_inclusive_range(second, '\u{11A8}', '\u{11C2}') {
365let lv = u32::from(starter).wrapping_sub(HANGUL_S_BASE);
366if lv < HANGUL_S_COUNT && lv % HANGUL_T_COUNT == 0 {
367let lvt = lv + (u32::from(second) - HANGUL_T_BASE);
368// Safe, because the inputs are known to be in range.
369return Some(unsafe { char::from_u32_unchecked(HANGUL_S_BASE + lvt) });
370 }
371 }
372None373}
374375/// Performs (non-Hangul) canonical composition on a pair of characters
376/// or returns `None` if these characters don't compose. Composition
377/// exclusions are taken into account.
378fn compose_non_hangul(mut iter: Char16TrieIterator, starter: char, second: char) -> Option<char> {
379// To make the trie smaller, the pairs are stored second character first.
380 // Given how this method is used in ways where it's known that `second`
381 // is or isn't a starter. We could potentially split the trie into two
382 // tries depending on whether `second` is a starter.
383match iter.next(second) {
384 TrieResult::NoMatch => None,
385 TrieResult::NoValue => match iter.next(starter) {
386 TrieResult::NoMatch => None,
387 TrieResult::FinalValue(i) => {
388if let Some(c) = char::from_u32(ias u32) {
389Some(c)
390 } else {
391// GIGO case
392if true {
if !false { ::core::panicking::panic("assertion failed: false") };
};debug_assert!(false);
393None394 }
395 }
396 TrieResult::NoValue | TrieResult::Intermediate(_) => {
397// GIGO case
398if true {
if !false { ::core::panicking::panic("assertion failed: false") };
};debug_assert!(false);
399None400 }
401 },
402 TrieResult::FinalValue(_) | TrieResult::Intermediate(_) => {
403// GIGO case
404if true {
if !false { ::core::panicking::panic("assertion failed: false") };
};debug_assert!(false);
405None406 }
407 }
408}
409410/// See trie-value-format.md
411#[inline(always)]
412fn starter_and_decomposes_to_self_impl(trie_val: u32) -> bool {
413// The REPLACEMENT CHARACTER has `NON_ROUND_TRIP_MARKER` set,
414 // and this function needs to ignore that.
415(trie_val & !(BACKWARD_COMBINING_MARKER | NON_ROUND_TRIP_MARKER)) == 0
416}
417418/// See trie-value-format.md
419#[inline(always)]
420fn potential_passthrough_and_cannot_combine_backwards_impl(trie_val: u32) -> bool {
421 (trie_val & (NON_ROUND_TRIP_MARKER | BACKWARD_COMBINING_MARKER)) == 0
422}
/// Struct for holding together a character and the value
/// looked up for it from the NFD trie in a more explicit
/// way than an anonymous pair.
/// Also holds a flag about the supplementary-trie provenance.
#[derive(Debug, PartialEq, Eq)]
struct CharacterAndTrieValue {
    character: char,
    /// See trie-value-format.md
    trie_val: u32,
}
434435impl CharacterAndTrieValue {
436#[inline(always)]
437pub fn new(c: char, trie_value: u32) -> Self {
438CharacterAndTrieValue {
439 character: c,
440 trie_val: trie_value,
441 }
442 }
443444#[inline(always)]
445pub fn starter_and_decomposes_to_self(&self) -> bool {
446starter_and_decomposes_to_self_impl(self.trie_val)
447 }
448449/// See trie-value-format.md
450#[inline(always)]
451 #[cfg(feature = "utf8_iter")]
452pub fn starter_and_decomposes_to_self_except_replacement(&self) -> bool {
453// This intentionally leaves `NON_ROUND_TRIP_MARKER` in the value
454 // to be compared with zero. U+FFFD has that flag set despite really
455 // being being round-tripping in order to make UTF-8 errors
456 // ineligible for passthrough.
457(self.trie_val & !BACKWARD_COMBINING_MARKER) == 0
458}
459460/// See trie-value-format.md
461#[inline(always)]
462pub fn can_combine_backwards(&self) -> bool {
463 (self.trie_val & BACKWARD_COMBINING_MARKER) != 0
464}
465/// See trie-value-format.md
466#[inline(always)]
467pub fn potential_passthrough(&self) -> bool {
468 (self.trie_val & NON_ROUND_TRIP_MARKER) == 0
469}
470/// See trie-value-format.md
471#[inline(always)]
472pub fn potential_passthrough_and_cannot_combine_backwards(&self) -> bool {
473potential_passthrough_and_cannot_combine_backwards_impl(self.trie_val)
474 }
475}
/// Pack a `char` and a `CanonicalCombiningClass` in
/// 32 bits (the former in the lower 24 bits and the
/// latter in the high 8 bits). The latter can be
/// initialized to 0xFF upon creation, in which case
/// it can be actually set later by calling
/// `set_ccc_from_trie_if_not_already_set`. This is
/// a micro optimization to avoid the Canonical
/// Combining Class trie lookup when there is only
/// one combining character in a sequence. This type
/// is intentionally non-`Copy` to get compiler help
/// in making sure that the class is set on the
/// instance on which it is intended to be set
/// and not on a temporary copy.
///
/// Note that 0xFF won't be assigned to an actual
/// canonical combining class per definition D104
/// in The Unicode Standard.
//
// NOTE: The Pernosco debugger has special knowledge
// of this struct. Please do not change the bit layout
// or the crate-module-qualified name of this struct
// without coordination.
#[derive(Debug)]
struct CharacterAndClass(u32);
501502impl CharacterAndClass {
503pub fn new(c: char, ccc: CanonicalCombiningClass) -> Self {
504CharacterAndClass(u32::from(c) | (u32::from(ccc.to_icu4c_value()) << 24))
505 }
506pub fn new_with_placeholder(c: char) -> Self {
507CharacterAndClass(u32::from(c) | ((0xFF) << 24))
508 }
509pub fn new_with_trie_value(c_tv: CharacterAndTrieValue) -> Self {
510Self::new(c_tv.character, ccc_from_trie_value(c_tv.trie_val))
511 }
512pub fn new_starter(c: char) -> Self {
513CharacterAndClass(u32::from(c))
514 }
515/// This method must exist for Pernosco to apply its special rendering.
516 /// Also, this must not be dead code!
517pub fn character(&self) -> char {
518// Safe, because the low 24 bits came from a `char`
519 // originally.
520unsafe { char::from_u32_unchecked(self.0 & 0xFFFFFF) }
521 }
522/// This method must exist for Pernosco to apply its special rendering.
523pub fn ccc(&self) -> CanonicalCombiningClass {
524CanonicalCombiningClass::from_icu4c_value((self.0 >> 24) as u8)
525 }
526527pub fn character_and_ccc(&self) -> (char, CanonicalCombiningClass) {
528 (self.character(), self.ccc())
529 }
530pub fn set_ccc_from_trie_if_not_already_set(&mut self, trie: &Trie) {
531if self.0 >> 24 != 0xFF {
532return;
533 }
534let scalar = self.0 & 0xFFFFFF;
535self.0 =
536 ((ccc_from_trie_value(trie.get32_u32(scalar)).to_icu4c_value() as u32) << 24) | scalar;
537 }
538}
539540// This function exists as a borrow check helper.
541#[inline(always)]
542fn sort_slice_by_ccc(slice: &mut [CharacterAndClass], trie: &Trie) {
543// We don't look up the canonical combining class for starters
544 // of for single combining characters between starters. When
545 // there's more than one combining character between starters,
546 // we look up the canonical combining class for each character
547 // exactly once.
548if slice.len() < 2 {
549return;
550 }
551slice552 .iter_mut()
553 .for_each(|cc| cc.set_ccc_from_trie_if_not_already_set(trie));
554slice.sort_by_key(|cc| cc.ccc());
555}
556557/// An iterator adaptor that turns an `Iterator` over `char` into
558/// a lazily-decomposed `char` sequence.
559#[derive(#[automatically_derived]
impl<'data, I: ::core::fmt::Debug> ::core::fmt::Debug for
Decomposition<'data, I> where I: Iterator<Item = char> {
#[inline]
fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result {
let names: &'static _ =
&["delegate", "buffer", "buffer_pos", "pending", "trie",
"scalars16", "scalars24", "supplementary_scalars16",
"supplementary_scalars24",
"decomposition_passthrough_bound", "ignorable_behavior"];
let values: &[&dyn ::core::fmt::Debug] =
&[&self.delegate, &self.buffer, &self.buffer_pos, &self.pending,
&self.trie, &self.scalars16, &self.scalars24,
&self.supplementary_scalars16,
&self.supplementary_scalars24,
&self.decomposition_passthrough_bound,
&&self.ignorable_behavior];
::core::fmt::Formatter::debug_struct_fields_finish(f, "Decomposition",
names, values)
}
}Debug)]
560pub struct Decomposition<'data, I>
561where
562I: Iterator<Item = char>,
563{
564 delegate: I,
565 buffer: SmallVec<[CharacterAndClass; 17]>, // Enough to hold NFKD for U+FDFA
566/// The index of the next item to be read from `buffer`.
567 /// The purpose if this index is to avoid having to move
568 /// the rest upon every read.
569buffer_pos: usize,
570// At the start of `next()` if not `None`, this is a pending unnormalized
571 // starter. When `Decomposition` appears alone, this is never a non-starter.
572 // However, when `Decomposition` appears inside a `Composition`, this
573 // may become a non-starter before `decomposing_next()` is called.
574pending: Option<CharacterAndTrieValue>, // None at end of stream
575 // See trie-value-format.md
576trie: &'data Trie<'data>,
577 scalars16: &'data ZeroSlice<u16>,
578 scalars24: &'data ZeroSlice<char>,
579 supplementary_scalars16: &'data ZeroSlice<u16>,
580 supplementary_scalars24: &'data ZeroSlice<char>,
581/// The lowest character for which either of the following does
582 /// not hold:
583 /// 1. Decomposes to self.
584 /// 2. Decomposition starts with a non-starter
585decomposition_passthrough_bound: u32, // never above 0xC0
586ignorable_behavior: IgnorableBehavior, // Arguably should be a type parameter
587}
588589impl<'data, I> Decomposition<'data, I>
590where
591I: Iterator<Item = char>,
592{
593/// Constructs a decomposing iterator adapter from a delegate
594 /// iterator and references to the necessary data, without
595 /// supplementary data.
596 ///
597 /// Use `DecomposingNormalizer::normalize_iter()` instead unless
598 /// there's a good reason to use this constructor directly.
599 ///
600 /// Public but hidden in order to be able to use this from the
601 /// collator.
602#[doc(hidden)] // used in collator
603pub fn new(
604 delegate: I,
605 decompositions: &'data DecompositionData,
606 tables: &'data DecompositionTables,
607 ) -> Self {
608Self::new_with_supplements(
609delegate,
610decompositions,
611tables,
612None,
6130xC0,
614 IgnorableBehavior::Unsupported,
615 )
616 }
617618/// Constructs a decomposing iterator adapter from a delegate
619 /// iterator and references to the necessary data, including
620 /// supplementary data.
621 ///
622 /// Use `DecomposingNormalizer::normalize_iter()` instead unless
623 /// there's a good reason to use this constructor directly.
624fn new_with_supplements(
625 delegate: I,
626 decompositions: &'data DecompositionData,
627 tables: &'data DecompositionTables,
628 supplementary_tables: Option<&'data DecompositionTables>,
629 decomposition_passthrough_bound: u8,
630 ignorable_behavior: IgnorableBehavior,
631 ) -> Self {
632let mut ret = Decomposition::<I> {
633delegate,
634 buffer: SmallVec::new(), // Normalized
635buffer_pos: 0,
636// Initialize with a placeholder starter in case
637 // the real stream starts with a non-starter.
638pending: Some(CharacterAndTrieValue::new('\u{FFFF}', 0)),
639#[allow(clippy::useless_conversion, clippy::expect_used)] // Expectation always succeeds when untyped tries are in use
640trie: <&Trie>::try_from(&decompositions.trie).expect("Unexpected trie type in data"),
641 scalars16: &tables.scalars16,
642 scalars24: &tables.scalars24,
643 supplementary_scalars16: if let Some(supplementary) = supplementary_tables {
644&supplementary.scalars16
645 } else {
646EMPTY_U16647 },
648 supplementary_scalars24: if let Some(supplementary) = supplementary_tables {
649&supplementary.scalars24
650 } else {
651EMPTY_CHAR652 },
653 decomposition_passthrough_bound: u32::from(decomposition_passthrough_bound),
654ignorable_behavior,
655 };
656let _ = ret.next(); // Remove the U+FFFF placeholder
657ret658 }
659660fn push_decomposition16(
661&mut self,
662 offset: usize,
663 len: usize,
664 only_non_starters_in_trail: bool,
665 slice16: &ZeroSlice<u16>,
666 ) -> (char, usize) {
667let (starter, tail) = slice16668 .get_subslice(offset..offset + len)
669 .and_then(|slice| slice.split_first())
670 .map_or_else(
671 || {
672// GIGO case
673if true {
if !false { ::core::panicking::panic("assertion failed: false") };
};debug_assert!(false);
674 (REPLACEMENT_CHARACTER, EMPTY_U16)
675 },
676 |(first, trail)| (char_from_u16(first), trail),
677 );
678if only_non_starters_in_trail {
679// All the rest are combining
680self.buffer.extend(
681tail.iter()
682 .map(|u| CharacterAndClass::new_with_placeholder(char_from_u16(u))),
683 );
684 (starter, 0)
685 } else {
686let mut i = 0;
687let mut combining_start = 0;
688for u in tail.iter() {
689let ch = char_from_u16(u);
690let trie_value = self.trie.get(ch);
691self.buffer.push(CharacterAndClass::new_with_trie_value(
692 CharacterAndTrieValue::new(ch, trie_value),
693 ));
694 i += 1;
695// Half-width kana and iota subscript don't occur in the tails
696 // of these multicharacter decompositions.
697if !decomposition_starts_with_non_starter(trie_value) {
698 combining_start = i;
699 }
700 }
701 (starter, combining_start)
702 }
703 }
704705fn push_decomposition32(
706&mut self,
707 offset: usize,
708 len: usize,
709 only_non_starters_in_trail: bool,
710 slice32: &ZeroSlice<char>,
711 ) -> (char, usize) {
712let (starter, tail) = slice32713 .get_subslice(offset..offset + len)
714 .and_then(|slice| slice.split_first())
715 .unwrap_or_else(|| {
716// GIGO case
717if true {
if !false { ::core::panicking::panic("assertion failed: false") };
};debug_assert!(false);
718 (REPLACEMENT_CHARACTER, EMPTY_CHAR)
719 });
720if only_non_starters_in_trail {
721// All the rest are combining
722self.buffer
723 .extend(tail.iter().map(CharacterAndClass::new_with_placeholder));
724 (starter, 0)
725 } else {
726let mut i = 0;
727let mut combining_start = 0;
728for ch in tail.iter() {
729let trie_value = self.trie.get(ch);
730self.buffer.push(CharacterAndClass::new_with_trie_value(
731 CharacterAndTrieValue::new(ch, trie_value),
732 ));
733 i += 1;
734// Half-width kana and iota subscript don't occur in the tails
735 // of these multicharacter decompositions.
736if !decomposition_starts_with_non_starter(trie_value) {
737 combining_start = i;
738 }
739 }
740 (starter, combining_start)
741 }
742 }
743744#[inline(always)]
745fn attach_trie_value(&self, c: char) -> CharacterAndTrieValue {
746CharacterAndTrieValue::new(c, self.trie.get(c))
747 }
748749fn delegate_next_no_pending(&mut self) -> Option<CharacterAndTrieValue> {
750if true {
if !self.pending.is_none() {
::core::panicking::panic("assertion failed: self.pending.is_none()")
};
};debug_assert!(self.pending.is_none());
751loop {
752let c = self.delegate.next()?;
753754// TODO(#2384): Measure if this check is actually an optimization.
755if u32::from(c) < self.decomposition_passthrough_bound {
756return Some(CharacterAndTrieValue::new(c, 0));
757 }
758759let trie_val = self.trie.get(c);
760// TODO: Can we do something better about the cost of this branch in the
761 // non-UTS 46 case?
762if trie_val == IGNORABLE_MARKER {
763match self.ignorable_behavior {
764 IgnorableBehavior::Unsupported => {
765if true {
if !false { ::core::panicking::panic("assertion failed: false") };
};debug_assert!(false);
766 }
767 IgnorableBehavior::ReplacementCharacter => {
768return Some(CharacterAndTrieValue::new(
769c,
770u32::from(REPLACEMENT_CHARACTER) | NON_ROUND_TRIP_MARKER,
771 ));
772 }
773 IgnorableBehavior::Ignored => {
774// Else ignore this character by reading the next one from the delegate.
775continue;
776 }
777 }
778 }
779return Some(CharacterAndTrieValue::new(c, trie_val));
780 }
781 }
782783fn delegate_next(&mut self) -> Option<CharacterAndTrieValue> {
784if let Some(pending) = self.pending.take() {
785// Only happens as part of `Composition` and as part of
786 // the contiguous-buffer methods of `DecomposingNormalizer`.
787 // I.e. does not happen as part of standalone iterator
788 // usage of `Decomposition`.
789Some(pending)
790 } else {
791self.delegate_next_no_pending()
792 }
793 }
794795fn decomposing_next(&mut self, c_and_trie_val: CharacterAndTrieValue) -> char {
796let (starter, combining_start) = {
797let c = c_and_trie_val.character;
798// See trie-value-format.md
799let decomposition = c_and_trie_val.trie_val;
800// The REPLACEMENT CHARACTER has `NON_ROUND_TRIP_MARKER` set,
801 // and that flag needs to be ignored here.
802if (decomposition & !(BACKWARD_COMBINING_MARKER | NON_ROUND_TRIP_MARKER)) == 0 {
803// The character is its own decomposition
804(c, 0)
805 } else {
806let high_zeros = (decomposition & HIGH_ZEROS_MASK) == 0;
807let low_zeros = (decomposition & LOW_ZEROS_MASK) == 0;
808if !high_zeros && !low_zeros {
809// Decomposition into two BMP characters: starter and non-starter
810let starter = char_from_u32(decomposition & 0x7FFF);
811let combining = char_from_u32((decomposition >> 15) & 0x7FFF);
812self.buffer
813 .push(CharacterAndClass::new_with_placeholder(combining));
814 (starter, 0)
815 } else if high_zeros {
816// Do the check by looking at `c` instead of looking at a marker
817 // in `singleton` below, because if we looked at the trie value,
818 // we'd still have to check that `c` is in the Hangul syllable
819 // range in order for the subsequent interpretations as `char`
820 // to be safe.
821 // Alternatively, `FDFA_MARKER` and the Hangul marker could
822 // be unified. That would add a branch for Hangul and remove
823 // a branch from singleton decompositions. It seems more
824 // important to favor Hangul syllables than singleton
825 // decompositions.
826 // Note that it would be valid to hoist this Hangul check
827 // one or even two steps earlier in this check hierarchy.
828 // Right now, it's assumed the kind of decompositions into
829 // BMP starter and non-starter, which occur in many languages,
830 // should be checked before Hangul syllables, which are about
831 // one language specifically. Hopefully, we get some
832 // instruction-level parallelism out of the disjointness of
833 // operations on `c` and `decomposition`.
834let hangul_offset = u32::from(c).wrapping_sub(HANGUL_S_BASE); // SIndex in the spec
835if hangul_offset < HANGUL_S_COUNT {
836if true {
match (&decomposition, &1) {
(left_val, right_val) => {
if !(*left_val == *right_val) {
let kind = ::core::panicking::AssertKind::Eq;
::core::panicking::assert_failed(kind, &*left_val,
&*right_val, ::core::option::Option::None);
}
}
};
};debug_assert_eq!(decomposition, 1);
837// Hangul syllable
838 // The math here comes from page 144 of Unicode 14.0
839let l = hangul_offset / HANGUL_N_COUNT;
840let v = (hangul_offset % HANGUL_N_COUNT) / HANGUL_T_COUNT;
841let t = hangul_offset % HANGUL_T_COUNT;
842843// The unsafe blocks here are OK, because the values stay
844 // within the Hangul jamo block and, therefore, the scalar
845 // value range by construction.
846self.buffer.push(CharacterAndClass::new_starter(unsafe {
847 core::char::from_u32_unchecked(HANGUL_V_BASE + v)
848 }));
849let first = unsafe { core::char::from_u32_unchecked(HANGUL_L_BASE + l) };
850if t != 0 {
851self.buffer.push(CharacterAndClass::new_starter(unsafe {
852 core::char::from_u32_unchecked(HANGUL_T_BASE + t)
853 }));
854 (first, 2)
855 } else {
856 (first, 1)
857 }
858 } else {
859let singleton = decompositionas u16;
860if singleton != FDFA_MARKER {
861// Decomposition into one BMP character
862let starter = char_from_u16(singleton);
863 (starter, 0)
864 } else {
865// Special case for the NFKD form of U+FDFA.
866self.buffer.extend(FDFA_NFKD.map(|u| {
867// SAFETY: `FDFA_NFKD` is known not to contain
868 // surrogates.
869CharacterAndClass::new_starter(unsafe {
870 core::char::from_u32_unchecked(u32::from(u))
871 })
872 }));
873 ('\u{0635}', 17)
874 }
875 }
876 } else {
877if true {
if !low_zeros { ::core::panicking::panic("assertion failed: low_zeros") };
};debug_assert!(low_zeros);
878// Only 12 of 14 bits used as of Unicode 16.
879let offset = (((decomposition & !(0b11 << 30)) >> 16) as usize) - 1;
880// Only 3 of 4 bits used as of Unicode 16.
881let len_bits = decomposition & 0b1111;
882let only_non_starters_in_trail = (decomposition & 0b10000) != 0;
883if offset < self.scalars16.len() {
884self.push_decomposition16(
885offset,
886 (len_bits + 2) as usize,
887only_non_starters_in_trail,
888self.scalars16,
889 )
890 } else if offset < self.scalars16.len() + self.scalars24.len() {
891self.push_decomposition32(
892offset - self.scalars16.len(),
893 (len_bits + 1) as usize,
894only_non_starters_in_trail,
895self.scalars24,
896 )
897 } else if offset898 < self.scalars16.len()
899 + self.scalars24.len()
900 + self.supplementary_scalars16.len()
901 {
902self.push_decomposition16(
903offset - (self.scalars16.len() + self.scalars24.len()),
904 (len_bits + 2) as usize,
905only_non_starters_in_trail,
906self.supplementary_scalars16,
907 )
908 } else {
909self.push_decomposition32(
910offset911 - (self.scalars16.len()
912 + self.scalars24.len()
913 + self.supplementary_scalars16.len()),
914 (len_bits + 1) as usize,
915only_non_starters_in_trail,
916self.supplementary_scalars24,
917 )
918 }
919 }
920 }
921 };
922// Either we're inside `Composition` or `self.pending.is_none()`.
923924self.gather_and_sort_combining(combining_start);
925starter926 }
927928fn gather_and_sort_combining(&mut self, combining_start: usize) {
929// Not a `for` loop to avoid holding a mutable reference to `self` across
930 // the loop body.
931while let Some(ch_and_trie_val) = self.delegate_next() {
932if !trie_value_has_ccc(ch_and_trie_val.trie_val) {
933self.pending = Some(ch_and_trie_val);
934break;
935 } else if !trie_value_indicates_special_non_starter_decomposition(
936 ch_and_trie_val.trie_val,
937 ) {
938self.buffer
939 .push(CharacterAndClass::new_with_trie_value(ch_and_trie_val));
940 } else {
941// The Tibetan special cases are starters that decompose into non-starters.
942let mapped = match ch_and_trie_val.character {
943'\u{0340}' => {
944// COMBINING GRAVE TONE MARK
945CharacterAndClass::new('\u{0300}', CCC_ABOVE)
946 }
947'\u{0341}' => {
948// COMBINING ACUTE TONE MARK
949CharacterAndClass::new('\u{0301}', CCC_ABOVE)
950 }
951'\u{0343}' => {
952// COMBINING GREEK KORONIS
953CharacterAndClass::new('\u{0313}', CCC_ABOVE)
954 }
955'\u{0344}' => {
956// COMBINING GREEK DIALYTIKA TONOS
957self.buffer
958 .push(CharacterAndClass::new('\u{0308}', CCC_ABOVE));
959 CharacterAndClass::new('\u{0301}', CCC_ABOVE)
960 }
961'\u{0F73}' => {
962// TIBETAN VOWEL SIGN II
963self.buffer
964 .push(CharacterAndClass::new('\u{0F71}', const { CanonicalCombiningClass::from_icu4c_value(129) }ccc!(CCC129, 129)));
965 CharacterAndClass::new('\u{0F72}', const { CanonicalCombiningClass::from_icu4c_value(130) }ccc!(CCC130, 130))
966 }
967'\u{0F75}' => {
968// TIBETAN VOWEL SIGN UU
969self.buffer
970 .push(CharacterAndClass::new('\u{0F71}', const { CanonicalCombiningClass::from_icu4c_value(129) }ccc!(CCC129, 129)));
971 CharacterAndClass::new('\u{0F74}', const { CanonicalCombiningClass::from_icu4c_value(132) }ccc!(CCC132, 132))
972 }
973'\u{0F81}' => {
974// TIBETAN VOWEL SIGN REVERSED II
975self.buffer
976 .push(CharacterAndClass::new('\u{0F71}', const { CanonicalCombiningClass::from_icu4c_value(129) }ccc!(CCC129, 129)));
977 CharacterAndClass::new('\u{0F80}', const { CanonicalCombiningClass::from_icu4c_value(130) }ccc!(CCC130, 130))
978 }
979'\u{FF9E}' => {
980// HALFWIDTH KATAKANA VOICED SOUND MARK
981 CharacterAndClass::new('\u{3099}', const { CanonicalCombiningClass::from_icu4c_value(8) }ccc!(KanaVoicing, 8))
982 }
983'\u{FF9F}' => {
984// HALFWIDTH KATAKANA VOICED SOUND MARK
985 CharacterAndClass::new('\u{309A}', const { CanonicalCombiningClass::from_icu4c_value(8) }ccc!(KanaVoicing, 8))
986 }
987_ => {
988// GIGO case
989if true {
if !false { ::core::panicking::panic("assertion failed: false") };
};debug_assert!(false);
990 CharacterAndClass::new_with_placeholder(REPLACEMENT_CHARACTER)
991 }
992 };
993self.buffer.push(mapped);
994 }
995 }
996// Slicing succeeds by construction; we've always ensured that `combining_start`
997 // is in permissible range.
998#[expect(clippy::indexing_slicing)]
999sort_slice_by_ccc(&mut self.buffer[combining_start..], self.trie);
1000 }
1001}
10021003impl<I> Iteratorfor Decomposition<'_, I>
1004where
1005I: Iterator<Item = char>,
1006{
1007type Item = char;
10081009fn next(&mut self) -> Option<char> {
1010if let Some(ret) = self.buffer.get(self.buffer_pos).map(|c| c.character()) {
1011self.buffer_pos += 1;
1012if self.buffer_pos == self.buffer.len() {
1013self.buffer.clear();
1014self.buffer_pos = 0;
1015 }
1016return Some(ret);
1017 }
1018if true {
match (&self.buffer_pos, &0) {
(left_val, right_val) => {
if !(*left_val == *right_val) {
let kind = ::core::panicking::AssertKind::Eq;
::core::panicking::assert_failed(kind, &*left_val,
&*right_val, ::core::option::Option::None);
}
}
};
};debug_assert_eq!(self.buffer_pos, 0);
1019let c_and_trie_val = self.pending.take()?;
1020Some(self.decomposing_next(c_and_trie_val))
1021 }
1022}
10231024/// An iterator adaptor that turns an `Iterator` over `char` into
1025/// a lazily-decomposed and then canonically composed `char` sequence.
1026#[derive(#[automatically_derived]
impl<'data, I: ::core::fmt::Debug> ::core::fmt::Debug for
Composition<'data, I> where I: Iterator<Item = char> {
#[inline]
fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result {
::core::fmt::Formatter::debug_struct_field4_finish(f, "Composition",
"decomposition", &self.decomposition, "canonical_compositions",
&self.canonical_compositions, "unprocessed_starter",
&self.unprocessed_starter, "composition_passthrough_bound",
&&self.composition_passthrough_bound)
}
}Debug)]
1027pub struct Composition<'data, I>
1028where
1029I: Iterator<Item = char>,
1030{
1031/// The decomposing part of the normalizer than operates before
1032 /// the canonical composition is performed on its output.
1033decomposition: Decomposition<'data, I>,
1034/// Non-Hangul canonical composition data.
1035canonical_compositions: Char16Trie<'data>,
1036/// To make `next()` yield in cases where there's a non-composing
1037 /// starter in the decomposition buffer, we put it here to let it
1038 /// wait for the next `next()` call (or a jump forward within the
1039 /// `next()` call).
1040unprocessed_starter: Option<char>,
1041/// The lowest character for which any one of the following does
1042 /// not hold:
1043 /// 1. Roundtrips via decomposition and recomposition.
1044 /// 2. Decomposition starts with a non-starter
1045 /// 3. Is not a backward-combining starter
1046composition_passthrough_bound: u32,
1047}
10481049impl<'data, I> Composition<'data, I>
1050where
1051I: Iterator<Item = char>,
1052{
1053fn new(
1054 decomposition: Decomposition<'data, I>,
1055 canonical_compositions: Char16Trie<'data>,
1056 composition_passthrough_bound: u16,
1057 ) -> Self {
1058Self {
1059decomposition,
1060canonical_compositions,
1061 unprocessed_starter: None,
1062 composition_passthrough_bound: u32::from(composition_passthrough_bound),
1063 }
1064 }
10651066/// Performs canonical composition (including Hangul) on a pair of
1067 /// characters or returns `None` if these characters don't compose.
1068 /// Composition exclusions are taken into account.
1069#[inline(always)]
1070pub fn compose(&self, starter: char, second: char) -> Option<char> {
1071compose(self.canonical_compositions.iter(), starter, second)
1072 }
10731074/// Performs (non-Hangul) canonical composition on a pair of characters
1075 /// or returns `None` if these characters don't compose. Composition
1076 /// exclusions are taken into account.
1077#[inline(always)]
1078fn compose_non_hangul(&self, starter: char, second: char) -> Option<char> {
1079compose_non_hangul(self.canonical_compositions.iter(), starter, second)
1080 }
1081}
10821083impl<I> Iteratorfor Composition<'_, I>
1084where
1085I: Iterator<Item = char>,
1086{
1087type Item = char;
10881089#[inline]
1090fn next(&mut self) -> Option<char> {
1091let mut undecomposed_starter = CharacterAndTrieValue::new('\u{0}', 0); // The compiler can't figure out that this gets overwritten before use.
1092if self.unprocessed_starter.is_none() {
1093// The loop is only broken out of as goto forward
1094#[expect(clippy::never_loop)]
1095loop {
1096if let Some((character, ccc)) = self1097 .decomposition
1098 .buffer
1099 .get(self.decomposition.buffer_pos)
1100 .map(|c| c.character_and_ccc())
1101 {
1102self.decomposition.buffer_pos += 1;
1103if self.decomposition.buffer_pos == self.decomposition.buffer.len() {
1104self.decomposition.buffer.clear();
1105self.decomposition.buffer_pos = 0;
1106 }
1107if ccc == CCC_NOT_REORDERED {
1108// Previous decomposition contains a starter. This must
1109 // now become the `unprocessed_starter` for it to have
1110 // a chance to compose with the upcoming characters.
1111 //
1112 // E.g. parenthesized Hangul in NFKC comes through here,
1113 // but suitable composition exclusion could exercise this
1114 // in NFC.
1115self.unprocessed_starter = Some(character);
1116break; // We already have a starter, so skip taking one from `pending`.
1117}
1118return Some(character);
1119 }
1120if true {
match (&self.decomposition.buffer_pos, &0) {
(left_val, right_val) => {
if !(*left_val == *right_val) {
let kind = ::core::panicking::AssertKind::Eq;
::core::panicking::assert_failed(kind, &*left_val,
&*right_val, ::core::option::Option::None);
}
}
};
};debug_assert_eq!(self.decomposition.buffer_pos, 0);
1121undecomposed_starter = self.decomposition.pending.take()?;
1122if u32::from(undecomposed_starter.character) < self.composition_passthrough_bound
1123 || undecomposed_starter.potential_passthrough()
1124 {
1125// TODO(#2385): In the NFC case (moot for NFKC and UTS46), if the upcoming
1126 // character is not below `decomposition_passthrough_bound` but is
1127 // below `composition_passthrough_bound`, we read from the trie
1128 // unnecessarily.
1129if let Some(upcoming) = self.decomposition.delegate_next_no_pending() {
1130let cannot_combine_backwards = u32::from(upcoming.character)
1131 < self.composition_passthrough_bound
1132 || !upcoming.can_combine_backwards();
1133self.decomposition.pending = Some(upcoming);
1134if cannot_combine_backwards {
1135// Fast-track succeeded!
1136return Some(undecomposed_starter.character);
1137 }
1138 } else {
1139// End of stream
1140return Some(undecomposed_starter.character);
1141 }
1142 }
1143break; // Not actually looping
1144}
1145 }
1146let mut starter = '\u{0}'; // The compiler can't figure out this gets overwritten before use.
11471148 // The point of having this boolean is to have only one call site to
1149 // `self.decomposition.decomposing_next`, which is hopefully beneficial for
1150 // code size under inlining.
1151let mut attempt_composition = false;
1152loop {
1153if let Some(unprocessed) = self.unprocessed_starter.take() {
1154if true {
match (&undecomposed_starter, &CharacterAndTrieValue::new('\u{0}', 0)) {
(left_val, right_val) => {
if !(*left_val == *right_val) {
let kind = ::core::panicking::AssertKind::Eq;
::core::panicking::assert_failed(kind, &*left_val,
&*right_val, ::core::option::Option::None);
}
}
};
};debug_assert_eq!(undecomposed_starter, CharacterAndTrieValue::new('\u{0}', 0));
1155if true {
match (&starter, &'\u{0}') {
(left_val, right_val) => {
if !(*left_val == *right_val) {
let kind = ::core::panicking::AssertKind::Eq;
::core::panicking::assert_failed(kind, &*left_val,
&*right_val, ::core::option::Option::None);
}
}
};
};debug_assert_eq!(starter, '\u{0}');
1156starter = unprocessed;
1157 } else {
1158if true {
match (&self.decomposition.buffer_pos, &0) {
(left_val, right_val) => {
if !(*left_val == *right_val) {
let kind = ::core::panicking::AssertKind::Eq;
::core::panicking::assert_failed(kind, &*left_val,
&*right_val, ::core::option::Option::None);
}
}
};
};debug_assert_eq!(self.decomposition.buffer_pos, 0);
1159let next_starter = self.decomposition.decomposing_next(undecomposed_starter);
1160if !attempt_composition {
1161starter = next_starter;
1162 } else if let Some(composed) = self.compose(starter, next_starter) {
1163starter = composed;
1164 } else {
1165// This is our yield point. We'll pick this up above in the
1166 // next call to `next()`.
1167self.unprocessed_starter = Some(next_starter);
1168return Some(starter);
1169 }
1170 }
1171// We first loop by index to avoid moving the contents of `buffer`, but
1172 // if there's a discontiguous match, we'll start modifying `buffer` instead.
1173loop {
1174let (character, ccc) = if let Some((character, ccc)) = self1175 .decomposition
1176 .buffer
1177 .get(self.decomposition.buffer_pos)
1178 .map(|c| c.character_and_ccc())
1179 {
1180 (character, ccc)
1181 } else {
1182self.decomposition.buffer.clear();
1183self.decomposition.buffer_pos = 0;
1184break;
1185 };
1186if let Some(composed) = self.compose(starter, character) {
1187starter = composed;
1188self.decomposition.buffer_pos += 1;
1189continue;
1190 }
1191let mut most_recent_skipped_ccc = ccc;
1192 {
1193let _ = self1194 .decomposition
1195 .buffer
1196 .drain(0..self.decomposition.buffer_pos);
1197 }
1198self.decomposition.buffer_pos = 0;
1199if most_recent_skipped_ccc == CCC_NOT_REORDERED {
1200// We failed to compose a starter. Discontiguous match not allowed.
1201 // We leave the starter in `buffer` for `next()` to find.
1202return Some(starter);
1203 }
1204let mut i = 1; // We have skipped one non-starter.
1205while let Some((character, ccc)) = self
1206.decomposition
1207 .buffer
1208 .get(i)
1209 .map(|c| c.character_and_ccc())
1210 {
1211if ccc == CCC_NOT_REORDERED {
1212// Discontiguous match not allowed.
1213return Some(starter);
1214 }
1215if true {
if !(ccc >= most_recent_skipped_ccc) {
::core::panicking::panic("assertion failed: ccc >= most_recent_skipped_ccc")
};
};debug_assert!(ccc >= most_recent_skipped_ccc);
1216if ccc != most_recent_skipped_ccc {
1217// Using the non-Hangul version as a micro-optimization, since
1218 // we already rejected the case where `second` is a starter
1219 // above, and conjoining jamo are starters.
1220if let Some(composed) = self.compose_non_hangul(starter, character) {
1221self.decomposition.buffer.remove(i);
1222 starter = composed;
1223continue;
1224 }
1225 }
1226 most_recent_skipped_ccc = ccc;
1227 i += 1;
1228 }
1229break;
1230 }
12311232if true {
match (&self.decomposition.buffer_pos, &0) {
(left_val, right_val) => {
if !(*left_val == *right_val) {
let kind = ::core::panicking::AssertKind::Eq;
::core::panicking::assert_failed(kind, &*left_val,
&*right_val, ::core::option::Option::None);
}
}
};
};debug_assert_eq!(self.decomposition.buffer_pos, 0);
12331234if !self.decomposition.buffer.is_empty() {
1235return Some(starter);
1236 }
1237// Now we need to check if composition with an upcoming starter is possible.
1238#[expect(clippy::unwrap_used)]
1239if self.decomposition.pending.is_some() {
1240// We know that `pending_starter` decomposes to start with a starter.
1241 // Otherwise, it would have been moved to `self.decomposition.buffer`
1242 // by `self.decomposing_next()`. We do this set lookup here in order
1243 // to get an opportunity to go back to the fast track.
1244 // Note that this check has to happen _after_ checking that `pending`
1245 // holds a character, because this flag isn't defined to be meaningful
1246 // when `pending` isn't holding a character.
1247let pending = self.decomposition.pending.as_ref().unwrap();
1248if u32::from(pending.character) < self.composition_passthrough_bound
1249 || !pending.can_combine_backwards()
1250 {
1251// Won't combine backwards anyway.
1252return Some(starter);
1253 }
1254// Consume what we peeked. `unwrap` OK, because we checked `is_some()`
1255 // above.
1256undecomposed_starter = self.decomposition.pending.take().unwrap();
1257// The following line is OK, because we're about to loop back
1258 // to `self.decomposition.decomposing_next(c);`, which will
1259 // restore the between-`next()`-calls invariant of `pending`
1260 // before this function returns.
1261attempt_composition = true;
1262continue;
1263 }
1264// End of input
1265return Some(starter);
1266 }
1267 }
1268}
/// Generates a `normalize_to`-style method that writes the composed
/// normalization of `$text` into `$sink`, with a caller-supplied
/// `$fast` block providing the format-specific (UTF-8/UTF-16/str)
/// passthrough fast path.
macro_rules! composing_normalize_to {
    ($(#[$meta:meta])*,
     $normalize_to:ident,
     $write:path,
     $slice:ty,
     $prolog:block,
     $always_valid_utf:literal,
     $as_slice:ident,
     $fast:block,
     $text:ident,
     $sink:ident,
     $composition:ident,
     $composition_passthrough_bound:ident,
     $undecomposed_starter:ident,
     $pending_slice:ident,
     $len_utf:ident,
    ) => {
        $(#[$meta])*
        pub fn $normalize_to<W: $write + ?Sized>(
            &self,
            $text: $slice,
            $sink: &mut W,
        ) -> core::fmt::Result {
            $prolog
            let mut $composition = self.normalize_iter($text.chars());
            debug_assert_eq!(
                $composition.decomposition.ignorable_behavior,
                IgnorableBehavior::Unsupported
            );
            for cc in $composition.decomposition.buffer.drain(..) {
                $sink.write_char(cc.character())?;
            }

            // Try to get the compiler to hoist the bound to a register.
            let $composition_passthrough_bound = $composition.composition_passthrough_bound;
            'outer: loop {
                debug_assert_eq!($composition.decomposition.buffer_pos, 0);
                let mut $undecomposed_starter =
                    if let Some(pending) = $composition.decomposition.pending.take() {
                        pending
                    } else {
                        return Ok(());
                    };
                if u32::from($undecomposed_starter.character) < $composition_passthrough_bound
                    || $undecomposed_starter.potential_passthrough()
                {
                    // We don't know if a `REPLACEMENT_CHARACTER` occurred in the slice or
                    // was returned in response to an error by the iterator. Assume the
                    // latter for correctness even though it pessimizes the former.
                    if $always_valid_utf || $undecomposed_starter.character != REPLACEMENT_CHARACTER {
                        let $pending_slice = &$text[$text.len()
                            - $composition.decomposition.delegate.$as_slice().len()
                            - $undecomposed_starter.character.$len_utf()..];
                        // The `$fast` block must either:
                        // 1. Return due to reaching EOF
                        // 2. Leave a starter with its trie value in `$undecomposed_starter`
                        //    and, if there is still more input, leave the next character
                        //    and its trie value in `$composition.decomposition.pending`.
                        $fast
                    }
                }
                // Fast track above, full algorithm below
                let mut starter = $composition
                    .decomposition
                    .decomposing_next($undecomposed_starter);
                'bufferloop: loop {
                    // We first loop by index to avoid moving the contents of `buffer`, but
                    // if there's a discontiguous match, we'll start modifying `buffer` instead.
                    loop {
                        let (character, ccc) = if let Some((character, ccc)) = $composition
                            .decomposition
                            .buffer
                            .get($composition.decomposition.buffer_pos)
                            .map(|c| c.character_and_ccc())
                        {
                            (character, ccc)
                        } else {
                            $composition.decomposition.buffer.clear();
                            $composition.decomposition.buffer_pos = 0;
                            break;
                        };
                        if let Some(composed) = $composition.compose(starter, character) {
                            starter = composed;
                            $composition.decomposition.buffer_pos += 1;
                            continue;
                        }
                        let mut most_recent_skipped_ccc = ccc;
                        if most_recent_skipped_ccc == CCC_NOT_REORDERED {
                            // We failed to compose a starter. Discontiguous match not allowed.
                            // Write the current `starter` we've been composing, make the unmatched
                            // starter in the buffer the new `starter` (we know it's been decomposed)
                            // and process the rest of the buffer with that as the starter.
                            $sink.write_char(starter)?;
                            starter = character;
                            $composition.decomposition.buffer_pos += 1;
                            continue 'bufferloop;
                        } else {
                            {
                                let _ = $composition
                                    .decomposition
                                    .buffer
                                    .drain(0..$composition.decomposition.buffer_pos);
                            }
                            $composition.decomposition.buffer_pos = 0;
                        }
                        let mut i = 1; // We have skipped one non-starter.
                        while let Some((character, ccc)) = $composition
                            .decomposition
                            .buffer
                            .get(i)
                            .map(|c| c.character_and_ccc())
                        {
                            if ccc == CCC_NOT_REORDERED {
                                // Discontiguous match not allowed.
                                $sink.write_char(starter)?;
                                for cc in $composition.decomposition.buffer.drain(..i) {
                                    $sink.write_char(cc.character())?;
                                }
                                starter = character;
                                {
                                    let removed = $composition.decomposition.buffer.remove(0);
                                    debug_assert_eq!(starter, removed.character());
                                }
                                debug_assert_eq!($composition.decomposition.buffer_pos, 0);
                                continue 'bufferloop;
                            }
                            debug_assert!(ccc >= most_recent_skipped_ccc);
                            if ccc != most_recent_skipped_ccc {
                                // Using the non-Hangul version as a micro-optimization, since
                                // we already rejected the case where `second` is a starter
                                // above, and conjoining jamo are starters.
                                if let Some(composed) =
                                    $composition.compose_non_hangul(starter, character)
                                {
                                    $composition.decomposition.buffer.remove(i);
                                    starter = composed;
                                    continue;
                                }
                            }
                            most_recent_skipped_ccc = ccc;
                            i += 1;
                        }
                        break;
                    }
                    debug_assert_eq!($composition.decomposition.buffer_pos, 0);

                    if !$composition.decomposition.buffer.is_empty() {
                        $sink.write_char(starter)?;
                        for cc in $composition.decomposition.buffer.drain(..) {
                            $sink.write_char(cc.character())?;
                        }
                        // We had non-empty buffer, so can't compose with upcoming.
                        continue 'outer;
                    }
                    // Now we need to check if composition with an upcoming starter is possible.
                    if $composition.decomposition.pending.is_some() {
                        // We know that `pending_starter` decomposes to start with a starter.
                        // Otherwise, it would have been moved to `composition.decomposition.buffer`
                        // by `composition.decomposing_next()`. We do this set lookup here in order
                        // to get an opportunity to go back to the fast track.
                        // Note that this check has to happen _after_ checking that `pending`
                        // holds a character, because this flag isn't defined to be meaningful
                        // when `pending` isn't holding a character.
                        let pending = $composition.decomposition.pending.as_ref().unwrap();
                        if u32::from(pending.character) < $composition.composition_passthrough_bound
                            || !pending.can_combine_backwards()
                        {
                            // Won't combine backwards anyway.
                            $sink.write_char(starter)?;
                            continue 'outer;
                        }
                        let pending_starter = $composition.decomposition.pending.take().unwrap();
                        let decomposed = $composition.decomposition.decomposing_next(pending_starter);
                        if let Some(composed) = $composition.compose(starter, decomposed) {
                            starter = composed;
                        } else {
                            $sink.write_char(starter)?;
                            starter = decomposed;
                        }
                        continue 'bufferloop;
                    }
                    // End of input
                    $sink.write_char(starter)?;
                    return Ok(());
                } // 'bufferloop
            }
        }
    };
}
/// Generates a `normalize_to`-style method that writes the decomposed
/// normalization of `$text` into `$sink`, with a caller-supplied
/// `$fast` block providing the format-specific passthrough fast path.
macro_rules! decomposing_normalize_to {
    ($(#[$meta:meta])*,
     $normalize_to:ident,
     $write:path,
     $slice:ty,
     $prolog:block,
     $as_slice:ident,
     $fast:block,
     $text:ident,
     $sink:ident,
     $decomposition:ident,
     $decomposition_passthrough_bound:ident,
     $undecomposed_starter:ident,
     $pending_slice:ident,
     $outer:lifetime, // loop labels use lifetime tokens
    ) => {
        $(#[$meta])*
        pub fn $normalize_to<W: $write + ?Sized>(
            &self,
            $text: $slice,
            $sink: &mut W,
        ) -> core::fmt::Result {
            $prolog

            let mut $decomposition = self.normalize_iter($text.chars());
            debug_assert_eq!($decomposition.ignorable_behavior, IgnorableBehavior::Unsupported);

            // Try to get the compiler to hoist the bound to a register.
            let $decomposition_passthrough_bound = $decomposition.decomposition_passthrough_bound;
            $outer: loop {
                for cc in $decomposition.buffer.drain(..) {
                    $sink.write_char(cc.character())?;
                }
                debug_assert_eq!($decomposition.buffer_pos, 0);
                let mut $undecomposed_starter = if let Some(pending) = $decomposition.pending.take() {
                    pending
                } else {
                    return Ok(());
                };
                if $undecomposed_starter.starter_and_decomposes_to_self() {
                    // Don't bother including `undecomposed_starter` in a contiguous buffer
                    // write: Just write it right away:
                    $sink.write_char($undecomposed_starter.character)?;

                    let $pending_slice = $decomposition.delegate.$as_slice();
                    $fast
                }
                let starter = $decomposition.decomposing_next($undecomposed_starter);
                $sink.write_char(starter)?;
            }
        }
    };
}
/// Generates the public convenience methods (`normalize`, `is_normalized`,
/// `split_normalized` and their UTF-8/UTF-16 variants) shared by the
/// composing and decomposing normalizer types. The generated methods
/// delegate to the type's `normalize_to`/`normalize_utf8_to`/
/// `normalize_utf16_to` implementations.
macro_rules! normalizer_methods {
    () => {
        /// Normalize a string slice into a `Cow<'a, str>`.
        pub fn normalize<'a>(&self, text: &'a str) -> Cow<'a, str> {
            let (head, tail) = self.split_normalized(text);
            if tail.is_empty() {
                return Cow::Borrowed(head);
            }
            let mut ret = String::with_capacity(text.len());
            ret.push_str(head);
            let _ = self.normalize_to(tail, &mut ret);
            Cow::Owned(ret)
        }

        /// Split a string slice into maximum normalized prefix and unnormalized suffix
        /// such that the concatenation of the prefix and the normalization of the suffix
        /// is the normalization of the whole input.
        pub fn split_normalized<'a>(&self, text: &'a str) -> (&'a str, &'a str) {
            let up_to = self.is_normalized_up_to(text);
            text.split_at_checked(up_to).unwrap_or_else(|| {
                // Internal bug, not even GIGO, never supposed to happen
                debug_assert!(false);
                ("", text)
            })
        }

        /// Return the index a string slice is normalized up to.
        fn is_normalized_up_to(&self, text: &str) -> usize {
            let mut sink = IsNormalizedSinkStr::new(text);
            let _ = self.normalize_to(text, &mut sink);
            text.len() - sink.remaining_len()
        }

        /// Check whether a string slice is normalized.
        pub fn is_normalized(&self, text: &str) -> bool {
            self.is_normalized_up_to(text) == text.len()
        }

        /// Normalize a slice of potentially-invalid UTF-16 into a `Cow<'a, [u16]>`.
        ///
        /// Unpaired surrogates are mapped to the REPLACEMENT CHARACTER
        /// before normalizing.
        ///
        /// ✨ *Enabled with the `utf16_iter` Cargo feature.*
        #[cfg(feature = "utf16_iter")]
        pub fn normalize_utf16<'a>(&self, text: &'a [u16]) -> Cow<'a, [u16]> {
            let (head, tail) = self.split_normalized_utf16(text);
            if tail.is_empty() {
                return Cow::Borrowed(head);
            }
            let mut ret = alloc::vec::Vec::with_capacity(text.len());
            ret.extend_from_slice(head);
            let _ = self.normalize_utf16_to(tail, &mut ret);
            Cow::Owned(ret)
        }

        /// Split a slice of potentially-invalid UTF-16 into maximum normalized (and valid)
        /// prefix and unnormalized suffix such that the concatenation of the prefix and the
        /// normalization of the suffix is the normalization of the whole input.
        ///
        /// ✨ *Enabled with the `utf16_iter` Cargo feature.*
        #[cfg(feature = "utf16_iter")]
        pub fn split_normalized_utf16<'a>(&self, text: &'a [u16]) -> (&'a [u16], &'a [u16]) {
            let up_to = self.is_normalized_utf16_up_to(text);
            text.split_at_checked(up_to).unwrap_or_else(|| {
                // Internal bug, not even GIGO, never supposed to happen
                debug_assert!(false);
                (&[], text)
            })
        }

        /// Return the index a slice of potentially-invalid UTF-16 is normalized up to.
        ///
        /// ✨ *Enabled with the `utf16_iter` Cargo feature.*
        #[cfg(feature = "utf16_iter")]
        fn is_normalized_utf16_up_to(&self, text: &[u16]) -> usize {
            let mut sink = IsNormalizedSinkUtf16::new(text);
            let _ = self.normalize_utf16_to(text, &mut sink);
            text.len() - sink.remaining_len()
        }

        /// Checks whether a slice of potentially-invalid UTF-16 is normalized.
        ///
        /// Unpaired surrogates are treated as the REPLACEMENT CHARACTER.
        ///
        /// ✨ *Enabled with the `utf16_iter` Cargo feature.*
        #[cfg(feature = "utf16_iter")]
        pub fn is_normalized_utf16(&self, text: &[u16]) -> bool {
            self.is_normalized_utf16_up_to(text) == text.len()
        }

        /// Normalize a slice of potentially-invalid UTF-8 into a `Cow<'a, str>`.
        ///
        /// Ill-formed byte sequences are mapped to the REPLACEMENT CHARACTER
        /// according to the WHATWG Encoding Standard.
        ///
        /// ✨ *Enabled with the `utf8_iter` Cargo feature.*
        #[cfg(feature = "utf8_iter")]
        pub fn normalize_utf8<'a>(&self, text: &'a [u8]) -> Cow<'a, str> {
            let (head, tail) = self.split_normalized_utf8(text);
            if tail.is_empty() {
                return Cow::Borrowed(head);
            }
            let mut ret = String::with_capacity(text.len());
            ret.push_str(head);
            let _ = self.normalize_utf8_to(tail, &mut ret);
            Cow::Owned(ret)
        }

        /// Split a slice of potentially-invalid UTF-8 into maximum normalized (and valid)
        /// prefix and unnormalized suffix such that the concatenation of the prefix and the
        /// normalization of the suffix is the normalization of the whole input.
        ///
        /// ✨ *Enabled with the `utf8_iter` Cargo feature.*
        #[cfg(feature = "utf8_iter")]
        pub fn split_normalized_utf8<'a>(&self, text: &'a [u8]) -> (&'a str, &'a [u8]) {
            let up_to = self.is_normalized_utf8_up_to(text);
            let (head, tail) = text.split_at_checked(up_to).unwrap_or_else(|| {
                // Internal bug, not even GIGO, never supposed to happen
                debug_assert!(false);
                (&[], text)
            });
            // SAFETY: The normalization check also checks for
            // UTF-8 well-formedness.
            (unsafe { core::str::from_utf8_unchecked(head) }, tail)
        }

        /// Return the index a slice of potentially-invalid UTF-8 is normalized up to
        ///
        /// ✨ *Enabled with the `utf8_iter` Cargo feature.*
        #[cfg(feature = "utf8_iter")]
        fn is_normalized_utf8_up_to(&self, text: &[u8]) -> usize {
            let mut sink = IsNormalizedSinkUtf8::new(text);
            let _ = self.normalize_utf8_to(text, &mut sink);
            text.len() - sink.remaining_len()
        }

        /// Check if a slice of potentially-invalid UTF-8 is normalized.
        ///
        /// Ill-formed byte sequences are mapped to the REPLACEMENT CHARACTER
        /// according to the WHATWG Encoding Standard before checking.
        ///
        /// ✨ *Enabled with the `utf8_iter` Cargo feature.*
        #[cfg(feature = "utf8_iter")]
        pub fn is_normalized_utf8(&self, text: &[u8]) -> bool {
            self.is_normalized_utf8_up_to(text) == text.len()
        }
    };
}
16601661/// Borrowed version of a normalizer for performing decomposing normalization.
1662#[derive(#[automatically_derived]
impl<'a> ::core::fmt::Debug for DecomposingNormalizerBorrowed<'a> {
#[inline]
fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result {
::core::fmt::Formatter::debug_struct_field5_finish(f,
"DecomposingNormalizerBorrowed", "decompositions",
&self.decompositions, "tables", &self.tables,
"supplementary_tables", &self.supplementary_tables,
"decomposition_passthrough_bound",
&self.decomposition_passthrough_bound,
"composition_passthrough_bound",
&&self.composition_passthrough_bound)
}
}Debug)]
1663pub struct DecomposingNormalizerBorrowed<'a> {
1664 decompositions: &'a DecompositionData<'a>,
1665 tables: &'a DecompositionTables<'a>,
1666 supplementary_tables: Option<&'a DecompositionTables<'a>>,
1667 decomposition_passthrough_bound: u8, // never above 0xC0
1668composition_passthrough_bound: u16, // never above 0x0300
1669}
16701671impl DecomposingNormalizerBorrowed<'static> {
1672/// Cheaply converts a [`DecomposingNormalizerBorrowed<'static>`] into a [`DecomposingNormalizer`].
1673 ///
1674 /// Note: Due to branching and indirection, using [`DecomposingNormalizer`] might inhibit some
1675 /// compile-time optimizations that are possible with [`DecomposingNormalizerBorrowed`].
1676pub const fn static_to_owned(self) -> DecomposingNormalizer {
1677DecomposingNormalizer {
1678 decompositions: DataPayload::from_static_ref(self.decompositions),
1679 tables: DataPayload::from_static_ref(self.tables),
1680 supplementary_tables: if let Some(s) = self.supplementary_tables {
1681// `map` not available in const context
1682Some(DataPayload::from_static_ref(s))
1683 } else {
1684None1685 },
1686 decomposition_passthrough_bound: self.decomposition_passthrough_bound,
1687 composition_passthrough_bound: self.composition_passthrough_bound,
1688 }
1689 }
16901691/// NFD constructor using compiled data.
1692 ///
1693 /// ✨ *Enabled with the `compiled_data` Cargo feature.*
1694 ///
1695 /// [📚 Help choosing a constructor](icu_provider::constructors)
1696#[cfg(feature = "compiled_data")]
1697pub const fn new_nfd() -> Self {
1698const _: () = if !(crate::provider::Baked::SINGLETON_NORMALIZER_NFD_TABLES_V1.scalars16.const_len()
+
crate::provider::Baked::SINGLETON_NORMALIZER_NFD_TABLES_V1.scalars24.const_len()
<= 0xFFF) {
{ ::core::panicking::panic_fmt(format_args!("future extension")); }
}assert!(
1699crate::provider::Baked::SINGLETON_NORMALIZER_NFD_TABLES_V1
1700 .scalars16
1701 .const_len()
1702 + crate::provider::Baked::SINGLETON_NORMALIZER_NFD_TABLES_V1
1703 .scalars24
1704 .const_len()
1705 <= 0xFFF,
1706"future extension"
1707);
17081709DecomposingNormalizerBorrowed {
1710 decompositions: crate::provider::Baked::SINGLETON_NORMALIZER_NFD_DATA_V1,
1711 tables: crate::provider::Baked::SINGLETON_NORMALIZER_NFD_TABLES_V1,
1712 supplementary_tables: None,
1713 decomposition_passthrough_bound: 0xC0,
1714 composition_passthrough_bound: 0x0300,
1715 }
1716 }
17171718/// NFKD constructor using compiled data.
1719 ///
1720 /// ✨ *Enabled with the `compiled_data` Cargo feature.*
1721 ///
1722 /// [📚 Help choosing a constructor](icu_provider::constructors)
1723#[cfg(feature = "compiled_data")]
1724pub const fn new_nfkd() -> Self {
1725const _: () = if !(crate::provider::Baked::SINGLETON_NORMALIZER_NFD_TABLES_V1.scalars16.const_len()
+
crate::provider::Baked::SINGLETON_NORMALIZER_NFD_TABLES_V1.scalars24.const_len()
+
crate::provider::Baked::SINGLETON_NORMALIZER_NFKD_TABLES_V1.scalars16.const_len()
+
crate::provider::Baked::SINGLETON_NORMALIZER_NFKD_TABLES_V1.scalars24.const_len()
<= 0xFFF) {
{ ::core::panicking::panic_fmt(format_args!("future extension")); }
}assert!(
1726crate::provider::Baked::SINGLETON_NORMALIZER_NFD_TABLES_V1
1727 .scalars16
1728 .const_len()
1729 + crate::provider::Baked::SINGLETON_NORMALIZER_NFD_TABLES_V1
1730 .scalars24
1731 .const_len()
1732 + crate::provider::Baked::SINGLETON_NORMALIZER_NFKD_TABLES_V1
1733 .scalars16
1734 .const_len()
1735 + crate::provider::Baked::SINGLETON_NORMALIZER_NFKD_TABLES_V1
1736 .scalars24
1737 .const_len()
1738 <= 0xFFF,
1739"future extension"
1740);
17411742const _: () = if !(crate::provider::Baked::SINGLETON_NORMALIZER_NFKD_DATA_V1.passthrough_cap
<= 0x0300) {
{ ::core::panicking::panic_fmt(format_args!("invalid")); }
}assert!(
1743crate::provider::Baked::SINGLETON_NORMALIZER_NFKD_DATA_V1.passthrough_cap <= 0x0300,
1744"invalid"
1745);
17461747let decomposition_capped =
1748if crate::provider::Baked::SINGLETON_NORMALIZER_NFKD_DATA_V1.passthrough_cap < 0xC0 {
1749crate::provider::Baked::SINGLETON_NORMALIZER_NFKD_DATA_V1.passthrough_cap
1750 } else {
17510xC0
1752};
1753let composition_capped =
1754if crate::provider::Baked::SINGLETON_NORMALIZER_NFKD_DATA_V1.passthrough_cap < 0x0300 {
1755crate::provider::Baked::SINGLETON_NORMALIZER_NFKD_DATA_V1.passthrough_cap
1756 } else {
17570x0300
1758};
17591760DecomposingNormalizerBorrowed {
1761 decompositions: crate::provider::Baked::SINGLETON_NORMALIZER_NFKD_DATA_V1,
1762 tables: crate::provider::Baked::SINGLETON_NORMALIZER_NFD_TABLES_V1,
1763 supplementary_tables: Some(crate::provider::Baked::SINGLETON_NORMALIZER_NFKD_TABLES_V1),
1764 decomposition_passthrough_bound: decomposition_cappedas u8,
1765 composition_passthrough_bound: composition_capped,
1766 }
1767 }
17681769#[cfg(feature = "compiled_data")]
1770pub(crate) const fn new_uts46_decomposed() -> Self {
1771const _: () = if !(crate::provider::Baked::SINGLETON_NORMALIZER_NFD_TABLES_V1.scalars16.const_len()
+
crate::provider::Baked::SINGLETON_NORMALIZER_NFD_TABLES_V1.scalars24.const_len()
+
crate::provider::Baked::SINGLETON_NORMALIZER_NFKD_TABLES_V1.scalars16.const_len()
+
crate::provider::Baked::SINGLETON_NORMALIZER_NFKD_TABLES_V1.scalars24.const_len()
<= 0xFFF) {
{ ::core::panicking::panic_fmt(format_args!("future extension")); }
}assert!(
1772crate::provider::Baked::SINGLETON_NORMALIZER_NFD_TABLES_V1
1773 .scalars16
1774 .const_len()
1775 + crate::provider::Baked::SINGLETON_NORMALIZER_NFD_TABLES_V1
1776 .scalars24
1777 .const_len()
1778 + crate::provider::Baked::SINGLETON_NORMALIZER_NFKD_TABLES_V1
1779 .scalars16
1780 .const_len()
1781 + crate::provider::Baked::SINGLETON_NORMALIZER_NFKD_TABLES_V1
1782 .scalars24
1783 .const_len()
1784 <= 0xFFF,
1785"future extension"
1786);
17871788const _: () = if !(crate::provider::Baked::SINGLETON_NORMALIZER_UTS46_DATA_V1.passthrough_cap
<= 0x0300) {
{ ::core::panicking::panic_fmt(format_args!("invalid")); }
}assert!(
1789crate::provider::Baked::SINGLETON_NORMALIZER_UTS46_DATA_V1.passthrough_cap <= 0x0300,
1790"invalid"
1791);
17921793let decomposition_capped =
1794if crate::provider::Baked::SINGLETON_NORMALIZER_UTS46_DATA_V1.passthrough_cap < 0xC0 {
1795crate::provider::Baked::SINGLETON_NORMALIZER_UTS46_DATA_V1.passthrough_cap
1796 } else {
17970xC0
1798};
1799let composition_capped = if crate::provider::Baked::SINGLETON_NORMALIZER_UTS46_DATA_V11800 .passthrough_cap
1801 < 0x0300
1802{
1803crate::provider::Baked::SINGLETON_NORMALIZER_UTS46_DATA_V1.passthrough_cap
1804 } else {
18050x0300
1806};
18071808DecomposingNormalizerBorrowed {
1809 decompositions: crate::provider::Baked::SINGLETON_NORMALIZER_UTS46_DATA_V1,
1810 tables: crate::provider::Baked::SINGLETON_NORMALIZER_NFD_TABLES_V1,
1811 supplementary_tables: Some(crate::provider::Baked::SINGLETON_NORMALIZER_NFKD_TABLES_V1),
1812 decomposition_passthrough_bound: decomposition_cappedas u8,
1813 composition_passthrough_bound: composition_capped,
1814 }
1815 }
1816}
18171818impl<'data> DecomposingNormalizerBorrowed<'data> {
1819/// NFD constructor using already-loaded data.
1820 ///
1821 /// This constructor is intended for use by collations.
1822 ///
1823 /// [📚 Help choosing a constructor](icu_provider::constructors)
1824#[doc(hidden)]
1825pub fn new_with_data(
1826 decompositions: &'data DecompositionData<'data>,
1827 tables: &'data DecompositionTables<'data>,
1828 ) -> Self {
1829Self {
1830decompositions,
1831tables,
1832 supplementary_tables: None,
1833 decomposition_passthrough_bound: 0xC0,
1834 composition_passthrough_bound: 0x0300,
1835 }
1836 }
18371838/// Wraps a delegate iterator into a decomposing iterator
1839 /// adapter by using the data already held by this normalizer.
1840pub fn normalize_iter<I: Iterator<Item = char>>(&self, iter: I) -> Decomposition<'data, I> {
1841Decomposition::new_with_supplements(
1842iter,
1843self.decompositions,
1844self.tables,
1845self.supplementary_tables,
1846self.decomposition_passthrough_bound,
1847 IgnorableBehavior::Unsupported,
1848 )
1849 }
18501851self
text
let (head, tail) = self.split_normalized(text);
if tail.is_empty() { return Cow::Borrowed(head); }
let mut ret = String::new();
ret.reserve(text.len());
ret.push_str(head);
let _ = self.normalize_to(tail, &mut ret);
Cow::Owned(ret);
self
text
let up_to = self.is_normalized_up_to(text);
text.split_at_checked(up_to).unwrap_or_else(||
{
if true {
if !false {
::core::panicking::panic("assertion failed: false")
};
};
("", text)
});
self
text
let mut sink = IsNormalizedSinkStr::new(text);
let _ = self.normalize_to(text, &mut sink);
text.len() - sink.remaining_len();
self
text
self.is_normalized_up_to(text) == text.len();normalizer_methods!();
18521853self
text
sink
{}
let mut decomposition = self.normalize_iter(text.chars());
if true {
match (&decomposition.ignorable_behavior, &IgnorableBehavior::Unsupported)
{
(left_val, right_val) => {
if !(*left_val == *right_val) {
let kind = ::core::panicking::AssertKind::Eq;
::core::panicking::assert_failed(kind, &*left_val,
&*right_val, ::core::option::Option::None);
}
}
};
};
let decomposition_passthrough_bound =
decomposition.decomposition_passthrough_bound;
'outer: loop {
for cc in decomposition.buffer.drain(..) {
sink.write_char(cc.character())?;
}
if true {
match (&decomposition.buffer_pos, &0) {
(left_val, right_val) => {
if !(*left_val == *right_val) {
let kind = ::core::panicking::AssertKind::Eq;
::core::panicking::assert_failed(kind, &*left_val,
&*right_val, ::core::option::Option::None);
}
}
};
};
let mut undecomposed_starter =
if let Some(pending) = decomposition.pending.take() {
pending
} else { return Ok(()); };
if undecomposed_starter.starter_and_decomposes_to_self() {
sink.write_char(undecomposed_starter.character)?;
let pending_slice = decomposition.delegate.as_str();
{
let decomposition_passthrough_byte_bound =
if decomposition_passthrough_bound == 0xC0 {
0xC3u8
} else { decomposition_passthrough_bound.min(0x80) as u8 };
#[expect(clippy::unwrap_used)]
'fast: loop {
let mut code_unit_iter =
decomposition.delegate.as_str().as_bytes().iter();
'fastest: loop {
if let Some(&upcoming_byte) = code_unit_iter.next() {
if upcoming_byte < decomposition_passthrough_byte_bound {
continue 'fastest;
}
decomposition.delegate =
pending_slice[pending_slice.len() -
code_unit_iter.as_slice().len() - 1..].chars();
break 'fastest;
}
sink.write_str(pending_slice)?;
return Ok(());
}
let upcoming = decomposition.delegate.next().unwrap();
let upcoming_with_trie_value =
decomposition.attach_trie_value(upcoming);
if upcoming_with_trie_value.starter_and_decomposes_to_self() {
continue 'fast;
}
let consumed_so_far_slice =
&pending_slice[..pending_slice.len() -
decomposition.delegate.as_str().len() -
upcoming.len_utf8()];
sink.write_str(consumed_so_far_slice)?;
if decomposition_starts_with_non_starter(upcoming_with_trie_value.trie_val)
{
decomposition.pending = Some(upcoming_with_trie_value);
decomposition.gather_and_sort_combining(0);
continue 'outer;
}
undecomposed_starter = upcoming_with_trie_value;
if true {
if !decomposition.pending.is_none() {
::core::panicking::panic("assertion failed: decomposition.pending.is_none()")
};
};
break 'fast;
}
}
}
let starter = decomposition.decomposing_next(undecomposed_starter);
sink.write_char(starter)?;
}decomposing_normalize_to!(
1854/// Normalize a string slice into a `Write` sink.
1855,
1856 normalize_to,
1857core::fmt::Write,
1858&str,
1859 {
1860 },
1861 as_str,
1862 {
1863let decomposition_passthrough_byte_bound = if decomposition_passthrough_bound == 0xC0 {
18640xC3u8
1865} else {
1866 decomposition_passthrough_bound.min(0x80) as u8
1867 };
1868// The attribute belongs on an inner statement, but Rust doesn't allow it there.
1869#[expect(clippy::unwrap_used)]
1870'fast: loop {
1871let mut code_unit_iter = decomposition.delegate.as_str().as_bytes().iter();
1872'fastest: loop {
1873if let Some(&upcoming_byte) = code_unit_iter.next() {
1874if upcoming_byte < decomposition_passthrough_byte_bound {
1875// Fast-track succeeded!
1876continue 'fastest;
1877 }
1878// This deliberately isn't panic-free, since the code pattern
1879 // that was OK for the composing counterpart regressed
1880 // English and French performance if done here, too.
1881decomposition.delegate = pending_slice[pending_slice.len() - code_unit_iter.as_slice().len() - 1..].chars();
1882break 'fastest;
1883 }
1884// End of stream
1885sink.write_str(pending_slice)?;
1886return Ok(());
1887 }
18881889// `unwrap()` OK, because the slice is valid UTF-8 and we know there
1890 // is an upcoming byte.
1891let upcoming = decomposition.delegate.next().unwrap();
1892let upcoming_with_trie_value = decomposition.attach_trie_value(upcoming);
1893if upcoming_with_trie_value.starter_and_decomposes_to_self() {
1894continue 'fast;
1895 }
1896let consumed_so_far_slice = &pending_slice[..pending_slice.len()
1897 - decomposition.delegate.as_str().len()
1898 - upcoming.len_utf8()];
1899 sink.write_str(consumed_so_far_slice)?;
19001901// Now let's figure out if we got a starter or a non-starter.
1902if decomposition_starts_with_non_starter(
1903 upcoming_with_trie_value.trie_val,
1904 ) {
1905// Let this trie value to be reprocessed in case it is
1906 // one of the rare decomposing ones.
1907decomposition.pending = Some(upcoming_with_trie_value);
1908 decomposition.gather_and_sort_combining(0);
1909continue 'outer;
1910 }
1911 undecomposed_starter = upcoming_with_trie_value;
1912debug_assert!(decomposition.pending.is_none());
1913break 'fast;
1914 }
1915 },
1916 text,
1917 sink,
1918 decomposition,
1919 decomposition_passthrough_bound,
1920 undecomposed_starter,
1921 pending_slice,
1922'outer,
1923 );
19241925decomposing_normalize_to!(
1926/// Normalize a slice of potentially-invalid UTF-8 into a `Write` sink.
1927 ///
1928 /// Ill-formed byte sequences are mapped to the REPLACEMENT CHARACTER
1929 /// according to the WHATWG Encoding Standard.
1930 ///
1931 /// ✨ *Enabled with the `utf8_iter` Cargo feature.*
1932#[cfg(feature = "utf8_iter")]
1933,
1934 normalize_utf8_to,
1935 core::fmt::Write,
1936&[u8],
1937 {
1938 },
1939 as_slice,
1940 {
1941let decomposition_passthrough_byte_bound = decomposition_passthrough_bound.min(0x80) as u8;
1942'fast: loop {
1943let mut code_unit_iter = decomposition.delegate.as_slice().iter();
1944'fastest: loop {
1945if let Some(&upcoming_byte) = code_unit_iter.next() {
1946if upcoming_byte < decomposition_passthrough_byte_bound {
1947// Fast-track succeeded!
1948continue 'fastest;
1949 }
1950break 'fastest;
1951 }
1952// End of stream
1953sink.write_str(unsafe { core::str::from_utf8_unchecked(pending_slice) })?;
1954return Ok(());
1955 }
1956#[expect(clippy::indexing_slicing)]
1957{decomposition.delegate = pending_slice[pending_slice.len() - code_unit_iter.as_slice().len() - 1..].chars();}
19581959// `unwrap()` OK, because the slice is valid UTF-8 and we know there
1960 // is an upcoming byte.
1961#[expect(clippy::unwrap_used)]
1962let upcoming = decomposition.delegate.next().unwrap();
1963let upcoming_with_trie_value = decomposition.attach_trie_value(upcoming);
1964if upcoming_with_trie_value.starter_and_decomposes_to_self_except_replacement() {
1965// Note: The trie value of the REPLACEMENT CHARACTER is
1966 // intentionally formatted to fail the
1967 // `starter_and_decomposes_to_self` test even though it
1968 // really is a starter that decomposes to self. This
1969 // Allows moving the branch on REPLACEMENT CHARACTER
1970 // below this `continue`.
1971continue 'fast;
1972 }
19731974// TODO: Annotate as unlikely.
1975if upcoming == REPLACEMENT_CHARACTER {
1976// We might have an error, so fall out of the fast path.
19771978 // Since the U+FFFD might signify an error, we can't
1979 // assume `upcoming.len_utf8()` for the backoff length.
1980#[expect(clippy::indexing_slicing)]
1981let mut consumed_so_far = pending_slice[..pending_slice.len() - decomposition.delegate.as_slice().len()].chars();
1982let back = consumed_so_far.next_back();
1983debug_assert_eq!(back, Some(REPLACEMENT_CHARACTER));
1984let consumed_so_far_slice = consumed_so_far.as_slice();
1985 sink.write_str(unsafe { core::str::from_utf8_unchecked(consumed_so_far_slice) } )?;
19861987// We could call `gather_and_sort_combining` here and
1988 // `continue 'outer`, but this should be better for code
1989 // size.
1990undecomposed_starter = upcoming_with_trie_value;
1991debug_assert!(decomposition.pending.is_none());
1992break 'fast;
1993 }
19941995#[expect(clippy::indexing_slicing)]
1996let consumed_so_far_slice = &pending_slice[..pending_slice.len()
1997 - decomposition.delegate.as_slice().len()
1998 - upcoming.len_utf8()];
1999 sink.write_str(unsafe { core::str::from_utf8_unchecked(consumed_so_far_slice) } )?;
20002001// Now let's figure out if we got a starter or a non-starter.
2002if decomposition_starts_with_non_starter(
2003 upcoming_with_trie_value.trie_val,
2004 ) {
2005// Let this trie value to be reprocessed in case it is
2006 // one of the rare decomposing ones.
2007decomposition.pending = Some(upcoming_with_trie_value);
2008 decomposition.gather_and_sort_combining(0);
2009continue 'outer;
2010 }
2011 undecomposed_starter = upcoming_with_trie_value;
2012debug_assert!(decomposition.pending.is_none());
2013break 'fast;
2014 }
2015 },
2016 text,
2017 sink,
2018 decomposition,
2019 decomposition_passthrough_bound,
2020 undecomposed_starter,
2021 pending_slice,
2022'outer,
2023 );
20242025decomposing_normalize_to!(
2026/// Normalize a slice of potentially-invalid UTF-16 into a `Write16` sink.
2027 ///
2028 /// Unpaired surrogates are mapped to the REPLACEMENT CHARACTER
2029 /// before normalizing.
2030 ///
2031 /// ✨ *Enabled with the `utf16_iter` Cargo feature.*
2032#[cfg(feature = "utf16_iter")]
2033,
2034 normalize_utf16_to,
2035 write16::Write16,
2036&[u16],
2037 {
2038 sink.size_hint(text.len())?;
2039 },
2040 as_slice,
2041 {
2042// This loop is only broken out of as goto forward and only as release-build recovery from
2043 // detecting an internal bug without panic. (In debug builds, internal bugs panic instead.)
2044#[expect(clippy::never_loop)]
2045'fastwrap: loop {
2046// Commented out `code_unit_iter` and used `ptr` and `end` to
2047 // work around https://github.com/rust-lang/rust/issues/144684 .
2048 //
2049 // let mut code_unit_iter = decomposition.delegate.as_slice().iter();
2050let delegate_as_slice = decomposition.delegate.as_slice();
2051let mut ptr: *const u16 = delegate_as_slice.as_ptr();
2052// SAFETY: materializing a pointer immediately past the end of an
2053 // allocation is OK.
2054let end: *const u16 = unsafe { ptr.add(delegate_as_slice.len()) };
2055'fast: loop {
2056// if let Some(&upcoming_code_unit) = code_unit_iter.next() {
2057if ptr != end {
2058// SAFETY: We just checked that `ptr` has not reached `end`.
2059 // `ptr` always advances by one, and we always have a check
2060 // per advancement.
2061let upcoming_code_unit = unsafe { *ptr };
2062// SAFETY: Since `ptr` hadn't reached `end`, yet, advancing
2063 // by one points to the same allocation or to immediately
2064 // after, which is OK.
2065ptr = unsafe { ptr.add(1) };
20662067let mut upcoming32 = u32::from(upcoming_code_unit);
2068// The performance of what logically is supposed to be this
2069 // branch is _incredibly_ brittle and what LLVM ends up doing
2070 // that affects the performance of what's logically about this
2071 // decision can swing to double/halve the throughput for Basic
2072 // Latin in ways that are completely unintuitive. Basically _any_
2073 // change to _any_ code that participates in how LLVM sees the
2074 // code around here can make the perf fall over. In seems that
2075 // manually annotating this branch as likely has worse effects
2076 // on non-Basic-Latin input that the case where LLVM just happens to
2077 // do the right thing.
2078 //
2079 // What happens with this branch may depend on what sink type
2080 // this code is monomorphized over.
2081 //
2082 // What a terrible sink of developer time!
2083if upcoming32 < decomposition_passthrough_bound {
2084continue 'fast;
2085 }
2086// We might be doing a trie lookup by surrogate. Surrogates get
2087 // a decomposition to U+FFFD.
2088let mut trie_value = decomposition.trie.get16(upcoming_code_unit);
2089if starter_and_decomposes_to_self_impl(trie_value) {
2090continue 'fast;
2091 }
2092// We might now be looking at a surrogate.
2093 // The loop is only broken out of as goto forward
2094#[expect(clippy::never_loop)]
2095'surrogateloop: loop {
2096// LLVM's optimizations are incredibly brittle for the code _above_,
2097 // and using `likely` _below_ without using it _above_ helps!
2098 // What a massive sink of developer time!
2099 // Seriously, the effect of these annotations is massively
2100 // unintuitive. Measure everything!
2101 // Notably, the `if likely(...)` formulation optimizes differently
2102 // than just putting `cold_path()` on the `else` path!
2103let surrogate_base = upcoming32.wrapping_sub(0xD800);
2104if likely(surrogate_base > (0xDFFF - 0xD800)) {
2105// Not surrogate
2106break 'surrogateloop;
2107 }
2108if likely(surrogate_base <= (0xDBFF - 0xD800)) {
2109// let iter_backup = code_unit_iter.clone();
2110 // if let Some(&low) = code_unit_iter.next() {
2111if ptr != end {
2112// SAFETY: We just checked that `ptr` has not reached `end`.
2113 // `ptr` always advances by one, and we always have a check
2114 // per advancement.
2115let low = unsafe { *ptr };
2116if likely(in_inclusive_range16(low, 0xDC00, 0xDFFF)) {
2117// SAFETY: Since `ptr` hadn't reached `end`, yet, advancing
2118 // by one points to the same allocation or to immediately
2119 // after, which is OK.
2120ptr = unsafe { ptr.add(1) };
21212122 upcoming32 = (upcoming32 << 10) + u32::from(low)
2123 - (((0xD800u32 << 10) - 0x10000u32) + 0xDC00u32);
2124// Successfully-paired surrogate. Read from the trie again.
2125trie_value = {
2126// Semantically, this bit of conditional compilation makes no sense.
2127 // The purpose is to keep LLVM seeing the untyped trie case the way
2128 // it did before so as not to regress the performance of the untyped
2129 // case due to unintuitive optimizer effects. If you care about the
2130 // perf of the untyped trie case and have better ideas, please try
2131 // something better.
2132#[cfg(not(icu4x_unstable_fast_trie_only))]
2133{decomposition.trie.get32(upcoming32)}
2134#[cfg(icu4x_unstable_fast_trie_only)]
2135{decomposition.trie.get32_supplementary(upcoming32)}
2136 };
2137if likely(starter_and_decomposes_to_self_impl(trie_value)) {
2138continue 'fast;
2139 }
2140break 'surrogateloop;
2141// } else {
2142 // code_unit_iter = iter_backup;
2143}
2144 }
2145 }
2146// unpaired surrogate
2147upcoming32 = 0xFFFD; // Safe value for `char::from_u32_unchecked` and matches later potential error check.
2148 // trie_value already holds a decomposition to U+FFFD.
2149break 'surrogateloop;
2150 }
21512152let upcoming = unsafe { char::from_u32_unchecked(upcoming32) };
2153let upcoming_with_trie_value = CharacterAndTrieValue::new(upcoming, trie_value);
215421552156let Some(consumed_so_far_slice) = pending_slice.get(..pending_slice.len() -
2157// code_unit_iter.as_slice().len()
2158 // SAFETY: `ptr` and `end` have been derived from the same allocation
2159 // and `ptr` is never greater than `end`.
2160unsafe { end.offset_from(ptr) as usize }
2161 - upcoming.len_utf16()) else {
2162// If we ever come here, it's a bug, but let's avoid panic code paths in release builds.
2163debug_assert!(false);
2164// Throw away the results of the fast path.
2165break 'fastwrap;
2166 };
2167 sink.write_slice(consumed_so_far_slice)?;
21682169if decomposition_starts_with_non_starter(
2170 upcoming_with_trie_value.trie_val,
2171 ) {
2172// Sync with main iterator
2173 // decomposition.delegate = code_unit_iter.as_slice().chars();
2174 // SAFETY: `ptr` and `end` have been derived from the same allocation
2175 // and `ptr` is never greater than `end`.
2176decomposition.delegate = unsafe { core::slice::from_raw_parts(ptr, end.offset_from(ptr) as usize) }.chars();
2177// Let this trie value to be reprocessed in case it is
2178 // one of the rare decomposing ones.
2179decomposition.pending = Some(upcoming_with_trie_value);
2180 decomposition.gather_and_sort_combining(0);
2181continue 'outer;
2182 }
2183 undecomposed_starter = upcoming_with_trie_value;
2184debug_assert!(decomposition.pending.is_none());
2185break 'fast;
2186 }
2187// End of stream
2188sink.write_slice(pending_slice)?;
2189return Ok(());
2190 }
2191// Sync the main iterator
2192 // decomposition.delegate = code_unit_iter.as_slice().chars();
2193 // SAFETY: `ptr` and `end` have been derived from the same allocation
2194 // and `ptr` is never greater than `end`.
2195decomposition.delegate = unsafe { core::slice::from_raw_parts(ptr, end.offset_from(ptr) as usize) }.chars();
2196break 'fastwrap;
2197 }
2198 },
2199 text,
2200 sink,
2201 decomposition,
2202 decomposition_passthrough_bound,
2203 undecomposed_starter,
2204 pending_slice,
2205'outer,
2206 );
2207}
22082209/// A normalizer for performing decomposing normalization.
2210#[derive(#[automatically_derived]
impl ::core::fmt::Debug for DecomposingNormalizer {
#[inline]
fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result {
::core::fmt::Formatter::debug_struct_field5_finish(f,
"DecomposingNormalizer", "decompositions", &self.decompositions,
"tables", &self.tables, "supplementary_tables",
&self.supplementary_tables, "decomposition_passthrough_bound",
&self.decomposition_passthrough_bound,
"composition_passthrough_bound",
&&self.composition_passthrough_bound)
}
}Debug)]
2211pub struct DecomposingNormalizer {
2212 decompositions: DataPayload<NormalizerNfdDataV1>,
2213 tables: DataPayload<NormalizerNfdTablesV1>,
2214 supplementary_tables: Option<DataPayload<NormalizerNfkdTablesV1>>,
2215 decomposition_passthrough_bound: u8, // never above 0xC0
2216composition_passthrough_bound: u16, // never above 0x0300
2217}
22182219impl DecomposingNormalizer {
2220/// Constructs a borrowed version of this type for more efficient querying.
2221pub fn as_borrowed(&self) -> DecomposingNormalizerBorrowed<'_> {
2222DecomposingNormalizerBorrowed {
2223 decompositions: self.decompositions.get(),
2224 tables: self.tables.get(),
2225 supplementary_tables: self.supplementary_tables.as_ref().map(|s| s.get()),
2226 decomposition_passthrough_bound: self.decomposition_passthrough_bound,
2227 composition_passthrough_bound: self.composition_passthrough_bound,
2228 }
2229 }
22302231/// NFD constructor using compiled data.
2232 ///
2233 /// ✨ *Enabled with the `compiled_data` Cargo feature.*
2234 ///
2235 /// [📚 Help choosing a constructor](icu_provider::constructors)
2236#[cfg(feature = "compiled_data")]
2237pub const fn new_nfd() -> DecomposingNormalizerBorrowed<'static> {
2238DecomposingNormalizerBorrowed::new_nfd()
2239 }
22402241icu_provider::gen_buffer_data_constructors!(
2242 () -> error: DataError,
2243 functions: [
2244 new_nfd: skip,
2245 try_new_nfd_with_buffer_provider,
2246 try_new_nfd_unstable,
2247Self,
2248 ]
2249 );
22502251#[doc = "A version of [`Self::new_nfd`] that uses custom data provided by a [`DataProvider`].\n\n[\u{1f4da} Help choosing a constructor](icu_provider::constructors)\n\n<div class=\"stab unstable\">\u{26a0}\u{fe0f} The bounds on <tt>provider</tt> may change over time, including in SemVer minor releases.</div>"icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new_nfd)]
2252pub fn try_new_nfd_unstable<D>(provider: &D) -> Result<Self, DataError>
2253where
2254D: DataProvider<NormalizerNfdDataV1> + DataProvider<NormalizerNfdTablesV1> + ?Sized,
2255 {
2256let decompositions: DataPayload<NormalizerNfdDataV1> =
2257provider.load(Default::default())?.payload;
2258let tables: DataPayload<NormalizerNfdTablesV1> = provider.load(Default::default())?.payload;
22592260if tables.get().scalars16.len() + tables.get().scalars24.len() > 0xFFF {
2261// The data is from a future where there exists a normalization flavor whose
2262 // complex decompositions take more than 0xFFF but fewer than 0x1FFF code points
2263 // of space. If a good use case from such a decomposition flavor arises, we can
2264 // dynamically change the bit masks so that the length mask becomes 0x1FFF instead
2265 // of 0xFFF and the all-non-starters mask becomes 0 instead of 0x1000. However,
2266 // since for now the masks are hard-coded, error out.
2267return Err(
2268DataError::custom("future extension").with_marker(NormalizerNfdTablesV1::INFO)
2269 );
2270 }
22712272let cap = decompositions.get().passthrough_cap;
2273if cap > 0x0300 {
2274return Err(DataError::custom("invalid").with_marker(NormalizerNfdDataV1::INFO));
2275 }
2276let decomposition_capped = cap.min(0xC0);
2277let composition_capped = cap.min(0x0300);
22782279Ok(DecomposingNormalizer {
2280decompositions,
2281tables,
2282 supplementary_tables: None,
2283 decomposition_passthrough_bound: decomposition_cappedas u8,
2284 composition_passthrough_bound: composition_capped,
2285 })
2286 }
22872288icu_provider::gen_buffer_data_constructors!(
2289 () -> error: DataError,
2290 functions: [
2291 new_nfkd: skip,
2292 try_new_nfkd_with_buffer_provider,
2293 try_new_nfkd_unstable,
2294Self,
2295 ]
2296 );
22972298/// NFKD constructor using compiled data.
2299 ///
2300 /// ✨ *Enabled with the `compiled_data` Cargo feature.*
2301 ///
2302 /// [📚 Help choosing a constructor](icu_provider::constructors)
2303#[cfg(feature = "compiled_data")]
2304pub const fn new_nfkd() -> DecomposingNormalizerBorrowed<'static> {
2305DecomposingNormalizerBorrowed::new_nfkd()
2306 }
23072308#[doc = "A version of [`Self::new_nfkd`] that uses custom data provided by a [`DataProvider`].\n\n[\u{1f4da} Help choosing a constructor](icu_provider::constructors)\n\n<div class=\"stab unstable\">\u{26a0}\u{fe0f} The bounds on <tt>provider</tt> may change over time, including in SemVer minor releases.</div>"icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new_nfkd)]
2309pub fn try_new_nfkd_unstable<D>(provider: &D) -> Result<Self, DataError>
2310where
2311D: DataProvider<NormalizerNfkdDataV1>
2312 + DataProvider<NormalizerNfdTablesV1>
2313 + DataProvider<NormalizerNfkdTablesV1>
2314 + ?Sized,
2315 {
2316let decompositions: DataPayload<NormalizerNfkdDataV1> =
2317provider.load(Default::default())?.payload;
2318let tables: DataPayload<NormalizerNfdTablesV1> = provider.load(Default::default())?.payload;
2319let supplementary_tables: DataPayload<NormalizerNfkdTablesV1> =
2320provider.load(Default::default())?.payload;
23212322if tables.get().scalars16.len()
2323 + tables.get().scalars24.len()
2324 + supplementary_tables.get().scalars16.len()
2325 + supplementary_tables.get().scalars24.len()
2326 > 0xFFF
2327{
2328// The data is from a future where there exists a normalization flavor whose
2329 // complex decompositions take more than 0xFFF but fewer than 0x1FFF code points
2330 // of space. If a good use case from such a decomposition flavor arises, we can
2331 // dynamically change the bit masks so that the length mask becomes 0x1FFF instead
2332 // of 0xFFF and the all-non-starters mask becomes 0 instead of 0x1000. However,
2333 // since for now the masks are hard-coded, error out.
2334return Err(
2335DataError::custom("future extension").with_marker(NormalizerNfdTablesV1::INFO)
2336 );
2337 }
23382339let cap = decompositions.get().passthrough_cap;
2340if cap > 0x0300 {
2341return Err(DataError::custom("invalid").with_marker(NormalizerNfkdDataV1::INFO));
2342 }
2343let decomposition_capped = cap.min(0xC0);
2344let composition_capped = cap.min(0x0300);
23452346Ok(DecomposingNormalizer {
2347 decompositions: decompositions.cast(),
2348tables,
2349 supplementary_tables: Some(supplementary_tables),
2350 decomposition_passthrough_bound: decomposition_cappedas u8,
2351 composition_passthrough_bound: composition_capped,
2352 })
2353 }
23542355/// UTS 46 decomposed constructor (testing only)
2356 ///
2357 /// This is a special building block normalization for IDNA. It is the decomposed counterpart of
2358 /// ICU4C's UTS 46 normalization with two exceptions: characters that UTS 46 disallows and
2359 /// ICU4C maps to U+FFFD and characters that UTS 46 maps to the empty string normalize as in
2360 /// NFD in this normalization. In both cases, the previous UTS 46 processing before using
2361 /// normalization is expected to deal with these characters. Making the disallowed characters
2362 /// behave like this is beneficial to data size, and this normalizer implementation cannot
2363 /// deal with a character normalizing to the empty string, which doesn't happen in NFD or
2364 /// NFKD as of Unicode 14.
2365 ///
2366 /// Warning: In this normalization, U+0345 COMBINING GREEK YPOGEGRAMMENI exhibits a behavior
2367 /// that no character in Unicode exhibits in NFD, NFKD, NFC, or NFKC: Case folding turns
2368 /// U+0345 from a reordered character into a non-reordered character before reordering happens.
2369 /// Therefore, the output of this normalization may differ for different inputs that are
2370 /// canonically equivalent with each other if they differ by how U+0345 is ordered relative
2371 /// to other reorderable characters.
2372pub(crate) fn try_new_uts46_decomposed_unstable<D>(provider: &D) -> Result<Self, DataError>
2373where
2374D: DataProvider<NormalizerUts46DataV1>
2375 + DataProvider<NormalizerNfdTablesV1>
2376 + DataProvider<NormalizerNfkdTablesV1>
2377// UTS 46 tables merged into CompatibilityDecompositionTablesV1
2378+ ?Sized,
2379 {
2380let decompositions: DataPayload<NormalizerUts46DataV1> =
2381provider.load(Default::default())?.payload;
2382let tables: DataPayload<NormalizerNfdTablesV1> = provider.load(Default::default())?.payload;
2383let supplementary_tables: DataPayload<NormalizerNfkdTablesV1> =
2384provider.load(Default::default())?.payload;
23852386if tables.get().scalars16.len()
2387 + tables.get().scalars24.len()
2388 + supplementary_tables.get().scalars16.len()
2389 + supplementary_tables.get().scalars24.len()
2390 > 0xFFF
2391{
2392// The data is from a future where there exists a normalization flavor whose
2393 // complex decompositions take more than 0xFFF but fewer than 0x1FFF code points
2394 // of space. If a good use case from such a decomposition flavor arises, we can
2395 // dynamically change the bit masks so that the length mask becomes 0x1FFF instead
2396 // of 0xFFF and the all-non-starters mask becomes 0 instead of 0x1000. However,
2397 // since for now the masks are hard-coded, error out.
2398return Err(
2399DataError::custom("future extension").with_marker(NormalizerNfdTablesV1::INFO)
2400 );
2401 }
24022403let cap = decompositions.get().passthrough_cap;
2404if cap > 0x0300 {
2405return Err(DataError::custom("invalid").with_marker(NormalizerUts46DataV1::INFO));
2406 }
2407let decomposition_capped = cap.min(0xC0);
2408let composition_capped = cap.min(0x0300);
24092410Ok(DecomposingNormalizer {
2411 decompositions: decompositions.cast(),
2412tables,
2413 supplementary_tables: Some(supplementary_tables),
2414 decomposition_passthrough_bound: decomposition_cappedas u8,
2415 composition_passthrough_bound: composition_capped,
2416 })
2417 }
2418}
24192420/// Borrowed version of a normalizer for performing composing normalization.
2421#[derive(#[automatically_derived]
impl<'a> ::core::fmt::Debug for ComposingNormalizerBorrowed<'a> {
#[inline]
fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result {
::core::fmt::Formatter::debug_struct_field2_finish(f,
"ComposingNormalizerBorrowed", "decomposing_normalizer",
&self.decomposing_normalizer, "canonical_compositions",
&&self.canonical_compositions)
}
}Debug)]
2422pub struct ComposingNormalizerBorrowed<'a> {
2423 decomposing_normalizer: DecomposingNormalizerBorrowed<'a>,
2424 canonical_compositions: &'a CanonicalCompositions<'a>,
2425}
24262427impl ComposingNormalizerBorrowed<'static> {
2428/// Cheaply converts a [`ComposingNormalizerBorrowed<'static>`] into a [`ComposingNormalizer`].
2429 ///
2430 /// Note: Due to branching and indirection, using [`ComposingNormalizer`] might inhibit some
2431 /// compile-time optimizations that are possible with [`ComposingNormalizerBorrowed`].
2432pub const fn static_to_owned(self) -> ComposingNormalizer {
2433ComposingNormalizer {
2434 decomposing_normalizer: self.decomposing_normalizer.static_to_owned(),
2435 canonical_compositions: DataPayload::from_static_ref(self.canonical_compositions),
2436 }
2437 }
24382439/// NFC constructor using compiled data.
2440 ///
2441 /// ✨ *Enabled with the `compiled_data` Cargo feature.*
2442 ///
2443 /// [📚 Help choosing a constructor](icu_provider::constructors)
2444#[cfg(feature = "compiled_data")]
2445pub const fn new_nfc() -> Self {
2446ComposingNormalizerBorrowed {
2447 decomposing_normalizer: DecomposingNormalizerBorrowed::new_nfd(),
2448 canonical_compositions: crate::provider::Baked::SINGLETON_NORMALIZER_NFC_V1,
2449 }
2450 }
24512452/// NFKC constructor using compiled data.
2453 ///
2454 /// ✨ *Enabled with the `compiled_data` Cargo feature.*
2455 ///
2456 /// [📚 Help choosing a constructor](icu_provider::constructors)
2457#[cfg(feature = "compiled_data")]
2458pub const fn new_nfkc() -> Self {
2459ComposingNormalizerBorrowed {
2460 decomposing_normalizer: DecomposingNormalizerBorrowed::new_nfkd(),
2461 canonical_compositions: crate::provider::Baked::SINGLETON_NORMALIZER_NFC_V1,
2462 }
2463 }
24642465/// This is a special building block normalization for IDNA that implements parts of the Map
2466 /// step and the following Normalize step.
2467 ///
2468 /// Warning: In this normalization, U+0345 COMBINING GREEK YPOGEGRAMMENI exhibits a behavior
2469 /// that no character in Unicode exhibits in NFD, NFKD, NFC, or NFKC: Case folding turns
2470 /// U+0345 from a reordered character into a non-reordered character before reordering happens.
2471 /// Therefore, the output of this normalization may differ for different inputs that are
2472 /// canonically equivalents with each other if they differ by how U+0345 is ordered relative
2473 /// to other reorderable characters.
2474#[cfg(feature = "compiled_data")]
2475pub(crate) const fn new_uts46() -> Self {
2476ComposingNormalizerBorrowed {
2477 decomposing_normalizer: DecomposingNormalizerBorrowed::new_uts46_decomposed(),
2478 canonical_compositions: crate::provider::Baked::SINGLETON_NORMALIZER_NFC_V1,
2479 }
2480 }
2481}
24822483impl<'data> ComposingNormalizerBorrowed<'data> {
2484/// Wraps a delegate iterator into a composing iterator
2485 /// adapter by using the data already held by this normalizer.
2486pub fn normalize_iter<I: Iterator<Item = char>>(&self, iter: I) -> Composition<'data, I> {
2487self.normalize_iter_private(iter, IgnorableBehavior::Unsupported)
2488 }
24892490fn normalize_iter_private<I: Iterator<Item = char>>(
2491&self,
2492 iter: I,
2493 ignorable_behavior: IgnorableBehavior,
2494 ) -> Composition<'data, I> {
2495Composition::new(
2496Decomposition::new_with_supplements(
2497iter,
2498self.decomposing_normalizer.decompositions,
2499self.decomposing_normalizer.tables,
2500self.decomposing_normalizer.supplementary_tables,
2501self.decomposing_normalizer.decomposition_passthrough_bound,
2502ignorable_behavior,
2503 ),
2504self.canonical_compositions.canonical_compositions.clone(),
2505self.decomposing_normalizer.composition_passthrough_bound,
2506 )
2507 }
25082509self
text
let (head, tail) = self.split_normalized(text);
if tail.is_empty() { return Cow::Borrowed(head); }
let mut ret = String::new();
ret.reserve(text.len());
ret.push_str(head);
let _ = self.normalize_to(tail, &mut ret);
Cow::Owned(ret);
self
text
let up_to = self.is_normalized_up_to(text);
text.split_at_checked(up_to).unwrap_or_else(||
{
if true {
if !false {
::core::panicking::panic("assertion failed: false")
};
};
("", text)
});
self
text
let mut sink = IsNormalizedSinkStr::new(text);
let _ = self.normalize_to(text, &mut sink);
text.len() - sink.remaining_len();
self
text
self.is_normalized_up_to(text) == text.len();normalizer_methods!();
25102511self
text
sink
{}
let mut composition = self.normalize_iter(text.chars());
if true {
match (&composition.decomposition.ignorable_behavior,
&IgnorableBehavior::Unsupported) {
(left_val, right_val) => {
if !(*left_val == *right_val) {
let kind = ::core::panicking::AssertKind::Eq;
::core::panicking::assert_failed(kind, &*left_val,
&*right_val, ::core::option::Option::None);
}
}
};
};
for cc in composition.decomposition.buffer.drain(..) {
sink.write_char(cc.character())?;
}
let composition_passthrough_bound = composition.composition_passthrough_bound;
'outer: loop {
if true {
match (&composition.decomposition.buffer_pos, &0) {
(left_val, right_val) => {
if !(*left_val == *right_val) {
let kind = ::core::panicking::AssertKind::Eq;
::core::panicking::assert_failed(kind, &*left_val,
&*right_val, ::core::option::Option::None);
}
}
};
};
let mut undecomposed_starter =
if let Some(pending) = composition.decomposition.pending.take() {
pending
} else { return Ok(()); };
if u32::from(undecomposed_starter.character) <
composition_passthrough_bound ||
undecomposed_starter.potential_passthrough() {
if true || undecomposed_starter.character != REPLACEMENT_CHARACTER {
let pending_slice =
&text[text.len() -
composition.decomposition.delegate.as_str().len() -
undecomposed_starter.character.len_utf8()..];
{
let composition_passthrough_byte_bound =
if composition_passthrough_bound == 0x300 {
0xCCu8
} else { composition_passthrough_bound.min(0x80) as u8 };
#[expect(clippy::unwrap_used)]
'fast: loop {
let mut code_unit_iter =
composition.decomposition.delegate.as_str().as_bytes().iter();
'fastest: loop {
if let Some(&upcoming_byte) = code_unit_iter.next() {
if upcoming_byte < composition_passthrough_byte_bound {
continue 'fastest;
}
let Some(remaining_slice) =
pending_slice.get(pending_slice.len() -
code_unit_iter.as_slice().len() -
1..) else {
if true {
if !false {
::core::panicking::panic("assertion failed: false")
};
};
break 'fastest;
};
composition.decomposition.delegate =
remaining_slice.chars();
break 'fastest;
}
sink.write_str(pending_slice)?;
return Ok(());
}
let upcoming =
composition.decomposition.delegate.next().unwrap();
let upcoming_with_trie_value =
composition.decomposition.attach_trie_value(upcoming);
if upcoming_with_trie_value.potential_passthrough_and_cannot_combine_backwards()
{
continue 'fast;
}
composition.decomposition.pending =
Some(upcoming_with_trie_value);
let mut consumed_so_far =
pending_slice[..pending_slice.len() -
composition.decomposition.delegate.as_str().len() -
upcoming.len_utf8()].chars();
undecomposed_starter =
composition.decomposition.attach_trie_value(consumed_so_far.next_back().unwrap());
let consumed_so_far_slice = consumed_so_far.as_str();
sink.write_str(consumed_so_far_slice)?;
break 'fast;
}
}
}
}
let mut starter =
composition.decomposition.decomposing_next(undecomposed_starter);
'bufferloop: loop {
loop {
let (character, ccc) =
if let Some((character, ccc)) =
composition.decomposition.buffer.get(composition.decomposition.buffer_pos).map(|c|
c.character_and_ccc()) {
(character, ccc)
} else {
composition.decomposition.buffer.clear();
composition.decomposition.buffer_pos = 0;
break;
};
if let Some(composed) = composition.compose(starter, character) {
starter = composed;
composition.decomposition.buffer_pos += 1;
continue;
}
let mut most_recent_skipped_ccc = ccc;
if most_recent_skipped_ccc == CCC_NOT_REORDERED {
sink.write_char(starter)?;
starter = character;
composition.decomposition.buffer_pos += 1;
continue 'bufferloop;
} else {
{
let _ =
composition.decomposition.buffer.drain(0..composition.decomposition.buffer_pos);
}
composition.decomposition.buffer_pos = 0;
}
let mut i = 1;
while let Some((character, ccc)) =
composition.decomposition.buffer.get(i).map(|c|
c.character_and_ccc()) {
if ccc == CCC_NOT_REORDERED {
sink.write_char(starter)?;
for cc in composition.decomposition.buffer.drain(..i) {
sink.write_char(cc.character())?;
}
starter = character;
{
let removed = composition.decomposition.buffer.remove(0);
if true {
match (&starter, &removed.character()) {
(left_val, right_val) => {
if !(*left_val == *right_val) {
let kind = ::core::panicking::AssertKind::Eq;
::core::panicking::assert_failed(kind, &*left_val,
&*right_val, ::core::option::Option::None);
}
}
};
};
}
if true {
match (&composition.decomposition.buffer_pos, &0) {
(left_val, right_val) => {
if !(*left_val == *right_val) {
let kind = ::core::panicking::AssertKind::Eq;
::core::panicking::assert_failed(kind, &*left_val,
&*right_val, ::core::option::Option::None);
}
}
};
};
continue 'bufferloop;
}
if true {
if !(ccc >= most_recent_skipped_ccc) {
::core::panicking::panic("assertion failed: ccc >= most_recent_skipped_ccc")
};
};
if ccc != most_recent_skipped_ccc {
if let Some(composed) =
composition.compose_non_hangul(starter, character) {
composition.decomposition.buffer.remove(i);
starter = composed;
continue;
}
}
most_recent_skipped_ccc = ccc;
i += 1;
}
break;
}
if true {
match (&composition.decomposition.buffer_pos, &0) {
(left_val, right_val) => {
if !(*left_val == *right_val) {
let kind = ::core::panicking::AssertKind::Eq;
::core::panicking::assert_failed(kind, &*left_val,
&*right_val, ::core::option::Option::None);
}
}
};
};
if !composition.decomposition.buffer.is_empty() {
sink.write_char(starter)?;
for cc in composition.decomposition.buffer.drain(..) {
sink.write_char(cc.character())?;
}
continue 'outer;
}
if composition.decomposition.pending.is_some() {
let pending = composition.decomposition.pending.as_ref().unwrap();
if u32::from(pending.character) <
composition.composition_passthrough_bound ||
!pending.can_combine_backwards() {
sink.write_char(starter)?;
continue 'outer;
}
let pending_starter =
composition.decomposition.pending.take().unwrap();
let decomposed =
composition.decomposition.decomposing_next(pending_starter);
if let Some(composed) = composition.compose(starter, decomposed) {
starter = composed;
} else { sink.write_char(starter)?; starter = decomposed; }
continue 'bufferloop;
}
sink.write_char(starter)?;
return Ok(());
}
}composing_normalize_to!(
2512/// Normalize a string slice into a `Write` sink.
2513,
2514 normalize_to,
2515core::fmt::Write,
2516&str,
2517 {},
2518true,
2519 as_str,
2520 {
2521// Let's hope LICM hoists this outside `'outer`.
2522let composition_passthrough_byte_bound = if composition_passthrough_bound == 0x300 {
25230xCCu8
2524} else {
2525// We can make this fancy if a normalization other than NFC where looking at
2526 // non-ASCII lead bytes is worthwhile is ever introduced.
2527composition_passthrough_bound.min(0x80) as u8
2528 };
2529// Attributes have to be on blocks, so hoisting all the way here.
2530#[expect(clippy::unwrap_used)]
2531'fast: loop {
2532let mut code_unit_iter = composition.decomposition.delegate.as_str().as_bytes().iter();
2533'fastest: loop {
2534if let Some(&upcoming_byte) = code_unit_iter.next() {
2535if upcoming_byte < composition_passthrough_byte_bound {
2536// Fast-track succeeded!
2537continue 'fastest;
2538 }
2539let Some(remaining_slice) = pending_slice.get(pending_slice.len() - code_unit_iter.as_slice().len() - 1..) else {
2540// If we ever come here, it's an internal bug. Let's avoid panic code paths in release builds.
2541debug_assert!(false);
2542// Throw away the fastest-path result in case of an internal bug.
2543break 'fastest;
2544 };
2545 composition.decomposition.delegate = remaining_slice.chars();
2546break 'fastest;
2547 }
2548// End of stream
2549sink.write_str(pending_slice)?;
2550return Ok(());
2551 }
2552// `unwrap()` OK, because the slice is valid UTF-8 and we know there
2553 // is an upcoming byte.
2554let upcoming = composition.decomposition.delegate.next().unwrap();
2555let upcoming_with_trie_value = composition.decomposition.attach_trie_value(upcoming);
2556if upcoming_with_trie_value.potential_passthrough_and_cannot_combine_backwards() {
2557// Can't combine backwards, hence a plain (non-backwards-combining)
2558 // starter albeit past `composition_passthrough_bound`
25592560 // Fast-track succeeded!
2561continue 'fast;
2562 }
2563// We need to fall off the fast path.
2564composition.decomposition.pending = Some(upcoming_with_trie_value);
25652566// slicing and unwrap OK, because we've just evidently read enough previously.
2567let mut consumed_so_far = pending_slice[..pending_slice.len() - composition.decomposition.delegate.as_str().len() - upcoming.len_utf8()].chars();
2568// `unwrap` OK, because we've previously manage to read the previous character
2569undecomposed_starter = composition.decomposition.attach_trie_value(consumed_so_far.next_back().unwrap());
2570let consumed_so_far_slice = consumed_so_far.as_str();
2571 sink.write_str(consumed_so_far_slice)?;
2572break 'fast;
2573 }
2574 },
2575 text,
2576 sink,
2577 composition,
2578 composition_passthrough_bound,
2579 undecomposed_starter,
2580 pending_slice,
2581 len_utf8,
2582 );
25832584composing_normalize_to!(
2585/// Normalize a slice of potentially-invalid UTF-8 into a `Write` sink.
2586 ///
2587 /// Ill-formed byte sequences are mapped to the REPLACEMENT CHARACTER
2588 /// according to the WHATWG Encoding Standard.
2589 ///
2590 /// ✨ *Enabled with the `utf8_iter` Cargo feature.*
2591#[cfg(feature = "utf8_iter")]
2592,
2593 normalize_utf8_to,
2594 core::fmt::Write,
2595&[u8],
2596 {},
2597false,
2598 as_slice,
2599 {
2600'fast: loop {
2601if let Some(upcoming) = composition.decomposition.delegate.next() {
2602if u32::from(upcoming) < composition_passthrough_bound {
2603// Fast-track succeeded!
2604continue 'fast;
2605 }
2606// TODO: Be statically aware of fast/small trie.
2607let upcoming_with_trie_value = composition.decomposition.attach_trie_value(upcoming);
2608if upcoming_with_trie_value.potential_passthrough_and_cannot_combine_backwards() {
2609// Note: The trie value of the REPLACEMENT CHARACTER is
2610 // intentionally formatted to fail the
2611 // `potential_passthrough_and_cannot_combine_backwards`
2612 // test even though it really is a starter that decomposes
2613 // to self and cannot combine backwards. This
2614 // Allows moving the branch on REPLACEMENT CHARACTER
2615 // below this `continue`.
2616continue 'fast;
2617 }
2618// We need to fall off the fast path.
26192620 // TODO(#2006): Annotate as unlikely
2621if upcoming == REPLACEMENT_CHARACTER {
2622// Can't tell if this is an error or a literal U+FFFD in
2623 // the input. Assuming the former to be sure.
26242625 // Since the U+FFFD might signify an error, we can't
2626 // assume `upcoming.len_utf8()` for the backoff length.
2627#[expect(clippy::indexing_slicing)]
2628let mut consumed_so_far = pending_slice[..pending_slice.len() - composition.decomposition.delegate.as_slice().len()].chars();
2629let back = consumed_so_far.next_back();
2630debug_assert_eq!(back, Some(REPLACEMENT_CHARACTER));
2631let consumed_so_far_slice = consumed_so_far.as_slice();
2632 sink.write_str(unsafe { core::str::from_utf8_unchecked(consumed_so_far_slice) })?;
2633 undecomposed_starter = CharacterAndTrieValue::new(REPLACEMENT_CHARACTER, 0);
2634 composition.decomposition.pending = None;
2635break 'fast;
2636 }
26372638 composition.decomposition.pending = Some(upcoming_with_trie_value);
2639// slicing and unwrap OK, because we've just evidently read enough previously.
2640 // `unwrap` OK, because we've previously manage to read the previous character
2641#[expect(clippy::indexing_slicing)]
2642let mut consumed_so_far = pending_slice[..pending_slice.len() - composition.decomposition.delegate.as_slice().len() - upcoming.len_utf8()].chars();
2643#[expect(clippy::unwrap_used)]
2644{
2645// TODO: If the previous character was below the passthrough bound,
2646 // we really need to read from the trie. Otherwise, we could maintain
2647 // the most-recent trie value. Need to measure what's more expensive:
2648 // Remembering the trie value on each iteration or re-reading the
2649 // last one after the fast-track run.
2650undecomposed_starter = composition.decomposition.attach_trie_value(consumed_so_far.next_back().unwrap());
2651 }
2652let consumed_so_far_slice = consumed_so_far.as_slice();
2653 sink.write_str(unsafe { core::str::from_utf8_unchecked(consumed_so_far_slice)})?;
2654break 'fast;
2655 }
2656// End of stream
2657sink.write_str(unsafe { core::str::from_utf8_unchecked(pending_slice) })?;
2658return Ok(());
2659 }
2660 },
2661 text,
2662 sink,
2663 composition,
2664 composition_passthrough_bound,
2665 undecomposed_starter,
2666 pending_slice,
2667 len_utf8,
2668 );
26692670composing_normalize_to!(
2671/// Normalize a slice of potentially-invalid UTF-16 into a `Write16` sink.
2672 ///
2673 /// Unpaired surrogates are mapped to the REPLACEMENT CHARACTER
2674 /// before normalizing.
2675 ///
2676 /// ✨ *Enabled with the `utf16_iter` Cargo feature.*
2677#[cfg(feature = "utf16_iter")]
2678,
2679 normalize_utf16_to,
2680 write16::Write16,
2681&[u16],
2682 {
2683 sink.size_hint(text.len())?;
2684 },
2685false,
2686 as_slice,
2687 {
2688// This loop is only broken out of as goto forward and only as release-build recovery from
2689 // detecting an internal bug without panic. (In debug builds, internal bugs panic instead.)
2690#[expect(clippy::never_loop)]
2691'fastwrap: loop {
2692// Commented out `code_unit_iter` and used `ptr` and `end` to
2693 // work around https://github.com/rust-lang/rust/issues/144684 .
2694 //
2695 // let mut code_unit_iter = composition.decomposition.delegate.as_slice().iter();
2696let delegate_as_slice = composition.decomposition.delegate.as_slice();
2697let mut ptr: *const u16 = delegate_as_slice.as_ptr();
2698// SAFETY: materializing a pointer immediately past the end of an
2699 // allocation is OK.
2700let end: *const u16 = unsafe { ptr.add(delegate_as_slice.len()) };
27012702'fast: loop {
2703// if let Some(&upcoming_code_unit) = code_unit_iter.next() {
2704if ptr != end {
2705// SAFETY: We just checked that `ptr` has not reached `end`.
2706 // `ptr` always advances by one, and we always have a check
2707 // per advancement.
2708let upcoming_code_unit = unsafe { *ptr };
2709// SAFETY: Since `ptr` hadn't reached `end`, yet, advancing
2710 // by one points to the same allocation or to immediately
2711 // after, which is OK.
2712ptr = unsafe { ptr.add(1) };
27132714let mut upcoming32 = u32::from(upcoming_code_unit); // may be surrogate
2715 // The performance of what logically is supposed to be this
2716 // branch is somewhat brittle and what LLVM ends up doing
2717 // that affects the performance of what's logically about this
2718 // decision can swing to double/halve the throughput for Basic
2719 // Latin in ways that are completely unintuitive. Basically _any_
2720 // change to _any_ code that participates in how LLVM sees the
2721 // code around here can make the perf fall over. In seems that
2722 // manually annotating this branch as likely has worse effects
2723 // on non-Basic-Latin input that the case where LLVM just happens to
2724 // do the right thing.
2725 //
2726 // What happens with this branch may depend on what sink type
2727 // this code is monomorphized over.
2728 //
2729 // What a terrible sink of developer time!
2730if upcoming32 < composition_passthrough_bound {
2731// No need for surrogate or U+FFFD check, because
2732 // `composition_passthrough_bound` cannot be higher than
2733 // U+0300.
2734 // Fast-track succeeded!
2735continue 'fast;
2736 }
2737// We might be doing a trie lookup by surrogate. Surrogates get
2738 // a decomposition to U+FFFD.
2739let mut trie_value = composition.decomposition.trie.get16(upcoming_code_unit);
2740if potential_passthrough_and_cannot_combine_backwards_impl(trie_value) {
2741// Can't combine backwards, hence a plain (non-backwards-combining)
2742 // starter albeit past `composition_passthrough_bound`
27432744 // Fast-track succeeded!
2745continue 'fast;
2746 }
27472748// We might now be looking at a surrogate.
2749 // The loop is only broken out of as goto forward
2750#[expect(clippy::never_loop)]
2751'surrogateloop: loop {
2752// The `likely` annotations _below_ exist to make the code _above_
2753 // go faster!
2754let surrogate_base = upcoming32.wrapping_sub(0xD800);
2755if likely(surrogate_base > (0xDFFF - 0xD800)) {
2756// Not surrogate
2757break 'surrogateloop;
2758 }
2759if likely(surrogate_base <= (0xDBFF - 0xD800)) {
2760// let iter_backup = code_unit_iter.clone();
2761 // if let Some(&low) = code_unit_iter.next() {
2762if ptr != end {
2763// SAFETY: We just checked that `ptr` has not reached `end`.
2764 // `ptr` always advances by one, and we always have a check
2765 // per advancement.
2766let low = unsafe { *ptr };
2767if likely(in_inclusive_range16(low, 0xDC00, 0xDFFF)) {
2768// SAFETY: Since `ptr` hadn't reached `end`, yet, advancing
2769 // by one points to the same allocation or to immediately
2770 // after, which is OK.
2771ptr = unsafe { ptr.add(1) };
27722773 upcoming32 = (upcoming32 << 10) + u32::from(low)
2774 - (((0xD800u32 << 10) - 0x10000u32) + 0xDC00u32);
2775// Successfully-paired surrogate. Read from the trie again.
2776trie_value = {
2777// Semantically, this bit of conditional compilation makes no sense.
2778 // The purpose is to keep LLVM seeing the untyped trie case the way
2779 // it did before so as not to regress the performance of the untyped
2780 // case due to unintuitive optimizer effects. If you care about the
2781 // perf of the untyped trie case and have better ideas, please try
2782 // something better.
2783#[cfg(not(icu4x_unstable_fast_trie_only))]
2784{composition.decomposition.trie.get32(upcoming32)}
2785#[cfg(icu4x_unstable_fast_trie_only)]
2786{composition.decomposition.trie.get32_supplementary(upcoming32)}
2787 };
2788if likely(potential_passthrough_and_cannot_combine_backwards_impl(trie_value)) {
2789// Fast-track succeeded!
2790continue 'fast;
2791 }
2792break 'surrogateloop;
2793// } else {
2794 // code_unit_iter = iter_backup;
2795}
2796 }
2797 }
2798// unpaired surrogate
2799upcoming32 = 0xFFFD; // Safe value for `char::from_u32_unchecked` and matches later potential error check.
2800 // trie_value already holds a decomposition to U+FFFD.
2801debug_assert_eq!(trie_value, NON_ROUND_TRIP_MARKER | BACKWARD_COMBINING_MARKER | 0xFFFD);
2802break 'surrogateloop;
2803 }
28042805// SAFETY: upcoming32 can no longer be a surrogate.
2806let upcoming = unsafe { char::from_u32_unchecked(upcoming32) };
2807let upcoming_with_trie_value = CharacterAndTrieValue::new(upcoming, trie_value);
2808// We need to fall off the fast path.
2809composition.decomposition.pending = Some(upcoming_with_trie_value);
2810let Some(consumed_so_far_slice) = pending_slice.get(..pending_slice.len() -
2811// code_unit_iter.as_slice().len()
2812 // SAFETY: `ptr` and `end` have been derived from the same allocation
2813 // and `ptr` is never greater than `end`.
2814unsafe { end.offset_from(ptr) as usize }
2815 - upcoming.len_utf16()) else {
2816// If we ever come here, it's a bug, but let's avoid panic code paths in release builds.
2817debug_assert!(false);
2818// Throw away the results of the fast path.
2819break 'fastwrap;
2820 };
2821let mut consumed_so_far = consumed_so_far_slice.chars();
2822let Some(c_from_back) = consumed_so_far.next_back() else {
2823// If we ever come here, it's a bug, but let's avoid panic code paths in release builds.
2824debug_assert!(false);
2825// Throw away the results of the fast path.
2826break 'fastwrap;
2827 };
2828// TODO: If the previous character was below the passthrough bound,
2829 // we really need to read from the trie. Otherwise, we could maintain
2830 // the most-recent trie value. Need to measure what's more expensive:
2831 // Remembering the trie value on each iteration or re-reading the
2832 // last one after the fast-track run.
2833undecomposed_starter = composition.decomposition.attach_trie_value(c_from_back);
2834 sink.write_slice(consumed_so_far.as_slice())?;
2835break 'fast;
2836 }
2837// End of stream
2838sink.write_slice(pending_slice)?;
2839return Ok(());
2840 }
2841// Sync the main iterator
2842 // composition.decomposition.delegate = code_unit_iter.as_slice().chars();
2843 // SAFETY: `ptr` and `end` have been derive from the same allocation
2844 // and `ptr` is never greater than `end`.
2845composition.decomposition.delegate = unsafe { core::slice::from_raw_parts(ptr, end.offset_from(ptr) as usize) }.chars();
2846break 'fastwrap;
2847 }
2848 },
2849 text,
2850 sink,
2851 composition,
2852 composition_passthrough_bound,
2853 undecomposed_starter,
2854 pending_slice,
2855 len_utf16,
2856 );
2857}
28582859/// A normalizer for performing composing normalization.
2860#[derive(#[automatically_derived]
impl ::core::fmt::Debug for ComposingNormalizer {
#[inline]
fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result {
::core::fmt::Formatter::debug_struct_field2_finish(f,
"ComposingNormalizer", "decomposing_normalizer",
&self.decomposing_normalizer, "canonical_compositions",
&&self.canonical_compositions)
}
}Debug)]
2861pub struct ComposingNormalizer {
2862 decomposing_normalizer: DecomposingNormalizer,
2863 canonical_compositions: DataPayload<NormalizerNfcV1>,
2864}
28652866impl ComposingNormalizer {
2867/// Constructs a borrowed version of this type for more efficient querying.
2868pub fn as_borrowed(&self) -> ComposingNormalizerBorrowed<'_> {
2869ComposingNormalizerBorrowed {
2870 decomposing_normalizer: self.decomposing_normalizer.as_borrowed(),
2871 canonical_compositions: self.canonical_compositions.get(),
2872 }
2873 }
28742875/// NFC constructor using compiled data.
2876 ///
2877 /// ✨ *Enabled with the `compiled_data` Cargo feature.*
2878 ///
2879 /// [📚 Help choosing a constructor](icu_provider::constructors)
2880#[cfg(feature = "compiled_data")]
2881pub const fn new_nfc() -> ComposingNormalizerBorrowed<'static> {
2882ComposingNormalizerBorrowed::new_nfc()
2883 }
28842885icu_provider::gen_buffer_data_constructors!(
2886 () -> error: DataError,
2887 functions: [
2888 new_nfc: skip,
2889 try_new_nfc_with_buffer_provider,
2890 try_new_nfc_unstable,
2891Self,
2892 ]
2893 );
28942895#[doc = "A version of [`Self::new_nfc`] that uses custom data provided by a [`DataProvider`].\n\n[\u{1f4da} Help choosing a constructor](icu_provider::constructors)\n\n<div class=\"stab unstable\">\u{26a0}\u{fe0f} The bounds on <tt>provider</tt> may change over time, including in SemVer minor releases.</div>"icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new_nfc)]
2896pub fn try_new_nfc_unstable<D>(provider: &D) -> Result<Self, DataError>
2897where
2898D: DataProvider<NormalizerNfdDataV1>
2899 + DataProvider<NormalizerNfdTablesV1>
2900 + DataProvider<NormalizerNfcV1>
2901 + ?Sized,
2902 {
2903let decomposing_normalizer = DecomposingNormalizer::try_new_nfd_unstable(provider)?;
29042905let canonical_compositions: DataPayload<NormalizerNfcV1> =
2906provider.load(Default::default())?.payload;
29072908Ok(ComposingNormalizer {
2909decomposing_normalizer,
2910canonical_compositions,
2911 })
2912 }
29132914/// NFKC constructor using compiled data.
2915 ///
2916 /// ✨ *Enabled with the `compiled_data` Cargo feature.*
2917 ///
2918 /// [📚 Help choosing a constructor](icu_provider::constructors)
2919#[cfg(feature = "compiled_data")]
2920pub const fn new_nfkc() -> ComposingNormalizerBorrowed<'static> {
2921ComposingNormalizerBorrowed::new_nfkc()
2922 }
29232924icu_provider::gen_buffer_data_constructors!(
2925 () -> error: DataError,
2926 functions: [
2927 new_nfkc: skip,
2928 try_new_nfkc_with_buffer_provider,
2929 try_new_nfkc_unstable,
2930Self,
2931 ]
2932 );
29332934#[doc = "A version of [`Self::new_nfkc`] that uses custom data provided by a [`DataProvider`].\n\n[\u{1f4da} Help choosing a constructor](icu_provider::constructors)\n\n<div class=\"stab unstable\">\u{26a0}\u{fe0f} The bounds on <tt>provider</tt> may change over time, including in SemVer minor releases.</div>"icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new_nfkc)]
2935pub fn try_new_nfkc_unstable<D>(provider: &D) -> Result<Self, DataError>
2936where
2937D: DataProvider<NormalizerNfkdDataV1>
2938 + DataProvider<NormalizerNfdTablesV1>
2939 + DataProvider<NormalizerNfkdTablesV1>
2940 + DataProvider<NormalizerNfcV1>
2941 + ?Sized,
2942 {
2943let decomposing_normalizer = DecomposingNormalizer::try_new_nfkd_unstable(provider)?;
29442945let canonical_compositions: DataPayload<NormalizerNfcV1> =
2946provider.load(Default::default())?.payload;
29472948Ok(ComposingNormalizer {
2949decomposing_normalizer,
2950canonical_compositions,
2951 })
2952 }
29532954#[doc = "A version of [`Self::new_uts46`] that uses custom data provided by a [`DataProvider`].\n\n[\u{1f4da} Help choosing a constructor](icu_provider::constructors)\n\n<div class=\"stab unstable\">\u{26a0}\u{fe0f} The bounds on <tt>provider</tt> may change over time, including in SemVer minor releases.</div>"icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new_uts46)]
2955pub(crate) fn try_new_uts46_unstable<D>(provider: &D) -> Result<Self, DataError>
2956where
2957D: DataProvider<NormalizerUts46DataV1>
2958 + DataProvider<NormalizerNfdTablesV1>
2959 + DataProvider<NormalizerNfkdTablesV1>
2960// UTS 46 tables merged into CompatibilityDecompositionTablesV1
2961+ DataProvider<NormalizerNfcV1>
2962 + ?Sized,
2963 {
2964let decomposing_normalizer =
2965DecomposingNormalizer::try_new_uts46_decomposed_unstable(provider)?;
29662967let canonical_compositions: DataPayload<NormalizerNfcV1> =
2968provider.load(Default::default())?.payload;
29692970Ok(ComposingNormalizer {
2971decomposing_normalizer,
2972canonical_compositions,
2973 })
2974 }
2975}
#[cfg(feature = "utf16_iter")]
// Comparison sink over UTF-16: each write is checked against the head of
// `expect` instead of being stored, and a mismatch surfaces as a
// `core::fmt::Error` (presumably backing an is-normalized check — the
// callers are outside this chunk).
struct IsNormalizedSinkUtf16<'a> {
    // Tail of the original input that has not yet been matched.
    expect: &'a [u16],
}
#[cfg(feature = "utf16_iter")]
impl<'a> IsNormalizedSinkUtf16<'a> {
    /// Wraps `slice` as the expected output to compare writes against.
    pub fn new(slice: &'a [u16]) -> Self {
        Self { expect: slice }
    }
    /// Number of code units that have not been matched yet.
    pub fn remaining_len(&self) -> usize {
        self.expect.len()
    }
}
#[cfg(feature = "utf16_iter")]
impl write16::Write16 for IsNormalizedSinkUtf16<'_> {
    fn write_slice(&mut self, s: &[u16]) -> core::fmt::Result {
        // We know that if we get a slice, it's a pass-through,
        // so comparing the starting addresses is sufficient.
        if !core::ptr::eq(s.as_ptr(), self.expect.as_ptr()) {
            return Err(core::fmt::Error {});
        }
        // Indexing is OK, because an indexing failure would be a
        // code bug rather than an input or data issue.
        #[expect(clippy::indexing_slicing)]
        {
            self.expect = &self.expect[s.len()..];
        }
        Ok(())
    }

    fn write_char(&mut self, c: char) -> core::fmt::Result {
        let mut rest = self.expect.chars();
        match rest.next() {
            Some(first) if first == c => {
                self.expect = rest.as_slice();
                Ok(())
            }
            _ => Err(core::fmt::Error {}),
        }
    }
}
#[cfg(feature = "utf8_iter")]
// Comparison sink over potentially-invalid UTF-8: each write is checked
// against the head of `expect` instead of being stored, and a mismatch
// surfaces as a `core::fmt::Error` (presumably backing an is-normalized
// check — the callers are outside this chunk).
struct IsNormalizedSinkUtf8<'a> {
    // Tail of the original input that has not yet been matched.
    expect: &'a [u8],
}
#[cfg(feature = "utf8_iter")]
impl<'a> IsNormalizedSinkUtf8<'a> {
    /// Wraps `slice` as the expected output to compare writes against.
    pub fn new(slice: &'a [u8]) -> Self {
        Self { expect: slice }
    }
    /// Number of bytes that have not been matched yet.
    pub fn remaining_len(&self) -> usize {
        self.expect.len()
    }
}
#[cfg(feature = "utf8_iter")]
impl core::fmt::Write for IsNormalizedSinkUtf8<'_> {
    fn write_str(&mut self, s: &str) -> core::fmt::Result {
        // We know that if we get a slice, it's a pass-through,
        // so comparing the starting addresses is sufficient.
        if !core::ptr::eq(s.as_ptr(), self.expect.as_ptr()) {
            return Err(core::fmt::Error {});
        }
        // Indexing is OK, because an indexing failure would be a
        // code bug rather than an input or data issue.
        #[expect(clippy::indexing_slicing)]
        {
            self.expect = &self.expect[s.len()..];
        }
        Ok(())
    }

    fn write_char(&mut self, c: char) -> core::fmt::Result {
        let mut rest = self.expect.chars();
        match rest.next() {
            Some(first) if first == c => {
                self.expect = rest.as_slice();
                Ok(())
            }
            _ => Err(core::fmt::Error {}),
        }
    }
}
// Comparison sink over guaranteed-valid UTF-8: each write is checked
// against the head of `expect` instead of being stored, and a mismatch
// surfaces as a `core::fmt::Error` (presumably backing an is-normalized
// check — the callers are outside this chunk).
struct IsNormalizedSinkStr<'a> {
    // Tail of the original input that has not yet been matched.
    expect: &'a str,
}
30643065impl<'a> IsNormalizedSinkStr<'a> {
3066pub fn new(slice: &'a str) -> Self {
3067IsNormalizedSinkStr { expect: slice }
3068 }
3069pub fn remaining_len(&self) -> usize {
3070self.expect.len()
3071 }
3072}
30733074impl core::fmt::Writefor IsNormalizedSinkStr<'_> {
3075fn write_str(&mut self, s: &str) -> core::fmt::Result {
3076// We know that if we get a slice, it's a pass-through,
3077 // so we can compare addresses. Indexing is OK, because
3078 // an indexing failure would be a code bug rather than
3079 // an input or data issue.
3080if core::ptr::eq(s.as_ptr(), self.expect.as_ptr()) {
3081self.expect = &self.expect[s.len()..];
3082Ok(())
3083 } else {
3084Err(core::fmt::Error {})
3085 }
3086 }
30873088fn write_char(&mut self, c: char) -> core::fmt::Result {
3089let mut iter = self.expect.chars();
3090if iter.next() == Some(c) {
3091self.expect = iter.as_str();
3092Ok(())
3093 } else {
3094Err(core::fmt::Error {})
3095 }
3096 }
3097}