icu_normalizer/uts46.rs
1// This file is part of ICU4X. For terms of use, please see the file
2// called LICENSE at the top level of the ICU4X source tree
3// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
5//! Bundles the part of UTS 46 that makes sense to implement as a
6//! normalization.
7//!
//! This is meant to be used as a building block of a UTS 46
//! implementation, such as the `idna` crate.
10
11use crate::CanonicalCompositionsV1Marker;
12use crate::CanonicalDecompositionDataV1Marker;
13use crate::CanonicalDecompositionTablesV1Marker;
14use crate::CompatibilityDecompositionTablesV1Marker;
15use crate::ComposingNormalizer;
16use crate::NormalizerError;
17use crate::Uts46DecompositionSupplementV1Marker;
18use icu_provider::DataProvider;
19
20// Implementation note: Despite merely wrapping a `ComposingNormalizer`,
21// having a `Uts46Mapper` serves two purposes:
22//
23// 1. Denying public access to parts of the `ComposingNormalizer` API
24// that don't work when the data contains markers for ignorables.
25// 2. Providing a place where additional iterator pre-processing or
26// post-processing can take place if needed in the future. (When
27// writing this, it looked like such processing was needed but
28// now isn't needed after all.)
29
/// A mapper that knows how to perform the subsets of UTS 46 processing
/// documented on the methods.
///
/// This is a thin wrapper around a `ComposingNormalizer`; the mapping
/// methods hand the input iterator to that normalizer together with the
/// desired handling of ignorable characters.
#[derive(Debug)]
pub struct Uts46Mapper {
    /// The wrapped normalizer that performs the actual mapping and
    /// normalization work.
    normalizer: ComposingNormalizer,
}
36
// `Default` is only offered when compiled data is available, because it
// is defined in terms of the compiled-data constructor.
#[cfg(feature = "compiled_data")]
impl Default for Uts46Mapper {
    /// Returns the same mapper as [`Uts46Mapper::new`].
    fn default() -> Self {
        Uts46Mapper::new()
    }
}
43
44impl Uts46Mapper {
45 /// Construct with compiled data.
46 #[cfg(feature = "compiled_data")]
47 pub const fn new() -> Self {
48 Uts46Mapper {
49 normalizer: ComposingNormalizer::new_uts46(),
50 }
51 }
52
53 /// Construct with provider.
54 #[doc = icu_provider::gen_any_buffer_unstable_docs!(UNSTABLE, Self::new)]
55 pub fn try_new<D>(provider: &D) -> Result<Self, NormalizerError>
56 where
57 D: DataProvider<CanonicalDecompositionDataV1Marker>
58 + DataProvider<Uts46DecompositionSupplementV1Marker>
59 + DataProvider<CanonicalDecompositionTablesV1Marker>
60 + DataProvider<CompatibilityDecompositionTablesV1Marker>
61 // UTS 46 tables merged into CompatibilityDecompositionTablesV1Marker
62 + DataProvider<CanonicalCompositionsV1Marker>
63 + ?Sized,
64 {
65 let normalizer = ComposingNormalizer::try_new_uts46_unstable(provider)?;
66
67 Ok(Uts46Mapper { normalizer })
68 }
69
70 /// Returns an iterator adaptor that turns an `Iterator` over `char`
71 /// into an iterator yielding a `char` sequence that gets the following
72 /// operations from the "Map" and "Normalize" steps of the "Processing"
73 /// section of UTS 46 lazily applied to it:
74 ///
75 /// 1. The _ignored_ characters are ignored.
76 /// 2. The _mapped_ characters are mapped.
77 /// 3. The _disallowed_ characters are replaced with U+FFFD,
78 /// which itself is a disallowed character.
79 /// 4. The _deviation_ characters are treated as _mapped_ or _valid_
80 /// as appropriate.
81 /// 5. The _disallowed_STD3_valid_ characters are treated as allowed.
82 /// 6. The _disallowed_STD3_mapped_ characters are treated as
83 /// _mapped_.
84 /// 7. The result is normalized to NFC.
85 ///
86 /// Notably:
87 ///
88 /// * The STD3 or WHATWG ASCII deny list should be implemented as a
89 /// post-processing step.
90 /// * Transitional processing is not performed. Transitional mapping
91 /// would be a pre-processing step, but transitional processing is
92 /// deprecated, and none of Firefox, Safari, or Chrome use it.
93 pub fn map_normalize<'delegate, I: Iterator<Item = char> + 'delegate>(
94 &'delegate self,
95 iter: I,
96 ) -> impl Iterator<Item = char> + 'delegate {
97 self.normalizer
98 .normalize_iter_private(iter, crate::IgnorableBehavior::Ignored)
99 }
100
101 /// Returns an iterator adaptor that turns an `Iterator` over `char`
102 /// into an iterator yielding a `char` sequence that gets the following
103 /// operations from the NFC check and statucs steps of the "Validity
104 /// Criteria" section of UTS 46 lazily applied to it:
105 ///
106 /// 1. The _ignored_ characters are treated as _disallowed_.
107 /// 2. The _mapped_ characters are mapped.
108 /// 3. The _disallowed_ characters are replaced with U+FFFD,
109 /// which itself is a disallowed character.
110 /// 4. The _deviation_ characters are treated as _mapped_ or _valid_
111 /// as appropriate.
112 /// 5. The _disallowed_STD3_valid_ characters are treated as allowed.
113 /// 6. The _disallowed_STD3_mapped_ characters are treated as
114 /// _mapped_.
115 /// 7. The result is normalized to NFC.
116 ///
117 /// Notably:
118 ///
119 /// * The STD3 or WHATWG ASCII deny list should be implemented as a
120 /// post-processing step.
121 /// * Transitional processing is not performed. Transitional mapping
122 /// would be a pre-processing step, but transitional processing is
123 /// deprecated, and none of Firefox, Safari, or Chrome use it.
124 /// * The output needs to be compared with input to see if anything
125 /// changed. This check catches failures to adhere to the normalization
126 /// and status requirements. In particular, this comparison results
127 /// in _mapped_ characters resulting in error like "Validity Criteria"
128 /// requires.
129 pub fn normalize_validate<'delegate, I: Iterator<Item = char> + 'delegate>(
130 &'delegate self,
131 iter: I,
132 ) -> impl Iterator<Item = char> + 'delegate {
133 self.normalizer
134 .normalize_iter_private(iter, crate::IgnorableBehavior::ReplacementCharacter)
135 }
136}