icu_normalizer/
uts46.rs

// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
//! Bundles the part of UTS 46 that makes sense to implement as a
//! normalization.
//!
//! This is meant to be used as a building block of a UTS 46
//! implementation, such as the `idna` crate.
10
11use crate::CanonicalCompositionsV1Marker;
12use crate::CanonicalDecompositionDataV1Marker;
13use crate::CanonicalDecompositionTablesV1Marker;
14use crate::CompatibilityDecompositionTablesV1Marker;
15use crate::ComposingNormalizer;
16use crate::NormalizerError;
17use crate::Uts46DecompositionSupplementV1Marker;
18use icu_provider::DataProvider;
19
// Implementation note: Despite merely wrapping a `ComposingNormalizer`,
// having a `Uts46Mapper` serves two purposes:
//
// 1. Denying public access to parts of the `ComposingNormalizer` API
//    that don't work when the data contains markers for ignorables.
// 2. Providing a place where additional iterator pre-processing or
//    post-processing can take place if needed in the future. (When
//    writing this, it looked like such processing was needed but
//    now isn't needed after all.)
29
30/// A mapper that knows how to performs the subsets of UTS 46 processing
31/// documented on the methods.
32#[derive(Debug)]
33pub struct Uts46Mapper {
34    normalizer: ComposingNormalizer,
35}
36
/// The default mapper is the one produced by [`Uts46Mapper::new`], so this
/// impl is only available with the `compiled_data` feature.
#[cfg(feature = "compiled_data")]
impl Default for Uts46Mapper {
    fn default() -> Self {
        Self::new()
    }
}
43
44impl Uts46Mapper {
45    /// Construct with compiled data.
46    #[cfg(feature = "compiled_data")]
47    pub const fn new() -> Self {
48        Uts46Mapper {
49            normalizer: ComposingNormalizer::new_uts46(),
50        }
51    }
52
53    /// Construct with provider.
54    #[doc = icu_provider::gen_any_buffer_unstable_docs!(UNSTABLE, Self::new)]
55    pub fn try_new<D>(provider: &D) -> Result<Self, NormalizerError>
56    where
57        D: DataProvider<CanonicalDecompositionDataV1Marker>
58            + DataProvider<Uts46DecompositionSupplementV1Marker>
59            + DataProvider<CanonicalDecompositionTablesV1Marker>
60            + DataProvider<CompatibilityDecompositionTablesV1Marker>
61            // UTS 46 tables merged into CompatibilityDecompositionTablesV1Marker
62            + DataProvider<CanonicalCompositionsV1Marker>
63            + ?Sized,
64    {
65        let normalizer = ComposingNormalizer::try_new_uts46_unstable(provider)?;
66
67        Ok(Uts46Mapper { normalizer })
68    }
69
70    /// Returns an iterator adaptor that turns an `Iterator` over `char`
71    /// into an iterator yielding a `char` sequence that gets the following
72    /// operations from the "Map" and "Normalize" steps of the "Processing"
73    /// section of UTS 46 lazily applied to it:
74    ///
75    /// 1. The _ignored_ characters are ignored.
76    /// 2. The _mapped_ characters are mapped.
77    /// 3. The _disallowed_ characters are replaced with U+FFFD,
78    ///    which itself is a disallowed character.
79    /// 4. The _deviation_ characters are treated as _mapped_ or _valid_
80    ///    as appropriate.
81    /// 5. The _disallowed_STD3_valid_ characters are treated as allowed.
82    /// 6. The _disallowed_STD3_mapped_ characters are treated as
83    ///    _mapped_.
84    /// 7. The result is normalized to NFC.
85    ///
86    /// Notably:
87    ///
88    /// * The STD3 or WHATWG ASCII deny list should be implemented as a
89    ///   post-processing step.
90    /// * Transitional processing is not performed. Transitional mapping
91    ///   would be a pre-processing step, but transitional processing is
92    ///   deprecated, and none of Firefox, Safari, or Chrome use it.
93    pub fn map_normalize<'delegate, I: Iterator<Item = char> + 'delegate>(
94        &'delegate self,
95        iter: I,
96    ) -> impl Iterator<Item = char> + 'delegate {
97        self.normalizer
98            .normalize_iter_private(iter, crate::IgnorableBehavior::Ignored)
99    }
100
101    /// Returns an iterator adaptor that turns an `Iterator` over `char`
102    /// into an iterator yielding a `char` sequence that gets the following
103    /// operations from the NFC check and statucs steps of the "Validity
104    /// Criteria" section of UTS 46 lazily applied to it:
105    ///
106    /// 1. The _ignored_ characters are treated as _disallowed_.
107    /// 2. The _mapped_ characters are mapped.
108    /// 3. The _disallowed_ characters are replaced with U+FFFD,
109    ///    which itself is a disallowed character.
110    /// 4. The _deviation_ characters are treated as _mapped_ or _valid_
111    ///    as appropriate.
112    /// 5. The _disallowed_STD3_valid_ characters are treated as allowed.
113    /// 6. The _disallowed_STD3_mapped_ characters are treated as
114    ///    _mapped_.
115    /// 7. The result is normalized to NFC.
116    ///
117    /// Notably:
118    ///
119    /// * The STD3 or WHATWG ASCII deny list should be implemented as a
120    ///   post-processing step.
121    /// * Transitional processing is not performed. Transitional mapping
122    ///   would be a pre-processing step, but transitional processing is
123    ///   deprecated, and none of Firefox, Safari, or Chrome use it.
124    /// * The output needs to be compared with input to see if anything
125    ///   changed. This check catches failures to adhere to the normalization
126    ///   and status requirements. In particular, this comparison results
127    ///   in _mapped_ characters resulting in error like "Validity Criteria"
128    ///   requires.
129    pub fn normalize_validate<'delegate, I: Iterator<Item = char> + 'delegate>(
130        &'delegate self,
131        iter: I,
132    ) -> impl Iterator<Item = char> + 'delegate {
133        self.normalizer
134            .normalize_iter_private(iter, crate::IgnorableBehavior::ReplacementCharacter)
135    }
136}