// icu_normalizer/provider.rs

// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).

//! 🚧 \[Unstable\] Data provider struct definitions for this ICU4X component.
//!
//! <div class="stab unstable">
//! 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
//! including in SemVer minor releases. While the serde representation of data structs is guaranteed
//! to be stable, their Rust representation might not be. Use with caution.
//! </div>
//!
//! Read more about data providers: [`icu_provider`]

15// Provider structs must be stable
16#![allow(clippy::exhaustive_structs, clippy::exhaustive_enums)]
17
18use icu_collections::char16trie::Char16Trie;
19use icu_collections::codepointtrie::CodePointTrie;
20use icu_provider::prelude::*;
21use zerovec::ZeroVec;
22
/// Baked data
///
/// This unit struct only exists when the `compiled_data` Cargo feature is
/// enabled.
///
/// <div class="stab unstable">
/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
/// including in SemVer minor releases. In particular, the `DataProvider` implementations are only
/// guaranteed to match with this version's `*_unstable` providers. Use with caution.
/// </div>
#[cfg(feature = "compiled_data")]
#[derive(Debug)]
pub struct Baked;

// Attaches the compiled-in data to `Baked` by invoking the macros exported by
// `icu_normalizer_data`. Everything lives inside an anonymous `const` block,
// so the helper `icu` module below is not nameable from outside it.
#[cfg(feature = "compiled_data")]
const _: () = {
    // Re-exports this crate as `icu::normalizer` and `icu_collections` as
    // `icu::collections`.
    // NOTE(review): presumably the macros below expand to paths rooted at
    // `icu::…` — confirm against `icu_normalizer_data`.
    pub mod icu {
        pub use crate as normalizer;
        pub use icu_collections as collections;
    }
    icu_normalizer_data::make_provider!(Baked);
    // One macro invocation per data key; the macro names mirror the key paths
    // declared in this file (`normalizer/comp@1`, `normalizer/decomp@1`, …).
    icu_normalizer_data::impl_normalizer_comp_v1!(Baked);
    icu_normalizer_data::impl_normalizer_decomp_v1!(Baked);
    icu_normalizer_data::impl_normalizer_nfd_v1!(Baked);
    icu_normalizer_data::impl_normalizer_nfdex_v1!(Baked);
    icu_normalizer_data::impl_normalizer_nfkd_v1!(Baked);
    icu_normalizer_data::impl_normalizer_nfkdex_v1!(Baked);
    icu_normalizer_data::impl_normalizer_uts46d_v1!(Baked);
};

/// The latest minimum set of keys required by this component.
///
/// Only available when the `datagen` Cargo feature is enabled. The list holds
/// the `KEY` of each of the seven markers declared in the visible part of
/// this file.
#[cfg(feature = "datagen")]
pub const KEYS: &[DataKey] = &[
    CanonicalCompositionsV1Marker::KEY,
    CanonicalDecompositionDataV1Marker::KEY,
    CanonicalDecompositionTablesV1Marker::KEY,
    CompatibilityDecompositionSupplementV1Marker::KEY,
    CompatibilityDecompositionTablesV1Marker::KEY,
    NonRecursiveDecompositionSupplementV1Marker::KEY,
    Uts46DecompositionSupplementV1Marker::KEY,
];

62/// Main data for NFD
63///
64/// <div class="stab unstable">
65/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
66/// including in SemVer minor releases. While the serde representation of data structs is guaranteed
67/// to be stable, their Rust representation might not be. Use with caution.
68/// </div>
69#[icu_provider::data_struct(marker(
70    CanonicalDecompositionDataV1Marker,
71    "normalizer/nfd@1",
72    singleton
73))]
74#[derive(Debug, PartialEq, Clone)]
75#[cfg_attr(feature = "datagen", derive(serde::Serialize, databake::Bake), databake(path = icu_normalizer::provider))]
76#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
77pub struct DecompositionDataV1<'data> {
78    /// Trie for NFD decomposition.
79    #[cfg_attr(feature = "serde", serde(borrow))]
80    pub trie: CodePointTrie<'data, u32>,
81}
82
83/// Data that either NFKD or the decomposed form of UTS 46 needs
84/// _in addition to_ the NFD data.
85///
86/// <div class="stab unstable">
87/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
88/// including in SemVer minor releases. While the serde representation of data structs is guaranteed
89/// to be stable, their Rust representation might not be. Use with caution.
90/// </div>
91#[icu_provider::data_struct(
92    marker(
93        CompatibilityDecompositionSupplementV1Marker,
94        "normalizer/nfkd@1",
95        singleton
96    ),
97    marker(Uts46DecompositionSupplementV1Marker, "normalizer/uts46d@1", singleton)
98)]
99#[derive(Debug, PartialEq, Clone)]
100#[cfg_attr(feature = "datagen", derive(serde::Serialize, databake::Bake), databake(path = icu_normalizer::provider))]
101#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
102pub struct DecompositionSupplementV1<'data> {
103    /// Trie for the decompositions that differ from NFD.
104    /// Getting a zero from this trie means that you need
105    /// to make another lookup from `DecompositionDataV1::trie`.
106    #[cfg_attr(feature = "serde", serde(borrow))]
107    pub trie: CodePointTrie<'data, u32>,
108    /// Flags that indicate how the set of characters whose
109    /// decompositions starts with a non-starter differs from
110    /// the set for NFD.
111    ///
112    /// Bit 0: Whether half-width kana voicing marks decompose
113    ///        into non-starters (their full-width combining
114    ///        counterparts).
115    /// Bit 1: Whether U+0345 COMBINING GREEK YPOGEGRAMMENI
116    ///        decomposes into a starter (U+03B9 GREEK SMALL
117    ///        LETTER IOTA).
118    /// (Other bits unused.)
119    pub flags: u8,
120    /// The passthrough bounds of NFD/NFC are lowered to this
121    /// maximum instead. (16-bit, because cannot be higher
122    /// than 0x0300, which is the bound for NFC.)
123    pub passthrough_cap: u16,
124}
125
126impl DecompositionSupplementV1<'_> {
127    const HALF_WIDTH_VOICING_MARK_MASK: u8 = 1;
128
129    /// Whether half-width kana voicing marks decompose into non-starters
130    /// (their full-width combining counterparts).
131    pub fn half_width_voicing_marks_become_non_starters(&self) -> bool {
132        (self.flags & DecompositionSupplementV1::HALF_WIDTH_VOICING_MARK_MASK) != 0
133    }
134}
135
136/// The expansion tables for cases where the decomposition isn't
137/// contained in the trie value
138///
139/// <div class="stab unstable">
140/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
141/// including in SemVer minor releases. While the serde representation of data structs is guaranteed
142/// to be stable, their Rust representation might not be. Use with caution.
143/// </div>
144#[icu_provider::data_struct(
145    marker(CanonicalDecompositionTablesV1Marker, "normalizer/nfdex@1", singleton),
146    marker(
147        CompatibilityDecompositionTablesV1Marker,
148        "normalizer/nfkdex@1",
149        singleton
150    )
151)]
152#[derive(Debug, PartialEq, Clone)]
153#[cfg_attr(feature = "datagen", derive(serde::Serialize, databake::Bake), databake(path = icu_normalizer::provider))]
154#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
155pub struct DecompositionTablesV1<'data> {
156    /// Decompositions that are fully within the BMP
157    #[cfg_attr(feature = "serde", serde(borrow))]
158    pub scalars16: ZeroVec<'data, u16>,
159    /// Decompositions with at least one character outside
160    /// the BMP
161    #[cfg_attr(feature = "serde", serde(borrow))]
162    pub scalars24: ZeroVec<'data, char>,
163}
164
165/// Non-Hangul canonical compositions
166///
167/// <div class="stab unstable">
168/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
169/// including in SemVer minor releases. While the serde representation of data structs is guaranteed
170/// to be stable, their Rust representation might not be. Use with caution.
171/// </div>
172#[icu_provider::data_struct(marker(CanonicalCompositionsV1Marker, "normalizer/comp@1", singleton))]
173#[derive(Debug, PartialEq, Clone)]
174#[cfg_attr(feature = "datagen", derive(serde::Serialize, databake::Bake), databake(path = icu_normalizer::provider))]
175#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
176pub struct CanonicalCompositionsV1<'data> {
177    /// Trie keys are two-`char` strings with the second
178    /// character coming first. The value, if any, is the
179    /// (non-Hangul) canonical composition.
180    #[cfg_attr(feature = "serde", serde(borrow))]
181    pub canonical_compositions: Char16Trie<'data>,
182}
183
184/// Non-recursive canonical decompositions that differ from
185/// `DecompositionDataV1`.
186///
187/// <div class="stab unstable">
188/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
189/// including in SemVer minor releases. While the serde representation of data structs is guaranteed
190/// to be stable, their Rust representation might not be. Use with caution.
191/// </div>
192#[icu_provider::data_struct(marker(
193    NonRecursiveDecompositionSupplementV1Marker,
194    "normalizer/decomp@1",
195    singleton
196))]
197#[derive(Debug, PartialEq, Clone)]
198#[cfg_attr(feature = "datagen", derive(serde::Serialize, databake::Bake), databake(path = icu_normalizer::provider))]
199#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
200pub struct NonRecursiveDecompositionSupplementV1<'data> {
201    /// Trie for the supplementary non-recursive decompositions
202    #[cfg_attr(feature = "serde", serde(borrow))]
203    pub trie: CodePointTrie<'data, u32>,
204    /// Decompositions with at least one character outside
205    /// the BMP
206    #[cfg_attr(feature = "serde", serde(borrow))]
207    pub scalars24: ZeroVec<'data, char>,
208}