icu_normalizer/provider.rs
1// This file is part of ICU4X. For terms of use, please see the file
2// called LICENSE at the top level of the ICU4X source tree
3// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
5//! 🚧 \[Unstable\] Data provider struct definitions for this ICU4X component.
6//!
7//! <div class="stab unstable">
8//! 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
9//! including in SemVer minor releases. While the serde representation of data structs is guaranteed
10//! to be stable, their Rust representation might not be. Use with caution.
11//! </div>
12//!
13//! Read more about data providers: [`icu_provider`]
14
15// Provider structs must be stable
16#![allow(clippy::exhaustive_structs, clippy::exhaustive_enums)]
17
18use icu_collections::char16trie::Char16Trie;
19use icu_collections::codepointtrie::CodePointTrie;
20use icu_provider::prelude::*;
21use zerovec::ZeroVec;
22
/// Unit struct representing the data baked into this crate.
///
/// Only compiled when the `compiled_data` feature is enabled.
///
/// <div class="stab unstable">
/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
/// including in SemVer minor releases. In particular, the `DataProvider` implementations are only
/// guaranteed to match with this version's `*_unstable` providers. Use with caution.
/// </div>
#[cfg(feature = "compiled_data")]
#[derive(Debug)]
pub struct Baked;
33
// Wires the `Baked` struct up to the macros exported by `icu_normalizer_data`.
// Only compiled when the `compiled_data` feature is enabled. The anonymous
// constant keeps the helper module `icu` local to this block.
#[cfg(feature = "compiled_data")]
const _: () = {
    // Aliases: `icu::normalizer` is this crate, `icu::collections` is
    // `icu_collections`.
    // NOTE(review): presumably the macro expansions below refer to these
    // paths — confirm against `icu_normalizer_data`.
    pub mod icu {
        pub use crate as normalizer;
        pub use icu_collections as collections;
    }
    // One macro invocation per data key family, each applied to `Baked`.
    icu_normalizer_data::make_provider!(Baked);
    icu_normalizer_data::impl_normalizer_comp_v1!(Baked);
    icu_normalizer_data::impl_normalizer_decomp_v1!(Baked);
    icu_normalizer_data::impl_normalizer_nfd_v1!(Baked);
    icu_normalizer_data::impl_normalizer_nfdex_v1!(Baked);
    icu_normalizer_data::impl_normalizer_nfkd_v1!(Baked);
    icu_normalizer_data::impl_normalizer_nfkdex_v1!(Baked);
    icu_normalizer_data::impl_normalizer_uts46d_v1!(Baked);
};
49
/// Minimum set of data keys this component requires, as of the latest version.
///
/// Only compiled when the `datagen` feature is enabled.
#[cfg(feature = "datagen")]
pub const KEYS: &[DataKey] = &[
    CanonicalCompositionsV1Marker::KEY,
    CanonicalDecompositionDataV1Marker::KEY,
    CanonicalDecompositionTablesV1Marker::KEY,
    CompatibilityDecompositionSupplementV1Marker::KEY,
    CompatibilityDecompositionTablesV1Marker::KEY,
    NonRecursiveDecompositionSupplementV1Marker::KEY,
    Uts46DecompositionSupplementV1Marker::KEY,
];
61
62/// Main data for NFD
63///
64/// <div class="stab unstable">
65/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
66/// including in SemVer minor releases. While the serde representation of data structs is guaranteed
67/// to be stable, their Rust representation might not be. Use with caution.
68/// </div>
69#[icu_provider::data_struct(marker(
70 CanonicalDecompositionDataV1Marker,
71 "normalizer/nfd@1",
72 singleton
73))]
74#[derive(Debug, PartialEq, Clone)]
75#[cfg_attr(feature = "datagen", derive(serde::Serialize, databake::Bake), databake(path = icu_normalizer::provider))]
76#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
77pub struct DecompositionDataV1<'data> {
78 /// Trie for NFD decomposition.
79 #[cfg_attr(feature = "serde", serde(borrow))]
80 pub trie: CodePointTrie<'data, u32>,
81}
82
83/// Data that either NFKD or the decomposed form of UTS 46 needs
84/// _in addition to_ the NFD data.
85///
86/// <div class="stab unstable">
87/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
88/// including in SemVer minor releases. While the serde representation of data structs is guaranteed
89/// to be stable, their Rust representation might not be. Use with caution.
90/// </div>
91#[icu_provider::data_struct(
92 marker(
93 CompatibilityDecompositionSupplementV1Marker,
94 "normalizer/nfkd@1",
95 singleton
96 ),
97 marker(Uts46DecompositionSupplementV1Marker, "normalizer/uts46d@1", singleton)
98)]
99#[derive(Debug, PartialEq, Clone)]
100#[cfg_attr(feature = "datagen", derive(serde::Serialize, databake::Bake), databake(path = icu_normalizer::provider))]
101#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
102pub struct DecompositionSupplementV1<'data> {
103 /// Trie for the decompositions that differ from NFD.
104 /// Getting a zero from this trie means that you need
105 /// to make another lookup from `DecompositionDataV1::trie`.
106 #[cfg_attr(feature = "serde", serde(borrow))]
107 pub trie: CodePointTrie<'data, u32>,
108 /// Flags that indicate how the set of characters whose
109 /// decompositions starts with a non-starter differs from
110 /// the set for NFD.
111 ///
112 /// Bit 0: Whether half-width kana voicing marks decompose
113 /// into non-starters (their full-width combining
114 /// counterparts).
115 /// Bit 1: Whether U+0345 COMBINING GREEK YPOGEGRAMMENI
116 /// decomposes into a starter (U+03B9 GREEK SMALL
117 /// LETTER IOTA).
118 /// (Other bits unused.)
119 pub flags: u8,
120 /// The passthrough bounds of NFD/NFC are lowered to this
121 /// maximum instead. (16-bit, because cannot be higher
122 /// than 0x0300, which is the bound for NFC.)
123 pub passthrough_cap: u16,
124}
125
126impl DecompositionSupplementV1<'_> {
127 const HALF_WIDTH_VOICING_MARK_MASK: u8 = 1;
128
129 /// Whether half-width kana voicing marks decompose into non-starters
130 /// (their full-width combining counterparts).
131 pub fn half_width_voicing_marks_become_non_starters(&self) -> bool {
132 (self.flags & DecompositionSupplementV1::HALF_WIDTH_VOICING_MARK_MASK) != 0
133 }
134}
135
136/// The expansion tables for cases where the decomposition isn't
137/// contained in the trie value
138///
139/// <div class="stab unstable">
140/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
141/// including in SemVer minor releases. While the serde representation of data structs is guaranteed
142/// to be stable, their Rust representation might not be. Use with caution.
143/// </div>
144#[icu_provider::data_struct(
145 marker(CanonicalDecompositionTablesV1Marker, "normalizer/nfdex@1", singleton),
146 marker(
147 CompatibilityDecompositionTablesV1Marker,
148 "normalizer/nfkdex@1",
149 singleton
150 )
151)]
152#[derive(Debug, PartialEq, Clone)]
153#[cfg_attr(feature = "datagen", derive(serde::Serialize, databake::Bake), databake(path = icu_normalizer::provider))]
154#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
155pub struct DecompositionTablesV1<'data> {
156 /// Decompositions that are fully within the BMP
157 #[cfg_attr(feature = "serde", serde(borrow))]
158 pub scalars16: ZeroVec<'data, u16>,
159 /// Decompositions with at least one character outside
160 /// the BMP
161 #[cfg_attr(feature = "serde", serde(borrow))]
162 pub scalars24: ZeroVec<'data, char>,
163}
164
165/// Non-Hangul canonical compositions
166///
167/// <div class="stab unstable">
168/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
169/// including in SemVer minor releases. While the serde representation of data structs is guaranteed
170/// to be stable, their Rust representation might not be. Use with caution.
171/// </div>
172#[icu_provider::data_struct(marker(CanonicalCompositionsV1Marker, "normalizer/comp@1", singleton))]
173#[derive(Debug, PartialEq, Clone)]
174#[cfg_attr(feature = "datagen", derive(serde::Serialize, databake::Bake), databake(path = icu_normalizer::provider))]
175#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
176pub struct CanonicalCompositionsV1<'data> {
177 /// Trie keys are two-`char` strings with the second
178 /// character coming first. The value, if any, is the
179 /// (non-Hangul) canonical composition.
180 #[cfg_attr(feature = "serde", serde(borrow))]
181 pub canonical_compositions: Char16Trie<'data>,
182}
183
184/// Non-recursive canonical decompositions that differ from
185/// `DecompositionDataV1`.
186///
187/// <div class="stab unstable">
188/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
189/// including in SemVer minor releases. While the serde representation of data structs is guaranteed
190/// to be stable, their Rust representation might not be. Use with caution.
191/// </div>
192#[icu_provider::data_struct(marker(
193 NonRecursiveDecompositionSupplementV1Marker,
194 "normalizer/decomp@1",
195 singleton
196))]
197#[derive(Debug, PartialEq, Clone)]
198#[cfg_attr(feature = "datagen", derive(serde::Serialize, databake::Bake), databake(path = icu_normalizer::provider))]
199#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
200pub struct NonRecursiveDecompositionSupplementV1<'data> {
201 /// Trie for the supplementary non-recursive decompositions
202 #[cfg_attr(feature = "serde", serde(borrow))]
203 pub trie: CodePointTrie<'data, u32>,
204 /// Decompositions with at least one character outside
205 /// the BMP
206 #[cfg_attr(feature = "serde", serde(borrow))]
207 pub scalars24: ZeroVec<'data, char>,
208}