icu_properties/provider/names.rs
1// This file is part of ICU4X. For terms of use, please see the file
2// called LICENSE at the top level of the ICU4X source tree
3// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
5//! 🚧 \[Unstable\] Property names-related data for this component
6//!
7//! <div class="stab unstable">
8//! 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
9//! including in SemVer minor releases. While the serde representation of data structs is guaranteed
10//! to be stable, their Rust representation might not be. Use with caution.
11//! </div>
12//!
13//! Read more about data providers: [`icu_provider`]
14
15use alloc::boxed::Box;
16use core::cmp::Ordering;
17
18use core::str;
19
20use icu_provider::prelude::*;
21
22use tinystr::TinyStr4;
23use zerovec::ule::{UnvalidatedStr, VarULE};
24use zerovec::{maps::ZeroMapKV, VarZeroSlice, VarZeroVec, ZeroMap, ZeroVec};
25
/// This is a property name that can be "loose matched" according to
/// [PropertyValueAliases.txt](https://www.unicode.org/Public/UCD/latest/ucd/PropertyValueAliases.txt)
///
/// (matched case-insensitively in ASCII, ignoring underscores, whitespace, and hyphens)
///
/// This is expected to be ASCII, but we do not rely on this invariant anywhere except during
/// datagen.
///
/// The `Ord` impl compares loosely first and only falls back to the ordering of the wrapped
/// [`UnvalidatedStr`] when two names are loose-equal. As a result all loose-equal items sort
/// into the same area, and a map keyed by this type can be searched for both strict and loose
/// equality.
///
/// <div class="stab unstable">
/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
/// including in SemVer minor releases. While the serde representation of data structs is guaranteed
/// to be stable, their Rust representation might not be. Use with caution.
/// </div>
///
/// # Examples
///
/// Using a [`NormalizedPropertyNameStr`] as the key of a [`ZeroMap`]:
///
/// ```
/// use icu::properties::provider::names::NormalizedPropertyNameStr;
/// use zerovec::ZeroMap;
///
/// let map: ZeroMap<NormalizedPropertyNameStr, usize> = [
///     (NormalizedPropertyNameStr::from_str("A_BC"), 11),
///     (NormalizedPropertyNameStr::from_str("dEf"), 22),
///     (NormalizedPropertyNameStr::from_str("G_H-I"), 33),
/// ]
/// .into_iter()
/// .collect();
///
/// let key_approx = NormalizedPropertyNameStr::from_str("AB-C");
/// let key_exact = NormalizedPropertyNameStr::from_str("A_BC");
///
/// // Strict lookup:
/// assert_eq!(None, map.get_copied(key_approx));
/// assert_eq!(Some(11), map.get_copied(key_exact));
///
/// // Loose lookup:
/// assert_eq!(Some(11), map.get_copied_by(|u| u.cmp_loose(key_approx)));
/// assert_eq!(Some(11), map.get_copied_by(|u| u.cmp_loose(key_exact)));
/// ```
#[derive(PartialEq, Eq)] // VarULE wants these to be byte equality
#[derive(Debug, VarULE)]
#[cfg_attr(feature = "serde", derive(serde::Serialize))]
#[repr(transparent)]
pub struct NormalizedPropertyNameStr(UnvalidatedStr);
75
/// Deserializes an owned name by deserializing a boxed [`UnvalidatedStr`] and re-wrapping it.
///
/// This impl requires enabling the optional `serde` Cargo feature of the `icu::properties` crate
#[cfg(feature = "serde")]
impl<'de> serde::Deserialize<'de> for Box<NormalizedPropertyNameStr> {
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: serde::Deserializer<'de>,
    {
        // Errors from the inner deserializer are propagated unchanged.
        let unvalidated =
            <Box<UnvalidatedStr> as serde::Deserialize<'de>>::deserialize(deserializer)?;
        Ok(NormalizedPropertyNameStr::cast_box(unvalidated))
    }
}
86
/// Deserializes a borrowed name by deserializing a borrowed [`UnvalidatedStr`] and
/// reinterpreting the reference.
///
/// This impl requires enabling the optional `serde` Cargo feature of the `icu::properties` crate
#[cfg(feature = "serde")]
impl<'de, 'a> serde::Deserialize<'de> for &'a NormalizedPropertyNameStr
where
    'de: 'a,
{
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: serde::Deserializer<'de>,
    {
        // Errors from the inner deserializer are propagated unchanged.
        let unvalidated =
            <&'a UnvalidatedStr as serde::Deserialize<'de>>::deserialize(deserializer)?;
        Ok(NormalizedPropertyNameStr::cast_ref(unvalidated))
    }
}
100
101impl<'a> ZeroMapKV<'a> for NormalizedPropertyNameStr {
102 type Container = VarZeroVec<'a, NormalizedPropertyNameStr>;
103 type Slice = VarZeroSlice<NormalizedPropertyNameStr>;
104 type GetType = NormalizedPropertyNameStr;
105 type OwnedType = Box<NormalizedPropertyNameStr>;
106}
107
108/// The Ord/PartialOrd impl will sort things using strict equality, but in such a way that all loose-equal items
109/// will sort into the same area, such that a map can be searched for both strict and loose equality.
110impl PartialOrd for NormalizedPropertyNameStr {
111 fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
112 Some(self.cmp(other))
113 }
114}
115
/// Apply the "loose matching" normalization described in PropertyValueAliases.txt to one byte.
///
/// Returns `None` for bytes that loose matching skips (ASCII whitespace including the vertical
/// tab, underscores and hyphens); every other byte is returned ASCII-lowercased.
///
/// ICU has [code for this][1] (and [during property lookup][2]) which we emulate.
/// In particular, ICU only does normalization within ASCII, which makes sense since character names
/// seem to be only ASCII.
///
/// [1]: https://github.com/unicode-org/icu/blob/288c4c7555915ce7b1fb675d94ddd495058fc039/icu4c/source/common/propname.cpp#L35
/// [2]: https://github.com/unicode-org/icu/blob/288c4c7555915ce7b1fb675d94ddd495058fc039/icu4c/source/common/propname.cpp#L226-L230
fn normalize_char(ch: u8) -> Option<u8> {
    // 0x09..=0x0D covers TAB, LF, VT, FF and CR: the ASCII whitespace controls plus the
    // vertical tab, which `u8::is_ascii_whitespace` does not report.
    let skippable = matches!(ch, b' ' | 0x09..=0x0D | b'_' | b'-');
    if skippable {
        None
    } else {
        // Case is ignored by folding to ASCII lowercase; non-ASCII bytes pass through as-is.
        Some(ch.to_ascii_lowercase())
    }
}
136
137/// The Ord impl will sort things using strict equality, but in such a way that all loose-equal items
138/// will sort into the same area, such that a map can be searched for both strict and loose equality.
139impl Ord for NormalizedPropertyNameStr {
140 fn cmp(&self, other: &Self) -> Ordering {
141 let cmp = self.cmp_loose(other);
142 // When loose equality holds, fall back to strict equality
143 if cmp == Ordering::Equal {
144 self.0.cmp(&other.0)
145 } else {
146 cmp
147 }
148 }
149}
150
151impl NormalizedPropertyNameStr {
152 /// Perform the loose comparison as defined in [`NormalizedPropertyNameStr`].
153 pub fn cmp_loose(&self, other: &Self) -> Ordering {
154 let self_iter = self.0.iter().copied().filter_map(normalize_char);
155 let other_iter = other.0.iter().copied().filter_map(normalize_char);
156 self_iter.cmp(other_iter)
157 }
158
159 /// Convert a string reference to a [`NormalizedPropertyNameStr`].
160 pub const fn from_str(s: &str) -> &Self {
161 Self::cast_ref(UnvalidatedStr::from_str(s))
162 }
163
164 /// Convert a [`UnvalidatedStr`] reference to a [`NormalizedPropertyNameStr`] reference.
165 pub const fn cast_ref(value: &UnvalidatedStr) -> &Self {
166 // Safety: repr(transparent)
167 unsafe { core::mem::transmute(value) }
168 }
169
170 /// Convert a [`UnvalidatedStr`] box to a [`NormalizedPropertyNameStr`] box.
171 pub const fn cast_box(value: Box<UnvalidatedStr>) -> Box<Self> {
172 // Safety: repr(transparent)
173 unsafe { core::mem::transmute(value) }
174 }
175
176 /// Get a [`NormalizedPropertyNameStr`] box from a byte slice.
177 pub fn boxed_from_bytes(b: &[u8]) -> Box<Self> {
178 Self::cast_box(UnvalidatedStr::from_boxed_bytes(b.into()))
179 }
180}
181
/// A mapping of property value names to their enum discriminants.
///
/// Keys are [`NormalizedPropertyNameStr`]s, so the map can be searched either strictly (through
/// the key's `Ord` impl) or loosely (through [`NormalizedPropertyNameStr::cmp_loose`]).
///
/// <div class="stab unstable">
/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
/// including in SemVer minor releases. While the serde representation of data structs is guaranteed
/// to be stable, their Rust representation might not be. Use with caution.
/// </div>
#[derive(Debug, Clone, PartialEq)]
#[icu_provider::data_struct(marker(
    GeneralCategoryMaskNameToValueV1Marker,
    "propnames/from/gcm@1",
    singleton,
))]
#[cfg_attr(
    feature = "datagen",
    derive(serde::Serialize, databake::Bake),
    databake(path = icu_properties::provider::names),
)]
#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
#[yoke(prove_covariance_manually)]
pub struct PropertyValueNameToEnumMapV1<'data> {
    /// A map from names to their value discriminant
    #[cfg_attr(feature = "serde", serde(borrow))]
    pub map: ZeroMap<'data, NormalizedPropertyNameStr, u16>,
}
207
/// A mapping of property values to their names. A single instance of this map will only cover
/// either long or short names, determined whilst loading data.
///
/// This variant stores the names in a [`ZeroMap`] keyed by the value discriminant.
///
/// <div class="stab unstable">
/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
/// including in SemVer minor releases. While the serde representation of data structs is guaranteed
/// to be stable, their Rust representation might not be. Use with caution.
/// </div>
#[derive(Debug, Clone, PartialEq)]
#[icu_provider::data_struct]
#[cfg_attr(
    feature = "datagen",
    derive(serde::Serialize, databake::Bake),
    databake(path = icu_properties::provider::names),
)]
#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
#[yoke(prove_covariance_manually)]
pub struct PropertyEnumToValueNameSparseMapV1<'data> {
    /// A map from the value discriminant to the names
    #[cfg_attr(feature = "serde", serde(borrow))]
    pub map: ZeroMap<'data, u16, str>,
}
230
/// A mapping of property values to their names. A single instance of this map will only cover
/// either long or short names, determined whilst loading data.
///
/// This variant stores the names in a [`VarZeroVec`] indexed by the value discriminant; an
/// empty string marks a discriminant that has no name.
///
/// <div class="stab unstable">
/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
/// including in SemVer minor releases. While the serde representation of data structs is guaranteed
/// to be stable, their Rust representation might not be. Use with caution.
/// </div>
#[derive(Debug, Clone, PartialEq)]
#[icu_provider::data_struct]
#[cfg_attr(
    feature = "datagen",
    derive(serde::Serialize, databake::Bake),
    databake(path = icu_properties::provider::names),
)]
#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
#[yoke(prove_covariance_manually)]
pub struct PropertyEnumToValueNameLinearMapV1<'data> {
    /// A map from the value discriminant (the index) to the names, for mostly
    /// contiguous data. Empty strings count as missing.
    #[cfg_attr(feature = "serde", serde(borrow))]
    pub map: VarZeroVec<'data, str>,
}
254
/// A mapping of property values to their names. A single instance of this map will only cover
/// either long or short names, determined whilst loading data.
///
/// This variant stores the names as [`TinyStr4`] values in a [`ZeroVec`] indexed by the value
/// discriminant; an empty string marks a discriminant that has no name.
///
/// <div class="stab unstable">
/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
/// including in SemVer minor releases. While the serde representation of data structs is guaranteed
/// to be stable, their Rust representation might not be. Use with caution.
/// </div>
#[derive(Debug, Clone, PartialEq)]
#[icu_provider::data_struct]
#[cfg_attr(
    feature = "datagen",
    derive(serde::Serialize, databake::Bake),
    databake(path = icu_properties::provider::names),
)]
#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
#[yoke(prove_covariance_manually)]
pub struct PropertyEnumToValueNameLinearTiny4MapV1<'data> {
    /// A map from the value discriminant (the index) to the names, for mostly
    /// contiguous data. Empty strings count as missing.
    #[cfg_attr(feature = "serde", serde(borrow))]
    pub map: ZeroVec<'data, TinyStr4>,
}