icu_properties/provider/
names.rs

// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).

//! 🚧 \[Unstable\] Property names-related data for this component
//!
//! <div class="stab unstable">
//! 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
//! including in SemVer minor releases. While the serde representation of data structs is guaranteed
//! to be stable, their Rust representation might not be. Use with caution.
//! </div>
//!
//! Read more about data providers: [`icu_provider`]

15use alloc::boxed::Box;
16use core::cmp::Ordering;
17
18use core::str;
19
20use icu_provider::prelude::*;
21
22use tinystr::TinyStr4;
23use zerovec::ule::{UnvalidatedStr, VarULE};
24use zerovec::{maps::ZeroMapKV, VarZeroSlice, VarZeroVec, ZeroMap, ZeroVec};
25
26/// This is a property name that can be "loose matched" as according to
27/// [PropertyValueAliases.txt](https://www.unicode.org/Public/UCD/latest/ucd/PropertyValueAliases.txt)
28///
29/// (matched case-insensitively in ASCII, ignoring underscores, whitespace, and hyphens)
30///
31/// This is expected to be ASCII, but we do not rely on this invariant anywhere except during
32/// datagen.
33///
34/// The Ord impl will sort things using strict equality, but in such a way that all loose-equal items
35/// will sort into the same area, such that a map can be searched for both strict and loose equality.
36///
37/// <div class="stab unstable">
38/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
39/// including in SemVer minor releases. While the serde representation of data structs is guaranteed
40/// to be stable, their Rust representation might not be. Use with caution.
41/// </div>
42///
43/// # Examples
44///
45/// Using a [`NormalizedPropertyNameStr`] as the key of a [`ZeroMap`]:
46///
47/// ```
48/// use icu::properties::provider::names::NormalizedPropertyNameStr;
49/// use zerovec::ZeroMap;
50///
51/// let map: ZeroMap<NormalizedPropertyNameStr, usize> = [
52///     (NormalizedPropertyNameStr::from_str("A_BC"), 11),
53///     (NormalizedPropertyNameStr::from_str("dEf"), 22),
54///     (NormalizedPropertyNameStr::from_str("G_H-I"), 33),
55/// ]
56/// .into_iter()
57/// .collect();
58///
59/// let key_approx = NormalizedPropertyNameStr::from_str("AB-C");
60/// let key_exact = NormalizedPropertyNameStr::from_str("A_BC");
61///
62/// // Strict lookup:
63/// assert_eq!(None, map.get_copied(key_approx));
64/// assert_eq!(Some(11), map.get_copied(key_exact));
65///
66/// // Loose lookup:
67/// assert_eq!(Some(11), map.get_copied_by(|u| u.cmp_loose(key_approx)));
68/// assert_eq!(Some(11), map.get_copied_by(|u| u.cmp_loose(key_exact)));
69/// ```
70#[derive(PartialEq, Eq)] // VarULE wants these to be byte equality
71#[derive(Debug, VarULE)]
72#[cfg_attr(feature = "serde", derive(serde::Serialize))]
73#[repr(transparent)]
74pub struct NormalizedPropertyNameStr(UnvalidatedStr);
75
/// This impl requires enabling the optional `serde` Cargo feature of the `icu::properties` crate
///
/// The value is deserialized as a boxed [`UnvalidatedStr`] and then re-wrapped via
/// [`NormalizedPropertyNameStr::cast_box`]; no normalization or validation is performed here.
#[cfg(feature = "serde")]
impl<'de> serde::Deserialize<'de> for Box<NormalizedPropertyNameStr> {
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: serde::Deserializer<'de>,
    {
        <Box<UnvalidatedStr>>::deserialize(deserializer).map(NormalizedPropertyNameStr::cast_box)
    }
}
86
/// This impl requires enabling the optional `serde` Cargo feature of the `icu::properties` crate
///
/// The value is deserialized as a borrowed [`UnvalidatedStr`] (hence the `'de: 'a` bound) and
/// then reinterpreted via [`NormalizedPropertyNameStr::cast_ref`]; no normalization or
/// validation is performed here.
#[cfg(feature = "serde")]
impl<'de, 'a> serde::Deserialize<'de> for &'a NormalizedPropertyNameStr
where
    'de: 'a,
{
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: serde::Deserializer<'de>,
    {
        <&UnvalidatedStr>::deserialize(deserializer).map(NormalizedPropertyNameStr::cast_ref)
    }
}
100
101impl<'a> ZeroMapKV<'a> for NormalizedPropertyNameStr {
102    type Container = VarZeroVec<'a, NormalizedPropertyNameStr>;
103    type Slice = VarZeroSlice<NormalizedPropertyNameStr>;
104    type GetType = NormalizedPropertyNameStr;
105    type OwnedType = Box<NormalizedPropertyNameStr>;
106}
107
108/// The Ord/PartialOrd impl will sort things using strict equality, but in such a way that all loose-equal items
109/// will sort into the same area, such that a map can be searched for both strict and loose equality.
110impl PartialOrd for NormalizedPropertyNameStr {
111    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
112        Some(self.cmp(other))
113    }
114}
115
/// Normalize a character based on the "loose matching" described in PropertyValueAliases.txt,
/// returning `None` for skippable characters
///
/// Skippable characters are ASCII whitespace (including the vertical tab), underscores, and
/// hyphens. Every other byte is returned ASCII-lowercased; non-ASCII bytes pass through
/// unchanged.
///
/// ICU has [code for this][1] (and [during property lookup][2]) which we emulate.
/// In particular, ICU only does normalization within ASCII, which makes sense since character names
/// seem to be only ASCII.
///
/// [1]: https://github.com/unicode-org/icu/blob/288c4c7555915ce7b1fb675d94ddd495058fc039/icu4c/source/common/propname.cpp#L35
/// [2]: https://github.com/unicode-org/icu/blob/288c4c7555915ce7b1fb675d94ddd495058fc039/icu4c/source/common/propname.cpp#L226-L230
fn normalize_char(ch: u8) -> Option<u8> {
    // The vertical tab (0x0B) is listed explicitly because `is_ascii_whitespace()`
    // does not cover it.
    let skippable = ch.is_ascii_whitespace() || matches!(ch, b'_' | b'-' | 0x0B);
    if skippable {
        None
    } else {
        // ignore case by lowercasing
        Some(ch.to_ascii_lowercase())
    }
}
136
137/// The Ord impl will sort things using strict equality, but in such a way that all loose-equal items
138/// will sort into the same area, such that a map can be searched for both strict and loose equality.
139impl Ord for NormalizedPropertyNameStr {
140    fn cmp(&self, other: &Self) -> Ordering {
141        let cmp = self.cmp_loose(other);
142        // When loose equality holds, fall back to strict equality
143        if cmp == Ordering::Equal {
144            self.0.cmp(&other.0)
145        } else {
146            cmp
147        }
148    }
149}
150
151impl NormalizedPropertyNameStr {
152    /// Perform the loose comparison as defined in [`NormalizedPropertyNameStr`].
153    pub fn cmp_loose(&self, other: &Self) -> Ordering {
154        let self_iter = self.0.iter().copied().filter_map(normalize_char);
155        let other_iter = other.0.iter().copied().filter_map(normalize_char);
156        self_iter.cmp(other_iter)
157    }
158
159    /// Convert a string reference to a [`NormalizedPropertyNameStr`].
160    pub const fn from_str(s: &str) -> &Self {
161        Self::cast_ref(UnvalidatedStr::from_str(s))
162    }
163
164    /// Convert a [`UnvalidatedStr`] reference to a [`NormalizedPropertyNameStr`] reference.
165    pub const fn cast_ref(value: &UnvalidatedStr) -> &Self {
166        // Safety: repr(transparent)
167        unsafe { core::mem::transmute(value) }
168    }
169
170    /// Convert a [`UnvalidatedStr`] box to a [`NormalizedPropertyNameStr`] box.
171    pub const fn cast_box(value: Box<UnvalidatedStr>) -> Box<Self> {
172        // Safety: repr(transparent)
173        unsafe { core::mem::transmute(value) }
174    }
175
176    /// Get a [`NormalizedPropertyNameStr`] box from a byte slice.
177    pub fn boxed_from_bytes(b: &[u8]) -> Box<Self> {
178        Self::cast_box(UnvalidatedStr::from_boxed_bytes(b.into()))
179    }
180}
181
182/// A set of characters and strings which share a particular property value.
183///
184/// <div class="stab unstable">
185/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
186/// including in SemVer minor releases. While the serde representation of data structs is guaranteed
187/// to be stable, their Rust representation might not be. Use with caution.
188/// </div>
189#[derive(Debug, Clone, PartialEq)]
190#[icu_provider::data_struct(marker(
191    GeneralCategoryMaskNameToValueV1Marker,
192    "propnames/from/gcm@1",
193    singleton,
194))]
195#[cfg_attr(
196    feature = "datagen", 
197    derive(serde::Serialize, databake::Bake),
198    databake(path = icu_properties::provider::names),
199)]
200#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
201#[yoke(prove_covariance_manually)]
202pub struct PropertyValueNameToEnumMapV1<'data> {
203    /// A map from names to their value discriminant
204    #[cfg_attr(feature = "serde", serde(borrow))]
205    pub map: ZeroMap<'data, NormalizedPropertyNameStr, u16>,
206}
207
208/// A mapping of property values to their names. A single instance of this map will only cover
209/// either long or short names, determined whilst loading data.
210///
211/// <div class="stab unstable">
212/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
213/// including in SemVer minor releases. While the serde representation of data structs is guaranteed
214/// to be stable, their Rust representation might not be. Use with caution.
215/// </div>
216#[derive(Debug, Clone, PartialEq)]
217#[icu_provider::data_struct]
218#[cfg_attr(
219    feature = "datagen", 
220    derive(serde::Serialize, databake::Bake),
221    databake(path = icu_properties::provider::names),
222)]
223#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
224#[yoke(prove_covariance_manually)]
225pub struct PropertyEnumToValueNameSparseMapV1<'data> {
226    /// A map from the value discriminant to the names
227    #[cfg_attr(feature = "serde", serde(borrow))]
228    pub map: ZeroMap<'data, u16, str>,
229}
230
231/// A mapping of property values to their names. A single instance of this map will only cover
232/// either long or short names, determined whilst loading data.
233///
234/// <div class="stab unstable">
235/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
236/// including in SemVer minor releases. While the serde representation of data structs is guaranteed
237/// to be stable, their Rust representation might not be. Use with caution.
238/// </div>
239#[derive(Debug, Clone, PartialEq)]
240#[icu_provider::data_struct]
241#[cfg_attr(
242    feature = "datagen", 
243    derive(serde::Serialize, databake::Bake),
244    databake(path = icu_properties::provider::names),
245)]
246#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
247#[yoke(prove_covariance_manually)]
248pub struct PropertyEnumToValueNameLinearMapV1<'data> {
249    /// A map from the value discriminant (the index) to the names, for mostly
250    /// contiguous data. Empty strings count as missing.
251    #[cfg_attr(feature = "serde", serde(borrow))]
252    pub map: VarZeroVec<'data, str>,
253}
254
255/// A mapping of property values to their names. A single instance of this map will only cover
256/// either long or short names, determined whilst loading data.
257///
258/// <div class="stab unstable">
259/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
260/// including in SemVer minor releases. While the serde representation of data structs is guaranteed
261/// to be stable, their Rust representation might not be. Use with caution.
262/// </div>
263#[derive(Debug, Clone, PartialEq)]
264#[icu_provider::data_struct]
265#[cfg_attr(
266    feature = "datagen", 
267    derive(serde::Serialize, databake::Bake),
268    databake(path = icu_properties::provider::names),
269)]
270#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
271#[yoke(prove_covariance_manually)]
272pub struct PropertyEnumToValueNameLinearTiny4MapV1<'data> {
273    /// A map from the value discriminant (the index) to the names, for mostly
274    /// contiguous data. Empty strings count as missing.
275    #[cfg_attr(feature = "serde", serde(borrow))]
276    pub map: ZeroVec<'data, TinyStr4>,
277}