icu_properties/code_point_map.rs
1// This file is part of ICU4X. For terms of use, please see the file
2// called LICENSE at the top level of the ICU4X source tree
3// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
5#[cfg(feature = "alloc")]
6use crate::code_point_set::CodePointSetData;
7use crate::props::GeneralCategory;
8use crate::props::GeneralCategoryGroup;
9use crate::provider::*;
10use core::ops::RangeInclusive;
11use icu_collections::codepointtrie::{CodePointMapRange, CodePointTrie, TrieValue};
12use icu_provider::marker::ErasedMarker;
13use icu_provider::prelude::*;
14
15/// A wrapper around code point map data.
16///
17/// It is returned by APIs that return Unicode
18/// property data in a map-like form, ex: enumerated property value data keyed
19/// by code point. Access its data via the borrowed version,
20/// [`CodePointMapDataBorrowed`].
21#[derive(Debug, Clone)]
22pub struct CodePointMapData<T: TrieValue> {
23 data: DataPayload<ErasedMarker<PropertyCodePointMap<'static, T>>>,
24}
25
26impl<T: TrieValue> CodePointMapData<T> {
27 /// Creates a new [`CodePointMapData`] for a [`EnumeratedProperty`].
28 ///
29 /// See the documentation on [`EnumeratedProperty`] implementations for details.
30 ///
31 /// β¨ *Enabled with the `compiled_data` Cargo feature.*
32 ///
33 /// [π Help choosing a constructor](icu_provider::constructors)
34 #[cfg(feature = "compiled_data")]
35 #[expect(clippy::new_ret_no_self)]
36 pub const fn new() -> CodePointMapDataBorrowed<'static, T>
37 where
38 T: EnumeratedProperty,
39 {
40 CodePointMapDataBorrowed::new()
41 }
42
43 #[cfg(feature = "serde")]
44 #[doc = icu_provider::gen_buffer_unstable_docs!(BUFFER, Self::new)]
45 pub fn try_new_with_buffer_provider(
46 provider: &(impl BufferProvider + ?Sized),
47 ) -> Result<Self, DataError>
48 where
49 T: EnumeratedProperty + for<'a> serde::Deserialize<'a>,
50 {
51 use icu_provider::buf::AsDeserializingBufferProvider;
52 Self::try_new_unstable(&provider.as_deserializing())
53 }
54
55 #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new)]
56 pub fn try_new_unstable(
57 provider: &(impl DataProvider<T::DataMarker> + ?Sized),
58 ) -> Result<Self, DataError>
59 where
60 T: EnumeratedProperty,
61 {
62 Ok(Self {
63 data: provider.load(Default::default())?.payload.cast(),
64 })
65 }
66
67 /// Construct a borrowed version of this type that can be queried.
68 ///
69 /// This avoids a potential small underlying cost per API call (like `get()`) by consolidating it
70 /// up front.
71 ///
72 /// This owned version if returned by functions that use a runtime data provider.
73 #[inline]
74 pub fn as_borrowed(&self) -> CodePointMapDataBorrowed<'_, T> {
75 CodePointMapDataBorrowed {
76 map: self.data.get(),
77 }
78 }
79
80 /// Convert this map to a map around another type
81 ///
82 /// Typically useful for type-erasing maps into maps around integers.
83 ///
84 /// β¨ *Enabled with the `alloc` Cargo feature.*
85 ///
86 /// # Panics
87 /// Will panic if T and P are different sizes
88 ///
89 /// # Example
90 ///
91 /// ```
92 /// use icu::properties::CodePointMapData;
93 /// use icu::properties::props::GeneralCategory;
94 ///
95 /// let data = CodePointMapData::<GeneralCategory>::new().static_to_owned();
96 ///
97 /// let gc = data.try_into_converted::<u8>().unwrap();
98 /// let gc = gc.as_borrowed();
99 ///
100 /// assert_eq!(gc.get('ζ¨'), GeneralCategory::OtherLetter as u8); // U+6728
101 /// assert_eq!(gc.get('π'), GeneralCategory::OtherSymbol as u8); // U+1F383 JACK-O-LANTERN
102 /// ```
103 #[cfg(feature = "alloc")]
104 pub fn try_into_converted<P>(self) -> Result<CodePointMapData<P>, zerovec::ule::UleError>
105 where
106 P: TrieValue,
107 {
108 self.data
109 .try_map_project(|data, _| data.try_into_converted())
110 .map(CodePointMapData::from_data::<ErasedMarker<PropertyCodePointMap<'static, P>>>)
111 }
112
113 /// Construct a new one from loaded data
114 ///
115 /// Typically it is preferable to use getters like [`load_general_category()`] instead
116 pub(crate) fn from_data<M>(data: DataPayload<M>) -> Self
117 where
118 M: DynamicDataMarker<DataStruct = PropertyCodePointMap<'static, T>>,
119 {
120 Self { data: data.cast() }
121 }
122
123 /// Construct a new one an owned [`CodePointTrie`]
124 pub fn from_code_point_trie(trie: CodePointTrie<'static, T>) -> Self {
125 let set = PropertyCodePointMap::from_code_point_trie(trie);
126 CodePointMapData::from_data(
127 DataPayload::<ErasedMarker<PropertyCodePointMap<'static, T>>>::from_owned(set),
128 )
129 }
130
131 /// Convert this type to a [`CodePointTrie`] as a borrowed value.
132 ///
133 /// The data backing this is extensible and supports multiple implementations.
134 /// Currently it is always [`CodePointTrie`]; however in the future more backends may be
135 /// added, and users may select which at data generation time.
136 ///
137 /// This method returns an `Option` in order to return `None` when the backing data provider
138 /// cannot return a [`CodePointTrie`], or cannot do so within the expected constant time
139 /// constraint.
140 pub fn as_code_point_trie(&self) -> Option<&CodePointTrie<'_, T>> {
141 self.data.get().as_code_point_trie()
142 }
143
144 /// Convert this type to a [`CodePointTrie`], borrowing if possible,
145 /// otherwise allocating a new [`CodePointTrie`].
146 ///
147 /// The data backing this is extensible and supports multiple implementations.
148 /// Currently it is always [`CodePointTrie`]; however in the future more backends may be
149 /// added, and users may select which at data generation time.
150 ///
151 /// The performance of the conversion to this specific return type will vary
152 /// depending on the data structure that is backing `self`.
153 pub fn to_code_point_trie(&self) -> CodePointTrie<'_, T> {
154 self.data.get().to_code_point_trie()
155 }
156}
157
158/// A borrowed wrapper around code point set data, returned by
159/// [`CodePointSetData::as_borrowed()`]. More efficient to query.
160#[derive(Clone, Copy, Debug)]
161pub struct CodePointMapDataBorrowed<'a, T: TrieValue> {
162 map: &'a PropertyCodePointMap<'a, T>,
163}
164
165impl<'a, T: TrieValue> CodePointMapDataBorrowed<'a, T> {
166 /// Get the value this map has associated with code point `ch`
167 ///
168 /// # Example
169 ///
170 /// ```
171 /// use icu::properties::CodePointMapData;
172 /// use icu::properties::props::GeneralCategory;
173 ///
174 /// let gc = CodePointMapData::<GeneralCategory>::new();
175 ///
176 /// assert_eq!(gc.get('ζ¨'), GeneralCategory::OtherLetter); // U+6728
177 /// assert_eq!(gc.get('π'), GeneralCategory::OtherSymbol); // U+1F383 JACK-O-LANTERN
178 /// ```
179 #[inline]
180 pub fn get(self, ch: char) -> T {
181 self.map.get(ch)
182 }
183
184 /// See [`Self::get`].
185 #[inline]
186 pub fn get32(self, ch: u32) -> T {
187 self.map.get32(ch)
188 }
189
190 /// Get a [`CodePointSetData`] for all elements corresponding to a particular value
191 ///
192 /// β¨ *Enabled with the `alloc` Cargo feature.*
193 ///
194 /// # Example
195 ///
196 /// ```
197 /// use icu::properties::props::GeneralCategory;
198 /// use icu::properties::CodePointMapData;
199 ///
200 /// let gc = CodePointMapData::<GeneralCategory>::new();
201 ///
202 /// let other_letter_set_data =
203 /// gc.get_set_for_value(GeneralCategory::OtherLetter);
204 /// let other_letter_set = other_letter_set_data.as_borrowed();
205 ///
206 /// assert!(other_letter_set.contains('ζ¨')); // U+6728
207 /// assert!(!other_letter_set.contains('π')); // U+1F383 JACK-O-LANTERN
208 /// ```
209 #[cfg(feature = "alloc")]
210 pub fn get_set_for_value(self, value: T) -> CodePointSetData {
211 let set = self.map.get_set_for_value(value);
212 CodePointSetData::from_code_point_inversion_list(set)
213 }
214
215 /// Yields an [`Iterator`] returning ranges of consecutive code points that
216 /// share the same value in the [`CodePointMapData`].
217 ///
218 /// # Examples
219 ///
220 /// ```
221 /// use icu::properties::props::GeneralCategory;
222 /// use icu::properties::CodePointMapData;
223 ///
224 /// let gc = CodePointMapData::<GeneralCategory>::new();
225 /// let mut ranges = gc.iter_ranges();
226 /// let next = ranges.next().unwrap();
227 /// assert_eq!(next.range, 0..=31);
228 /// assert_eq!(next.value, GeneralCategory::Control);
229 /// let next = ranges.next().unwrap();
230 /// assert_eq!(next.range, 32..=32);
231 /// assert_eq!(next.value, GeneralCategory::SpaceSeparator);
232 /// ```
233 pub fn iter_ranges(self) -> impl Iterator<Item = CodePointMapRange<T>> + 'a {
234 self.map.iter_ranges()
235 }
236
237 /// Yields an [`Iterator`] returning ranges of consecutive code points that
238 /// share the same value `v` in the [`CodePointMapData`].
239 ///
240 /// # Examples
241 ///
242 ///
243 /// ```
244 /// use icu::properties::props::GeneralCategory;
245 /// use icu::properties::CodePointMapData;
246 ///
247 /// let gc = CodePointMapData::<GeneralCategory>::new();
248 /// let mut ranges = gc.iter_ranges_for_value(GeneralCategory::UppercaseLetter);
249 /// assert_eq!(ranges.next().unwrap(), 'A' as u32..='Z' as u32);
250 /// assert_eq!(ranges.next().unwrap(), 'Γ' as u32..='Γ' as u32);
251 /// assert_eq!(ranges.next().unwrap(), 'Γ' as u32..='Γ' as u32);
252 /// ```
253 pub fn iter_ranges_for_value(self, val: T) -> impl Iterator<Item = RangeInclusive<u32>> + 'a {
254 self.map
255 .iter_ranges()
256 .filter(move |r| r.value == val)
257 .map(|r| r.range)
258 }
259
260 /// Yields an [`Iterator`] returning ranges of consecutive code points that
261 /// do *not* have the value `v` in the [`CodePointMapData`].
262 pub fn iter_ranges_for_value_complemented(
263 self,
264 val: T,
265 ) -> impl Iterator<Item = RangeInclusive<u32>> + 'a {
266 self.map
267 .iter_ranges_mapped(move |value| value != val)
268 .filter(|v| v.value)
269 .map(|v| v.range)
270 }
271
272 /// Exposed for FFI needs, could be exposed in general in the future but we should
273 /// have a use case first.
274 ///
275 /// FFI needs this since it operates on erased maps and can't use `iter_ranges_for_group()`
276 #[doc(hidden)] // used by FFI code
277 pub fn iter_ranges_mapped<U: Eq + 'a>(
278 self,
279 predicate: impl FnMut(T) -> U + Copy + 'a,
280 ) -> impl Iterator<Item = CodePointMapRange<U>> + 'a {
281 self.map.iter_ranges_mapped(predicate)
282 }
283}
284
285impl CodePointMapDataBorrowed<'_, GeneralCategory> {
286 /// Get a [`CodePointSetData`] for all elements corresponding to a particular value group
287 ///
288 /// β¨ *Enabled with the `alloc` Cargo feature.*
289 ///
290 /// # Example
291 ///
292 /// ```
293 /// use icu::properties::props::{GeneralCategory, GeneralCategoryGroup};
294 /// use icu::properties::CodePointMapData;
295 ///
296 /// let gc = CodePointMapData::<GeneralCategory>::new();
297 ///
298 /// let other_letter_set_data =
299 /// gc.get_set_for_value_group(GeneralCategoryGroup::OtherLetter);
300 /// let other_letter_set = other_letter_set_data.as_borrowed();
301 ///
302 /// assert!(other_letter_set.contains('ζ¨')); // U+6728
303 /// assert!(!other_letter_set.contains('π')); // U+1F383 JACK-O-LANTERN
304 /// ```
305 #[cfg(feature = "alloc")]
306 pub fn get_set_for_value_group(self, value: GeneralCategoryGroup) -> CodePointSetData {
307 let matching_gc_ranges = self
308 .iter_ranges()
309 .filter(|cpm_range| (1 << cpm_range.value as u32) & value.0 != 0)
310 .map(|cpm_range| cpm_range.range);
311 CodePointSetData::from_code_point_inversion_list(matching_gc_ranges.collect())
312 }
313}
314
/// The default value is the compiled-data map, i.e. the result of
/// [`CodePointMapDataBorrowed::new()`].
#[cfg(feature = "compiled_data")]
impl<T> Default for CodePointMapDataBorrowed<'static, T>
where
    T: EnumeratedProperty,
{
    fn default() -> Self {
        CodePointMapDataBorrowed::new()
    }
}
321
322impl<T: TrieValue> CodePointMapDataBorrowed<'static, T> {
323 /// Creates a new [`CodePointMapDataBorrowed`] for a [`EnumeratedProperty`].
324 ///
325 /// See the documentation on [`EnumeratedProperty`] implementations for details.
326 ///
327 /// β¨ *Enabled with the `compiled_data` Cargo feature.*
328 ///
329 /// [π Help choosing a constructor](icu_provider::constructors)
330 #[cfg(feature = "compiled_data")]
331 pub const fn new() -> Self
332 where
333 T: EnumeratedProperty,
334 {
335 CodePointMapDataBorrowed { map: T::SINGLETON }
336 }
337
338 /// Cheaply converts a [`CodePointMapDataBorrowed<'static>`] into a [`CodePointMapData`].
339 ///
340 /// Note: Due to branching and indirection, using [`CodePointMapData`] might inhibit some
341 /// compile-time optimizations that are possible with [`CodePointMapDataBorrowed`].
342 pub const fn static_to_owned(self) -> CodePointMapData<T> {
343 CodePointMapData {
344 data: DataPayload::from_static_ref(self.map),
345 }
346 }
347}
348
349impl<'a> CodePointMapDataBorrowed<'a, GeneralCategory> {
350 /// Yields an [`Iterator`] returning ranges of consecutive code points that
351 /// have a `General_Category` value belonging to the specified [`GeneralCategoryGroup`]
352 ///
353 /// # Examples
354 ///
355 /// ```
356 /// use icu::properties::props::{GeneralCategory, GeneralCategoryGroup};
357 /// use icu::properties::CodePointMapData;
358 ///
359 /// let gc = CodePointMapData::<GeneralCategory>::new();
360 /// let mut ranges = gc.iter_ranges_for_group(GeneralCategoryGroup::Letter);
361 /// assert_eq!(ranges.next().unwrap(), 'A' as u32..='Z' as u32);
362 /// assert_eq!(ranges.next().unwrap(), 'a' as u32..='z' as u32);
363 /// assert_eq!(ranges.next().unwrap(), 'Βͺ' as u32..='Βͺ' as u32);
364 /// assert_eq!(ranges.next().unwrap(), 'Β΅' as u32..='Β΅' as u32);
365 /// assert_eq!(ranges.next().unwrap(), 'ΒΊ' as u32..='ΒΊ' as u32);
366 /// assert_eq!(ranges.next().unwrap(), 'Γ' as u32..='Γ' as u32);
367 /// assert_eq!(ranges.next().unwrap(), 'Γ' as u32..='ΓΆ' as u32);
368 /// ```
369 pub fn iter_ranges_for_group(
370 self,
371 group: GeneralCategoryGroup,
372 ) -> impl Iterator<Item = RangeInclusive<u32>> + 'a {
373 self.map
374 .iter_ranges_mapped(move |value| group.contains(value))
375 .filter(|v| v.value)
376 .map(|v| v.range)
377 }
378}
379
380/// A Unicode character property that assigns a value to each code point.
381///
382/// The descriptions of most properties are taken from [`TR44`], the documentation for the
383/// Unicode Character Database.
384///
385/// <div class="stab unstable">
386/// π« This trait is sealed; it cannot be implemented by user code. If an API requests an item that implements this
387/// trait, please consider using a type from the implementors listed below.
388/// </div>
389///
390/// [`TR44`]: https://www.unicode.org/reports/tr44
391pub trait EnumeratedProperty: crate::private::Sealed + TrieValue {
392 #[doc(hidden)]
393 type DataMarker: DataMarker<DataStruct = PropertyCodePointMap<'static, Self>>;
394 #[doc(hidden)]
395 #[cfg(feature = "compiled_data")]
396 const SINGLETON: &'static PropertyCodePointMap<'static, Self>;
397 /// The name of this property
398 const NAME: &'static [u8];
399 /// The abbreviated name of this property, if it exists, otherwise the name
400 const SHORT_NAME: &'static [u8];
401
402 /// Convenience method for `CodePointMapData::new().get(ch)`
403 ///
404 /// β¨ *Enabled with the `compiled_data` Cargo feature.*
405 #[cfg(feature = "compiled_data")]
406 fn for_char(ch: char) -> Self {
407 CodePointMapData::new().get(ch)
408 }
409}