icu_properties/code_point_map.rs
1// This file is part of ICU4X. For terms of use, please see the file
2// called LICENSE at the top level of the ICU4X source tree
3// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
5#[cfg(feature = "alloc")]
6use crate::code_point_set::CodePointSetData;
7use crate::props::GeneralCategory;
8use crate::props::GeneralCategoryGroup;
9use crate::provider::*;
10use core::ops::RangeInclusive;
11use icu_collections::codepointtrie::{CodePointMapRange, CodePointTrie, TrieValue};
12use icu_provider::marker::ErasedMarker;
13use icu_provider::prelude::*;
14
15/// A wrapper around code point map data.
16///
17/// It is returned by APIs that return Unicode
18/// property data in a map-like form, ex: enumerated property value data keyed
19/// by code point. Access its data via the borrowed version,
20/// [`CodePointMapDataBorrowed`].
21#[derive(Debug, Clone)]
22pub struct CodePointMapData<T: TrieValue> {
23 data: DataPayload<ErasedMarker<PropertyCodePointMap<'static, T>>>,
24}
25
26impl<T: TrieValue> CodePointMapData<T> {
27 /// Creates a new [`CodePointMapData`] for a [`EnumeratedProperty`].
28 ///
29 /// See the documentation on [`EnumeratedProperty`] implementations for details.
30 ///
31 /// β¨ *Enabled with the `compiled_data` Cargo feature.*
32 ///
33 /// [π Help choosing a constructor](icu_provider::constructors)
34 #[cfg(feature = "compiled_data")]
35 #[expect(clippy::new_ret_no_self)]
36 pub const fn new() -> CodePointMapDataBorrowed<'static, T>
37 where
38 T: EnumeratedProperty,
39 {
40 CodePointMapDataBorrowed::new()
41 }
42
43 #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new)]
44 pub fn try_new_unstable(
45 provider: &(impl DataProvider<T::DataMarker> + ?Sized),
46 ) -> Result<Self, DataError>
47 where
48 T: EnumeratedProperty,
49 {
50 Ok(Self {
51 data: provider.load(Default::default())?.payload.cast(),
52 })
53 }
54
55 /// Construct a borrowed version of this type that can be queried.
56 ///
57 /// This avoids a potential small underlying cost per API call (like `get()`) by consolidating it
58 /// up front.
59 ///
60 /// This owned version if returned by functions that use a runtime data provider.
61 #[inline]
62 pub fn as_borrowed(&self) -> CodePointMapDataBorrowed<'_, T> {
63 CodePointMapDataBorrowed {
64 map: self.data.get(),
65 }
66 }
67
68 /// Convert this map to a map around another type
69 ///
70 /// Typically useful for type-erasing maps into maps around integers.
71 ///
72 /// β¨ *Enabled with the `alloc` Cargo feature.*
73 ///
74 /// # Panics
75 /// Will panic if T and P are different sizes
76 ///
77 /// # Example
78 ///
79 /// ```
80 /// use icu::properties::CodePointMapData;
81 /// use icu::properties::props::GeneralCategory;
82 ///
83 /// let data = CodePointMapData::<GeneralCategory>::new().static_to_owned();
84 ///
85 /// let gc = data.try_into_converted::<u8>().unwrap();
86 /// let gc = gc.as_borrowed();
87 ///
88 /// assert_eq!(gc.get('ζ¨'), GeneralCategory::OtherLetter as u8); // U+6728
89 /// assert_eq!(gc.get('π'), GeneralCategory::OtherSymbol as u8); // U+1F383 JACK-O-LANTERN
90 /// ```
91 #[cfg(feature = "alloc")]
92 pub fn try_into_converted<P>(self) -> Result<CodePointMapData<P>, zerovec::ule::UleError>
93 where
94 P: TrieValue,
95 {
96 self.data
97 .try_map_project(|data, _| data.try_into_converted())
98 .map(CodePointMapData::from_data::<ErasedMarker<PropertyCodePointMap<'static, P>>>)
99 }
100
101 /// Construct a new one from loaded data
102 ///
103 /// Typically it is preferable to use getters like [`load_general_category()`] instead
104 pub(crate) fn from_data<M>(data: DataPayload<M>) -> Self
105 where
106 M: DynamicDataMarker<DataStruct = PropertyCodePointMap<'static, T>>,
107 {
108 Self { data: data.cast() }
109 }
110
111 /// Construct a new one an owned [`CodePointTrie`]
112 pub fn from_code_point_trie(trie: CodePointTrie<'static, T>) -> Self {
113 let set = PropertyCodePointMap::from_code_point_trie(trie);
114 CodePointMapData::from_data(
115 DataPayload::<ErasedMarker<PropertyCodePointMap<'static, T>>>::from_owned(set),
116 )
117 }
118
119 /// Convert this type to a [`CodePointTrie`] as a borrowed value.
120 ///
121 /// The data backing this is extensible and supports multiple implementations.
122 /// Currently it is always [`CodePointTrie`]; however in the future more backends may be
123 /// added, and users may select which at data generation time.
124 ///
125 /// This method returns an `Option` in order to return `None` when the backing data provider
126 /// cannot return a [`CodePointTrie`], or cannot do so within the expected constant time
127 /// constraint.
128 pub fn as_code_point_trie(&self) -> Option<&CodePointTrie<'_, T>> {
129 self.data.get().as_code_point_trie()
130 }
131
132 /// Convert this type to a [`CodePointTrie`], borrowing if possible,
133 /// otherwise allocating a new [`CodePointTrie`].
134 ///
135 /// The data backing this is extensible and supports multiple implementations.
136 /// Currently it is always [`CodePointTrie`]; however in the future more backends may be
137 /// added, and users may select which at data generation time.
138 ///
139 /// The performance of the conversion to this specific return type will vary
140 /// depending on the data structure that is backing `self`.
141 pub fn to_code_point_trie(&self) -> CodePointTrie<'_, T> {
142 self.data.get().to_code_point_trie()
143 }
144}
145
146/// A borrowed wrapper around code point set data, returned by
147/// [`CodePointSetData::as_borrowed()`]. More efficient to query.
148#[derive(Clone, Copy, Debug)]
149pub struct CodePointMapDataBorrowed<'a, T: TrieValue> {
150 map: &'a PropertyCodePointMap<'a, T>,
151}
152
153impl<'a, T: TrieValue> CodePointMapDataBorrowed<'a, T> {
154 /// Get the value this map has associated with code point `ch`
155 ///
156 /// # Example
157 ///
158 /// ```
159 /// use icu::properties::CodePointMapData;
160 /// use icu::properties::props::GeneralCategory;
161 ///
162 /// let gc = CodePointMapData::<GeneralCategory>::new();
163 ///
164 /// assert_eq!(gc.get('ζ¨'), GeneralCategory::OtherLetter); // U+6728
165 /// assert_eq!(gc.get('π'), GeneralCategory::OtherSymbol); // U+1F383 JACK-O-LANTERN
166 /// ```
167 #[inline]
168 pub fn get(self, ch: char) -> T {
169 self.map.get(ch)
170 }
171
172 /// See [`Self::get`].
173 #[inline]
174 pub fn get32(self, ch: u32) -> T {
175 self.map.get32(ch)
176 }
177
178 /// Get a [`CodePointSetData`] for all elements corresponding to a particular value
179 ///
180 /// β¨ *Enabled with the `alloc` Cargo feature.*
181 ///
182 /// # Example
183 ///
184 /// ```
185 /// use icu::properties::props::GeneralCategory;
186 /// use icu::properties::CodePointMapData;
187 ///
188 /// let gc = CodePointMapData::<GeneralCategory>::new();
189 ///
190 /// let other_letter_set_data =
191 /// gc.get_set_for_value(GeneralCategory::OtherLetter);
192 /// let other_letter_set = other_letter_set_data.as_borrowed();
193 ///
194 /// assert!(other_letter_set.contains('ζ¨')); // U+6728
195 /// assert!(!other_letter_set.contains('π')); // U+1F383 JACK-O-LANTERN
196 /// ```
197 #[cfg(feature = "alloc")]
198 pub fn get_set_for_value(self, value: T) -> CodePointSetData {
199 let set = self.map.get_set_for_value(value);
200 CodePointSetData::from_code_point_inversion_list(set)
201 }
202
203 /// Yields an [`Iterator`] returning ranges of consecutive code points that
204 /// share the same value in the [`CodePointMapData`].
205 ///
206 /// # Examples
207 ///
208 /// ```
209 /// use icu::properties::props::GeneralCategory;
210 /// use icu::properties::CodePointMapData;
211 ///
212 /// let gc = CodePointMapData::<GeneralCategory>::new();
213 /// let mut ranges = gc.iter_ranges();
214 /// let next = ranges.next().unwrap();
215 /// assert_eq!(next.range, 0..=31);
216 /// assert_eq!(next.value, GeneralCategory::Control);
217 /// let next = ranges.next().unwrap();
218 /// assert_eq!(next.range, 32..=32);
219 /// assert_eq!(next.value, GeneralCategory::SpaceSeparator);
220 /// ```
221 pub fn iter_ranges(self) -> impl Iterator<Item = CodePointMapRange<T>> + 'a {
222 self.map.iter_ranges()
223 }
224
225 /// Yields an [`Iterator`] returning ranges of consecutive code points that
226 /// share the same value `v` in the [`CodePointMapData`].
227 ///
228 /// # Examples
229 ///
230 ///
231 /// ```
232 /// use icu::properties::props::GeneralCategory;
233 /// use icu::properties::CodePointMapData;
234 ///
235 /// let gc = CodePointMapData::<GeneralCategory>::new();
236 /// let mut ranges = gc.iter_ranges_for_value(GeneralCategory::UppercaseLetter);
237 /// assert_eq!(ranges.next().unwrap(), 'A' as u32..='Z' as u32);
238 /// assert_eq!(ranges.next().unwrap(), 'Γ' as u32..='Γ' as u32);
239 /// assert_eq!(ranges.next().unwrap(), 'Γ' as u32..='Γ' as u32);
240 /// ```
241 pub fn iter_ranges_for_value(self, val: T) -> impl Iterator<Item = RangeInclusive<u32>> + 'a {
242 self.map
243 .iter_ranges()
244 .filter(move |r| r.value == val)
245 .map(|r| r.range)
246 }
247
248 /// Yields an [`Iterator`] returning ranges of consecutive code points that
249 /// do *not* have the value `v` in the [`CodePointMapData`].
250 pub fn iter_ranges_for_value_complemented(
251 self,
252 val: T,
253 ) -> impl Iterator<Item = RangeInclusive<u32>> + 'a {
254 self.map
255 .iter_ranges_mapped(move |value| value != val)
256 .filter(|v| v.value)
257 .map(|v| v.range)
258 }
259
260 /// Exposed for FFI needs, could be exposed in general in the future but we should
261 /// have a use case first.
262 ///
263 /// FFI needs this since it operates on erased maps and can't use `iter_ranges_for_group()`
264 #[doc(hidden)] // used by FFI code
265 pub fn iter_ranges_mapped<U: Eq + 'a>(
266 self,
267 predicate: impl FnMut(T) -> U + Copy + 'a,
268 ) -> impl Iterator<Item = CodePointMapRange<U>> + 'a {
269 self.map.iter_ranges_mapped(predicate)
270 }
271}
272
273impl CodePointMapDataBorrowed<'_, GeneralCategory> {
274 /// Get a [`CodePointSetData`] for all elements corresponding to a particular value group
275 ///
276 /// β¨ *Enabled with the `alloc` Cargo feature.*
277 ///
278 /// # Example
279 ///
280 /// ```
281 /// use icu::properties::props::{GeneralCategory, GeneralCategoryGroup};
282 /// use icu::properties::CodePointMapData;
283 ///
284 /// let gc = CodePointMapData::<GeneralCategory>::new();
285 ///
286 /// let other_letter_set_data =
287 /// gc.get_set_for_value_group(GeneralCategoryGroup::OtherLetter);
288 /// let other_letter_set = other_letter_set_data.as_borrowed();
289 ///
290 /// assert!(other_letter_set.contains('ζ¨')); // U+6728
291 /// assert!(!other_letter_set.contains('π')); // U+1F383 JACK-O-LANTERN
292 /// ```
293 #[cfg(feature = "alloc")]
294 pub fn get_set_for_value_group(self, value: GeneralCategoryGroup) -> crate::CodePointSetData {
295 let matching_gc_ranges = self
296 .iter_ranges()
297 .filter(|cpm_range| (1 << cpm_range.value as u32) & value.0 != 0)
298 .map(|cpm_range| cpm_range.range);
299 CodePointSetData::from_code_point_inversion_list(matching_gc_ranges.collect())
300 }
301}
302
#[cfg(feature = "compiled_data")]
impl<T: EnumeratedProperty> Default for CodePointMapDataBorrowed<'static, T> {
    /// Equivalent to [`CodePointMapDataBorrowed::new()`].
    fn default() -> Self {
        CodePointMapDataBorrowed::new()
    }
}
309
310impl<T: TrieValue> CodePointMapDataBorrowed<'static, T> {
311 /// Creates a new [`CodePointMapDataBorrowed`] for a [`EnumeratedProperty`].
312 ///
313 /// See the documentation on [`EnumeratedProperty`] implementations for details.
314 ///
315 /// β¨ *Enabled with the `compiled_data` Cargo feature.*
316 ///
317 /// [π Help choosing a constructor](icu_provider::constructors)
318 #[cfg(feature = "compiled_data")]
319 pub const fn new() -> Self
320 where
321 T: EnumeratedProperty,
322 {
323 CodePointMapDataBorrowed { map: T::SINGLETON }
324 }
325
326 /// Cheaply converts a [`CodePointMapDataBorrowed<'static>`] into a [`CodePointMapData`].
327 ///
328 /// Note: Due to branching and indirection, using [`CodePointMapData`] might inhibit some
329 /// compile-time optimizations that are possible with [`CodePointMapDataBorrowed`].
330 pub const fn static_to_owned(self) -> CodePointMapData<T> {
331 CodePointMapData {
332 data: DataPayload::from_static_ref(self.map),
333 }
334 }
335}
336
337impl<'a> CodePointMapDataBorrowed<'a, GeneralCategory> {
338 /// Yields an [`Iterator`] returning ranges of consecutive code points that
339 /// have a `General_Category` value belonging to the specified [`GeneralCategoryGroup`]
340 ///
341 /// # Examples
342 ///
343 /// ```
344 /// use icu::properties::props::{GeneralCategory, GeneralCategoryGroup};
345 /// use icu::properties::CodePointMapData;
346 ///
347 /// let gc = CodePointMapData::<GeneralCategory>::new();
348 /// let mut ranges = gc.iter_ranges_for_group(GeneralCategoryGroup::Letter);
349 /// assert_eq!(ranges.next().unwrap(), 'A' as u32..='Z' as u32);
350 /// assert_eq!(ranges.next().unwrap(), 'a' as u32..='z' as u32);
351 /// assert_eq!(ranges.next().unwrap(), 'Βͺ' as u32..='Βͺ' as u32);
352 /// assert_eq!(ranges.next().unwrap(), 'Β΅' as u32..='Β΅' as u32);
353 /// assert_eq!(ranges.next().unwrap(), 'ΒΊ' as u32..='ΒΊ' as u32);
354 /// assert_eq!(ranges.next().unwrap(), 'Γ' as u32..='Γ' as u32);
355 /// assert_eq!(ranges.next().unwrap(), 'Γ' as u32..='ΓΆ' as u32);
356 /// ```
357 pub fn iter_ranges_for_group(
358 self,
359 group: GeneralCategoryGroup,
360 ) -> impl Iterator<Item = RangeInclusive<u32>> + 'a {
361 self.map
362 .iter_ranges_mapped(move |value| group.contains(value))
363 .filter(|v| v.value)
364 .map(|v| v.range)
365 }
366}
367
368/// A Unicode character property that assigns a value to each code point.
369///
370/// The descriptions of most properties are taken from [`TR44`], the documentation for the
371/// Unicode Character Database.
372///
373/// <div class="stab unstable">
374/// π« This trait is sealed; it cannot be implemented by user code. If an API requests an item that implements this
375/// trait, please consider using a type from the implementors listed below.
376/// </div>
377///
378/// [`TR44`]: https://www.unicode.org/reports/tr44
379pub trait EnumeratedProperty: crate::private::Sealed + TrieValue {
380 #[doc(hidden)]
381 type DataMarker: DataMarker<DataStruct = PropertyCodePointMap<'static, Self>>;
382 #[doc(hidden)]
383 #[cfg(feature = "compiled_data")]
384 const SINGLETON: &'static PropertyCodePointMap<'static, Self>;
385 /// The name of this property
386 const NAME: &'static [u8];
387 /// The abbreviated name of this property, if it exists, otherwise the name
388 const SHORT_NAME: &'static [u8];
389
390 /// Convenience method for `CodePointMapData::new().get(ch)`
391 ///
392 /// β¨ *Enabled with the `compiled_data` Cargo feature.*
393 #[cfg(feature = "compiled_data")]
394 fn for_char(ch: char) -> Self {
395 CodePointMapData::new().get(ch)
396 }
397}