icu_properties/
code_point_map.rs

1// This file is part of ICU4X. For terms of use, please see the file
2// called LICENSE at the top level of the ICU4X source tree
3// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
5#[cfg(feature = "alloc")]
6use crate::code_point_set::CodePointSetData;
7use crate::props::GeneralCategory;
8use crate::props::GeneralCategoryGroup;
9use crate::provider::*;
10use core::ops::RangeInclusive;
11use icu_collections::codepointtrie::{CodePointMapRange, CodePointTrie, TrieValue};
12use icu_provider::marker::ErasedMarker;
13use icu_provider::prelude::*;
14
15/// A wrapper around code point map data.
16///
17/// It is returned by APIs that return Unicode
18/// property data in a map-like form, ex: enumerated property value data keyed
19/// by code point. Access its data via the borrowed version,
20/// [`CodePointMapDataBorrowed`].
21#[derive(#[automatically_derived]
impl<T: ::core::fmt::Debug + TrieValue> ::core::fmt::Debug for
    CodePointMapData<T> {
    #[inline]
    fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result {
        ::core::fmt::Formatter::debug_struct_field1_finish(f,
            "CodePointMapData", "data", &&self.data)
    }
}Debug, #[automatically_derived]
impl<T: ::core::clone::Clone + TrieValue> ::core::clone::Clone for
    CodePointMapData<T> {
    #[inline]
    fn clone(&self) -> CodePointMapData<T> {
        CodePointMapData { data: ::core::clone::Clone::clone(&self.data) }
    }
}Clone)]
22pub struct CodePointMapData<T: TrieValue> {
23    data: DataPayload<ErasedMarker<PropertyCodePointMap<'static, T>>>,
24}
25
26impl<T: TrieValue> CodePointMapData<T> {
27    /// Creates a new [`CodePointMapData`] for a [`EnumeratedProperty`].
28    ///
29    /// See the documentation on [`EnumeratedProperty`] implementations for details.
30    ///
31    /// ✨ *Enabled with the `compiled_data` Cargo feature.*
32    ///
33    /// [πŸ“š Help choosing a constructor](icu_provider::constructors)
34    #[cfg(feature = "compiled_data")]
35    #[expect(clippy::new_ret_no_self)]
36    pub const fn new() -> CodePointMapDataBorrowed<'static, T>
37    where
38        T: EnumeratedProperty,
39    {
40        CodePointMapDataBorrowed::new()
41    }
42
43    #[doc = "A version of [`Self::new`] that uses custom data provided by a [`DataProvider`].\n\n[\u{1f4da} Help choosing a constructor](icu_provider::constructors)\n\n<div class=\"stab unstable\">\u{26a0}\u{fe0f} The bounds on <tt>provider</tt> may change over time, including in SemVer minor releases.</div>"icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new)]
44    pub fn try_new_unstable(
45        provider: &(impl DataProvider<T::DataMarker> + ?Sized),
46    ) -> Result<Self, DataError>
47    where
48        T: EnumeratedProperty,
49    {
50        Ok(Self {
51            data: provider.load(Default::default())?.payload.cast(),
52        })
53    }
54
55    /// Construct a borrowed version of this type that can be queried.
56    ///
57    /// This avoids a potential small underlying cost per API call (like `get()`) by consolidating it
58    /// up front.
59    ///
60    /// This owned version if returned by functions that use a runtime data provider.
61    #[inline]
62    pub fn as_borrowed(&self) -> CodePointMapDataBorrowed<'_, T> {
63        CodePointMapDataBorrowed {
64            map: self.data.get(),
65        }
66    }
67
68    /// Convert this map to a map around another type
69    ///
70    /// Typically useful for type-erasing maps into maps around integers.
71    ///
72    /// ✨ *Enabled with the `alloc` Cargo feature.*
73    ///
74    /// # Panics
75    /// Will panic if T and P are different sizes
76    ///
77    /// # Example
78    ///
79    /// ```
80    /// use icu::properties::CodePointMapData;
81    /// use icu::properties::props::GeneralCategory;
82    ///
83    /// let data = CodePointMapData::<GeneralCategory>::new().static_to_owned();
84    ///
85    /// let gc = data.try_into_converted::<u8>().unwrap();
86    /// let gc = gc.as_borrowed();
87    ///
88    /// assert_eq!(gc.get('木'), GeneralCategory::OtherLetter as u8);  // U+6728
89    /// assert_eq!(gc.get('πŸŽƒ'), GeneralCategory::OtherSymbol as u8);  // U+1F383 JACK-O-LANTERN
90    /// ```
91    #[cfg(feature = "alloc")]
92    pub fn try_into_converted<P>(self) -> Result<CodePointMapData<P>, zerovec::ule::UleError>
93    where
94        P: TrieValue,
95    {
96        self.data
97            .try_map_project(|data, _| data.try_into_converted())
98            .map(CodePointMapData::from_data::<ErasedMarker<PropertyCodePointMap<'static, P>>>)
99    }
100
101    /// Construct a new one from loaded data
102    ///
103    /// Typically it is preferable to use getters like [`load_general_category()`] instead
104    pub(crate) fn from_data<M>(data: DataPayload<M>) -> Self
105    where
106        M: DynamicDataMarker<DataStruct = PropertyCodePointMap<'static, T>>,
107    {
108        Self { data: data.cast() }
109    }
110
111    /// Construct a new one an owned [`CodePointTrie`]
112    pub fn from_code_point_trie(trie: CodePointTrie<'static, T>) -> Self {
113        let set = PropertyCodePointMap::from_code_point_trie(trie);
114        CodePointMapData::from_data(
115            DataPayload::<ErasedMarker<PropertyCodePointMap<'static, T>>>::from_owned(set),
116        )
117    }
118
119    /// Convert this type to a [`CodePointTrie`] as a borrowed value.
120    ///
121    /// The data backing this is extensible and supports multiple implementations.
122    /// Currently it is always [`CodePointTrie`]; however in the future more backends may be
123    /// added, and users may select which at data generation time.
124    ///
125    /// This method returns an `Option` in order to return `None` when the backing data provider
126    /// cannot return a [`CodePointTrie`], or cannot do so within the expected constant time
127    /// constraint.
128    pub fn as_code_point_trie(&self) -> Option<&CodePointTrie<'_, T>> {
129        self.data.get().as_code_point_trie()
130    }
131
132    /// Convert this type to a [`CodePointTrie`], borrowing if possible,
133    /// otherwise allocating a new [`CodePointTrie`].
134    ///
135    /// The data backing this is extensible and supports multiple implementations.
136    /// Currently it is always [`CodePointTrie`]; however in the future more backends may be
137    /// added, and users may select which at data generation time.
138    ///
139    /// The performance of the conversion to this specific return type will vary
140    /// depending on the data structure that is backing `self`.
141    pub fn to_code_point_trie(&self) -> CodePointTrie<'_, T> {
142        self.data.get().to_code_point_trie()
143    }
144}
145
146/// A borrowed wrapper around code point set data, returned by
147/// [`CodePointSetData::as_borrowed()`]. More efficient to query.
148#[derive(#[automatically_derived]
impl<'a, T: ::core::clone::Clone + TrieValue> ::core::clone::Clone for
    CodePointMapDataBorrowed<'a, T> {
    #[inline]
    fn clone(&self) -> CodePointMapDataBorrowed<'a, T> {
        CodePointMapDataBorrowed {
            map: ::core::clone::Clone::clone(&self.map),
        }
    }
}Clone, #[automatically_derived]
impl<'a, T: ::core::marker::Copy + TrieValue> ::core::marker::Copy for
    CodePointMapDataBorrowed<'a, T> {
}Copy, #[automatically_derived]
impl<'a, T: ::core::fmt::Debug + TrieValue> ::core::fmt::Debug for
    CodePointMapDataBorrowed<'a, T> {
    #[inline]
    fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result {
        ::core::fmt::Formatter::debug_struct_field1_finish(f,
            "CodePointMapDataBorrowed", "map", &&self.map)
    }
}Debug)]
149pub struct CodePointMapDataBorrowed<'a, T: TrieValue> {
150    map: &'a PropertyCodePointMap<'a, T>,
151}
152
153impl<'a, T: TrieValue> CodePointMapDataBorrowed<'a, T> {
154    /// Get the value this map has associated with code point `ch`
155    ///
156    /// # Example
157    ///
158    /// ```
159    /// use icu::properties::CodePointMapData;
160    /// use icu::properties::props::GeneralCategory;
161    ///
162    /// let gc = CodePointMapData::<GeneralCategory>::new();
163    ///
164    /// assert_eq!(gc.get('木'), GeneralCategory::OtherLetter);  // U+6728
165    /// assert_eq!(gc.get('πŸŽƒ'), GeneralCategory::OtherSymbol);  // U+1F383 JACK-O-LANTERN
166    /// ```
167    #[inline]
168    pub fn get(self, ch: char) -> T {
169        self.map.get(ch)
170    }
171
172    /// See [`Self::get`].
173    #[inline]
174    pub fn get32(self, ch: u32) -> T {
175        self.map.get32(ch)
176    }
177
178    /// Get a [`CodePointSetData`] for all elements corresponding to a particular value
179    ///
180    /// ✨ *Enabled with the `alloc` Cargo feature.*
181    ///
182    /// # Example
183    ///
184    /// ```
185    /// use icu::properties::props::GeneralCategory;
186    /// use icu::properties::CodePointMapData;
187    ///
188    /// let gc = CodePointMapData::<GeneralCategory>::new();
189    ///
190    /// let other_letter_set_data =
191    ///     gc.get_set_for_value(GeneralCategory::OtherLetter);
192    /// let other_letter_set = other_letter_set_data.as_borrowed();
193    ///
194    /// assert!(other_letter_set.contains('木')); // U+6728
195    /// assert!(!other_letter_set.contains('πŸŽƒ')); // U+1F383 JACK-O-LANTERN
196    /// ```
197    #[cfg(feature = "alloc")]
198    pub fn get_set_for_value(self, value: T) -> CodePointSetData {
199        let set = self.map.get_set_for_value(value);
200        CodePointSetData::from_code_point_inversion_list(set)
201    }
202
203    /// Yields an [`Iterator`] returning ranges of consecutive code points that
204    /// share the same value in the [`CodePointMapData`].
205    ///
206    /// # Examples
207    ///
208    /// ```
209    /// use icu::properties::props::GeneralCategory;
210    /// use icu::properties::CodePointMapData;
211    ///
212    /// let gc = CodePointMapData::<GeneralCategory>::new();
213    /// let mut ranges = gc.iter_ranges();
214    /// let next = ranges.next().unwrap();
215    /// assert_eq!(next.range, 0..=31);
216    /// assert_eq!(next.value, GeneralCategory::Control);
217    /// let next = ranges.next().unwrap();
218    /// assert_eq!(next.range, 32..=32);
219    /// assert_eq!(next.value, GeneralCategory::SpaceSeparator);
220    /// ```
221    pub fn iter_ranges(self) -> impl Iterator<Item = CodePointMapRange<T>> + 'a {
222        self.map.iter_ranges()
223    }
224
225    /// Yields an [`Iterator`] returning ranges of consecutive code points that
226    /// share the same value `v` in the [`CodePointMapData`].
227    ///
228    /// # Examples
229    ///
230    ///
231    /// ```
232    /// use icu::properties::props::GeneralCategory;
233    /// use icu::properties::CodePointMapData;
234    ///
235    /// let gc = CodePointMapData::<GeneralCategory>::new();
236    /// let mut ranges = gc.iter_ranges_for_value(GeneralCategory::UppercaseLetter);
237    /// assert_eq!(ranges.next().unwrap(), 'A' as u32..='Z' as u32);
238    /// assert_eq!(ranges.next().unwrap(), 'Γ€' as u32..='Γ–' as u32);
239    /// assert_eq!(ranges.next().unwrap(), 'Ø' as u32..='Þ' as u32);
240    /// ```
241    pub fn iter_ranges_for_value(self, val: T) -> impl Iterator<Item = RangeInclusive<u32>> + 'a {
242        self.map
243            .iter_ranges()
244            .filter(move |r| r.value == val)
245            .map(|r| r.range)
246    }
247
248    /// Yields an [`Iterator`] returning ranges of consecutive code points that
249    /// do *not* have the value `v` in the [`CodePointMapData`].
250    pub fn iter_ranges_for_value_complemented(
251        self,
252        val: T,
253    ) -> impl Iterator<Item = RangeInclusive<u32>> + 'a {
254        self.map
255            .iter_ranges_mapped(move |value| value != val)
256            .filter(|v| v.value)
257            .map(|v| v.range)
258    }
259
260    /// Exposed for FFI needs, could be exposed in general in the future but we should
261    /// have a use case first.
262    ///
263    /// FFI needs this since it operates on erased maps and can't use `iter_ranges_for_group()`
264    #[doc(hidden)] // used by FFI code
265    pub fn iter_ranges_mapped<U: Eq + 'a>(
266        self,
267        predicate: impl FnMut(T) -> U + Copy + 'a,
268    ) -> impl Iterator<Item = CodePointMapRange<U>> + 'a {
269        self.map.iter_ranges_mapped(predicate)
270    }
271}
272
273impl CodePointMapDataBorrowed<'_, GeneralCategory> {
274    /// Get a [`CodePointSetData`] for all elements corresponding to a particular value group
275    ///
276    /// ✨ *Enabled with the `alloc` Cargo feature.*
277    ///
278    /// # Example
279    ///
280    /// ```
281    /// use icu::properties::props::{GeneralCategory, GeneralCategoryGroup};
282    /// use icu::properties::CodePointMapData;
283    ///
284    /// let gc = CodePointMapData::<GeneralCategory>::new();
285    ///
286    /// let other_letter_set_data =
287    ///     gc.get_set_for_value_group(GeneralCategoryGroup::OtherLetter);
288    /// let other_letter_set = other_letter_set_data.as_borrowed();
289    ///
290    /// assert!(other_letter_set.contains('木')); // U+6728
291    /// assert!(!other_letter_set.contains('πŸŽƒ')); // U+1F383 JACK-O-LANTERN
292    /// ```
293    #[cfg(feature = "alloc")]
294    pub fn get_set_for_value_group(self, value: GeneralCategoryGroup) -> crate::CodePointSetData {
295        let matching_gc_ranges = self
296            .iter_ranges()
297            .filter(|cpm_range| (1 << cpm_range.value as u32) & value.0 != 0)
298            .map(|cpm_range| cpm_range.range);
299        CodePointSetData::from_code_point_inversion_list(matching_gc_ranges.collect())
300    }
301}
302
#[cfg(feature = "compiled_data")]
impl<T: EnumeratedProperty> Default for CodePointMapDataBorrowed<'static, T> {
    /// Equivalent to [`CodePointMapDataBorrowed::new()`].
    fn default() -> Self {
        CodePointMapDataBorrowed::new()
    }
}
309
310impl<T: TrieValue> CodePointMapDataBorrowed<'static, T> {
311    /// Creates a new [`CodePointMapDataBorrowed`] for a [`EnumeratedProperty`].
312    ///
313    /// See the documentation on [`EnumeratedProperty`] implementations for details.
314    ///
315    /// ✨ *Enabled with the `compiled_data` Cargo feature.*
316    ///
317    /// [πŸ“š Help choosing a constructor](icu_provider::constructors)
318    #[cfg(feature = "compiled_data")]
319    pub const fn new() -> Self
320    where
321        T: EnumeratedProperty,
322    {
323        CodePointMapDataBorrowed { map: T::SINGLETON }
324    }
325
326    /// Cheaply converts a [`CodePointMapDataBorrowed<'static>`] into a [`CodePointMapData`].
327    ///
328    /// Note: Due to branching and indirection, using [`CodePointMapData`] might inhibit some
329    /// compile-time optimizations that are possible with [`CodePointMapDataBorrowed`].
330    pub const fn static_to_owned(self) -> CodePointMapData<T> {
331        CodePointMapData {
332            data: DataPayload::from_static_ref(self.map),
333        }
334    }
335}
336
337impl<'a> CodePointMapDataBorrowed<'a, GeneralCategory> {
338    /// Yields an [`Iterator`] returning ranges of consecutive code points that
339    /// have a `General_Category` value belonging to the specified [`GeneralCategoryGroup`]
340    ///
341    /// # Examples
342    ///
343    /// ```
344    /// use icu::properties::props::{GeneralCategory, GeneralCategoryGroup};
345    /// use icu::properties::CodePointMapData;
346    ///
347    /// let gc = CodePointMapData::<GeneralCategory>::new();
348    /// let mut ranges = gc.iter_ranges_for_group(GeneralCategoryGroup::Letter);
349    /// assert_eq!(ranges.next().unwrap(), 'A' as u32..='Z' as u32);
350    /// assert_eq!(ranges.next().unwrap(), 'a' as u32..='z' as u32);
351    /// assert_eq!(ranges.next().unwrap(), 'Βͺ' as u32..='Βͺ' as u32);
352    /// assert_eq!(ranges.next().unwrap(), 'Β΅' as u32..='Β΅' as u32);
353    /// assert_eq!(ranges.next().unwrap(), 'ΒΊ' as u32..='ΒΊ' as u32);
354    /// assert_eq!(ranges.next().unwrap(), 'Γ€' as u32..='Γ–' as u32);
355    /// assert_eq!(ranges.next().unwrap(), 'Ø' as u32..='â' as u32);
356    /// ```
357    pub fn iter_ranges_for_group(
358        self,
359        group: GeneralCategoryGroup,
360    ) -> impl Iterator<Item = RangeInclusive<u32>> + 'a {
361        self.map
362            .iter_ranges_mapped(move |value| group.contains(value))
363            .filter(|v| v.value)
364            .map(|v| v.range)
365    }
366}
367
368/// A Unicode character property that assigns a value to each code point.
369///
370/// The descriptions of most properties are taken from [`TR44`], the documentation for the
371/// Unicode Character Database.
372///
373/// <div class="stab unstable">
374/// 🚫 This trait is sealed; it cannot be implemented by user code. If an API requests an item that implements this
375/// trait, please consider using a type from the implementors listed below.
376/// </div>
377///
378/// [`TR44`]: https://www.unicode.org/reports/tr44
379pub trait EnumeratedProperty: crate::private::Sealed + TrieValue {
380    #[doc(hidden)]
381    type DataMarker: DataMarker<DataStruct = PropertyCodePointMap<'static, Self>>;
382    #[doc(hidden)]
383    #[cfg(feature = "compiled_data")]
384    const SINGLETON: &'static PropertyCodePointMap<'static, Self>;
385    /// The name of this property
386    const NAME: &'static [u8];
387    /// The abbreviated name of this property, if it exists, otherwise the name
388    const SHORT_NAME: &'static [u8];
389
390    /// Convenience method for `CodePointMapData::new().get(ch)`
391    ///
392    /// ✨ *Enabled with the `compiled_data` Cargo feature.*
393    #[cfg(feature = "compiled_data")]
394    fn for_char(ch: char) -> Self {
395        CodePointMapData::new().get(ch)
396    }
397}