// icu_properties/code_point_map.rs

// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).

#[cfg(feature = "alloc")]
use crate::code_point_set::CodePointSetData;
use crate::props::GeneralCategory;
use crate::props::GeneralCategoryGroup;
use crate::provider::*;
use core::ops::RangeInclusive;
use icu_collections::codepointtrie::{CodePointMapRange, CodePointTrie, TrieValue};
use icu_provider::marker::ErasedMarker;
use icu_provider::prelude::*;

15/// A wrapper around code point map data.
16///
17/// It is returned by APIs that return Unicode
18/// property data in a map-like form, ex: enumerated property value data keyed
19/// by code point. Access its data via the borrowed version,
20/// [`CodePointMapDataBorrowed`].
21#[derive(#[automatically_derived]
impl<T: ::core::fmt::Debug + TrieValue> ::core::fmt::Debug for
    CodePointMapData<T> {
    #[inline]
    fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result {
        ::core::fmt::Formatter::debug_struct_field1_finish(f,
            "CodePointMapData", "data", &&self.data)
    }
}Debug, #[automatically_derived]
impl<T: ::core::clone::Clone + TrieValue> ::core::clone::Clone for
    CodePointMapData<T> {
    #[inline]
    fn clone(&self) -> CodePointMapData<T> {
        CodePointMapData { data: ::core::clone::Clone::clone(&self.data) }
    }
}Clone)]
22pub struct CodePointMapData<T: TrieValue> {
23    data: DataPayload<ErasedMarker<PropertyCodePointMap<'static, T>>>,
24}
25
26impl<T: TrieValue> CodePointMapData<T> {
27    /// Creates a new [`CodePointMapData`] for a [`EnumeratedProperty`].
28    ///
29    /// See the documentation on [`EnumeratedProperty`] implementations for details.
30    ///
31    /// ✨ *Enabled with the `compiled_data` Cargo feature.*
32    ///
33    /// [πŸ“š Help choosing a constructor](icu_provider::constructors)
34    #[cfg(feature = "compiled_data")]
35    #[expect(clippy::new_ret_no_self)]
36    pub const fn new() -> CodePointMapDataBorrowed<'static, T>
37    where
38        T: EnumeratedProperty,
39    {
40        CodePointMapDataBorrowed::new()
41    }
42
43    #[cfg(feature = "serde")]
44    #[doc = icu_provider::gen_buffer_unstable_docs!(BUFFER, Self::new)]
45    pub fn try_new_with_buffer_provider(
46        provider: &(impl BufferProvider + ?Sized),
47    ) -> Result<Self, DataError>
48    where
49        T: EnumeratedProperty + for<'a> serde::Deserialize<'a>,
50    {
51        use icu_provider::buf::AsDeserializingBufferProvider;
52        Self::try_new_unstable(&provider.as_deserializing())
53    }
54
55    #[doc = "A version of [`Self::new`] that uses custom data provided by a [`DataProvider`].\n\n[\u{1f4da} Help choosing a constructor](icu_provider::constructors)\n\n<div class=\"stab unstable\">\u{26a0}\u{fe0f} The bounds on <tt>provider</tt> may change over time, including in SemVer minor releases.</div>"icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new)]
56    pub fn try_new_unstable(
57        provider: &(impl DataProvider<T::DataMarker> + ?Sized),
58    ) -> Result<Self, DataError>
59    where
60        T: EnumeratedProperty,
61    {
62        Ok(Self {
63            data: provider.load(Default::default())?.payload.cast(),
64        })
65    }
66
67    /// Construct a borrowed version of this type that can be queried.
68    ///
69    /// This avoids a potential small underlying cost per API call (like `get()`) by consolidating it
70    /// up front.
71    ///
72    /// This owned version if returned by functions that use a runtime data provider.
73    #[inline]
74    pub fn as_borrowed(&self) -> CodePointMapDataBorrowed<'_, T> {
75        CodePointMapDataBorrowed {
76            map: self.data.get(),
77        }
78    }
79
80    /// Convert this map to a map around another type
81    ///
82    /// Typically useful for type-erasing maps into maps around integers.
83    ///
84    /// ✨ *Enabled with the `alloc` Cargo feature.*
85    ///
86    /// # Panics
87    /// Will panic if T and P are different sizes
88    ///
89    /// # Example
90    ///
91    /// ```
92    /// use icu::properties::CodePointMapData;
93    /// use icu::properties::props::GeneralCategory;
94    ///
95    /// let data = CodePointMapData::<GeneralCategory>::new().static_to_owned();
96    ///
97    /// let gc = data.try_into_converted::<u8>().unwrap();
98    /// let gc = gc.as_borrowed();
99    ///
100    /// assert_eq!(gc.get('木'), GeneralCategory::OtherLetter as u8);  // U+6728
101    /// assert_eq!(gc.get('πŸŽƒ'), GeneralCategory::OtherSymbol as u8);  // U+1F383 JACK-O-LANTERN
102    /// ```
103    #[cfg(feature = "alloc")]
104    pub fn try_into_converted<P>(self) -> Result<CodePointMapData<P>, zerovec::ule::UleError>
105    where
106        P: TrieValue,
107    {
108        self.data
109            .try_map_project(|data, _| data.try_into_converted())
110            .map(CodePointMapData::from_data::<ErasedMarker<PropertyCodePointMap<'static, P>>>)
111    }
112
113    /// Construct a new one from loaded data
114    ///
115    /// Typically it is preferable to use getters like [`load_general_category()`] instead
116    pub(crate) fn from_data<M>(data: DataPayload<M>) -> Self
117    where
118        M: DynamicDataMarker<DataStruct = PropertyCodePointMap<'static, T>>,
119    {
120        Self { data: data.cast() }
121    }
122
123    /// Construct a new one an owned [`CodePointTrie`]
124    pub fn from_code_point_trie(trie: CodePointTrie<'static, T>) -> Self {
125        let set = PropertyCodePointMap::from_code_point_trie(trie);
126        CodePointMapData::from_data(
127            DataPayload::<ErasedMarker<PropertyCodePointMap<'static, T>>>::from_owned(set),
128        )
129    }
130
131    /// Convert this type to a [`CodePointTrie`] as a borrowed value.
132    ///
133    /// The data backing this is extensible and supports multiple implementations.
134    /// Currently it is always [`CodePointTrie`]; however in the future more backends may be
135    /// added, and users may select which at data generation time.
136    ///
137    /// This method returns an `Option` in order to return `None` when the backing data provider
138    /// cannot return a [`CodePointTrie`], or cannot do so within the expected constant time
139    /// constraint.
140    pub fn as_code_point_trie(&self) -> Option<&CodePointTrie<'_, T>> {
141        self.data.get().as_code_point_trie()
142    }
143
144    /// Convert this type to a [`CodePointTrie`], borrowing if possible,
145    /// otherwise allocating a new [`CodePointTrie`].
146    ///
147    /// The data backing this is extensible and supports multiple implementations.
148    /// Currently it is always [`CodePointTrie`]; however in the future more backends may be
149    /// added, and users may select which at data generation time.
150    ///
151    /// The performance of the conversion to this specific return type will vary
152    /// depending on the data structure that is backing `self`.
153    pub fn to_code_point_trie(&self) -> CodePointTrie<'_, T> {
154        self.data.get().to_code_point_trie()
155    }
156}
157
158/// A borrowed wrapper around code point set data, returned by
159/// [`CodePointSetData::as_borrowed()`]. More efficient to query.
160#[derive(#[automatically_derived]
impl<'a, T: ::core::clone::Clone + TrieValue> ::core::clone::Clone for
    CodePointMapDataBorrowed<'a, T> {
    #[inline]
    fn clone(&self) -> CodePointMapDataBorrowed<'a, T> {
        CodePointMapDataBorrowed {
            map: ::core::clone::Clone::clone(&self.map),
        }
    }
}Clone, #[automatically_derived]
impl<'a, T: ::core::marker::Copy + TrieValue> ::core::marker::Copy for
    CodePointMapDataBorrowed<'a, T> {
}Copy, #[automatically_derived]
impl<'a, T: ::core::fmt::Debug + TrieValue> ::core::fmt::Debug for
    CodePointMapDataBorrowed<'a, T> {
    #[inline]
    fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result {
        ::core::fmt::Formatter::debug_struct_field1_finish(f,
            "CodePointMapDataBorrowed", "map", &&self.map)
    }
}Debug)]
161pub struct CodePointMapDataBorrowed<'a, T: TrieValue> {
162    map: &'a PropertyCodePointMap<'a, T>,
163}
164
165impl<'a, T: TrieValue> CodePointMapDataBorrowed<'a, T> {
166    /// Get the value this map has associated with code point `ch`
167    ///
168    /// # Example
169    ///
170    /// ```
171    /// use icu::properties::CodePointMapData;
172    /// use icu::properties::props::GeneralCategory;
173    ///
174    /// let gc = CodePointMapData::<GeneralCategory>::new();
175    ///
176    /// assert_eq!(gc.get('木'), GeneralCategory::OtherLetter);  // U+6728
177    /// assert_eq!(gc.get('πŸŽƒ'), GeneralCategory::OtherSymbol);  // U+1F383 JACK-O-LANTERN
178    /// ```
179    #[inline]
180    pub fn get(self, ch: char) -> T {
181        self.map.get(ch)
182    }
183
184    /// See [`Self::get`].
185    #[inline]
186    pub fn get32(self, ch: u32) -> T {
187        self.map.get32(ch)
188    }
189
190    /// Get a [`CodePointSetData`] for all elements corresponding to a particular value
191    ///
192    /// ✨ *Enabled with the `alloc` Cargo feature.*
193    ///
194    /// # Example
195    ///
196    /// ```
197    /// use icu::properties::props::GeneralCategory;
198    /// use icu::properties::CodePointMapData;
199    ///
200    /// let gc = CodePointMapData::<GeneralCategory>::new();
201    ///
202    /// let other_letter_set_data =
203    ///     gc.get_set_for_value(GeneralCategory::OtherLetter);
204    /// let other_letter_set = other_letter_set_data.as_borrowed();
205    ///
206    /// assert!(other_letter_set.contains('木')); // U+6728
207    /// assert!(!other_letter_set.contains('πŸŽƒ')); // U+1F383 JACK-O-LANTERN
208    /// ```
209    #[cfg(feature = "alloc")]
210    pub fn get_set_for_value(self, value: T) -> CodePointSetData {
211        let set = self.map.get_set_for_value(value);
212        CodePointSetData::from_code_point_inversion_list(set)
213    }
214
215    /// Yields an [`Iterator`] returning ranges of consecutive code points that
216    /// share the same value in the [`CodePointMapData`].
217    ///
218    /// # Examples
219    ///
220    /// ```
221    /// use icu::properties::props::GeneralCategory;
222    /// use icu::properties::CodePointMapData;
223    ///
224    /// let gc = CodePointMapData::<GeneralCategory>::new();
225    /// let mut ranges = gc.iter_ranges();
226    /// let next = ranges.next().unwrap();
227    /// assert_eq!(next.range, 0..=31);
228    /// assert_eq!(next.value, GeneralCategory::Control);
229    /// let next = ranges.next().unwrap();
230    /// assert_eq!(next.range, 32..=32);
231    /// assert_eq!(next.value, GeneralCategory::SpaceSeparator);
232    /// ```
233    pub fn iter_ranges(self) -> impl Iterator<Item = CodePointMapRange<T>> + 'a {
234        self.map.iter_ranges()
235    }
236
237    /// Yields an [`Iterator`] returning ranges of consecutive code points that
238    /// share the same value `v` in the [`CodePointMapData`].
239    ///
240    /// # Examples
241    ///
242    ///
243    /// ```
244    /// use icu::properties::props::GeneralCategory;
245    /// use icu::properties::CodePointMapData;
246    ///
247    /// let gc = CodePointMapData::<GeneralCategory>::new();
248    /// let mut ranges = gc.iter_ranges_for_value(GeneralCategory::UppercaseLetter);
249    /// assert_eq!(ranges.next().unwrap(), 'A' as u32..='Z' as u32);
250    /// assert_eq!(ranges.next().unwrap(), 'Γ€' as u32..='Γ–' as u32);
251    /// assert_eq!(ranges.next().unwrap(), 'Ø' as u32..='Þ' as u32);
252    /// ```
253    pub fn iter_ranges_for_value(self, val: T) -> impl Iterator<Item = RangeInclusive<u32>> + 'a {
254        self.map
255            .iter_ranges()
256            .filter(move |r| r.value == val)
257            .map(|r| r.range)
258    }
259
260    /// Yields an [`Iterator`] returning ranges of consecutive code points that
261    /// do *not* have the value `v` in the [`CodePointMapData`].
262    pub fn iter_ranges_for_value_complemented(
263        self,
264        val: T,
265    ) -> impl Iterator<Item = RangeInclusive<u32>> + 'a {
266        self.map
267            .iter_ranges_mapped(move |value| value != val)
268            .filter(|v| v.value)
269            .map(|v| v.range)
270    }
271
272    /// Exposed for FFI needs, could be exposed in general in the future but we should
273    /// have a use case first.
274    ///
275    /// FFI needs this since it operates on erased maps and can't use `iter_ranges_for_group()`
276    #[doc(hidden)] // used by FFI code
277    pub fn iter_ranges_mapped<U: Eq + 'a>(
278        self,
279        predicate: impl FnMut(T) -> U + Copy + 'a,
280    ) -> impl Iterator<Item = CodePointMapRange<U>> + 'a {
281        self.map.iter_ranges_mapped(predicate)
282    }
283}
284
285impl CodePointMapDataBorrowed<'_, GeneralCategory> {
286    /// Get a [`CodePointSetData`] for all elements corresponding to a particular value group
287    ///
288    /// ✨ *Enabled with the `alloc` Cargo feature.*
289    ///
290    /// # Example
291    ///
292    /// ```
293    /// use icu::properties::props::{GeneralCategory, GeneralCategoryGroup};
294    /// use icu::properties::CodePointMapData;
295    ///
296    /// let gc = CodePointMapData::<GeneralCategory>::new();
297    ///
298    /// let other_letter_set_data =
299    ///     gc.get_set_for_value_group(GeneralCategoryGroup::OtherLetter);
300    /// let other_letter_set = other_letter_set_data.as_borrowed();
301    ///
302    /// assert!(other_letter_set.contains('木')); // U+6728
303    /// assert!(!other_letter_set.contains('πŸŽƒ')); // U+1F383 JACK-O-LANTERN
304    /// ```
305    #[cfg(feature = "alloc")]
306    pub fn get_set_for_value_group(self, value: GeneralCategoryGroup) -> CodePointSetData {
307        let matching_gc_ranges = self
308            .iter_ranges()
309            .filter(|cpm_range| (1 << cpm_range.value as u32) & value.0 != 0)
310            .map(|cpm_range| cpm_range.range);
311        CodePointSetData::from_code_point_inversion_list(matching_gc_ranges.collect())
312    }
313}
314
#[cfg(feature = "compiled_data")]
impl<T: EnumeratedProperty> Default for CodePointMapDataBorrowed<'static, T> {
    /// Equivalent to [`CodePointMapDataBorrowed::new()`].
    fn default() -> Self {
        Self::new()
    }
}

322impl<T: TrieValue> CodePointMapDataBorrowed<'static, T> {
323    /// Creates a new [`CodePointMapDataBorrowed`] for a [`EnumeratedProperty`].
324    ///
325    /// See the documentation on [`EnumeratedProperty`] implementations for details.
326    ///
327    /// ✨ *Enabled with the `compiled_data` Cargo feature.*
328    ///
329    /// [πŸ“š Help choosing a constructor](icu_provider::constructors)
330    #[cfg(feature = "compiled_data")]
331    pub const fn new() -> Self
332    where
333        T: EnumeratedProperty,
334    {
335        CodePointMapDataBorrowed { map: T::SINGLETON }
336    }
337
338    /// Cheaply converts a [`CodePointMapDataBorrowed<'static>`] into a [`CodePointMapData`].
339    ///
340    /// Note: Due to branching and indirection, using [`CodePointMapData`] might inhibit some
341    /// compile-time optimizations that are possible with [`CodePointMapDataBorrowed`].
342    pub const fn static_to_owned(self) -> CodePointMapData<T> {
343        CodePointMapData {
344            data: DataPayload::from_static_ref(self.map),
345        }
346    }
347}
348
349impl<'a> CodePointMapDataBorrowed<'a, GeneralCategory> {
350    /// Yields an [`Iterator`] returning ranges of consecutive code points that
351    /// have a `General_Category` value belonging to the specified [`GeneralCategoryGroup`]
352    ///
353    /// # Examples
354    ///
355    /// ```
356    /// use icu::properties::props::{GeneralCategory, GeneralCategoryGroup};
357    /// use icu::properties::CodePointMapData;
358    ///
359    /// let gc = CodePointMapData::<GeneralCategory>::new();
360    /// let mut ranges = gc.iter_ranges_for_group(GeneralCategoryGroup::Letter);
361    /// assert_eq!(ranges.next().unwrap(), 'A' as u32..='Z' as u32);
362    /// assert_eq!(ranges.next().unwrap(), 'a' as u32..='z' as u32);
363    /// assert_eq!(ranges.next().unwrap(), 'Βͺ' as u32..='Βͺ' as u32);
364    /// assert_eq!(ranges.next().unwrap(), 'Β΅' as u32..='Β΅' as u32);
365    /// assert_eq!(ranges.next().unwrap(), 'ΒΊ' as u32..='ΒΊ' as u32);
366    /// assert_eq!(ranges.next().unwrap(), 'Γ€' as u32..='Γ–' as u32);
367    /// assert_eq!(ranges.next().unwrap(), 'Ø' as u32..='â' as u32);
368    /// ```
369    pub fn iter_ranges_for_group(
370        self,
371        group: GeneralCategoryGroup,
372    ) -> impl Iterator<Item = RangeInclusive<u32>> + 'a {
373        self.map
374            .iter_ranges_mapped(move |value| group.contains(value))
375            .filter(|v| v.value)
376            .map(|v| v.range)
377    }
378}
379
380/// A Unicode character property that assigns a value to each code point.
381///
382/// The descriptions of most properties are taken from [`TR44`], the documentation for the
383/// Unicode Character Database.
384///
385/// <div class="stab unstable">
386/// 🚫 This trait is sealed; it cannot be implemented by user code. If an API requests an item that implements this
387/// trait, please consider using a type from the implementors listed below.
388/// </div>
389///
390/// [`TR44`]: https://www.unicode.org/reports/tr44
391pub trait EnumeratedProperty: crate::private::Sealed + TrieValue {
392    #[doc(hidden)]
393    type DataMarker: DataMarker<DataStruct = PropertyCodePointMap<'static, Self>>;
394    #[doc(hidden)]
395    #[cfg(feature = "compiled_data")]
396    const SINGLETON: &'static PropertyCodePointMap<'static, Self>;
397    /// The name of this property
398    const NAME: &'static [u8];
399    /// The abbreviated name of this property, if it exists, otherwise the name
400    const SHORT_NAME: &'static [u8];
401
402    /// Convenience method for `CodePointMapData::new().get(ch)`
403    ///
404    /// ✨ *Enabled with the `compiled_data` Cargo feature.*
405    #[cfg(feature = "compiled_data")]
406    fn for_char(ch: char) -> Self {
407        CodePointMapData::new().get(ch)
408    }
409}