Skip to main content

icu_properties/
code_point_set.rs

1// This file is part of ICU4X. For terms of use, please see the file
2// called LICENSE at the top level of the ICU4X source tree
3// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
5use crate::provider::*;
6use core::ops::RangeInclusive;
7use icu_collections::codepointinvlist::CodePointInversionList;
8use icu_provider::marker::ErasedMarker;
9use icu_provider::prelude::*;
10
11/// A set of Unicode code points. Access its data via the borrowed version,
12/// [`CodePointSetDataBorrowed`].
13///
14/// # Example
15/// ```rust
16/// use icu::properties::CodePointSetData;
17/// use icu::properties::props::Alphabetic;
18///
19/// let alphabetic = CodePointSetData::new::<Alphabetic>();
20///
21/// assert!(!alphabetic.contains('3'));
22/// assert!(!alphabetic.contains('੩'));  // U+0A69 GURMUKHI DIGIT THREE
23/// assert!(alphabetic.contains('A'));
24/// assert!(alphabetic.contains('Ä'));  // U+00C4 LATIN CAPITAL LETTER A WITH DIAERESIS
25/// ```
26#[derive(#[automatically_derived]
impl ::core::fmt::Debug for CodePointSetData {
    #[inline]
    fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result {
        ::core::fmt::Formatter::debug_struct_field1_finish(f,
            "CodePointSetData", "data", &&self.data)
    }
}Debug)]
27pub struct CodePointSetData {
28    data: DataPayload<ErasedMarker<PropertyCodePointSet<'static>>>,
29}
30
31impl CodePointSetData {
32    /// Creates a new [`CodePointSetDataBorrowed`] for a [`BinaryProperty`].
33    ///
34    /// ✨ *Enabled with the `compiled_data` Cargo feature.*
35    ///
36    /// [📚 Help choosing a constructor](icu_provider::constructors)
37    #[expect(clippy::new_ret_no_self)]
38    #[cfg(feature = "compiled_data")]
39    pub const fn new<P: BinaryProperty>() -> CodePointSetDataBorrowed<'static> {
40        CodePointSetDataBorrowed::new::<P>()
41    }
42
43    #[cfg(feature = "serde")]
44    #[doc = icu_provider::gen_buffer_unstable_docs!(BUFFER, Self::new)]
45    pub fn try_new_with_buffer_provider<P: BinaryProperty>(
46        provider: &(impl BufferProvider + ?Sized),
47    ) -> Result<CodePointSetData, DataError> {
48        use icu_provider::buf::AsDeserializingBufferProvider;
49        Self::try_new_unstable::<P>(&provider.as_deserializing())
50    }
51
52    #[doc = "A version of [`Self::new`] that uses custom data provided by a [`DataProvider`].\n\n[\u{1f4da} Help choosing a constructor](icu_provider::constructors)\n\n<div class=\"stab unstable\">\u{26a0}\u{fe0f} The bounds on <tt>provider</tt> may change over time, including in SemVer minor releases.</div>"icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new)]
53    pub fn try_new_unstable<P: BinaryProperty>(
54        provider: &(impl DataProvider<P::DataMarker> + ?Sized),
55    ) -> Result<CodePointSetData, DataError> {
56        Ok(CodePointSetData::from_data(
57            provider.load(Default::default())?.payload,
58        ))
59    }
60
61    /// Construct a borrowed version of this type that can be queried.
62    ///
63    /// This owned version if returned by functions that use a runtime data provider.
64    #[inline]
65    pub fn as_borrowed(&self) -> CodePointSetDataBorrowed<'_> {
66        CodePointSetDataBorrowed {
67            set: self.data.get(),
68        }
69    }
70
71    /// Construct a new one from loaded data
72    ///
73    /// Typically it is preferable to use getters like [`load_ascii_hex_digit()`] instead
74    pub(crate) fn from_data<M>(data: DataPayload<M>) -> Self
75    where
76        M: DynamicDataMarker<DataStruct = PropertyCodePointSet<'static>>,
77    {
78        Self { data: data.cast() }
79    }
80
81    /// Construct a new owned [`CodePointInversionList`]
82    pub fn from_code_point_inversion_list(set: CodePointInversionList<'static>) -> Self {
83        let set = PropertyCodePointSet::from_code_point_inversion_list(set);
84        CodePointSetData::from_data(
85            DataPayload::<ErasedMarker<PropertyCodePointSet<'static>>>::from_owned(set),
86        )
87    }
88
89    /// Convert this type to a [`CodePointInversionList`] as a borrowed value.
90    ///
91    /// The data backing this is extensible and supports multiple implementations.
92    /// Currently it is always [`CodePointInversionList`]; however in the future more backends may be
93    /// added, and users may select which at data generation time.
94    ///
95    /// This method returns an `Option` in order to return `None` when the backing data provider
96    /// cannot return a [`CodePointInversionList`], or cannot do so within the expected constant time
97    /// constraint.
98    pub fn as_code_point_inversion_list(&self) -> Option<&CodePointInversionList<'_>> {
99        self.data.get().as_code_point_inversion_list()
100    }
101
102    /// Convert this type to a [`CodePointInversionList`], borrowing if possible,
103    /// otherwise allocating a new [`CodePointInversionList`].
104    ///
105    /// The data backing this is extensible and supports multiple implementations.
106    /// Currently it is always [`CodePointInversionList`]; however in the future more backends may be
107    /// added, and users may select which at data generation time.
108    ///
109    /// The performance of the conversion to this specific return type will vary
110    /// depending on the data structure that is backing `self`.
111    pub fn to_code_point_inversion_list(&self) -> CodePointInversionList<'_> {
112        self.data.get().to_code_point_inversion_list()
113    }
114}
115
116/// A borrowed wrapper around code point set data, returned by
117/// [`CodePointSetData::as_borrowed()`]. More efficient to query.
118#[derive(#[automatically_derived]
impl<'a> ::core::clone::Clone for CodePointSetDataBorrowed<'a> {
    #[inline]
    fn clone(&self) -> CodePointSetDataBorrowed<'a> {
        let _:
                ::core::clone::AssertParamIsClone<&'a PropertyCodePointSet<'a>>;
        *self
    }
}Clone, #[automatically_derived]
impl<'a> ::core::marker::Copy for CodePointSetDataBorrowed<'a> { }Copy, #[automatically_derived]
impl<'a> ::core::fmt::Debug for CodePointSetDataBorrowed<'a> {
    #[inline]
    fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result {
        ::core::fmt::Formatter::debug_struct_field1_finish(f,
            "CodePointSetDataBorrowed", "set", &&self.set)
    }
}Debug)]
119pub struct CodePointSetDataBorrowed<'a> {
120    set: &'a PropertyCodePointSet<'a>,
121}
122
123impl CodePointSetDataBorrowed<'static> {
124    /// Creates a new [`CodePointSetData`] for a [`BinaryProperty`].
125    ///
126    /// ✨ *Enabled with the `compiled_data` Cargo feature.*
127    ///
128    /// [📚 Help choosing a constructor](icu_provider::constructors)
129    #[inline]
130    #[cfg(feature = "compiled_data")]
131    pub const fn new<P: BinaryProperty>() -> Self {
132        CodePointSetDataBorrowed { set: P::SINGLETON }
133    }
134    /// Cheaply converts a [`CodePointSetDataBorrowed<'static>`] into a [`CodePointSetData`].
135    ///
136    /// Note: Due to branching and indirection, using [`CodePointSetData`] might inhibit some
137    /// compile-time optimizations that are possible with [`CodePointSetDataBorrowed`].
138    pub const fn static_to_owned(self) -> CodePointSetData {
139        CodePointSetData {
140            data: DataPayload::from_static_ref(self.set),
141        }
142    }
143}
144
145impl<'a> CodePointSetDataBorrowed<'a> {
146    /// Check if the set contains a character
147    ///
148    /// ```rust
149    /// use icu::properties::CodePointSetData;
150    /// use icu::properties::props::Alphabetic;
151    ///
152    /// let alphabetic = CodePointSetData::new::<Alphabetic>();
153    ///
154    /// assert!(!alphabetic.contains('3'));
155    /// assert!(!alphabetic.contains('੩'));  // U+0A69 GURMUKHI DIGIT THREE
156    /// assert!(alphabetic.contains('A'));
157    /// assert!(alphabetic.contains('Ä'));  // U+00C4 LATIN CAPITAL LETTER A WITH DIAERESIS
158    /// ```
159    #[inline]
160    pub fn contains(self, ch: char) -> bool {
161        self.set.contains(ch)
162    }
163
164    /// See [`Self::contains`].
165    #[inline]
166    pub fn contains32(self, ch: u32) -> bool {
167        self.set.contains32(ch)
168    }
169
170    // Yields an [`Iterator`] returning the ranges of the code points that are
171    /// included in the [`CodePointSetData`]
172    ///
173    /// Ranges are returned as [`RangeInclusive`], which is inclusive of its
174    /// `end` bound value. An end-inclusive behavior matches the ICU4C/J
175    /// behavior of ranges, ex: `UnicodeSet::contains(UChar32 start, UChar32 end)`.
176    ///
177    /// # Example
178    ///
179    /// ```
180    /// use icu::properties::props::Alphabetic;
181    /// use icu::properties::CodePointSetData;
182    ///
183    /// let alphabetic = CodePointSetData::new::<Alphabetic>();
184    /// let mut ranges = alphabetic.iter_ranges();
185    ///
186    /// assert_eq!(Some(0x0041..=0x005A), ranges.next()); // 'A'..'Z'
187    /// assert_eq!(Some(0x0061..=0x007A), ranges.next()); // 'a'..'z'
188    /// ```
189    #[inline]
190    pub fn iter_ranges(self) -> impl Iterator<Item = RangeInclusive<u32>> + 'a {
191        self.set.iter_ranges()
192    }
193
194    // Yields an [`Iterator`] returning the ranges of the code points that are
195    /// *not* included in the [`CodePointSetData`]
196    ///
197    /// Ranges are returned as [`RangeInclusive`], which is inclusive of its
198    /// `end` bound value. An end-inclusive behavior matches the ICU4C/J
199    /// behavior of ranges, ex: `UnicodeSet::contains(UChar32 start, UChar32 end)`.
200    ///
201    /// # Example
202    ///
203    /// ```
204    /// use icu::properties::props::Alphabetic;
205    /// use icu::properties::CodePointSetData;
206    ///
207    /// let alphabetic = CodePointSetData::new::<Alphabetic>();
208    /// let mut ranges = alphabetic.iter_ranges();
209    ///
210    /// assert_eq!(Some(0x0041..=0x005A), ranges.next()); // 'A'..'Z'
211    /// assert_eq!(Some(0x0061..=0x007A), ranges.next()); // 'a'..'z'
212    /// ```
213    #[inline]
214    pub fn iter_ranges_complemented(self) -> impl Iterator<Item = RangeInclusive<u32>> + 'a {
215        self.set.iter_ranges_complemented()
216    }
217}
218
219/// A binary Unicode character property.
220///
221/// The descriptions of most properties are taken from [`TR44`], the documentation for the
222/// Unicode Character Database.  Some properties are instead defined in [`TR18`], the
223/// documentation for Unicode regular expressions. In particular, Annex C of this document
224/// defines properties for POSIX compatibility.
225///
226/// <div class="stab unstable">
227/// 🚫 This trait is sealed; it cannot be implemented by user code. If an API requests an item that implements this
228/// trait, please consider using a type from the implementors listed below.
229/// </div>
230///
231/// [`TR44`]: https://www.unicode.org/reports/tr44
232/// [`TR18`]: https://www.unicode.org/reports/tr18
233pub trait BinaryProperty: crate::private::Sealed + Sized {
234    #[doc(hidden)]
235    type DataMarker: DataMarker<DataStruct = PropertyCodePointSet<'static>>;
236    #[doc(hidden)]
237    #[cfg(feature = "compiled_data")]
238    const SINGLETON: &'static PropertyCodePointSet<'static>;
239    /// The name of this property
240    const NAME: &'static [u8];
241    /// The abbreviated name of this property, if it exists, otherwise the name
242    const SHORT_NAME: &'static [u8];
243
244    /// Convenience method for `CodePointSetData::new().contains(ch)`
245    ///
246    /// ✨ *Enabled with the `compiled_data` Cargo feature.*
247    #[cfg(feature = "compiled_data")]
248    fn for_char(ch: char) -> bool {
249        CodePointSetData::new::<Self>().contains(ch)
250    }
251}
252
#[cfg(test)]
mod tests {
    /// A General_Category group set contains digits from several scripts and no letters.
    #[test]
    fn test_general_category() {
        use crate::props::GeneralCategory;
        use crate::props::GeneralCategoryGroup;
        use crate::CodePointMapData;

        let digits_data = CodePointMapData::<GeneralCategory>::new()
            .get_set_for_value_group(GeneralCategoryGroup::Number);
        let digits = digits_data.as_borrowed();

        assert!(digits.contains('5'));
        assert!(digits.contains('\u{0665}')); // U+0665 ARABIC-INDIC DIGIT FIVE
        assert!(digits.contains('\u{096b}')); // U+096B DEVANAGARI DIGIT FIVE

        assert!(!digits.contains('A'));
    }

    /// A per-script set contains that script's characters and excludes others.
    #[test]
    fn test_script() {
        use crate::props::Script;
        use crate::CodePointMapData;

        let thai_data = CodePointMapData::<Script>::new().get_set_for_value(Script::Thai);
        let thai = thai_data.as_borrowed();

        assert!(thai.contains('\u{0e01}')); // U+0E01 THAI CHARACTER KO KAI
        assert!(thai.contains('\u{0e50}')); // U+0E50 THAI DIGIT ZERO

        assert!(!thai.contains('A'));
        assert!(!thai.contains('\u{0e3f}')); // U+0E3F THAI CURRENCY SYMBOL BAHT
    }

    /// Each General_Category group set equals the union of its subcategory sets.
    #[test]
    fn test_gc_groupings() {
        use crate::props::{GeneralCategory, GeneralCategoryGroup};
        use crate::CodePointMapData;
        use icu_collections::codepointinvlist::CodePointInversionListBuilder;

        // Builds the union of the subcategory sets range by range and compares
        // its inversion list against the one produced for the whole group.
        let test_group = |category: GeneralCategoryGroup, subcategories: &[GeneralCategory]| {
            let category_set =
                CodePointMapData::<GeneralCategory>::new().get_set_for_value_group(category);
            let category_set = category_set
                .as_code_point_inversion_list()
                .expect("The data should be valid");

            let mut builder = CodePointInversionListBuilder::new();
            for &subcategory in subcategories {
                let gc_set_data =
                    CodePointMapData::<GeneralCategory>::new().get_set_for_value(subcategory);
                let gc_set = gc_set_data.as_borrowed();
                for range in gc_set.iter_ranges() {
                    builder.add_range32(range);
                }
            }
            let combined_set = builder.build();
            // Printed so a failing assertion can be attributed to a specific group.
            println!("{category:?} {subcategories:?}");
            assert_eq!(
                category_set.get_inversion_list_vec(),
                combined_set.get_inversion_list_vec()
            );
        };

        test_group(
            GeneralCategoryGroup::Letter,
            &[
                GeneralCategory::UppercaseLetter,
                GeneralCategory::LowercaseLetter,
                GeneralCategory::TitlecaseLetter,
                GeneralCategory::ModifierLetter,
                GeneralCategory::OtherLetter,
            ],
        );
        test_group(
            GeneralCategoryGroup::Other,
            &[
                GeneralCategory::Control,
                GeneralCategory::Format,
                GeneralCategory::Unassigned,
                GeneralCategory::PrivateUse,
                GeneralCategory::Surrogate,
            ],
        );
        test_group(
            GeneralCategoryGroup::Mark,
            &[
                GeneralCategory::SpacingMark,
                GeneralCategory::EnclosingMark,
                GeneralCategory::NonspacingMark,
            ],
        );
        test_group(
            GeneralCategoryGroup::Number,
            &[
                GeneralCategory::DecimalNumber,
                GeneralCategory::LetterNumber,
                GeneralCategory::OtherNumber,
            ],
        );
        test_group(
            GeneralCategoryGroup::Punctuation,
            &[
                GeneralCategory::ConnectorPunctuation,
                GeneralCategory::DashPunctuation,
                GeneralCategory::ClosePunctuation,
                GeneralCategory::FinalPunctuation,
                GeneralCategory::InitialPunctuation,
                GeneralCategory::OtherPunctuation,
                GeneralCategory::OpenPunctuation,
            ],
        );
        test_group(
            GeneralCategoryGroup::Symbol,
            &[
                GeneralCategory::CurrencySymbol,
                GeneralCategory::ModifierSymbol,
                GeneralCategory::MathSymbol,
                GeneralCategory::OtherSymbol,
            ],
        );
        test_group(
            GeneralCategoryGroup::Separator,
            &[
                GeneralCategory::LineSeparator,
                GeneralCategory::ParagraphSeparator,
                GeneralCategory::SpaceSeparator,
            ],
        );
    }

    /// Surrogates are not valid `char`s, so they are queried through `contains32`.
    #[test]
    fn test_gc_surrogate() {
        use crate::props::GeneralCategory;
        use crate::CodePointMapData;

        let surrogates_data = CodePointMapData::<GeneralCategory>::new()
            .get_set_for_value(GeneralCategory::Surrogate);
        let surrogates = surrogates_data.as_borrowed();

        assert!(surrogates.contains32(0xd800));
        assert!(surrogates.contains32(0xd900));
        assert!(surrogates.contains32(0xdfff));

        assert!(!surrogates.contains('A'));
    }
}