// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
45use crate::provider::*;
6use core::ops::RangeInclusive;
7use icu_collections::codepointinvlist::CodePointInversionList;
8use icu_provider::marker::ErasedMarker;
9use icu_provider::prelude::*;
1011/// A set of Unicode code points. Access its data via the borrowed version,
12/// [`CodePointSetDataBorrowed`].
13///
14/// # Example
15/// ```rust
16/// use icu::properties::CodePointSetData;
17/// use icu::properties::props::Alphabetic;
18///
19/// let alphabetic = CodePointSetData::new::<Alphabetic>();
20///
21/// assert!(!alphabetic.contains('3'));
22/// assert!(!alphabetic.contains('੩')); // U+0A69 GURMUKHI DIGIT THREE
23/// assert!(alphabetic.contains('A'));
24/// assert!(alphabetic.contains('Ä')); // U+00C4 LATIN CAPITAL LETTER A WITH DIAERESIS
25/// ```
26#[derive(#[automatically_derived]
impl ::core::fmt::Debug for CodePointSetData {
#[inline]
fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result {
::core::fmt::Formatter::debug_struct_field1_finish(f,
"CodePointSetData", "data", &&self.data)
}
}Debug)]
27pub struct CodePointSetData {
28 data: DataPayload<ErasedMarker<PropertyCodePointSet<'static>>>,
29}
3031impl CodePointSetData {
32/// Creates a new [`CodePointSetDataBorrowed`] for a [`BinaryProperty`].
33 ///
34 /// ✨ *Enabled with the `compiled_data` Cargo feature.*
35 ///
36 /// [📚 Help choosing a constructor](icu_provider::constructors)
37#[expect(clippy::new_ret_no_self)]
38 #[cfg(feature = "compiled_data")]
39pub const fn new<P: BinaryProperty>() -> CodePointSetDataBorrowed<'static> {
40CodePointSetDataBorrowed::new::<P>()
41 }
4243#[cfg(feature = "serde")]
44 #[doc = icu_provider::gen_buffer_unstable_docs!(BUFFER, Self::new)]
45pub fn try_new_with_buffer_provider<P: BinaryProperty>(
46 provider: &(impl BufferProvider + ?Sized),
47 ) -> Result<CodePointSetData, DataError> {
48use icu_provider::buf::AsDeserializingBufferProvider;
49Self::try_new_unstable::<P>(&provider.as_deserializing())
50 }
5152#[doc = "A version of [`Self::new`] that uses custom data provided by a [`DataProvider`].\n\n[\u{1f4da} Help choosing a constructor](icu_provider::constructors)\n\n<div class=\"stab unstable\">\u{26a0}\u{fe0f} The bounds on <tt>provider</tt> may change over time, including in SemVer minor releases.</div>"icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new)]
53pub fn try_new_unstable<P: BinaryProperty>(
54 provider: &(impl DataProvider<P::DataMarker> + ?Sized),
55 ) -> Result<CodePointSetData, DataError> {
56Ok(CodePointSetData::from_data(
57provider.load(Default::default())?.payload,
58 ))
59 }
6061/// Construct a borrowed version of this type that can be queried.
62 ///
63 /// This owned version if returned by functions that use a runtime data provider.
64#[inline]
65pub fn as_borrowed(&self) -> CodePointSetDataBorrowed<'_> {
66CodePointSetDataBorrowed {
67 set: self.data.get(),
68 }
69 }
7071/// Construct a new one from loaded data
72 ///
73 /// Typically it is preferable to use getters like [`load_ascii_hex_digit()`] instead
74pub(crate) fn from_data<M>(data: DataPayload<M>) -> Self
75where
76M: DynamicDataMarker<DataStruct = PropertyCodePointSet<'static>>,
77 {
78Self { data: data.cast() }
79 }
8081/// Construct a new owned [`CodePointInversionList`]
82pub fn from_code_point_inversion_list(set: CodePointInversionList<'static>) -> Self {
83let set = PropertyCodePointSet::from_code_point_inversion_list(set);
84CodePointSetData::from_data(
85 DataPayload::<ErasedMarker<PropertyCodePointSet<'static>>>::from_owned(set),
86 )
87 }
8889/// Convert this type to a [`CodePointInversionList`] as a borrowed value.
90 ///
91 /// The data backing this is extensible and supports multiple implementations.
92 /// Currently it is always [`CodePointInversionList`]; however in the future more backends may be
93 /// added, and users may select which at data generation time.
94 ///
95 /// This method returns an `Option` in order to return `None` when the backing data provider
96 /// cannot return a [`CodePointInversionList`], or cannot do so within the expected constant time
97 /// constraint.
98pub fn as_code_point_inversion_list(&self) -> Option<&CodePointInversionList<'_>> {
99self.data.get().as_code_point_inversion_list()
100 }
101102/// Convert this type to a [`CodePointInversionList`], borrowing if possible,
103 /// otherwise allocating a new [`CodePointInversionList`].
104 ///
105 /// The data backing this is extensible and supports multiple implementations.
106 /// Currently it is always [`CodePointInversionList`]; however in the future more backends may be
107 /// added, and users may select which at data generation time.
108 ///
109 /// The performance of the conversion to this specific return type will vary
110 /// depending on the data structure that is backing `self`.
111pub fn to_code_point_inversion_list(&self) -> CodePointInversionList<'_> {
112self.data.get().to_code_point_inversion_list()
113 }
114}
115116/// A borrowed wrapper around code point set data, returned by
117/// [`CodePointSetData::as_borrowed()`]. More efficient to query.
118#[derive(#[automatically_derived]
impl<'a> ::core::clone::Clone for CodePointSetDataBorrowed<'a> {
#[inline]
fn clone(&self) -> CodePointSetDataBorrowed<'a> {
let _:
::core::clone::AssertParamIsClone<&'a PropertyCodePointSet<'a>>;
*self
}
}Clone, #[automatically_derived]
impl<'a> ::core::marker::Copy for CodePointSetDataBorrowed<'a> { }Copy, #[automatically_derived]
impl<'a> ::core::fmt::Debug for CodePointSetDataBorrowed<'a> {
#[inline]
fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result {
::core::fmt::Formatter::debug_struct_field1_finish(f,
"CodePointSetDataBorrowed", "set", &&self.set)
}
}Debug)]
119pub struct CodePointSetDataBorrowed<'a> {
120 set: &'a PropertyCodePointSet<'a>,
121}
122123impl CodePointSetDataBorrowed<'static> {
124/// Creates a new [`CodePointSetData`] for a [`BinaryProperty`].
125 ///
126 /// ✨ *Enabled with the `compiled_data` Cargo feature.*
127 ///
128 /// [📚 Help choosing a constructor](icu_provider::constructors)
129#[inline]
130 #[cfg(feature = "compiled_data")]
131pub const fn new<P: BinaryProperty>() -> Self {
132CodePointSetDataBorrowed { set: P::SINGLETON }
133 }
134/// Cheaply converts a [`CodePointSetDataBorrowed<'static>`] into a [`CodePointSetData`].
135 ///
136 /// Note: Due to branching and indirection, using [`CodePointSetData`] might inhibit some
137 /// compile-time optimizations that are possible with [`CodePointSetDataBorrowed`].
138pub const fn static_to_owned(self) -> CodePointSetData {
139CodePointSetData {
140 data: DataPayload::from_static_ref(self.set),
141 }
142 }
143}
144145impl<'a> CodePointSetDataBorrowed<'a> {
146/// Check if the set contains a character
147 ///
148 /// ```rust
149 /// use icu::properties::CodePointSetData;
150 /// use icu::properties::props::Alphabetic;
151 ///
152 /// let alphabetic = CodePointSetData::new::<Alphabetic>();
153 ///
154 /// assert!(!alphabetic.contains('3'));
155 /// assert!(!alphabetic.contains('੩')); // U+0A69 GURMUKHI DIGIT THREE
156 /// assert!(alphabetic.contains('A'));
157 /// assert!(alphabetic.contains('Ä')); // U+00C4 LATIN CAPITAL LETTER A WITH DIAERESIS
158 /// ```
159#[inline]
160pub fn contains(self, ch: char) -> bool {
161self.set.contains(ch)
162 }
163164/// See [`Self::contains`].
165#[inline]
166pub fn contains32(self, ch: u32) -> bool {
167self.set.contains32(ch)
168 }
169170// Yields an [`Iterator`] returning the ranges of the code points that are
171/// included in the [`CodePointSetData`]
172 ///
173 /// Ranges are returned as [`RangeInclusive`], which is inclusive of its
174 /// `end` bound value. An end-inclusive behavior matches the ICU4C/J
175 /// behavior of ranges, ex: `UnicodeSet::contains(UChar32 start, UChar32 end)`.
176 ///
177 /// # Example
178 ///
179 /// ```
180 /// use icu::properties::props::Alphabetic;
181 /// use icu::properties::CodePointSetData;
182 ///
183 /// let alphabetic = CodePointSetData::new::<Alphabetic>();
184 /// let mut ranges = alphabetic.iter_ranges();
185 ///
186 /// assert_eq!(Some(0x0041..=0x005A), ranges.next()); // 'A'..'Z'
187 /// assert_eq!(Some(0x0061..=0x007A), ranges.next()); // 'a'..'z'
188 /// ```
189#[inline]
190pub fn iter_ranges(self) -> impl Iterator<Item = RangeInclusive<u32>> + 'a {
191self.set.iter_ranges()
192 }
193194// Yields an [`Iterator`] returning the ranges of the code points that are
195/// *not* included in the [`CodePointSetData`]
196 ///
197 /// Ranges are returned as [`RangeInclusive`], which is inclusive of its
198 /// `end` bound value. An end-inclusive behavior matches the ICU4C/J
199 /// behavior of ranges, ex: `UnicodeSet::contains(UChar32 start, UChar32 end)`.
200 ///
201 /// # Example
202 ///
203 /// ```
204 /// use icu::properties::props::Alphabetic;
205 /// use icu::properties::CodePointSetData;
206 ///
207 /// let alphabetic = CodePointSetData::new::<Alphabetic>();
208 /// let mut ranges = alphabetic.iter_ranges();
209 ///
210 /// assert_eq!(Some(0x0041..=0x005A), ranges.next()); // 'A'..'Z'
211 /// assert_eq!(Some(0x0061..=0x007A), ranges.next()); // 'a'..'z'
212 /// ```
213#[inline]
214pub fn iter_ranges_complemented(self) -> impl Iterator<Item = RangeInclusive<u32>> + 'a {
215self.set.iter_ranges_complemented()
216 }
217}
218219/// A binary Unicode character property.
220///
221/// The descriptions of most properties are taken from [`TR44`], the documentation for the
222/// Unicode Character Database. Some properties are instead defined in [`TR18`], the
223/// documentation for Unicode regular expressions. In particular, Annex C of this document
224/// defines properties for POSIX compatibility.
225///
226/// <div class="stab unstable">
227/// 🚫 This trait is sealed; it cannot be implemented by user code. If an API requests an item that implements this
228/// trait, please consider using a type from the implementors listed below.
229/// </div>
230///
231/// [`TR44`]: https://www.unicode.org/reports/tr44
232/// [`TR18`]: https://www.unicode.org/reports/tr18
233pub trait BinaryProperty: crate::private::Sealed + Sized {
234#[doc(hidden)]
235type DataMarker: DataMarker<DataStruct = PropertyCodePointSet<'static>>;
236#[doc(hidden)]
237 #[cfg(feature = "compiled_data")]
238const SINGLETON: &'static PropertyCodePointSet<'static>;
239/// The name of this property
240const NAME: &'static [u8];
241/// The abbreviated name of this property, if it exists, otherwise the name
242const SHORT_NAME: &'static [u8];
243244/// Convenience method for `CodePointSetData::new().contains(ch)`
245 ///
246 /// ✨ *Enabled with the `compiled_data` Cargo feature.*
247#[cfg(feature = "compiled_data")]
248fn for_char(ch: char) -> bool {
249CodePointSetData::new::<Self>().contains(ch)
250 }
251}
#[cfg(test)]
mod tests {
    // Checks that the Number group set contains digits from several scripts
    // and excludes a letter.
    #[test]
    fn test_general_category() {
        use crate::props::GeneralCategory;
        use crate::props::GeneralCategoryGroup;
        use crate::CodePointMapData;

        let digits_data = CodePointMapData::<GeneralCategory>::new()
            .get_set_for_value_group(GeneralCategoryGroup::Number);
        let digits = digits_data.as_borrowed();

        assert!(digits.contains('5'));
        assert!(digits.contains('\u{0665}')); // U+0665 ARABIC-INDIC DIGIT FIVE
        assert!(digits.contains('\u{096b}')); // U+096B DEVANAGARI DIGIT FIVE

        assert!(!digits.contains('A'));
    }

    // Checks membership in the set of code points whose Script value is Thai.
    #[test]
    fn test_script() {
        use crate::props::Script;
        use crate::CodePointMapData;

        let thai_data = CodePointMapData::<Script>::new().get_set_for_value(Script::Thai);
        let thai = thai_data.as_borrowed();

        assert!(thai.contains('\u{0e01}')); // U+0E01 THAI CHARACTER KO KAI
        assert!(thai.contains('\u{0e50}')); // U+0E50 THAI DIGIT ZERO

        assert!(!thai.contains('A'));
        assert!(!thai.contains('\u{0e3f}')); // U+0E3F THAI CURRENCY SYMBOL BAHT
    }

    // Checks that each General_Category group set equals the union of the
    // sets of the subcategories listed for it.
    #[test]
    fn test_gc_groupings() {
        use crate::props::{GeneralCategory, GeneralCategoryGroup};
        use crate::CodePointMapData;
        use icu_collections::codepointinvlist::CodePointInversionListBuilder;

        let test_group = |category: GeneralCategoryGroup, subcategories: &[GeneralCategory]| {
            let category_set =
                CodePointMapData::<GeneralCategory>::new().get_set_for_value_group(category);
            let category_set = category_set
                .as_code_point_inversion_list()
                .expect("The data should be valid");

            // Rebuild the group by adding every range of every subcategory.
            let mut builder = CodePointInversionListBuilder::new();
            for &subcategory in subcategories {
                let gc_set_data =
                    CodePointMapData::<GeneralCategory>::new().get_set_for_value(subcategory);
                let gc_set = gc_set_data.as_borrowed();
                for range in gc_set.iter_ranges() {
                    builder.add_range32(range);
                }
            }
            let combined_set = builder.build();
            println!("{category:?} {subcategories:?}");
            assert_eq!(
                category_set.get_inversion_list_vec(),
                combined_set.get_inversion_list_vec()
            );
        };

        test_group(
            GeneralCategoryGroup::Letter,
            &[
                GeneralCategory::UppercaseLetter,
                GeneralCategory::LowercaseLetter,
                GeneralCategory::TitlecaseLetter,
                GeneralCategory::ModifierLetter,
                GeneralCategory::OtherLetter,
            ],
        );
        test_group(
            GeneralCategoryGroup::Other,
            &[
                GeneralCategory::Control,
                GeneralCategory::Format,
                GeneralCategory::Unassigned,
                GeneralCategory::PrivateUse,
                GeneralCategory::Surrogate,
            ],
        );
        test_group(
            GeneralCategoryGroup::Mark,
            &[
                GeneralCategory::SpacingMark,
                GeneralCategory::EnclosingMark,
                GeneralCategory::NonspacingMark,
            ],
        );
        test_group(
            GeneralCategoryGroup::Number,
            &[
                GeneralCategory::DecimalNumber,
                GeneralCategory::LetterNumber,
                GeneralCategory::OtherNumber,
            ],
        );
        test_group(
            GeneralCategoryGroup::Punctuation,
            &[
                GeneralCategory::ConnectorPunctuation,
                GeneralCategory::DashPunctuation,
                GeneralCategory::ClosePunctuation,
                GeneralCategory::FinalPunctuation,
                GeneralCategory::InitialPunctuation,
                GeneralCategory::OtherPunctuation,
                GeneralCategory::OpenPunctuation,
            ],
        );
        test_group(
            GeneralCategoryGroup::Symbol,
            &[
                GeneralCategory::CurrencySymbol,
                GeneralCategory::ModifierSymbol,
                GeneralCategory::MathSymbol,
                GeneralCategory::OtherSymbol,
            ],
        );
        test_group(
            GeneralCategoryGroup::Separator,
            &[
                GeneralCategory::LineSeparator,
                GeneralCategory::ParagraphSeparator,
                GeneralCategory::SpaceSeparator,
            ],
        );
    }

    // Surrogates are not valid `char`s, so they are probed through the
    // `u32` query method.
    #[test]
    fn test_gc_surrogate() {
        use crate::props::GeneralCategory;
        use crate::CodePointMapData;

        let surrogates_data = CodePointMapData::<GeneralCategory>::new()
            .get_set_for_value(GeneralCategory::Surrogate);
        let surrogates = surrogates_data.as_borrowed();

        assert!(surrogates.contains32(0xd800));
        assert!(surrogates.contains32(0xd900));
        assert!(surrogates.contains32(0xdfff));

        assert!(!surrogates.contains('A'));
    }
}